U
    hw#                     @  s   d dl mZ d dlZd dlZd dlmZ d dlmZmZm	Z	m
Z
mZ d dlmZ d dlmZ ertd dlZd dlmZ dd	d
ddZG dd deZdddddZG dd deZeZdS )    )annotationsN)Path)TYPE_CHECKINGDictListOptionalUnionDocument)
BaseLoader)
EntityLikedictstr)rowreturnc                 C  s.   | d }| d }| d }| d| d| dS )zBCombine message information in a readable format ready to be used.datefromtextz on z: 

 )r   r   Zsenderr   r   r   Q/tmp/pip-unpacked-wheel-9gdii04g/langchain_community/document_loaders/telegram.pyconcatenate_rows   s    r   c                   @  s,   e Zd ZdZddddZdddd	Zd
S )TelegramChatFileLoaderzLoad from `Telegram chat` dump.zUnion[str, Path])pathc                 C  s
   || _ dS )zInitialize with a path.N)	file_path)selfr   r   r   r   __init__   s    zTelegramChatFileLoader.__init__List[Document]r   c              	   C  s^   t | j}t|dd}t|}W 5 Q R X ddd |d D }dt|i}t||dgS )	Load documents.utf8encoding c                 s  s0   | ](}|d  dkrt |d trt|V  qdS )typemessager   N)
isinstancer   r   ).0r%   r   r   r   	<genexpr>'   s    z.TelegramChatFileLoader.load.<locals>.<genexpr>messagessourcepage_contentmetadata)r   r   openjsonloadjoinr   r
   )r   pfdr   r-   r   r   r   r0       s    

zTelegramChatFileLoader.loadN)__name__
__module____qualname____doc__r   r0   r   r   r   r   r      s   r   zUnion[str, List[str]]r   )r   r   c           	   
   C  s   ddl m} |ddddddd	d
dgdd}t| tr:| g} dd | D }t|D ]\}}|d |jd< qPg }|D ]d}||j}t|D ]J\}}t||jd |dd}|jd  d|jd  |jd< |	| qqp|S )zIConvert a string or list of strings to a list of Documents with metadata.r   )RecursiveCharacterTextSplitteri   r   
.!?, r#      )
chunk_size
separatorsZchunk_overlapc                 S  s   g | ]}t |d qS ))r,   r	   )r'   pager   r   r   
<listcomp>>   s     z text_to_docs.<locals>.<listcomp>   rC   )rC   chunkr+   -rF   r*   )
Zlangchain_text_splittersr9   r&   r   	enumerater-   Z
split_textr,   r
   append)	r   r9   Ztext_splitterZ	page_docsidocZ
doc_chunkschunksrF   r   r   r   text_to_docs1   s,    
  rM   c                   @  sf   e Zd ZdZdddddddd	d
ZddddZdddddZddddddZddddZdS )TelegramChatApiLoaderz)Load `Telegram` chat json directory dump.Ntelegram_data.jsonzOptional[EntityLike]zOptional[int]zOptional[str]r   chat_entityapi_idapi_hashusernamer   c                 C  s"   || _ || _|| _|| _|| _dS )aI  Initialize with API parameters.

        Args:
            chat_entity: The chat entity to fetch data from.
            api_id: The API ID.
            api_hash: The API hash.
            username: The username.
            file_path: The file path to save the data to. Defaults to
                 "telegram_data.json".
        NrP   )r   rQ   rR   rS   rT   r   r   r   r   r   V   s
    zTelegramChatApiLoader.__init__Noner   c                   s   ddl m} g }|| j| j| j4 I dH `}|| j2 zJ3 dH W }|jdk	}|rZ|jjnd}|	|j
|j|j |j||d q86 W 5 Q I dH R X t| jddd}tj||dd	d
 W 5 Q R X dS )z8Fetch data from Telegram API and save it as a JSON file.r   )TelegramClientN)	sender_idr   r   
message.idis_replyreply_to_idwzutf-8r!   F   )ensure_asciiindent)Ztelethon.syncrV   rT   rR   rS   Ziter_messagesrQ   Zreply_toZreply_to_msg_idrI   rW   r   r   	isoformatidr.   r   r/   dump)r   rV   dataclientr%   rY   rZ   r3   r   r   r   fetch_data_from_telegramn   s"    
z.TelegramChatApiLoader.fetch_data_from_telegrampd.DataFramer   )rb   r   c                   sh   dddd fdd ||d   }||d  j dgd	d td<  fd
d|d D }|S )a
  Create a dictionary of message threads from the given data.

        Args:
            data (pd.DataFrame): A DataFrame containing the conversation                 data with columns:
                - message.sender_id
                - text
                - date
                - message.id
                - is_reply
                - reply_to_id

        Returns:
            dict: A dictionary where the key is the parent message ID and                 the value is a list of message IDs in ascending order.
        intre   z	List[int])	parent_id
reply_datar   c                   s>   ||d | k d   }g }|D ]}||g || 7 }q |S )a^  
            Recursively find all replies to a given parent message ID.

            Args:
                parent_id (int): The parent message ID.
                reply_data (pd.DataFrame): A DataFrame containing reply messages.

            Returns:
                list: A list of message IDs that are replies to the parent message ID.
            rZ   rX   )tolist)rg   rh   Zdirect_repliesZall_repliesZreply_id)find_repliesr   r   rj      s    z@TelegramChatApiLoader._get_message_threads.<locals>.find_repliesrY   rZ   )Zsubsetc                   s   i | ]}||g | qS r   r   )r'   rg   rj   Zreply_messagesr   r   
<dictcomp>   s    z>TelegramChatApiLoader._get_message_threads.<locals>.<dictcomp>rX   )ZdropnaZastyperf   )r   rb   Zparent_messagesmessage_threadsr   rk   r   _get_message_threads   s    z*TelegramChatApiLoader._get_message_threadszDict[int, List[int]])rm   rb   r   c                 C  s`   d}|  D ]J\}}||d | jddd  }dd |D }|d|d	 7 }q| S )
aw  
        Combine the message texts for each parent message ID based             on the list of message threads.

        Args:
            message_threads (dict): A dictionary where the key is the parent message                 ID and the value is a list of message IDs in ascending order.
            data (pd.DataFrame): A DataFrame containing the conversation data:
                - message.sender_id
                - text
                - date
                - message.id
                - is_reply
                - reply_to_id

        Returns:
            str: A combined string of message texts sorted by date.
        r#   rX   r   )Zbyr   c                 S  s   g | ]}t |qS r   )r   )r'   elemr   r   r   rD      s     z@TelegramChatApiLoader._combine_message_texts.<locals>.<listcomp>r?   z.
)itemsisinZsort_valuesri   r1   strip)r   rm   rb   Zcombined_textrg   Zmessage_idsZmessage_textsr   r   r   _combine_message_texts   s    z,TelegramChatApiLoader._combine_message_textsr   c           
   	   C  s   | j dk	rJz"ddl}|  t|   W n tk
rH   tdY nX t| j}t	|dd}t
|}W 5 Q R X zddl}W n tk
r   tdY nX ||}||}| |}| ||}	t|	S )r   Nr   zy`nest_asyncio` package not found.
                    please install with `pip install nest_asyncio`
                    r    r!   zf`pandas` package not found. 
                please install with `pip install pandas`
                )rQ   nest_asyncioapplyasynciorunrd   ImportErrorr   r   r.   r/   r0   pandasZjson_normalizeZ	DataFramern   rs   rM   )
r   rt   r2   r3   r4   pdZnormalized_messagesZdfrm   Zcombined_textsr   r   r   r0      s.    






zTelegramChatApiLoader.load)NNNNrO   )	r5   r6   r7   r8   r   rd   rn   rs   r0   r   r   r   r   rN   S   s        ;&rN   )
__future__r   rv   r/   pathlibr   typingr   r   r   r   r   Zlangchain_core.documentsr
   Z)langchain_community.document_loaders.baser   ry   rz   Ztelethon.hintsr   r   r   rM   rN   ZTelegramChatLoaderr   r   r   r   <module>   s   " :