U
    hX$                     @   s   d Z ddlZddlZddlZddlmZ ddlmZmZm	Z	m
Z
 ddlmZ ddlmZ ddlmZmZmZmZmZmZmZmZmZmZmZmZ eeZG dd	 d	eZdS )
z:Pebblo's safe dataloader is a wrapper for document loaders    N)version)DictIteratorListOptional)Document)
BaseLoader)BATCH_SIZE_BYTESPLUGIN_VERSIONApp	FrameworkIndexedDocumentPebbloLoaderAPIWrappergenerate_size_based_batchesget_full_pathget_loader_full_pathget_loader_typeget_runtimeget_source_sizec                   @   s   e Zd ZU dZdZeed< d"ddeeeee	e ee	e edd	d
Z
ee dddZddddZee dddZeddddZedddZee dddZeee dddZee dddZeeedddZeddd d!ZdS )#PebbloSafeLoaderzkPebblo Safe Loader class is a wrapper around document loaders enabling the data
    to be scrutinized.
    F_discover_sent Nlocal)classifier_location)langchain_loadernameownerdescriptionapi_keyload_semanticclassifier_urlr   c          
      C   s   |rt |tstd|| _tt | _|| _tj	
dp>|| _|| _|| _t| j| _g | _g | _tt| jdd dd }	t|	| _t| j| _t| _|	| j| jd| jdkrdt| jini | _|  | _t|||d	| _| j| j d S )
NzMust specify a valid name.ZPEBBLO_LOAD_SEMANTIC.'r   )loadersource_pathsource_typesource_path_size)r   r   r    ) 
isinstancestr	NameErrorapp_nameuuidZuuid4load_idr$   osenvirongetr   r   r   r   r%   docsdocs_with_idtypesplitr   r&   r   r'   r	   
batch_sizeloader_details_get_app_detailsappr   	pb_clientZsend_loader_discover)
selfr   r   r   r   r   r   r    r   Zloader_name r;   O/tmp/pip-unpacked-wheel-9gdii04g/langchain_community/document_loaders/pebblo.py__init__%   s<    "

zPebbloSafeLoader.__init__)returnc                 C   s   | j  | _|   | jS )zxLoad Documents.

        Returns:
            list: Documents fetched from load method of the wrapped `loader`.
        )r$   loadr1   classify_in_batches)r:   r;   r;   r<   r?   T   s    zPebbloSafeLoader.loadc           	      C   s   t | j| j}g }t|}t|D ]l\}}||d k}|| _|  | _| jj| j| j	| j
|d}| | | jr|| |}n|  }|| q"|| _dS )z
        Classify documents in batches.
        This is to avoid API timeouts when sending large number of documents.
        Batches are generated based on the page_content size.
           )Zloading_endN)r   r1   r5   len	enumerate_index_docsr2   r9   classify_documentsr8   r6   _add_pebblo_specific_metadatar   _add_semantic_to_docs_unindex_docsextend)	r:   ZbatchesZprocessed_docsZtotal_batchesibatchZis_last_batchclassified_docsZbatch_processed_docsr;   r;   r<   r@   _   s,     

z$PebbloSafeLoader.classify_in_batchesc              
   c   s   z| j  }W nF tk
rT } z(| j jj d}t| t||W 5 d}~X Y nX zt|}W n tk
r   g | _	Y qY nX t
|f| _	|  | _| j| j| j| j}| | | jr| || _	n
|  | _	| j	d V  qVdS )zLoad documents in lazy fashion.

        Raises:
            NotImplementedError: raised when lazy_load id not implemented
            within wrapped loader.

        Yields:
            list: Documents from loader's lazy loading.
        z does not implement lazy_load()Nr   )r$   	lazy_loadNotImplementedError	__class____name__loggererrornextStopIterationr1   listrD   r2   r9   rE   r8   r6   rF   r   rG   rH   )r:   Zdoc_iteratorexcerr_strdocclassified_docr;   r;   r<   rM      s.    



  

zPebbloSafeLoader.lazy_loadc                 C   s
   d| _ d S )NT)r   )clsr;   r;   r<   set_discover_sent   s    z"PebbloSafeLoader.set_discover_sentc                 C   s:   t  \}}t| j| j| j| j||ttdtddd}|S )z\Fetch app details. Internal method.

        Returns:
            App: App details.
        Zlangchain_community)r   r   )r   r   r   r-   runtime	frameworkZplugin_versionclient_version)	r   r   r+   r   r   r-   r
   r   r   )r:   r]   r\   r8   r;   r;   r<   r7      s    
z!PebbloSafeLoader._get_app_detailsc                 C   s   dd t | jD }|S )z
        Indexes the documents and returns a list of IndexedDocument objects.

        Returns:
            List[IndexedDocument]: A list of IndexedDocument objects with unique IDs.
        c                 S   s*   g | ]"\}}t f d t|i| qS )pb_id)r   r)   dict.0rJ   rX   r;   r;   r<   
<listcomp>   s   z0PebbloSafeLoader._index_docs.<locals>.<listcomp>)rC   r1   )r:   r2   r;   r;   r<   rD      s    zPebbloSafeLoader._index_docs)rL   r>   c                 C   sV   dd | j D }| D ]&}|d}||kr| || | qdd | D }|S )aF  
        Adds semantic metadata to the given list of documents.

        Args:
            classified_docs (Dict): A dictionary of dictionaries containing the
                classified documents with pb_id as key.

        Returns:
            List[Document]: A list of Document objects with added semantic metadata.
        c                 S   s    i | ]}|j t|j|jd qS )page_contentmetadata)r_   r   re   rf   rb   rX   r;   r;   r<   
<dictcomp>   s    z:PebbloSafeLoader._add_semantic_to_docs.<locals>.<dictcomp>r_   c                 S   s   g | ]}|qS r;   r;   rg   r;   r;   r<   rc      s     z:PebbloSafeLoader._add_semantic_to_docs.<locals>.<listcomp>)r2   valuesr0   _add_semantic_to_doc)r:   rL   Zindexed_docsrY   Zdoc_idZsemantic_metadata_docsr;   r;   r<   rG      s    
z&PebbloSafeLoader._add_semantic_to_docsc                 C   s   dd t | jD }|S )z
        Converts a list of IndexedDocument objects to a list of Document objects.

        Returns:
            List[Document]: A list of Document objects.
        c                 S   s    g | ]\}}t |j|jd qS rd   )r   re   rf   ra   r;   r;   r<   rc      s   z2PebbloSafeLoader._unindex_docs.<locals>.<listcomp>)rC   r2   )r:   r1   r;   r;   r<   rH      s    zPebbloSafeLoader._unindex_docs)rX   rY   r>   c                 C   s8   t |di  |jd< t |di  |jd< |S )a4  
        Adds semantic metadata to the given document in-place.

        Args:
            doc (Document): A Document object.
            classified_doc (dict): A dictionary containing the classified document.

        Returns:
            Document: The Document object with added semantic metadata.
        entitiesZpebblo_semantic_entitiesZtopicsZpebblo_semantic_topics)rU   r0   keysrf   )r:   rX   rY   r;   r;   r<   rj      s    

z%PebbloSafeLoader._add_semantic_to_docc              	   C   st   | j D ]h}|j}| jjjdkr6t|d| j|d< nt|d|d| j|d< ||ji dd|d< qdS )z*Add Pebblo specific metadata to documents.ZSharePointLoadersource	full_pathZpb_checksumN)	r2   rf   r$   rO   rP   r   r0   r%   r_   )r:   rL   rX   Zdoc_metadatar;   r;   r<   rF     s     

  z.PebbloSafeLoader._add_pebblo_specific_metadata)r   r   NFN)rP   
__module____qualname____doc__r   bool__annotations__r   r)   r   r=   r   r   r?   r@   r   rM   classmethodr[   r   r7   r   rD   r   rG   rH   r`   rj   rF   r;   r;   r;   r<   r      s<   
     
/ "r   ) rq   loggingr.   r,   importlib.metadatar   typingr   r   r   r   Zlangchain_core.documentsr   Z)langchain_community.document_loaders.baser   Z$langchain_community.utilities.pebblor	   r
   r   r   r   r   r   r   r   r   r   r   	getLoggerrP   rQ   r   r;   r;   r;   r<   <module>   s   8
