U
    hb                     @  s(  d dl mZ d dlZd dlZd dlZd dlZd dlZd dlmZ d dl	m
Z
 d dlmZmZmZmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZ d dlmZ d dlmZ e e!Z"dZ#dZ$dZ%dZ&ddddddddddddgZ'dddd d!gZ(d"gZ)d#d$d%gZ*e'e(e)e*d&Z+G d'd( d(e,eZ-G d)d* d*eZ.G d+d, d,eZ/G d-d. d.eZ0G d/d0 d0eZ1G d1d2 d2eZ2d3d3d4d5d6Z3d3d3d7d8d9Z4d:d3d7d;d<Z5d=d>d?d@Z6d3d>dAdBZ7dTdCdDdEdFdGdHZ8d3d3dIdJdKZ9d3dDdLdMdNZ:d3dDdOdPdQZ;G dRdS dSeZ<dS )U    )annotationsN)Enum)
HTTPStatus)AnyDictListOptionalTuple)Document)get_runtime_environment)	BaseModel)get_from_dict_or_env)Responserequest)RequestException)
BaseLoaderz0.1.1zhttp://localhost:8000zhttps://api.daxa.ai  Z
JSONLoaderS3FileLoaderZUnstructuredMarkdownLoaderZUnstructuredPDFLoaderZUnstructuredFileLoaderZUnstructuredJsonLoaderZPyPDFLoaderGCSFileLoaderZAmazonTextractPDFLoaderZ	CSVLoaderZUnstructuredExcelLoaderZUnstructuredEmailLoaderZDirectoryLoaderZS3DirLoaderZSlackDirectoryLoaderZPyPDFDirectoryLoaderZNotionDirectoryLoaderDataFrameLoaderNotionDBLoaderGoogleDriveLoaderSharePointLoader)filedir	in-memoryzcloud-folderc                   @  s   e Zd ZdZdZdZdS )Routesz2Routes available for the Pebblo API as enumerator.z/v1/loader/docz/v1/app/discoverN)__name__
__module____qualname____doc__
loader_docloader_app_discover r#   r#   H/tmp/pip-unpacked-wheel-9gdii04g/langchain_community/utilities/pebblo.pyr   C   s   r   c                   @  s   e Zd ZU dZded< dS )IndexedDocumentzPebblo Indexed Document.strpb_idNr   r   r   r    __annotations__r#   r#   r#   r$   r%   J   s   
r%   c                   @  sn   e Zd ZU dZdZded< ded< ded< dZded	< ded
< ded< ded< ded< ded< dZded< dS )RuntimezPebblo Runtime.localr&   typehostpath Optional[str]ipplatformos
os_versionlanguagelanguage_versionruntimeN)r   r   r   r    r,   r)   r1   r7   r#   r#   r#   r$   r*   Q   s   
r*   c                   @  s"   e Zd ZU dZded< ded< dS )	FrameworkzPebblo Framework instance.r&   nameversionNr(   r#   r#   r#   r$   r8   j   s   
r8   c                   @  sR   e Zd ZU dZded< ded< ded< ded< ded	< d
ed< ded< d
ed< dS )AppzPebblo AI application.r&   r9   ownerr0   descriptionload_idr*   r7   r8   	frameworkplugin_versionclient_versionNr(   r#   r#   r#   r$   r;   s   s   
r;   c                   @  sZ   e Zd ZU dZded< ded< ded< ded< ded< d	ed
< ded< ded< ded< dS )DoczPebblo document.r&   r9   r<   listdocsr@   r>   dictloader_detailsboolloading_endsource_ownerclassifier_locationNr(   r#   r#   r#   r$   rB      s   
rB   r&   )r.   returnc                 C  sF   | r d| ks d| d ks | dkr$| S t | }| r>| }t|S )zReturn an absolute local path for a local file/directory,
    for a network related path, return as is.

    Args:
        path (str): Relative path to be resolved.

    Returns:
        str: Resolved absolute path.
    z:///r   )unknown-r   )pathlibPathexistsresolver&   )r.   	full_pathr#   r#   r$   get_full_path   s    

rT   )loaderrK   c                 C  s&   t  D ]\}}| |kr|  S qdS )zReturn loader type among, file, dir or in-memory.

    Args:
        loader (str): Name of the loader, whose type is to be resolved.

    Returns:
        str: One of the loader type among, file/dir/in-memory.
    unsupported)LOADER_TYPE_MAPPINGitems)rU   loader_typeloadersr#   r#   r$   get_loader_type   s    	
r[   r   c                 C  s   ddl m}m}m}m} d}t| ts4td |S | j	}zd|krt| |rfd| j
 d| j }nt| |rd| j
 d| j }nRd	|kr|d	 }|rd
|kr|d
 }|r| d| }nd|kr|d }nd|kr|d }nd|kr$|d }|rt|trt|dkr|d }nt| |r6d}nt| |rPd| j }n| jjdkr|dr|d}	d|	 }nZ|dr|dg }
ddd |
D }n,|dr|dg }ddd |D }W n tk
r   Y nX tt|S )zReturn an absolute source path of source of loader based on the
    keys present in Document.

    Args:
        loader (BaseLoader): Langchain document loader, derived from Baseloader.
    r   )r   r   r   r   rN   zGloader is not derived from BaseLoader, source location will be unknown!bucketzgc://rL   zs3://sourcechannelr.   	file_path	web_pathsr   znotiondb://r   	folder_idz+https://drive.google.com/drive/u/2/folders/file_idsz, c                 S  s   g | ]}d | dqS )z https://drive.google.com/file/d/z/viewr#   ).0Zfile_idr#   r#   r$   
<listcomp>   s   z(get_loader_full_path.<locals>.<listcomp>document_idsc                 S  s   g | ]}d | dqS )z#https://docs.google.com/document/d/z/editr#   )rc   doc_idr#   r#   r$   rd     s   )Z$langchain_community.document_loadersr   r   r   r   
isinstancer   loggererror__dict__r\   ZblobkeyrC   lenZdatabase_id	__class__r   getjoin	ExceptionrT   r&   )rU   r   r   r   r   locationZloader_dictr^   r`   ra   rb   re   r#   r#   r$   get_loader_full_path   sj    




 

rr   zTuple[Framework, Runtime])rK   c                  C  s   t  } td| ddd}t }t|jtjd | dd|j	|j
t | dd| d	dd
}d|jkrvd|_d|_td|  td|  ||fS )zFetch the current Framework and Runtime details.

    Returns:
        Tuple[Framework, Runtime]: Framework and Runtime for the current app instance.
    Z	langchainZlibrary_versionN)r9   r:   ZPWDr2   rM   r7   Zruntime_version)r-   r.   r2   r3   r4   r1   r5   r6   DarwinZdesktopzMac OSXz
framework zruntime )r   r8   rn   r2   unamer*   noder3   environsystemr:   get_ipr,   r7   rh   debug)Zruntime_envr?   rt   r7   r#   r#   r$   get_runtime  s,     




rz   c                  C  sB   ddl } |  }z| |}W n tk
r<   | d}Y nX |S )zJFetch local runtime ip address.

    Returns:
        str: IP address
    r   N	localhost)socketgethostnamegethostbynamerp   )r|   r-   Z	public_ipr#   r#   r$   rx   ,  s    rx   zList[Document]intzList[List[Document]])rD   max_batch_sizerK   c                 C  s~   g }g }d}| D ]Z}t |jd}||kr:||g q|| |krX|| g }d}|| ||7 }q|rz|| |S )a  
    Generate batches of documents based on page_content size.
    Args:
        docs: List of documents to be batched.
        max_batch_size: Maximum size of each batch in bytes. Defaults to 100*1024(100KB)
    Returns:
        List[List[Document]]: List of batches of documents
    r   utf-8)rl   page_contentencodeappend)rD   r   ZbatchesZcurrent_batchZcurrent_batch_sizedocZdoc_sizer#   r#   r$   generate_size_based_batches<  s     



r   )r_   rK   c                 C  sB   z$ddl }t| j}||j}W n tk
r<   d}Y nX |S )zFetch owner of local file path.

    Args:
        file_path (str): Local file path.

    Returns:
        str: Name of owner.
    r   NrM   )pwdr3   statst_uidgetpwuidpw_namerp   )r_   r   Zfile_owner_uidZfile_owner_namer#   r#   r$   get_file_owner_from_pathd  s    	
r   )source_pathrK   c                 C  s   | sdS d}t j| r&t j| }n^t j| rd}t | D ]>\}}}|D ].}t j||}t j|sN|t j|7 }qNq@|}|S )zFetch size of source path. Source can be a directory or a file.

    Args:
        source_path (str): Local path of data source.

    Returns:
        int: Source size in bytes.
    r   )r3   r.   isfilegetsizeisdirwalkro   islink)r   sizeZ
total_sizedirpath_	filenamesffpr#   r#   r$   get_source_sizew  s    	r   )datarK   c                 C  s   |  d}t|}|S )zCalculate the content size in bytes:
    - Encode the string to bytes using a specific encoding (e.g., UTF-8)
    - Get the length of the encoded bytes.

    Args:
        data (str): Data string.

    Returns:
        int: Size of string in bytes.
    r   )r   rl   )r   Zencoded_contentr   r#   r#   r$   calculate_content_size  s    
r   c                	      s   e Zd ZU dZded< dZded< ded< ded< d	d
 fddZdddddZd2ddddddddZdddddZ	d3dddddZ
ddddd ddd!d"d#Zed4dddd&d d'd(d)d*Zedddd+d,d-d.Zedddd/d0d1Z  ZS )5PebbloLoaderAPIWrapperzWrapper for Pebblo Loader API.r0   api_keyr+   r&   rJ   classifier_url	cloud_urlr   )kwargsc                   sH   t |ddd|d< t |ddt|d< t |ddt|d< t jf | dS )	z%Validate that api key in environment.r   ZPEBBLO_API_KEYr/   r   ZPEBBLO_CLASSIFIER_URLr   ZPEBBLO_CLOUD_URLN)r   _DEFAULT_CLASSIFIER_URL_DEFAULT_PEBBLO_CLOUD_URLsuper__init__)selfr   rm   r#   r$   r     s&             zPebbloLoaderAPIWrapper.__init__r;   None)apprK   c           	      C  s   d}|j dd}| jdkrD|  }| j tjj }| d|||}| jr| jdd}|rzt	
|jd}|d|i |dti | j tjj }| d|||}dS )	z
        Send app discovery request to Pebblo server & cloud.

        Args:
            app (App): App instance to be discovered.
        NTZexclude_unsetr+   POSTcloud_requestpebblo_server_versionZpebblo_client_version)rE   rJ   _make_headersr   r   r"   valuemake_requestr   jsonloadstextrn   updatePLUGIN_VERSIONr   )	r   r   pebblo_resppayloadheadersZapp_discover_urlr   pebblo_cloud_urlr   r#   r#   r$   send_loader_discover  s"    
z+PebbloLoaderAPIWrapper.send_loader_discoverFzList[IndexedDocument]rE   rG   )docs_with_idr   rF   rH   rK   c              
   C  s(  | dd}t|}| |||\}}| ||||||}	i }
| jdkr|  }| j tjj	 }zF| 
d|||	d}|rt|j dg D ]}|
|d |i qW n. tk
r } ztd| W 5 d	}~X Y nX | jr| jdkr| |	d |
 | |	 n| jd
kr$td td|
S )a  
        Send documents to Pebblo server for classification.
        Then send classified documents to Daxa cloud(If api_key is present).

        Args:
            docs_with_id (List[IndexedDocument]): List of documents to be classified.
            app (App): App instance.
            loader_details (dict): Loader details.
            loading_end (bool): Boolean, indicating the halt of data loading by loader.
        r   r/   r+   r   i,  rD   r'   z3An Exception caught in classify_documents: local %sNzpebblo-cloudz4API key is missing for sending docs to Pebblo cloud.)rn   r   prepare_docs_for_classificationbuild_classification_payloadrJ   r   r   r   r!   r   r   r   r   r   r   rp   rh   warningr   update_doc_datasend_docs_to_pebblo_cloud	NameError)r   r   r   rF   rH   r   rI   rD   source_aggregate_sizer   classified_docsr   Zload_doc_urlr   Zclassified_docer#   r#   r$   classify_documents  sT           
    



z)PebbloLoaderAPIWrapper.classify_documents)r   rK   c              
   C  sf   | j dd}| j tjj }z| d|||}W n. tk
r` } ztd| W 5 d}~X Y nX dS )z
        Send documents to Pebblo cloud.

        Args:
            payload (dict): The payload containing documents to be sent.
        Tr   r   z3An Exception caught in classify_documents: cloud %sN)	r   r   r   r!   r   r   rp   rh   r   )r   r   r   r   r   r   r#   r#   r$   r     s    z0PebbloLoaderAPIWrapper.send_docs_to_pebblo_cloud)r   rK   c                 C  s4   ddd}|r0| j r&|d| j i n
td |S )z
        Generate headers for the request.

        args:
            cloud_request (bool): flag indicating whether the request is for Pebblo
            cloud.
        returns:
            dict: Headers for the request.

        zapplication/json)AcceptzContent-Typez	x-api-keyz,API key is missing for Pebblo cloud request.)r   r   rh   r   )r   r   r   r#   r#   r$   r   "  s    
z$PebbloLoaderAPIWrapper._make_headersz
List[dict]r   )r   rD   rF   rI   r   rH   rK   c              
   C  sZ   |j |j|t|j|d|| jd	}|dkrDd|d< d|krD||d d< tf |jdd}|S )	a  
        Build the payload for document classification.

        Args:
            app (App): App instance.
            docs (List[dict]): List of documents to be classified.
            loader_details (dict): Loader details.
            source_owner (str): Owner of the source.
            source_aggregate_size (int): Aggregate size of the source.
            loading_end (bool): Boolean indicating the halt of data loading by loader.

        Returns:
            dict: Payload for document classification.
        false)	r9   r<   rD   r@   r>   rF   rH   rI   rJ   TtruerH   rF   r   r   )r9   r<   r   r>   rJ   rB   rE   )r   r   rD   rF   rI   r   rH   r   r#   r#   r$   r   9  s"    
z3PebbloLoaderAPIWrapper.build_classification_payloadN   zOptional[dict]zOptional[Response])methodurlr   r   timeoutrK   c              
   C  s   zt | ||||d}td| |j jtt|j jr6|j jng t|j |jtj	krht
d|j  n>|jtjkrt
d|j  n|jtjkrt
d|j  |W S  tk
r   t
d| Y n. tk
r } zt
d| W 5 d}~X Y nX dS )	a  
        Make a request to the Pebblo API

        Args:
            method (str): HTTP method (GET, POST, PUT, DELETE, etc.).
            url (str): URL for the request.
            headers (dict): Headers for the request.
            payload (Optional[dict]): Payload for the request (for POST, PUT, etc.).
            timeout (int): Timeout for the request in seconds.

        Returns:
            Optional[Response]: Response object if the request is successful.
        )r   r   r   r   r   z5Request: method %s, url %s, len %s response status %szPebblo Server: Error z$Pebblo received an invalid payload: z-Pebblo returned an unexpected response code: zUnable to reach server %sz'An Exception caught in make_request: %sN)r   rh   ry   r   r&   rl   bodystatus_coder   INTERNAL_SERVER_ERRORr   BAD_REQUESTr   OKr   rp   )r   r   r   r   r   responser   r#   r#   r$   r   d  s:        
z#PebbloLoaderAPIWrapper.make_requestzTuple[List[dict], int])r   r   rF   rK   c              
   C  s8  g }d}dd | D }d}|D ]}| di }| dg }	|d dkr^t| d	|d
 }
nt| d| d	|}
| dt|
}| dt|
}t| d}t|}||7 }| ddpd}|||
|| di  d|d|	rd|	ini |dk	rd|ini  |d dkr|s| d|d
< d}q||fS )a  
        Prepare documents for classification.

        Args:
            docs_with_id (List[IndexedDocument]): List of documents to be classified.
            source_path (str): Source path of the documents.
            loader_details (dict): Contains loader info.

        Returns:
            Tuple[List[dict], int]: Documents and the aggregate size
            of the source.
        r   c                 S  s   g | ]}|  qS r#   )rE   )rc   r   r#   r#   r$   rd     s     zJPebbloLoaderAPIWrapper.prepare_docs_for_classification.<locals>.<listcomp>FmetadataZauthorized_identitiesrU   r   r]   r   rS   r<   r   r   r'   Nlast_modified)r   r   r'   r   Z
file_ownerZsource_path_sizeZsource_full_urlT)rn   rT   r   r   r&   r   r   )r   r   rF   rD   r   Zdoc_contentZsource_path_updater   Zdoc_metadataZdoc_authorized_identitiesZdoc_source_pathZdoc_source_ownerZdoc_source_sizer   Zpage_content_sizerf   r#   r#   r$   r     s`    

 

z6PebbloLoaderAPIWrapper.prepare_docs_for_classification)rD   r   rK   c              
   C  sX   | D ]N}| |d i }|| d| d| di | di d |d qdS )	z
        Update the document data with classified information.

        Args:
            docs (List[dict]): List of document data to be updated.
            classified_docs (dict): The dictionary containing classified documents.
        r'   pb_checksumloader_source_pathentitiestopics)r   r   r   r   r   N)rn   r   pop)rD   r   Zdoc_dataZclassified_datar#   r#   r$   r     s    	

	z&PebbloLoaderAPIWrapper.update_doc_data)F)F)Nr   )r   r   r   r    r)   rJ   r   r   r   r   r   r   staticmethodr   r   r   __classcell__r#   r#   r   r$   r     s(   
$ <+  1Fr   )r   )=
__future__r   r   loggingr3   rO   r2   enumr   httpr   typingr   r   r   r   r	   Zlangchain_core.documentsr
   Zlangchain_core.envr   Zlangchain_core.pydantic_v1r   Zlangchain_core.utilsr   requestsr   r   Zrequests.exceptionsr   Z)langchain_community.document_loaders.baser   	getLoggerr   rh   r   r   r   ZBATCH_SIZE_BYTESZfile_loaderZ
dir_loaderZ	in_memoryZcloud_folderrW   r&   r   r%   r*   r8   r;   rB   rT   r[   rr   rz   rx   r   r   r   r   r   r#   r#   r#   r$   <module>   s   
	H (