U
    hG5                     @   s   d dl Z d dlZd dlZd dlZd dlmZ d dlmZmZm	Z	m
Z
mZmZmZ d dlZd dlmZ d dlmZ d dlmZmZ d dlmZ dZd	Zd
ZdZdZdZdZdZdZ e!e"Z#eddddG dd deeZ$dS )    N)Path)AnyDictListMappingOptionalSequenceUnion)
deprecated)Document)	BaseModelroot_validator)
BaseLoaderz#{http://www.w3.org/1999/xhtml}tablexpathidsourcename	structuretagprojectsz#https://api.docugami.com/v1preview1z0.0.24z1.0z!docugami_langchain.DocugamiLoader)ZsinceZremovalZalternative_importc                   @   s  e Zd ZU dZeZeed< ej	
dZee ed< dZeed< dZeed< d	Zeed
< dZeed< dZeed< dZeed< dZeed< dZeed< ee ed< eee  ed< eeeeef   ed< dZeed< eddeeef eeef dddZd.eee ee  e!e" dddZ#ee!e d d!d"Z$ee!e d d#d$Z%eed%d&d'Z&d/eeee ee  e!e" d(d)d*Z'e!e" d+d,d-Z(dS )0DocugamiLoaderzdLoad from `Docugami`.

    To use, you should have the ``dgml-utils`` python package installed.
    apiZDOCUGAMI_API_KEYaccess_tokeni   max_text_length    min_text_lengthi   max_metadata_lengthFinclude_xml_tagsr   parent_hierarchy_levelsdoc_idparent_id_keysub_chunk_tablesTwhitespace_normalize_text	docset_iddocument_ids
file_paths(include_project_metadata_in_doc_metadata)pre)valuesreturnc                 C   sX   | dr| drtd| ds8| ds8td| drT| dsTtd|S )zValidate that either local file paths are given, or remote API docset ID.

        Args:
            values: The values to validate.

        Returns:
            The validated values.
        r%   r#   z7Cannot specify both file_paths and remote API docset_idz6Must specify either file_paths or remote API docset_idr   z7Must specify access token if using remote API docset_id)get
ValueError)clsr(    r-   Q/tmp/pip-unpacked-wheel-9gdii04g/langchain_community/document_loaders/docugami.pyvalidate_local_or_remoteT   s    
z'DocugamiLoader.validate_local_or_remoteN)contentdocument_nameadditional_doc_metadatar)   c              	      s.  zddl m} W n tk
r,   tdY nX zddlm} ddlm} W n tk
rf   tdY nX |td fdd	}|t	
|}| }	||	jjjjjjd
}
i }|
D ]^}||}|jt}|r|||< |jr||j}|jt}|r|jr||jj< |||< qt| S )z6Parse a single DGML document into a list of Documents.r   etreePCould not import lxml python package. Please install it with `pip install lxml`.)Chunk)
get_chunkszaCould not import from dgml-utils python package. Please install it with `pip install dgml-utils`.)dg_chunkr)   c                    sf   t | j  }t| jt|tt	t
| jt| ji}| j} rPjrP|  t|d j |dS )N)page_contentmetadata)hashlibmd5textencode	hexdigest	XPATH_KEYr   ID_KEYDOCUMENT_NAME_KEYDOCUMENT_SOURCE_KEYSTRUCTURE_KEYr   TAG_KEYr   r&   updater   r   )r8   Z
_hashed_idr:   r=   r2   r1   selfr-   r.   _build_framework_chunk   s,          	
z:DocugamiLoader._parse_dgml.<locals>._build_framework_chunk)r   r   r"   r!   r   r   )lxmlr4   ImportErrorZdgml_utils.modelsr6   Zdgml_utils.segmentationr7   r   parseioBytesIOgetrootr   r   r"   r!   r   r   r:   r*   rA   parentr9   r    listr(   )rH   r0   r1   r2   r4   r6   r7   rI   treerootZ	dg_chunksZframework_chunksr8   Zframework_chunkZchunk_idZframework_parent_chunkZ	parent_idr-   rG   r.   _parse_dgmli   sL    





zDocugamiLoader._parse_dgml)r#   r)   c                 C   s|   | j  d| d}g }|rxtj|dd| j id}|jr^| }||d  |dd}qtd	| d
|j dq|S )z1Gets all document details for the given docset ID	/docsets/z
/documentsAuthorizationBearer )headersZ	documentsnextNFailed to download 
 (status: ))	r   requestsr*   r   okjsonextend	Exceptionstatus_code)rH   r#   urlZall_documentsresponsedatar-   r-   r.   _document_details_for_docset_id   s    z.DocugamiLoader._document_details_for_docset_idc                 C   s~   | j  d| }g }|rztjd|dd| j ii d}|jr`| }||d  |dd}qtd	| d
|j	 dq|S )z0Gets all project details for the given docset IDz/projects?docset.id=GETrV   rW   rX   re   r   rY   NrZ   r[   r\   )
r   r]   requestr   r^   r_   r`   r*   ra   rb   )rH   r#   rc   Zall_projectsrd   re   r-   r-   r.   _project_details_for_docset_id   s"    z-DocugamiLoader._project_details_for_docset_id)projectr)   c              	   C   s  | t}| j d| d}g }i }|rtjd|dd| j ii d}|jrp| }||d  | dd	}q$|j	d
kr~|S t
d| d|j	 dq$|D ]8}| d}	| d}
| d}|	dkr|
r|r|t }i }tjd|
 ddd| j ii d}|jrzddlm} W n tk
r6   tdY nX |t|j}| }|j}|jd|d}|D ]L}|jd|dd j}d|jd|dd   }|d	| j ||< qj|||< qt
d|
 dd q|S )z#Gets project metadata for all filesz
/projects/z/artifacts/latestrg   rV   rW   rh   Z	artifactsrY   Ni  rZ   r[   r\   r   rc   documentzreport-values.xmlz/contentr   r3   r5   z
//pr:Entry)
namespacesz./pr:Heading z
./pr:Valuez	/content z (status: {response.status_code}))r*   rA   r   r]   ri   r   r^   r_   r`   rb   ra   rJ   r4   rK   rL   rM   rN   r0   rO   Znsmapr   r=   joinitertextstripr   )rH   rk   Z
project_idrc   Zall_artifactsZper_file_metadatard   re   ZartifactZartifact_nameZartifact_urlZartifact_docr   r:   r4   Zartifact_treeZartifact_rootnsentriesentryheadingvaluer-   r-   r.   _metadata_for_project   sr    








z$DocugamiLoader._metadata_for_project)document_idr#   r1   additional_metadatar)   c                 C   sj   | j  d| d| d}tjd|dd| j ii d}|jrN| j|j||dS td	| d
|j ddS )zLoad chunks for a document.rU   z/documents/z/dgmlrg   rV   rW   rh   )r0   r1   r2   rZ   r[   r\   N)	r   r]   ri   r   r^   rT   r0   ra   rb   )rH   rx   r#   r1   ry   rc   rd   r-   r-   r.   _load_chunks_for_document'  s     z(DocugamiLoader._load_chunks_for_document)r)   c              
      s"  g } j rԈ jrԈ  j} jr4 fdd|D }  j}i }|r jr|D ]@} |}|D ],}||kr~|| ||< qd|| ||  qdqR|D ]8}|t }	|	t
}
|	|	}| j|	 j|
|d7 }qnJ jr jD ]:}t|}t|d}| j| |jd7 }W 5 Q R X q|S )zLoad documents.c                    s   g | ]}|t   jkr|qS r-   )rA   r$   ).0drH   r-   r.   
<listcomp>K  s     z'DocugamiLoader.load.<locals>.<listcomp>)rx   r#   r1   ry   rb)r0   r1   )r   r#   rf   r$   rj   r&   rw   rF   rA   r*   rB   rz   r%   r   openrT   readr   )rH   chunksZ_document_detailsZ_project_detailsZcombined_project_metadatark   r:   Zfile_iddocr   Zdoc_nameZdoc_metadatapathfiler-   r}   r.   loadC  sF    





zDocugamiLoader.load)NN)NN))__name__
__module____qualname____doc__DEFAULT_API_ENDPOINTr   str__annotations__osenvironr*   r   r   r   intr   r   r   boolr   r    r!   r"   r   r	   r   r&   r   r   r   r/   bytesr   r   r   rT   rf   rj   rw   rz   r   r-   r-   r-   r.   r      sL   
"  KK  r   )%r;   rM   loggingr   pathlibr   typingr   r   r   r   r   r   r	   r]   Zlangchain_core._api.deprecationr
   Zlangchain_core.documentsr   Zlangchain_core.pydantic_v1r   r   Z)langchain_community.document_loaders.baser   Z
TABLE_NAMEr@   rA   rC   rB   rD   rE   ZPROJECTS_KEYr   	getLoggerr   loggerr   r-   r-   r-   r.   <module>   s4   $
