U
    hQ                     @   s   d dl Z d dlZd dlZd dlmZmZ d dlmZmZm	Z	m
Z
mZ d dlmZ d dlmZ G dd deeZG dd	 d	eZG d
d deZdS )    N)ABCabstractmethod)DictIteratorOptionalTupleUnion)Document)
BaseLoaderc                   @   s  e Zd ZdZdddddddd	d	dddd
d
deeeeeef eeeeeeeeef eeef eeef ee ee d
dddZ	e
e dddZeedddZeedddZeee
e dddZd eeee e
e dddZeeeef dddZd
S )!DedocBaseLoadera  
    Base Loader that uses `dedoc` (https://dedoc.readthedocs.io).

    Loader enables extracting text, tables and attached files from the given file:
        * `Text` can be split by pages, `dedoc` tree nodes, textual lines
            (according to the `split` parameter).
        * `Attached files` (when with_attachments=True)
            are split according to the `split` parameter.
            For attachments, langchain Document object has an additional metadata field
            `type`="attachment".
        * `Tables` (when with_tables=True) are not split - each table corresponds to one
            langchain Document object.
            For tables, Document object has additional metadata fields `type`="table"
            and `text_as_html` with table HTML representation.
    documentTF
   
auto_tabbyrus+eng:autoN)splitwith_tableswith_attachmentsrecursion_deep_attachmentspdf_with_text_layerlanguagepagesis_one_column_documentdocument_orientationneed_header_footer_analysisneed_binarizationneed_pdf_table_analysis	delimiterencoding)	file_pathr   r   r   r   r   r   r   r   r   r   r   r   r   r   returnc                C   s   dd t   D | _ddddh| _|| jkrFtd| d| j d	|| _|| _|| _| jdkrfd
nd}|| jd< || jd< dS )a
  
        Initialize with file path and parsing parameters.

        Args:
            file_path: path to the file for processing
            split: type of document splitting into parts (each part is returned
                separately), default value "document"
                "document": document text is returned as a single langchain Document
                    object (don't split)
                "page": split document text into pages (works for PDF, DJVU, PPTX, PPT,
                    ODP)
                "node": split document text into tree nodes (title nodes, list item
                    nodes, raw text nodes)
                "line": split document text into lines
            with_tables: add tables to the result - each table is returned as a single
                langchain Document object

            Parameters used for document parsing via `dedoc`
                (https://dedoc.readthedocs.io/en/latest/parameters/parameters.html):

                with_attachments: enable attached files extraction
                recursion_deep_attachments: recursion level for attached files
                    extraction, works only when with_attachments==True
                pdf_with_text_layer: type of handler for parsing PDF documents,
                    available options
                    ["true", "false", "tabby", "auto", "auto_tabby" (default)]
                language: language of the document for PDF without a textual layer and
                    images, available options ["eng", "rus", "rus+eng" (default)],
                    the list of languages can be extended, please see
                    https://dedoc.readthedocs.io/en/latest/tutorials/add_new_language.html
                pages: page slice to define the reading range for parsing PDF documents
                is_one_column_document: detect number of columns for PDF without
                    a textual layer and images, available options
                    ["true", "false", "auto" (default)]
                document_orientation: fix document orientation (90, 180, 270 degrees)
                    for PDF without a textual layer and images, available options
                    ["auto" (default), "no_change"]
                need_header_footer_analysis: remove headers and footers from the output
                    result for parsing PDF and images
                need_binarization: clean pages background (binarize) for PDF without a
                    textual layer and images
                need_pdf_table_analysis: parse tables for PDF without a textual layer
                    and images
                delimiter: column separator for CSV, TSV files
                encoding: encoding of TXT, CSV, TSV
        c                 S   s   i | ]\}}|d kr||qS )>   r   r   r    self ).0keyvaluer#   r#   N/tmp/pip-unpacked-wheel-9gdii04g/langchain_community/document_loaders/dedoc.py
<dictcomp>d   s    z,DedocBaseLoader.__init__.<locals>.<dictcomp>r   pagenodelineGot $ for `split`, but should be one of ``treeZlinearstructure_typeZneed_content_analysisN)localsitemsparsing_parametersvalid_split_values
ValueErrorr   r   r    )r"   r    r   r   r   r   r   r   r   r   r   r   r   r   r   r   r0   r#   r#   r'   __init__#   s    A

zDedocBaseLoader.__init__r!   c              	   c   s   ddl }zddlm} W n tk
r4   tdY nX ||  d}d|jd _|  }|j| j	| j
d|id	}W 5 Q R X | j|  | jd
E dH  dS )Lazily load documents.r   N)DedocManagerzE`dedoc` package not found, please install it with `pip install dedoc`)Zmanager_configTloggerZattachments_dir)r    
parametersdocument_treer   )tempfileZdedocr9   ImportError_make_configconfigdisabledTemporaryDirectoryparser    r3   _split_documentZto_api_schemadictr   )r"   r>   r9   Zdedoc_managerZtmpdirr=   r#   r#   r'   	lazy_loadw   s$    


 zDedocBaseLoader.lazy_loadc                 C   s   dS )zu
        Make configuration for DedocManager according to the file extension and
        parsing parameters.
        Nr#   r"   r#   r#   r'   r@      s    zDedocBaseLoader._make_config)	paragraphr!   c                    s>   d  fdd|d D }|r2|d  d| n|d }|S )z1Get text (recursively) of the document tree node.
c                    s   g | ]}  |qS r#   )	_json2txt)r$   subparagraphrH   r#   r'   
<listcomp>   s   z-DedocBaseLoader._json2txt.<locals>.<listcomp>subparagraphstext)join)r"   rI   Zsubparagraphs_textrO   r#   rH   r'   rK      s    
zDedocBaseLoader._json2txt)r=   document_metadatar!   c                 c   sR   t |d dkr4|d D ]}| j||dE dH  qnt|d ||d dV  dS )z4Parse recursively document tree obtained by `dedoc`.rN   r   r=   rQ   NrO   metadataZpage_contentrS   )len_parse_subparagraphsr	   )r"   r=   rQ   rL   r#   r#   r'   rV      s     
z$DedocBaseLoader._parse_subparagraphs)r=   r   additional_metadatar!   c                 c   s  |d }|r||}|dkrB| j |d d d}t||dV  n|dkr|d d d }|d	 d d
 }d}|D ]P}	|	d d
 |kr||  |	7 }qrt||d
|idV  |	d d
 }|  |	}qrt||d
|idV  n|dkr|d d d D ]$}	|	d }
t|  |	||
dV  qn@|dkrF| j|d d |dE dH  ntd| d| j d| jr|d d D ]0}| |\}}t||d d|ddV  qr|d D ]"}| j|| jddidE dH  qdS )z=Split document into parts according to the `split` parameter.rS   r   contentZ	structure)rI   rT   r)   rN   r   page_id r+   r*   rR   Nr,   r-   r.   Ztablestable)typeZtext_as_htmlattachmentsr\   
attachment)r=   r   rW   )	rK   r	   rV   r5   r4   r   
_get_tablerE   r   )r"   r=   r   rW   rQ   rO   ZnodesrY   Z	page_textr*   Zline_metadatar[   
table_text
table_htmlr^   r#   r#   r'   rE      sl    





	zDedocBaseLoader._split_document)r[   r!   c              
   C   s   d}|d D ]:}|D ](}|d dd |d D 7 }|d7 }q|d7 }qd	}|d D ]|}|d
7 }|D ]b}d dd |d D }t|}|d7 }|d r|d7 }|d|d  d|d  d| d7 }qd|d7 }qT|d7 }||fS )z.Get text and HTML representation of the table.rZ   cells c                 s   s   | ]}|d  V  qdS rO   Nr#   r$   r+   r#   r#   r'   	<genexpr>  s     z-DedocBaseLoader._get_table.<locals>.<genexpr>lines	rJ   zK<table border="1" style="border-collapse: collapse; width: 100%;">
<tbody>
z<tr>
c                 s   s   | ]}|d  V  qdS rd   r#   re   r#   r#   r'   rf     s     z<tdZ	invisiblez style="display: none" z
 colspan="Zcolspanz" rowspan="Zrowspanz">z</td>
z</tr>
z</tbody>
</table>)rP   htmlescape)r"   r[   r`   rowcellra   Z	cell_textr#   r#   r'   r_      s,    



zDedocBaseLoader._get_table)N)__name__
__module____qualname____doc__strboolr   intr   r6   r   r	   rG   r   rF   r@   rK   rV   rE   r   r_   r#   r#   r#   r'   r      sb   



T  Kr   c                   @   s   e Zd ZdZedddZdS )DedocFileLoaderaw  
    DedocFileLoader document loader integration to load files using `dedoc`.

    The file loader automatically detects the file type (with the correct extension).
    The list of supported file types is gives at
    https://dedoc.readthedocs.io/en/latest/index.html#id1.
    Please see the documentation of DedocBaseLoader to get more details.

    Setup:
        Install ``dedoc`` package.

        .. code-block:: bash

            pip install -U dedoc

    Instantiate:
        .. code-block:: python

            from langchain_community.document_loaders import DedocFileLoader

            loader = DedocFileLoader(
                file_path="example.pdf",
                # split=...,
                # with_tables=...,
                # pdf_with_text_layer=...,
                # pages=...,
                # ...
            )

    Load:
        .. code-block:: python

            docs = loader.load()
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Some text
            {
                'file_name': 'example.pdf',
                'file_type': 'application/pdf',
                # ...
            }

    Lazy load:
        .. code-block:: python

            docs = []
            docs_lazy = loader.lazy_load()

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Some text
            {
                'file_name': 'example.pdf',
                'file_type': 'application/pdf',
                # ...
            }
    r7   c                 C   s    ddl m} || j| j| jdS )Nr   )make_manager_config)r    Zparsing_paramsr   )Zdedoc.utils.langchainru   r    r3   r   )r"   ru   r#   r#   r'   r@   `  s    zDedocFileLoader._make_configN)rm   rn   ro   rp   rF   r@   r#   r#   r#   r'   rt     s   Brt   c                       s   e Zd ZdZdddddddd	d
d
ddddddeeeeeeef eeeeeeeeef eeef eeef ee ee dd fddZ	e
e dddZedddZeeeeeeeeef f dddZ  ZS )DedocAPIFileLoaderaU  
    Load files using `dedoc` API.
    The file loader automatically detects the file type (even with the wrong extension).
    By default, the loader makes a call to the locally hosted `dedoc` API.
    More information about `dedoc` API can be found in `dedoc` documentation:
        https://dedoc.readthedocs.io/en/latest/dedoc_api_usage/api.html

    Please see the documentation of DedocBaseLoader to get more details.

    Setup:
        You don't need to install `dedoc` library for using this loader.
        Instead, the `dedoc` API needs to be run.
        You may use Docker container for this purpose.
        Please see `dedoc` documentation for more details:
            https://dedoc.readthedocs.io/en/latest/getting_started/installation.html#install-and-run-dedoc-using-docker

        .. code-block:: bash

            docker pull dedocproject/dedoc
            docker run -p 1231:1231

    Instantiate:
        .. code-block:: python

            from langchain_community.document_loaders import DedocAPIFileLoader

            loader = DedocAPIFileLoader(
                file_path="example.pdf",
                # url=...,
                # split=...,
                # with_tables=...,
                # pdf_with_text_layer=...,
                # pages=...,
                # ...
            )

    Load:
        .. code-block:: python

            docs = loader.load()
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Some text
            {
                'file_name': 'example.pdf',
                'file_type': 'application/pdf',
                # ...
            }

    Lazy load:
        .. code-block:: python

            docs = []
            docs_lazy = loader.lazy_load()

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

        .. code-block:: python

            Some text
            {
                'file_name': 'example.pdf',
                'file_type': 'application/pdf',
                # ...
            }
    zhttp://0.0.0.0:1231r   TFr   r   r   r   r   N)urlr   r   r   r   r   r   r   r   r   r   r   r   r   r   )r    rw   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r!   c                   s>   t  j||||||||	|
||||||d || _d| jd< dS )a
  Initialize with file path, API url and parsing parameters.

        Args:
            file_path: path to the file for processing
            url: URL to call `dedoc` API
            split: type of document splitting into parts (each part is returned
                separately), default value "document"
                "document": document is returned as a single langchain Document object
                    (don't split)
                "page": split document into pages (works for PDF, DJVU, PPTX, PPT, ODP)
                "node": split document into tree nodes (title nodes, list item nodes,
                    raw text nodes)
                "line": split document into lines
            with_tables: add tables to the result - each table is returned as a single
                langchain Document object

            Parameters used for document parsing via `dedoc`
                (https://dedoc.readthedocs.io/en/latest/parameters/parameters.html):

                with_attachments: enable attached files extraction
                recursion_deep_attachments: recursion level for attached files
                    extraction, works only when with_attachments==True
                pdf_with_text_layer: type of handler for parsing PDF documents,
                    available options
                    ["true", "false", "tabby", "auto", "auto_tabby" (default)]
                language: language of the document for PDF without a textual layer and
                    images, available options ["eng", "rus", "rus+eng" (default)],
                    the list of languages can be extended, please see
                    https://dedoc.readthedocs.io/en/latest/tutorials/add_new_language.html
                pages: page slice to define the reading range for parsing PDF documents
                is_one_column_document: detect number of columns for PDF without
                    a textual layer and images, available options
                    ["true", "false", "auto" (default)]
                document_orientation: fix document orientation (90, 180, 270 degrees)
                    for PDF without a textual layer and images, available options
                    ["auto" (default), "no_change"]
                need_header_footer_analysis: remove headers and footers from the output
                    result for parsing PDF and images
                need_binarization: clean pages background (binarize) for PDF without a
                    textual layer and images
                need_pdf_table_analysis: parse tables for PDF without a textual layer
                    and images
                delimiter: column separator for CSV, TSV files
                encoding: encoding of TXT, CSV, TSV
        )r    r   r   r   r   r   r   r   r   r   r   r   r   r   r   jsonZreturn_formatN)superr6   rw   r3   )r"   r    rw   r   r   r   r   r   r   r   r   r   r   r   r   r   r   	__class__r#   r'   r6     s&    AzDedocAPIFileLoader.__init__r7   c                 c   s0   | j | j| j| jd}| j|| jdE dH  dS )r8   )rw   r    r;   r<   N)
_send_filerw   r    r3   rE   r   )r"   Zdoc_treer#   r#   r'   rG   	  s      zDedocAPIFileLoader.lazy_loadc                 C   s   i S )Nr#   rH   r#   r#   r'   r@     s    zDedocAPIFileLoader._make_config)rw   r    r;   r!   c           
   	   C   s~   ddl }tj|}t|d(}d||fi}|j| d||d}W 5 Q R X |jdkrjtd|j	  t
|j	 }	|	S )	z7Send POST-request to `dedoc` API and return the resultsr   Nrbfilez/upload)filesdata   zError during file handling: )requestsospathbasenameopenpoststatus_coder5   rX   decoderx   loads)
r"   rw   r    r;   r   	file_namer~   r   rresultr#   r#   r'   r|     s     
zDedocAPIFileLoader._send_file)rm   rn   ro   rp   rq   rr   r   rs   r   r6   r   r	   rG   rF   r@   r   listr|   __classcell__r#   r#   rz   r'   rv   j  sT   M



U  rv   )ri   rx   r   abcr   r   typingr   r   r   r   r   Zlangchain_core.documentsr	   Z)langchain_community.document_loaders.baser
   r   rt   rv   r#   r#   r#   r'   <module>   s     M