U
    hb
                     @   sf   d dl Z d dlZd dlmZ d dlmZmZmZ d dlm	Z	 d dl
mZ eeZG dd deZdS )    N)Path)DictIteratorUnion)Document)
BaseLoaderc                   @   sR   e Zd ZdZd
eeef eedf eedf eddddZe	e
 ddd	ZdS )MHTMLLoaderz)Parse `MHTML` files with `BeautifulSoup`.N )	file_pathopen_encoding	bs_kwargsget_text_separatorreturnc                 C   sV   zddl }W n tk
r(   tdY nX || _|| _|dkrFddi}|| _|| _dS )a  initialize with path, and optionally, file encoding to use, and any kwargs
        to pass to the BeautifulSoup object.

        Args:
            file_path: Path to file to load.
            open_encoding: The encoding to use when opening the file.
            bs_kwargs: Any kwargs to pass to the BeautifulSoup object.
            get_text_separator: The separator to use when getting the text
                from the soup.
        r   NzUbeautifulsoup4 package not found, please install it with `pip install beautifulsoup4`featureslxml)bs4ImportErrorr
   r   r   r   )selfr
   r   r   r   r    r   N/tmp/pip-unpacked-wheel-9gdii04g/langchain_community/document_loaders/mhtml.py__init__   s    
zMHTMLLoader.__init__)r   c              	   c   s   ddl m} t| jd| jd}t| }| }t	|t
sF|g}|D ]}| dkrJ|jdd }||f| j}|| j}|jrt|jj}	nd}	t| j|	d	}
t||
d
V   W 5 Q R  dS qJW 5 Q R X dS )z*Load MHTML document into document objects.r   )BeautifulSoupr)encodingz	text/htmlT)decoder	   )sourcetitle)Zpage_contentmetadataN)r   r   openr
   r   emailmessage_from_stringreadget_payload
isinstancelistget_content_typer   r   Zget_textr   r   strstringr   )r   r   fmessagepartsparthtmlZsouptextr   r   r   r   r   	lazy_load0   s&    
zMHTMLLoader.lazy_load)NNr	   )__name__
__module____qualname____doc__r   r&   r   dictr   r   r   r.   r   r   r   r   r      s      


 r   )r   loggingpathlibr   typingr   r   r   Zlangchain_core.documentsr   Z)langchain_community.document_loaders.baser   	getLoggerr/   loggerr   r   r   r   r   <module>   s   
