U
    h                     @   sX   d dl mZmZmZmZ d dlmZ er4d dlmZ G dd deZG dd de	Z
dS )	    )TYPE_CHECKINGDictListUnion)UnstructuredFileLoaderchmc                   @   s   e Zd ZdZedddZdS )UnstructuredCHMLoaderar  Load `CHM` files using `Unstructured`.

    CHM means Microsoft Compiled HTML Help.

    Examples
    --------
    from langchain_community.document_loaders import UnstructuredCHMLoader

    loader = UnstructuredCHMLoader("example.chm")
    docs = loader.load()

    References
    ----------
    https://github.com/dottedmag/pychm
    http://www.jedrea.com/chmlib/
    returnc              
      sF   ddl m  tj&} fdd| D W  5 Q R  S Q R X d S )Nr   )partition_htmlc                    s$   g | ]} f d |d ij qS )textcontent)Zunstructured_kwargs).0itemr   self L/tmp/pip-unpacked-wheel-9gdii04g/langchain_community/document_loaders/chm.py
<listcomp>   s   z7UnstructuredCHMLoader._get_elements.<locals>.<listcomp>)Zunstructured.partition.htmlr   	CHMParser	file_pathload_all)r   fr   r   r   _get_elements   s
    z#UnstructuredCHMLoader._get_elementsN)__name__
__module____qualname____doc__r   r   r   r   r   r   r	   	   s   r	   c                   @   s   e Zd ZU dZeed< ded< edddZdd	 Zd
d Ze	edddZ
eeeef  dddZeeef edddZeeeef  dddZdS )r   z*Microsoft Compiled HTML Help (CHM) Parser.pathzchm.CHMFilefile)r   c                 C   s,   ddl m } || _| | _| j| d S )Nr   r   )r   r   ZCHMFiler    ZLoadCHM)r   r   r   r   r   r   __init__+   s    
zCHMParser.__init__c                 C   s   | S Nr   r   r   r   r   	__enter__2   s    zCHMParser.__enter__c                 C   s   | j r| j   d S r"   )r    ZCloseCHM)r   exc_type	exc_value	tracebackr   r   r   __exit__5   s    zCHMParser.__exit__r
   c                 C   s   | j  dS )Nutf-8)r    ZGetEncodingdecoder#   r   r   r   encoding9   s    zCHMParser.encodingc           
      C   s   ddl m} ddlm} g }| j | j}||}|dD ]z}d}d}|dD ],}	|	d dkrn|	d	 }|	d d
krV|	d	 }qV|r@|sq@||j	}|
dsd| }|||d q@|S )Nr   )urlparse)BeautifulSoupobject paramnameNamevalueZLocal/)r1   local)urllib.parser,   Zbs4r-   r    ZGetTopicsTreer*   r+   Zfind_allr   
startswithappend)
r   r,   r-   resindexZsoupobjr1   r5   r0   r   r   r   r:   =   s(    


zCHMParser.index)r   r   c                 C   s<   t |tr|d}| j|d }| j|d | jS )Nr)      )
isinstancestrencoder    ZResolveObjectZRetrieveObjectr*   r+   )r   r   r;   r   r   r   loadZ   s    

zCHMParser.loadc                 C   sB   g }|   }|D ],}| |d }||d |d |d q|S )Nr5   r1   )r1   r5   r   )r:   r@   r8   )r   r9   r:   r   r   r   r   r   r   `   s    zCHMParser.load_allN)r   r   r   r   r>   __annotations__r!   r$   r(   propertyr+   r   r   r:   r   bytesr@   r   r   r   r   r   r   %   s   
r   N)typingr   r   r   r   Z1langchain_community.document_loaders.unstructuredr   r   r	   r.   r   r   r   r   r   <module>   s
   