from __future__ import annotations

import copy
import pathlib
from io import BytesIO, StringIO
from typing import Any, Dict, Iterable, List, Optional, Tuple, TypedDict, cast

import requests
from langchain_core.documents import Document

from langchain_text_splitters.character import RecursiveCharacterTextSplitter


class ElementType(TypedDict):
    """Element type as typed dict."""

    url: str
    xpath: str
    content: str
    metadata: Dict[str, str]

class HTMLHeaderTextSplitter:
    """
    Splitting HTML files based on specified headers.
    Requires lxml package.
    """

    def __init__(
        self,
        headers_to_split_on: List[Tuple[str, str]],
        return_each_element: bool = False,
    ):
        """Create a new HTMLHeaderTextSplitter.

        Args:
            headers_to_split_on: list of tuples of headers we want to track mapped to
                (arbitrary) keys for metadata. Allowed header values: h1, h2, h3, h4,
                h5, h6 e.g. [("h1", "Header 1"), ("h2", "Header 2")].
            return_each_element: Return each element w/ associated headers.
        """
        # Output element-by-element or aggregated into chunks w/ common headers.
        self.return_each_element = return_each_element
        self.headers_to_split_on = sorted(headers_to_split_on)

    def aggregate_elements_to_chunks(
        self, elements: List[ElementType]
    ) -> List[Document]:
        """Combine elements with common metadata into chunks.

        Args:
            elements: HTML element content with associated identifying info and metadata
        """
        aggregated_chunks: List[ElementType] = []
        for element in elements:
            if (
                aggregated_chunks
                and aggregated_chunks[-1]["metadata"] == element["metadata"]
            ):
                # Same metadata as the previous element: merge the content.
                aggregated_chunks[-1]["content"] += "  \n" + element["content"]
            else:
                aggregated_chunks.append(element)
        return [
            Document(page_content=chunk["content"], metadata=chunk["metadata"])
            for chunk in aggregated_chunks
        ]

    def split_text_from_url(self, url: str, **kwargs: Any) -> List[Document]:
        """Split HTML from web URL.

        Args:
            url: web URL
            **kwargs: Arbitrary additional keyword arguments. These are usually passed
                to the fetch url content request.
        """
        r = requests.get(url, **kwargs)
        return self.split_text_from_file(BytesIO(r.content))

    def split_text(self, text: str) -> List[Document]:
        """Split HTML text string.

        Args:
            text: HTML text
        """
        return self.split_text_from_file(StringIO(text))

    def split_text_from_file(self, file: Any) -> List[Document]:
        """Split HTML file.

        Args:
            file: HTML file
        """
        try:
            from lxml import etree
        except ImportError as e:
            raise ImportError(
                "Unable to import lxml, please install with `pip install lxml`."
            ) from e
        # Parse the HTML (utf-8 keeps non-English content intact) and apply the
        # structure-aware XSLT transform shipped with the package.
        parser = etree.HTMLParser(encoding="utf-8")
        tree = etree.parse(file, parser)
        xslt_path = pathlib.Path(__file__).parent / "xsl/html_chunks_with_headers.xslt"
        xslt_tree = etree.parse(xslt_path)
        transform = etree.XSLT(xslt_tree)
        result = transform(tree)
        result_dom = etree.fromstring(str(result))

        # Create filter and mapping for header metadata, plus the xhtml namespace map.
        header_filter = [header[0] for header in self.headers_to_split_on]
        header_mapping = dict(self.headers_to_split_on)
        ns_map = {"h": "http://www.w3.org/1999/xhtml"}

        # Build list of elements from the transformed DOM.
        elements = []
        for element in result_dom.findall("*//*", ns_map):
            if element.findall("*[@class='headers']") or element.findall(
                "*[@class='chunk']"
            ):
                elements.append(
                    ElementType(
                        url=file,
                        xpath="".join(
                            node.text or ""
                            for node in element.findall("*[@class='xpath']", ns_map)
                        ),
                        content="".join(
                            node.text or ""
                            for node in element.findall("*[@class='chunk']", ns_map)
                        ),
                        metadata={
                            # Text of the tracked headers, keyed by their mapped names.
                            header_mapping[node.tag]: node.text or ""
                            for node in filter(
                                lambda x: x.tag in header_filter,
                                element.findall("*[@class='headers']/*", ns_map),
                            )
                        },
                    )
                )

        if not self.return_each_element:
            return self.aggregate_elements_to_chunks(elements)
        return [
            Document(page_content=chunk["content"], metadata=chunk["metadata"])
            for chunk in elements
        ]

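# Example usage of HTMLHeaderTextSplitter (illustrative sketch; the HTML snippet
# below is made up and not part of the library):
#
#     headers_to_split_on = [("h1", "Header 1"), ("h2", "Header 2")]
#     splitter = HTMLHeaderTextSplitter(headers_to_split_on)
#     docs = splitter.split_text(
#         "<h1>Intro</h1><p>Hello.</p><h2>Details</h2><p>World.</p>"
#     )
#     # Each returned Document carries the text of its enclosing headers in
#     # metadata, e.g. {"Header 1": "Intro", "Header 2": "Details"}.
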
ddddZdddddZd#ddddddZdddddZdddddZ	dddd d!Z
dS )$HTMLSectionSplitterz`
    Splitting HTML files based on specified tag and font sizes.
    Requires lxml package.
    Nr   zOptional[str]r   None)r    re   r3   r'   c                 K  sD   t || _|dkr*ttjd  | _nt| | _|| _dS )a  Create a new HTMLSectionSplitter.

        Args:
            headers_to_split_on: list of tuples of headers we want to track mapped to
                (arbitrary) keys for metadata. Allowed header values: h1, h2, h3, h4,
                h5, h6 e.g. [("h1", "Header 1"), ("h2", "Header 2"].
            xslt_path: path to xslt file for document transformation.
            Uses a default if not passed.
            Needed for html contents that using different format and layouts.
        Nzxsl/converting_to_header.xslt)	r^   r    rX   rY   rZ   r[   absolutere   r3   )r#   r    re   r3   r   r   r   r$      s    

zHTMLSectionSplitter.__init__zIterable[Document]r%   )	documentsr'   c                 C  sP   g g  }}|D ]}| |j | |j q| j||d}tf | j}||S )zSplit documents.)	metadatas)r0   r+   r   create_documentsr   r3   split_documents)r#   rl   textsrm   docresultsZtext_splitterr   r   r   ro      s    
z#HTMLSectionSplitter.split_documentsr   r9   c                 C  s   |  t|S r;   r<   r=   r   r   r   r>      s    zHTMLSectionSplitter.split_textz	List[str]zOptional[List[dict]])rp   rm   r'   c                 C  s   |pi gt | }g }t|D ]t\}}| |D ]`}t|| }|j D ] }	|j|	 dkrL|d |j|	< qL||j}t|j|d}
|	|
 q0q|S )z&Create documents from a list of texts.#TITLE#ZTitler*   )
len	enumerater>   copydeepcopyr   keysr   r+   r0   )r#   rp   rm   Z
_metadatasrl   ir:   r.   r   keyZnew_docr   r   r   rn      s    
z$HTMLSectionSplitter.create_documentszList[Dict[str, Optional[str]]])html_docr'   c              
   C  s  zddl m}m} W n, tk
r@ } ztd|W 5 d }~X Y nX ||d}t| j }g }|dg| }t|D ]\}}	|	}
|dkrd}d}g }n|
j	
 }|
j}g }|
jD ]<}|d t|k r|||d  kr qt|tr|| qd	|
 }|d
krv||||d qv|S )Nr   )BeautifulSoupPageElementzzUnable to import BeautifulSoup/PageElement,                     please install with `pip install                     bs4`.zhtml.parserbodyrs   h1    rH   )rF   r   tag_name)Zbs4r|   r}   rU   listr    rx   Zfind_allru   r:   stripnameZnext_elementsrt   
isinstancer   r0   r`   )r#   r{   r|   r}   rb   Zsoupheaderssectionsry   rF   Zheader_elementZcurrent_headerZcurrent_header_tagZsection_contentr1   r   r   r   r   split_html_by_headers   sF    


 
z)HTMLSectionSplitter.split_html_by_headers)html_contentr'   c           	   
   C  s   | j d kr|S zddlm} W n, tk
rJ } ztd|W 5 d }~X Y nX | }|t||}|| j }||}||}t|S )Nr   rB   rD   )	re   rT   rC   rU   rV   rW   r   r\   r   )	r#   r   rC   rb   rc   rd   rf   rg   rh   r   r   r   convert_possible_tags_to_header  s     

z3HTMLSectionSplitter.convert_possible_tags_to_headerr?   c                   s.   |  } |} |} fdd|D S )rA   c                   s8   g | ]0}t tt|d   jt|d  |d idqS )r   r   rF   )r   )r   r   r   r    )r-   sectionr#   r   r   r/   7  s   	 z<HTMLSectionSplitter.split_text_from_file.<locals>.<listcomp>)getvaluer   r   )r#   r@   Zfile_contentr   r   r   r   r6   -  s    


	z(HTMLSectionSplitter.split_text_from_file)N)N)r   r   r   r   r$   ro   r>   rn   r   r   r6   r   r   r   r   ri      s    	 ,ri   )
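
# Minimal, illustrative smoke test for both splitters (an assumption added for
# clarity, not part of the upstream module); requires `pip install lxml beautifulsoup4`.
if __name__ == "__main__":
    sample_html = (
        "<html><body><h1>Title</h1><p>Intro text.</p>"
        "<h2>Section</h2><p>Section text.</p></body></html>"
    )
    headers = [("h1", "Header 1"), ("h2", "Header 2")]

    # Header-based splitting: metadata records the enclosing header hierarchy.
    for doc in HTMLHeaderTextSplitter(headers).split_text(sample_html):
        print("header split:", doc.metadata, repr(doc.page_content))

    # Section-based splitting: one Document per detected section.
    for doc in HTMLSectionSplitter(headers).split_text(sample_html):
        print("section split:", doc.metadata, repr(doc.page_content))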