U
    h(<                     @  s   d dl mZ d dlZd dlmZmZmZmZmZm	Z	 d dl
mZ d dlmZ d dlmZ G dd deZG d	d
 d
ZG dd deZG dd deZG dd dZdS )    )annotationsN)AnyDictListTuple	TypedDictUnionDocument)Language)RecursiveCharacterTextSplitterc                      s(   e Zd ZdZddd fddZ  ZS )MarkdownTextSplitterz=Attempts to split the text along Markdown-formatted headings.r   None)kwargsreturnc                   s&   |  tj}t jf d|i| dS )z"Initialize a MarkdownTextSplitter.
separatorsN)Zget_separators_for_languager   ZMARKDOWNsuper__init__)selfr   r   	__class__ E/tmp/pip-unpacked-wheel-a648t6hw/langchain_text_splitters/markdown.pyr      s    zMarkdownTextSplitter.__init__)__name__
__module____qualname____doc__r   __classcell__r   r   r   r   r      s   r   c                   @  sD   e Zd ZdZdddddddZd	d
dddZdd
dddZdS )MarkdownHeaderTextSplitterz4Splitting markdown files based on specified headers.FTzList[Tuple[str, str]]boolheaders_to_split_onreturn_each_linestrip_headersc                 C  s$   || _ t|dd dd| _|| _dS )a  Create a new MarkdownHeaderTextSplitter.

        Args:
            headers_to_split_on: Headers we want to track
            return_each_line: Return each line w/ associated headers
            strip_headers: Strip split headers from the content of the chunk
        c                 S  s   t | d S )Nr   )len)splitr   r   r   <lambda>*       z5MarkdownHeaderTextSplitter.__init__.<locals>.<lambda>T)keyreverseN)r"   sortedr!   r#   r   r!   r"   r#   r   r   r   r      s      z#MarkdownHeaderTextSplitter.__init__zList[LineType]List[Document])linesr   c                 C  s   g }|D ]}|rB|d d |d krB|d d  d|d  7  < q|r|d d |d krt |d d t |d k r|d d dd d dkr| js|d d  d|d  7  < |d |d d< q|| qdd	 |D S )
zCombine lines with common metadata into chunks
        Args:
            lines: Line of text / associated header metadata
        metadatacontentz  

r   #c                 S  s    g | ]}t |d  |d dqS r0   r/   page_contentr/   r	   .0chunkr   r   r   
<listcomp>T   s   zHMarkdownHeaderTextSplitter.aggregate_lines_to_chunks.<locals>.<listcomp>)r$   r%   r#   append)r   r-   Zaggregated_chunksliner   r   r   aggregate_lines_to_chunks/   s.    z4MarkdownHeaderTextSplitter.aggregate_lines_to_chunksstrtextr   c                 C  s&  | d}g }g }i }g }i }d}d}	|D ]}
|
 }dttj|}|s|drp|ddkrpd}d}	q|drd}d}	n||	rd}d}	|r|| q*| j	D ]\}}||rt
|t
|ks|t
| dkr|d	k	r`|d
}|r.|d d |kr.| }|d |kr||d  q|||t
|d	  d}|| |d ||< |r|d|| d |  | js||  qq|r|| n(|r|d|| d |  | }q*|r|d||d | js| |S dd |D S d	S )zASplit markdown file
        Args:
            text: Markdown filer1   F z```   Tz~~~ Nr2   r.   levelname)rC   rD   datarE   )r0   r/   c                 S  s    g | ]}t |d  |d dqS r3   r	   r6   r   r   r   r9      s   z9MarkdownHeaderTextSplitter.split_text.<locals>.<listcomp>)r%   stripjoinfilterr=   isprintable
startswithcountr:   r!   r$   popcopyclearr#   r"   r<   )r   r?   r-   Zlines_with_metadataZcurrent_contentZcurrent_metadataZheader_stackZinitial_metadataZin_code_blockZopening_fencer;   Zstripped_lineseprD   Zcurrent_header_levelZpopped_headerheaderr   r   r   
split_textY   s    











z%MarkdownHeaderTextSplitter.split_textN)FT)r   r   r   r   r   r<   rQ   r   r   r   r   r      s     *r   c                   @  s"   e Zd ZU dZded< ded< dS )LineTypezLine type as typed dict.zDict[str, str]r/   r=   r0   Nr   r   r   r   __annotations__r   r   r   r   rR      s   
rR   c                   @  s*   e Zd ZU dZded< ded< ded< dS )
HeaderTypezHeader type as typed dict.intrC   r=   rD   rE   NrS   r   r   r   r   rU      s   
rU   c                   @  s   e Zd ZdZdddddddZd*ddddddZdddddZddddddZddddddZddd d!Z	dd"d#d$d%Z
dd"d#d&d'Zdd"d#d(d)Zd	S )+&ExperimentalMarkdownSyntaxTextSplittera  
    An experimental text splitter for handling Markdown syntax.

    This splitter aims to retain the exact whitespace of the original text while
    extracting structured metadata, such as headers. It is a re-implementation of the
    MarkdownHeaderTextSplitter with notable changes to the approach and
    additional features.

    Key Features:
    - Retains the original whitespace and formatting of the Markdown text.
    - Extracts headers, code blocks, and horizontal rules as metadata.
    - Splits out code blocks and includes the language in the "Code" metadata key.
    - Splits text on horizontal rules (`---`) as well.
    - Defaults to sensible splitting behavior, which can be overridden using the
      `headers_to_split_on` parameter.

    Parameters:
    ----------
    headers_to_split_on : List[Tuple[str, str]], optional
        Headers to split on, defaulting to common Markdown headers if not specified.
    return_each_line : bool, optional
        When set to True, returns each line as a separate chunk. Default is False.

    Usage example:
    --------------
    >>> headers_to_split_on = [
    >>>     ("#", "Header 1"),
    >>>     ("##", "Header 2"),
    >>> ]
    >>> splitter = ExperimentalMarkdownSyntaxTextSplitter(
    >>>     headers_to_split_on=headers_to_split_on
    >>> )
    >>> chunks = splitter.split(text)
    >>> for chunk in chunks:
    >>>     print(chunk)

    This class is currently experimental and subject to change based on feedback and
    further development.
    zHeader 1zHeader 2zHeader 3zHeader 4zHeader 5zHeader 6)r2   z##z###z####z#####z######NFTz"Union[List[Tuple[str, str]], None]r   r    c                 C  s@   g | _ tdd| _g | _|| _|r.t|| _n| j| _|| _d S )Nr@   r5   )	chunksr
   current_chunkcurrent_header_stackr#   dictsplittable_headersDEFAULT_HEADER_KEYSr"   r+   r   r   r   r     s    z/ExperimentalMarkdownSyntaxTextSplitter.__init__r=   r,   r>   c           	      C  s   |j dd}|r|d}| |}| |}| |}|r|   | jsZ| j j|7  _t	|
d}|
d}| || q|r|   | ||| j_|
d| jjd< |   q|r|   q| j j|7  _q|   | jrdd | jD S | jS )	NT)keependsr   rA      ZCodec                 S  s6   g | ].}|j  D ]}|r| st||jd qqS )r4   )r5   
splitlinesisspacer
   r/   )r7   r8   r;   r   r   r   r9   I  s     zEExperimentalMarkdownSyntaxTextSplitter.split_text.<locals>.<listcomp>)ra   rL   _match_header_match_code_match_horz_complete_chunk_docr#   rZ   r5   r$   group_resolve_header_stack_resolve_code_chunkr/   r"   rY   )	r   r?   	raw_linesraw_lineZheader_matchZ
code_matchZ
horz_matchheader_depthheader_textr   r   r   rQ   &  s<    




 

z1ExperimentalMarkdownSyntaxTextSplitter.split_textrV   r   )rl   rm   r   c                 C  s\   t | jD ]<\}\}}||kr
||f| j|< | jd |d  | _ d S q
| j||f d S )NrA   )	enumerater[   r:   )r   rl   rm   idepth_r   r   r   rh   Q  s    z<ExperimentalMarkdownSyntaxTextSplitter._resolve_header_stackz	List[str])current_linerj   r   c                 C  s.   |}|r*| d}||7 }| |r|S qdS )Nr   r@   )rL   rd   )r   rr   rj   r8   rk   r   r   r   ri   Y  s    

z:ExperimentalMarkdownSyntaxTextSplitter._resolve_code_chunk)r   c                 C  s^   | j j}|rN| sN| jD ]$\}}| jd| }|| j j|< q| j| j  t	dd| _ d S )Nr2   r@   rX   )
rZ   r5   rb   r[   r]   getr/   rY   r:   r
   )r   Zchunk_contentrp   valueZ
header_keyr   r   r   rf   b  s    z:ExperimentalMarkdownSyntaxTextSplitter._complete_chunk_doczUnion[re.Match, None])r;   r   c                 C  s(   t d|}|r$|d| jkr$|S d S )Nz^(#{1,6}) (.*)rA   )rematchrg   r]   )r   r;   rv   r   r   r   rc   o  s    z4ExperimentalMarkdownSyntaxTextSplitter._match_headerc                   s&    fdddD }t dd |D d S )Nc                   s   g | ]}t | qS r   ru   rv   r7   Zruler;   r   r   r9   w  s     zFExperimentalMarkdownSyntaxTextSplitter._match_code.<locals>.<listcomp>)z^```(.*)z^~~~(.*)c                 s  s   | ]}|r|V  qd S Nr   r7   rv   r   r   r   	<genexpr>x  s      zEExperimentalMarkdownSyntaxTextSplitter._match_code.<locals>.<genexpr>nextr   r;   matchesr   ry   r   rd   v  s    z2ExperimentalMarkdownSyntaxTextSplitter._match_codec                   s&    fdddD }t dd |D d S )Nc                   s   g | ]}t | qS r   rw   rx   ry   r   r   r9   {  s    zFExperimentalMarkdownSyntaxTextSplitter._match_horz.<locals>.<listcomp>)z
^\*\*\*+\nz^---+\nz^___+\nc                 s  s   | ]}|r|V  qd S rz   r   r{   r   r   r   r|   ~  s      zEExperimentalMarkdownSyntaxTextSplitter._match_horz.<locals>.<genexpr>r}   r   r   ry   r   re   z  s    
z2ExperimentalMarkdownSyntaxTextSplitter._match_horz)NFT)r   r   r   r   r^   r   rQ   rh   ri   rf   rc   rd   re   r   r   r   r   rW      s&   )   +	rW   )
__future__r   ru   typingr   r   r   r   r   r   Zlangchain_core.documentsr
   Zlangchain_text_splitters.baser   Z"langchain_text_splitters.characterr   r   r   rR   rU   rW   r   r   r   r   <module>   s    	 @