U
    h                     @  sT   d dl mZ d dlmZmZ d dlmZ G dd deZdddd	d
dddZdS )    )annotations)AnyList)TextSplitterc                	      sH   e Zd ZdZddddddd	d
dd fddZdddddZ  ZS )SpacyTextSplitteraR  Splitting text using Spacy package.


    Per default, Spacy's `en_core_web_sm` model is used and
    its default max_length is 1000000 (it is the length of maximum character
    this model takes which can be increased for large files). For a faster, but
    potentially less accurate splitting, you can use `pipeline='sentencizer'`.
    

en_core_web_sm@B T)strip_whitespacestrintboolr   None)	separatorpipeline
max_lengthr
   kwargsreturnc                  s,   t  jf | t||d| _|| _|| _dS )z#Initialize the spacy text splitter.r   N)super__init__"_make_spacy_pipeline_for_splitting
_tokenizer
_separator_strip_whitespace)selfr   r   r   r
   r   	__class__ B/tmp/pip-unpacked-wheel-a648t6hw/langchain_text_splitters/spacy.pyr      s    
 zSpacyTextSplitter.__init__z	List[str])textr   c                   s(    fdd  |jD } | jS )z&Split incoming text and return chunks.c                 3  s    | ]} j r|jn|jV  qd S )N)r   r    Ztext_with_ws).0sr   r   r   	<genexpr>%   s   z/SpacyTextSplitter.split_text.<locals>.<genexpr>)r   ZsentsZ_merge_splitsr   )r   r    splitsr   r#   r   
split_text#   s    

zSpacyTextSplitter.split_text)r   r   r	   )__name__
__module____qualname____doc__r   r&   __classcell__r   r   r   r   r      s       r   r	   r   r   r   r   )r   r   r   c                C  sl   zdd l }W n tk
r(   tdY nX | dkrPddlm} | }|d n|j| ddgd}||_|S )Nr   zCSpacy is not installed, please install it with `pip install spacy`.sentencizer)EnglishZnerZtagger)exclude)spacyImportErrorZspacy.lang.enr-   Zadd_pipeloadr   )r   r   r/   r-   r,   r   r   r   r   ,   s    
r   N)	
__future__r   typingr   r   Zlangchain_text_splitters.baser   r   r   r   r   r   r   <module>   s
   %