U
    h                     @   sj   d dl mZmZ dZdZdZdZdZdd Zd	d
 Z	dd Z
dd Zdd Zdd Zee	e
eeedZdS )    )data_gym_to_mergeable_bpe_ranksload_tiktoken_bpez<|endoftext|>z<|fim_prefix|>z<|fim_middle|>z<|fim_suffix|>z<|endofprompt|>c                  C   s$   t ddddd} ddd| td	id
S )NzIhttps://openaipublic.blob.core.windows.net/gpt-2/encodings/main/vocab.bpezLhttps://openaipublic.blob.core.windows.net/gpt-2/encodings/main/encoder.jsonZ@1ce1664773c50f3e0cc8842619a93edc4624525b728b188a9e0be33b7726adc5Z@196139668be63f3b5d6574427317ae82f612a97c5d1cdaf36ed2256dbf636783)Zvocab_bpe_fileZencoder_json_fileZvocab_bpe_hashZencoder_json_hashgpt2Q  G'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+P  nameZexplicit_n_vocabpat_strmergeable_ranksspecial_tokens)r   	ENDOFTEXTr    r   >/tmp/pip-unpacked-wheel-taldhpgq/tiktoken_ext/openai_public.pyr   
   s    r   c                  C   s    t ddd} ddd| tdidS )	NzGhttps://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktokenZ@306cd27f03c1a714eca7108e03d66b7dc042abe8c258b44c199a7ed9838dd930Zexpected_hash	r50k_baser   r   r   r   r   r   r   r   r   r   r      s    r   c                  C   s    t ddd} ddd| tdidS )	NGhttps://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken@94b5ca7dff4d00767bc256fdd1b27e5b17361d7b8a5f968547f9f23eb70d2069r   	p50k_basei  r   r   r   r   r   r   r   r   r   +   s    r   c                  C   s.   t ddd} tdtdtdtdi}dd	| |d
S )Nr   r   r   r   r   ij  ik  	p50k_editr   r	   r
   r   r   )r   r   
FIM_PREFIX
FIM_MIDDLE
FIM_SUFFIXr   r   r   r   r   r   9   s    r   c               
   C   s2   t ddd} tdtdtdtdtdi}d	d
| |dS )NzIhttps://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktokenZ@223921b76ee99bde995b7ff738513eef100fb51d18c93597a113bcffe865b2a7r   i i i i i cl100k_basezm'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+r   )r   r   r   r   r   ENDOFPROMPTr   r   r   r   r   G   s(         r   c               	   C   s>   t ddd} tdtdi}dddd	d
dddg}d|| |dS )NzHhttps://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktokenZ@446a9538cb6c348e3516120d7c08b09f57c36495e2acfffe59a5bf8b0cfb1a2dr   i? iR |zi[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?zi[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?z
\p{N}{1,3}z ?[^\s\p{L}\p{N}]+[\r\n/]*z
\s*[\r\n]+z	\s+(?!\S)z\s+
o200k_baser   )r   r   r   join)r   r   r
   r   r   r   r!   [   s0      r!   )r   r   r   r   r   r!   N)Ztiktoken.loadr   r   r   r   r   r   r   r   r   r   r   r   r!   ZENCODING_CONSTRUCTORSr   r   r   r   <module>   s$   