U
    h                     @  s   d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlm	Z	 d dl
Z
dddddZddd	d
ddZdddddddZdddddddddZddddddZdddddddZdS )    )annotationsN)Optionalstrbytes)blobpathreturnc              
   C  s   |  dsv|  dsvzdd l}W n, tk
rL } ztd|W 5 d }~X Y nX || d}| W  5 Q R  S Q R X t| }|  |jS )Nzhttp://zhttps://r   Oblobfile is not installed. Please install it by running `pip install blobfile`.rb)	
startswithblobfileImportErrorBlobFilereadrequestsgetraise_for_statuscontent)r   r   efresp r   1/tmp/pip-unpacked-wheel-taldhpgq/tiktoken/load.py	read_file   s    
r   bool)dataexpected_hashr   c                 C  s   t |  }||kS N)hashlibsha256	hexdigest)r   r   Zactual_hashr   r   r   
check_hash   s    r    zOptional[str])r   r   r   c           
   	   C  s  d}dt jkrt jd }n,dt jkr0t jd }nt jt d}d}|dkrVt| S t| 	 
 }t j||}t j|rt|d}| }W 5 Q R X |d kst||r|S zt | W n tk
r   Y nX t| }|r
t||s
td|  d	| d
zVt j|dd |d tt  d }	t|	d}|| W 5 Q R X t |	| W n tk
r~   |rz Y nX |S )NTZTIKTOKEN_CACHE_DIRZDATA_GYM_CACHE_DIRzdata-gym-cacheF r	   z'Hash mismatch for data downloaded from z (expected z<). This may indicate a corrupted download. Please try again.)exist_ok.z.tmpwb)osenvironpathjointempfile
gettempdirr   r   sha1encoder   existsopenr   r    removeOSError
ValueErrormakedirsr   uuidZuuid4writerename)
r   r   Zuser_specified_cache	cache_dir	cache_key
cache_pathr   r   contentsZtmp_filenamer   r   r   read_file_cached#   sF    

r:   zdict[bytes, int])vocab_bpe_fileencoder_json_filevocab_bpe_hashencoder_json_hashr   c                   s>  dd t dD }dd |D  d}t dD ].}||kr,|| | td| < |d7 }q,t|dksltt| | }dd |d	dd
 D }ddd fdddd t|D }	t|	}|D ]$\}
}||	|
| < |d7 }qt	
t||}fdd| D }|dd  |dd  |	|ks:t|	S )Nc                 S  s(   g | ] }t | rt |d kr|qS ) )chrisprintable.0br   r   r   
<listcomp>\   s       z3data_gym_to_mergeable_bpe_ranks.<locals>.<listcomp>   c                 S  s   i | ]}t ||qS r   )r@   rB   r   r   r   
<dictcomp>^   s      z3data_gym_to_mergeable_bpe_ranks.<locals>.<dictcomp>r      c                 S  s   g | ]}t | qS r   )tuplesplit)rC   Z	merge_strr   r   r   rE   i   s     
r   r   )valuer   c                   s   t  fdd| D S )Nc                 3  s   | ]} | V  qd S r   r   rB   data_gym_byte_to_byter   r   	<genexpr>l   s     zKdata_gym_to_mergeable_bpe_ranks.<locals>.decode_data_gym.<locals>.<genexpr>r   )rM   rN   r   r   decode_data_gymk   s    z8data_gym_to_mergeable_bpe_ranks.<locals>.decode_data_gymc                 S  s   i | ]\}}t |g|qS r   rQ   )rC   irD   r   r   r   rG   o   s      c                   s   i | ]\}} ||qS r   r   )rC   kv)rR   r   r   rG   z   s      s   <|endoftext|>s   <|startoftext|>)rangeappendr@   lenAssertionErrorr:   decoderJ   	enumeratejsonloadsitemspop)r;   r<   r=   r>   Zrank_to_intbytenrD   Zvocab_bpe_contentsZ
bpe_merges	bpe_ranksfirstsecondZencoder_jsonZencoder_json_loadedr   )rO   rR   r   data_gym_to_mergeable_bpe_ranksU   s.    


rd   None)ra   tiktoken_bpe_filer   c              
   C  s   zdd l }W n, tk
r8 } ztd|W 5 d }~X Y nX ||dH}t|  dd dD ],\}}|t|d t|	  d  q\W 5 Q R X d S )	Nr   r   r$   c                 S  s   | d S )NrH   r   )xr   r   r   <lambda>       z#dump_tiktoken_bpe.<locals>.<lambda>)key       
)
r   r   r   sortedr^   r4   base64	b64encoder   r,   )ra   rf   r   r   r   tokenrankr   r   r   dump_tiktoken_bpe   s    rr   )rf   r   r   c                 C  s&   t | |}dd dd | D D S )Nc                 S  s    i | ]\}}t |t|qS r   )rn   	b64decodeint)rC   rp   rq   r   r   r   rG      s    z%load_tiktoken_bpe.<locals>.<dictcomp>c                 s  s   | ]}|r|  V  qd S r   )rJ   )rC   liner   r   r   rP      s      z$load_tiktoken_bpe.<locals>.<genexpr>)r:   
splitlines)rf   r   r9   r   r   r   load_tiktoken_bpe   s    
rw   )N)NN)N)
__future__r   rn   r   r\   r%   r)   r3   typingr   r   r   r    r:   rd   rr   rw   r   r   r   r   <module>   s"   5  . 