U
    h*                      @   s   d Z ddlZddlmZ ddlZddlZG dd dZdeee	f eee
 ee	 ddd	Zde
e	e
ee
 eee	f d
ddZee ddddZdd ZdS )zJThis is an educational implementation of the byte pair encoding algorithm.    N)Optionalc                   @   s   e Zd Zeeeef ddddZdeee e	e dddZ
e	e ed	d
dZe	e ed	ddZe	e e	e d	ddZeeeedddZedd ZdS )SimpleBytePairEncodingN)pat_strmergeable_ranksreturnc                C   s0   || _ || _dd | D | _t|| _dS )zCreates an Encoding object.c                 S   s   i | ]\}}||qS  r   ).0token_bytestokenr   r   9/tmp/pip-unpacked-wheel-taldhpgq/tiktoken/_educational.py
<dictcomp>   s      z3SimpleBytePairEncoding.__init__.<locals>.<dictcomp>N)r   r   items_decoderregexcompile_pat)selfr   r   r   r   r   __init__   s    zSimpleBytePairEncoding.__init__colour)text	visualiser   c                 C   sB   | j |}g }|D ](}|d}t| j||d}|| q|S )z`Encodes a string into tokens.

        >>> enc.encode("hello world")
        [388, 372]
        utf-8)r   )r   findallencode
bpe_encoder   extend)r   r   r   wordstokenswordZ
word_bytesZword_tokensr   r   r   r      s    
zSimpleBytePairEncoding.encode)r   r   c                    s   d  fdd|D S )znDecodes a list of tokens into bytes.

        >>> enc.decode_bytes([388, 372])
        b'hello world'
            c                 3   s   | ]} j | V  qd S Nr   r   r
   r   r   r   	<genexpr>+   s     z6SimpleBytePairEncoding.decode_bytes.<locals>.<genexpr>)joinr   r   r   r#   r   decode_bytes%   s    z#SimpleBytePairEncoding.decode_bytesc                 C   s   |  |jdddS )u   Decodes a list of tokens into a string.

        Decoded bytes are not guaranteed to be valid UTF-8. In that case, we replace
        the invalid bytes with the replacement character "�".

        >>> enc.decode([388, 372])
        'hello world'
        r   replaceerrors)r'   decoder&   r   r   r   r+   -   s    	zSimpleBytePairEncoding.decodec                    s    fdd|D S )zDecodes a list of tokens into a list of bytes.

        Useful for visualising how a string is tokenised.

        >>> enc.decode_tokens_bytes([388, 372])
        [b'hello', b' world']
        c                    s   g | ]} j | qS r   r!   r"   r#   r   r   
<listcomp>@   s     z>SimpleBytePairEncoding.decode_tokens_bytes.<locals>.<listcomp>r   r&   r   r#   r   decode_tokens_bytes8   s    z*SimpleBytePairEncoding.decode_tokens_bytes)training_data
vocab_sizer   c                 C   s   t | ||d}t||dS )z#Train a BPE tokeniser on some data!)datar/   r   r   r   )	bpe_trainr   )r.   r/   r   r   r   r   r   trainB   s    zSimpleBytePairEncoding.trainc                 C   s$   t | trt| } t| j| jdS )Nr1   )
isinstancestrtiktokenZget_encodingr   Z_pat_strZ_mergeable_ranks)encodingr   r   r   from_tiktokenH   s    

 z$SimpleBytePairEncoding.from_tiktoken)r   )__name__
__module____qualname__r5   dictbytesintr   r   listr   r'   r+   r-   staticmethodr3   r8   r   r   r   r   r   
   s   

r   r   )r   inputr   r   c           
         s  dd |D }|r4|dkr$t | n|dkr4t| d }d }tt|d d |dd  D ]>\}} |d |d  }|d k	rZ|d ks||k rZ|}|}qZ|d krq|d k	st|d | || ||d   g ||d d   }q|rt   fd	d|D }	|	S )
Nc                 S   s   g | ]}t |gqS r   r=   r   br   r   r   r,   T   s     zbpe_encode.<locals>.<listcomp>r   colorsimple   r      c                    s   g | ]} | qS r   r   )r   partr   r   r   r,   q   s     )visualise_tokensprint	enumeratezipgetAssertionError)
r   rA   r   partsmin_idxZmin_rankipairZrankr   r   rL   r   r   Q   s*    
&4r   )r0   r/   r   r   r   c                    s
  |dk rt di }tdD ]}||t|g< qdd t|| D }t||k rt  |D ]4}t|d d |dd  D ]} |  d7  < q~q`t	  fddd	}	|	d
 |	d  }
t|}|||
< g }|D ]}g }d
}|t|d k r6|| ||d  f|	kr|
|
 |d7 }q|
||  |d7 }q|t|d krV|
||  |
| q|}|rFtd|	d
  d|	d   td|
 dt| d |dkrtd tdd |d d D  n.|dkrtd |d d D ]}t| qtd qF|S )N   z;vocab_size must be at least 256, so we can encode all bytesc                 S   s    g | ]}d d | dD qS )c                 S   s   g | ]}t |gqS r   rB   rC   r   r   r   r,      s     z(bpe_train.<locals>.<listcomp>.<listcomp>r   )r   )r   r   r   r   r   r,      s    zbpe_train.<locals>.<listcomp>rH   rI   c                    s    |  S r    r   )xstatsr   r   <lambda>   r   zbpe_train.<locals>.<lambda>)keyr   rJ   z The current most common pair is z + zSo we made z our zth tokenrE   z9Now the first fifty words in our training data look like:c                 S   s   g | ]}|D ]}|qqS r   r   )r   r   r
   r   r   r   r,      s       2   rG   z:Now the first twenty words in our training data look like:   
)
ValueErrorranger=   r   r   lencollectionsCounterrP   maxappendrN   rM   )r0   r/   r   r   ZranksrU   r   ZpiecerV   Zmost_common_pairr	   r
   Z	new_wordsr   Znew_wordr   rY   r   r2   u   sV    






r2   )token_valuesr   c                 C   s   dd dD }dd | D }d}d }|D ]\}||t |  }||krd||d t |  }||ksdt|}|t |7 }t|| dd q(td	 d S )
Nc                 S   s   g | ]}d | dqS )z[48;5;mr   )r   rU   r   r   r   r,      s     z$visualise_tokens.<locals>.<listcomp>)         M   P   D      c                 S   s   g | ]}|j d ddqS )r   r(   r)   )r+   )r   rX   r   r   r   r,      s     r   rI    )endz[0m)rb   rR   rN   )rg   Z
backgroundZunicode_token_valuesZrunning_lengthZ
last_colorr
   rF   r   r   r   rM      s    rM   c               	   C   s   d} t td}| }W 5 Q R X tj|d| d}td |d}||dksVt|	|dksht|
|dd	gks~t|S )
NzN's|'t|'re|'ve|'m|'ll|'d| ?[\p{L}]+| ?[\p{N}]+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+riX  )r/   r   zJThis is the sequence of merges performed in order to encode 'hello world':zhello worlds   hello worlds   hellos    world)open__file__readr   r3   rN   r   r+   rR   r'   r-   )Zgpt2_patternfr0   encr   r   r   r   train_simple_encoding   s    
rx   )r   )r   )__doc__rc   typingr   r   r6   r   r<   r=   r>   r5   r?   r   r2   rM   rx   r   r   r   r   <module>   s,   H 
  %    
E