from typing import Any, Dict, Iterator, List, Optional

from langchain_core.callbacks import CallbackManagerForLLMRun
from langchain_core.language_models import LLM
from langchain_core.outputs import GenerationChunk
from langchain_core.pydantic_v1 import Field
from langchain_core.utils import pre_init


class ExLlamaV2(LLM):
    """ExllamaV2 API.

    - Works only with GPTQ models for now.
    - LoRA models are not supported yet.

    To use, you should have the exllamav2 library installed, and provide the
    path to the Llama model as a named parameter to the constructor.
    A sampler ``settings`` object (for example ``ExLlamaV2Sampler.Settings()``
    from ``exllamav2.generator``) must also be passed; construction fails
    without one.
    Check out: https://github.com/turboderp/exllamav2

    Example:
        .. code-block:: python

            from exllamav2.generator import ExLlamaV2Sampler
            from langchain_community.llms.exllamav2 import ExLlamaV2

            settings = ExLlamaV2Sampler.Settings()
            llm = ExLlamaV2(model_path="/path/to/llama/model", settings=settings)

    #TODO:
    - Add LoRA support
    - Add support for custom settings
    - Add support for custom stop sequences
    """

    client: Any
    model_path: str
    exllama_cache: Any = None
    config: Any = None
    generator: Any = None
    tokenizer: Any = None
    # A sampler settings object from exllamav2 (required; validated below).
    settings: Any = None

    # LangChain parameters
    logfunc = print
    stop_sequences: List[str] = Field("")
    """Sequences that immediately will stop the generator."""
    max_new_tokens: int = Field(150)
    """Maximum number of tokens to generate."""
    streaming: bool = Field(True)
    """Whether to stream the results, token by token."""
    verbose: bool = Field(True)
    """Whether to print debug information."""

    # Generator parameters
    disallowed_tokens: Optional[List[int]] = Field(None)
    """List of tokens to disallow during generation."""

    @pre_init
    def validate_environment(cls, values: Dict[str, Any]) -> Dict[str, Any]:
        try:
            import torch
        except ImportError as e:
            raise ImportError(
                "Unable to import torch, please install with `pip install torch`."
            ) from e

        # ExLlamaV2 only runs on CUDA devices.
        if not torch.cuda.is_available():
            raise EnvironmentError("CUDA is not available. ExllamaV2 requires CUDA.")

        try:
            from exllamav2 import (
                ExLlamaV2,
                ExLlamaV2Cache,
                ExLlamaV2Config,
                ExLlamaV2Tokenizer,
            )
            from exllamav2.generator import (
                ExLlamaV2BaseGenerator,
                ExLlamaV2StreamingGenerator,
            )
        except ImportError:
            raise ImportError(
                "Could not import exllamav2 library. "
                "Please install the exllamav2 library with (cuda 12.1 is required)"
                "example : "
                "!python -m pip install https://github.com/turboderp/exllamav2/releases/download/v0.0.12/exllamav2-0.0.12+cu121-cp311-cp311-linux_x86_64.whl"
            )

        # Silence logging unless verbose is set.
        if not values["verbose"]:
            values["logfunc"] = lambda *args, **kwargs: None
        logfunc = values.get("logfunc", print)

        # A sampler settings object is required for now.
        if values["settings"]:
            settings = values["settings"]
            logfunc(settings.__dict__)
        else:
            raise NotImplementedError(
                "settings is required. Custom settings are not supported yet."
            )

        # Load the model, cache and tokenizer from the model directory.
        config = ExLlamaV2Config()
        config.model_dir = values["model_path"]
        config.prepare()

        model = ExLlamaV2(config)
        exllama_cache = ExLlamaV2Cache(model, lazy=True)
        model.load_autosplit(exllama_cache)
        tokenizer = ExLlamaV2Tokenizer(config)

        if values["streaming"]:
            generator = ExLlamaV2StreamingGenerator(model, exllama_cache, tokenizer)
        else:
            generator = ExLlamaV2BaseGenerator(model, exllama_cache, tokenizer)

        # Normalize stop sequences and attach them to the sampler settings.
        values["stop_sequences"] = [
            x.strip().lower() for x in values["stop_sequences"]
        ]
        setattr(settings, "stop_sequences", values["stop_sequences"])
        logfunc(f"stop_sequences {values['stop_sequences']}")

        disallowed = values.get("disallowed_tokens")
        if disallowed:
            settings.disallow_tokens(tokenizer, disallowed)

        values["client"] = model
        values["generator"] = generator
        values["tokenizer"] = tokenizer
        values["exllama_cache"] = exllama_cache
        values["settings"] = settings

        return values

    @property
    def _llm_type(self) -> str:
        """Return type of llm."""
        return "ExLlamaV2"

    def get_num_tokens(self, text: str) -> int:
        """Get the number of tokens present in the text."""
        return self.generator.tokenizer.num_tokens(text)

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        generator = self.generator

        if self.streaming:
            combined_text_output = ""
            for chunk in self._stream(
                prompt=prompt, stop=stop, run_manager=run_manager, **kwargs
            ):
                combined_text_output += str(chunk)
            return combined_text_output

        output = generator.generate_simple(
            prompt=prompt,
            gen_settings=self.settings,
            num_tokens=self.max_new_tokens,
        )
        # generate_simple returns the prompt followed by the completion;
        # strip the prompt so only the generated text is returned.
        return output[len(prompt) :]

    def _stream(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> Iterator[GenerationChunk]:
        input_ids = self.tokenizer.encode(prompt)
        self.generator.warmup()
        self.generator.set_stop_conditions([])
        self.generator.begin_stream(input_ids, self.settings)

        generated_tokens = 0
        while True:
            # chunk is the text decoded since the last call; eos signals end of stream.
            chunk, eos, _ = self.generator.stream()
            generated_tokens += 1
            if run_manager:
                run_manager.on_llm_new_token(
                    token=chunk,
                    verbose=self.verbose,
                )
            yield chunk
            if eos or generated_tokens == self.max_new_tokens:
                break