from typing import TYPE_CHECKING, Any, Dict, Generator, List, Mapping, Optional, Union

from langchain_core.callbacks import CallbackManagerForLLMRun
from langchain_core.language_models.llms import LLM

if TYPE_CHECKING:
    from xinference.client import (
        RESTfulChatModelHandle,
        RESTfulGenerateModelHandle,
    )
    from xinference.model.llm.core import LlamaCppGenerateConfig


class Xinference(LLM):
    """`Xinference` large-scale model inference service.

    To use, you should have the xinference library installed:

    .. code-block:: bash

       pip install "xinference[all]"

    If you only need the client-side features of Xinference, you can install the lighter xinference_client package instead:

    .. code-block:: bash

        pip install xinference_client

    Check out: https://github.com/xorbitsai/inference

    To run, you need to start a Xinference supervisor on one server and Xinference workers on the other servers.

    Example:
        To start a local instance of Xinference, run

        .. code-block:: bash

           $ xinference

        You can also deploy Xinference in a distributed cluster. Here are the steps:

        Starting the supervisor:

        .. code-block:: bash

           $ xinference-supervisor

        Starting the worker:

        .. code-block:: bash

           $ xinference-worker

    Then, launch a model using command line interface (CLI).

    Example:

    .. code-block:: bash

       $ xinference launch -n orca -s 3 -q q4_0

    It will return a model UID. Then, you can use Xinference with LangChain.

    Example:

    .. code-block:: python

        from langchain_community.llms import Xinference

        llm = Xinference(
            server_url="http://0.0.0.0:9997",
            model_uid={model_uid},  # replace {model_uid} with the model UID returned when launching the model
        )

        llm.invoke(
            prompt="Q: where can we visit in the capital of France? A:",
            generate_config={"max_tokens": 1024, "stream": True},
        )
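
    When stream is enabled, tokens are surfaced through LangChain's callback
    system rather than returned piece by piece from invoke (which still returns
    the full completion). A minimal sketch of watching tokens as they arrive,
    assuming the stock StreamingStdOutCallbackHandler exported by
    langchain_core.callbacks:

    .. code-block:: python

        from langchain_core.callbacks import StreamingStdOutCallbackHandler
        from langchain_community.llms import Xinference

        llm = Xinference(
            server_url="http://0.0.0.0:9997",
            model_uid={model_uid},  # replace {model_uid} with your model UID
            callbacks=[StreamingStdOutCallbackHandler()],
        )

        # Each generated token is printed to stdout as it arrives; the full
        # string is still returned once generation finishes.
        llm.invoke(
            "Q: What can we visit in the capital of France? A:",
            generate_config={"max_tokens": 512, "stream": True},
        )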

    To view all the supported builtin models, run:

    .. code-block:: bash

        $ xinference list --all
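
    Because Xinference behaves like any other LangChain LLM, it composes with
    prompt templates and the rest of LCEL. A minimal sketch (the prompt text and
    input variable below are illustrative):

    .. code-block:: python

        from langchain_core.prompts import PromptTemplate
        from langchain_community.llms import Xinference

        llm = Xinference(
            server_url="http://0.0.0.0:9997",
            model_uid={model_uid},  # replace {model_uid} with your model UID
        )

        prompt = PromptTemplate.from_template(
            "Q: What is the capital of {country}? A:"
        )
        chain = prompt | llm

        print(chain.invoke({"country": "France"}))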

    """

    client: Any
    server_url: Optional[str]
    """URL of the xinference server"""
    model_uid: Optional[str]
    """UID of the launched model"""
    model_kwargs: Dict[str, Any]
    """Keyword arguments to be passed to xinference.LLM"""

    def __init__(
        self,
        server_url: Optional[str] = None,
        model_uid: Optional[str] = None,
        **model_kwargs: Any,
    ):
        try:
            from xinference.client import RESTfulClient
        except ImportError:
            try:
                from xinference_client import RESTfulClient
            except ImportError as e:
                raise ImportError(
                    "Could not import RESTfulClient from xinference. Please install it"
                    " with `pip install xinference` or `pip install xinference_client`."
                ) from e

        model_kwargs = model_kwargs or {}

        super().__init__(
            **{  # type: ignore[arg-type]
                "server_url": server_url,
                "model_uid": model_uid,
                "model_kwargs": model_kwargs,
            }
        )

        if self.server_url is None:
            raise ValueError("Please provide server URL")

        if self.model_uid is None:
            raise ValueError("Please provide the model UID")

        self.client = RESTfulClient(server_url)

    @property
    def _llm_type(self) -> str:
        """Return type of llm."""
        return "xinference"

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        """Get the identifying parameters."""
        return {
            **{"server_url": self.server_url},
            **{"model_uid": self.model_uid},
            **{"model_kwargs": self.model_kwargs},
        }

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        """Call the xinference model and return the output.

        Args:
            prompt: The prompt to use for generation.
            stop: Optional list of stop words to use when generating.
            generate_config: Optional dictionary for the configuration used for
                generation.

        Returns:
            The generated string by the model.
        """
        model = self.client.get_model(self.model_uid)

        # Merge per-call generation options on top of the model_kwargs given at
        # construction time; per-call options win.
        generate_config: "LlamaCppGenerateConfig" = kwargs.get("generate_config", {})
        generate_config = {**self.model_kwargs, **generate_config}

        if stop:
            generate_config["stop"] = stop

        if generate_config and generate_config.get("stream"):
            # Streaming: emit each token through the callback manager and
            # return the concatenated text at the end.
            combined_text_output = ""
            for token in self._stream_generate(
                model=model,
                prompt=prompt,
                run_manager=run_manager,
                generate_config=generate_config,
            ):
                combined_text_output += token
            return combined_text_output
        else:
            completion = model.generate(prompt=prompt, generate_config=generate_config)
            return completion["choices"][0]["text"]

    def _stream_generate(
        self,
        model: Union["RESTfulGenerateModelHandle", "RESTfulChatModelHandle"],
        prompt: str,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        generate_config: Optional["LlamaCppGenerateConfig"] = None,
    ) -> Generator[str, None, None]:
        """
        Args:
            prompt: The prompt to use for generation.
            model: The model used for generation.
            run_manager: Optional callback manager notified for each new token.
            generate_config: Optional dictionary for the configuration used for
                generation.

        Yields:
            A string token.
        """
        streaming_response = model.generate(
            prompt=prompt, generate_config=generate_config
        )
        for chunk in streaming_response:
            if isinstance(chunk, dict):
                choices = chunk.get("choices", [])
                if choices:
                    choice = choices[0]
                    if isinstance(choice, dict):
                        token = choice.get("text", "")
                        log_probs = choice.get("logprobs")
                        if run_manager:
                            run_manager.on_llm_new_token(
                                token=token, verbose=self.verbose, log_probs=log_probs
                            )
                        yield token