from __future__ import annotations

import logging
import warnings
from typing import (
    Any,
    Dict,
    Iterable,
    List,
    Literal,
    Mapping,
    Optional,
    Sequence,
    Set,
    Tuple,
    Union,
    cast,
)

import openai
import tiktoken
from langchain_core.embeddings import Embeddings
from langchain_core.pydantic_v1 import BaseModel, Field, SecretStr, root_validator
from langchain_core.utils import (
    from_env,
    get_pydantic_field_names,
    secret_from_env,
)

logger = logging.getLogger(__name__)


def _process_batched_chunked_embeddings(
    num_texts: int,
    tokens: List[Union[List[int], str]],
    batched_embeddings: List[List[float]],
    indices: List[int],
    skip_empty: bool,
) -> List[Optional[List[float]]]:
    # For each text, collect the embeddings of its chunks and the token length
    # of each chunk (string length when a HuggingFace tokenizer was used).
    results: List[List[List[float]]] = [[] for _ in range(num_texts)]
    num_tokens_in_batch: List[List[int]] = [[] for _ in range(num_texts)]

    for i in range(len(indices)):
        if skip_empty and len(batched_embeddings[i]) == 1:
            continue
        results[indices[i]].append(batched_embeddings[i])
        num_tokens_in_batch[indices[i]].append(len(tokens[i]))

    # For each text, compute the final embedding.
    embeddings: List[Optional[List[float]]] = []
    for i in range(num_texts):
        _result = results[i]
        if len(_result) == 0:
            # Filled in later with the embedding of an empty string.
            embeddings.append(None)
        elif len(_result) == 1:
            embeddings.append(_result[0])
        else:
            # Token-count-weighted average of the chunk embeddings,
            # rescaled to unit L2 norm.
            total_weight = sum(num_tokens_in_batch[i])
            average = [
                sum(
                    val * weight
                    for val, weight in zip(embedding, num_tokens_in_batch[i])
                )
                / total_weight
                for embedding in zip(*_result)
            ]
            magnitude = sum(val**2 for val in average) ** 0.5
            embeddings.append([val / magnitude for val in average])

    return embeddings
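
# Recombination in a nutshell (illustrative values): with num_texts=2, tokens
# laid out as [text0_chunk0, text0_chunk1, text1_chunk0] and indices == [0, 0, 1],
# text 0 gets the token-count-weighted mean of its two chunk embeddings,
# rescaled to unit L2 norm, while text 1 keeps its single chunk embedding
# unchanged. This layout is produced by OpenAIEmbeddings._tokenize below.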


class OpenAIEmbeddings(BaseModel, Embeddings):
    """OpenAI embedding model integration.

    Setup:
        Install ``langchain_openai`` and set environment variable ``OPENAI_API_KEY``.

        .. code-block:: bash

            pip install -U langchain_openai
            export OPENAI_API_KEY="your-api-key"

    Key init args — embedding params:
        model: str
            Name of OpenAI model to use.
        dimensions: Optional[int] = None
            The number of dimensions the resulting output embeddings should have.
            Only supported in `text-embedding-3` and later models.

    Key init args — client params:
        api_key: Optional[SecretStr] = None
            OpenAI API key.
        organization: Optional[str] = None
            OpenAI organization ID. If not passed in, will be read
            from env var OPENAI_ORG_ID.
        max_retries: int = 2
            Maximum number of retries to make when generating.
        request_timeout: Optional[Union[float, Tuple[float, float], Any]] = None
            Timeout for requests to the OpenAI embedding API.

    See full list of supported init args and their descriptions in the params section.

    Instantiate:
        .. code-block:: python

            from langchain_openai import OpenAIEmbeddings

            embed = OpenAIEmbeddings(
                model="text-embedding-3-large",
                # With the `text-embedding-3` class
                # of models, you can specify the size
                # of the embeddings you want returned.
                # dimensions=1024
            )

    Embed single text:
        .. code-block:: python

            input_text = "The meaning of life is 42"
            vector = embed.embed_query(input_text)
            print(vector[:3])

        .. code-block:: python

            [-0.024603435769677162, -0.007543657906353474, 0.0039630369283258915]

    Embed multiple texts:
        .. code-block:: python

            vectors = embed.embed_documents(["hello", "goodbye"])
            # Showing only the first 3 coordinates
            print(len(vectors))
            print(vectors[0][:3])

        .. code-block:: python

            2
            [-0.024603435769677162, -0.007543657906353474, 0.0039630369283258915]

    Async:
        .. code-block:: python

            vector = await embed.aembed_query(input_text)
            print(vector[:3])

            # multiple:
            # await embed.aembed_documents(input_texts)

        .. code-block:: python

            [-0.009100092574954033, 0.005071679595857859, -0.0029193938244134188]
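
    For Azure OpenAI deployments, use ``AzureOpenAIEmbeddings`` instead; this
    class raises a ``ValueError`` if ``openai_api_type`` is set to an Azure
    value.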
    """

    client: Any = Field(default=None, exclude=True)  #: :meta private:
    async_client: Any = Field(default=None, exclude=True)  #: :meta private:
    model: str = "text-embedding-ada-002"
    dimensions: Optional[int] = None
    """The number of dimensions the resulting output embeddings should have.

    Only supported in `text-embedding-3` and later models."""
    # To support Azure-style custom deployment names.
    deployment: Optional[str] = model
    openai_api_version: Optional[str] = Field(
        default_factory=from_env("OPENAI_API_VERSION", default=None),
        alias="api_version",
    )
    openai_api_base: Optional[str] = Field(
        default_factory=from_env("OPENAI_API_BASE", default=None), alias="base_url"
    )
    openai_api_type: Optional[str] = Field(
        default_factory=from_env("OPENAI_API_TYPE", default=None)
    )
    openai_proxy: Optional[str] = Field(
        default_factory=from_env("OPENAI_PROXY", default=None)
    )
    embedding_ctx_length: int = 8191
    """The maximum number of tokens to embed at once."""
    openai_api_key: Optional[SecretStr] = Field(
        alias="api_key",
        default_factory=secret_from_env("OPENAI_API_KEY", default=None),
    )
    openai_organization: Optional[str] = Field(
        alias="organization",
        default_factory=from_env(
            ["OPENAI_ORG_ID", "OPENAI_ORGANIZATION"], default=None
        ),
    )
    allowed_special: Union[Literal["all"], Set[str], None] = None
    disallowed_special: Union[Literal["all"], Set[str], Sequence[str], None] = None
    chunk_size: int = 1000
    """Maximum number of texts to embed in each batch."""
    max_retries: int = 2
    request_timeout: Optional[Union[float, Tuple[float, float], Any]] = Field(
        default=None, alias="timeout"
    )
    headers: Any = None
    tiktoken_enabled: bool = True
    """Set this to False to tokenize with a HuggingFace tokenizer instead of
    tiktoken."""
    tiktoken_model_name: Optional[str] = None
    show_progress_bar: bool = False
    model_kwargs: Dict[str, Any] = Field(default_factory=dict)
    """Holds any model parameters valid for the `create` call that are not
    explicitly specified."""
    skip_empty: bool = False
    default_headers: Union[Mapping[str, str], None] = None
    default_query: Union[Mapping[str, object], None] = None
    retry_min_seconds: int = 4
    retry_max_seconds: int = 20
    http_client: Union[Any, None] = None
    http_async_client: Union[Any, None] = None
    check_embedding_ctx_length: bool = True

    class Config:
        """Configuration for this pydantic object."""

        extra = "forbid"
        allow_population_by_field_name = True

    @root_validator(pre=True)
    def build_extra(cls, values: Dict[str, Any]) -> Dict[str, Any]:
        """Build extra kwargs from additional params that were passed in."""
        all_required_field_names = get_pydantic_field_names(cls)
        extra = values.get("model_kwargs", {})
        for field_name in list(values):
            if field_name in extra:
                raise ValueError(f"Found {field_name} supplied twice.")
            if field_name not in all_required_field_names:
                warnings.warn(
                    f"""WARNING! {field_name} is not default parameter.
                    {field_name} was transferred to model_kwargs.
                    Please confirm that {field_name} is what you intended."""
                )
                extra[field_name] = values.pop(field_name)

        invalid_model_kwargs = all_required_field_names.intersection(extra.keys())
        if invalid_model_kwargs:
            raise ValueError(
                f"Parameters {invalid_model_kwargs} should be specified explicitly. "
                "Instead they were passed in as part of `model_kwargs` parameter."
            )

        values["model_kwargs"] = extra
        return values

    @root_validator(pre=False, skip_on_failure=True, allow_reuse=True)
    def validate_environment(cls, values: Dict[str, Any]) -> Dict[str, Any]:
        """Validate that api key and python package exists in environment."""
        if values["openai_api_type"] in ("azure", "azure_ad", "azuread"):
            raise ValueError(
                "If you are using Azure, "
                "please use the `AzureOpenAIEmbeddings` class."
            )
        client_params = {
            "api_key": (
                values["openai_api_key"].get_secret_value()
                if values["openai_api_key"]
                else None
            ),
            "organization": values["openai_organization"],
            "base_url": values["openai_api_base"],
            "timeout": values["request_timeout"],
            "max_retries": values["max_retries"],
            "default_headers": values["default_headers"],
            "default_query": values["default_query"],
        }
        if values["openai_proxy"] and (
            values["http_client"] or values["http_async_client"]
        ):
            openai_proxy = values["openai_proxy"]
            http_client = values["http_client"]
            http_async_client = values["http_async_client"]
            raise ValueError(
                "Cannot specify 'openai_proxy' if one of "
                "'http_client'/'http_async_client' is already specified. Received:\n"
                f"{openai_proxy=}\n{http_client=}\n{http_async_client=}"
            )
        if not values.get("client"):
            if values["openai_proxy"] and not values["http_client"]:
                try:
                    import httpx
                except ImportError as e:
                    raise ImportError(
                        "Could not import httpx python package. "
                        "Please install it with `pip install httpx`."
                    ) from e
                values["http_client"] = httpx.Client(proxy=values["openai_proxy"])
            sync_specific = {"http_client": values["http_client"]}
            values["client"] = openai.OpenAI(
                **client_params, **sync_specific
            ).embeddings
        if not values.get("async_client"):
            if values["openai_proxy"] and not values["http_async_client"]:
                try:
                    import httpx
                except ImportError as e:
                    raise ImportError(
                        "Could not import httpx python package. "
                        "Please install it with `pip install httpx`."
                    ) from e
                values["http_async_client"] = httpx.AsyncClient(
                    proxy=values["openai_proxy"]
                )
            async_specific = {"http_client": values["http_async_client"]}
            values["async_client"] = openai.AsyncOpenAI(
                **client_params, **async_specific
            ).embeddings
        return values

    @property
    def _invocation_params(self) -> Dict[str, Any]:
        params: Dict = {"model": self.model, **self.model_kwargs}
        if self.dimensions is not None:
            params["dimensions"] = self.dimensions
        return params

    def _tokenize(
        self, texts: List[str], chunk_size: int
    ) -> Tuple[Iterable[int], List[Union[List[int], str]], List[int]]:
        """
        Take the input `texts` and `chunk_size` and return 3 iterables as a tuple:

        We have `batches`, where each batch is a set of individual texts
        for which we want responses from the OpenAI API. The length of a single batch is
        `chunk_size` texts.

        Each individual text is also split into multiple texts based on the
        `embedding_ctx_length` parameter (based on number of tokens).

        This function returns a 3-tuple of the following:

        _iter: An iterable of the starting index in `tokens` for each *batch*
        tokens: A list of tokenized texts, where each text has already been split
            into sub-texts based on the `embedding_ctx_length` parameter. In the
            case of tiktoken, this is a list of token arrays. In the case of
            HuggingFace transformers, this is a list of strings.
        indices: An iterable of the same length as `tokens` that maps each token-array
            to the index of the original text in `texts`.
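
        For example (illustrative): with two input texts where the first is
        split into two sub-texts and `chunk_size=2`, `tokens` has three
        entries, `indices` is `[0, 0, 1]`, and `_iter` yields the batch start
        offsets `0` and `2`.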
        """
        tokens = []
        indices = []
        model_name = self.tiktoken_model_name or self.model

        if not self.tiktoken_enabled:
            # Fall back to a HuggingFace tokenizer when tiktoken is disabled.
            try:
                from transformers import AutoTokenizer
            except ImportError:
                raise ValueError(
                    "Could not import transformers python package. "
                    "This is needed for OpenAIEmbeddings to work without "
                    "`tiktoken`. Please install it with `pip install transformers`. "
                )

            tokenizer = AutoTokenizer.from_pretrained(
                pretrained_model_name_or_path=model_name
            )
            for i, text in enumerate(texts):
                tokenized = tokenizer.encode(text, add_special_tokens=False)

                # Split the token ids into chunks that fit the context window
                # and decode each chunk back into a string.
                for j in range(0, len(tokenized), self.embedding_ctx_length):
                    token_chunk = tokenized[j : j + self.embedding_ctx_length]
                    chunk_text = tokenizer.decode(token_chunk)
                    tokens.append(chunk_text)
                    indices.append(i)
        else:
            try:
                encoding = tiktoken.encoding_for_model(model_name)
            except KeyError:
                encoding = tiktoken.get_encoding("cl100k_base")
            encoder_kwargs: Dict[str, Any] = {
                k: v
                for k, v in {
                    "allowed_special": self.allowed_special,
                    "disallowed_special": self.disallowed_special,
                }.items()
                if v is not None
            }
            for i, text in enumerate(texts):
                if self.model.endswith("001"):
                    # Replace newlines, which can negatively affect performance
                    # for the older *-001 models.
                    text = text.replace("\n", " ")

                if encoder_kwargs:
                    token = encoding.encode(text, **encoder_kwargs)
                else:
                    token = encoding.encode_ordinary(text)

                # Split the tokens into chunks that fit the context window.
                for j in range(0, len(token), self.embedding_ctx_length):
                    tokens.append(token[j : j + self.embedding_ctx_length])
                    indices.append(i)

        if self.show_progress_bar:
            try:
                from tqdm.auto import tqdm

                _iter: Iterable = tqdm(range(0, len(tokens), chunk_size))
            except ImportError:
                _iter = range(0, len(tokens), chunk_size)
        else:
            _iter = range(0, len(tokens), chunk_size)
        return _iter, tokens, indices

    def _get_len_safe_embeddings(
        self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None
    ) -> List[List[float]]:
        """
        Generate length-safe embeddings for a list of texts.

        This method handles tokenization and embedding generation, respecting the
        set embedding context length and chunk size. It supports both tiktoken
        and HuggingFace tokenizer based on the tiktoken_enabled flag.

        Args:
            texts (List[str]): A list of texts to embed.
            engine (str): The engine or model to use for embeddings.
            chunk_size (Optional[int]): The size of chunks for processing embeddings.

        Returns:
            List[List[float]]: A list of embeddings for each input text.
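
        Chunks that belong to the same input text are recombined by
        `_process_batched_chunked_embeddings` (a token-count-weighted average,
        rescaled to unit norm); texts that produced no usable chunks fall back
        to the embedding of the empty string, computed at most once per call.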
        """
        _chunk_size = chunk_size or self.chunk_size
        _iter, tokens, indices = self._tokenize(texts, _chunk_size)
        batched_embeddings: List[List[float]] = []
        for i in _iter:
            response = self.client.create(
                input=tokens[i : i + _chunk_size], **self._invocation_params
            )
            if not isinstance(response, dict):
                response = response.model_dump()
            batched_embeddings.extend(r["embedding"] for r in response["data"])

        embeddings = _process_batched_chunked_embeddings(
            len(texts), tokens, batched_embeddings, indices, self.skip_empty
        )
        _cached_empty_embedding: Optional[List[float]] = None

        def empty_embedding() -> List[float]:
            nonlocal _cached_empty_embedding
            if _cached_empty_embedding is None:
                average_embedded = self.client.create(
                    input="", **self._invocation_params
                )
                if not isinstance(average_embedded, dict):
                    average_embedded = average_embedded.model_dump()
                _cached_empty_embedding = average_embedded["data"][0]["embedding"]
            return _cached_empty_embedding

        return [e if e is not None else empty_embedding() for e in embeddings]

    async def _aget_len_safe_embeddings(
        self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None
    ) -> List[List[float]]:
        """
        Asynchronously generate length-safe embeddings for a list of texts.

        This method handles tokenization and asynchronous embedding generation,
        respecting the set embedding context length and chunk size. It supports both
        `tiktoken` and HuggingFace `tokenizer` based on the tiktoken_enabled flag.

        Args:
            texts (List[str]): A list of texts to embed.
            engine (str): The engine or model to use for embeddings.
            chunk_size (Optional[int]): The size of chunks for processing embeddings.

        Returns:
            List[List[float]]: A list of embeddings for each input text.
        """
        _chunk_size = chunk_size or self.chunk_size
        _iter, tokens, indices = self._tokenize(texts, _chunk_size)
        batched_embeddings: List[List[float]] = []
        for i in range(0, len(tokens), _chunk_size):
            response = await self.async_client.create(
                input=tokens[i : i + _chunk_size], **self._invocation_params
            )
            if not isinstance(response, dict):
                response = response.model_dump()
            batched_embeddings.extend(r["embedding"] for r in response["data"])

        embeddings = _process_batched_chunked_embeddings(
            len(texts), tokens, batched_embeddings, indices, self.skip_empty
        )
        _cached_empty_embedding: Optional[List[float]] = None

        async def empty_embedding() -> List[float]:
            nonlocal _cached_empty_embedding
            if _cached_empty_embedding is None:
                average_embedded = await self.async_client.create(
                    input="", **self._invocation_params
                )
                if not isinstance(average_embedded, dict):
                    average_embedded = average_embedded.model_dump()
                _cached_empty_embedding = average_embedded["data"][0]["embedding"]
            return _cached_empty_embedding

        return [e if e is not None else await empty_embedding() for e in embeddings]

    def embed_documents(
        self, texts: List[str], chunk_size: Optional[int] = 0
    ) -> List[List[float]]:
        """Call out to OpenAI's embedding endpoint for embedding search docs.

        Args:
            texts: The list of texts to embed.
            chunk_size: The chunk size of embeddings. If None, will use the chunk size
                specified by the class.

        Returns:
            List of embeddings, one for each text.
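
        If `check_embedding_ctx_length` is False, each text is sent to the
        API unchanged (one request per text) instead of being tokenized and
        split into length-safe chunks.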
        """
        if not self.check_embedding_ctx_length:
            embeddings: List[List[float]] = []
            for text in texts:
                response = self.client.create(input=text, **self._invocation_params)
                if not isinstance(response, dict):
                    response = response.model_dump()
                embeddings.extend(r["embedding"] for r in response["data"])
            return embeddings

        # The list may contain texts longer than the maximum context size,
        # so use the length-safe path.
        engine = cast(str, self.deployment)
        return self._get_len_safe_embeddings(texts, engine=engine)

    async def aembed_documents(
        self, texts: List[str], chunk_size: Optional[int] = 0
    ) -> List[List[float]]:
        """Call out to OpenAI's embedding endpoint async for embedding search docs.

        Args:
            texts: The list of texts to embed.
            chunk_size: The chunk size of embeddings. If None, will use the chunk size
                specified by the class.

        Returns:
            List of embeddings, one for each text.
        """
        if not self.check_embedding_ctx_length:
            embeddings: List[List[float]] = []
            for text in texts:
                response = await self.async_client.create(
                    input=text, **self._invocation_params
                )
                if not isinstance(response, dict):
                    response = response.model_dump()
                embeddings.extend(r["embedding"] for r in response["data"])
            return embeddings

        # The list may contain texts longer than the maximum context size,
        # so use the length-safe path.
        engine = cast(str, self.deployment)
        return await self._aget_len_safe_embeddings(texts, engine=engine)

    def embed_query(self, text: str) -> List[float]:
        """Call out to OpenAI's embedding endpoint for embedding query text.

        Args:
            text: The text to embed.

        Returns:
            Embedding for the text.
        """
        return self.embed_documents([text])[0]

    async def aembed_query(self, text: str) -> List[float]:
        """Call out to OpenAI's embedding endpoint async for embedding query text.

        Args:
            text: The text to embed.

        Returns:
            Embedding for the text.
        """
        embeddings = await self.aembed_documents([text])
        return embeddings[0]