"""LLM Chains for evaluating question answering."""
from __future__ import annotations

import re
import string
from typing import Any, List, Optional, Sequence, Tuple

from langchain_core.callbacks.manager import Callbacks
from langchain_core.language_models import BaseLanguageModel
from langchain_core.prompts import PromptTemplate

from langchain.chains.llm import LLMChain
from langchain.evaluation.qa.eval_prompt import CONTEXT_PROMPT, COT_PROMPT, PROMPT
from langchain.evaluation.schema import LLMEvalChain, StringEvaluator
from langchain.schema import RUN_KEY


def _get_score(text: str) -> Optional[Tuple[str, int]]:
    match = re.search(r"grade:\s*(correct|incorrect)", text.strip(), re.IGNORECASE)
    if match:
        if match.group(1).upper() == "CORRECT":
            return "CORRECT", 1
        elif match.group(1).upper() == "INCORRECT":
            return "INCORRECT", 0
    try:
        first_word = (
            text.strip().split()[0].translate(str.maketrans("", "", string.punctuation))
        )
        if first_word.upper() == "CORRECT":
            return "CORRECT", 1
        elif first_word.upper() == "INCORRECT":
            return "INCORRECT", 0
        last_word = (
            text.strip()
            .split()[-1]
            .translate(str.maketrans("", "", string.punctuation))
        )
        if last_word.upper() == "CORRECT":
            return "CORRECT", 1
        elif last_word.upper() == "INCORRECT":
            return "INCORRECT", 0
    except IndexError:
        pass
    return None


def _parse_string_eval_output(text: str) -> dict:
    """Parse the output text.

    Args:
        text (str): The output text to parse.

    Returns:
        Any: The parsed output.
    """
    reasoning = text.strip()
    parsed_scores = _get_score(reasoning)
    if parsed_scores is None:
        value, score = None, None
    else:
        value, score = parsed_scores
    return {
        "reasoning": reasoning,
        "value": value,
        "score": score,
    }


class QAEvalChain(LLMChain, StringEvaluator, LLMEvalChain):
    """LLM Chain for evaluating question answering."""

    output_key: str = "results"

    class Config:
        extra = "ignore"

    @classmethod
    def is_lc_serializable(cls) -> bool:
        return False

    @property
    def evaluation_name(self) -> str:
        return "correctness"

    @property
    def requires_reference(self) -> bool:
        return True

    @property
    def requires_input(self) -> bool:
        return True

    @classmethod
    def from_llm(
        cls,
        llm: BaseLanguageModel,
        prompt: Optional[PromptTemplate] = None,
        **kwargs: Any,
    ) -> QAEvalChain:
        """Load QA Eval Chain from LLM.

        Args:
            llm (BaseLanguageModel): the base language model to use.

            prompt (PromptTemplate): A prompt template containing the input_variables:
            'query', 'answer' and 'result' that will be used as the prompt
            for evaluation.
            Defaults to PROMPT.

            **kwargs: additional keyword arguments.

        Returns:
            QAEvalChain: the loaded QA eval chain.
        """
        prompt = prompt or PROMPT
        expected_input_vars = {"query", "answer", "result"}
        if expected_input_vars != set(prompt.input_variables):
            raise ValueError(
                f"Input variables should be {expected_input_vars}, "
                f"but got {prompt.input_variables}"
            )
        return cls(llm=llm, prompt=prompt, **kwargs)

    def evaluate(
        self,
        examples: Sequence[dict],
        predictions: Sequence[dict],
        question_key: str = "query",
        answer_key: str = "answer",
        prediction_key: str = "result",
        *,
        callbacks: Callbacks = None,
    ) -> List[dict]:
        """Evaluate question answering examples and predictions."""
        inputs = [
            {
                "query": example[question_key],
                "answer": example[answer_key],
                "result": predictions[i][prediction_key],
            }
            for i, example in enumerate(examples)
        ]
        return self.apply(inputs, callbacks=callbacks)

    def _prepare_output(self, result: dict) -> dict:
        parsed_result = _parse_string_eval_output(result[self.output_key])
        if RUN_KEY in result:
            parsed_result[RUN_KEY] = result[RUN_KEY]
        return parsed_result

    def _evaluate_strings(
        self,
        *,
        prediction: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        callbacks: Callbacks = None,
        include_run_info: bool = False,
        **kwargs: Any,
    ) -> dict:
        """Evaluate Chain or LLM output, based on optional input and label.

        Args:
            prediction (str): the LLM or chain prediction to evaluate.
            reference (Optional[str], optional): the reference label
                to evaluate against.
            input (Optional[str], optional): the input to consider during evaluation.
            callbacks (Callbacks, optional): the callbacks to use for tracing.
            include_run_info (bool, optional): whether to include run info in the
                returned results.
            **kwargs: additional keyword arguments, including callbacks, tags, etc.

        Returns:
            dict: The evaluation results containing the score or value.
        """
        result = self(
            {"query": input, "answer": reference, "result": prediction},
            callbacks=callbacks,
            include_run_info=include_run_info,
        )
        return self._prepare_output(result)

    async def _aevaluate_strings(
        self,
        *,
        prediction: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        callbacks: Callbacks = None,
        include_run_info: bool = False,
        **kwargs: Any,
    ) -> dict:
        result = await self.acall(
            inputs={"query": input, "answer": reference, "result": prediction},
            callbacks=callbacks,
            include_run_info=include_run_info,
        )
        return self._prepare_output(result)
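

# Illustrative usage sketch (not part of the original module). The chat model
# shown is an assumption; any BaseLanguageModel works. Dataset examples need
# "query"/"answer" keys and predictions a "result" key, matching the defaults
# of QAEvalChain.evaluate() above.
#
#     from langchain_openai import ChatOpenAI  # assumed to be installed
#
#     eval_chain = QAEvalChain.from_llm(ChatOpenAI(temperature=0))
#     examples = [{"query": "What is 2 + 2?", "answer": "4"}]
#     predictions = [{"result": "2 + 2 equals 4."}]
#     graded = eval_chain.evaluate(examples, predictions)
#     # e.g. [{"results": "GRADE: CORRECT"}]; the exact text depends on the model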
   @  s  e Zd ZdZeddddZeddddZedddd	ZG d
d dZ	edddddZ
eddddZed0dddd dddZd1dddddddd dd!d"d#Zd$d$d%d&d'Zdddd(d)dd*d*d ddd$d+d,d-Zdddd(d)dd*d*d ddd$d+d.d/ZdS )2ContextQAEvalChainz3LLM Chain for evaluating QA w/o GT based on contextr:   r;   c                 C  s   dS r<   r(   r=   r(   r(   r)   r?      s    z%ContextQAEvalChain.is_lc_serializablec                 C  s   dS )z.Whether the chain requires a reference string.Tr(   r@   r(   r(   r)   rD      s    z%ContextQAEvalChain.requires_referencec                 C  s   dS )z+Whether the chain requires an input string.Tr(   r@   r(   r(   r)   rE      s    z!ContextQAEvalChain.requires_inputc                   @  s   e Zd ZdZdS )zContextQAEvalChain.Configr3   Nr4   r(   r(   r(   r)   r9      s   r9   r
   None)rI   r   c                 C  s2   dddh}|t |jkr.td| d|j d S )NrK   contextrM   rN   rO   )rQ   rR   rS   )r>   rI   rT   r(   r(   r)   _validate_input_vars   s
    
z'ContextQAEvalChain._validate_input_varsr   c                 C  s   dS )NzContextual Accuracyr(   r@   r(   r(   r)   rB      s    z"ContextQAEvalChain.evaluation_nameNr	   rF   r   rG   c                 K  s&   |pt }| | | f ||d|S )a  Load QA Eval Chain from LLM.

        Args:
            llm (BaseLanguageModel): the base language model to use.

            prompt (PromptTemplate): A prompt template containing the input_variables:
            'query', 'context' and 'result' that will be used as the prompt
            for evaluation.
            Defaults to PROMPT.

            **kwargs: additional keyword arguments.

        Returns:
            ContextQAEvalChain: the loaded QA eval chain.
        rP   )r   r   r>   rH   rI   rJ   r(   r(   r)   rU      s    
zContextQAEvalChain.from_llmrK   r   rM   rV   rX   r   )rY   rZ   r[   context_keyr]   rW   r   c                  s*    fddt |D }| j||dS )r^   c                   s,   g | ]$\}}| |  |  d qS )rK   r   rM   r(   r`   r   r]   rZ   r[   r(   r)   rd     s
   
z/ContextQAEvalChain.evaluate.<locals>.<listcomp>rV   re   )rA   rY   rZ   r[   r   r]   rW   rh   r(   r   r)   ri   	  s    	zContextQAEvalChain.evaluater+   rj   c                 C  s&   t || j }t|kr"|t |t< |S rk   rl   rm   r(   r(   r)   ro     s    z"ContextQAEvalChain._prepare_outputFrp   rt   ru   c                K  s    | |||d||d}|  |S )Nr   rw   rx   ry   r(   r(   r)   rz   %  s    
	z$ContextQAEvalChain._evaluate_stringsc                  s(   | j |||d||dI d H }| |S )Nr   r{   r|   ry   r(   r(   r)   r}   :  s    

z%ContextQAEvalChain._aevaluate_strings)N)rK   r   rM   )r5   r6   r7   r~   r   r?   r   rD   rE   r9   r   rB   rU   ri   ro   rz   r}   r(   r(   r(   r)   r      s@       
r   c                   @  sN   e Zd ZdZeddddZeddddZedd
ddd dddZd	S )CotQAEvalChainz=LLM Chain for evaluating QA using chain of thought reasoning.r:   r;   c                 C  s   dS r<   r(   r=   r(   r(   r)   r?   O  s    z!CotQAEvalChain.is_lc_serializabler   c                 C  s   dS )NzCOT Contextual Accuracyr(   r@   r(   r(   r)   rB   S  s    zCotQAEvalChain.evaluation_nameNr	   rF   r   rG   c                 K  s&   |pt }| | | f ||d|S )zLoad QA Eval Chain from LLM.rP   )r   r   r   r(   r(   r)   rU   W  s    
zCotQAEvalChain.from_llm)N)	r5   r6   r7   r~   r   r?   r   rB   rU   r(   r(   r(   r)   r   L  s    r   )!r~   
__future__r   r   r#   typingr   r   r   r   r   Z langchain_core.callbacks.managerr   Zlangchain_core.language_modelsr	   Zlangchain_core.promptsr
   Zlangchain.chains.llmr   Z#langchain.evaluation.qa.eval_promptr   r   r   Zlangchain.evaluation.schemar   r   Zlangchain.schemar   r*   r/   r0   r   r   r(   r(   r(   r)   <module>   s"    	~