U
    hx                     @  sj  d Z ddlmZ ddlZddlZddlZddlmZ ddlm	Z	m
Z
mZmZmZmZmZmZmZmZ ddlmZ ddlmZ zddlmZmZmZmZ W n, ek
r   ddlmZmZmZmZ Y nX ddlZdd	lm Z  dd
l!m"Z"m#Z#m$Z$m%Z% e&e'Z(G dd deZ)G dd deddZ*G dd deZ+G dd deddZ,G dd dZ-ee+e,e.f Z/G dd deZ0ee0e.f Z1G dd de-Z2ddddZ3dZ4d d!d"d#Z5d$d$d%d&d'Z6G d(d) d)Z7d*d)d+d,d-Z8d.d/d+d0d1Z9d.d2d+d3d4Z:d5d6d7d8d9Z;eeeej% eej$ gee+e,f f eeej% eej$ gee+e,f f f Z<d.d:d+d;d<Z=dS )=z?This module contains the evaluator classes for evaluating runs.    )annotationsN)abstractmethod)
Any	AwaitableCallableDictListLiteralOptionalSequenceUnioncast)	TypedDict)schemas)	BaseModelFieldValidationError	validator)wraps)
SCORE_TYPE
VALUE_TYPEExampleRunc                   @  s"   e Zd ZU dZded< ded< dS )Categoryz$A category for categorical feedback.Optional[Union[float, int]]valuestrlabelN__name__
__module____qualname____doc____annotations__ r$   r$   B/tmp/pip-unpacked-wheel-cqvhoa9t/langsmith/evaluation/evaluator.pyr   1   s   
r   c                   @  s2   e Zd ZU dZded< ded< ded< ded< d	S )
FeedbackConfigziConfiguration to define a type of feedback.

    Applied on on the first creation of a feedback_key.
    z2Literal[('continuous', 'categorical', 'freeform')]typer   minmaxz%Optional[List[Union[Category, dict]]]
categoriesNr   r$   r$   r$   r%   r&   :   s
   
r&   F)totalc                   @  s   e Zd ZU dZded< dZded< dZded< dZd	ed
< dZded< e	e
dZded< dZded< dZded< dZded< dZded< G dd dZeddddd ZdS )EvaluationResultzEvaluation result.r   keyNr   scorer   r   zOptional[str]commentzOptional[Dict]
correction)default_factoryr   evaluator_infoz%Optional[Union[FeedbackConfig, dict]]feedback_configOptional[Union[uuid.UUID, str]]source_run_idtarget_run_idextrac                   @  s   e Zd ZdZdZdS )zEvaluationResult.ConfigzPydantic model configuration.FN)r   r    r!   r"   Zallow_extrar$   r$   r$   r%   Configd   s   r8   T)prec                 C  s6   d|ks|d dkr2t |ttfr2td|  |S )z$Check that the value is not numeric.r.   NzJNumeric values should be provided in the 'score' field, not 'value'. Got: )
isinstanceintfloatloggerwarning)clsvvaluesr$   r$   r%   check_value_non_numerici   s    z(EvaluationResult.check_value_non_numeric)r   r    r!   r"   r#   r.   r   r/   r0   r   dictr2   r3   r5   r6   r7   r8   r   rB   r$   r$   r$   r%   r,   I   s   

r,   c                   @  s   e Zd ZU dZded< dS )EvaluationResultszqBatch evaluation results.

    This makes it easy for your evaluator to return multiple
    metrics at once.
    zList[EvaluationResult]resultsNr   r$   r$   r$   r%   rD   x   s   
rD   c                   @  s<   e Zd ZdZedddddddZdddddd	d
ZdS )RunEvaluatorzEvaluator interface class.Nr   Optional[Example]*Union[EvaluationResult, EvaluationResults]runexamplereturnc                 C  s   dS )zEvaluate an example.Nr$   selfrJ   rK   r$   r$   r%   evaluate_run   s    zRunEvaluator.evaluate_runc                   s   t  d| j||I dH S )z#Evaluate an example asynchronously.N)asyncioZget_running_loopZrun_in_executorrO   rM   r$   r$   r%   aevaluate_run   s       zRunEvaluator.aevaluate_run)N)N)r   r    r!   r"   r   rO   rQ   r$   r$   r$   r%   rF      s     rF   c                   @  s:   e Zd ZU dZded< ded< dZded< dZd	ed
< dS )ComparisonEvaluationResultzFeedback scores for the results of comparative evaluations.

    These are generated by functions that compare two or more runs,
    returning a ranking or other feedback.
    r   r-   z'Dict[Union[uuid.UUID, str], SCORE_TYPE]scoresNr4   r5   z6Optional[Union[str, Dict[Union[uuid.UUID, str], str]]]r/   )r   r    r!   r"   r#   r5   r/   r$   r$   r$   r%   rR      s   
rR   c                      s   e Zd ZdZd)dddddZd*d	d
dddddZdd
ddddZdd
ddddZeddddZ	d+dddddd Z
d,ddd! fd"d#Zd-ddddd$d%Zd&dd'd(Z  ZS ).DynamicRunEvaluatora  A dynamic evaluator that wraps a function and transforms it into a `RunEvaluator`.

    This class is designed to be used with the `@run_evaluator` decorator, allowing
    functions that take a `Run` and an optional `Example` as arguments, and return
    an `EvaluationResult` or `EvaluationResults`, to be used as instances of `RunEvaluator`.

    Attributes:
        func (Callable): The function that is wrapped by this evaluator.
    NXCallable[[Run, Optional[Example]], Union[_RUNNABLE_OUTPUT, Awaitable[_RUNNABLE_OUTPUT]]]zIOptional[Callable[[Run, Optional[Example]], Awaitable[_RUNNABLE_OUTPUT]]]funcafuncc                 C  s   t |}|rt |}t||  ddlm} |dk	rR|j|td| _t|dd| _t	
|r|dk	rltd|j|td| _t|dd| _n4|jttttt gtf |td| _t|dd| _dS )zInitialize the DynamicRunEvaluator with a given function.

        Args:
            func (Callable): A function that takes a `Run` and an optional `Example` as
            arguments, and returns a dict or `ComparisonEvaluationResult`.
        r   run_helpersNZprocess_inputsr   rT   Func was provided as a coroutine function, but afunc was also provided. If providing both, func should be a regular function to avoid ambiguity.)_normalize_evaluator_funcr   	langsmithrZ   ensure_traceable_serialize_inputsrX   getattr_nameinspectiscoroutinefunction	TypeErrorr   r   r   r
   r   _RUNNABLE_OUTPUTrW   rN   rW   rX   rZ   r$   r$   r%   __init__   s4     
 zDynamicRunEvaluator.__init__FzUnion[EvaluationResult, dict]	uuid.UUIDboolr,   )resultr5   allow_no_keyrL   c              
     s   t  tr js| _ S z` s.td  d krD|rD| j d< t fdddD rhtd  tf d|i W S  tk
r } ztd  |W 5 d }~X Y nX d S )	NziExpected an EvaluationResult object, or dict with a metric 'key' and optional 'score'; got empty result: r-   c                 3  s   | ]}| kV  qd S Nr$   ).0krk   r$   r%   	<genexpr>   s     z@DynamicRunEvaluator._coerce_evaluation_result.<locals>.<genexpr>)r.   r   r/   zrExpected an EvaluationResult object, or dict with a metric 'key' and optional 'score' or categorical 'value'; got r5   z[Expected an EvaluationResult object, or dict with a metric 'key' and optional 'score'; got )r:   r,   r5   
ValueErrorrb   allr   )rN   rk   r5   rl   er$   rp   r%   _coerce_evaluation_result   s,    

z-DynamicRunEvaluator._coerce_evaluation_resultzUnion[dict, EvaluationResults]rH   )rE   r5   rL   c                   sL   d|kr6|  } fdd|d D |d< tf |S  jtt|ddS )NrE   c                   s   g | ]} j |d qS ))r5   )ru   )rn   rrN   r5   r$   r%   
<listcomp>  s   zBDynamicRunEvaluator._coerce_evaluation_results.<locals>.<listcomp>T)r5   rl   )copyrD   ru   r   rC   )rN   rE   r5   cpr$   rw   r%   _coerce_evaluation_results  s    

  z.DynamicRunEvaluator._coerce_evaluation_resultszMUnion[EvaluationResult, EvaluationResults, dict, str, int, bool, float, list])rk   r5   rL   c                 C  s.   t |tr|js||_|S t|}| ||S rm   )r:   r,   r5   _format_evaluator_resultr{   )rN   rk   r5   r$   r$   r%   _format_result  s    
z"DynamicRunEvaluator._format_resultrL   c                 C  s
   t | dS zCheck if the evaluator function is asynchronous.

        Returns:
            bool: True if the evaluator function is asynchronous, False otherwise.
        rX   hasattrrN   r$   r$   r%   is_async'  s    zDynamicRunEvaluator.is_asyncr   rG   rI   c                 C  s   t | ds6t }| r$tdn|| ||S t }d|j	i}t
|ddrbt|j|d< | j||||dd}| ||S )	a  Evaluate a run using the wrapped function.

        This method directly invokes the wrapped function with the provided arguments.

        Args:
            run (Run): The run to be evaluated.
            example (Optional[Example]): An optional example to be used in the evaluation.

        Returns:
            Union[EvaluationResult, EvaluationResults]: The result of the evaluation.
        rW   tCannot call `evaluate_run` on an async run evaluator from within an running event loop. Use `aevaluate_run` instead.r6   
session_idN
experimentrun_idmetadataZlangsmith_extra)r   rP   get_event_loop
is_runningRuntimeErrorrun_until_completerQ   uuiduuid4idra   r   r   rW   r}   )rN   rJ   rK   running_loopr5   r   rk   r$   r$   r%   rO   0  s"    

z DynamicRunEvaluator.evaluate_runrJ   rK   c                   sr   t | dst ||I dH S t }d|ji}t|ddrJt|j|d< | j	||||ddI dH }| 
||S )a  Evaluate a run asynchronously using the wrapped async function.

        This method directly invokes the wrapped async function with the
            provided arguments.

        Args:
            run (Run): The run to be evaluated.
            example (Optional[Example]): An optional example to be used
                in the evaluation.

        Returns:
            Union[EvaluationResult, EvaluationResults]: The result of the evaluation.
        rX   Nr6   r   r   r   r   )r   superrQ   r   r   r   ra   r   r   rX   r}   )rN   rJ   rK   r5   r   rk   	__class__r$   r%   rQ   R  s    

z!DynamicRunEvaluator.aevaluate_runc                 C  s   |  ||S )a  Make the evaluator callable, allowing it to be used like a function.

        This method enables the evaluator instance to be called directly, forwarding the
        call to `evaluate_run`.

        Args:
            run (Run): The run to be evaluated.
            example (Optional[Example]): An optional example to be used in the evaluation.

        Returns:
            Union[EvaluationResult, EvaluationResults]: The result of the evaluation.
        )rO   rM   r$   r$   r%   __call__m  s    zDynamicRunEvaluator.__call__r   c                 C  s   d| j  dS ))Represent the DynamicRunEvaluator object.z<DynamicRunEvaluator >rb   r   r$   r$   r%   __repr__~  s    zDynamicRunEvaluator.__repr__)N)F)N)N)N)r   r    r!   r"   rh   ru   r{   r}   propertyr   rO   rQ   r   r   __classcell__r$   r$   r   r%   rT      s    6 	 " rT   rU   rW   c                 C  s   t | S )zmCreate a run evaluator from a function.

    Decorator that transforms a function into a `RunEvaluator`.
    )rT   r   r$   r$   r%   run_evaluator  s    	r   i'  r   )objc                 C  s,   t | }t|tkr(|d td  d }|S )N   z...))reprlen_MAXSIZE)r   sr$   r$   r%   _maxsize_repr  s    r   rC   )inputsrL   c                 C  s&   t | d}t | d}||dS )NrJ   rK   r   )r   get)r   Zrun_truncatedZexample_truncatedr$   r$   r%   r`     s    r`   c                   @  s   e Zd ZdZd"dddddZedd	d
dZd#ddddddZd$ddddddZd%ddddddZ	dd	ddZ
edddddZdddddd d!ZdS )&DynamicComparisonRunEvaluatorz4Compare predictions (as traces) from 2 or more runs.NfCallable[[Sequence[Run], Optional[Example]], Union[_COMPARISON_OUTPUT, Awaitable[_COMPARISON_OUTPUT]]]zUOptional[Callable[[Sequence[Run], Optional[Example]], Awaitable[_COMPARISON_OUTPUT]]]rV   c                 C  s   t |}|rt |}t||  ddlm} |dk	rR|j|td| _t|dd| _t	
|r|dk	rltd|j|td| _t|dd| _n8|jtttt tt gtf |td| _t|dd| _dS )zInitialize the DynamicRunEvaluator with a given function.

        Args:
            func (Callable): A function that takes a `Run` and an optional `Example` as
            arguments, and returns an `EvaluationResult` or `EvaluationResults`.
        r   rY   Nr[   r   rT   r\   )$_normalize_comparison_evaluator_funcr   r^   rZ   r_   r`   rX   ra   rb   rc   rd   re   r   r   r   r   r
   r   _COMPARISON_OUTPUTrW   rg   r$   r$   r%   rh     sB     
 
z&DynamicComparisonRunEvaluator.__init__rj   r~   c                 C  s
   t | dS r   r   r   r$   r$   r%   r     s    z&DynamicComparisonRunEvaluator.is_asyncSequence[Run]rG   rR   runsrK   rL   c                 C  sl   t | ds6t }| r$tdn|| ||S t }| 	|}| j
||||dd}| |||S )zCompare runs to score preferences.

        Args:
            runs: A list of runs to compare.
            example: An optional example to be used in the evaluation.

        rW   r   r   tagsr   )r   rP   r   r   r   r   acompare_runsr   r   	_get_tagsrW   _format_results)rN   r   rK   r   r5   r   rk   r$   r$   r%   compare_runs  s"    



z*DynamicComparisonRunEvaluator.compare_runsc                   sR   t | ds| ||S t }| |}| j||||ddI dH }| |||S )a  Evaluate a run asynchronously using the wrapped async function.

        This method directly invokes the wrapped async function with the
            provided arguments.

        Args:
            runs (Run): The runs to be evaluated.
            example (Optional[Example]): An optional example to be used
                in the evaluation.

        Returns:
            ComparisonEvaluationResult: The result of the evaluation.
        rX   r   r   N)r   r   r   r   r   rX   r   )rN   r   rK   r5   r   rk   r$   r$   r%   r     s    

z+DynamicComparisonRunEvaluator.acompare_runsc                 C  s   |  ||S )a  Make the evaluator callable, allowing it to be used like a function.

        This method enables the evaluator instance to be called directly, forwarding the
        call to `evaluate_run`.

        Args:
            run (Run): The run to be evaluated.
            example (Optional[Example]): An optional example to be used in the evaluation.

        Returns:
            ComparisonEvaluationResult: The result of the evaluation.
        )r   )rN   r   rK   r$   r$   r%   r     s    z&DynamicComparisonRunEvaluator.__call__r   c                 C  s   d| j  dS )r   z<DynamicComparisonRunEvaluator r   r   r   r$   r$   r%   r   /  s    z&DynamicComparisonRunEvaluator.__repr__z	List[str])r   rL   c                 C  sF   g }| D ]8}| dt|j  t|ddr| dt|j  q|S )zExtract tags from runs.zrun:r   Nzexperiment:)appendr   r   ra   r   )r   r   rJ   r$   r$   r%   r   3  s    z'DynamicComparisonRunEvaluator._get_tagsz-Union[dict, list, ComparisonEvaluationResult]ri   )rk   r5   r   rL   c              
   C  s   t |tr|js||_|S t |trDdd t||D | j|d}n0t |trbd|krt| j|d< nd|}t|ztf d|i|W S  tk
r } ztd| |W 5 d }~X Y nX d S )Nc                 S  s   i | ]\}}|j |qS r$   )r   )rn   rJ   r.   r$   r$   r%   
<dictcomp>J  s      zADynamicComparisonRunEvaluator._format_results.<locals>.<dictcomp>)rS   r-   r5   r-   zXExpected 'dict', 'list' or 'ComparisonEvaluationResult' result object. Received: result=r5   zExpected a dictionary with a 'key' and dictionary of scores mappingrun IDs to numeric scores, or ComparisonEvaluationResult object, got )	r:   rR   r5   listziprb   rC   rr   r   )rN   rk   r5   r   msgrt   r$   r$   r%   r   >  s2    



z-DynamicComparisonRunEvaluator._format_results)N)N)N)N)r   r    r!   r"   rh   r   r   r   r   r   r   staticmethodr   r   r$   r$   r$   r%   r     s    8	    
r   r   )rW   rL   c                 C  s   t | S )z.Create a comaprison evaluator from a function.)r   r   r$   r$   r%   comparison_evaluatorc  s    r   r   z|Union[Callable[[Run, Optional[Example]], _RUNNABLE_OUTPUT], Callable[[Run, Optional[Example]], Awaitable[_RUNNABLE_OUTPUT]]]c                   s  dt  }dd |j D rHtfddD s^tdkr^d d}t|ntfd	dD rd
dgkr S t  rdddd fdd}t drt	 dn|j
|_
|S dddd fdd}t drt	 dn|j
|_
|S d S )NrJ   rK   r   outputsreference_outputsc                 S  s&   g | ]\}}|j |j|jfkr|qS r$   kindPOSITIONAL_OR_KEYWORDPOSITIONAL_ONLYrn   pnamepr$   r$   r%   rx   u  s   z-_normalize_evaluator_func.<locals>.<listcomp>c                 3  s   | ]}| kV  qd S rm   r$   rn   r   supported_argsr$   r%   rq   {  s     z,_normalize_evaluator_func.<locals>.<genexpr>   kInvalid evaluator function. Must have at least one positional argument. Supported positional arguments are . Please see https://docs.smith.langchain.com/evaluation/how_to_guides/evaluation/evaluate_llm_application#use-custom-evaluatorsc                 3  s   | ]}| kV  qd S rm   r$   r   r   r$   r%   rq     s    rJ   rK   r   rG   rf   rI   c                   sN   | ||r|j ni | jpi |r&|jp(i ni d  fddD }| I d H S )Nr   c                 3  s   | ]} | V  qd S rm   r$   rn   argZarg_mapr$   r%   rq     s     z>_normalize_evaluator_func.<locals>.awrapper.<locals>.<genexpr>r   r   rJ   rK   argsrW   positional_argsr   r%   awrapper  s    z+_normalize_evaluator_func.<locals>.awrapperr   r   c                   sH   | ||r|j ni | jpi |r&|jp(i ni d  fddD }| S )Nr   c                 3  s   | ]} | V  qd S rm   r$   r   r   r$   r%   rq     s     z=_normalize_evaluator_func.<locals>.wrapper.<locals>.<genexpr>r   r   r   r   r%   wrapper  s    z*_normalize_evaluator_func.<locals>.wrapperrc   	signature
parametersitemsrs   r   rr   rd   r   ra   r   rW   sigr   r   r   r$   rW   r   r   r%   r]   m  s@    





r]   zUnion[Callable[[Sequence[Run], Optional[Example]], _COMPARISON_OUTPUT], Callable[[Sequence[Run], Optional[Example]], Awaitable[_COMPARISON_OUTPUT]]]c                   s  dt  }dd |j D rHtfddD s^tdkr^d d}t|ntfd	dD rd
dgkr S t  rdddd fdd}t drt	 dn|j
|_
|S dddd fdd}t drt	 dn|j
|_
|S d S )Nr   rK   r   r   r   c                 S  s&   g | ]\}}|j |j|jfkr|qS r$   r   r   r$   r$   r%   rx     s   z8_normalize_comparison_evaluator_func.<locals>.<listcomp>c                 3  s   | ]}| kV  qd S rm   r$   r   r   r$   r%   rq     s     z7_normalize_comparison_evaluator_func.<locals>.<genexpr>r   r   r   c                 3  s   | ]}| kV  qd S rm   r$   r   r   r$   r%   rq     s    r   rK   r   rG   r   r   c                   sR   | ||r|j ni dd | D |r*|jp,i ni d  fddD }| I d H S )Nc                 S  s   g | ]}|j pi qS r$   r   rn   rJ   r$   r$   r%   rx     s     zJ_normalize_comparison_evaluator_func.<locals>.awrapper.<locals>.<listcomp>r   c                 3  s   | ]} | V  qd S rm   r$   r   r   r$   r%   rq     s     zI_normalize_comparison_evaluator_func.<locals>.awrapper.<locals>.<genexpr>r   r   rK   r   r   r   r%   r     s    z6_normalize_comparison_evaluator_func.<locals>.awrapperr   r   c                   sL   | ||r|j ni dd | D |r*|jp,i ni d  fddD }| S )Nc                 S  s   g | ]}|j pi qS r$   r   r   r$   r$   r%   rx     s     zI_normalize_comparison_evaluator_func.<locals>.wrapper.<locals>.<listcomp>r   c                 3  s   | ]} | V  qd S rm   r$   r   r   r$   r%   rq     s     zH_normalize_comparison_evaluator_func.<locals>.wrapper.<locals>.<genexpr>r   r   r   r   r%   r     s    z5_normalize_comparison_evaluator_func.<locals>.wrapperr   r   r$   r   r%   r     s@    





r   z;Union[EvaluationResults, dict, str, int, bool, float, list]zUnion[EvaluationResults, dict])rk   rL   c                 C  s   t | tttfrd| i} nx| s.td|  ndt | trdtdd | D sZtd|  dd| i} n.t | trxd| i} nt | trntd	|  | S )
Nr.   zdExpected a non-empty dict, str, bool, int, float, list, EvaluationResult, or EvaluationResults. Got c                 s  s   | ]}t |tV  qd S rm   )r:   rC   )rn   xr$   r$   r%   rq     s     z+_format_evaluator_result.<locals>.<genexpr>z8Expected a list of dicts or EvaluationResults. Received .rE   r   zZExpected a dict, str, bool, int, float, list, EvaluationResult, or EvaluationResults. Got )	r:   rj   r<   r;   rr   r   rs   r   rC   rp   r$   r$   r%   r|     s(    






r|   SUMMARY_EVALUATOR_Tc                   s   dt  }dd |j D rHtfddD srtdkrrd d}rh|d	 d7 }t|n^tfd
dD rddgkr S dddd fdd}t drt dn|j	|_	|S d S )Nr   examplesr   r   r   c                 S  s&   g | ]\}}|j |j|jfkr|qS r$   r   r   r$   r$   r%   rx   ,  s   z0_normalize_summary_evaluator.<locals>.<listcomp>c                 3  s   | ]}| kV  qd S rm   r$   r   r   r$   r%   rq   2  s     z/_normalize_summary_evaluator.<locals>.<genexpr>r   r   r   z Received positional arguments c                 3  s   | ]}| kV  qd S rm   r$   r   r   r$   r%   rq   >  s    r   r   zSequence[schemas.Run]zSequence[schemas.Example]rH   )r   r   rL   c                   s^   | |dd |D dd | D dd |D d  fddD }| }t |trV|S t|S )Nc                 S  s   g | ]
}|j qS r$   )r   rn   rK   r$   r$   r%   rx   J  s     zA_normalize_summary_evaluator.<locals>.wrapper.<locals>.<listcomp>c                 S  s   g | ]}|j pi qS r$   r   r   r$   r$   r%   rx   K  s     c                 S  s   g | ]}|j pi qS r$   r   r   r$   r$   r%   rx   L  s     r   c                 3  s   | ]} | V  qd S rm   r$   r   r   r$   r%   rq   N  s     z@_normalize_summary_evaluator.<locals>.wrapper.<locals>.<genexpr>)r:   r,   r|   )r   r   r   rk   r   r   r%   r   D  s    
z-_normalize_summary_evaluator.<locals>.wrapperr   )
rc   r   r   r   rs   r   rr   r   ra   r   )rW   r   r   r   r$   r   r%   _normalize_summary_evaluator)  s2    




r   )>r"   
__future__r   rP   rc   r   abcr   typingr   r   r   r   r   r	   r
   r   r   r   Ztyping_extensionsr   r^   r   Zpydantic.v1r   r   r   r   ImportErrorZpydanticlogging	functoolsr   Zlangsmith.schemasr   r   r   r   	getLoggerr   r=   r   r&   r,   rD   rF   rC   rf   rR   r   rT   r   r   r   r`   r   r   r]   r   r|   r   r   r$   r$   r$   r%   <module>   sd   0
	/ W E
JJ

