U
    h                     @  sF  d Z ddlmZ ddlZddlZddlZddlZddlZddl	Z	ddl
m
Z
mZ ddlmZmZmZmZmZmZmZmZmZ ddlmZ ddlmZ ddlmZ dd	lmZmZ dd
lm Z m!Z! ddl"m#Z#m$Z$m%Z% ddl"m&Z' ddl"m(Z) ddl*m+Z+m,Z, ddl-m.Z. ddl/m0Z0 ddl1m2Z2m3Z3 ddl4m5Z5m6Z6 ddl4m7Z8 ddl9m:Z:m;Z; ddl<m=Z=m>Z>m?Z?m@Z@mAZA ddlBmCZC ddlDmEZE ddlFmGZG ddlHmIZI ddlJmKZK ddlLmMZMmNZNmOZO ddlPmQZR ddlSm&ZT ddlSmUZUmVZV erddlWZXeYeZZ[eeg eeIe#f f eee\gef e#eIf Z]eeg eeIe#f f ef Z^G dd de_Z`G d d! d!e\ZaG d"d# d#e\Zbdd%d&d'd(d)d*Zcd+d&d,d-d.ZdG d/d0 d0eGZed+d1d,d2d3Zfd4d5d6d7d8d9Zgd4d:d5d6d;d<d=Zhd4d'd5d6d>d?d@Zid'dAdBdCdDdEdFdGZjdHdIdJdKdLdMZkdHdIdJdNdOdPZldHdIdJdQdRdSZmdTdUd&dCdIdJdJdJdVdW	dXdYZndHdIdIdIdZd[d\d]ZodHd&dCdIdIdId^d_d`daZpdddddbdcd+dIddd5dedfdgdhdiZqddddjdkd+dddId5dedldmdndoZrddpd4dqd'd5drdsdtduZsddddjdcd+dddId5dedfdvdwdxZtddddjdkd+dddId5dedydmdzd{Zuddpd4dqd'd5drdsd|d}Zvdd~d&d%d&dedIdddddZwG dd deGddZxejyG dd dZzddddZ{dd6dddZ|dZ}dddddddddd&d%dBdddJdeddJdd+dddZ~dddddddddd&d%dBdddJdeddJdd+dddZdZee_ edde~_ dS )z>Utilities for running language models or Chains over datasets.    )annotationsN)datetimetimezone)	TYPE_CHECKINGAnyCallableDictListOptionalTupleUnioncast)warn_deprecated)	Callbacks)BaseLanguageModel)BaseMessagemessages_from_dict)
ChatResult	LLMResult)RunnableRunnableConfigRunnableLambdaconfig)utils)EvaluatorCallbackHandlerwait_for_all_evaluators)LangChainTracer)Client)get_git_infoget_langchain_env_var_metadata)EvaluationResultRunEvaluator)run_evaluator)as_runnableis_traceable_function)DatasetDataTypeExampleRunTracerSession)LangSmithError)	HTTPError)	TypedDict)Chain)load_evaluator)EvaluatorTypePairwiseStringEvaluatorStringEvaluator)
evaluation)name_generationprogressc                   @  s   e Zd ZdZdS )InputFormatErrorz(Raised when the input format is invalid.N)__name__
__module____qualname____doc__ r;   r;   K/tmp/pip-unpacked-wheel-bo69hh5q/langchain/smith/evaluation/runner_utils.pyr6   N   s   r6   c                   @  s,   e Zd ZdZddddZddddZdS )	
TestResultz1A dictionary of the results of a single test run.pd.DataFramereturnc                 C  s.   |   }dd |jD }|jddj|ddS )zReturn quantiles for the feedback scores.

        This method calculates and prints the quantiles for the feedback scores
        across all feedback keys.

        Returns:
            A DataFrame containing the quantiles for each feedback key.
        c                 S  s6   g | ].}| d s.| ds.|dks.| dr|qS )inputs.outputs.>   outputinput	reference)
startswith).0colr;   r;   r<   
<listcomp>e   s   


z5TestResult.get_aggregate_feedback.<locals>.<listcomp>all)include   )Zaxis)to_dataframecolumnsZdescribeZdrop)selfZdfZto_dropr;   r;   r<   get_aggregate_feedbackX   s
    z!TestResult.get_aggregate_feedbackc              
   C  sJ  zddl }W n, tk
r8 } ztd|W 5 d}~X Y nX g }g }| d  D ]\}}|d }|d}t|trdd | D }	n|dkri }	nd|i}	d	d |d
  D |	}
d|krt|d tr|
dd |d  D  n|d |
d< |
dd |D |d|d |dd ||
 || qN|j||dS )z#Convert the results to a dataframe.r   NzfPandas is required to convert the results to a dataframe. to install pandas, run `pip install pandas`.resultsfeedbackrC   c                 S  s   i | ]\}}d | |qS )rB   r;   rG   kvr;   r;   r<   
<dictcomp>   s      z+TestResult.to_dataframe.<locals>.<dictcomp>c                 S  s   i | ]\}}d | |qS )rA   r;   rS   r;   r;   r<   rV      s      rD   rE   c                 S  s   i | ]\}}d | |qS )z
reference.r;   rS   r;   r;   r<   rV      s      c                 S  s   i | ]}d |j  |jqS )z	feedback.)keyZscore)rG   fr;   r;   r<   rV      s     
 Errorexecution_timerun_id)errorrZ   r[   )index)	pandasImportErroritemsget
isinstancedictupdateappendZ	DataFrame)rO   pdeindicesrecords
example_idresultrR   Zoutput_rC   rr;   r;   r<   rM   o   sJ    


zTestResult.to_dataframeN)r7   r8   r9   r:   rP   rM   r;   r;   r;   r<   r=   U   s   r=   c                      s:   e Zd ZdZdddd fddZddd	d
dZ  ZS )	EvalErrorz"Your architecture raised an error.BaseExceptionr   None)rY   kwargsr@   c                   s   t  jf d|i| d S )NrY   )super__init__)rO   rY   rp   	__class__r;   r<   rr      s    zEvalError.__init__str)namer@   c                 C  s4   z
| | W S  t k
r.   td| dY nX d S )Nz%'EvalError' object has no attribute '')KeyErrorAttributeError)rO   rv   r;   r;   r<   __getattr__   s    
zEvalError.__getattr__)r7   r8   r9   r:   rr   rz   __classcell__r;   r;   rs   r<   rm      s   rm   <my_dataset>MODEL_OR_CHAIN_FACTORYru   MCF)llm_or_chain_factorydataset_namer@   c                   sf  t | trR|   jj}| jdk	rF jjj}td| d| d| d fddS t | tr`| S t | trz| fddS t| rbt	| rt
tt| fd	dS z
|  }W nP tk
r    tt| }t|}td
| d t|fdd Y S X tt| t |tr|S t	tt|rFt
tt|fddS t |ts^fddS S | S )zForgive the user if they pass in a chain without memory instead of a chain
    factory. It's a common mistake. Raise a more helpful error message as well.Na$  Cannot directly evaluate a chain with stateful memory. To evaluate this chain, pass in a chain constructor that initializes fresh memory each time it is called.  This will safegaurd against information leakage between dataset examples.
For example:

def chain_constructor():
    new_memory = z(...)
    return z*(memory=new_memory, ...)

run_on_dataset("z", chain_constructor, ...)c                     s    S Nr;   r;   )chainr;   r<   <lambda>       z(_wrap_in_chain_factory.<locals>.<lambda>c                     s    S r   r;   r;   )lcfr;   r<   r      r   c                     s    S r   r;   r;   	runnable_r;   r<   r      r   zWrapping function z as RunnableLambda.c                     s    S r   r;   r;   )wrappedr;   r<   r      r   c                     s    S r   r;   r;   r   r;   r<   r      r   c                     s   t  S r   )r   r;   )constructorr;   r<   r      r   )rb   r.   rt   r7   Zmemory
ValueErrorr   r   callabler%   r$   r   r   	TypeErrorinspect	signatureloggerinfor   )r   r   Zchain_classZmemory_class_modelZ	user_funcsigr;   )r   r   r   r   r   r<   _wrap_in_chain_factory   sH    









r   zDict[str, Any])inputsr@   c                 C  s4  | st dg }d| krJt| d ts>t dt| d j | d g}nd| krt| d trvtdd | d D st dt| d j | d }nnt| dkrtt	| 
 }t|tr|g}n0t|trtd	d |D r|}nt d
|  nt d|  t|dkr|d S t dt| ddS )zGet prompt from inputs.

    Args:
        inputs: The input dictionary.

    Returns:
        A string prompt.
    Raises:
        InputFormatError: If the input format is invalid.
    Inputs should not be empty.promptz"Expected string for 'prompt', got promptsc                 s  s   | ]}t |tV  qd S r   rb   ru   rG   ir;   r;   r<   	<genexpr>   s    z_get_prompt.<locals>.<genexpr>z,Expected list of strings for 'prompts', got rL   c                 s  s   | ]}t |tV  qd S r   r   r   r;   r;   r<   r     s     z)LLM Run expects string prompt input. Got z5LLM Run expects 'prompt' or 'prompts' in inputs. Got r   z)LLM Run expects single prompt input. Got z	 prompts.N)r6   rb   ru   typer7   listrJ   lennextitervalues)r   r   Zprompt_r;   r;   r<   _get_prompt   s@    

r   c                   @  s   e Zd ZU dZded< dS )ChatModelInputzVInput for a chat model.

    Parameters:
        messages: List of chat messages.
    zList[BaseMessage]messagesNr7   r8   r9   r:   __annotations__r;   r;   r;   r<   r     s   
r   rc   c                 C  s   | st d|  }d| kr,|d|d< n t| dkrLtt|  |d< d|kr|d }t|tr~t	dd |D r~|g}t|dkrt
|d |d< nt d|S t d	|  d
S )zGet Chat Messages from inputs.

    Args:
        inputs: The input dictionary.

    Returns:
        A list of chat messages.
    Raises:
        InputFormatError: If the input format is invalid.
    r   r   rD   rL   c                 s  s   | ]}t |tV  qd S r   )rb   rc   r   r;   r;   r<   r   9  s    z _get_messages.<locals>.<genexpr>r   zGBatch messages not supported. Please provide a single list of messages.zMChat Run expects single List[dict] or List[List[dict]] 'messages' input. Got N)r6   copypopr   r   r   r   rb   r   rJ   r   )r   Z
input_copyZraw_messagesr;   r;   r<   _get_messages%  s,    r   r(   Optional[Callable[[Dict], Any]]ro   )first_exampleinput_mapperr@   c                 C  s   |rP|| j }t|tst|tr4tdd |D std| dt| dnZzt| j  W nJ tk
r   zt| j  W n& tk
r   td| j  dY nX Y nX d S )Nc                 s  s   | ]}t |tV  qd S r   rb   r   rG   msgr;   r;   r<   r   U  s     z>_validate_example_inputs_for_language_model.<locals>.<genexpr>zWhen using an input_mapper to prepare dataset example inputs for an LLM or chat model, the output must a single string or a list of chat messages.
Got: 	 of type .zvExample inputs do not match language model input format. Expected a dictionary with messages or a single prompt. Got: z Please update your dataset OR provide an input_mapper to convert the example.inputs to a compatible format for the llm or chat model you wish to evaluate.)	r   rb   ru   r   rJ   r6   r   r   r   )r   r   Zprompt_inputr;   r;   r<   +_validate_example_inputs_for_language_modelM  s&    

r   r.   )r   r   r   r@   c                 C  s   |rb|| j }t|j|}t|tsBtd| dt| d|rtd|j d|  nP| j }t|j|}t	|dkrt	|jdkrn|rtd|j d|  dS )	z<Validate that the example inputs match the chain input keys.zvWhen using an input_mapper to prepare dataset example inputs for a chain, the mapped value must be a dictionary.
Got: r   r   zAMissing keys after loading example using input_mapper.
Expected: z. Got: rL   zExample inputs missing expected chain input keys. Please provide an input_mapper to convert the example.inputs to a compatible format for the chain you wish to evaluate.Expected: N)
r   set
input_keys
differencerb   rc   r6   r   keysr   )r   r   r   Zfirst_inputsZmissing_keysr;   r;   r<   "_validate_example_inputs_for_chainn  s&    

r   )exampler   r   r@   c                 C  sR   t |trt| | n8| }t |tr4t| || nt |trNtd|  dS )z9Validate that the example inputs are valid for the model.zSkipping input validation for N)rb   r   r   r.   r   r   r   debug)r   r   r   r   r;   r;   r<   _validate_example_inputs  s    


r   List[Example]"Optional[smith_eval.RunEvalConfig]r'   zOptional[List[RunEvaluator]])r   examplesr3   	data_typer@   c           	      C  s   |rzt | trd\}}d}n2d}|  }t |tr6|jnd}t |trJ|jnd}t||||d jrnt|d jnd||}nd}|S )z<Configure the evaluators to run on the results of the chain.)NNllmr   Nr   )rb   r   r.   r   Zoutput_keys_load_run_evaluatorsoutputsr   )	r   r   r3   r   
run_inputsrun_outputsrun_typer   run_evaluatorsr;   r;   r<   _setup_evaluation  s$    

r   zsmith_eval.RunEvalConfigOptional[List[str]]Optional[str])r   r   r@   c                 C  sz   d }| j r6| j }|rv||krvtd| d| d n@|rPt|dkrP|d }n&|d k	rvt|dkrvtd| d |S )Nz
Input key z% not in chain's specified input keys '. Evaluation behavior may be undefined.rL   r   z#Chain expects multiple input keys: z, Evaluator is likely to fail. Evaluation behavior may be undefined. Specify an input_key in the RunEvalConfig to avoid this warning.)	input_keyr   warningr   )r   r   r   r;   r;   r<   _determine_input_key  s    

r   )r   r   r@   c                 C  sz   d }| j r6| j }|rv||krvtd| d| d n@|rPt|dkrP|d }n&|d k	rvt|dkrvtd| d |S )NzPrediction key z& not in chain's specified output keys r   rL   r   z$Chain expects multiple output keys: zl, Evaluation behavior may be undefined. Specify a prediction_key in the RunEvalConfig to avoid this warning.)prediction_keyr   r   r   )r   r   r   r;   r;   r<   _determine_prediction_key  s    

r   )r   example_outputsr@   c                 C  sT   | j r.| j }|rP||krPtd| d| n"|rLt|dkrLt|d }nd }|S )NzReference key z! not in Dataset example outputs: rL   r   )reference_keyr   r   r   )r   r   r   r;   r;   r<   _determine_reference_key  s    r   zYUnion[smith_eval_config.SINGLE_EVAL_CONFIG_TYPE, smith_eval_config.CUSTOM_EVALUATOR_TYPE]zOptional[BaseLanguageModel]r"   )	eval_configeval_llmr   r   r   r   r   r   r@   c              	   C  sB  t | tr| S t | ttfrBt | ts.t| } t| |d}| j}	nt | tjrd|i|  }
t| j	f|
}| j	j}	t | tj
r| jp|}| jp|}| jp|}n"t| rt| S tdt|  t |tr|jr|d krtd|	 d| dtjj|||||||	gd}n.t |tr.td|	 d	ntd|	 d
|S )N)r   r   zUnknown evaluator type: zPMust specify reference_key in smith_eval.RunEvalConfig to use evaluator of type z) with dataset with multiple output keys: r   )r   r   r   tagszRun evaluator for z is not implemented. PairwiseStringEvaluators compare the outputs of two different models rather than the output of a single model. Did you mean to use a StringEvaluator instead?
See: https://python.langchain.com/docs/guides/evaluation/string/z is not implemented)rb   r"   r0   ru   r/   valuesmith_eval_configZ
EvalConfigZ
get_kwargsZevaluator_typeZSingleKeyEvalConfigr   r   r   r   run_evaluator_decr   r   r2   Zrequires_reference
smith_evalStringRunEvaluatorChainfrom_run_and_data_typer1   NotImplementedError)r   r   r   r   r   r   r   r   Z
evaluator_Zeval_type_tagrp   r#   r;   r;   r<   _construct_run_evaluator  sP    



	
	
r   z2Tuple[Optional[str], Optional[str], Optional[str]])r   r   r   r   r@   c                 C  s(   t | |}t| |}t| |}|||fS r   )r   r   r   )r   r   r   r   r   r   r   r;   r;   r<   	_get_keysI  s    


r   zList[RunEvaluator])r   r   r   r   r   r   r@   c                 C  s   g }d\}}}	| j s.| jrBtdd | jD rBt| |||\}}}	| j D ]&}
t|
| j||||	||}|| qH| jpxg }|D ]l}t|tr|| q~t|t	r|t
jj||||||	d q~t|r|t| q~td| dq~|S )z
    Load run evaluators from a configuration.

    Args:
        config: Configuration for the run evaluators.

    Returns:
        A list of run evaluators.
    )NNNc                 S  s   g | ]}t |tqS r;   )rb   r2   )rG   rg   r;   r;   r<   rI   j  s     z(_load_run_evaluators.<locals>.<listcomp>)r   r   r   zUnsupported custom evaluator: z+. Expected RunEvaluator or StringEvaluator.)
evaluatorscustom_evaluatorsanyr   r   r   re   rb   r"   r2   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   r   r   r   r#   r   Zcustom_evaluatorr;   r;   r<   r   U  s\    
   







r   r   	callbacksr   metadatar   r   Optional[Dict[str, Any]]zUnion[str, BaseMessage])r   r   r   r   r   r   r@   c          
        s   |dk	rn||}t |ts6t |tr\tdd |D r\| j|t||pFg |pLi ddI dH S td| dn|z2t|}| j|t||pg |pi ddI dH }W nH tk
r   t|}	| jf |	dt||pg |pi diI dH }Y nX |S )	a  Asynchronously run the language model.

    Args:
        llm: The language model to run.
        inputs: The input dictionary.
        tags: Optional tags to add to the run.
        callbacks: Optional callbacks to use during the run.
        input_mapper: Optional function to map inputs to the expected format.

    Returns:
        The LLMResult or ChatResult.
    Raises:
        ValueError: If the LLM type is unsupported.
        InputFormatError: If the input format is invalid.
    Nc                 s  s   | ]}t |tV  qd S r   r   r   r;   r;   r<   r     s     z_arun_llm.<locals>.<genexpr>r   r   r   r   z%Input mapper returned invalid format 3
Expected a single string or list of chat messages.r   )	rb   ru   r   rJ   ainvoker   r6   r   r   )
r   r   r   r   r   r   prompt_or_messagesr   
llm_output
llm_inputsr;   r;   r<   	_arun_llm  sR      
    r   r   r   r   zUnion[Chain, Runnable]zUnion[dict, str])r   r   r   r   r   r   r@   c          
        s   |dkr|n||}t | trrt |trrt|dkrr| jrrtt| }| j|t	||pZg |p`i ddI dH }n*t	|pzg ||pi d}	| j||	dI dH }|S )z%Run a chain asynchronously on inputs.NrL   r   r   r   r   r   )
rb   r.   rc   r   r   r   r   r   r   r   
r   r   r   r   r   r   Zinputs_valrC   runnable_configr;   r;   r<   _arun_chain  s2    

    r   )r   r   z'Union[dict, str, LLMResult, ChatResult])r   r   r   r   r@   c          	        s   t |trdnd}d}znt |trNt|| j|d |d ||ddI dH }n0| }t|| j|d |d ||ddI dH }|}W nT tk
r } z6t| d| j	 d	| j d
t
|  t|d}W 5 d}~X Y nX |S )a  Asynchronously run the Chain or language model.

    Args:
        example: The example to run.
        llm_or_chain_factory: The Chain or language model constructor to run.
        tags: Optional tags to add to the run.
        callbacks: Optional callbacks to use during the run.
        input_mapper: Optional function to map the input to the expected format.

    Returns:
        A list of outputs.
    LLMr.   Nr   r   r   r    failed for example  with inputs 
rY   )rb   r   r   r   ra   r   	Exceptionr   r   idreprrm   )	r   r   r   r   chain_or_llmrk   rC   r   rg   r;   r;   r<   _arun_llm_or_chain  s:    
	 r   )r   r   r   r   r   r   r@   c          
      C  s   |dk	rj||}t |ts6t |trXtdd |D rX| j|t||pFg |pLi dd}qtd| dnjz,t|}| j|t||pg |pi dd}W n< tk
r   t|}	| jf |	dt||pi d	i}Y nX |S )
a  
    Run the language model on the example.

    Args:
        llm: The language model to run.
        inputs: The input dictionary.
        callbacks: The callbacks to use during the run.
        tags: Optional tags to add to the run.
        input_mapper: function to map to the inputs dictionary from an Example
    Returns:
        The LLMResult or ChatResult.
    Raises:
        ValueError: If the LLM type is unsupported.
        InputFormatError: If the input format is invalid.
    Nc                 s  s   | ]}t |tV  qd S r   r   r   r;   r;   r<   r   O  s     z_run_llm.<locals>.<genexpr>r   r   z'Input mapper returned invalid format:  r   r   )r   r   )	rb   ru   r   rJ   invoker   r6   r   r   )
r   r   r   r   r   r   r   r   Zllm_promptsr   r;   r;   r<   _run_llm1  sJ      
  
r   zUnion[Dict, str]c          
      C  s   |dkr|n||}t | trlt |trlt|dkrl| jrltt| }| j|t	||pZg |p`i dd}n$t	|ptg ||p|i d}	| j||	d}|S )zRun a chain on inputs.NrL   r   r   r   )
rb   r.   rc   r   r   r   r   r   r   r   r   r;   r;   r<   
_run_chaino  s2    

    r   c          
      C  s   t |trdnd}d}zbt |trHt|| j|d |d ||dd}n*| }t|| j|d |d ||dd}|}W n` tk
r } zBt|j}	t	
| d| j d	| j d
|	 d| 	 t|d}W 5 d}~X Y nX |S )a  
    Run the Chain or language model synchronously.

    Args:
        example: The example to run.
        llm_or_chain_factory: The Chain or language model constructor to run.
        tags: Optional tags to add to the run.
        callbacks: Optional callbacks to use during the run.

    Returns:
        Union[List[dict], List[str], List[LLMResult], List[ChatResult]]:
          The outputs of the model or chain.
    r   r.   Nr   r   r   r   r   r   z
Error Type: z, Message: r   )rb   r   r   r   ra   r   r   r   r7   r   r   r   rm   )
r   r   r   r   r   rk   rC   r   rg   Z
error_typer;   r;   r<   _run_llm_or_chain  s<    
	
"r   r   zOptional[Union[str, datetime]]z1Tuple[MCF, TracerSession, Dataset, List[Example]])clientr   r   project_nameproject_metadatar   dataset_versionr@   c              
   C  sj  t ||}| j|d}t| j|j|d}	|	s>td| ddd |	D }
|
rXt|
nd }|rh| nd }zJ|pti }t }|r|d|i}||d< | j	||j|rd	|ini |d
}W nn t
ttfk
r$ } zHdt|kr|t }d| d| d| d}td| d| W 5 d }~X Y nX |jd|j  }td| d| d| d|j dd ||||	fS )N)r   )Z
dataset_idZas_ofzDataset z has no example rows.c                 S  s   g | ]}|j r|j qS r;   )modified_at)rG   exr;   r;   r<   rI     s      z%_prepare_eval_run.<locals>.<listcomp>gitr  r   )Zreference_dataset_idZproject_extrar   zalready exists z+
run_on_dataset(
    ...
    project_name="z - z", # Update since z already exists
)
zTest project z/ already exists. Please use a different name:

z/compare?selectedSessions=z)View the evaluation results for project 'z' at:
z

View all tests for Dataset z at:
T)flush)r   Zread_datasetr   Zlist_examplesr   r   max	isoformatr   Zcreate_projectr,   r+   ru   uuidZuuid4urlprint)r   r   r   r  r  r   r  wrapped_modeldatasetr   r  Zmax_modified_atZinferred_versionZgit_infoprojectrg   uidZexample_msgZcomparison_urlr;   r;   r<   _prepare_eval_run  sV    	
 
r  c                   @  s*   e Zd ZU dZded< ded< ded< dS )	
_RowResultz5A dictionary of the results for a single example row.z Optional[List[EvaluationResult]]rR   zOptional[float]rZ   r   r[   Nr   r;   r;   r;   r<   r    s   
r  F)totalc                   @  s   e Zd ZU dZded< ded< ded< ded	< d
ed< dZded< ddddddZdddddZddddZddd d!d"Z	d5dd$dd%d&d'Z
ed6dd)d*d+d,d-d.d/d0d+d1d d2d3d4ZdS )7_DatasetRunContainerz3A container to help manage the state of a eval run.r   r   r*   r  r~   r  r   r   zList[RunnableConfig]configsNz6Optional[List[smith_eval_config.BATCH_EVALUATOR_LIKE]]batch_evaluatorsr   zDict[str, _RowResult]rc   )batch_resultsall_eval_resultsr@   c                 C  s   i }t | j|D ]\}}tt|t|ji }|j|dg |d|dd|t|j< t|t	r~|j
|t|j d< n||t|j d< |jr|j|t|j d< q|S )NrR   rZ   r[   )rD   rR   rZ   r[   rY   rC   rE   )zipr   r   r  ra   ru   r   r   rb   rm   rY   r   )rO   r  r  rQ   r   rC   Z
row_resultr;   r;   r<   _merge_test_outputs  s    

z(_DatasetRunContainer._merge_test_outputszDict[str, Run]z
List[dict])runsr@   c           	        s   | j }|sg S  fdd| jD }g }tj }|D ]}zR||| j}t|trZ| }|t	t| |j
| jjf|d | jjd W q6 tk
r } ztdt| d|  W 5 d }~X Y q6X q6W 5 Q R X |S )Nc                   s   g | ]} t |j qS r;   )ru   r   rG   r   r  r;   r<   rI   1  s     z>_DatasetRunContainer._run_batch_evaluators.<locals>.<listcomp>)r[   Z
project_idzError running batch evaluator z: )r  r   
concurrentZfuturesZThreadPoolExecutorrb   r!   rc   re   r   Zsubmitr   Zcreate_feedbackr  r   r   r   r\   r   )	rO   r  r   Z	runs_listaggregate_feedbackexecutorZ	evaluatorrk   rg   r;   r  r<   _run_batch_evaluators-  s0    
"z*_DatasetRunContainer._run_batch_evaluatorsz,Tuple[Dict[str, _RowResult], Dict[str, Run]]r?   c                 C  s   i }i }| j D ]}tt|d D ]}t|trf|j}| D ]&\\}}}|t|i 	d|i q<q t|t
r |j}	|	r|	jr|	j|	j  nd }
|	rt|	jnd }|t|ji 	|
||	d |	|t|j< q qttttf ||fS )Nr   rR   )rZ   r[   run)r  r   r   rb   r   Zlogged_eval_resultsr`   
setdefaultru   rd   r   Z
latest_runend_time
start_timetotal_secondsr   rj   r   r  )rO   r  all_runsccallbackZeval_results_rj   rU   r"  rZ   r[   r;   r;   r<   _collect_metricsF  s6    


z%_DatasetRunContainer._collect_metricsz-List[Union[dict, str, LLMResult, ChatResult]]r=   )r  r@   c                 C  sX   t d t  |  \}}d }| jr:t d | |}| ||}t| jj	||dS )Nz#Waiting for evaluators to complete.zRunning session evaluators.)r  rQ   Zaggregate_metrics)
r   r   r   r+  r  r!  r  r=   r  rv   )rO   r  r  r'  r  rQ   r;   r;   r<   _collect_test_resultsc  s    


z*_DatasetRunContainer._collect_test_resultsFbool)r  verboser@   c              
   C  s   |  |}|rZz| }t| W n6 tk
rX } ztdt|  W 5 d }~X Y nX z | jj| j	j
ttjd W n6 tk
r } ztdt|  W 5 d }~X Y nX |S )Nz$Failed to print aggregate feedback: )r$  zFailed to close project: )r,  rP   _display_aggregate_resultsr   r   r   r   r   Zupdate_projectr  r   r   nowr   utc)rO   r  r.  rQ   Zagg_feedbackrg   r;   r;   r<   finishu  s    
& 

&z_DatasetRunContainer.finish   ru   r}   r   r   r   r   intr   Optional[Union[datetime, str]])r   r   r   r  r3   r   r   concurrency_levelr  revision_idr  r@   c              	     s  |p
t  }|
r&|	si }	|	d|
i t ||||	|d\}}}pJg jdpZi  D ]\}}d| d|  q`djd i|
r|
d< t|}t	||||j
ptjt|d || tt| fdd	|D }|  ||||r|jnd d
S )Nr7  )r  r   r  r  zgit:=r  r   c              
     sB   g | ]:}t tj |jd tp"g  |jddgdqS ))r  r   rj   r   )r   r   rj   max_concurrency)r   r   r9  r   )r   r   rv   r   r   r  r   r6  progress_barr  r   Zrun_metadatar   r;   r<   rI     s&   z0_DatasetRunContainer.prepare.<locals>.<listcomp>)r   r  r  r   r  r  )r4   Zrandom_namerd   r  r   ra   r`   re   r   r   r   r'   kvr   r5   ZProgressBarCallbackr   r  )clsr   r   r   r  r3   r   r   r6  r  r7  r  r  r  r   rT   rU   r  r;   r:  r<   prepare  sP    	   
z_DatasetRunContainer.prepare)F)NNNr3  NNN)r7   r8   r9   r:   r   r  r  r!  r+  r,  r2  classmethodr>  r;   r;   r;   r<   r    s*   
       r  r-  r?   c                  C  sJ   z.ddl m}  |  }|  d k	o,dtt|kW S  tk
rD   Y dS X d S )Nr   )get_ipythonZzmqshellF)ZIPythonr@  ru   r   r_   )r@  resr;   r;   r<   _is_jupyter_environment  s    rB  r>   )aggregate_resultsr@   c                 C  sR   t  r,ddlm}m} ||d ||  n"| jdd dd}td t| d S )	Nr   )HTMLdisplayz<h3>Experiment Results:</h3>c                 S  s   | dS )Nz.2fr;   )xr;   r;   r<   r     r   z,_display_aggregate_results.<locals>.<lambda>right)Zfloat_formatZjustifyz
 Experiment Results:)rB  ZIPython.displayrD  rE  Z	to_stringr  )rC  rD  rE  Zformatted_stringr;   r;   r<   r/    s    
 r/  a  The input_mapper argument is deprecated and will be removed in a future release. Please add a  RunnableLambda to your chain to map inputs to the expected format instead. Example:
def construct_chain():
    my_chain = ...
    input_mapper = {'other_key': 'MyOtherInput', 'my_input_key': x}
    return input_mapper | my_chain
run_on_dataset(..., llm_or_chain_factory=construct_chain)
(See https://api.python.langchain.com/en/latest/schema/langchain.schema.runnable.base.RunnableLambda.html)r3  )r3   r  r6  r  r  r.  r7  zOptional[Client]r5  r4  r   )r   r   r   r3   r  r6  r  r  r.  r7  rp   r@   c                  s   |
 dd }|rtdtdd |	d kr2t d}	|
 dd }|rPtdddd |
rntdd	|
  d
dd | pvt } tj| |||||||||	|d}t	j
|jd dfttjt|j|d|j|j I d H }|j||dS )Nr   0.0.305Tmessagependingr7  r   0.1.9qThe tags argument is deprecated and will be removed in a future release. Please specify project_metadata instead.PThe following arguments are deprecated and will be removed in a future release: r   rJ  Zremovalr  r7  r  r   r9  r   r   r.  )r   r   _INPUT_MAPPER_DEP_WARNINGr    ra   r   r   r  r>  runnable_utilsZgather_with_concurrencyr  map	functoolspartialr   r  r   r2  )r   r   r   r3   r  r6  r  r  r.  r7  rp   r   r   	containerr  r;   r;   r<   arun_on_dataset  s\    
rY  c                  s  |
 dd rtdtdd |
 dd }|r<tdddd |	d krPt d}	|
rntdd	|
  d
dd | pvt } tj| ||||||||	|d |dkr fddt	 j
 jD }n@t jd *}t|tjt jd j
 j}W 5 Q R X  j||dS )Nr   rH  TrI  r   rL  rM  r7  rN  r   rO  rP  r   c                   s"   g | ]\}}t || jd qS )rQ  )r   r  )rG   r   r   rX  r   r;   r<   rI   l  s   z"run_on_dataset.<locals>.<listcomp>rQ  rR  )r   r   rS  r    ra   r   r   r  r>  r  r   r  r   Zget_executor_for_configr   rU  rV  rW  r   r  r2  )r   r   r   r3   r  r6  r  r  r.  r7  rp   r   r  r   r;   rZ  r<   run_on_dataset9  sb    

r[  a1  
Run the Chain or language model on a dataset and store traces
to the specified project name.

Args:
    dataset_name: Name of the dataset to run the chain on.
    llm_or_chain_factory: Language model or Chain constructor to run
        over the dataset. The Chain constructor is used to permit
        independent calls on each example without carrying over state.
    evaluation: Configuration for evaluators to run on the
        results of the chain
    concurrency_level: The number of async tasks to run concurrently.
    project_name: Name of the project to store the traces in.
        Defaults to {dataset_name}-{chain class name}-{datetime}.
    project_metadata: Optional metadata to add to the project.
        Useful for storing information the test variant.
        (prompt version, model version, etc.)
    client: LangSmith client to use to access the dataset and to
        log feedback and run traces.
    verbose: Whether to print progress.
    tags: Tags to add to each run in the project.
    revision_id: Optional revision identifier to assign this test run to
        track the performance of different versions of your system.
Returns:
    A dictionary containing the run's project name and the resulting model outputs.


For the (usually faster) async version of this function, see :func:`arun_on_dataset`.

Examples
--------

.. code-block:: python

    from langsmith import Client
    from langchain_openai import ChatOpenAI
    from langchain.chains import LLMChain
    from langchain.smith import smith_eval.RunEvalConfig, run_on_dataset

    # Chains may have memory. Passing in a constructor function lets the
    # evaluation framework avoid cross-contamination between runs.
    def construct_chain():
        llm = ChatOpenAI(temperature=0)
        chain = LLMChain.from_string(
            llm,
            "What's the answer to {your_input_key}"
        )
        return chain

    # Load off-the-shelf evaluators via config or the EvaluatorType (string or enum)
    evaluation_config = smith_eval.RunEvalConfig(
        evaluators=[
            "qa",  # "Correctness" against a reference answer
            "embedding_distance",
            smith_eval.RunEvalConfig.Criteria("helpfulness"),
            smith_eval.RunEvalConfig.Criteria({
                "fifth-grader-score": "Do you have to be smarter than a fifth grader to answer this question?"
            }),
        ]
    )

    client = Client()
    run_on_dataset(
        client,
        dataset_name="<my_dataset_name>",
        llm_or_chain_factory=construct_chain,
        evaluation=evaluation_config,
    )

You can also create custom evaluators by subclassing the
:class:`StringEvaluator <langchain.evaluation.schema.StringEvaluator>`
or LangSmith's `RunEvaluator` classes.

.. code-block:: python

    from typing import Optional
    from langchain.evaluation import StringEvaluator

    class MyStringEvaluator(StringEvaluator):

        @property
        def requires_input(self) -> bool:
            return False

        @property
        def requires_reference(self) -> bool:
            return True

        @property
        def evaluation_name(self) -> str:
            return "exact_match"

        def _evaluate_strings(self, prediction, reference=None, input=None, **kwargs) -> dict:
            return {"score": prediction == reference}


    evaluation_config = smith_eval.RunEvalConfig(
        custom_evaluators = [MyStringEvaluator()],
    )

    run_on_dataset(
        client,
        dataset_name="<my_dataset_name>",
        llm_or_chain_factory=construct_chain,
        evaluation=evaluation_config,
    )
zrun_on_dataset(zawait arun_on_dataset()r|   )NNN)r:   
__future__r   concurrent.futuresr  ZdataclassesrV  r   loggingr
  r   r   typingr   r   r   r   r	   r
   r   r   r   Zlangchain_core._apir   Z langchain_core.callbacks.managerr   Zlangchain_core.language_modelsr   Zlangchain_core.messagesr   r   Zlangchain_core.outputsr   r   Zlangchain_core.runnablesr   r   r   r   r   r   rT  Z!langchain_core.tracers.evaluationr   r   Z langchain_core.tracers.langchainr   Zlangsmith.clientr   Zlangsmith.envr   r    Zlangsmith.evaluationr!   r"   r#   r   Zlangsmith.run_helpersr$   r%   Zlangsmith.schemasr&   r'   r(   r)   r*   Zlangsmith.utilsr+   requestsr,   Ztyping_extensionsr-   Zlangchain.chains.baser.   Zlangchain.evaluation.loadingr/   Zlangchain.evaluation.schemar0   r1   r2   Zlangchain.smithr3   r   Zlangchain.smith.evaluationr   r4   r5   r^   rf   	getLoggerr7   r   rc   r}   r~   r   r6   r=   rm   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  Z	dataclassr  rB  r/  rS  rY  r[  Z_RUN_ON_DATASET_DOCSTRINGreplacer;   r;   r;   r<   <module>   s   ,
I =3
(!%CGE%>C%=   = C
(F(Mk 