"""Beta utility functions to assist in common eval workflows.

These functions may change in the future.
    N)DefaultDictListOptionalSequenceTupleTypeVar)
evaluation)	warn_beta)Client)run_dictid_mapc                 C   sf   | d }|  D ]\}}|t|t|}q|| d< | drP|| d  | d< | dsbi | d< | S )a  Convert the IDs in the run dictionary using the provided ID map.

    Parameters:
    - run_dict (dict): The dictionary representing a run.
    - id_map (dict): The dictionary mapping old IDs to new IDs.

    Returns:
    - dict: The updated run dictionary.
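
    Example:
    --------
    A minimal illustrative sketch (the UUIDs and dotted order are made up):

    .. code-block:: python

        import uuid

        old, new = uuid.UUID(int=1), uuid.UUID(int=2)
        run = {"dotted_order": f"20240101T000000000000Z{old}", "extra": None}
        updated = _convert_ids(run, {old: new})
        assert str(new) in updated["dotted_order"]
        assert updated["extra"] == {}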
    """
    do = run_dict["dotted_order"]
    for k, v in id_map.items():
        do = do.replace(str(k), str(v))
    run_dict["dotted_order"] = do
    if run_dict.get("parent_run_id"):
        run_dict["parent_run_id"] = id_map[run_dict["parent_run_id"]]
    if not run_dict.get("extra"):
        run_dict["extra"] = {}
    return run_dict


def _convert_root_run(root: ls_schemas.Run, run_to_example_map: dict) -> List[dict]:
    """Convert the root run and its child runs to a list of dictionaries.

    Parameters:
    - root (ls_schemas.Run): The root run to convert.
    - run_to_example_map (dict): The dictionary mapping run IDs to example IDs.

    Returns:
    - List[dict]: The list of converted run dictionaries.
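
    Example:
    --------
    A minimal sketch (assumes ``client`` is a ``Client``; the run and example
    IDs are placeholders for real ones):

    .. code-block:: python

        root = client.read_run("<prod-run-id>", load_child_runs=True)
        cloned = _convert_root_run(root, {root.id: "<example-id>"})
        # The first dictionary is the re-keyed root run; it now references
        # the dataset example so it can be uploaded as a test run.
        assert cloned[0]["reference_example_id"] == "<example-id>"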
    Zparent_run_idsZchild_run_idsZ
session_id)excludeidtrace_idc                    s   g | ]}t | qS r   )r   .0rr   r   r   
<listcomp>@   s     z%_convert_root_run.<locals>.<listcomp>r   Zreference_example_id)
uuiduuid4r   popdictr   
child_runsextendappendr   )r   r   Zruns_r   resultssrcZsrc_dictresultr   r"   r   _convert_root_run)   s     

r.   F)test_project_nameclientload_child_runsinclude_outputs)runsdataset_namer/   r0   r1   r2   r   c                   sN  | st d|   pt   j|d}|r<dd | D nd} jdd | D |dd | D |jd sr| }n fd	d| D }|pd
t jdd  }t	 j
|d}	dd |	D |	d jr|	d jn|	d j}
fdd|D } j||jd|
 dd}|D ]} jf |d|i q j|jtjjtjjdd}|S )a  Convert the following runs to a dataset + test.

    This makes it easy to sample prod runs into a new regression testing
    workflow and compare against a candidate system.

    Internally, this function does the following:
        1. Create a dataset from the provided production run inputs.
        2. Create a new test project.
        3. Clone the production runs and re-upload against the dataset.

    Parameters:
    - runs (Sequence[ls_schemas.Run]): A sequence of runs to be executed as a test.
    - dataset_name (str): The name of the dataset to associate with the test runs.
    - client (Optional[Client]): An optional LangSmith client instance. If not provided,
        a new client will be created.
    - load_child_runs (bool): Whether to load child runs when copying runs.
        Defaults to False.
    - include_outputs (bool): Whether to include the production run outputs as
        reference outputs in the created examples. Defaults to False.
    - test_project_name (Optional[str]): The name of the test project to create.
        Defaults to "prod-baseline-" plus a random hex suffix.

    Returns:
    - ls_schemas.TracerSession: The project containing the cloned runs.

    Examples:
    --------
    .. code-block:: python

        import langsmith
        import random

        client = langsmith.Client()

        # Randomly sample 100 runs from a prod project
        runs = list(client.list_runs(project_name="My Project", execution_order=1))
        sampled_runs = random.sample(runs, min(len(runs), 100))

        convert_runs_to_test(runs, dataset_name="Random Runs")

        # Select runs named "extractor" whose root traces received good feedback
        runs = client.list_runs(
            project_name="<your_project>",
            filter='eq(name, "extractor")',
            trace_filter='and(eq(feedback_key, "user_score"), eq(feedback_score, 1))',
        )
        convert_runs_to_test(runs, dataset_name="Extraction Good")
    """
    if not runs:
        raise ValueError(f"Expected a non-empty sequence of runs. Received: {runs}")
    client = client or rt.get_cached_client()
    ds = client.create_dataset(dataset_name=dataset_name)
    outputs = [r.outputs for r in runs] if include_outputs else None
    client.create_examples(
        inputs=[r.inputs for r in runs],
        outputs=outputs,
        source_run_ids=[r.id for r in runs],
        dataset_id=ds.id,
    )
    if not load_child_runs:
        runs_to_copy = runs
    else:
        runs_to_copy = [
            client.read_run(r.id, load_child_runs=load_child_runs) for r in runs
        ]
    test_project_name = test_project_name or f"prod-baseline-{uuid.uuid4().hex[:6]}"
    examples = list(client.list_examples(dataset_id=ds.id))
    run_to_example_map = {e.source_run_id: e.id for e in examples}
    dataset_version = (
        examples[0].modified_at if examples[0].modified_at else examples[0].created_at
    )
    to_create = [
        run_dict
        for root_run in runs_to_copy
        for run_dict in _convert_root_run(root_run, run_to_example_map)
    ]
    project = client.create_project(
        test_project_name,
        reference_dataset_id=ds.id,
        metadata={
            "which": "prod-baseline",
            "dataset_version": dataset_version.isoformat(),
        },
    )
    for new_run in to_create:
        client.create_run(**new_run, project_name=test_project_name)
    project = client.update_project(
        project.id,
        end_time=datetime.datetime.now(tz=datetime.timezone.utc),
    )
    return project


def _load_nested_traces(project_name: str, client: Client) -> List[ls_schemas.Run]:
    runs = client.list_runs(project_name=project_name)
    treemap: DefaultDict[uuid.UUID, List[ls_schemas.Run]] = collections.defaultdict(
        list
    )
    results = []
    all_runs = {}
    for run in runs:
        if run.parent_run_id is not None:
            treemap[run.parent_run_id].append(run)
        else:
            results.append(run)
        all_runs[run.id] = run
    for run_id, child_runs in treemap.items():
        all_runs[run_id].child_runs = sorted(child_runs, key=lambda r: r.dotted_order)
    return results


T = TypeVar("T")
U = TypeVar("U")


def _outer_product(list1: List[T], list2: List[U]) -> List[Tuple[T, U]]:
    return list(itertools.product(list1, list2))


@warn_beta
def compute_test_metrics(
    project_name: str,
    *,
    evaluators: list,
    max_concurrency: Optional[int] = 10,
    client: Optional[Client] = None,
) -> None:
    """Compute test metrics for a given test name using a list of evaluators.

    Args:
        project_name (str): The name of the test project to evaluate.
        evaluators (list): A list of evaluators to compute metrics with.
        max_concurrency (Optional[int], optional): The maximum number of concurrent
            evaluations. Defaults to 10.
        client (Optional[Client], optional): The client to use for evaluations.
            Defaults to None.

    Returns:
        None: This function does not return any value.
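
    Examples:
    --------
    A minimal sketch. The project name and evaluator are illustrative; any
    test project (e.g., one created by convert_runs_to_test above) and any
    RunEvaluator or callable will do:

    .. code-block:: python

        import langsmith
        from langsmith.beta import compute_test_metrics

        client = langsmith.Client()

        def has_output(run, example):
            # Hypothetical evaluator: score 1 if the run produced any output.
            return {"key": "has_output", "score": int(run.outputs is not None)}

        compute_test_metrics(
            "prod-baseline-abc123",
            evaluators=[has_output],
            client=client,
        )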
    """
    from langsmith import ContextThreadPoolExecutor

    evaluators_: List[ls_eval.RunEvaluator] = []
    for func in evaluators:
        if isinstance(func, ls_eval.RunEvaluator):
            evaluators_.append(func)
        elif callable(func):
            evaluators_.append(ls_eval.run_evaluator(func))
        else:
            raise NotImplementedError(
                f"Evaluation not yet implemented for evaluator of type {type(func)}"
            )
    client = client or rt.get_cached_client()
    traces = _load_nested_traces(project_name, client)
    with ContextThreadPoolExecutor(max_workers=max_concurrency) as executor:
        results = executor.map(
            client.evaluate_run, *zip(*_outer_product(traces, evaluators_))
        )
    # Consume the map iterator so every evaluation is executed (and any
    # evaluator errors are surfaced) before returning.
    for _ in results:
        pass