U
    h\H                     @  s"  d Z ddlmZ ddlZddlmZ ddlmZ ddlm	Z	m
Z
mZmZmZmZmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ eeZdgZ eG dd dZ!ddhZ"ddddddhZ#dddddZ$G dd deZ%G dd  d eZ&eG d!d" d"eZ'dS )#zLoads YouTube transcript.    )annotationsN)Enum)Path)AnyDict	GeneratorListOptionalSequenceUnion)parse_qsurlparse)
ParseError)Document)root_validator)	dataclass)
BaseLoaderz0https://www.googleapis.com/auth/youtube.readonlyc                   @  s   e Zd ZU dZe d d Zded< e d d Zded< e d d Z	ded< d	d
ddZ
edddddddZdd
ddZdS )GoogleApiClienta  Generic Google API Client.

    To use, you should have the ``google_auth_oauthlib,youtube_transcript_api,google``
    python package installed.
    As the google api expects credentials you need to set up a google account and
    register your Service. "https://developers.google.com/docs/api/quickstart/python"

    *Security Note*: Note that parsing of the transcripts relies on the standard
        xml library but the input is viewed as trusted in this case.


    Example:
        .. code-block:: python

            from langchain_community.document_loaders import GoogleApiClient
            google_api_client = GoogleApiClient(
                service_account_path=Path("path_to_your_sec_file.json")
            )

    z.credentialszcredentials.jsonr   credentials_pathservice_account_pathz
token.json
token_pathNonereturnc                 C  s   |   | _d S N)_load_credentialscredsself r   P/tmp/pip-unpacked-wheel-9gdii04g/langchain_community/document_loaders/youtube.py__post_init__2   s    zGoogleApiClient.__post_init__TpreDict[str, Any]valuesr   c                 C  s    | ds| dstd|S )DValidate that either folder_id or document_ids is set, but not both.r   r   -Must specify either channel_name or video_idsget
ValueErrorclsr&   r   r   r    #validate_channel_or_videoIds_is_set5   s
    z3GoogleApiClient.validate_channel_or_videoIds_is_setr   c           	   	   C  s  z@ddl m} ddlm} ddlm} ddlm} ddlm	} W n t
k
r\   t
dY nX d}| j r~|jt| jS | j r|t| jt}|r|js|r|jr|jr||  n|t| jt}|jdd	}t| jd
}||  W 5 Q R X |S )zLoad credentials.r   )Request)service_account)Credentials)InstalledAppFlowYouTubeTranscriptApiYou must run`pip install --upgrade google-api-python-client google-auth-httplib2 google-auth-oauthlib youtube-transcript-api` to use the Google Drive loaderN)portw)Zgoogle.auth.transport.requestsr/   Zgoogle.oauth2r0   Zgoogle.oauth2.credentialsr1   Zgoogle_auth_oauthlib.flowr2   youtube_transcript_apir4   ImportErrorr   existsZfrom_service_account_filestrr   Zfrom_authorized_user_fileSCOPESZvalidZexpiredZrefresh_tokenZrefreshZfrom_client_secrets_filer   Zrun_local_serveropenwriteZto_json)	r   r/   r0   r1   r2   r4   r   Zflowtokenr   r   r    r   A   s8    
	

 z!GoogleApiClient._load_credentialsN)__name__
__module____qualname____doc__r   homer   __annotations__r   r   r!   r   r.   r   r   r   r   r    r      s   
r   httphttpszyoutu.bezm.youtube.comzyoutube.comzwww.youtube.comzwww.youtube-nocookie.comzvid.plusr;   Optional[str])urlr   c                 C  s   t | }|jtkrdS |jtkr$dS |j}|drp|j}t|}d|krj|d }t	|t
r`|n|d }qdS n|jd}|dd }t|dkrdS |S )zEParse a YouTube URL and return the video ID if valid, otherwise None.Nz/watchvr   /   )r   schemeALLOWED_SCHEMESnetlocALLOWED_NETLOCSpathendswithqueryr   
isinstancer;   lstripsplitlen)rI   
parsed_urlrR   rT   Zparsed_queryidsvideo_idr   r   r    _parse_video_idu   s$    


r\   c                   @  s   e Zd ZdZdZdZdZdS )TranscriptFormatz3Output formats of transcripts from `YoutubeLoader`.textlineschunksN)r@   rA   rB   rC   TEXTLINESCHUNKSr   r   r   r    r]      s   r]   c                	   @  s   e Zd ZdZdddejddfdddd	d
dddddZedddddZe	ddd dddZ
ddddddZdddddZdd d!d"Zd#d d$d%ZdS )&YoutubeLoaderz!Load `YouTube` video transcripts.FenNx   r;   boolzUnion[str, Sequence[str]]rH   r]   int)r[   add_video_infolanguagetranslationtranscript_formatcontinue_on_failurechunk_size_secondsc                 C  sR   || _ d|i| _|| _|| _t|tr0|g| _n|| _|| _|| _|| _|| _	dS )z!Initialize with YouTube video ID.sourceN)
r[   	_metadatari   rj   rU   r;   rk   rl   rm   rn   )r   r[   ri   rj   rk   rl   rm   rn   r   r   r    __init__   s    


zYoutubeLoader.__init__)youtube_urlr   c                 C  s    t | }|std|  d|S )z*Extract video ID from common YouTube URLs.z.Could not determine the video ID for the URL "z".)r\   r+   )rr   r[   r   r   r    extract_video_id   s    
zYoutubeLoader.extract_video_idr   )rr   kwargsr   c                 K  s   |  |}| |f|S )z|Given a YouTube URL, construct a loader.
        See `YoutubeLoader()` constructor for a list of keyword arguments.
        )rs   )r-   rr   rt   r[   r   r   r    from_youtube_url   s    
zYoutubeLoader.from_youtube_urlz
List[Dict]r   )chunk_pieceschunk_start_secondsr   c              
   C  sl   t |d\}}t |d\}}tdtdd || j||dd|dd|dd| j d| d	d
dS )z0Create Document from chunk of transcript pieces.<    c                 S  s   | d  dS Nr^   ry   strip)Zchunk_piecer   r   r    <lambda>       z4YoutubeLoader._make_chunk_document.<locals>.<lambda>Z02d: https://www.youtube.com/watch?v=z&t=s)Zstart_secondsZstart_timestampro   page_contentmetadata)divmodr   joinmaprp   r[   )r   rv   rw   mr   hr   r   r    _make_chunk_document   s    z"YoutubeLoader._make_chunk_documentzGenerator[Document, None, None])transcript_piecesr   c                 c  s|   g }d}| j }|D ]J}|d |d  }||krR|r@| ||V  g }|}|| j 7 }|| qt|dkrx| ||V  d S )Nr   startduration)rn   r   appendrX   )r   r   rv   rw   Zchunk_time_limittranscript_pieceZ	piece_endr   r   r    _get_transcript_chunks   s    
z$YoutubeLoader._get_transcript_chunksList[Document]r   c                 C  sD  zddl m}m}m} W n tk
r4   tdY nX | jrP|  }| j| z|	| j
}W n |k
rx   g  Y S X z|| j}W n  |k
r   |dg}Y nX | jdk	r|| j}| }| jtjk rdtdd |}t|| jd	gS | jtjkrttd
d |S | jtjkr8t| |S tddS )z1Load YouTube transcripts into `Document` objects.r   )NoTranscriptFoundTranscriptsDisabledr4   zvCould not import "youtube_transcript_api" Python package. Please install it with `pip install youtube-transcript-api`.re   Nry   c                 S  s   | d  dS rz   r{   r   r   r   r    r}     r~   z$YoutubeLoader.load.<locals>.<lambda>r   c                 S  s(   t | d dttdd |  dS )Nr^   ry   c                 S  s   | d dkS )Nr   r^   r   )itemr   r   r    r}   #  r~   z6YoutubeLoader.load.<locals>.<lambda>.<locals>.<lambda>r   )r   r|   dictfilteritemsr   r   r   r    r}     s    zUnknown transcript format.)r8   r   r   r4   r9   ri   _get_video_inforp   updatelist_transcriptsr[   find_transcriptrj   rk   	translatefetchrl   r]   ra   r   r   r   rb   listrc   r   r+   )r   r   r   r4   
video_infotranscript_list
transcriptr   r   r   r    load   sL    


zYoutubeLoader.loadr   c                 C  s   zddl m} W n tk
r,   tdY nX |d| j }|jpFd|jpNd|jpVd|jp^d|jrp|j	dnd|j
pzd|jpdd}|S )zGet important video information.

        Components include:
            - title
            - description
            - thumbnail URL,
            - publish_date
            - channel author
            - and more.
        r   )YouTubezVCould not import "pytube" Python package. Please install it with `pip install pytube`.r   Unknownz%Y-%m-%d %H:%M:%S)titledescriptionZ
view_countthumbnail_urlpublish_datelengthauthor)Zpytuber   r9   r[   r   r   Zviewsr   r   strftimer   r   )r   r   Zytr   r   r   r    r   0  s$    
zYoutubeLoader._get_video_info)r@   rA   rB   rC   r]   ra   rq   staticmethodrs   classmethodru   r   r   r   r   r   r   r   r    rd      s    	?rd   c                   @  s   e Zd ZU dZded< dZded< dZded< d	Zd
ed< dZded< dZ	d
ed< ddddZ
dddddZed	ddddddZddddd Zddd!d"d#d$Zddd%d&d'Zddd(d)d*Zddd+d,d-d.Zd+dd/d0ZdS )1GoogleApiYoutubeLoadera  Load all Videos from a `YouTube` Channel.

    To use, you should have the ``googleapiclient,youtube_transcript_api``
    python package installed.
    As the service needs a google_api_client, you first have to initialize
    the GoogleApiClient.

    Additionally you have to either provide a channel name or a list of videoids
    "https://developers.google.com/docs/api/quickstart/python"



    Example:
        .. code-block:: python

            from langchain_community.document_loaders import GoogleApiClient
            from langchain_community.document_loaders import GoogleApiYoutubeLoader
            google_api_client = GoogleApiClient(
                service_account_path=Path("path_to_your_sec_file.json")
            )
            loader = GoogleApiYoutubeLoader(
                google_api_client=google_api_client,
                channel_name = "CodeAesthetic"
            )
            load.load()

    r   google_api_clientNrH   channel_namezOptional[List[str]]	video_idsTrg   ri   re   r;   captions_languageFrm   r   r   c                 C  s   |  | jj| _d S r   )_build_youtube_clientr   r   youtube_clientr   r   r   r    r!   w  s    z$GoogleApiYoutubeLoader.__post_init__r   )r   r   c                 C  sH   zddl m} ddlm} W n tk
r8   tdY nX |dd|dS )Nr   )buildr3   r5   ZyoutubeZv3)credentials)Zgoogleapiclient.discoveryr   r8   r4   r9   )r   r   r   r4   r   r   r    r   z  s    
	z,GoogleApiYoutubeLoader._build_youtube_clientr"   r$   r%   c                 C  s    | ds| dstd|S )r'   r   r   r(   r)   r,   r   r   r    r.     s    z:GoogleApiYoutubeLoader.validate_channel_or_videoIds_is_set)r[   r   c                 C  sv   ddl m}m} ||}z|| jg}W n, |k
rX   |D ]}|| j}q@q@Y nX | }ddd |D S )Nr   )r   r4   ry   c                 S  s   g | ]}|d   dqS )r^   ry   r{   ).0tr   r   r    
<listcomp>  s     zGGoogleApiYoutubeLoader._get_transcripe_for_video_id.<locals>.<listcomp>)	r8   r   r4   r   r   r   r   r   r   )r   r[   r   r4   r   r   Zavailable_transcriptr   r   r   r    _get_transcripe_for_video_id  s    

z3GoogleApiYoutubeLoader._get_transcripe_for_video_idr   )r[   rt   r   c                 K  s8   |  |}| j jd|d }t||dd dS )N
id,snippetpartidr   r   r   )r   r   Zvideosr   executer   r*   )r   r[   rt   ZcaptionsZvideo_responser   r   r    _get_document_for_video_id  s    

z1GoogleApiYoutubeLoader._get_document_for_video_id)r   r   c                 C  s8   | j  jd|ddd}| }|d d d d }|S )Nr   channel   )r   qtype
maxResultsr   r   Z	channelId)r   searchr   r   )r   r   requestresponse
channel_idr   r   r    _get_channel_id  s    
z&GoogleApiYoutubeLoader._get_channel_id)r   r   c                 C  s4   | j  jd|d}| }|d d d d d S )NZcontentDetailsr   r   r   ZrelatedPlaylistsZuploads)r   Zchannelsr   r   )r   r   r   r   r   r   r    _get_uploads_playlist_id  s    
z/GoogleApiYoutubeLoader._get_uploads_playlist_idr   )r   rt   r   c                 K  sN  zddl m}m} W n tk
r0   tdY nX | |}| |}| j jd|dd}g }|d k	rJ|	 }	|	d D ]}
|
d d	 d
 }d
|i}| j
r|
d d ||
d  z | |}|t||d W qz ||tfk
r2 } z4| jrtdd|
d d
  d|   n|W 5 d }~X Y qzX qz| j ||	}q`|S )Nr   )r   r   zTYou must run`pip install --upgrade youtube-transcript-api` to use the youtube loaderr   2   )r   Z
playlistIdr   r   ZsnippetZ
resourceIdZvideoIdZ
thumbnailsr   zError fetching transscript ry   r   z, exception: )r8   r   r   r9   r   r   r   ZplaylistItemsr   r   ri   popr   r   r   r   r   rm   loggererrorr   Z	list_next)r   r   rt   r   r   r   Zuploads_playlist_idr   r   r   r   r[   Z	meta_datar   er   r   r    _get_document_for_channel  sR    





z0GoogleApiYoutubeLoader._get_document_for_channelc                   sL   g } j r|  j  n* jr@| fdd jD  ntd|S )zLoad documents.c                   s   g | ]}  |qS r   )r   )r   r[   r   r   r    r     s   z/GoogleApiYoutubeLoader.load.<locals>.<listcomp>r(   )r   extendr   r   r+   )r   Zdocument_listr   r   r    r     s    
zGoogleApiYoutubeLoader.load)r@   rA   rB   rC   rE   r   r   ri   r   rm   r!   r   r   r.   r   r   r   r   r   r   r   r   r   r    r   R  s"   
5r   )(rC   
__future__r   loggingenumr   pathlibr   typingr   r   r   r   r	   r
   r   urllib.parser   r   xml.etree.ElementTreer   Zlangchain_core.documentsr   Zlangchain_core.pydantic_v1r   Z&langchain_core.pydantic_v1.dataclassesr   Z)langchain_community.document_loaders.baser   	getLoggerr@   r   r<   r   rO   rQ   r\   r]   rd   r   r   r   r   r    <module>   s:   $
R
 8