import warnings
from pathlib import Path
from typing import Optional, Iterable, Union

from http.cookiejar import MozillaCookieJar, LoadError

from requests import Session

from .proxies import ProxyConfig, GenericProxyConfig

from ._transcripts import TranscriptListFetcher, FetchedTranscript, TranscriptList

from ._errors import CookiePathInvalid, CookieInvalid


def _load_cookie_jar(cookies: Union[Path, str]) -> MozillaCookieJar:
    """
    Loads the cookie file at the given path into a `MozillaCookieJar`.

    :param cookies: path to the cookie text file
    :return: the cookie jar populated from the file
    :raises CookiePathInvalid: if the file does not exist or `MozillaCookieJar`
        fails to load it
    :raises CookieInvalid: if the file could be loaded, but the resulting cookie
        jar contains no cookies
    """
    cookie_jar = MozillaCookieJar()
    # Only the load itself can raise the errors translated below, so keep the
    # `try` body limited to it.
    try:
        cookie_jar.load(str(cookies))
    except (FileNotFoundError, LoadError) as error:
        # Chain explicitly, so the underlying I/O or parse error stays visible.
        raise CookiePathInvalid(cookies) from error
    # An empty jar is falsy; a file yielding no cookies is useless for auth.
    if not cookie_jar:
        raise CookieInvalid(cookies)
    return cookie_jar


class YouTubeTranscriptApi:
    def __init__(
        self,
        cookie_path: Optional[Union[Path, str]] = None,
        proxy_config: Optional[ProxyConfig] = None,
        http_client: Optional[Session] = None,
    ):
        """
        Note on thread-safety: As this class will initialize a `requests.Session`
        object, it is not thread-safe. Make sure to initialize an instance of
        `YouTubeTranscriptApi` per thread, if used in a multi-threading scenario!

        Note that a `http_client` passed in by the caller is modified in place:
        its "Accept-Language" header is set and, depending on the other
        parameters, its cookies, proxies and "Connection" header are overwritten.

        :param cookie_path: Path to a text file containing YouTube authorization
            cookies
        :param proxy_config: an optional ProxyConfig object, defining proxies used
            for all network requests. This can be used to work around your IP
            being blocked by YouTube, as described in the "Working around IP bans"
            section of the README
            (https://github.com/jdepoix/youtube-transcript-api?tab=readme-ov-file#working-around-ip-bans-requestblocked-or-ipblocked-exception)
        :param http_client: You can optionally pass in a requests.Session object,
            if you manually want to share cookies between different instances of
            `YouTubeTranscriptApi`, overwrite defaults, specify SSL certificates,
            etc.
        :raises CookiePathInvalid: if `cookie_path` is set, but the file can't be
            loaded
        :raises CookieInvalid: if `cookie_path` is set, but the file yields no
            cookies
        """
        http_client = Session() if http_client is None else http_client
        http_client.headers.update({"Accept-Language": "en-US"})
        if cookie_path is not None:
            http_client.cookies = _load_cookie_jar(cookie_path)
        if proxy_config is not None:
            http_client.proxies = proxy_config.to_requests_dict()
            if proxy_config.prevent_keeping_connections_alive:
                # Ask for the connection to be closed after each request.
                http_client.headers.update({"Connection": "close"})
        self._fetcher = TranscriptListFetcher(http_client, proxy_config=proxy_config)

    def fetch(
        self,
        video_id: str,
        languages: Iterable[str] = ("en",),
        preserve_formatting: bool = False,
    ) -> FetchedTranscript:
        """
        Retrieves the transcript for a single video. This is just a shortcut for
        calling:
        `YouTubeTranscriptApi().list(video_id).find_transcript(languages).fetch(preserve_formatting=preserve_formatting)`

        :param video_id: the ID of the video you want to retrieve the transcript
            for. Make sure that this is the actual ID, NOT the full URL to the
            video!
        :param languages: A list of language codes in a descending priority. For
            example, if this is set to ["de", "en"] it will first try to fetch the
            german transcript (de) and then fetch the english transcript (en) if
            it fails to do so. This defaults to ("en",).
        :param preserve_formatting: whether to keep select HTML text formatting
        :return: the fetched transcript
        """
        return (
            self.list(video_id)
            .find_transcript(languages)
            .fetch(preserve_formatting=preserve_formatting)
        )

    def list(
        self,
        video_id: str,
    ) -> TranscriptList:
        """
        Retrieves the list of transcripts which are available for a given video.
        It returns a `TranscriptList` object which is iterable and provides
        methods to filter the list of transcripts for specific languages. While
        iterating over the `TranscriptList` the individual transcripts are
        represented by `Transcript` objects, which provide metadata and can either
        be fetched by calling `transcript.fetch()` or translated by calling
        `transcript.translate('en')`.

        Example:

        ```
        ytt_api = YouTubeTranscriptApi()

        # retrieve the available transcripts
        transcript_list = ytt_api.list('video_id')

        # iterate over all available transcripts
        for transcript in transcript_list:
            # the Transcript object provides metadata properties
            print(
                transcript.video_id,
                transcript.language,
                transcript.language_code,
                # whether it has been manually created or generated by YouTube
                transcript.is_generated,
                # a list of languages the transcript can be translated to
                transcript.translation_languages,
            )

            # fetch the actual transcript data
            print(transcript.fetch())

            # translating the transcript will return another transcript object
            print(transcript.translate('en').fetch())

        # you can also directly filter for the language you are looking for, using
        # the transcript list
        transcript = transcript_list.find_transcript(['de', 'en'])

        # or just filter for manually created transcripts
        transcript = transcript_list.find_manually_created_transcript(['de', 'en'])

        # or automatically generated ones
        transcript = transcript_list.find_generated_transcript(['de', 'en'])
        ```

        :param video_id: the ID of the video you want to retrieve the transcript
            for. Make sure that this is the actual ID, NOT the full URL to the
            video!
        :return: the list of available transcripts
        """
        return self._fetcher.fetch(video_id)

    @classmethod
    def list_transcripts(cls, video_id, proxies=None, cookies=None):
        """
        DEPRECATED: use the `list` method instead!

        Retrieves the list of transcripts which are available for a given video.
        It returns a `TranscriptList` object which is iterable and provides
        methods to filter the list of transcripts for specific languages. While
        iterating over the `TranscriptList` the individual transcripts are
        represented by `Transcript` objects, which provide metadata and can either
        be fetched by calling `transcript.fetch()` or translated by calling
        `transcript.translate('en')`.

        Example:

            # retrieve the available transcripts
            transcript_list = YouTubeTranscriptApi.list_transcripts('video_id')

            # iterate over all available transcripts
            for transcript in transcript_list:
                # the Transcript object provides metadata properties
                print(
                    transcript.video_id,
                    transcript.language,
                    transcript.language_code,
                    # whether it has been manually created or generated by YouTube
                    transcript.is_generated,
                    # a list of languages the transcript can be translated to
                    transcript.translation_languages,
                )

                # fetch the actual transcript data
                print(transcript.fetch())

                # translating the transcript will return another transcript object
                print(transcript.translate('en').fetch())

            # you can also directly filter for the language you are looking for,
            # using the transcript list
            transcript = transcript_list.find_transcript(['de', 'en'])

            # or just filter for manually created transcripts
            transcript = transcript_list.find_manually_created_transcript(['de', 'en'])

            # or automatically generated ones
            transcript = transcript_list.find_generated_transcript(['de', 'en'])

        :param video_id: the youtube video id
        :type video_id: str
        :param proxies: a dictionary mapping of http and https proxies to be used
            for the network requests. A `ProxyConfig` object is accepted as well
            and is used as-is.
        :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
        :param cookies: a string of the path to a text file containing youtube
            authorization cookies
        :type cookies: str
        :return: the list of available transcripts
        :rtype TranscriptList:
        """
        warnings.warn(
            "`list_transcripts` is deprecated and will be removed in a future version. "
            "Use the `list` method instead!",
            DeprecationWarning,
            # Attribute the warning to the caller: with the default stacklevel
            # it would point into this module and be hidden by Python's default
            # warning filters.
            stacklevel=2,
        )

        proxy_config = None
        if proxies:
            if isinstance(proxies, ProxyConfig):
                proxy_config = proxies
            else:
                proxy_config = GenericProxyConfig(
                    http_url=proxies.get("http"), https_url=proxies.get("https")
                )

        ytt_api = YouTubeTranscriptApi(
            proxy_config=proxy_config,
            cookie_path=Path(cookies) if cookies else None,
        )
        return ytt_api.list(video_id)

    @classmethod
    def get_transcripts(
        cls,
        video_ids,
        languages=("en",),
        continue_after_error=False,
        proxies=None,
        cookies=None,
        preserve_formatting=False,
    ):
        """
        DEPRECATED: use the `fetch` method instead!

        Retrieves the transcripts for a list of videos.

        :param video_ids: a list of youtube video ids
        :type video_ids: list[str]
        :param languages: A list of language codes in a descending priority. For
            example, if this is set to ['de', 'en'] it will first try to fetch the
            german transcript (de) and then fetch the english transcript (en) if
            it fails to do so.
        :type languages: list[str]
        :param continue_after_error: if this is set the execution won't be
            stopped, if an error occurs while retrieving one of the video
            transcripts
        :type continue_after_error: bool
        :param proxies: a dictionary mapping of http and https proxies to be used
            for the network requests
        :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
        :param cookies: a string of the path to a text file containing youtube
            authorization cookies
        :type cookies: str
        :param preserve_formatting: whether to keep select HTML text formatting
        :type preserve_formatting: bool
        :return: a tuple containing a dictionary mapping video ids onto their
            corresponding transcripts, and a list of video ids, which could not be
            retrieved
        :rtype ({str: [{'text': str, 'start': float, 'duration': float}]}, [str]):
        """
        warnings.warn(
            "`get_transcripts` is deprecated and will be removed in a future version. "
            "Use the `fetch` method instead!",
            DeprecationWarning,
            # Attribute the warning to the caller instead of this module.
            stacklevel=2,
        )

        # NOTE: kept as an `assert` (which is skipped when running with `-O`), so
        # that callers relying on the `AssertionError` keep working.
        assert isinstance(video_ids, list), "`video_ids` must be a list of strings"

        data = {}
        unretrievable_videos = []

        for video_id in video_ids:
            try:
                data[video_id] = cls.get_transcript(
                    video_id, languages, proxies, cookies, preserve_formatting
                )
            except Exception:
                if not continue_after_error:
                    raise
                # Best effort mode: remember the failed video and carry on.
                unretrievable_videos.append(video_id)

        return data, unretrievable_videos

    @classmethod
    def get_transcript(
        cls,
        video_id,
        languages=("en",),
        proxies=None,
        cookies=None,
        preserve_formatting=False,
    ):
        """
        DEPRECATED: use the `fetch` method instead!

        Retrieves the transcript for a single video. This is just a shortcut for
        calling::

            YouTubeTranscriptApi.list_transcripts(video_id, proxies, cookies)
                .find_transcript(languages)
                .fetch(preserve_formatting=preserve_formatting)
                .to_raw_data()

        :param video_id: the youtube video id
        :type video_id: str
        :param languages: A list of language codes in a descending priority. For
            example, if this is set to ['de', 'en'] it will first try to fetch the
            german transcript (de) and then fetch the english transcript (en) if
            it fails to do so.
        :type languages: list[str]
        :param proxies: a dictionary mapping of http and https proxies to be used
            for the network requests
        :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
        :param cookies: a string of the path to a text file containing youtube
            authorization cookies
        :type cookies: str
        :param preserve_formatting: whether to keep select HTML text formatting
        :type preserve_formatting: bool
        :return: a list of dictionaries containing the 'text', 'start' and
            'duration' keys
        :rtype [{'text': str, 'start': float, 'duration': float}]:
        """
        warnings.warn(
            "`get_transcript` is deprecated and will be removed in a future version. "
            "Use the `fetch` method instead!",
            DeprecationWarning,
            # Attribute the warning to the caller instead of this module.
            stacklevel=2,
        )

        # NOTE: kept as an `assert` (which is skipped when running with `-O`), so
        # that callers relying on the `AssertionError` keep working.
        assert isinstance(video_id, str), "`video_id` must be a string"
        return (
            cls.list_transcripts(video_id, proxies, cookies)
            .find_transcript(languages)
            .fetch(preserve_formatting=preserve_formatting)
            .to_raw_data()
        )
Memory