import json import pprint from typing import List, Iterable from ._transcripts import FetchedTranscript, FetchedTranscriptSnippet class Formatter: """Formatter should be used as an abstract base class. Formatter classes should inherit from this class and implement their own .format() method which should return a string. A transcript is represented by a List of Dictionary items. """ def format_transcript(self, transcript: FetchedTranscript, **kwargs) -> str: raise NotImplementedError( "A subclass of Formatter must implement " "their own .format_transcript() method." ) def format_transcripts(self, transcripts: List[FetchedTranscript], **kwargs): raise NotImplementedError( "A subclass of Formatter must implement " "their own .format_transcripts() method." ) class PrettyPrintFormatter(Formatter): def format_transcript(self, transcript: FetchedTranscript, **kwargs) -> str: """Pretty prints a transcript. :param transcript: :return: A pretty printed string representation of the transcript. """ return pprint.pformat(transcript.to_raw_data(), **kwargs) def format_transcripts(self, transcripts: List[FetchedTranscript], **kwargs) -> str: """Converts a list of transcripts into a JSON string. :param transcripts: :return: A JSON string representation of the transcript. """ return pprint.pformat( [transcript.to_raw_data() for transcript in transcripts], **kwargs ) class JSONFormatter(Formatter): def format_transcript(self, transcript: FetchedTranscript, **kwargs) -> str: """Converts a transcript into a JSON string. :param transcript: :return: A JSON string representation of the transcript. """ return json.dumps(transcript.to_raw_data(), **kwargs) def format_transcripts(self, transcripts: List[FetchedTranscript], **kwargs) -> str: """Converts a list of transcripts into a JSON string. :param transcripts: :return: A JSON string representation of the transcript. """ return json.dumps( [transcript.to_raw_data() for transcript in transcripts], **kwargs ) class TextFormatter(Formatter): def format_transcript(self, transcript: FetchedTranscript, **kwargs) -> str: """Converts a transcript into plain text with no timestamps. :param transcript: :return: all transcript text lines separated by newline breaks. """ return "\n".join(line.text for line in transcript) def format_transcripts(self, transcripts: List[FetchedTranscript], **kwargs) -> str: """Converts a list of transcripts into plain text with no timestamps. :param transcripts: :return: all transcript text lines separated by newline breaks. """ return "\n\n\n".join( [self.format_transcript(transcript, **kwargs) for transcript in transcripts] ) class _TextBasedFormatter(TextFormatter): def _format_timestamp(self, hours: int, mins: int, secs: int, ms: int) -> str: raise NotImplementedError( "A subclass of _TextBasedFormatter must implement " "their own .format_timestamp() method." ) def _format_transcript_header(self, lines: Iterable[str]) -> str: raise NotImplementedError( "A subclass of _TextBasedFormatter must implement " "their own _format_transcript_header method." ) def _format_transcript_helper( self, i: int, time_text: str, snippet: FetchedTranscriptSnippet ) -> str: raise NotImplementedError( "A subclass of _TextBasedFormatter must implement " "their own _format_transcript_helper method." ) def _seconds_to_timestamp(self, time: float) -> str: """Helper that converts `time` into a transcript cue timestamp. :reference: https://www.w3.org/TR/webvtt1/#webvtt-timestamp :param time: a float representing time in seconds. :type time: float :return: a string formatted as a cue timestamp, 'HH:MM:SS.MS' :example: >>> self._seconds_to_timestamp(6.93) '00:00:06.930' """ time = float(time) hours_float, remainder = divmod(time, 3600) mins_float, secs_float = divmod(remainder, 60) hours, mins, secs = int(hours_float), int(mins_float), int(secs_float) ms = int(round((time - int(time)) * 1000, 2)) return self._format_timestamp(hours, mins, secs, ms) def format_transcript(self, transcript: FetchedTranscript, **kwargs) -> str: """A basic implementation of WEBVTT/SRT formatting. :param transcript: :reference: https://www.w3.org/TR/webvtt1/#introduction-caption https://www.3playmedia.com/blog/create-srt-file/ """ lines = [] for i, line in enumerate(transcript): end = line.start + line.duration time_text = "{} --> {}".format( self._seconds_to_timestamp(line.start), self._seconds_to_timestamp( transcript[i + 1].start if i < len(transcript) - 1 and transcript[i + 1].start < end else end ), ) lines.append(self._format_transcript_helper(i, time_text, line)) return self._format_transcript_header(lines) class SRTFormatter(_TextBasedFormatter): def _format_timestamp(self, hours: int, mins: int, secs: int, ms: int) -> str: return "{:02d}:{:02d}:{:02d},{:03d}".format(hours, mins, secs, ms) def _format_transcript_header(self, lines: Iterable[str]) -> str: return "\n\n".join(lines) + "\n" def _format_transcript_helper( self, i: int, time_text: str, snippet: FetchedTranscriptSnippet ) -> str: return "{}\n{}\n{}".format(i + 1, time_text, snippet.text) class WebVTTFormatter(_TextBasedFormatter): def _format_timestamp(self, hours: int, mins: int, secs: int, ms: int) -> str: return "{:02d}:{:02d}:{:02d}.{:03d}".format(hours, mins, secs, ms) def _format_transcript_header(self, lines: Iterable[str]) -> str: return "WEBVTT\n\n" + "\n\n".join(lines) + "\n" def _format_transcript_helper( self, i: int, time_text: str, snippet: FetchedTranscriptSnippet ) -> str: return "{}\n{}".format(time_text, snippet.text) class FormatterLoader: TYPES = { "json": JSONFormatter, "pretty": PrettyPrintFormatter, "text": TextFormatter, "webvtt": WebVTTFormatter, "srt": SRTFormatter, } class UnknownFormatterType(Exception): def __init__(self, formatter_type: str): super().__init__( "The format '{formatter_type}' is not supported. " "Choose one of the following formats: {supported_formatter_types}".format( formatter_type=formatter_type, supported_formatter_types=", ".join(FormatterLoader.TYPES.keys()), ) ) def load(self, formatter_type: str = "pretty") -> Formatter: """ Loads the Formatter for the given formatter type. :param formatter_type: :return: Formatter object """ if formatter_type not in FormatterLoader.TYPES.keys(): raise FormatterLoader.UnknownFormatterType(formatter_type) return FormatterLoader.TYPES[formatter_type]()
Memory