# pyright: reportPrivateUsage=false

"""Provides `partition_html()`."""

from __future__ import annotations

from typing import IO, Any, Iterator, List, Literal, Optional, cast

import requests
from lxml import etree

from unstructured.chunking import add_chunking_strategy
from unstructured.documents.elements import Element
from unstructured.file_utils.encoding import read_txt_file
from unstructured.file_utils.model import FileType
from unstructured.partition.common.metadata import apply_metadata, get_last_modified_date
from unstructured.partition.html.parser import Flow, html_parser
from unstructured.partition.html.transformations import (
    ontology_to_unstructured_elements,
    parse_html_to_ontology,
)
from unstructured.utils import is_temp_file_path, lazyproperty


@apply_metadata(FileType.HTML)
@add_chunking_strategy
def partition_html(
    filename: Optional[str] = None,
    *,
    file: Optional[IO[bytes]] = None,
    text: Optional[str] = None,
    encoding: Optional[str] = None,
    url: Optional[str] = None,
    headers: Optional[dict[str, str]] = None,
    ssl_verify: bool = True,
    skip_headers_and_footers: bool = False,
    detection_origin: Optional[str] = None,
    html_parser_version: Literal["v1", "v2"] = "v1",
    image_alt_mode: Optional[Literal["to_text"]] = "to_text",
    **kwargs: Any,
) -> list[Element]:
    """Partitions an HTML document into its constituent elements.

    HTML source parameters
    ----------------------
    The HTML to be partitioned can be specified four different ways:

    filename
        A string defining the target filename path.
    file
        A file-like object opened in binary mode --> open(filename, "rb").
    text
        The string representation of the HTML document.
    url
        The URL of a webpage to parse. Only for URLs that return an HTML document.
    headers
        The HTTP headers to be used in the HTTP request when `url` is specified. When omitted
        (or None), an empty dict is used.
    ssl_verify
        If the URL parameter is set, determines whether or not SSL verification is performed
        on the HTTP request.
    encoding
        The encoding method used to decode the text input. If None, utf-8 will be used.
    skip_headers_and_footers
        If True, ignores any content that is within <header> or <footer> tags
    html_parser_version (Literal['v1', 'v2']):
        The version of the HTML parser to use. The default is 'v1'. For 'v2' the parser will
        use the ontology schema to parse the HTML document.
    image_alt_mode (Literal['to_text']):
        When set 'to_text', the v2 parser will include the alternative text of images in the
        output.

    Returns the document-elements in a list. An empty list is returned when `text` is empty or
    whitespace-only and no other source is given. Raises `ValueError` when no source is
    specified or when the GET of `url` fails or does not return `text/html` content.
    """
    # -- parser rejects an empty str, nip that edge-case in the bud here --
    if text is not None and text.strip() == "" and not file and not filename and not url:
        return []

    opts = HtmlPartitionerOptions(
        file_path=filename,
        file=file,
        text=text,
        encoding=encoding,
        url=url,
        # -- a fresh dict per call avoids sharing one mutable default across all calls --
        headers={} if headers is None else headers,
        ssl_verify=ssl_verify,
        skip_headers_and_footers=skip_headers_and_footers,
        detection_origin=detection_origin,
        html_parser_version=html_parser_version,
        image_alt_mode=image_alt_mode,
    )

    return list(_HtmlPartitioner.iter_elements(opts))


class HtmlPartitionerOptions:
    """Encapsulates partitioning option validation, computation, and application of defaults."""

    def __init__(
        self,
        *,
        file_path: str | None,
        file: IO[bytes] | None,
        text: str | None,
        encoding: str | None,
        url: str | None,
        headers: dict[str, str],
        ssl_verify: bool,
        skip_headers_and_footers: bool,
        detection_origin: str | None,
        html_parser_version: Literal["v1", "v2"] = "v1",
        image_alt_mode: Optional[Literal["to_text"]] = "to_text",
    ):
        self._file_path = file_path
        self._file = file
        self._text = text
        self._encoding = encoding
        self._url = url
        self._headers = headers
        self._ssl_verify = ssl_verify
        self._skip_headers_and_footers = skip_headers_and_footers
        self._detection_origin = detection_origin
        self._html_parser_version = html_parser_version
        self._image_alt_mode = image_alt_mode

    @lazyproperty
    def detection_origin(self) -> str | None:
        """Trace of initial partitioner to be included in metadata for debugging purposes."""
        return self._detection_origin

    @lazyproperty
    def html_text(self) -> str:
        """The HTML document as a string, loaded from wherever the caller specified.

        Sources are consulted in the order file-path, file, text, url and the first truthy one
        wins. Raises `ValueError` when none is provided, when the GET of the URL returns an
        error status, or when the response content-type is not `text/html`.
        """
        if self._file_path:
            return read_txt_file(filename=self._file_path, encoding=self._encoding)[1]

        if self._file:
            return read_txt_file(file=self._file, encoding=self._encoding)[1]

        if self._text:
            return str(self._text)

        if self._url:
            # NOTE(review): no `timeout` is passed here, so this call waits for as long as the
            # server takes to respond. Consider exposing a timeout option.
            response = requests.get(self._url, headers=self._headers, verify=self._ssl_verify)
            if not response.ok:
                raise ValueError(
                    f"Error status code on GET of provided URL: {response.status_code}"
                )
            content_type = response.headers.get("Content-Type", "")
            # -- media-type names are case-insensitive, so compare on the lower-cased value;
            # -- the error message still reports the header value exactly as received.
            if not content_type.lower().startswith("text/html"):
                raise ValueError(f"Expected content type text/html. Got {content_type}.")

            return response.text

        raise ValueError("Exactly one of filename, file, text, or url must be specified.")

    @lazyproperty
    def last_modified(self) -> str | None:
        """The best last-modified date available, None if no sources are available."""
        return (
            None
            if not self._file_path or is_temp_file_path(self._file_path)
            else get_last_modified_date(self._file_path)
        )

    @lazyproperty
    def skip_headers_and_footers(self) -> bool:
        """When True, elements located within a header or footer are pruned."""
        return self._skip_headers_and_footers

    @lazyproperty
    def html_parser_version(self) -> Literal["v1", "v2"]:
        """When html_parser_version=='v2', HTML elements follow ontology schema."""
        return self._html_parser_version

    @lazyproperty
    def add_img_alt_text(self) -> bool:
        """When True, the alternative text of images is included in the output."""
        return self._image_alt_mode == "to_text"


class _HtmlPartitioner:
    """Partition HTML document into document-elements."""

    def __init__(self, opts: HtmlPartitionerOptions):
        self._opts = opts

    @classmethod
    def iter_elements(cls, opts: HtmlPartitionerOptions) -> Iterator[Element]:
        """Partition HTML document provided by `opts` into document-elements."""
        yield from cls(opts)._iter_elements()

    def _iter_elements(self) -> Iterator[Element]:
        """Generated document-elements (e.g. Title, NarrativeText, etc.) parsed from document.

        Elements appear in document order. The "v1" parser version walks the lxml tree rooted
        at `._main`; any other version uses the ontology-based elements. Each element has its
        `last_modified` and `detection_origin` metadata set from the options before it is
        yielded.
        """
        elements_iter = (
            self._main.iter_elements()
            if self._opts.html_parser_version == "v1"
            else self._from_ontology
        )

        for e in elements_iter:
            e.metadata.last_modified = self._opts.last_modified
            e.metadata.detection_origin = self._opts.detection_origin
            yield e

    @lazyproperty
    def _main(self) -> Flow:
        """The root HTML element."""
        # NOTE(scanny) - get `html_text` first so any encoding error raised is not confused with a
        # recoverable parsing error.
        html_text = self._opts.html_text

        # NOTE(scanny) - `lxml` will not parse a `str` that includes an XML encoding declaration
        # and will raise the following error:
        #     ValueError: Unicode strings with encoding declaration are not supported. ...
        # This is not valid HTML (would be in XHTML), but Chrome accepts it so we work around it
        # by UTF-8 encoding the str bytes and parsing those.
        try:
            root = etree.fromstring(html_text, html_parser)
        except ValueError:
            root = etree.fromstring(html_text.encode("utf-8"), html_parser)

        # -- remove a variety of HTML element types like <script> and <style> that we prefer not
        # -- to encounter while parsing.
        etree.strip_elements(
            root, ["del", "img", "link", "meta", "noscript", "script", "style"], with_tail=False
        )

        # -- remove <header> and <footer> tags if the caller doesn't want their contents --
        if self._opts.skip_headers_and_footers:
            etree.strip_elements(root, ["header", "footer"], with_tail=False)

        # -- jump to the core content if the document indicates where it is --
        if (main := root.find(".//main")) is not None:
            return cast(Flow, main)
        if (body := root.find(".//body")) is not None:
            return cast(Flow, body)
        return cast(Flow, root)

    @lazyproperty
    def _from_ontology(self) -> List[Element]:
        """Document-elements produced by parsing the HTML into an ontology and converting it.

        Image alternative text is included when the `add_img_alt_text` option is True.
        """
        html_text = self._opts.html_text
        ontology = parse_html_to_ontology(html_text)
        unstructured_elements = ontology_to_unstructured_elements(
            ontology, add_img_alt_text=self._opts.add_img_alt_text
        )
        return unstructured_elements
Memory