""" This file contains all classes allowed in the ontology V2. This Type is used as intermediate representation between HTML and Unstructured Elements. All the processing could be done without the intermediate representation, but it simplifies the process. It needs to be decide whether we keep it or not. The classes are represented as pydantic models to mimic Unstructured Elements V1 solutions. However it results in lots of code that could be strongly simplified. TODO (Pluto): OntologyElement is the only needed class. It could contains data about allowed html tags, css classes and descriptions as metadata. """ from __future__ import annotations import uuid from copy import copy from enum import Enum from typing import List, Optional from bs4 import BeautifulSoup, Tag from pydantic import BaseModel, Field class ElementTypeEnum(str, Enum): layout = "Layout" text = "Text" list = "List" table = "Table" media = "Media" code = "Code" mathematical = "Mathematical" reference = "Reference" metadata = "Metadata" navigation = "Navigation" form = "Form" annotation = "Annotation" specialized_text = "Specialized Text" document_specific = "Document-Specific" class OntologyElement(BaseModel): text: Optional[str] = Field("", description="Text content of the element") css_class_name: Optional[str] = Field( default_factory=lambda: "", description="CSS class associated with the element" ) html_tag_name: Optional[str] = Field( default_factory=lambda: "", description="HTML Tag name associated with the element" ) elementType: ElementTypeEnum = Field(..., description="Type of the element") children: List["OntologyElement"] = Field( default_factory=list, description="List of child elements" ) description: str = Field(..., description="Description of the element") allowed_tags: List[str] = Field(..., description="HTML tags associated with the element") additional_attributes: Optional[dict] = Field( {}, description="Optional HTML attributes or CSS properties" ) def __init__(self, **kwargs): 
super().__init__(**kwargs) if self.css_class_name == "": # if None, then do not set self.css_class_name = self.__class__.__name__ if self.html_tag_name == "": self.html_tag_name = self.allowed_tags[0] if "id" not in self.additional_attributes: self.additional_attributes["id"] = self.generate_unique_id() @staticmethod def generate_unique_id() -> str: return str(uuid.uuid4()).replace("-", "") def to_html(self, add_children=True) -> str: additional_attrs = copy(self.additional_attributes) additional_attrs.pop("class", None) attr_str = self._construct_attribute_string(additional_attrs) class_attr = f'class="{self.css_class_name}"' if self.css_class_name else "" combined_attr_str = f"{class_attr} {attr_str}".strip() children_html = self._generate_children_html(add_children) result_html = self._generate_final_html(combined_attr_str, children_html) return result_html def to_text(self, add_children=True, add_img_alt_text=True) -> str: """ Returns the text representation of the element. Args: add_children: If True, the text of the children will be included. Otherwise, element is represented as single self-closing tag. add_img_alt_text: If True, the alt text of the image will be included. 
""" if self.children and add_children: children_text = " ".join( child.to_text(add_children, add_img_alt_text).strip() for child in self.children ) return children_text text = BeautifulSoup(self.to_html(), "html.parser").get_text().strip() if add_img_alt_text and self.html_tag_name == "img" and "alt" in self.additional_attributes: text += f" {self.additional_attributes.get('alt', '')}" return text.strip() def _construct_attribute_string(self, attributes: dict) -> str: return " ".join( f'{key}="{value}"' if value else f"{key}" for key, value in attributes.items() ) def _generate_children_html(self, add_children: bool) -> str: if not add_children or not self.children: return "" return "".join(child.to_html() for child in self.children) def _generate_final_html(self, attr_str: str, children_html: str) -> str: text = self.text or "" if text or children_html: return f"<{self.html_tag_name} {attr_str}>{text} {children_html}</{self.html_tag_name}>" else: return f"<{self.html_tag_name} {attr_str} />" @property def id(self) -> str | None: return self.additional_attributes.get("id", None) @property def page_number(self) -> int | None: if "data-page-number" in self.additional_attributes: try: return int(self.additional_attributes.get("data-page-number")) except ValueError: return None return None def remove_ids_and_class_from_table(soup: Tag): for tag in soup.find_all(True): if tag.name != "table": tag.attrs.pop("class", None) tag.attrs.pop("id", None) if tag.name in ["td", "th"]: tag.string = " ".join(tag.stripped_strings) return soup # Define specific elements class Document(OntologyElement): description: str = Field("Root element of the document", frozen=True) elementType: ElementTypeEnum = Field(ElementTypeEnum.layout, frozen=True) allowed_tags: List[str] = Field(["body"], frozen=True) class Section(OntologyElement): description: str = Field("A distinct part or subdivision of a document", frozen=True) elementType: ElementTypeEnum = Field(ElementTypeEnum.layout, 
frozen=True) allowed_tags: List[str] = Field(["section"], frozen=True) class Page(OntologyElement): description: str = Field("A single side of a paper in a document", frozen=True) elementType: ElementTypeEnum = Field(ElementTypeEnum.layout, frozen=True) allowed_tags: List[str] = Field(["div"], frozen=True) class Column(OntologyElement): description: str = Field("A vertical section of a page", frozen=True) elementType: ElementTypeEnum = Field(ElementTypeEnum.layout, frozen=True) allowed_tags: List[str] = Field(["div"], frozen=True) class Paragraph(OntologyElement): description: str = Field("A self-contained unit of discourse in writing", frozen=True) elementType: ElementTypeEnum = Field(ElementTypeEnum.text, frozen=True) allowed_tags: List[str] = Field(["p"], frozen=True) class Header(OntologyElement): description: str = Field("The top section of a page", frozen=True) elementType: ElementTypeEnum = Field(ElementTypeEnum.layout, frozen=True) allowed_tags: List[str] = Field(["header"], frozen=True) class Footer(OntologyElement): description: str = Field("The bottom section of a page", frozen=True) elementType: ElementTypeEnum = Field(ElementTypeEnum.layout, frozen=True) allowed_tags: List[str] = Field(["footer"], frozen=True) class Sidebar(OntologyElement): description: str = Field("A side section of a page", frozen=True) elementType: ElementTypeEnum = Field(ElementTypeEnum.layout, frozen=True) allowed_tags: List[str] = Field(["aside"], frozen=True) class PageBreak(OntologyElement): description: str = Field("A break between pages", frozen=True) elementType: ElementTypeEnum = Field(ElementTypeEnum.layout, frozen=True) allowed_tags: List[str] = Field(["hr"], frozen=True) class Title(OntologyElement): description: str = Field("Main heading of a document or section", frozen=True) elementType: ElementTypeEnum = Field(ElementTypeEnum.text, frozen=True) allowed_tags: List[str] = Field(["h1"], frozen=True) class Subtitle(OntologyElement): description: str = Field("Secondary 
title of a document or section", frozen=True) elementType: ElementTypeEnum = Field(ElementTypeEnum.text, frozen=True) allowed_tags: List[str] = Field(["h2"], frozen=True) class Heading(OntologyElement): description: str = Field("Section headings (levels 1-6)", frozen=True) elementType: ElementTypeEnum = Field(ElementTypeEnum.text, frozen=True) allowed_tags: List[str] = Field(["h1", "h2", "h3", "h4", "h5", "h6"], frozen=True) class NarrativeText(OntologyElement): description: str = Field("Main content text", frozen=True) elementType: ElementTypeEnum = Field(ElementTypeEnum.text, frozen=True) allowed_tags: List[str] = Field(["p"], frozen=True) class Quote(OntologyElement): description: str = Field("A repetition of someone else's statement", frozen=True) elementType: ElementTypeEnum = Field(ElementTypeEnum.text, frozen=True) allowed_tags: List[str] = Field(["blockquote"], frozen=True) class Footnote(OntologyElement): description: str = Field("A note at the bottom of a page", frozen=True) elementType: ElementTypeEnum = Field(ElementTypeEnum.text, frozen=True) allowed_tags: List[str] = Field(["div"], frozen=True) class Caption(OntologyElement): description: str = Field("Text describing a figure or image", frozen=True) elementType: ElementTypeEnum = Field(ElementTypeEnum.text, frozen=True) allowed_tags: List[str] = Field(["figcaption"], frozen=True) class PageNumber(OntologyElement): description: str = Field("The number of a page", frozen=True) elementType: ElementTypeEnum = Field(ElementTypeEnum.text, frozen=True) allowed_tags: List[str] = Field(["span"], frozen=True) class UncategorizedText(OntologyElement): description: str = Field("Miscellaneous text", frozen=True) elementType: ElementTypeEnum = Field(ElementTypeEnum.text, frozen=True) allowed_tags: List[str] = Field(["span"], frozen=True) class OrderedList(OntologyElement): description: str = Field("A list with a specific sequence", frozen=True) elementType: ElementTypeEnum = Field(ElementTypeEnum.list, frozen=True) 
allowed_tags: List[str] = Field(["ol"], frozen=True) class UnorderedList(OntologyElement): description: str = Field("A list without a specific sequence", frozen=True) elementType: ElementTypeEnum = Field(ElementTypeEnum.list, frozen=True) allowed_tags: List[str] = Field(["ul"], frozen=True) class DefinitionList(OntologyElement): description: str = Field("A list of terms and their definitions", frozen=True) elementType: ElementTypeEnum = Field(ElementTypeEnum.list, frozen=True) allowed_tags: List[str] = Field(["dl"], frozen=True) class ListItem(OntologyElement): description: str = Field("An item in a list", frozen=True) elementType: ElementTypeEnum = Field(ElementTypeEnum.list, frozen=True) allowed_tags: List[str] = Field(["li"], frozen=True) class Table(OntologyElement): description: str = Field("A structured set of data", frozen=True) elementType: ElementTypeEnum = Field(ElementTypeEnum.table, frozen=True) allowed_tags: List[str] = Field(["table"], frozen=True) def to_html(self, add_children=True) -> str: soup = BeautifulSoup(super().to_html(add_children), "html.parser") soup = remove_ids_and_class_from_table(soup) return str(soup) class TableBody(OntologyElement): description: str = Field("A body of the table", frozen=True) elementType: ElementTypeEnum = Field(ElementTypeEnum.table, frozen=True) allowed_tags: List[str] = Field(["tbody"], frozen=True) class TableHeader(OntologyElement): description: str = Field("A header of the table", frozen=True) elementType: ElementTypeEnum = Field(ElementTypeEnum.table, frozen=True) allowed_tags: List[str] = Field(["thead"], frozen=True) class TableRow(OntologyElement): description: str = Field("A row in a table", frozen=True) elementType: ElementTypeEnum = Field(ElementTypeEnum.table, frozen=True) allowed_tags: List[str] = Field(["tr"], frozen=True) class TableCell(OntologyElement): description: str = Field("A cell in a table", frozen=True) elementType: ElementTypeEnum = Field(ElementTypeEnum.table, frozen=True) allowed_tags: 
List[str] = Field(["td"], frozen=True) # Note(Pluto): Renamed from TableCellHeader to TableHeaderCell to be consistent with TableCell class TableCellHeader(OntologyElement): description: str = Field("A header cell in a table", frozen=True) elementType: ElementTypeEnum = Field(ElementTypeEnum.table, frozen=True) allowed_tags: List[str] = Field(["th"], frozen=True) class Image(OntologyElement): description: str = Field("A visual representation", frozen=True) elementType: ElementTypeEnum = Field(ElementTypeEnum.media, frozen=True) allowed_tags: List[str] = Field(["img"], frozen=True) class Figure(OntologyElement): description: str = Field("An illustration or diagram in a document", frozen=True) elementType: ElementTypeEnum = Field(ElementTypeEnum.media, frozen=True) allowed_tags: List[str] = Field(["figure"], frozen=True) class Video(OntologyElement): description: str = Field("A moving visual media element", frozen=True) elementType: ElementTypeEnum = Field(ElementTypeEnum.media, frozen=True) allowed_tags: List[str] = Field(["video"], frozen=True) class Audio(OntologyElement): description: str = Field("A sound or music element", frozen=True) elementType: ElementTypeEnum = Field(ElementTypeEnum.media, frozen=True) allowed_tags: List[str] = Field(["audio"], frozen=True) class Barcode(OntologyElement): description: str = Field("A machine-readable representation of data", frozen=True) elementType: ElementTypeEnum = Field(ElementTypeEnum.media, frozen=True) allowed_tags: List[str] = Field(["img"], frozen=True) class QRCode(OntologyElement): description: str = Field("A two-dimensional barcode", frozen=True) elementType: ElementTypeEnum = Field(ElementTypeEnum.media, frozen=True) allowed_tags: List[str] = Field(["img"], frozen=True) class Logo(OntologyElement): description: str = Field("A graphical representation of a company or brand", frozen=True) elementType: ElementTypeEnum = Field(ElementTypeEnum.media, frozen=True) allowed_tags: List[str] = Field(["img"], frozen=True) 
# --- Code elements ---
class CodeBlock(OntologyElement):
    description: str = Field(default="A block of programming code", frozen=True)
    elementType: ElementTypeEnum = Field(default=ElementTypeEnum.code, frozen=True)
    allowed_tags: List[str] = Field(default=["pre", "code"], frozen=True)


class InlineCode(OntologyElement):
    description: str = Field(default="Code within a line of text", frozen=True)
    elementType: ElementTypeEnum = Field(default=ElementTypeEnum.code, frozen=True)
    allowed_tags: List[str] = Field(default=["code"], frozen=True)


# --- Mathematical elements ---
class Formula(OntologyElement):
    description: str = Field(default="A mathematical formula", frozen=True)
    elementType: ElementTypeEnum = Field(default=ElementTypeEnum.mathematical, frozen=True)
    allowed_tags: List[str] = Field(default=["math"], frozen=True)


class Equation(OntologyElement):
    description: str = Field(default="A mathematical equation", frozen=True)
    elementType: ElementTypeEnum = Field(default=ElementTypeEnum.mathematical, frozen=True)
    allowed_tags: List[str] = Field(default=["math"], frozen=True)


# --- Reference elements ---
class FootnoteReference(OntologyElement):
    description: str = Field(
        default="A subscripted reference to a note at the bottom of a page",
        frozen=True,
    )
    elementType: ElementTypeEnum = Field(default=ElementTypeEnum.reference, frozen=True)
    allowed_tags: List[str] = Field(default=["sub"], frozen=True)


class Citation(OntologyElement):
    description: str = Field(default="A reference to a source", frozen=True)
    elementType: ElementTypeEnum = Field(default=ElementTypeEnum.reference, frozen=True)
    allowed_tags: List[str] = Field(default=["cite"], frozen=True)


class Bibliography(OntologyElement):
    description: str = Field(default="A list of sources", frozen=True)
    elementType: ElementTypeEnum = Field(default=ElementTypeEnum.reference, frozen=True)
    allowed_tags: List[str] = Field(default=["ul"], frozen=True)


class Glossary(OntologyElement):
    description: str = Field(default="A list of terms and their definitions", frozen=True)
    elementType: ElementTypeEnum = Field(default=ElementTypeEnum.reference, frozen=True)
    allowed_tags: List[str] = Field(default=["dl"], frozen=True)


# --- Metadata elements ---
class Author(OntologyElement):
    description: str = Field(default="The creator of the document", frozen=True)
    elementType: ElementTypeEnum = Field(default=ElementTypeEnum.metadata, frozen=True)
    allowed_tags: List[str] = Field(default=["meta"], frozen=True)


class MetaDate(OntologyElement):
    description: str = Field(default="The date associated with the document", frozen=True)
    elementType: ElementTypeEnum = Field(default=ElementTypeEnum.metadata, frozen=True)
    allowed_tags: List[str] = Field(default=["meta"], frozen=True)


class Keywords(OntologyElement):
    description: str = Field(default="Key terms associated with the document", frozen=True)
    elementType: ElementTypeEnum = Field(default=ElementTypeEnum.metadata, frozen=True)
    allowed_tags: List[str] = Field(default=["meta"], frozen=True)


class Abstract(OntologyElement):
    description: str = Field(default="A summary of the document", frozen=True)
    elementType: ElementTypeEnum = Field(default=ElementTypeEnum.metadata, frozen=True)
    allowed_tags: List[str] = Field(default=["section"], frozen=True)


# --- Navigation elements (TableOfContents is typed as a table) ---
class Hyperlink(OntologyElement):
    description: str = Field(
        default="A reference to data that can be directly followed",
        frozen=True,
    )
    elementType: ElementTypeEnum = Field(default=ElementTypeEnum.navigation, frozen=True)
    allowed_tags: List[str] = Field(default=["a"], frozen=True)


class TableOfContents(OntologyElement):
    description: str = Field(
        default=(
            "A list of the document's contents. "
            "Total table columns will be equal to the degree of hierarchy (n) "
            "plus 1 for the target value. "
            "Header Row: L1,L2,...Ln,Value"
        ),
        frozen=True,
    )
    elementType: ElementTypeEnum = Field(default=ElementTypeEnum.table, frozen=True)
    allowed_tags: List[str] = Field(default=["table"], frozen=True)

    def to_html(self, add_children=True) -> str:
        """Render via the base class, then pass the markup through the table clean-up."""
        rendered = super().to_html(add_children)
        parsed = BeautifulSoup(rendered, "html.parser")
        return str(remove_ids_and_class_from_table(parsed))


class Index(OntologyElement):
    description: str = Field(
        default="An alphabetical list of terms and their page numbers",
        frozen=True,
    )
    elementType: ElementTypeEnum = Field(default=ElementTypeEnum.navigation, frozen=True)
    allowed_tags: List[str] = Field(default=["nav"], frozen=True)


# --- Form elements ---
class Form(OntologyElement):
    description: str = Field(default="A document section with interactive controls", frozen=True)
    elementType: ElementTypeEnum = Field(default=ElementTypeEnum.form, frozen=True)
    allowed_tags: List[str] = Field(default=["form"], frozen=True)


class FormField(OntologyElement):
    description: str = Field(default="A property value of a form", frozen=True)
    elementType: ElementTypeEnum = Field(default=ElementTypeEnum.form, frozen=True)
    allowed_tags: List[str] = Field(default=["label"], frozen=True)


class FormFieldValue(OntologyElement):
    description: str = Field(default="A field for user input", frozen=True)
    elementType: ElementTypeEnum = Field(default=ElementTypeEnum.form, frozen=True)
    allowed_tags: List[str] = Field(default=["input"], frozen=True)

    def to_text(self, add_children=True, add_img_alt_text=True) -> str:
        """Return the base text followed by the ``value`` attribute when it is non-empty."""
        base_text = super().to_text(add_children, add_img_alt_text)
        field_value = self.additional_attributes.get("value", "")
        if field_value:
            return f"{base_text} {field_value}".strip()
        return base_text


class Checkbox(OntologyElement):
    description: str = Field(default="A small box that can be checked or unchecked", frozen=True)
    elementType: ElementTypeEnum = Field(default=ElementTypeEnum.form, frozen=True)
    allowed_tags: List[str] = Field(default=["input"], frozen=True)


class RadioButton(OntologyElement):
    description: str = Field(default="A circular button that can be selected", frozen=True)
    elementType: ElementTypeEnum = Field(default=ElementTypeEnum.form, frozen=True)
    allowed_tags: List[str] = Field(default=["input"], frozen=True)


class Button(OntologyElement):
    description: str = Field(default="An interactive button element", frozen=True)
    elementType: ElementTypeEnum = Field(default=ElementTypeEnum.form, frozen=True)
    allowed_tags: List[str] = Field(default=["button"], frozen=True)


# --- Annotation elements ---
class Comment(OntologyElement):
    description: str = Field(default="A note or remark", frozen=True)
    elementType: ElementTypeEnum = Field(default=ElementTypeEnum.annotation, frozen=True)
    allowed_tags: List[str] = Field(default=["span"], frozen=True)


class Highlight(OntologyElement):
    description: str = Field(default="Emphasized text or section", frozen=True)
    elementType: ElementTypeEnum = Field(default=ElementTypeEnum.annotation, frozen=True)
    allowed_tags: List[str] = Field(default=["mark"], frozen=True)


class RevisionInsertion(OntologyElement):
    description: str = Field(default="A changed or edited element", frozen=True)
    elementType: ElementTypeEnum = Field(default=ElementTypeEnum.annotation, frozen=True)
    allowed_tags: List[str] = Field(default=["ins"], frozen=True)


class RevisionDeletion(OntologyElement):
    description: str = Field(default="A changed or edited element", frozen=True)
    elementType: ElementTypeEnum = Field(default=ElementTypeEnum.annotation, frozen=True)
    allowed_tags: List[str] = Field(default=["del"], frozen=True)


# --- Specialized text elements ---
class Address(OntologyElement):
    description: str = Field(default="A physical location", frozen=True)
    elementType: ElementTypeEnum = Field(default=ElementTypeEnum.specialized_text, frozen=True)
    allowed_tags: List[str] = Field(default=["address"], frozen=True)


class EmailAddress(OntologyElement):
    description: str = Field(default="An email address", frozen=True)
    elementType: ElementTypeEnum = Field(default=ElementTypeEnum.specialized_text, frozen=True)
    allowed_tags: List[str] = Field(default=["a"], frozen=True)


class PhoneNumber(OntologyElement):
    description: str = Field(default="A telephone number", frozen=True)
    elementType: ElementTypeEnum = Field(default=ElementTypeEnum.specialized_text, frozen=True)
    allowed_tags: List[str] = Field(default=["span"], frozen=True)


class CalendarDate(OntologyElement):
    description: str = Field(default="A calendar date", frozen=True)
    elementType: ElementTypeEnum = Field(default=ElementTypeEnum.specialized_text, frozen=True)
    allowed_tags: List[str] = Field(default=["time"], frozen=True)


class Time(OntologyElement):
    description: str = Field(default="A specific time", frozen=True)
    elementType: ElementTypeEnum = Field(default=ElementTypeEnum.specialized_text, frozen=True)
    allowed_tags: List[str] = Field(default=["time"], frozen=True)


class Currency(OntologyElement):
    description: str = Field(default="A monetary value", frozen=True)
    elementType: ElementTypeEnum = Field(default=ElementTypeEnum.specialized_text, frozen=True)
    allowed_tags: List[str] = Field(default=["span"], frozen=True)


class Measurement(OntologyElement):
    description: str = Field(default="A quantitative value with units", frozen=True)
    elementType: ElementTypeEnum = Field(default=ElementTypeEnum.specialized_text, frozen=True)
    allowed_tags: List[str] = Field(default=["span"], frozen=True)


# --- Document-specific elements ---
class Letterhead(OntologyElement):
    description: str = Field(default="The heading at the top of a letter", frozen=True)
    elementType: ElementTypeEnum = Field(default=ElementTypeEnum.document_specific, frozen=True)
    allowed_tags: List[str] = Field(default=["header"], frozen=True)


class Signature(OntologyElement):
    description: str = Field(default="A person's name written in a distinctive way", frozen=True)
    elementType: ElementTypeEnum = Field(default=ElementTypeEnum.document_specific, frozen=True)
    allowed_tags: List[str] = Field(default=["img", "svg"], frozen=True)


class Watermark(OntologyElement):
    description: str = Field(
        default="A faint design made in paper during manufacture",
        frozen=True,
    )
    elementType: ElementTypeEnum = Field(default=ElementTypeEnum.document_specific, frozen=True)
    allowed_tags: List[str] = Field(default=["div"], frozen=True)


class Stamp(OntologyElement):
    description: str = Field(default="An official mark or seal", frozen=True)
    elementType: ElementTypeEnum = Field(default=ElementTypeEnum.document_specific, frozen=True)
    allowed_tags: List[str] = Field(default=["img", "svg"], frozen=True)
Memory