PK]ZZZ�vH��soupsieve/__init__.py""" Soup Sieve. A CSS selector filter for BeautifulSoup4. MIT License Copyright (c) 2018 Isaac Muse Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ from __future__ import annotations from .__meta__ import __version__, __version_info__ # noqa: F401 from . import css_parser as cp from . import css_match as cm from . import css_types as ct from .util import DEBUG, SelectorSyntaxError # noqa: F401 import bs4 # type: ignore[import] from typing import Any, Iterator, Iterable __all__ = ( 'DEBUG', 'SelectorSyntaxError', 'SoupSieve', 'closest', 'compile', 'filter', 'iselect', 'match', 'select', 'select_one' ) SoupSieve = cm.SoupSieve def compile( # noqa: A001 pattern: str, namespaces: dict[str, str] | None = None, flags: int = 0, *, custom: dict[str, str] | None = None, **kwargs: Any ) -> cm.SoupSieve: """Compile CSS pattern.""" if isinstance(pattern, SoupSieve): if flags: raise ValueError("Cannot process 'flags' argument on a compiled selector list") elif namespaces is not None: raise ValueError("Cannot process 'namespaces' argument on a compiled selector list") elif custom is not None: raise ValueError("Cannot process 'custom' argument on a compiled selector list") return pattern return cp._cached_css_compile( pattern, ct.Namespaces(namespaces) if namespaces is not None else namespaces, ct.CustomSelectors(custom) if custom is not None else custom, flags ) def purge() -> None: """Purge cached patterns.""" cp._purge_cache() def closest( select: str, tag: bs4.Tag, namespaces: dict[str, str] | None = None, flags: int = 0, *, custom: dict[str, str] | None = None, **kwargs: Any ) -> bs4.Tag: """Match closest ancestor.""" return compile(select, namespaces, flags, **kwargs).closest(tag) def match( select: str, tag: bs4.Tag, namespaces: dict[str, str] | None = None, flags: int = 0, *, custom: dict[str, str] | None = None, **kwargs: Any ) -> bool: """Match node.""" return compile(select, namespaces, flags, **kwargs).match(tag) def filter( # noqa: A001 select: str, iterable: Iterable[bs4.Tag], namespaces: dict[str, str] | None = None, flags: int = 0, *, custom: dict[str, str] | None = None, **kwargs: Any ) -> list[bs4.Tag]: """Filter list of nodes.""" return compile(select, namespaces, flags, **kwargs).filter(iterable) def select_one( select: str, tag: bs4.Tag, namespaces: dict[str, str] | None = None, flags: int = 0, *, custom: dict[str, str] | None = None, **kwargs: Any ) -> bs4.Tag: """Select a single tag.""" return compile(select, namespaces, flags, **kwargs).select_one(tag) def select( select: str, tag: bs4.Tag, namespaces: dict[str, str] | None = None, limit: int = 0, flags: int = 0, *, custom: dict[str, str] | None = None, **kwargs: Any ) -> list[bs4.Tag]: """Select the specified tags.""" return compile(select, namespaces, flags, **kwargs).select(tag, limit) def iselect( select: str, tag: bs4.Tag, namespaces: dict[str, str] | None = None, limit: int = 0, flags: int = 0, *, custom: dict[str, str] | None = None, **kwargs: Any ) -> Iterator[bs4.Tag]: """Iterate the specified tags.""" yield from compile(select, namespaces, flags, **kwargs).iselect(tag, limit) def escape(ident: str) -> str: """Escape identifier.""" return cp.escape(ident) PK]ZZZ�T�nnsoupsieve/__meta__.py"""Meta related things.""" from __future__ import annotations from collections import namedtuple import re RE_VER = re.compile( r'''(?x) (?P<major>\d+)(?:\.(?P<minor>\d+))?(?:\.(?P<micro>\d+))? (?:(?P<type>a|b|rc)(?P<pre>\d+))? (?:\.post(?P<post>\d+))? (?:\.dev(?P<dev>\d+))? ''' ) REL_MAP = { ".dev": "", ".dev-alpha": "a", ".dev-beta": "b", ".dev-candidate": "rc", "alpha": "a", "beta": "b", "candidate": "rc", "final": "" } DEV_STATUS = { ".dev": "2 - Pre-Alpha", ".dev-alpha": "2 - Pre-Alpha", ".dev-beta": "2 - Pre-Alpha", ".dev-candidate": "2 - Pre-Alpha", "alpha": "3 - Alpha", "beta": "4 - Beta", "candidate": "4 - Beta", "final": "5 - Production/Stable" } PRE_REL_MAP = {"a": 'alpha', "b": 'beta', "rc": 'candidate'} class Version(namedtuple("Version", ["major", "minor", "micro", "release", "pre", "post", "dev"])): """ Get the version (PEP 440). A biased approach to the PEP 440 semantic version. Provides a tuple structure which is sorted for comparisons `v1 > v2` etc. (major, minor, micro, release type, pre-release build, post-release build, development release build) Release types are named in is such a way they are comparable with ease. Accessors to check if a development, pre-release, or post-release build. Also provides accessor to get development status for setup files. How it works (currently): - You must specify a release type as either `final`, `alpha`, `beta`, or `candidate`. - To define a development release, you can use either `.dev`, `.dev-alpha`, `.dev-beta`, or `.dev-candidate`. The dot is used to ensure all development specifiers are sorted before `alpha`. You can specify a `dev` number for development builds, but do not have to as implicit development releases are allowed. - You must specify a `pre` value greater than zero if using a prerelease as this project (not PEP 440) does not allow implicit prereleases. - You can optionally set `post` to a value greater than zero to make the build a post release. While post releases are technically allowed in prereleases, it is strongly discouraged, so we are rejecting them. It should be noted that we do not allow `post0` even though PEP 440 does not restrict this. This project specifically does not allow implicit post releases. - It should be noted that we do not support epochs `1!` or local versions `+some-custom.version-1`. Acceptable version releases: ``` Version(1, 0, 0, "final") 1.0 Version(1, 2, 0, "final") 1.2 Version(1, 2, 3, "final") 1.2.3 Version(1, 2, 0, ".dev-alpha", pre=4) 1.2a4 Version(1, 2, 0, ".dev-beta", pre=4) 1.2b4 Version(1, 2, 0, ".dev-candidate", pre=4) 1.2rc4 Version(1, 2, 0, "final", post=1) 1.2.post1 Version(1, 2, 3, ".dev") 1.2.3.dev0 Version(1, 2, 3, ".dev", dev=1) 1.2.3.dev1 ``` """ def __new__( cls, major: int, minor: int, micro: int, release: str = "final", pre: int = 0, post: int = 0, dev: int = 0 ) -> Version: """Validate version info.""" # Ensure all parts are positive integers. for value in (major, minor, micro, pre, post): if not (isinstance(value, int) and value >= 0): raise ValueError("All version parts except 'release' should be integers.") if release not in REL_MAP: raise ValueError(f"'{release}' is not a valid release type.") # Ensure valid pre-release (we do not allow implicit pre-releases). if ".dev-candidate" < release < "final": if pre == 0: raise ValueError("Implicit pre-releases not allowed.") elif dev: raise ValueError("Version is not a development release.") elif post: raise ValueError("Post-releases are not allowed with pre-releases.") # Ensure valid development or development/pre release elif release < "alpha": if release > ".dev" and pre == 0: raise ValueError("Implicit pre-release not allowed.") elif post: raise ValueError("Post-releases are not allowed with pre-releases.") # Ensure a valid normal release else: if pre: raise ValueError("Version is not a pre-release.") elif dev: raise ValueError("Version is not a development release.") return super().__new__(cls, major, minor, micro, release, pre, post, dev) def _is_pre(self) -> bool: """Is prerelease.""" return bool(self.pre > 0) def _is_dev(self) -> bool: """Is development.""" return bool(self.release < "alpha") def _is_post(self) -> bool: """Is post.""" return bool(self.post > 0) def _get_dev_status(self) -> str: # pragma: no cover """Get development status string.""" return DEV_STATUS[self.release] def _get_canonical(self) -> str: """Get the canonical output string.""" # Assemble major, minor, micro version and append `pre`, `post`, or `dev` if needed.. if self.micro == 0: ver = f"{self.major}.{self.minor}" else: ver = f"{self.major}.{self.minor}.{self.micro}" if self._is_pre(): ver += f'{REL_MAP[self.release]}{self.pre}' if self._is_post(): ver += f".post{self.post}" if self._is_dev(): ver += f".dev{self.dev}" return ver def parse_version(ver: str) -> Version: """Parse version into a comparable Version tuple.""" m = RE_VER.match(ver) if m is None: raise ValueError(f"'{ver}' is not a valid version") # Handle major, minor, micro major = int(m.group('major')) minor = int(m.group('minor')) if m.group('minor') else 0 micro = int(m.group('micro')) if m.group('micro') else 0 # Handle pre releases if m.group('type'): release = PRE_REL_MAP[m.group('type')] pre = int(m.group('pre')) else: release = "final" pre = 0 # Handle development releases dev = m.group('dev') if m.group('dev') else 0 if m.group('dev'): dev = int(m.group('dev')) release = '.dev-' + release if pre else '.dev' else: dev = 0 # Handle post post = int(m.group('post')) if m.group('post') else 0 return Version(major, minor, micro, release, pre, post, dev) __version_info__ = Version(2, 5, 0, "final") __version__ = __version_info__._get_canonical() PK]ZZZŤ�`�`�soupsieve/css_match.py"""CSS matcher.""" from __future__ import annotations from datetime import datetime from . import util import re from . import css_types as ct import unicodedata import bs4 # type: ignore[import] from typing import Iterator, Iterable, Any, Callable, Sequence, cast # noqa: F401 # Empty tag pattern (whitespace okay) RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]') RE_NOT_WS = re.compile('[^ \t\r\n\f]+') # Relationships REL_PARENT = ' ' REL_CLOSE_PARENT = '>' REL_SIBLING = '~' REL_CLOSE_SIBLING = '+' # Relationships for :has() (forward looking) REL_HAS_PARENT = ': ' REL_HAS_CLOSE_PARENT = ':>' REL_HAS_SIBLING = ':~' REL_HAS_CLOSE_SIBLING = ':+' NS_XHTML = 'http://www.w3.org/1999/xhtml' NS_XML = 'http://www.w3.org/XML/1998/namespace' DIR_FLAGS = ct.SEL_DIR_LTR | ct.SEL_DIR_RTL RANGES = ct.SEL_IN_RANGE | ct.SEL_OUT_OF_RANGE DIR_MAP = { 'ltr': ct.SEL_DIR_LTR, 'rtl': ct.SEL_DIR_RTL, 'auto': 0 } RE_NUM = re.compile(r"^(?P<value>-?(?:[0-9]{1,}(\.[0-9]+)?|\.[0-9]+))$") RE_TIME = re.compile(r'^(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$') RE_MONTH = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})$') RE_WEEK = re.compile(r'^(?P<year>[0-9]{4,})-W(?P<week>[0-9]{2})$') RE_DATE = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})$') RE_DATETIME = re.compile( r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})T(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$' ) RE_WILD_STRIP = re.compile(r'(?:(?:-\*-)(?:\*(?:-|$))*|-\*$)') MONTHS_30 = (4, 6, 9, 11) # April, June, September, and November FEB = 2 SHORT_MONTH = 30 LONG_MONTH = 31 FEB_MONTH = 28 FEB_LEAP_MONTH = 29 DAYS_IN_WEEK = 7 class _FakeParent: """ Fake parent class. When we have a fragment with no `BeautifulSoup` document object, we can't evaluate `nth` selectors properly. Create a temporary fake parent so we can traverse the root element as a child. """ def __init__(self, element: bs4.Tag) -> None: """Initialize.""" self.contents = [element] def __len__(self) -> bs4.PageElement: """Length.""" return len(self.contents) class _DocumentNav: """Navigate a Beautiful Soup document.""" @classmethod def assert_valid_input(cls, tag: Any) -> None: """Check if valid input tag or document.""" # Fail on unexpected types. if not cls.is_tag(tag): raise TypeError(f"Expected a BeautifulSoup 'Tag', but instead received type {type(tag)}") @staticmethod def is_doc(obj: bs4.Tag) -> bool: """Is `BeautifulSoup` object.""" return isinstance(obj, bs4.BeautifulSoup) @staticmethod def is_tag(obj: bs4.PageElement) -> bool: """Is tag.""" return isinstance(obj, bs4.Tag) @staticmethod def is_declaration(obj: bs4.PageElement) -> bool: # pragma: no cover """Is declaration.""" return isinstance(obj, bs4.Declaration) @staticmethod def is_cdata(obj: bs4.PageElement) -> bool: """Is CDATA.""" return isinstance(obj, bs4.CData) @staticmethod def is_processing_instruction(obj: bs4.PageElement) -> bool: # pragma: no cover """Is processing instruction.""" return isinstance(obj, bs4.ProcessingInstruction) @staticmethod def is_navigable_string(obj: bs4.PageElement) -> bool: """Is navigable string.""" return isinstance(obj, bs4.NavigableString) @staticmethod def is_special_string(obj: bs4.PageElement) -> bool: """Is special string.""" return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype)) @classmethod def is_content_string(cls, obj: bs4.PageElement) -> bool: """Check if node is content string.""" return cls.is_navigable_string(obj) and not cls.is_special_string(obj) @staticmethod def create_fake_parent(el: bs4.Tag) -> _FakeParent: """Create fake parent for a given element.""" return _FakeParent(el) @staticmethod def is_xml_tree(el: bs4.Tag) -> bool: """Check if element (or document) is from a XML tree.""" return bool(el._is_xml) def is_iframe(self, el: bs4.Tag) -> bool: """Check if element is an `iframe`.""" return bool( ((el.name if self.is_xml_tree(el) else util.lower(el.name)) == 'iframe') and self.is_html_tag(el) # type: ignore[attr-defined] ) def is_root(self, el: bs4.Tag) -> bool: """ Return whether element is a root element. We check that the element is the root of the tree (which we have already pre-calculated), and we check if it is the root element under an `iframe`. """ root = self.root and self.root is el # type: ignore[attr-defined] if not root: parent = self.get_parent(el) root = parent is not None and self.is_html and self.is_iframe(parent) # type: ignore[attr-defined] return root def get_contents(self, el: bs4.Tag, no_iframe: bool = False) -> Iterator[bs4.PageElement]: """Get contents or contents in reverse.""" if not no_iframe or not self.is_iframe(el): yield from el.contents def get_children( self, el: bs4.Tag, start: int | None = None, reverse: bool = False, tags: bool = True, no_iframe: bool = False ) -> Iterator[bs4.PageElement]: """Get children.""" if not no_iframe or not self.is_iframe(el): last = len(el.contents) - 1 if start is None: index = last if reverse else 0 else: index = start end = -1 if reverse else last + 1 incr = -1 if reverse else 1 if 0 <= index <= last: while index != end: node = el.contents[index] index += incr if not tags or self.is_tag(node): yield node def get_descendants( self, el: bs4.Tag, tags: bool = True, no_iframe: bool = False ) -> Iterator[bs4.PageElement]: """Get descendants.""" if not no_iframe or not self.is_iframe(el): next_good = None for child in el.descendants: if next_good is not None: if child is not next_good: continue next_good = None is_tag = self.is_tag(child) if no_iframe and is_tag and self.is_iframe(child): if child.next_sibling is not None: next_good = child.next_sibling else: last_child = child while self.is_tag(last_child) and last_child.contents: last_child = last_child.contents[-1] next_good = last_child.next_element yield child if next_good is None: break # Coverage isn't seeing this even though it's executed continue # pragma: no cover if not tags or is_tag: yield child def get_parent(self, el: bs4.Tag, no_iframe: bool = False) -> bs4.Tag: """Get parent.""" parent = el.parent if no_iframe and parent is not None and self.is_iframe(parent): parent = None return parent @staticmethod def get_tag_name(el: bs4.Tag) -> str | None: """Get tag.""" return cast('str | None', el.name) @staticmethod def get_prefix_name(el: bs4.Tag) -> str | None: """Get prefix.""" return cast('str | None', el.prefix) @staticmethod def get_uri(el: bs4.Tag) -> str | None: """Get namespace `URI`.""" return cast('str | None', el.namespace) @classmethod def get_next(cls, el: bs4.Tag, tags: bool = True) -> bs4.PageElement: """Get next sibling tag.""" sibling = el.next_sibling while tags and not cls.is_tag(sibling) and sibling is not None: sibling = sibling.next_sibling return sibling @classmethod def get_previous(cls, el: bs4.Tag, tags: bool = True) -> bs4.PageElement: """Get previous sibling tag.""" sibling = el.previous_sibling while tags and not cls.is_tag(sibling) and sibling is not None: sibling = sibling.previous_sibling return sibling @staticmethod def has_html_ns(el: bs4.Tag) -> bool: """ Check if element has an HTML namespace. This is a bit different than whether a element is treated as having an HTML namespace, like we do in the case of `is_html_tag`. """ ns = getattr(el, 'namespace') if el else None # noqa: B009 return bool(ns and ns == NS_XHTML) @staticmethod def split_namespace(el: bs4.Tag, attr_name: str) -> tuple[str | None, str | None]: """Return namespace and attribute name without the prefix.""" return getattr(attr_name, 'namespace', None), getattr(attr_name, 'name', None) @classmethod def normalize_value(cls, value: Any) -> str | Sequence[str]: """Normalize the value to be a string or list of strings.""" # Treat `None` as empty string. if value is None: return '' # Pass through strings if (isinstance(value, str)): return value # If it's a byte string, convert it to Unicode, treating it as UTF-8. if isinstance(value, bytes): return value.decode("utf8") # BeautifulSoup supports sequences of attribute values, so make sure the children are strings. if isinstance(value, Sequence): new_value = [] for v in value: if not isinstance(v, (str, bytes)) and isinstance(v, Sequence): # This is most certainly a user error and will crash and burn later. # To keep things working, we'll do what we do with all objects, # And convert them to strings. new_value.append(str(v)) else: # Convert the child to a string new_value.append(cast(str, cls.normalize_value(v))) return new_value # Try and make anything else a string return str(value) @classmethod def get_attribute_by_name( cls, el: bs4.Tag, name: str, default: str | Sequence[str] | None = None ) -> str | Sequence[str] | None: """Get attribute by name.""" value = default if el._is_xml: try: value = cls.normalize_value(el.attrs[name]) except KeyError: pass else: for k, v in el.attrs.items(): if util.lower(k) == name: value = cls.normalize_value(v) break return value @classmethod def iter_attributes(cls, el: bs4.Tag) -> Iterator[tuple[str, str | Sequence[str] | None]]: """Iterate attributes.""" for k, v in el.attrs.items(): yield k, cls.normalize_value(v) @classmethod def get_classes(cls, el: bs4.Tag) -> Sequence[str]: """Get classes.""" classes = cls.get_attribute_by_name(el, 'class', []) if isinstance(classes, str): classes = RE_NOT_WS.findall(classes) return cast(Sequence[str], classes) def get_text(self, el: bs4.Tag, no_iframe: bool = False) -> str: """Get text.""" return ''.join( [node for node in self.get_descendants(el, tags=False, no_iframe=no_iframe) if self.is_content_string(node)] ) def get_own_text(self, el: bs4.Tag, no_iframe: bool = False) -> list[str]: """Get Own Text.""" return [node for node in self.get_contents(el, no_iframe=no_iframe) if self.is_content_string(node)] class Inputs: """Class for parsing and validating input items.""" @staticmethod def validate_day(year: int, month: int, day: int) -> bool: """Validate day.""" max_days = LONG_MONTH if month == FEB: max_days = FEB_LEAP_MONTH if ((year % 4 == 0) and (year % 100 != 0)) or (year % 400 == 0) else FEB_MONTH elif month in MONTHS_30: max_days = SHORT_MONTH return 1 <= day <= max_days @staticmethod def validate_week(year: int, week: int) -> bool: """Validate week.""" max_week = datetime.strptime(f"{12}-{31}-{year}", "%m-%d-%Y").isocalendar()[1] if max_week == 1: max_week = 53 return 1 <= week <= max_week @staticmethod def validate_month(month: int) -> bool: """Validate month.""" return 1 <= month <= 12 @staticmethod def validate_year(year: int) -> bool: """Validate year.""" return 1 <= year @staticmethod def validate_hour(hour: int) -> bool: """Validate hour.""" return 0 <= hour <= 23 @staticmethod def validate_minutes(minutes: int) -> bool: """Validate minutes.""" return 0 <= minutes <= 59 @classmethod def parse_value(cls, itype: str, value: str | None) -> tuple[float, ...] | None: """Parse the input value.""" parsed = None # type: tuple[float, ...] | None if value is None: return value if itype == "date": m = RE_DATE.match(value) if m: year = int(m.group('year'), 10) month = int(m.group('month'), 10) day = int(m.group('day'), 10) if cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day): parsed = (year, month, day) elif itype == "month": m = RE_MONTH.match(value) if m: year = int(m.group('year'), 10) month = int(m.group('month'), 10) if cls.validate_year(year) and cls.validate_month(month): parsed = (year, month) elif itype == "week": m = RE_WEEK.match(value) if m: year = int(m.group('year'), 10) week = int(m.group('week'), 10) if cls.validate_year(year) and cls.validate_week(year, week): parsed = (year, week) elif itype == "time": m = RE_TIME.match(value) if m: hour = int(m.group('hour'), 10) minutes = int(m.group('minutes'), 10) if cls.validate_hour(hour) and cls.validate_minutes(minutes): parsed = (hour, minutes) elif itype == "datetime-local": m = RE_DATETIME.match(value) if m: year = int(m.group('year'), 10) month = int(m.group('month'), 10) day = int(m.group('day'), 10) hour = int(m.group('hour'), 10) minutes = int(m.group('minutes'), 10) if ( cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day) and cls.validate_hour(hour) and cls.validate_minutes(minutes) ): parsed = (year, month, day, hour, minutes) elif itype in ("number", "range"): m = RE_NUM.match(value) if m: parsed = (float(m.group('value')),) return parsed class CSSMatch(_DocumentNav): """Perform CSS matching.""" def __init__( self, selectors: ct.SelectorList, scope: bs4.Tag, namespaces: ct.Namespaces | None, flags: int ) -> None: """Initialize.""" self.assert_valid_input(scope) self.tag = scope self.cached_meta_lang = [] # type: list[tuple[str, str]] self.cached_default_forms = [] # type: list[tuple[bs4.Tag, bs4.Tag]] self.cached_indeterminate_forms = [] # type: list[tuple[bs4.Tag, str, bool]] self.selectors = selectors self.namespaces = {} if namespaces is None else namespaces # type: ct.Namespaces | dict[str, str] self.flags = flags self.iframe_restrict = False # Find the root element for the whole tree doc = scope parent = self.get_parent(doc) while parent: doc = parent parent = self.get_parent(doc) root = None if not self.is_doc(doc): root = doc else: for child in self.get_children(doc): root = child break self.root = root self.scope = scope if scope is not doc else root self.has_html_namespace = self.has_html_ns(root) # A document can be both XML and HTML (XHTML) self.is_xml = self.is_xml_tree(doc) self.is_html = not self.is_xml or self.has_html_namespace def supports_namespaces(self) -> bool: """Check if namespaces are supported in the HTML type.""" return self.is_xml or self.has_html_namespace def get_tag_ns(self, el: bs4.Tag) -> str: """Get tag namespace.""" if self.supports_namespaces(): namespace = '' ns = self.get_uri(el) if ns: namespace = ns else: namespace = NS_XHTML return namespace def is_html_tag(self, el: bs4.Tag) -> bool: """Check if tag is in HTML namespace.""" return self.get_tag_ns(el) == NS_XHTML def get_tag(self, el: bs4.Tag) -> str | None: """Get tag.""" name = self.get_tag_name(el) return util.lower(name) if name is not None and not self.is_xml else name def get_prefix(self, el: bs4.Tag) -> str | None: """Get prefix.""" prefix = self.get_prefix_name(el) return util.lower(prefix) if prefix is not None and not self.is_xml else prefix def find_bidi(self, el: bs4.Tag) -> int | None: """Get directionality from element text.""" for node in self.get_children(el, tags=False): # Analyze child text nodes if self.is_tag(node): # Avoid analyzing certain elements specified in the specification. direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(node, 'dir', '')), None) if ( self.get_tag(node) in ('bdi', 'script', 'style', 'textarea', 'iframe') or not self.is_html_tag(node) or direction is not None ): continue # pragma: no cover # Check directionality of this node's text value = self.find_bidi(node) if value is not None: return value # Direction could not be determined continue # pragma: no cover # Skip `doctype` comments, etc. if self.is_special_string(node): continue # Analyze text nodes for directionality. for c in node: bidi = unicodedata.bidirectional(c) if bidi in ('AL', 'R', 'L'): return ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL return None def extended_language_filter(self, lang_range: str, lang_tag: str) -> bool: """Filter the language tags.""" match = True lang_range = RE_WILD_STRIP.sub('-', lang_range).lower() ranges = lang_range.split('-') subtags = lang_tag.lower().split('-') length = len(ranges) slength = len(subtags) rindex = 0 sindex = 0 r = ranges[rindex] s = subtags[sindex] # Empty specified language should match unspecified language attributes if length == 1 and slength == 1 and not r and r == s: return True # Primary tag needs to match if (r != '*' and r != s) or (r == '*' and slength == 1 and not s): match = False rindex += 1 sindex += 1 # Match until we run out of ranges while match and rindex < length: r = ranges[rindex] try: s = subtags[sindex] except IndexError: # Ran out of subtags, # but we still have ranges match = False continue # Empty range if not r: match = False continue # Matched range elif s == r: rindex += 1 # Implicit wildcard cannot match # singletons elif len(s) == 1: match = False continue # Implicitly matched, so grab next subtag sindex += 1 return match def match_attribute_name( self, el: bs4.Tag, attr: str, prefix: str | None ) -> str | Sequence[str] | None: """Match attribute name and return value if it exists.""" value = None if self.supports_namespaces(): value = None # If we have not defined namespaces, we can't very well find them, so don't bother trying. if prefix: ns = self.namespaces.get(prefix) if ns is None and prefix != '*': return None else: ns = None for k, v in self.iter_attributes(el): # Get attribute parts namespace, name = self.split_namespace(el, k) # Can't match a prefix attribute as we haven't specified one to match # Try to match it normally as a whole `p:a` as selector may be trying `p\:a`. if ns is None: if (self.is_xml and attr == k) or (not self.is_xml and util.lower(attr) == util.lower(k)): value = v break # Coverage is not finding this even though it is executed. # Adding a print statement before this (and erasing coverage) causes coverage to find the line. # Ignore the false positive message. continue # pragma: no cover # We can't match our desired prefix attribute as the attribute doesn't have a prefix if namespace is None or ns != namespace and prefix != '*': continue # The attribute doesn't match. if (util.lower(attr) != util.lower(name)) if not self.is_xml else (attr != name): continue value = v break else: for k, v in self.iter_attributes(el): if util.lower(attr) != util.lower(k): continue value = v break return value def match_namespace(self, el: bs4.Tag, tag: ct.SelectorTag) -> bool: """Match the namespace of the element.""" match = True namespace = self.get_tag_ns(el) default_namespace = self.namespaces.get('') tag_ns = '' if tag.prefix is None else self.namespaces.get(tag.prefix) # We must match the default namespace if one is not provided if tag.prefix is None and (default_namespace is not None and namespace != default_namespace): match = False # If we specified `|tag`, we must not have a namespace. elif (tag.prefix is not None and tag.prefix == '' and namespace): match = False # Verify prefix matches elif ( tag.prefix and tag.prefix != '*' and (tag_ns is None or namespace != tag_ns) ): match = False return match def match_attributes(self, el: bs4.Tag, attributes: tuple[ct.SelectorAttribute, ...]) -> bool: """Match attributes.""" match = True if attributes: for a in attributes: temp = self.match_attribute_name(el, a.attribute, a.prefix) pattern = a.xml_type_pattern if self.is_xml and a.xml_type_pattern else a.pattern if temp is None: match = False break value = temp if isinstance(temp, str) else ' '.join(temp) if pattern is None: continue elif pattern.match(value) is None: match = False break return match def match_tagname(self, el: bs4.Tag, tag: ct.SelectorTag) -> bool: """Match tag name.""" name = (util.lower(tag.name) if not self.is_xml and tag.name is not None else tag.name) return not ( name is not None and name not in (self.get_tag(el), '*') ) def match_tag(self, el: bs4.Tag, tag: ct.SelectorTag | None) -> bool: """Match the tag.""" match = True if tag is not None: # Verify namespace if not self.match_namespace(el, tag): match = False if not self.match_tagname(el, tag): match = False return match def match_past_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool: """Match past relationship.""" found = False # I don't think this can ever happen, but it makes `mypy` happy if isinstance(relation[0], ct.SelectorNull): # pragma: no cover return found if relation[0].rel_type == REL_PARENT: parent = self.get_parent(el, no_iframe=self.iframe_restrict) while not found and parent: found = self.match_selectors(parent, relation) parent = self.get_parent(parent, no_iframe=self.iframe_restrict) elif relation[0].rel_type == REL_CLOSE_PARENT: parent = self.get_parent(el, no_iframe=self.iframe_restrict) if parent: found = self.match_selectors(parent, relation) elif relation[0].rel_type == REL_SIBLING: sibling = self.get_previous(el) while not found and sibling: found = self.match_selectors(sibling, relation) sibling = self.get_previous(sibling) elif relation[0].rel_type == REL_CLOSE_SIBLING: sibling = self.get_previous(el) if sibling and self.is_tag(sibling): found = self.match_selectors(sibling, relation) return found def match_future_child(self, parent: bs4.Tag, relation: ct.SelectorList, recursive: bool = False) -> bool: """Match future child.""" match = False if recursive: children = self.get_descendants # type: Callable[..., Iterator[bs4.Tag]] else: children = self.get_children for child in children(parent, no_iframe=self.iframe_restrict): match = self.match_selectors(child, relation) if match: break return match def match_future_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool: """Match future relationship.""" found = False # I don't think this can ever happen, but it makes `mypy` happy if isinstance(relation[0], ct.SelectorNull): # pragma: no cover return found if relation[0].rel_type == REL_HAS_PARENT: found = self.match_future_child(el, relation, True) elif relation[0].rel_type == REL_HAS_CLOSE_PARENT: found = self.match_future_child(el, relation) elif relation[0].rel_type == REL_HAS_SIBLING: sibling = self.get_next(el) while not found and sibling: found = self.match_selectors(sibling, relation) sibling = self.get_next(sibling) elif relation[0].rel_type == REL_HAS_CLOSE_SIBLING: sibling = self.get_next(el) if sibling and self.is_tag(sibling): found = self.match_selectors(sibling, relation) return found def match_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool: """Match relationship to other elements.""" found = False if isinstance(relation[0], ct.SelectorNull) or relation[0].rel_type is None: return found if relation[0].rel_type.startswith(':'): found = self.match_future_relations(el, relation) else: found = self.match_past_relations(el, relation) return found def match_id(self, el: bs4.Tag, ids: tuple[str, ...]) -> bool: """Match element's ID.""" found = True for i in ids: if i != self.get_attribute_by_name(el, 'id', ''): found = False break return found def match_classes(self, el: bs4.Tag, classes: tuple[str, ...]) -> bool: """Match element's classes.""" current_classes = self.get_classes(el) found = True for c in classes: if c not in current_classes: found = False break return found def match_root(self, el: bs4.Tag) -> bool: """Match element as root.""" is_root = self.is_root(el) if is_root: sibling = self.get_previous(el, tags=False) while is_root and sibling is not None: if ( self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or self.is_cdata(sibling) ): is_root = False else: sibling = self.get_previous(sibling, tags=False) if is_root: sibling = self.get_next(el, tags=False) while is_root and sibling is not None: if ( self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or self.is_cdata(sibling) ): is_root = False else: sibling = self.get_next(sibling, tags=False) return is_root def match_scope(self, el: bs4.Tag) -> bool: """Match element as scope.""" return self.scope is el def match_nth_tag_type(self, el: bs4.Tag, child: bs4.Tag) -> bool: """Match tag type for `nth` matches.""" return ( (self.get_tag(child) == self.get_tag(el)) and (self.get_tag_ns(child) == self.get_tag_ns(el)) ) def match_nth(self, el: bs4.Tag, nth: bs4.Tag) -> bool: """Match `nth` elements.""" matched = True for n in nth: matched = False if n.selectors and not self.match_selectors(el, n.selectors): break parent = self.get_parent(el) if parent is None: parent = self.create_fake_parent(el) last = n.last last_index = len(parent) - 1 index = last_index if last else 0 relative_index = 0 a = n.a b = n.b var = n.n count = 0 count_incr = 1 factor = -1 if last else 1 idx = last_idx = a * count + b if var else a # We can only adjust bounds within a variable index if var: # Abort if our nth index is out of bounds and only getting further out of bounds as we increment. # Otherwise, increment to try to get in bounds. adjust = None while idx < 1 or idx > last_index: if idx < 0: diff_low = 0 - idx if adjust is not None and adjust == 1: break adjust = -1 count += count_incr idx = last_idx = a * count + b if var else a diff = 0 - idx if diff >= diff_low: break else: diff_high = idx - last_index if adjust is not None and adjust == -1: break adjust = 1 count += count_incr idx = last_idx = a * count + b if var else a diff = idx - last_index if diff >= diff_high: break diff_high = diff # If a < 0, our count is working backwards, so floor the index by increasing the count. # Find the count that yields the lowest, in bound value and use that. # Lastly reverse count increment so that we'll increase our index. lowest = count if a < 0: while idx >= 1: lowest = count count += count_incr idx = last_idx = a * count + b if var else a count_incr = -1 count = lowest idx = last_idx = a * count + b if var else a # Evaluate elements while our calculated nth index is still in range while 1 <= idx <= last_index + 1: child = None # Evaluate while our child index is still in range. for child in self.get_children(parent, start=index, reverse=factor < 0, tags=False): index += factor if not self.is_tag(child): continue # Handle `of S` in `nth-child` if n.selectors and not self.match_selectors(child, n.selectors): continue # Handle `of-type` if n.of_type and not self.match_nth_tag_type(el, child): continue relative_index += 1 if relative_index == idx: if child is el: matched = True else: break if child is el: break if child is el: break last_idx = idx count += count_incr if count < 0: # Count is counting down and has now ventured into invalid territory. break idx = a * count + b if var else a if last_idx == idx: break if not matched: break return matched def match_empty(self, el: bs4.Tag) -> bool: """Check if element is empty (if requested).""" is_empty = True for child in self.get_children(el, tags=False): if self.is_tag(child): is_empty = False break elif self.is_content_string(child) and RE_NOT_EMPTY.search(child): is_empty = False break return is_empty def match_subselectors(self, el: bs4.Tag, selectors: tuple[ct.SelectorList, ...]) -> bool: """Match selectors.""" match = True for sel in selectors: if not self.match_selectors(el, sel): match = False return match def match_contains(self, el: bs4.Tag, contains: tuple[ct.SelectorContains, ...]) -> bool: """Match element if it contains text.""" match = True content = None # type: str | Sequence[str] | None for contain_list in contains: if content is None: if contain_list.own: content = self.get_own_text(el, no_iframe=self.is_html) else: content = self.get_text(el, no_iframe=self.is_html) found = False for text in contain_list.text: if contain_list.own: for c in content: if text in c: found = True break if found: break else: if text in content: found = True break if not found: match = False return match def match_default(self, el: bs4.Tag) -> bool: """Match default.""" match = False # Find this input's form form = None parent = self.get_parent(el, no_iframe=True) while parent and form is None: if self.get_tag(parent) == 'form' and self.is_html_tag(parent): form = parent else: parent = self.get_parent(parent, no_iframe=True) # Look in form cache to see if we've already located its default button found_form = False for f, t in self.cached_default_forms: if f is form: found_form = True if t is el: match = True break # We didn't have the form cached, so look for its default button if not found_form: for child in self.get_descendants(form, no_iframe=True): name = self.get_tag(child) # Can't do nested forms (haven't figured out why we never hit this) if name == 'form': # pragma: no cover break if name in ('input', 'button'): v = self.get_attribute_by_name(child, 'type', '') if v and util.lower(v) == 'submit': self.cached_default_forms.append((form, child)) if el is child: match = True break return match def match_indeterminate(self, el: bs4.Tag) -> bool: """Match default.""" match = False name = cast(str, self.get_attribute_by_name(el, 'name')) def get_parent_form(el: bs4.Tag) -> bs4.Tag | None: """Find this input's form.""" form = None parent = self.get_parent(el, no_iframe=True) while form is None: if self.get_tag(parent) == 'form' and self.is_html_tag(parent): form = parent break last_parent = parent parent = self.get_parent(parent, no_iframe=True) if parent is None: form = last_parent break return form form = get_parent_form(el) # Look in form cache to see if we've already evaluated that its fellow radio buttons are indeterminate found_form = False for f, n, i in self.cached_indeterminate_forms: if f is form and n == name: found_form = True if i is True: match = True break # We didn't have the form cached, so validate that the radio button is indeterminate if not found_form: checked = False for child in self.get_descendants(form, no_iframe=True): if child is el: continue tag_name = self.get_tag(child) if tag_name == 'input': is_radio = False check = False has_name = False for k, v in self.iter_attributes(child): if util.lower(k) == 'type' and util.lower(v) == 'radio': is_radio = True elif util.lower(k) == 'name' and v == name: has_name = True elif util.lower(k) == 'checked': check = True if is_radio and check and has_name and get_parent_form(child) is form: checked = True break if checked: break if not checked: match = True self.cached_indeterminate_forms.append((form, name, match)) return match def match_lang(self, el: bs4.Tag, langs: tuple[ct.SelectorLang, ...]) -> bool: """Match languages.""" match = False has_ns = self.supports_namespaces() root = self.root has_html_namespace = self.has_html_namespace # Walk parents looking for `lang` (HTML) or `xml:lang` XML property. parent = el found_lang = None last = None while not found_lang: has_html_ns = self.has_html_ns(parent) for k, v in self.iter_attributes(parent): attr_ns, attr = self.split_namespace(parent, k) if ( ((not has_ns or has_html_ns) and (util.lower(k) if not self.is_xml else k) == 'lang') or ( has_ns and not has_html_ns and attr_ns == NS_XML and (util.lower(attr) if not self.is_xml and attr is not None else attr) == 'lang' ) ): found_lang = v break last = parent parent = self.get_parent(parent, no_iframe=self.is_html) if parent is None: root = last has_html_namespace = self.has_html_ns(root) parent = last break # Use cached meta language. if found_lang is None and self.cached_meta_lang: for cache in self.cached_meta_lang: if root is cache[0]: found_lang = cache[1] # If we couldn't find a language, and the document is HTML, look to meta to determine language. if found_lang is None and (not self.is_xml or (has_html_namespace and root.name == 'html')): # Find head found = False for tag in ('html', 'head'): found = False for child in self.get_children(parent, no_iframe=self.is_html): if self.get_tag(child) == tag and self.is_html_tag(child): found = True parent = child break if not found: # pragma: no cover break # Search meta tags if found: for child in parent: if self.is_tag(child) and self.get_tag(child) == 'meta' and self.is_html_tag(parent): c_lang = False content = None for k, v in self.iter_attributes(child): if util.lower(k) == 'http-equiv' and util.lower(v) == 'content-language': c_lang = True if util.lower(k) == 'content': content = v if c_lang and content: found_lang = content self.cached_meta_lang.append((cast(str, root), cast(str, found_lang))) break if found_lang is not None: break if found_lang is None: self.cached_meta_lang.append((cast(str, root), '')) # If we determined a language, compare. if found_lang is not None: for patterns in langs: match = False for pattern in patterns: if self.extended_language_filter(pattern, cast(str, found_lang)): match = True if not match: break return match def match_dir(self, el: bs4.Tag, directionality: int) -> bool: """Check directionality.""" # If we have to match both left and right, we can't match either. if directionality & ct.SEL_DIR_LTR and directionality & ct.SEL_DIR_RTL: return False if el is None or not self.is_html_tag(el): return False # Element has defined direction of left to right or right to left direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(el, 'dir', '')), None) if direction not in (None, 0): return direction == directionality # Element is the document element (the root) and no direction assigned, assume left to right. is_root = self.is_root(el) if is_root and direction is None: return ct.SEL_DIR_LTR == directionality # If `input[type=telephone]` and no direction is assigned, assume left to right. name = self.get_tag(el) is_input = name == 'input' is_textarea = name == 'textarea' is_bdi = name == 'bdi' itype = util.lower(self.get_attribute_by_name(el, 'type', '')) if is_input else '' if is_input and itype == 'tel' and direction is None: return ct.SEL_DIR_LTR == directionality # Auto handling for text inputs if ((is_input and itype in ('text', 'search', 'tel', 'url', 'email')) or is_textarea) and direction == 0: if is_textarea: value = ''.join(node for node in self.get_contents(el, no_iframe=True) if self.is_content_string(node)) else: value = cast(str, self.get_attribute_by_name(el, 'value', '')) if value: for c in value: bidi = unicodedata.bidirectional(c) if bidi in ('AL', 'R', 'L'): direction = ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL return direction == directionality # Assume left to right return ct.SEL_DIR_LTR == directionality elif is_root: return ct.SEL_DIR_LTR == directionality return self.match_dir(self.get_parent(el, no_iframe=True), directionality) # Auto handling for `bdi` and other non text inputs. if (is_bdi and direction is None) or direction == 0: direction = self.find_bidi(el) if direction is not None: return direction == directionality elif is_root: return ct.SEL_DIR_LTR == directionality return self.match_dir(self.get_parent(el, no_iframe=True), directionality) # Match parents direction return self.match_dir(self.get_parent(el, no_iframe=True), directionality) def match_range(self, el: bs4.Tag, condition: int) -> bool: """ Match range. Behavior is modeled after what we see in browsers. Browsers seem to evaluate if the value is out of range, and if not, it is in range. So a missing value will not evaluate out of range; therefore, value is in range. Personally, I feel like this should evaluate as neither in or out of range. """ out_of_range = False itype = util.lower(self.get_attribute_by_name(el, 'type')) mn = Inputs.parse_value(itype, cast(str, self.get_attribute_by_name(el, 'min', None))) mx = Inputs.parse_value(itype, cast(str, self.get_attribute_by_name(el, 'max', None))) # There is no valid min or max, so we cannot evaluate a range if mn is None and mx is None: return False value = Inputs.parse_value(itype, cast(str, self.get_attribute_by_name(el, 'value', None))) if value is not None: if itype in ("date", "datetime-local", "month", "week", "number", "range"): if mn is not None and value < mn: out_of_range = True if not out_of_range and mx is not None and value > mx: out_of_range = True elif itype == "time": if mn is not None and mx is not None and mn > mx: # Time is periodic, so this is a reversed/discontinuous range if value < mn and value > mx: out_of_range = True else: if mn is not None and value < mn: out_of_range = True if not out_of_range and mx is not None and value > mx: out_of_range = True return not out_of_range if condition & ct.SEL_IN_RANGE else out_of_range def match_defined(self, el: bs4.Tag) -> bool: """ Match defined. `:defined` is related to custom elements in a browser. - If the document is XML (not XHTML), all tags will match. - Tags that are not custom (don't have a hyphen) are marked defined. - If the tag has a prefix (without or without a namespace), it will not match. This is of course requires the parser to provide us with the proper prefix and namespace info, if it doesn't, there is nothing we can do. """ name = self.get_tag(el) return ( name is not None and ( name.find('-') == -1 or name.find(':') != -1 or self.get_prefix(el) is not None ) ) def match_placeholder_shown(self, el: bs4.Tag) -> bool: """ Match placeholder shown according to HTML spec. - text area should be checked if they have content. A single newline does not count as content. """ match = False content = self.get_text(el) if content in ('', '\n'): match = True return match def match_selectors(self, el: bs4.Tag, selectors: ct.SelectorList) -> bool: """Check if element matches one of the selectors.""" match = False is_not = selectors.is_not is_html = selectors.is_html # Internal selector lists that use the HTML flag, will automatically get the `html` namespace. if is_html: namespaces = self.namespaces iframe_restrict = self.iframe_restrict self.namespaces = {'html': NS_XHTML} self.iframe_restrict = True if not is_html or self.is_html: for selector in selectors: match = is_not # We have a un-matchable situation (like `:focus` as you can focus an element in this environment) if isinstance(selector, ct.SelectorNull): continue # Verify tag matches if not self.match_tag(el, selector.tag): continue # Verify tag is defined if selector.flags & ct.SEL_DEFINED and not self.match_defined(el): continue # Verify element is root if selector.flags & ct.SEL_ROOT and not self.match_root(el): continue # Verify element is scope if selector.flags & ct.SEL_SCOPE and not self.match_scope(el): continue # Verify element has placeholder shown if selector.flags & ct.SEL_PLACEHOLDER_SHOWN and not self.match_placeholder_shown(el): continue # Verify `nth` matches if not self.match_nth(el, selector.nth): continue if selector.flags & ct.SEL_EMPTY and not self.match_empty(el): continue # Verify id matches if selector.ids and not self.match_id(el, selector.ids): continue # Verify classes match if selector.classes and not self.match_classes(el, selector.classes): continue # Verify attribute(s) match if not self.match_attributes(el, selector.attributes): continue # Verify ranges if selector.flags & RANGES and not self.match_range(el, selector.flags & RANGES): continue # Verify language patterns if selector.lang and not self.match_lang(el, selector.lang): continue # Verify pseudo selector patterns if selector.selectors and not self.match_subselectors(el, selector.selectors): continue # Verify relationship selectors if selector.relation and not self.match_relations(el, selector.relation): continue # Validate that the current default selector match corresponds to the first submit button in the form if selector.flags & ct.SEL_DEFAULT and not self.match_default(el): continue # Validate that the unset radio button is among radio buttons with the same name in a form that are # also not set. if selector.flags & ct.SEL_INDETERMINATE and not self.match_indeterminate(el): continue # Validate element directionality if selector.flags & DIR_FLAGS and not self.match_dir(el, selector.flags & DIR_FLAGS): continue # Validate that the tag contains the specified text. if selector.contains and not self.match_contains(el, selector.contains): continue match = not is_not break # Restore actual namespaces being used for external selector lists if is_html: self.namespaces = namespaces self.iframe_restrict = iframe_restrict return match def select(self, limit: int = 0) -> Iterator[bs4.Tag]: """Match all tags under the targeted tag.""" lim = None if limit < 1 else limit for child in self.get_descendants(self.tag): if self.match(child): yield child if lim is not None: lim -= 1 if lim < 1: break def closest(self) -> bs4.Tag | None: """Match closest ancestor.""" current = self.tag closest = None while closest is None and current is not None: if self.match(current): closest = current else: current = self.get_parent(current) return closest def filter(self) -> list[bs4.Tag]: # noqa A001 """Filter tag's children.""" return [tag for tag in self.get_contents(self.tag) if not self.is_navigable_string(tag) and self.match(tag)] def match(self, el: bs4.Tag) -> bool: """Match.""" return not self.is_doc(el) and self.is_tag(el) and self.match_selectors(el, self.selectors) class SoupSieve(ct.Immutable): """Compiled Soup Sieve selector matching object.""" pattern: str selectors: ct.SelectorList namespaces: ct.Namespaces | None custom: dict[str, str] flags: int __slots__ = ("pattern", "selectors", "namespaces", "custom", "flags", "_hash") def __init__( self, pattern: str, selectors: ct.SelectorList, namespaces: ct.Namespaces | None, custom: ct.CustomSelectors | None, flags: int ): """Initialize.""" super().__init__( pattern=pattern, selectors=selectors, namespaces=namespaces, custom=custom, flags=flags ) def match(self, tag: bs4.Tag) -> bool: """Match.""" return CSSMatch(self.selectors, tag, self.namespaces, self.flags).match(tag) def closest(self, tag: bs4.Tag) -> bs4.Tag: """Match closest ancestor.""" return CSSMatch(self.selectors, tag, self.namespaces, self.flags).closest() def filter(self, iterable: Iterable[bs4.Tag]) -> list[bs4.Tag]: # noqa A001 """ Filter. `CSSMatch` can cache certain searches for tags of the same document, so if we are given a tag, all tags are from the same document, and we can take advantage of the optimization. Any other kind of iterable could have tags from different documents or detached tags, so for those, we use a new `CSSMatch` for each item in the iterable. """ if CSSMatch.is_tag(iterable): return CSSMatch(self.selectors, iterable, self.namespaces, self.flags).filter() else: return [node for node in iterable if not CSSMatch.is_navigable_string(node) and self.match(node)] def select_one(self, tag: bs4.Tag) -> bs4.Tag: """Select a single tag.""" tags = self.select(tag, limit=1) return tags[0] if tags else None def select(self, tag: bs4.Tag, limit: int = 0) -> list[bs4.Tag]: """Select the specified tags.""" return list(self.iselect(tag, limit)) def iselect(self, tag: bs4.Tag, limit: int = 0) -> Iterator[bs4.Tag]: """Iterate the specified tags.""" yield from CSSMatch(self.selectors, tag, self.namespaces, self.flags).select(limit) def __repr__(self) -> str: # pragma: no cover """Representation.""" return ( f"SoupSieve(pattern={self.pattern!r}, namespaces={self.namespaces!r}, " f"custom={self.custom!r}, flags={self.flags!r})" ) __str__ = __repr__ ct.pickle_register(SoupSieve) PK]ZZZ*&�Q�Q�soupsieve/css_parser.py"""CSS selector parser.""" from __future__ import annotations import re from functools import lru_cache from . import util from . import css_match as cm from . import css_types as ct from .util import SelectorSyntaxError import warnings from typing import Match, Any, Iterator, cast UNICODE_REPLACEMENT_CHAR = 0xFFFD # Simple pseudo classes that take no parameters PSEUDO_SIMPLE = { ":any-link", ":empty", ":first-child", ":first-of-type", ":in-range", ":out-of-range", ":last-child", ":last-of-type", ":link", ":only-child", ":only-of-type", ":root", ':checked', ':default', ':disabled', ':enabled', ':indeterminate', ':optional', ':placeholder-shown', ':read-only', ':read-write', ':required', ':scope', ':defined' } # Supported, simple pseudo classes that match nothing in the Soup Sieve environment PSEUDO_SIMPLE_NO_MATCH = { ':active', ':current', ':focus', ':focus-visible', ':focus-within', ':future', ':host', ':hover', ':local-link', ':past', ':paused', ':playing', ':target', ':target-within', ':user-invalid', ':visited' } # Complex pseudo classes that take selector lists PSEUDO_COMPLEX = { ':contains', ':-soup-contains', ':-soup-contains-own', ':has', ':is', ':matches', ':not', ':where' } PSEUDO_COMPLEX_NO_MATCH = { ':current', ':host', ':host-context' } # Complex pseudo classes that take very specific parameters and are handled special PSEUDO_SPECIAL = { ':dir', ':lang', ':nth-child', ':nth-last-child', ':nth-last-of-type', ':nth-of-type' } PSEUDO_SUPPORTED = PSEUDO_SIMPLE | PSEUDO_SIMPLE_NO_MATCH | PSEUDO_COMPLEX | PSEUDO_COMPLEX_NO_MATCH | PSEUDO_SPECIAL # Sub-patterns parts # Whitespace NEWLINE = r'(?:\r\n|(?!\r\n)[\n\f\r])' WS = fr'(?:[ \t]|{NEWLINE})' # Comments COMMENTS = r'(?:/\*[^*]*\*+(?:[^/*][^*]*\*+)*/)' # Whitespace with comments included WSC = fr'(?:{WS}|{COMMENTS})' # CSS escapes CSS_ESCAPES = fr'(?:\\(?:[a-f0-9]{{1,6}}{WS}?|[^\r\n\f]|$))' CSS_STRING_ESCAPES = fr'(?:\\(?:[a-f0-9]{{1,6}}{WS}?|[^\r\n\f]|$|{NEWLINE}))' # CSS Identifier IDENTIFIER = fr''' (?:(?:-?(?:[^\x00-\x2f\x30-\x40\x5B-\x5E\x60\x7B-\x9f]|{CSS_ESCAPES})+|--) (?:[^\x00-\x2c\x2e\x2f\x3A-\x40\x5B-\x5E\x60\x7B-\x9f]|{CSS_ESCAPES})*) ''' # `nth` content NTH = fr'(?:[-+])?(?:[0-9]+n?|n)(?:(?<=n){WSC}*(?:[-+]){WSC}*(?:[0-9]+))?' # Value: quoted string or identifier VALUE = fr'''(?:"(?:\\(?:.|{NEWLINE})|[^\\"\r\n\f]+)*?"|'(?:\\(?:.|{NEWLINE})|[^\\'\r\n\f]+)*?'|{IDENTIFIER}+)''' # Attribute value comparison. `!=` is handled special as it is non-standard. ATTR = fr'(?:{WSC}*(?P<cmp>[!~^|*$]?=){WSC}*(?P<value>{VALUE})(?:{WSC}*(?P<case>[is]))?)?{WSC}*\]' # Selector patterns # IDs (`#id`) PAT_ID = fr'\#{IDENTIFIER}' # Classes (`.class`) PAT_CLASS = fr'\.{IDENTIFIER}' # Prefix:Tag (`prefix|tag`) PAT_TAG = fr'(?P<tag_ns>(?:{IDENTIFIER}|\*)?\|)?(?P<tag_name>{IDENTIFIER}|\*)' # Attributes (`[attr]`, `[attr=value]`, etc.) PAT_ATTR = fr'\[{WSC}*(?P<attr_ns>(?:{IDENTIFIER}|\*)?\|)?(?P<attr_name>{IDENTIFIER}){ATTR}' # Pseudo class (`:pseudo-class`, `:pseudo-class(`) PAT_PSEUDO_CLASS = fr'(?P<name>:{IDENTIFIER})(?P<open>${WSC}*)?' # Pseudo class special patterns. Matches `:pseudo-class(` for special case pseudo classes. PAT_PSEUDO_CLASS_SPECIAL = fr'(?P<name>:{IDENTIFIER})(?P<open>\({WSC}*)' # Custom pseudo class (`:--custom-pseudo`) PAT_PSEUDO_CLASS_CUSTOM = fr'(?P<name>:(?=--){IDENTIFIER})' # Closing pseudo group (`)`) PAT_PSEUDO_CLOSE = fr'{WSC}*$' # Pseudo element (`::pseudo-element`) PAT_PSEUDO_ELEMENT = fr':{PAT_PSEUDO_CLASS}' # At rule (`@page`, etc.) (not supported) PAT_AT_RULE = fr'@P{IDENTIFIER}' # Pseudo class `nth-child` (`:nth-child(an+b [of S]?)`, `:first-child`, etc.) PAT_PSEUDO_NTH_CHILD = fr''' (?P<pseudo_nth_child>{PAT_PSEUDO_CLASS_SPECIAL} (?P<nth_child>{NTH}|even|odd))(?:{WSC}*\)|(?P<of>{COMMENTS}*{WS}{WSC}*of{COMMENTS}*{WS}{WSC}*)) ''' # Pseudo class `nth-of-type` (`:nth-of-type(an+b)`, `:first-of-type`, etc.) PAT_PSEUDO_NTH_TYPE = fr''' (?P<pseudo_nth_type>{PAT_PSEUDO_CLASS_SPECIAL} (?P<nth_type>{NTH}|even|odd)){WSC}*\) ''' # Pseudo class language (`:lang("*-de", en)`) PAT_PSEUDO_LANG = fr'{PAT_PSEUDO_CLASS_SPECIAL}(?P<values>{VALUE}(?:{WSC}*,{WSC}*{VALUE})*){WSC}*\)' # Pseudo class direction (`:dir(ltr)`) PAT_PSEUDO_DIR = fr'{PAT_PSEUDO_CLASS_SPECIAL}(?P<dir>ltr|rtl){WSC}*\)' # Combining characters (`>`, `~`, ` `, `+`, `,`) PAT_COMBINE = fr'{WSC}*?(?P<relation>[,+>~]|{WS}(?![,+>~])){WSC}*' # Extra: Contains (`:contains(text)`) PAT_PSEUDO_CONTAINS = fr'{PAT_PSEUDO_CLASS_SPECIAL}(?P<values>{VALUE}(?:{WSC}*,{WSC}*{VALUE})*){WSC}*\)' # Regular expressions # CSS escape pattern RE_CSS_ESC = re.compile(fr'(?:(\\[a-f0-9]{{1,6}}{WSC}?)|(\\[^\r\n\f])|(\\$))', re.I) RE_CSS_STR_ESC = re.compile(fr'(?:(\\[a-f0-9]{{1,6}}{WS}?)|(\\[^\r\n\f])|(\\$)|(\\{NEWLINE}))', re.I) # Pattern to break up `nth` specifiers RE_NTH = re.compile(fr'(?P<s1>[-+])?(?P<a>[0-9]+n?|n)(?:(?<=n){WSC}*(?P<s2>[-+]){WSC}*(?P<b>[0-9]+))?', re.I) # Pattern to iterate multiple values. RE_VALUES = re.compile(fr'(?:(?P<value>{VALUE})|(?P<split>{WSC}*,{WSC}*))', re.X) # Whitespace checks RE_WS = re.compile(WS) RE_WS_BEGIN = re.compile(fr'^{WSC}*') RE_WS_END = re.compile(fr'{WSC}*$') RE_CUSTOM = re.compile(fr'^{PAT_PSEUDO_CLASS_CUSTOM}$', re.X) # Constants # List split token COMMA_COMBINATOR = ',' # Relation token for descendant WS_COMBINATOR = " " # Parse flags FLG_PSEUDO = 0x01 FLG_NOT = 0x02 FLG_RELATIVE = 0x04 FLG_DEFAULT = 0x08 FLG_HTML = 0x10 FLG_INDETERMINATE = 0x20 FLG_OPEN = 0x40 FLG_IN_RANGE = 0x80 FLG_OUT_OF_RANGE = 0x100 FLG_PLACEHOLDER_SHOWN = 0x200 FLG_FORGIVE = 0x400 # Maximum cached patterns to store _MAXCACHE = 500 @lru_cache(maxsize=_MAXCACHE) def _cached_css_compile( pattern: str, namespaces: ct.Namespaces | None, custom: ct.CustomSelectors | None, flags: int ) -> cm.SoupSieve: """Cached CSS compile.""" custom_selectors = process_custom(custom) return cm.SoupSieve( pattern, CSSParser( pattern, custom=custom_selectors, flags=flags ).process_selectors(), namespaces, custom, flags ) def _purge_cache() -> None: """Purge the cache.""" _cached_css_compile.cache_clear() def process_custom(custom: ct.CustomSelectors | None) -> dict[str, str | ct.SelectorList]: """Process custom.""" custom_selectors = {} if custom is not None: for key, value in custom.items(): name = util.lower(key) if RE_CUSTOM.match(name) is None: raise SelectorSyntaxError(f"The name '{name}' is not a valid custom pseudo-class name") if name in custom_selectors: raise KeyError(f"The custom selector '{name}' has already been registered") custom_selectors[css_unescape(name)] = value return custom_selectors def css_unescape(content: str, string: bool = False) -> str: """ Unescape CSS value. Strings allow for spanning the value on multiple strings by escaping a new line. """ def replace(m: Match[str]) -> str: """Replace with the appropriate substitute.""" if m.group(1): codepoint = int(m.group(1)[1:], 16) if codepoint == 0: codepoint = UNICODE_REPLACEMENT_CHAR value = chr(codepoint) elif m.group(2): value = m.group(2)[1:] elif m.group(3): value = '\ufffd' else: value = '' return value return (RE_CSS_ESC if not string else RE_CSS_STR_ESC).sub(replace, content) def escape(ident: str) -> str: """Escape identifier.""" string = [] length = len(ident) start_dash = length > 0 and ident[0] == '-' if length == 1 and start_dash: # Need to escape identifier that is a single `-` with no other characters string.append(f'\\{ident}') else: for index, c in enumerate(ident): codepoint = ord(c) if codepoint == 0x00: string.append('\ufffd') elif (0x01 <= codepoint <= 0x1F) or codepoint == 0x7F: string.append(f'\\{codepoint:x} ') elif (index == 0 or (start_dash and index == 1)) and (0x30 <= codepoint <= 0x39): string.append(f'\\{codepoint:x} ') elif ( codepoint in (0x2D, 0x5F) or codepoint >= 0x80 or (0x30 <= codepoint <= 0x39) or (0x30 <= codepoint <= 0x39) or (0x41 <= codepoint <= 0x5A) or (0x61 <= codepoint <= 0x7A) ): string.append(c) else: string.append(f'\\{c}') return ''.join(string) class SelectorPattern: """Selector pattern.""" def __init__(self, name: str, pattern: str) -> None: """Initialize.""" self.name = name self.re_pattern = re.compile(pattern, re.I | re.X | re.U) def get_name(self) -> str: """Get name.""" return self.name def match(self, selector: str, index: int, flags: int) -> Match[str] | None: """Match the selector.""" return self.re_pattern.match(selector, index) class SpecialPseudoPattern(SelectorPattern): """Selector pattern.""" def __init__(self, patterns: tuple[tuple[str, tuple[str, ...], str, type[SelectorPattern]], ...]) -> None: """Initialize.""" self.patterns = {} for p in patterns: name = p[0] pattern = p[3](name, p[2]) for pseudo in p[1]: self.patterns[pseudo] = pattern self.matched_name = None # type: SelectorPattern | None self.re_pseudo_name = re.compile(PAT_PSEUDO_CLASS_SPECIAL, re.I | re.X | re.U) def get_name(self) -> str: """Get name.""" return '' if self.matched_name is None else self.matched_name.get_name() def match(self, selector: str, index: int, flags: int) -> Match[str] | None: """Match the selector.""" pseudo = None m = self.re_pseudo_name.match(selector, index) if m: name = util.lower(css_unescape(m.group('name'))) pattern = self.patterns.get(name) if pattern: pseudo = pattern.match(selector, index, flags) if pseudo: self.matched_name = pattern return pseudo class _Selector: """ Intermediate selector class. This stores selector data for a compound selector as we are acquiring them. Once we are done collecting the data for a compound selector, we freeze the data in an object that can be pickled and hashed. """ def __init__(self, **kwargs: Any) -> None: """Initialize.""" self.tag = kwargs.get('tag', None) # type: ct.SelectorTag | None self.ids = kwargs.get('ids', []) # type: list[str] self.classes = kwargs.get('classes', []) # type: list[str] self.attributes = kwargs.get('attributes', []) # type: list[ct.SelectorAttribute] self.nth = kwargs.get('nth', []) # type: list[ct.SelectorNth] self.selectors = kwargs.get('selectors', []) # type: list[ct.SelectorList] self.relations = kwargs.get('relations', []) # type: list[_Selector] self.rel_type = kwargs.get('rel_type', None) # type: str | None self.contains = kwargs.get('contains', []) # type: list[ct.SelectorContains] self.lang = kwargs.get('lang', []) # type: list[ct.SelectorLang] self.flags = kwargs.get('flags', 0) # type: int self.no_match = kwargs.get('no_match', False) # type: bool def _freeze_relations(self, relations: list[_Selector]) -> ct.SelectorList: """Freeze relation.""" if relations: sel = relations[0] sel.relations.extend(relations[1:]) return ct.SelectorList([sel.freeze()]) else: return ct.SelectorList() def freeze(self) -> ct.Selector | ct.SelectorNull: """Freeze self.""" if self.no_match: return ct.SelectorNull() else: return ct.Selector( self.tag, tuple(self.ids), tuple(self.classes), tuple(self.attributes), tuple(self.nth), tuple(self.selectors), self._freeze_relations(self.relations), self.rel_type, tuple(self.contains), tuple(self.lang), self.flags ) def __str__(self) -> str: # pragma: no cover """String representation.""" return ( f'_Selector(tag={self.tag!r}, ids={self.ids!r}, classes={self.classes!r}, attributes={self.attributes!r}, ' f'nth={self.nth!r}, selectors={self.selectors!r}, relations={self.relations!r}, ' f'rel_type={self.rel_type!r}, contains={self.contains!r}, lang={self.lang!r}, flags={self.flags!r}, ' f'no_match={self.no_match!r})' ) __repr__ = __str__ class CSSParser: """Parse CSS selectors.""" css_tokens = ( SelectorPattern("pseudo_close", PAT_PSEUDO_CLOSE), SpecialPseudoPattern( ( ( "pseudo_contains", (':contains', ':-soup-contains', ':-soup-contains-own'), PAT_PSEUDO_CONTAINS, SelectorPattern ), ("pseudo_nth_child", (':nth-child', ':nth-last-child'), PAT_PSEUDO_NTH_CHILD, SelectorPattern), ("pseudo_nth_type", (':nth-of-type', ':nth-last-of-type'), PAT_PSEUDO_NTH_TYPE, SelectorPattern), ("pseudo_lang", (':lang',), PAT_PSEUDO_LANG, SelectorPattern), ("pseudo_dir", (':dir',), PAT_PSEUDO_DIR, SelectorPattern) ) ), SelectorPattern("pseudo_class_custom", PAT_PSEUDO_CLASS_CUSTOM), SelectorPattern("pseudo_class", PAT_PSEUDO_CLASS), SelectorPattern("pseudo_element", PAT_PSEUDO_ELEMENT), SelectorPattern("at_rule", PAT_AT_RULE), SelectorPattern("id", PAT_ID), SelectorPattern("class", PAT_CLASS), SelectorPattern("tag", PAT_TAG), SelectorPattern("attribute", PAT_ATTR), SelectorPattern("combine", PAT_COMBINE) ) def __init__( self, selector: str, custom: dict[str, str | ct.SelectorList] | None = None, flags: int = 0 ) -> None: """Initialize.""" self.pattern = selector.replace('\x00', '\ufffd') self.flags = flags self.debug = self.flags & util.DEBUG self.custom = {} if custom is None else custom def parse_attribute_selector(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool: """Create attribute selector from the returned regex match.""" inverse = False op = m.group('cmp') case = util.lower(m.group('case')) if m.group('case') else None ns = css_unescape(m.group('attr_ns')[:-1]) if m.group('attr_ns') else '' attr = css_unescape(m.group('attr_name')) is_type = False pattern2 = None value = '' if case: flags = (re.I if case == 'i' else 0) | re.DOTALL elif util.lower(attr) == 'type': flags = re.I | re.DOTALL is_type = True else: flags = re.DOTALL if op: if m.group('value').startswith(('"', "'")): value = css_unescape(m.group('value')[1:-1], True) else: value = css_unescape(m.group('value')) if not op: # Attribute name pattern = None elif op.startswith('^'): # Value start with pattern = re.compile(r'^%s.*' % re.escape(value), flags) elif op.startswith('$'): # Value ends with pattern = re.compile(r'.*?%s$' % re.escape(value), flags) elif op.startswith('*'): # Value contains pattern = re.compile(r'.*?%s.*' % re.escape(value), flags) elif op.startswith('~'): # Value contains word within space separated list # `~=` should match nothing if it is empty or contains whitespace, # so if either of these cases is present, use `[^\s\S]` which cannot be matched. value = r'[^\s\S]' if not value or RE_WS.search(value) else re.escape(value) pattern = re.compile(r'.*?(?:(?<=^)|(?<=[ \t\r\n\f]))%s(?=(?:[ \t\r\n\f]|$)).*' % value, flags) elif op.startswith('|'): # Value starts with word in dash separated list pattern = re.compile(r'^%s(?:-.*)?$' % re.escape(value), flags) else: # Value matches pattern = re.compile(r'^%s$' % re.escape(value), flags) if op.startswith('!'): # Equivalent to `:not([attr=value])` inverse = True if is_type and pattern: pattern2 = re.compile(pattern.pattern) # Append the attribute selector sel_attr = ct.SelectorAttribute(attr, ns, pattern, pattern2) if inverse: # If we are using `!=`, we need to nest the pattern under a `:not()`. sub_sel = _Selector() sub_sel.attributes.append(sel_attr) not_list = ct.SelectorList([sub_sel.freeze()], True, False) sel.selectors.append(not_list) else: sel.attributes.append(sel_attr) has_selector = True return has_selector def parse_tag_pattern(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool: """Parse tag pattern from regex match.""" prefix = css_unescape(m.group('tag_ns')[:-1]) if m.group('tag_ns') else None tag = css_unescape(m.group('tag_name')) sel.tag = ct.SelectorTag(tag, prefix) has_selector = True return has_selector def parse_pseudo_class_custom(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool: """ Parse custom pseudo class alias. Compile custom selectors as we need them. When compiling a custom selector, set it to `None` in the dictionary so we can avoid an infinite loop. """ pseudo = util.lower(css_unescape(m.group('name'))) selector = self.custom.get(pseudo) if selector is None: raise SelectorSyntaxError( f"Undefined custom selector '{pseudo}' found at position {m.end(0)}", self.pattern, m.end(0) ) if not isinstance(selector, ct.SelectorList): del self.custom[pseudo] selector = CSSParser( selector, custom=self.custom, flags=self.flags ).process_selectors(flags=FLG_PSEUDO) self.custom[pseudo] = selector sel.selectors.append(selector) has_selector = True return has_selector def parse_pseudo_class( self, sel: _Selector, m: Match[str], has_selector: bool, iselector: Iterator[tuple[str, Match[str]]], is_html: bool ) -> tuple[bool, bool]: """Parse pseudo class.""" complex_pseudo = False pseudo = util.lower(css_unescape(m.group('name'))) if m.group('open'): complex_pseudo = True if complex_pseudo and pseudo in PSEUDO_COMPLEX: has_selector = self.parse_pseudo_open(sel, pseudo, has_selector, iselector, m.end(0)) elif not complex_pseudo and pseudo in PSEUDO_SIMPLE: if pseudo == ':root': sel.flags |= ct.SEL_ROOT elif pseudo == ':defined': sel.flags |= ct.SEL_DEFINED is_html = True elif pseudo == ':scope': sel.flags |= ct.SEL_SCOPE elif pseudo == ':empty': sel.flags |= ct.SEL_EMPTY elif pseudo in (':link', ':any-link'): sel.selectors.append(CSS_LINK) elif pseudo == ':checked': sel.selectors.append(CSS_CHECKED) elif pseudo == ':default': sel.selectors.append(CSS_DEFAULT) elif pseudo == ':indeterminate': sel.selectors.append(CSS_INDETERMINATE) elif pseudo == ":disabled": sel.selectors.append(CSS_DISABLED) elif pseudo == ":enabled": sel.selectors.append(CSS_ENABLED) elif pseudo == ":required": sel.selectors.append(CSS_REQUIRED) elif pseudo == ":optional": sel.selectors.append(CSS_OPTIONAL) elif pseudo == ":read-only": sel.selectors.append(CSS_READ_ONLY) elif pseudo == ":read-write": sel.selectors.append(CSS_READ_WRITE) elif pseudo == ":in-range": sel.selectors.append(CSS_IN_RANGE) elif pseudo == ":out-of-range": sel.selectors.append(CSS_OUT_OF_RANGE) elif pseudo == ":placeholder-shown": sel.selectors.append(CSS_PLACEHOLDER_SHOWN) elif pseudo == ':first-child': sel.nth.append(ct.SelectorNth(1, False, 0, False, False, ct.SelectorList())) elif pseudo == ':last-child': sel.nth.append(ct.SelectorNth(1, False, 0, False, True, ct.SelectorList())) elif pseudo == ':first-of-type': sel.nth.append(ct.SelectorNth(1, False, 0, True, False, ct.SelectorList())) elif pseudo == ':last-of-type': sel.nth.append(ct.SelectorNth(1, False, 0, True, True, ct.SelectorList())) elif pseudo == ':only-child': sel.nth.extend( [ ct.SelectorNth(1, False, 0, False, False, ct.SelectorList()), ct.SelectorNth(1, False, 0, False, True, ct.SelectorList()) ] ) elif pseudo == ':only-of-type': sel.nth.extend( [ ct.SelectorNth(1, False, 0, True, False, ct.SelectorList()), ct.SelectorNth(1, False, 0, True, True, ct.SelectorList()) ] ) has_selector = True elif complex_pseudo and pseudo in PSEUDO_COMPLEX_NO_MATCH: self.parse_selectors(iselector, m.end(0), FLG_PSEUDO | FLG_OPEN) sel.no_match = True has_selector = True elif not complex_pseudo and pseudo in PSEUDO_SIMPLE_NO_MATCH: sel.no_match = True has_selector = True elif pseudo in PSEUDO_SUPPORTED: raise SelectorSyntaxError( f"Invalid syntax for pseudo class '{pseudo}'", self.pattern, m.start(0) ) else: raise NotImplementedError( f"'{pseudo}' pseudo-class is not implemented at this time" ) return has_selector, is_html def parse_pseudo_nth( self, sel: _Selector, m: Match[str], has_selector: bool, iselector: Iterator[tuple[str, Match[str]]] ) -> bool: """Parse `nth` pseudo.""" mdict = m.groupdict() if mdict.get('pseudo_nth_child'): postfix = '_child' else: postfix = '_type' mdict['name'] = util.lower(css_unescape(mdict['name'])) content = util.lower(mdict.get('nth' + postfix)) if content == 'even': # 2n s1 = 2 s2 = 0 var = True elif content == 'odd': # 2n+1 s1 = 2 s2 = 1 var = True else: nth_parts = cast(Match[str], RE_NTH.match(content)) _s1 = '-' if nth_parts.group('s1') and nth_parts.group('s1') == '-' else '' a = nth_parts.group('a') var = a.endswith('n') if a.startswith('n'): _s1 += '1' elif var: _s1 += a[:-1] else: _s1 += a _s2 = '-' if nth_parts.group('s2') and nth_parts.group('s2') == '-' else '' if nth_parts.group('b'): _s2 += nth_parts.group('b') else: _s2 = '0' s1 = int(_s1, 10) s2 = int(_s2, 10) pseudo_sel = mdict['name'] if postfix == '_child': if m.group('of'): # Parse the rest of `of S`. nth_sel = self.parse_selectors(iselector, m.end(0), FLG_PSEUDO | FLG_OPEN) else: # Use default `*|*` for `of S`. nth_sel = CSS_NTH_OF_S_DEFAULT if pseudo_sel == ':nth-child': sel.nth.append(ct.SelectorNth(s1, var, s2, False, False, nth_sel)) elif pseudo_sel == ':nth-last-child': sel.nth.append(ct.SelectorNth(s1, var, s2, False, True, nth_sel)) else: if pseudo_sel == ':nth-of-type': sel.nth.append(ct.SelectorNth(s1, var, s2, True, False, ct.SelectorList())) elif pseudo_sel == ':nth-last-of-type': sel.nth.append(ct.SelectorNth(s1, var, s2, True, True, ct.SelectorList())) has_selector = True return has_selector def parse_pseudo_open( self, sel: _Selector, name: str, has_selector: bool, iselector: Iterator[tuple[str, Match[str]]], index: int ) -> bool: """Parse pseudo with opening bracket.""" flags = FLG_PSEUDO | FLG_OPEN if name == ':not': flags |= FLG_NOT elif name == ':has': flags |= FLG_RELATIVE elif name in (':where', ':is'): flags |= FLG_FORGIVE sel.selectors.append(self.parse_selectors(iselector, index, flags)) has_selector = True return has_selector def parse_has_combinator( self, sel: _Selector, m: Match[str], has_selector: bool, selectors: list[_Selector], rel_type: str, index: int ) -> tuple[bool, _Selector, str]: """Parse combinator tokens.""" combinator = m.group('relation').strip() if not combinator: combinator = WS_COMBINATOR if combinator == COMMA_COMBINATOR: sel.rel_type = rel_type selectors[-1].relations.append(sel) rel_type = ":" + WS_COMBINATOR selectors.append(_Selector()) else: if has_selector: # End the current selector and associate the leading combinator with this selector. sel.rel_type = rel_type selectors[-1].relations.append(sel) elif rel_type[1:] != WS_COMBINATOR: # It's impossible to have two whitespace combinators after each other as the patterns # will gobble up trailing whitespace. It is also impossible to have a whitespace # combinator after any other kind for the same reason. But we could have # multiple non-whitespace combinators. So if the current combinator is not a whitespace, # then we've hit the multiple combinator case, so we should fail. raise SelectorSyntaxError( f'The multiple combinators at position {index}', self.pattern, index ) # Set the leading combinator for the next selector. rel_type = ':' + combinator sel = _Selector() has_selector = False return has_selector, sel, rel_type def parse_combinator( self, sel: _Selector, m: Match[str], has_selector: bool, selectors: list[_Selector], relations: list[_Selector], is_pseudo: bool, is_forgive: bool, index: int ) -> tuple[bool, _Selector]: """Parse combinator tokens.""" combinator = m.group('relation').strip() if not combinator: combinator = WS_COMBINATOR if not has_selector: if not is_forgive or combinator != COMMA_COMBINATOR: raise SelectorSyntaxError( f"The combinator '{combinator}' at position {index}, must have a selector before it", self.pattern, index ) # If we are in a forgiving pseudo class, just make the selector a "no match" if combinator == COMMA_COMBINATOR: sel.no_match = True del relations[:] selectors.append(sel) else: if combinator == COMMA_COMBINATOR: if not sel.tag and not is_pseudo: # Implied `*` sel.tag = ct.SelectorTag('*', None) sel.relations.extend(relations) selectors.append(sel) del relations[:] else: sel.relations.extend(relations) sel.rel_type = combinator del relations[:] relations.append(sel) sel = _Selector() has_selector = False return has_selector, sel def parse_class_id(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool: """Parse HTML classes and ids.""" selector = m.group(0) if selector.startswith('.'): sel.classes.append(css_unescape(selector[1:])) else: sel.ids.append(css_unescape(selector[1:])) has_selector = True return has_selector def parse_pseudo_contains(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool: """Parse contains.""" pseudo = util.lower(css_unescape(m.group('name'))) if pseudo == ":contains": warnings.warn( # noqa: B028 "The pseudo class ':contains' is deprecated, ':-soup-contains' should be used moving forward.", FutureWarning ) contains_own = pseudo == ":-soup-contains-own" values = css_unescape(m.group('values')) patterns = [] for token in RE_VALUES.finditer(values): if token.group('split'): continue value = token.group('value') if value.startswith(("'", '"')): value = css_unescape(value[1:-1], True) else: value = css_unescape(value) patterns.append(value) sel.contains.append(ct.SelectorContains(patterns, contains_own)) has_selector = True return has_selector def parse_pseudo_lang(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool: """Parse pseudo language.""" values = m.group('values') patterns = [] for token in RE_VALUES.finditer(values): if token.group('split'): continue value = token.group('value') if value.startswith(('"', "'")): value = css_unescape(value[1:-1], True) else: value = css_unescape(value) patterns.append(value) sel.lang.append(ct.SelectorLang(patterns)) has_selector = True return has_selector def parse_pseudo_dir(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool: """Parse pseudo direction.""" value = ct.SEL_DIR_LTR if util.lower(m.group('dir')) == 'ltr' else ct.SEL_DIR_RTL sel.flags |= value has_selector = True return has_selector def parse_selectors( self, iselector: Iterator[tuple[str, Match[str]]], index: int = 0, flags: int = 0 ) -> ct.SelectorList: """Parse selectors.""" # Initialize important variables sel = _Selector() selectors = [] has_selector = False closed = False relations = [] # type: list[_Selector] rel_type = ":" + WS_COMBINATOR # Setup various flags is_open = bool(flags & FLG_OPEN) is_pseudo = bool(flags & FLG_PSEUDO) is_relative = bool(flags & FLG_RELATIVE) is_not = bool(flags & FLG_NOT) is_html = bool(flags & FLG_HTML) is_default = bool(flags & FLG_DEFAULT) is_indeterminate = bool(flags & FLG_INDETERMINATE) is_in_range = bool(flags & FLG_IN_RANGE) is_out_of_range = bool(flags & FLG_OUT_OF_RANGE) is_placeholder_shown = bool(flags & FLG_PLACEHOLDER_SHOWN) is_forgive = bool(flags & FLG_FORGIVE) # Print out useful debug stuff if self.debug: # pragma: no cover if is_pseudo: print(' is_pseudo: True') if is_open: print(' is_open: True') if is_relative: print(' is_relative: True') if is_not: print(' is_not: True') if is_html: print(' is_html: True') if is_default: print(' is_default: True') if is_indeterminate: print(' is_indeterminate: True') if is_in_range: print(' is_in_range: True') if is_out_of_range: print(' is_out_of_range: True') if is_placeholder_shown: print(' is_placeholder_shown: True') if is_forgive: print(' is_forgive: True') # The algorithm for relative selectors require an initial selector in the selector list if is_relative: selectors.append(_Selector()) try: while True: key, m = next(iselector) # Handle parts if key == "at_rule": raise NotImplementedError(f"At-rules found at position {m.start(0)}") elif key == 'pseudo_class_custom': has_selector = self.parse_pseudo_class_custom(sel, m, has_selector) elif key == 'pseudo_class': has_selector, is_html = self.parse_pseudo_class(sel, m, has_selector, iselector, is_html) elif key == 'pseudo_element': raise NotImplementedError(f"Pseudo-element found at position {m.start(0)}") elif key == 'pseudo_contains': has_selector = self.parse_pseudo_contains(sel, m, has_selector) elif key in ('pseudo_nth_type', 'pseudo_nth_child'): has_selector = self.parse_pseudo_nth(sel, m, has_selector, iselector) elif key == 'pseudo_lang': has_selector = self.parse_pseudo_lang(sel, m, has_selector) elif key == 'pseudo_dir': has_selector = self.parse_pseudo_dir(sel, m, has_selector) # Currently only supports HTML is_html = True elif key == 'pseudo_close': if not has_selector: if not is_forgive: raise SelectorSyntaxError( f"Expected a selector at position {m.start(0)}", self.pattern, m.start(0) ) sel.no_match = True if is_open: closed = True break else: raise SelectorSyntaxError( f"Unmatched pseudo-class close at position {m.start(0)}", self.pattern, m.start(0) ) elif key == 'combine': if is_relative: has_selector, sel, rel_type = self.parse_has_combinator( sel, m, has_selector, selectors, rel_type, index ) else: has_selector, sel = self.parse_combinator( sel, m, has_selector, selectors, relations, is_pseudo, is_forgive, index ) elif key == 'attribute': has_selector = self.parse_attribute_selector(sel, m, has_selector) elif key == 'tag': if has_selector: raise SelectorSyntaxError( f"Tag name found at position {m.start(0)} instead of at the start", self.pattern, m.start(0) ) has_selector = self.parse_tag_pattern(sel, m, has_selector) elif key in ('class', 'id'): has_selector = self.parse_class_id(sel, m, has_selector) index = m.end(0) except StopIteration: pass # Handle selectors that are not closed if is_open and not closed: raise SelectorSyntaxError( f"Unclosed pseudo-class at position {index}", self.pattern, index ) # Cleanup completed selector piece if has_selector: if not sel.tag and not is_pseudo: # Implied `*` sel.tag = ct.SelectorTag('*', None) if is_relative: sel.rel_type = rel_type selectors[-1].relations.append(sel) else: sel.relations.extend(relations) del relations[:] selectors.append(sel) # Forgive empty slots in pseudo-classes that have lists (and are forgiving) elif is_forgive and (not selectors or not relations): # Handle normal pseudo-classes with empty slots like `:is()` etc. sel.no_match = True del relations[:] selectors.append(sel) has_selector = True if not has_selector: # We will always need to finish a selector when `:has()` is used as it leads with combining. # May apply to others as well. raise SelectorSyntaxError( f'Expected a selector at position {index}', self.pattern, index ) # Some patterns require additional logic, such as default. We try to make these the # last pattern, and append the appropriate flag to that selector which communicates # to the matcher what additional logic is required. if is_default: selectors[-1].flags = ct.SEL_DEFAULT if is_indeterminate: selectors[-1].flags = ct.SEL_INDETERMINATE if is_in_range: selectors[-1].flags = ct.SEL_IN_RANGE if is_out_of_range: selectors[-1].flags = ct.SEL_OUT_OF_RANGE if is_placeholder_shown: selectors[-1].flags = ct.SEL_PLACEHOLDER_SHOWN # Return selector list return ct.SelectorList([s.freeze() for s in selectors], is_not, is_html) def selector_iter(self, pattern: str) -> Iterator[tuple[str, Match[str]]]: """Iterate selector tokens.""" # Ignore whitespace and comments at start and end of pattern m = RE_WS_BEGIN.search(pattern) index = m.end(0) if m else 0 m = RE_WS_END.search(pattern) end = (m.start(0) - 1) if m else (len(pattern) - 1) if self.debug: # pragma: no cover print(f'## PARSING: {pattern!r}') while index <= end: m = None for v in self.css_tokens: m = v.match(pattern, index, self.flags) if m: name = v.get_name() if self.debug: # pragma: no cover print(f"TOKEN: '{name}' --> {m.group(0)!r} at position {m.start(0)}") index = m.end(0) yield name, m break if m is None: c = pattern[index] # If the character represents the start of one of the known selector types, # throw an exception mentioning that the known selector type is in error; # otherwise, report the invalid character. if c == '[': msg = f"Malformed attribute selector at position {index}" elif c == '.': msg = f"Malformed class selector at position {index}" elif c == '#': msg = f"Malformed id selector at position {index}" elif c == ':': msg = f"Malformed pseudo-class selector at position {index}" else: msg = f"Invalid character {c!r} position {index}" raise SelectorSyntaxError(msg, self.pattern, index) if self.debug: # pragma: no cover print('## END PARSING') def process_selectors(self, index: int = 0, flags: int = 0) -> ct.SelectorList: """Process selectors.""" return self.parse_selectors(self.selector_iter(self.pattern), index, flags) # Precompile CSS selector lists for pseudo-classes (additional logic may be required beyond the pattern) # A few patterns are order dependent as they use patterns previous compiled. # CSS pattern for `:link` and `:any-link` CSS_LINK = CSSParser( 'html|*:is(a, area)[href]' ).process_selectors(flags=FLG_PSEUDO | FLG_HTML) # CSS pattern for `:checked` CSS_CHECKED = CSSParser( ''' html|*:is(input[type=checkbox], input[type=radio])[checked], html|option[selected] ''' ).process_selectors(flags=FLG_PSEUDO | FLG_HTML) # CSS pattern for `:default` (must compile CSS_CHECKED first) CSS_DEFAULT = CSSParser( ''' :checked, /* This pattern must be at the end. Special logic is applied to the last selector. */ html|form html|*:is(button, input)[type="submit"] ''' ).process_selectors(flags=FLG_PSEUDO | FLG_HTML | FLG_DEFAULT) # CSS pattern for `:indeterminate` CSS_INDETERMINATE = CSSParser( ''' html|input[type="checkbox"][indeterminate], html|input[type="radio"]:is(:not([name]), [name=""]):not([checked]), html|progress:not([value]), /* This pattern must be at the end. Special logic is applied to the last selector. */ html|input[type="radio"][name]:not([name='']):not([checked]) ''' ).process_selectors(flags=FLG_PSEUDO | FLG_HTML | FLG_INDETERMINATE) # CSS pattern for `:disabled` CSS_DISABLED = CSSParser( ''' html|*:is(input:not([type=hidden]), button, select, textarea, fieldset, optgroup, option, fieldset)[disabled], html|optgroup[disabled] > html|option, html|fieldset[disabled] > html|*:is(input:not([type=hidden]), button, select, textarea, fieldset), html|fieldset[disabled] > html|*:not(legend:nth-of-type(1)) html|*:is(input:not([type=hidden]), button, select, textarea, fieldset) ''' ).process_selectors(flags=FLG_PSEUDO | FLG_HTML) # CSS pattern for `:enabled` CSS_ENABLED = CSSParser( ''' html|*:is(input:not([type=hidden]), button, select, textarea, fieldset, optgroup, option, fieldset):not(:disabled) ''' ).process_selectors(flags=FLG_PSEUDO | FLG_HTML) # CSS pattern for `:required` CSS_REQUIRED = CSSParser( 'html|*:is(input, textarea, select)[required]' ).process_selectors(flags=FLG_PSEUDO | FLG_HTML) # CSS pattern for `:optional` CSS_OPTIONAL = CSSParser( 'html|*:is(input, textarea, select):not([required])' ).process_selectors(flags=FLG_PSEUDO | FLG_HTML) # CSS pattern for `:placeholder-shown` CSS_PLACEHOLDER_SHOWN = CSSParser( ''' html|input:is( :not([type]), [type=""], [type=text], [type=search], [type=url], [type=tel], [type=email], [type=password], [type=number] )[placeholder]:not([placeholder='']):is(:not([value]), [value=""]), html|textarea[placeholder]:not([placeholder='']) ''' ).process_selectors(flags=FLG_PSEUDO | FLG_HTML | FLG_PLACEHOLDER_SHOWN) # CSS pattern default for `:nth-child` "of S" feature CSS_NTH_OF_S_DEFAULT = CSSParser( '*|*' ).process_selectors(flags=FLG_PSEUDO) # CSS pattern for `:read-write` (CSS_DISABLED must be compiled first) CSS_READ_WRITE = CSSParser( ''' html|*:is( textarea, input:is( :not([type]), [type=""], [type=text], [type=search], [type=url], [type=tel], [type=email], [type=number], [type=password], [type=date], [type=datetime-local], [type=month], [type=time], [type=week] ) ):not([readonly], :disabled), html|*:is([contenteditable=""], [contenteditable="true" i]) ''' ).process_selectors(flags=FLG_PSEUDO | FLG_HTML) # CSS pattern for `:read-only` CSS_READ_ONLY = CSSParser( ''' html|*:not(:read-write) ''' ).process_selectors(flags=FLG_PSEUDO | FLG_HTML) # CSS pattern for `:in-range` CSS_IN_RANGE = CSSParser( ''' html|input:is( [type="date"], [type="month"], [type="week"], [type="time"], [type="datetime-local"], [type="number"], [type="range"] ):is( [min], [max] ) ''' ).process_selectors(flags=FLG_PSEUDO | FLG_IN_RANGE | FLG_HTML) # CSS pattern for `:out-of-range` CSS_OUT_OF_RANGE = CSSParser( ''' html|input:is( [type="date"], [type="month"], [type="week"], [type="time"], [type="datetime-local"], [type="number"], [type="range"] ):is( [min], [max] ) ''' ).process_selectors(flags=FLG_PSEUDO | FLG_OUT_OF_RANGE | FLG_HTML) PK]ZZZ�w�I�'�'soupsieve/css_types.py"""CSS selector structure items.""" from __future__ import annotations import copyreg from .pretty import pretty from typing import Any, Iterator, Hashable, Pattern, Iterable, Mapping __all__ = ( 'Selector', 'SelectorNull', 'SelectorTag', 'SelectorAttribute', 'SelectorContains', 'SelectorNth', 'SelectorLang', 'SelectorList', 'Namespaces', 'CustomSelectors' ) SEL_EMPTY = 0x1 SEL_ROOT = 0x2 SEL_DEFAULT = 0x4 SEL_INDETERMINATE = 0x8 SEL_SCOPE = 0x10 SEL_DIR_LTR = 0x20 SEL_DIR_RTL = 0x40 SEL_IN_RANGE = 0x80 SEL_OUT_OF_RANGE = 0x100 SEL_DEFINED = 0x200 SEL_PLACEHOLDER_SHOWN = 0x400 class Immutable: """Immutable.""" __slots__: tuple[str, ...] = ('_hash',) _hash: int def __init__(self, **kwargs: Any) -> None: """Initialize.""" temp = [] for k, v in kwargs.items(): temp.append(type(v)) temp.append(v) super().__setattr__(k, v) super().__setattr__('_hash', hash(tuple(temp))) @classmethod def __base__(cls) -> type[Immutable]: """Get base class.""" return cls def __eq__(self, other: Any) -> bool: """Equal.""" return ( isinstance(other, self.__base__()) and all(getattr(other, key) == getattr(self, key) for key in self.__slots__ if key != '_hash') ) def __ne__(self, other: Any) -> bool: """Equal.""" return ( not isinstance(other, self.__base__()) or any(getattr(other, key) != getattr(self, key) for key in self.__slots__ if key != '_hash') ) def __hash__(self) -> int: """Hash.""" return self._hash def __setattr__(self, name: str, value: Any) -> None: """Prevent mutability.""" raise AttributeError(f"'{self.__class__.__name__}' is immutable") def __repr__(self) -> str: # pragma: no cover """Representation.""" r = ', '.join([f"{k}={getattr(self, k)!r}" for k in self.__slots__[:-1]]) return f"{self.__class__.__name__}({r})" __str__ = __repr__ def pretty(self) -> None: # pragma: no cover """Pretty print.""" print(pretty(self)) class ImmutableDict(Mapping[Any, Any]): """Hashable, immutable dictionary.""" def __init__( self, arg: dict[Any, Any] | Iterable[tuple[Any, Any]] ) -> None: """Initialize.""" self._validate(arg) self._d = dict(arg) self._hash = hash(tuple([(type(x), x, type(y), y) for x, y in sorted(self._d.items())])) def _validate(self, arg: dict[Any, Any] | Iterable[tuple[Any, Any]]) -> None: """Validate arguments.""" if isinstance(arg, dict): if not all(isinstance(v, Hashable) for v in arg.values()): raise TypeError(f'{self.__class__.__name__} values must be hashable') elif not all(isinstance(k, Hashable) and isinstance(v, Hashable) for k, v in arg): raise TypeError(f'{self.__class__.__name__} values must be hashable') def __iter__(self) -> Iterator[Any]: """Iterator.""" return iter(self._d) def __len__(self) -> int: """Length.""" return len(self._d) def __getitem__(self, key: Any) -> Any: """Get item: `namespace['key']`.""" return self._d[key] def __hash__(self) -> int: """Hash.""" return self._hash def __repr__(self) -> str: # pragma: no cover """Representation.""" return f"{self._d!r}" __str__ = __repr__ class Namespaces(ImmutableDict): """Namespaces.""" def __init__(self, arg: dict[str, str] | Iterable[tuple[str, str]]) -> None: """Initialize.""" super().__init__(arg) def _validate(self, arg: dict[str, str] | Iterable[tuple[str, str]]) -> None: """Validate arguments.""" if isinstance(arg, dict): if not all(isinstance(v, str) for v in arg.values()): raise TypeError(f'{self.__class__.__name__} values must be hashable') elif not all(isinstance(k, str) and isinstance(v, str) for k, v in arg): raise TypeError(f'{self.__class__.__name__} keys and values must be Unicode strings') class CustomSelectors(ImmutableDict): """Custom selectors.""" def __init__(self, arg: dict[str, str] | Iterable[tuple[str, str]]) -> None: """Initialize.""" super().__init__(arg) def _validate(self, arg: dict[str, str] | Iterable[tuple[str, str]]) -> None: """Validate arguments.""" if isinstance(arg, dict): if not all(isinstance(v, str) for v in arg.values()): raise TypeError(f'{self.__class__.__name__} values must be hashable') elif not all(isinstance(k, str) and isinstance(v, str) for k, v in arg): raise TypeError(f'{self.__class__.__name__} keys and values must be Unicode strings') class Selector(Immutable): """Selector.""" __slots__ = ( 'tag', 'ids', 'classes', 'attributes', 'nth', 'selectors', 'relation', 'rel_type', 'contains', 'lang', 'flags', '_hash' ) tag: SelectorTag | None ids: tuple[str, ...] classes: tuple[str, ...] attributes: tuple[SelectorAttribute, ...] nth: tuple[SelectorNth, ...] selectors: tuple[SelectorList, ...] relation: SelectorList rel_type: str | None contains: tuple[SelectorContains, ...] lang: tuple[SelectorLang, ...] flags: int def __init__( self, tag: SelectorTag | None, ids: tuple[str, ...], classes: tuple[str, ...], attributes: tuple[SelectorAttribute, ...], nth: tuple[SelectorNth, ...], selectors: tuple[SelectorList, ...], relation: SelectorList, rel_type: str | None, contains: tuple[SelectorContains, ...], lang: tuple[SelectorLang, ...], flags: int ): """Initialize.""" super().__init__( tag=tag, ids=ids, classes=classes, attributes=attributes, nth=nth, selectors=selectors, relation=relation, rel_type=rel_type, contains=contains, lang=lang, flags=flags ) class SelectorNull(Immutable): """Null Selector.""" def __init__(self) -> None: """Initialize.""" super().__init__() class SelectorTag(Immutable): """Selector tag.""" __slots__ = ("name", "prefix", "_hash") name: str prefix: str | None def __init__(self, name: str, prefix: str | None) -> None: """Initialize.""" super().__init__(name=name, prefix=prefix) class SelectorAttribute(Immutable): """Selector attribute rule.""" __slots__ = ("attribute", "prefix", "pattern", "xml_type_pattern", "_hash") attribute: str prefix: str pattern: Pattern[str] | None xml_type_pattern: Pattern[str] | None def __init__( self, attribute: str, prefix: str, pattern: Pattern[str] | None, xml_type_pattern: Pattern[str] | None ) -> None: """Initialize.""" super().__init__( attribute=attribute, prefix=prefix, pattern=pattern, xml_type_pattern=xml_type_pattern ) class SelectorContains(Immutable): """Selector contains rule.""" __slots__ = ("text", "own", "_hash") text: tuple[str, ...] own: bool def __init__(self, text: Iterable[str], own: bool) -> None: """Initialize.""" super().__init__(text=tuple(text), own=own) class SelectorNth(Immutable): """Selector nth type.""" __slots__ = ("a", "n", "b", "of_type", "last", "selectors", "_hash") a: int n: bool b: int of_type: bool last: bool selectors: SelectorList def __init__(self, a: int, n: bool, b: int, of_type: bool, last: bool, selectors: SelectorList) -> None: """Initialize.""" super().__init__( a=a, n=n, b=b, of_type=of_type, last=last, selectors=selectors ) class SelectorLang(Immutable): """Selector language rules.""" __slots__ = ("languages", "_hash",) languages: tuple[str, ...] def __init__(self, languages: Iterable[str]): """Initialize.""" super().__init__(languages=tuple(languages)) def __iter__(self) -> Iterator[str]: """Iterator.""" return iter(self.languages) def __len__(self) -> int: # pragma: no cover """Length.""" return len(self.languages) def __getitem__(self, index: int) -> str: # pragma: no cover """Get item.""" return self.languages[index] class SelectorList(Immutable): """Selector list.""" __slots__ = ("selectors", "is_not", "is_html", "_hash") selectors: tuple[Selector | SelectorNull, ...] is_not: bool is_html: bool def __init__( self, selectors: Iterable[Selector | SelectorNull] | None = None, is_not: bool = False, is_html: bool = False ) -> None: """Initialize.""" super().__init__( selectors=tuple(selectors) if selectors is not None else (), is_not=is_not, is_html=is_html ) def __iter__(self) -> Iterator[Selector | SelectorNull]: """Iterator.""" return iter(self.selectors) def __len__(self) -> int: """Length.""" return len(self.selectors) def __getitem__(self, index: int) -> Selector | SelectorNull: """Get item.""" return self.selectors[index] def _pickle(p: Any) -> Any: return p.__base__(), tuple([getattr(p, s) for s in p.__slots__[:-1]]) def pickle_register(obj: Any) -> None: """Allow object to be pickled.""" copyreg.pickle(obj, _pickle) pickle_register(Selector) pickle_register(SelectorNull) pickle_register(SelectorTag) pickle_register(SelectorAttribute) pickle_register(SelectorContains) pickle_register(SelectorNth) pickle_register(SelectorLang) pickle_register(SelectorList) PK]ZZZ��o��soupsieve/pretty.py""" Format a pretty string of a `SoupSieve` object for easy debugging. This won't necessarily support all types and such, and definitely not support custom outputs. It is mainly geared towards our types as the `SelectorList` object is a beast to look at without some indentation and newlines. The format and various output types is fairly known (though it hasn't been tested extensively to make sure we aren't missing corners). Example: ------- ``` >>> import soupsieve as sv >>> sv.compile('this > that.class[name=value]').selectors.pretty() SelectorList( selectors=( Selector( tag=SelectorTag( name='that', prefix=None), ids=(), classes=( 'class', ), attributes=( SelectorAttribute( attribute='name', prefix='', pattern=re.compile( '^value$'), xml_type_pattern=None), ), nth=(), selectors=(), relation=SelectorList( selectors=( Selector( tag=SelectorTag( name='this', prefix=None), ids=(), classes=(), attributes=(), nth=(), selectors=(), relation=SelectorList( selectors=(), is_not=False, is_html=False), rel_type='>', contains=(), lang=(), flags=0), ), is_not=False, is_html=False), rel_type=None, contains=(), lang=(), flags=0), ), is_not=False, is_html=False) ``` """ from __future__ import annotations import re from typing import Any RE_CLASS = re.compile(r'(?i)[a-z_][_a-z\d\.]+$') RE_PARAM = re.compile(r'(?i)[_a-z][_a-z\d]+=') RE_EMPTY = re.compile(r'\($|\[\]|\{\}') RE_LSTRT = re.compile(r'\[') RE_DSTRT = re.compile(r'\{') RE_TSTRT = re.compile(r'$') RE_LEND = re.compile(r'\]') RE_DEND = re.compile(r'\}') RE_TEND = re.compile(r'$') RE_INT = re.compile(r'\d+') RE_KWORD = re.compile(r'(?i)[_a-z][_a-z\d]+') RE_DQSTR = re.compile(r'"(?:\\.|[^"\\])*"') RE_SQSTR = re.compile(r"'(?:\\.|[^'\\])*'") RE_SEP = re.compile(r'\s*(,)\s*') RE_DSEP = re.compile(r'\s*(:)\s*') TOKENS = { 'class': RE_CLASS, 'param': RE_PARAM, 'empty': RE_EMPTY, 'lstrt': RE_LSTRT, 'dstrt': RE_DSTRT, 'tstrt': RE_TSTRT, 'lend': RE_LEND, 'dend': RE_DEND, 'tend': RE_TEND, 'sqstr': RE_SQSTR, 'sep': RE_SEP, 'dsep': RE_DSEP, 'int': RE_INT, 'kword': RE_KWORD, 'dqstr': RE_DQSTR } def pretty(obj: Any) -> str: # pragma: no cover """Make the object output string pretty.""" sel = str(obj) index = 0 end = len(sel) - 1 indent = 0 output = [] while index <= end: m = None for k, v in TOKENS.items(): m = v.match(sel, index) if m: name = k index = m.end(0) if name in ('class', 'lstrt', 'dstrt', 'tstrt'): indent += 4 output.append(f'{m.group(0)}\n{" " * indent}') elif name in ('param', 'int', 'kword', 'sqstr', 'dqstr', 'empty'): output.append(m.group(0)) elif name in ('lend', 'dend', 'tend'): indent -= 4 output.append(m.group(0)) elif name in ('sep',): output.append(f'{m.group(1)}\n{" " * indent}') elif name in ('dsep',): output.append(f'{m.group(1)} ') break return ''.join(output) PK]ZZZsoupsieve/py.typedPK]ZZZp�/d soupsieve/util.py"""Utility.""" from __future__ import annotations from functools import wraps, lru_cache import warnings import re from typing import Callable, Any DEBUG = 0x00001 RE_PATTERN_LINE_SPLIT = re.compile(r'(?:\r\n|(?!\r\n)[\n\r])|$') UC_A = ord('A') UC_Z = ord('Z') @lru_cache(maxsize=512) def lower(string: str) -> str: """Lower.""" new_string = [] for c in string: o = ord(c) new_string.append(chr(o + 32) if UC_A <= o <= UC_Z else c) return ''.join(new_string) class SelectorSyntaxError(Exception): """Syntax error in a CSS selector.""" def __init__(self, msg: str, pattern: str | None = None, index: int | None = None) -> None: """Initialize.""" self.line = None self.col = None self.context = None if pattern is not None and index is not None: # Format pattern to show line and column position self.context, self.line, self.col = get_pattern_context(pattern, index) msg = f'{msg}\n line {self.line}:\n{self.context}' super().__init__(msg) def deprecated(message: str, stacklevel: int = 2) -> Callable[..., Any]: # pragma: no cover """ Raise a `DeprecationWarning` when wrapped function/method is called. Usage: @deprecated("This method will be removed in version X; use Y instead.") def some_method()" pass """ def _wrapper(func: Callable[..., Any]) -> Callable[..., Any]: @wraps(func) def _deprecated_func(*args: Any, **kwargs: Any) -> Any: warnings.warn( f"'{func.__name__}' is deprecated. {message}", category=DeprecationWarning, stacklevel=stacklevel ) return func(*args, **kwargs) return _deprecated_func return _wrapper def warn_deprecated(message: str, stacklevel: int = 2) -> None: # pragma: no cover """Warn deprecated.""" warnings.warn( message, category=DeprecationWarning, stacklevel=stacklevel ) def get_pattern_context(pattern: str, index: int) -> tuple[str, int, int]: """Get the pattern context.""" last = 0 current_line = 1 col = 1 text = [] # type: list[str] line = 1 offset = None # type: int | None # Split pattern by newline and handle the text before the newline for m in RE_PATTERN_LINE_SPLIT.finditer(pattern): linetext = pattern[last:m.start(0)] if not len(m.group(0)) and not len(text): indent = '' offset = -1 col = index - last + 1 elif last <= index < m.end(0): indent = '--> ' offset = (-1 if index > m.start(0) else 0) + 3 col = index - last + 1 else: indent = ' ' offset = None if len(text): # Regardless of whether we are presented with `\r\n`, `\r`, or `\n`, # we will render the output with just `\n`. We will still log the column # correctly though. text.append('\n') text.append(f'{indent}{linetext}') if offset is not None: text.append('\n') text.append(' ' * (col + offset) + '^') line = current_line current_line += 1 last = m.end(0) return ''.join(text), line, col PK]ZZZ��ZzHH+soupsieve-2.5.dist-info/licenses/LICENSE.mdMIT License Copyright (c) 2018 - 2023 Isaac Muse <isaacmuse@gmail.com> Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. PK]ZZZ��P�zz soupsieve-2.5.dist-info/METADATAMetadata-Version: 2.1 Name: soupsieve Version: 2.5 Summary: A modern CSS selector implementation for Beautiful Soup. Project-URL: Homepage, https://github.com/facelessuser/soupsieve Author-email: Isaac Muse <Isaac.Muse@gmail.com> License-Expression: MIT License-File: LICENSE.md Keywords: CSS,HTML,XML,filter,query,selector,soup Classifier: Development Status :: 5 - Production/Stable Classifier: Environment :: Console Classifier: Intended Audience :: Developers Classifier: License :: OSI Approved :: MIT License Classifier: Operating System :: OS Independent Classifier: Programming Language :: Python :: 3 Classifier: Programming Language :: Python :: 3.8 Classifier: Programming Language :: Python :: 3.9 Classifier: Programming Language :: Python :: 3.10 Classifier: Programming Language :: Python :: 3.11 Classifier: Programming Language :: Python :: 3.12 Classifier: Topic :: Internet :: WWW/HTTP :: Dynamic Content Classifier: Topic :: Software Development :: Libraries :: Python Modules Classifier: Typing :: Typed Requires-Python: >=3.8 Description-Content-Type: text/markdown [![Donate via PayPal][donate-image]][donate-link] [![Discord][discord-image]][discord-link] [![Build][github-ci-image]][github-ci-link] [![Coverage Status][codecov-image]][codecov-link] [![PyPI Version][pypi-image]][pypi-link] [![PyPI Downloads][pypi-down]][pypi-link] [![PyPI - Python Version][python-image]][pypi-link] ![License][license-image-mit] # Soup Sieve ## Overview Soup Sieve is a CSS selector library designed to be used with [Beautiful Soup 4][bs4]. It aims to provide selecting, matching, and filtering using modern CSS selectors. Soup Sieve currently provides selectors from the CSS level 1 specifications up through the latest CSS level 4 drafts and beyond (though some are not yet implemented). Soup Sieve was written with the intent to replace Beautiful Soup's builtin select feature, and as of Beautiful Soup version 4.7.0, it now is :confetti_ball:. Soup Sieve can also be imported in order to use its API directly for more controlled, specialized parsing. Soup Sieve has implemented most of the CSS selectors up through the latest CSS draft specifications, though there are a number that don't make sense in a non-browser environment. Selectors that cannot provide meaningful functionality simply do not match anything. Some of the supported selectors are: - `.classes` - `#ids` - `[attributes=value]` - `parent child` - `parent > child` - `sibling ~ sibling` - `sibling + sibling` - `:not(element.class, element2.class)` - `:is(element.class, element2.class)` - `parent:has(> child)` - and [many more](https://facelessuser.github.io/soupsieve/selectors/) ## Installation You must have Beautiful Soup already installed: ``` pip install beautifulsoup4 ``` In most cases, assuming you've installed version 4.7.0, that should be all you need to do, but if you've installed via some alternative method, and Soup Sieve is not automatically installed, you can install it directly: ``` pip install soupsieve ``` If you want to manually install it from source, first ensure that [`build`](https://pypi.org/project/build/) is installed: ``` pip install build ``` Then navigate to the root of the project and build the wheel and install (replacing `<ver>` with the current version): ``` python -m build -w pip install dist/soupsive-<ver>-py3-none-any.whl ``` ## Documentation Documentation is found here: https://facelessuser.github.io/soupsieve/. ## License MIT [bs4]: https://beautiful-soup-4.readthedocs.io/en/latest/# [github-ci-image]: https://github.com/facelessuser/soupsieve/workflows/build/badge.svg?branch=master&event=push [github-ci-link]: https://github.com/facelessuser/soupsieve/actions?query=workflow%3Abuild+branch%3Amaster [discord-image]: https://img.shields.io/discord/678289859768745989?logo=discord&logoColor=aaaaaa&color=mediumpurple&labelColor=333333 [discord-link]:https://discord.gg/XBnPUZF [codecov-image]: https://img.shields.io/codecov/c/github/facelessuser/soupsieve/master.svg?logo=codecov&logoColor=aaaaaa&labelColor=333333 [codecov-link]: https://codecov.io/github/facelessuser/soupsieve [pypi-image]: https://img.shields.io/pypi/v/soupsieve.svg?logo=pypi&logoColor=aaaaaa&labelColor=333333 [pypi-down]: https://img.shields.io/pypi/dm/soupsieve.svg?logo=pypi&logoColor=aaaaaa&labelColor=333333 [pypi-link]: https://pypi.python.org/pypi/soupsieve [python-image]: https://img.shields.io/pypi/pyversions/soupsieve?logo=python&logoColor=aaaaaa&labelColor=333333 [license-image-mit]: https://img.shields.io/badge/license-MIT-blue.svg?labelColor=333333 [donate-image]: https://img.shields.io/badge/Donate-PayPal-3fabd1?logo=paypal [donate-link]: https://www.paypal.me/facelessuser PK]ZZZ��CWWsoupsieve-2.5.dist-info/WHEELWheel-Version: 1.0 Generator: hatchling 1.18.0 Root-Is-Purelib: true Tag: py3-none-any PK]ZZZt�؝�soupsieve-2.5.dist-info/RECORDsoupsieve/__init__.py,sha256=itGNTlsOM-E9lqxEoAsVLIhwoDHU9TRBdRGv0r_uy3o,4591 soupsieve/__meta__.py,sha256=Z_QrvaTEsAbtxUvys3OwAkcy1GNjl3UmdrdLVq5D9Zw,6766 soupsieve/css_match.py,sha256=oOXr1Tq5YoRubKf6KTY5kXAUHNSLdWgbS6ZveC_rVSM,57952 soupsieve/css_parser.py,sha256=M0SvRl0cAI-Ry3hS4VlggA0WussvdkrY4S5TJ6zt21o,46161 soupsieve/css_types.py,sha256=qCxBRWX9sGjgLXgU9qmM95-OZCHPZ8kiyaHvn0ik_9w,10192 soupsieve/pretty.py,sha256=8z9ZNykb57YR-mZUrY4O9YZXDP6BhGLSq9-DA5Y8rww,4033 soupsieve/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 soupsieve/util.py,sha256=Q0MVH77cUBXMtKV0D2f8Syf90remPe0EfLqe6-msAeI,3352 soupsieve-2.5.dist-info/licenses/LICENSE.md,sha256=zre5BTuIrd_6MQKkVzAr2EDRI5Je6e2cKskM9lSLM78,1096 soupsieve-2.5.dist-info/METADATA,sha256=EsaXvxjO4PdjQmZcO1ixqBjYZ42RrRnXGAPTyOsM-IE,4730 soupsieve-2.5.dist-info/WHEEL,sha256=9QBuHhg6FNW7lppboF2vKVbCGTVzsFykgRQjjlajrhA,87 soupsieve-2.5.dist-info/RECORD,, PK]ZZZ�vH��soupsieve/__init__.pyPK]ZZZ�T�nn�"soupsieve/__meta__.pyPK]ZZZŤ�`�`��,soupsieve/css_match.pyPK]ZZZ*&�Q�Q��Wsoupsieve/css_parser.pyPK]ZZZ�w�I�'�'��soupsieve/css_types.pyPK]ZZZ��o��soupsieve/pretty.pyPK]ZZZ��soupsieve/py.typedPK]ZZZp�/d ��soupsieve/util.pyPK]ZZZ��ZzHH+�J soupsieve-2.5.dist-info/licenses/LICENSE.mdPK]ZZZ��P�zz �� soupsieve-2.5.dist-info/METADATAPK]ZZZ��CWW�� soupsieve-2.5.dist-info/WHEELPK]ZZZt�؝��%!soupsieve-2.5.dist-info/RECORDPKQ�$