import datetime
import re
from typing import List, Optional
from unstructured.nlp.patterns import (
EMAIL_ADDRESS_PATTERN,
EMAIL_DATETIMETZ_PATTERN,
IMAGE_URL_PATTERN,
IP_ADDRESS_NAME_PATTERN,
IP_ADDRESS_PATTERN_RE,
MAPI_ID_PATTERN,
US_PHONE_NUMBERS_RE,
)
def _get_indexed_match(text: str, pattern: str, index: int = 0) -> re.Match:
if not isinstance(index, int) or index < 0:
raise ValueError(f"The index is {index}. Index must be a non-negative integer.")
regex_match = None
for i, result in enumerate(re.finditer(pattern, text)):
if i == index:
regex_match = result
if regex_match is None:
raise ValueError(f"Result with index {index} was not found. The largest index was {i}.")
return regex_match
def extract_text_before(text: str, pattern: str, index: int = 0, strip: bool = True) -> str:
    """Returns the portion of text that precedes an occurrence of the pattern. The first
    occurrence (index 0) is used unless a different index is passed. Errors raised by
    _get_indexed_match while locating the occurrence propagate to the caller.

    Input
    -----
    text: The string to search
    pattern: The regular expression marking where the extracted text ends
    index: Zero-based position of the occurrence to use
    strip: If True, removes trailing whitespace from the extracted string
    """
    match_start = _get_indexed_match(text, pattern, index).start()
    prefix = text[:match_start]
    if strip:
        return prefix.rstrip()
    return prefix
def extract_text_after(text: str, pattern: str, index: int = 0, strip: bool = True) -> str:
    """Extracts text that occurs after the specified pattern. By default, it will use
    the first occurrence of the pattern (index 0). Use the index kwarg to choose a different
    index. Errors raised by _get_indexed_match while locating the occurrence propagate
    to the caller.

    Input
    -----
    text: The string to search
    pattern: The regular expression marking where the extracted text begins
    index: Zero-based position of the occurrence to use
    strip: If True, removes leading whitespace from the extracted string
    """
    regex_match = _get_indexed_match(text, pattern, index)
    # Everything from the end of the match onward is the text after the pattern.
    after_text = text[regex_match.end():]
    return after_text.lstrip() if strip else after_text
def extract_email_address(text: str) -> List[str]:
    """Returns all non-overlapping matches of EMAIL_ADDRESS_PATTERN in the text.

    The text is lower-cased before matching, so the results are lower case.
    """
    lowered = text.lower()
    return re.findall(pattern=EMAIL_ADDRESS_PATTERN, string=lowered)
def extract_ip_address(text: str) -> List[str]:
    """Returns all non-overlapping matches of IP_ADDRESS_PATTERN_RE in the text."""
    ip_matches = re.findall(IP_ADDRESS_PATTERN_RE, text)
    return ip_matches
def extract_ip_address_name(text: str) -> List[str]:
    """Returns all non-overlapping matches of IP_ADDRESS_NAME_PATTERN in the text."""
    name_matches = re.findall(pattern=IP_ADDRESS_NAME_PATTERN, string=text)
    return name_matches
def extract_mapi_id(text: str) -> List[str]:
    """Returns all matches of MAPI_ID_PATTERN in the text, with every semicolon
    removed from each match.
    """
    return [raw_id.replace(";", "") for raw_id in re.findall(MAPI_ID_PATTERN, text)]
def extract_datetimetz(text: str) -> Optional[datetime.datetime]:
    """Parses the first match of EMAIL_DATETIMETZ_PATTERN in the text into a datetime.

    The match is parsed with the format "%a, %d %b %Y %H:%M:%S %z". Returns None when
    the pattern does not match anywhere in the text.
    """
    timestamps = re.findall(EMAIL_DATETIMETZ_PATTERN, text)
    if not timestamps:
        return None
    first_timestamp = timestamps[0]
    return datetime.datetime.strptime(first_timestamp, "%a, %d %b %Y %H:%M:%S %z")
def extract_us_phone_number(text: str) -> str:
    """Extracts a US phone number from a section of text that includes a phone number. If there
    is no phone number present, the result will be an empty string. Only the first match of
    US_PHONE_NUMBERS_RE is returned, with surrounding whitespace stripped.

    Example
    -------
    extract_us_phone_number("Phone Number: 215-867-5309") -> "215-867-5309"
    """
    regex_match = US_PHONE_NUMBERS_RE.search(text)
    if regex_match is None:
        return ""
    # group() is the exact substring of text covered by the match.
    return regex_match.group().strip()
def extract_ordered_bullets(text: str) -> tuple:
    """Extracts the leading ordered bullet of a text section, accounting for numeric
    and alphanumeric bullet types. Only the first whitespace-delimited token of the
    text is inspected.

    Input
    -----
    text: The text that may start with a bullet such as "1.", "1.1" or "a.1.2"

    Output
    -----
    tuple(section, sub_section, sub_sub_section): Each bullet partition
    is a string or None if not present. Levels beyond the third are ignored.
    (None, None, None) is returned when the text is empty or whitespace only, when
    the first token contains no "." or contains "..", or when the first partition
    is longer than two characters.

    Example
    -------
    This is a very important point -> (None, None, None)
    1.1 This is a very important point -> ("1", "1", None)
    a.1 This is a very important point -> ("a", "1", None)
    """
    no_bullet = (None, None, None)

    tokens = text.split()
    if not tokens:
        # Empty or whitespace-only text has no first token to inspect.
        return no_bullet

    candidate = tokens[0]
    if "." not in candidate or ".." in candidate:
        return no_bullet

    partitions = candidate.split(".")
    if not partitions[-1]:
        # A trailing "." (as in "1.") leaves an empty last partition; drop it.
        partitions.pop()

    # Leading partitions longer than two characters are not treated as bullets.
    if len(partitions[0]) > 2:
        return no_bullet

    section = partitions[0]
    sub_section = partitions[1] if len(partitions) > 1 else None
    sub_sub_section = partitions[2] if len(partitions) > 2 else None
    return section, sub_section, sub_sub_section
def extract_image_urls_from_html(text: str) -> List[str]:
    """Returns all non-overlapping matches of IMAGE_URL_PATTERN in the text."""
    image_urls = re.findall(pattern=IMAGE_URL_PATTERN, string=text)
    return image_urls