"""
Regular expression constants.
"""
# Names exported by a star-import of this module, listed alphabetically.
__all__ = [
    'HTML_BODY_START',
    'HTML_SAN_SPACE',
    'INVALID_FILENAME_CHARS',
    'INVALID_OLE_PATH',
    'RTF_BODY_STRIP_INIT',
    'RTF_BODY_STRIP_PRE_CLOSE',
    'RTF_BODY_STRIP_PRE_OPEN',
    'RTF_ENC_BODY_START',
]
import re
from typing import Final
# Allow better typing in versions above 3.8.
import sys
# ``re.Pattern`` only supports subscripting (``re.Pattern[str]``) on Python 3.9
# and later, so older interpreters fall back to the bare class for both
# aliases. The aliases are only used in the annotations of the constants below.
if sys.version_info >= (3, 9):
    _RE_STR_TYPE = re.Pattern[str]
    _RE_BYTES_TYPE = re.Pattern[bytes]
else:
    _RE_STR_TYPE = re.Pattern
    _RE_BYTES_TYPE = re.Pattern
# Matches each single character that is invalid in a filename:
# \ / : * ? " < > |
INVALID_FILENAME_CHARS: Final[_RE_STR_TYPE] = re.compile(
    r'[\\/:*?"<>|]'
)
# Matches a run of one or more consecutive space characters; used by
# htmlSanitize to locate sections of spaces.
HTML_SAN_SPACE: Final[_RE_STR_TYPE] = re.compile(
    ' +'
)
# Regular expression to find the start of the html body (the opening body
# tag, including any attributes). HTML tag names are case-insensitive, so the
# match ignores case: ``<BODY ...>`` and ``<Body>`` are found as well as the
# lowercase form.
HTML_BODY_START: Final[_RE_BYTES_TYPE] = re.compile(b'<body[^>]*>', re.IGNORECASE)
# Regular expression to find the start of the html body in encapsulated RTF.
# This is used for one of the pattern types that makes life easy. Only the
# HTML tag name is matched case-insensitively (HTML tag names are not case
# sensitive, so ``<BODY>`` must be found too); the surrounding RTF control
# words still have to match exactly as written.
RTF_ENC_BODY_START: Final[_RE_BYTES_TYPE] = re.compile(br'\{\\\*\\htmltag[0-9]* ?<(?i:body)[^>]*>\}')
# Used in the validation of OLE paths: a name containing any one of these
# characters (colon, forward slash, backslash, exclamation mark) is invalid.
INVALID_OLE_PATH: Final[_RE_STR_TYPE] = re.compile(
    r'[:/\\!]'
)
# Used as the initial step in stripping RTF files for deencapsulation. Finds
# ignored sections that do not contain groups *and* finds HTML tag sections
# that are entirely empty. It also then finds sections of data that can be
# merged together without affecting the results.
#
# The pattern is an alternation of five capture groups; it is assembled from
# one adjacent bytes literal per alternative so each can be read on its own.
RTF_BODY_STRIP_INIT: Final[_RE_BYTES_TYPE] = re.compile(
    # \htmlrtf ... \htmlrtf0 span with no braces inside it.
    rb'(\\htmlrtf[^0{}][^{}]*?\\htmlrtf0 ?)|'
    # {\*\htmltagN} group with nothing following the tag number.
    rb'(\{\\\*\\htmltag[0-9]+\})|'
    # \htmlrtf0 immediately followed by \htmlrtf or \htmlrtf1.
    rb'(\\htmlrtf0 ?\\htmlrtf1? ?)|'
    # \htmlrtf ... \htmlrtf0 span containing only an empty {} group.
    rb'(\\htmlrtf1? ?\{\}\\htmlrtf0 ?)|'
    # \htmlrtf ... \htmlrtf0 span containing only a single \'hh hex escape.
    rb'(\\htmlrtf1? ?\\\'[a-fA-F0-9]{2}\\htmlrtf0 ?)'
)
# Preprocessing steps to simplify the RTF.
#
# RTF_BODY_STRIP_PRE_CLOSE matches an \htmlrtf ... \htmlrtf0 span whose
# content ends with a closing brace, either the brace alone (first group) or
# brace-free text followed by the brace (second group).
RTF_BODY_STRIP_PRE_CLOSE: Final[_RE_BYTES_TYPE] = re.compile(
    rb'(\\htmlrtf1? ?}\\htmlrtf0 ?)|'
    rb'(\\htmlrtf1? ?[^0{}][^{}]*?} ?\\htmlrtf0 ?)'
)
# RTF_BODY_STRIP_PRE_OPEN matches an \htmlrtf ... \htmlrtf0 span whose content
# starts with an opening brace followed by brace-free text.
RTF_BODY_STRIP_PRE_OPEN: Final[_RE_BYTES_TYPE] = re.compile(
    rb'\\htmlrtf1? ?{[^{}]*?\\htmlrtf0 ?'
)