# Public API of this module: only the tokenizer entry point is exported.
__all__ = ['tokenizeRTF']
import io
from typing import List, Optional, Tuple
from .token import Token, TokenType
_KNOWN_DESTINATIONS = (
b'aftncn',
b'aftnsep',
b'aftnsepc',
b'annotation',
b'author',
b'buptim',
b'category',
b'colortbl',
b'comment',
b'company',
b'creatim',
b'doccomm',
b'dptxbxtext',
b'factoidname',
b'fonttbl',
b'footer',
b'footerf',
b'footerl',
b'footerr',
b'ftncn',
b'ftnsep',
b'ftnsepc',
b'header',
b'headerf',
b'headerl',
b'headerr',
b'hlinkbase',
b'keywords',
b'manager',
b'operator',
b'pict',
b'printim',
b'private',
b'revtim',
b'stylesheet',
b'subject',
b'title',
)
def _finishTag(startText: bytes, reader: io.BytesIO) -> Tuple[bytes, Optional[bytes], Optional[int], bytes]:
"""
Finishes reading a tag, returning the needed parameters to make it a
token.
The return is a 4 tuple of the raw token bytes, the name field, the
parameter field (as an int), and the next character after the tag.
"""
# Very simple rules here. Anything other than a letter and we change
# state. If the next character is a hypen, check if the character after
# is a digit, otherwise return. If it is a digit or that previously
# mentioned next character was a digit, read digits until anything else
# is detected, then return.
name = startText[-1:]
param = b''
while (nextChar := reader.read(1)) != b'' and nextChar.isalpha():
# Read until not alpha.
startText += nextChar
name += nextChar
# Check what the next character is to decide what to do with it.
if nextChar == b'-':
# We do this as a separate check.
nextNext = reader.read(1)
if nextNext == b'':
raise ValueError('Unexpected end of data.')
elif nextNext.isdigit():
startText += nextChar
nextChar = nextNext
if nextChar.isdigit():
startText += nextChar
param += nextChar
while (nextChar := reader.read(1)) != b'' and nextChar.isdigit():
startText += nextChar
param += nextChar
param = int(param)
else:
param = None
# Finally, check if the next char is a space, and if it is, read one
# more char to replace it.
if nextChar == b' ':
nextChar = reader.read(1)
return startText, name, param, nextChar
def _readControl(startChar: bytes, reader: io.BytesIO) -> Tuple[Tuple[Token, ...], bytes]:
    """
    Attempts to read the next data as a control, returning as many tokens
    as necessary.

    :param startChar: The bytes that introduced the control (the tokenizer
        passes the backslash it just read).
    :param reader: The stream, positioned immediately after ``startChar``.

    :returns: A 2 tuple of the tokens that were read and the next character
        after them (``b''`` if the data ended).

    :raises ValueError: The data ended in the middle of the control, a
        ``\\bin`` tag had a missing or negative byte count, a custom
        destination was malformed, or a hex symbol was not valid hex.
    """
    # First, read the next character, as it decides how to handle
    # everything.
    nextChar = reader.read(1)
    if nextChar == b'':
        raise ValueError('Unexpected end of data.')

    if nextChar.isalpha():
        # It is an alphabetical character, so start the handling of a tag.
        text, name, param, nextChar = _finishTag(startChar + nextChar, reader)

        # Important, check if the name is "bin". If it is, the parameter is
        # the number of raw bytes that follow, and they must be read here so
        # that they are never interpreted as RTF.
        if name == b'bin':
            if param is None or param < 0:
                raise ValueError(f'Bad byte count for binary data (got {param}).')
            control = Token(text, TokenType.CONTROL, name, param)
            if param == 0:
                # There is no binary data at all, so the character after the
                # tag is still unconsumed and is handed back as-is.
                return (control, Token(b'', TokenType.BINARY)), nextChar
            if nextChar == b'':
                raise ValueError('Unexpected end of data.')
            # The character after the tag is the first byte of the data.
            binText = nextChar + reader.read(param - 1)
            if len(binText) != param:
                raise ValueError('Unexpected end of data.')
            # The old next character was consumed as binary data, so a fresh
            # one must be read. Returning the old one would tokenize the
            # first binary byte a second time.
            return (control, Token(binText, TokenType.BINARY)), reader.read(1)

        if name in _KNOWN_DESTINATIONS:
            return (Token(text, TokenType.DESTINATION, name, param),), nextChar

        return (Token(text, TokenType.CONTROL, name, param),), nextChar

    # Most control symbols would return immediately, but there are two
    # exceptions.
    startChar += nextChar
    if nextChar == b'*':
        # This is going to be a custom destination. First, validation.
        if len(nextChar := reader.read(1)) != 1:
            raise ValueError('Unexpected end of data.')
        elif nextChar != b'\\':
            raise ValueError(f'Bad custom destination (expected a backslash, got {nextChar}).')
        startChar += nextChar

        # Check that the next char is alpha.
        if not (nextChar := reader.read(1)).isalpha():
            raise ValueError(f'Expected alpha character for destination, got {nextChar}.')
        startChar += nextChar

        # Call the function to read until a clear end of tag.
        text, name, param, nextChar = _finishTag(startChar, reader)
        return (Token(text, TokenType.IGNORABLE_DESTINATION, name, param),), nextChar

    if nextChar == b'\'':
        # This is a hex character, so immediately read 2 more bytes.
        hexChars = reader.read(2)
        if len(hexChars) != 2:
            raise ValueError('Unexpected end of data.')
        try:
            param = int(hexChars, 16)
        except ValueError as e:
            context = e.__cause__ or e.__context__
            raise ValueError(f'Hex data was not hexidecimal (got {hexChars}).') from context
        return (Token(startChar + hexChars, TokenType.SYMBOL, None, param),), reader.read(1)

    # Any other control symbol is complete already, so immediately return.
    return (Token(startChar, TokenType.SYMBOL),), reader.read(1)
def _readText(startChar: bytes, reader: io.BytesIO) -> Tuple[Tuple[Token, ...], bytes]:
    """
    Attempts to read the next data as text.

    :param startChar: The first character of the text, already read.
    :param reader: The stream, positioned immediately after ``startChar``.

    :returns: A 2 tuple of the text tokens (one per kept character) and the
        character that ended the text (``b''`` if the data ended).
    """
    kept = [startChar]
    while True:
        current = reader.read(1)
        # End of stream or the start of a group/control ends the text.
        if current == b'' or current in (b'{', b'}', b'\\'):
            break
        # Carriage returns and line feeds are simply dropped.
        if current in (b'\r', b'\n'):
            continue
        kept.append(current)

    # Text is emitted as *individual tokens*, one for every character.
    textTokens = tuple([Token(piece, TokenType.TEXT) for piece in kept])
    return textTokens, current
def tokenizeRTF(data: bytes, validateStart: bool = True) -> List[Token]:
    """
    Reads in the bytes and returns the list of tokens they break up into.

    This *only* breaks things up. It does *not* care about groups and
    stuff, as that is for a parser to deal with.

    :param data: The RTF bytes to tokenize.
    :param validateStart: If ``False``, does not check the first few tags.
        Useful when tokenizing a snippet rather than a document.

    :raises TypeError: The data is not recognized as RTF.
    :raises ValueError: An issue with basic parsing occured.
    """
    stream = io.BytesIO(data)
    tokens: List[Token] = []

    if validateStart:
        # The only validation done is that the data opens with a group
        # start followed by "\rtf1". Both become the first two tokens.
        if stream.read(6) != b'{\\rtf1':
            raise TypeError('Data does not start with "{\\rtf1".')
        tokens.append(Token(b'{', TokenType.GROUP_START))
        tokens.append(Token(b'\\rtf1', TokenType.CONTROL, b'rtf', 1))

    pending = stream.read(1)
    # A space directly after the validated header is ignored.
    if validateStart and pending == b' ':
        pending = stream.read(1)

    # `pending` always holds the first character of the next section, or is
    # empty once the data has run out. Its value picks the kind of section.
    while pending:
        if pending == b'\r' or pending == b'\n':
            # Line breaks between sections produce nothing.
            pending = stream.read(1)
        elif pending == b'\\':
            produced, pending = _readControl(pending, stream)
            tokens.extend(produced)
        elif pending == b'{' or pending == b'}':
            # Group delimiters are single characters with nothing left to
            # read.
            groupType = TokenType.GROUP_START if pending == b'{' else TokenType.GROUP_END
            tokens.append(Token(pending, groupType))
            pending = stream.read(1)
        else:
            # Otherwise, it's just text.
            produced, pending = _readText(pending, stream)
            tokens.extend(produced)

    return tokens