# -*- coding: utf-8 -*-
#
# This file is part of package name, a package description short.
# Copyright © 2022 seamus tuohy, <code@seamustuohy.com>
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the included LICENSE file for details.
import difflib
import sys
import re
from typing import Union, AnyStr, Any
# from Python 3.9 typing.Generator is deprecated in favour of collections.abc.Generator
from collections.abc import Generator
from lark.lexer import Token
from lark.tree import Tree
from lark import Lark
import logging
# Main RTFDE logger; log_string_diff writes its debug output here.
log = logging.getLogger("RTFDE")
def get_control_parameter_as_hex_strings(control_parameter: Union[str,int]) -> str:
    """Format a .rtf control parameter as a hexadecimal string.

    Args:
        control_parameter: (int/str) An int, or a string which represents an int.

    Returns:
        The value rendered with the ``#06x`` format spec: a ``0x`` prefix
        followed by hex digits, zero padded to at least six characters in
        total (e.g. ``10`` becomes ``0x000a``).

    Raises:
        ValueError: If the value is not an int and ``int()`` cannot convert it.
    """
    hex_spec = "#06x"
    try:
        return format(control_parameter, hex_spec)
    except ValueError:
        # Strings reject the integer format spec, so convert to int and retry.
        return format(int(control_parameter), hex_spec)
def print_to_tmp_file(data: Union[AnyStr,bytes,bytearray], path: str):
    """Dumps an object to a file for quick debugging.

    Warning: Not for normal use. Only use when debugging.

    Bytes-like data is written verbatim in binary mode. Anything else is
    written in text mode exactly as ``print`` renders it, including the
    trailing newline. Existing content at ``path`` is overwritten.

    Args:
        data (bytes|bytearray|str): Data to write to path
        path (str): The file path to write data to
    """
    if isinstance(data, (bytes, bytearray)):
        # print() only ever emits str, which a binary-mode file rejects with
        # a TypeError, so binary payloads are written directly.
        with open(path, 'wb+') as fp:
            fp.write(data)
        return
    with open(path, 'w+') as fp:
        # Print straight to the file rather than swapping sys.stdout, so
        # stdout can never be left redirected if printing fails.
        print(data, file=fp)
def encode_escaped_control_chars(raw_text: bytes) -> bytes:
    r"""Rewrite escaped RTF control characters as \'HH hex escapes.

    An escaped backslash, opening brace and closing brace are replaced with
    \'5c, \'7b and \'7d respectively.

    Args:
        raw_text (bytes): RTF data whose escaped control characters need encoding

    Returns:
        A bytes object with the escaped control characters hex encoded.
    """
    # Order matters: escaped backslashes are consumed first so that the
    # second backslash of a pair is never read as the start of an escaped brace.
    substitutions = (
        (b'\\\\', b"\\'5c"),
        (b'\\{', b"\\'7b"),
        (b'\\}', b"\\'7d"),
    )
    encoded = raw_text
    for escaped, hex_form in substitutions:
        encoded = encoded.replace(escaped, hex_form)
    return encoded
def is_codeword_with_numeric_arg(token: Union[Token,Any], codeword: bytes) -> bool:
    """Tell whether a Token holds `codeword` immediately followed by digits.

    The token's value is stripped of surrounding whitespace before checking.

    Args:
        token: The object to inspect. Anything without a usable ``value``
            attribute is treated as "not a match".
        codeword: The control word prefix to look for.

    Returns:
        True if the stripped value starts with `codeword` and everything after
        it consists only of digits. False otherwise.
    """
    try:
        stripped = token.value.strip()
        if not stripped.startswith(codeword):
            return False
        # An empty remainder (codeword without argument) is not numeric.
        return stripped[len(codeword):].isdigit()
    except AttributeError:
        # Not a token-like object (no .value, or a value without str/bytes methods).
        return False
def print_lark_parser_evaluated_grammar(parser):
    """Prints the final evaluated grammar.

    Can be useful for debugging possible errors in grammar evaluation.
    Prints three sections in order: the parser's rules, its terminals and
    its ignored tokens, each under a banner line.

    Args:
        parser (Lark obj): Lark object to extract grammar from.

    Raises:
        ValueError: If `parser` is not a Lark object.
    """
    if not isinstance(parser, Lark):
        raise ValueError("Requires a Lark object.")
    eq = " " + "=" * 15 + " "
    sections = (
        ("RULES", "rules"),
        ("TERMINALS", "terminals"),
        ("IGNORED TOKENS", "ignore_tokens"),
    )
    for title, attribute in sections:
        print(eq + title + eq + "\n")
        for entry in getattr(parser, attribute):
            # Lark exposes rules and terminals as objects rather than plain
            # strings; formatting them avoids the TypeError that str
            # concatenation raises, and prints plain strings unchanged.
            print(f" {entry}")
def log_validators(data):
    """Emit `data` as a debug record on the RTFDE validation logger.

    Nothing is logged unless the ``RTFDE.validation_logger`` logger's own
    level is set to DEBUG.

    Args:
        data: The message (or object) to log.
    """
    validation_logger = logging.getLogger("RTFDE.validation_logger")
    if validation_logger.level != logging.DEBUG:
        return
    validation_logger.debug(data)
def log_transformations(data):
    """Emit `data` as a debug record on the RTFDE transform logger.

    The record is only written when the ``RTFDE.transform_logger`` logger's
    own level is set to DEBUG.

    Args:
        data: The message (or object) to log.
    """
    transform_logger = logging.getLogger("RTFDE.transform_logger")
    debug_requested = transform_logger.level == logging.DEBUG
    if debug_requested:
        transform_logger.debug(data)
def is_logger_on(logger_name, level=logging.DEBUG):
    """Check whether a named logger is set to exactly the given level.

    Args:
        logger_name (str): Name of the logger to look up.
        level (int): The level to compare against. Defaults to logging.DEBUG.

    Returns:
        True if the logger's own level equals `level`. False otherwise.
    """
    return logging.getLogger(logger_name).level == level
def log_text_extraction(data):
    """Emit `data` as a debug record on the RTFDE text extraction logger.

    Used for additional text decoding/encoding output. Nothing is logged
    unless the ``RTFDE.text_extraction`` logger's own level is set to DEBUG.

    Args:
        data: The message (or object) to log.
    """
    extraction_logger = logging.getLogger("RTFDE.text_extraction")
    if extraction_logger.level == logging.DEBUG:
        extraction_logger.debug(data)
    return None
def log_htmlrtf_stripping(data: Token):
    """Log a removed HTMLRTF token on the RTFDE HTMLRTF stripping logger.

    Does nothing unless the ``RTFDE.HTMLRTF_Stripping_logger`` logger's own
    level is set to DEBUG. When it is, the token's value, line, end line,
    start position and end position are logged as a single debug record.

    Args:
        data (Token): The token that was stripped.

    Raises:
        AttributeError: Will occur if you pass this something that is not a
            token (only checked while the logger is set to DEBUG).
    """
    stripping_logger = logging.getLogger("RTFDE.HTMLRTF_Stripping_logger")
    if stripping_logger.level != logging.DEBUG:
        return
    if not isinstance(data, Token):
        raise AttributeError("HTMLRTF Stripping logger only logs Tokens")
    template = "HTMLRTF Removed: {value}, {line}, {end_line}, {start_pos}, {end_pos}"
    fields = {
        "value": data.value,
        "line": data.line,
        "end_line": data.end_line,
        "start_pos": data.start_pos,
        "end_pos": data.end_pos,
    }
    stripping_logger.debug(template.format(**fields))
def log_string_diff(original: bytes, revised: bytes, sep: Union[bytes,None] = None):
    """Log diff of two strings. Defaults to splitting by newlines and keeping the ends.

    Logs the result in the main RTFDE logger as a debug log. Warning: Only use
    when debugging as this is too verbose to be used in regular logging.

    The diff is only computed when the main RTFDE logger would actually emit
    a debug record.

    Args:
        original: The original string
        revised: The changed version of the string
        sep (string): A pattern to split the string by. Uses re.split under the hood. NOTE: Deletes all empty strings before diffing to make the diff more concise.
    """
    # Building the diff is the expensive part; skip it entirely when the
    # debug record would be discarded anyway.
    if not log.isEnabledFor(logging.DEBUG):
        return
    log.debug(get_string_diff(original, revised, sep))
def get_string_diff(original: bytes, revised: bytes, sep: Union[bytes,None] = None) -> str:
    """Get the diff of two strings. Defaults to splitting by newlines and keeping the ends.

    Bytes that are not valid UTF-8 are rendered as backslash escapes
    (e.g. ``\\xff``) instead of raising, so arbitrary binary content can be
    diffed.

    Args:
        original: The original string
        revised: The changed version of the string
        sep (string): A pattern to split the string by. Uses re.split under the hood. NOTE: Newlines are removed before splitting, and all empty strings are deleted before diffing to make the diff more concise.

    Returns:
        A string object representing the diff of the two strings provided
        (a context diff joined with newlines; empty when there is no difference).
    """
    def as_text(raw: bytes) -> str:
        # Debug helper: never fail on bytes that are not valid UTF-8.
        return raw.decode(errors="backslashreplace")

    if sep is None:
        orig_split = as_text(original).splitlines(keepends=True)
        revised_split = as_text(revised).splitlines(keepends=True)
    else:
        def split_on_sep(raw: bytes) -> list:
            pieces = re.split(sep, raw.replace(b'\n', b''))
            # re.split yields b'' between adjacent separators and None for
            # capture groups that took no part in a match; drop both.
            return [as_text(piece) for piece in pieces if piece]

        orig_split = split_on_sep(original)
        revised_split = split_on_sep(revised)
    return "\n".join(difflib.context_diff(orig_split, revised_split))
def get_tree_diff(original: Tree, revised: Tree) -> str:
    """Get the diff of two trees.

    Both trees are flattened with flatten_tree and the resulting sequences
    are compared as a context diff.

    Args:
        original (lark Tree): A lark tree before transformation
        revised (lark Tree): A lark tree after transformation

    Returns:
        A string object representing the diff of the two Trees provided.

    Example:
        rtf_obj = DeEncapsulator(raw_rtf)
        rtf_obj.deencapsulate()
        transformed_tree = SomeTransformer.transform(rtf_obj.full_tree)
        get_tree_diff(rtf_obj.full_tree, transformed_tree)
    """
    # difflib needs real sequences, so the generators are materialised first.
    flat_original = list(flatten_tree(original))
    flat_revised = list(flatten_tree(revised))
    return "\n".join(difflib.context_diff(flat_original, flat_revised))
def flatten_tree(tree: Tree) -> Generator:
    """Lazily flatten a lark Tree into string descriptions of its nodes.

    Yields a ``Tree('<data>')`` marker for the tree itself, then walks the
    children in order: nested Trees are flattened recursively, every other
    child (Tokens included) is yielded as its repr.

    Args:
        tree (lark Tree): A lark tree

    Yields:
        One string per tree node / leaf, depth first.
    """
    yield f"Tree('{tree.data}')"
    for node in tree.children:
        if isinstance(node, Tree):
            yield from flatten_tree(node)
        else:
            yield repr(node)
def flatten_tree_to_string_array(tree: Tree) -> Generator:
    """Lazily flatten a lark Tree into the values of its leaves.

    Walks the children in order: nested Trees are flattened recursively,
    Tokens contribute their ``value`` and any other child is yielded as is.
    Unlike flatten_tree, no marker is yielded for the Trees themselves.

    Args:
        tree (lark Tree): A lark tree

    Yields:
        The value of each leaf, depth first.
    """
    for node in tree.children:
        if isinstance(node, Tree):
            yield from flatten_tree_to_string_array(node)
            continue
        yield node.value if isinstance(node, Token) else node
def make_token_replacement(ttype, value, example):
    """Build a new Token that takes over the position of an existing node.

    Args:
        ttype: The type to give the new token.
        value: The value to give the new token.
        example (lark Token or Tree): Node to copy the position information
            from (start_pos, end_pos, line, end_line, column, end_column).
            For a Token the attributes are read from the token itself, for a
            Tree they are read from its ``meta``.

    Returns:
        A new Token of type `ttype` holding `value`, positioned like `example`.

    Raises:
        TypeError: If `example` is neither a Token nor a Tree.
    """
    if isinstance(example, Token):
        position = example
    elif isinstance(example, Tree):
        position = example.meta
    else:
        # Previously this fell through to an UnboundLocalError on return.
        raise TypeError("example must be a lark Token or Tree, "
                        f"not {type(example).__name__}")
    return Token(ttype,
                 value,
                 start_pos=position.start_pos,
                 end_pos=position.end_pos,
                 line=position.line,
                 end_line=position.end_line,
                 column=position.column,
                 end_column=position.end_column)
def embed():
    """Drop into an interactive console inside the caller's scope.

    Debugging aid. Prints the current call stack, then starts a
    ``code.interact`` session whose namespace holds the caller's globals
    overlaid with the caller's locals. Tab completion is enabled and the
    readline history is loaded from, and written back to,
    ``~/.python_history``.

    The console works on a copy of the namespace, so names assigned in the
    session are not written back to the caller.
    """
    # Imported here so the debugging-only dependencies are only loaded when
    # the console is actually requested.
    import os
    import readline
    import rlcompleter
    import code
    import inspect
    import traceback
    history = os.path.join(os.path.expanduser('~'), '.python_history')
    if os.path.isfile(history):
        readline.read_history_file(history)
    frame = inspect.currentframe().f_back
    # Start from the globals and overlay the locals so that, as in normal
    # name resolution, a local shadows a global of the same name.
    namespace = dict(frame.f_globals)
    namespace.update(frame.f_locals)
    readline.set_completer(rlcompleter.Completer(namespace).complete)
    readline.parse_and_bind("tab: complete")
    file = frame.f_code.co_filename
    line = frame.f_lineno
    function = frame.f_code.co_name
    # Drop the last entry: it is this embed() call itself.
    stack = ''.join(traceback.format_stack()[:-1])
    print(stack)
    banner = f" [ {os.path.basename(file)}:{line} in {function}() ]"
    banner += "\n Entering interactive mode (Ctrl-D to exit) ..."
    try:
        code.interact(banner=banner, local=namespace)
    finally:
        # Persist the session history even if the console exits abnormally.
        readline.write_history_file(history)