#
# Natural Language Toolkit: Twitter Tokenizer
#
# Copyright (C) 2001-2024 NLTK Project
# Author: Christopher Potts <cgpotts@stanford.edu>
#         Ewan Klein <ewan@inf.ed.ac.uk> (modifications)
#         Pierpaolo Pantone <> (modifications)
#         Tom Aarsen <> (modifications)
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
#

"""
Twitter-aware tokenizer, designed to be flexible and easy to adapt to new
domains and tasks. The basic logic is this:

1. The tuple REGEXPS defines a list of regular expression
   strings.

2. The REGEXPS strings are put, in order, into a compiled
   regular expression object called WORD_RE, under the TweetTokenizer
   class.

3. The tokenization is done by WORD_RE.findall(s), where s is the
   user-supplied string, inside the tokenize() method of the class
   TweetTokenizer.

4. When instantiating Tokenizer objects, there are several options:
    * preserve_case. By default, it is set to True. If it is set to
      False, then the tokenizer will downcase everything except for
      emoticons.
    * reduce_len. By default, it is set to False. It specifies whether
      to replace repeated character sequences of length 3 or greater
      with sequences of length 3.
    * strip_handles. By default, it is set to False. It specifies
      whether to remove Twitter handles from the text passed to the
      `tokenize` method.
    * match_phone_numbers. By default, it is set to True. It indicates
      whether the `tokenize` method should look for phone numbers.
"""


######################################################################

import html
from typing import List

import regex  # https://github.com/nltk/nltk/issues/2409

from nltk.tokenize.api import TokenizerI

######################################################################
# The following strings are components in the regular expression
# that is used for tokenizing. It's important that phone_number
# appears early in the final regex, right after URLS (since it can
# contain whitespace). It also could matter that the HTML tag pattern
# comes after emoticons, due to the possibility of having text like
#
#     <:| and some text >:)
#
# Most importantly, the final element should always be last, since it
# does a last ditch whitespace-based tokenization of whatever is left.

# ToDo: Update with https://en.wikipedia.org/wiki/List_of_emoticons ?

# This particular element is used in a couple ways, so we define it
# with a name:
EMOTICONS = r"""
    (?:
      [<>]?
      [:;=8]                     # eyes
      [\-o\*\']?                 # optional nose
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      |
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      [\-o\*\']?                 # optional nose
      [:;=8]                     # eyes
      [<>]?
      |
      </?3                       # heart
    )"""

# URL pattern due to John Gruber, modified by Tom Winzig. See
# https://gist.github.com/winzig/8894715

URLS = r"""             # Capture 1: entire matched URL
  (?:
  https?:               # URL protocol and colon
    (?:
      /{1,3}            # 1-3 slashes
      |                 #   or
      [a-z0-9%]         # Single letter or digit or '%'
                        # (Trying not to match e.g. "URI::Escape")
    )
    |                   #   or
                        # looks like domain name followed by a slash:
    [a-z0-9.\-]+[.]
    (?:[a-z]{2,13})
    /
  )
  (?:                   # One or more:
    [^\s()<>{}\[\]]+                    # Run of non-space, non-()<>{}[]
    |                                   #   or
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\)  # balanced parens, one level deep: (...(...)...)
    |
    \([^\s]+?\)                         # balanced parens, non-recursive: (...)
  )+
  (?:                   # End with:
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\)  # balanced parens, one level deep: (...(...)...)
    |
    \([^\s]+?\)                         # balanced parens, non-recursive: (...)
    |                                   #   or
    [^\s`!()\[\]{};:'".,<>?«»“”‘’]      # not a space or one of these punct chars
  )
  |                     # OR, the following to match naked domains:
  (?:
    (?<!@)              # not preceded by a @, avoid matching foo@_gmail.com_
    [a-z0-9]+
    (?:[.\-][a-z0-9]+)*
    [.]
    (?:[a-z]{2,13})
    \b
    /?
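# Illustrative only (not part of the module's API or test suite): the two
# component patterns above can be exercised on their own, using the same
# flags the tokenizer applies to its compiled regexes below:
#
#     >>> flags = regex.VERBOSE | regex.I | regex.UNICODE
#     >>> regex.findall(URLS, "docs at https://www.nltk.org/ today", flags)
#     ['https://www.nltk.org/']
#     >>> regex.findall(EMOTICONS, "fine :-) really :-(", flags)
#     [':-)', ':-(']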
    (?!@)               # not succeeded by a @,
                        # avoid matching "foo.na" in "foo.na@example.com"
  )
"""

# emoji flag sequence
# https://en.wikipedia.org/wiki/Regional_indicator_symbol
# For regex simplicity, include all possible enclosed letter pairs,
# not the ISO subset of two-letter regional indicator symbols.
# See https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2#Current_codes
# Future regional flag support may be handled with the regex for
# U+1F3F4 🏴 followed by emoji tag sequences:
# r'\U0001F3F4[\U000E0000-\U000E007E]{5}\U000E007F'
FLAGS = r"""
    (?:
      [\U0001F1E6-\U0001F1FF]{2}  # all enclosed letter pairs
      |
      # English flag
      \U0001F3F4\U000E0067\U000E0062\U000E0065\U000E006E\U000E0067\U000E007F
      |
      # Scottish flag
      \U0001F3F4\U000E0067\U000E0062\U000E0073\U000E0063\U000E0074\U000E007F
      |
      # For Wales? Why Richard, it profit a man nothing to give his soul for
      # the whole world … but for Wales!
      \U0001F3F4\U000E0067\U000E0062\U000E0077\U000E006C\U000E0073\U000E007F
    )
"""

# Regex for recognizing phone numbers:
PHONE_REGEX = r"""
    (?:
      (?:            # (international)
        \+?[01]
        [ *\-.\)]*
      )?
      (?:            # (area code)
        [\(]?
        \d{3}
        [ *\-.\)]*
      )?
      \d{3}          # exchange
      [ *\-.\)]*
      \d{4}          # base
    )"""

# The components of the tokenizer:
REGEXPS = (
    URLS,
    # ASCII Emoticons
    EMOTICONS,
    # HTML tags:
    r"""<[^>\s]+>""",
    # ASCII Arrows
    r"""[\-]+>|<[\-]+""",
    # Twitter username:
    r"""(?:@[\w_]+)""",
    # Twitter hashtags:
    r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)""",
    # email addresses
    r"""[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-]""",
    # Zero-Width-Joiner and Skin tone modifier emojis
    """.(?:
        [\U0001F3FB-\U0001F3FF]?(?:\u200d.[\U0001F3FB-\U0001F3FF]?)+
        |
        [\U0001F3FB-\U0001F3FF]
    )""",
    # flags
    FLAGS,
    # Remaining word types:
    r"""
    (?:[^\W\d_](?:[^\W\d_]|['\-_])+[^\W\d_])  # Words with apostrophes or dashes.
    |
    (?:[+\-]?\d+[,/.:-]\d+[+\-]?)             # Numbers, including fractions, decimals.
    |
    (?:[\w_]+)                                # Words without apostrophes or dashes.
    |
    (?:\.(?:\s*\.){1,})                       # Ellipsis dots.
    |
    (?:\S)                                    # Everything else that isn't whitespace.
    """,
)

# Take the main components and insert the phone regex as the second element:
REGEXPS_PHONE = (REGEXPS[0], PHONE_REGEX, *REGEXPS[1:])

######################################################################
# TweetTokenizer.WORD_RE and TweetTokenizer.PHONE_WORD_RE represent
# the core tokenizing regexes. They are compiled lazily.

# WORD_RE performs poorly on these patterns:
HANG_RE = regex.compile(r"([^a-zA-Z0-9])\1{3,}")

# The emoticon string gets its own regex so that we can preserve case for
# them as needed:
EMOTICON_RE = regex.compile(EMOTICONS, regex.VERBOSE | regex.I | regex.UNICODE)

# These are for regularizing HTML entities to Unicode:
ENT_RE = regex.compile(r"&(#?(x?))([^&;\s]+);")

# For stripping away handles from a tweet:
HANDLES_RE = regex.compile(
    r"(?<![A-Za-z0-9_!@#\$%&*])@"
    r"(([A-Za-z0-9_]){15}(?!@)|([A-Za-z0-9_]){1,14}(?![A-Za-z0-9_]*@))"
)
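# Illustrative only: HANG_RE collapses runs of four or more identical
# non-alphanumeric characters down to three, and HANDLES_RE blanks out
# well-formed @-handles (at most 15 word characters, per Twitter's rules):
#
#     >>> HANG_RE.sub(r"\1\1\1", "wow!!!!!!!")
#     'wow!!!'
#     >>> HANDLES_RE.sub(" ", "@remy: hi")
#     ' : hi'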
######################################################################
# Functions for converting html entities
######################################################################


def _str_to_unicode(text, encoding=None, errors="strict"):
    if encoding is None:
        encoding = "utf-8"
    if isinstance(text, bytes):
        return text.decode(encoding, errors)
    return text


def _replace_html_entities(text, keep=(), remove_illegal=True, encoding="utf-8"):
    """
    Remove entities from text by converting them to their
    corresponding unicode character.

    :param text: a unicode string or a byte string encoded in the given
        `encoding` (which defaults to 'utf-8').

    :param list keep: list of entity names which should not be replaced.
        This supports both numeric entities (``&#nnnn;`` and ``&#xhhhh;``)
        and named entities (such as ``&nbsp;`` or ``&gt;``).

    :param bool remove_illegal: If `True`, entities that can't be converted
        are removed. Otherwise, entities that can't be converted are kept
        "as is".

    :returns: A unicode string with the entities removed.

    See https://github.com/scrapy/w3lib/blob/master/w3lib/html.py

        >>> from nltk.tokenize.casual import _replace_html_entities
        >>> _replace_html_entities(b'Price: &pound;100')
        'Price: \\xa3100'
        >>> print(_replace_html_entities(b'Price: &pound;100'))
        Price: £100
    """

    def _convert_entity(match):
        entity_body = match.group(3)
        if match.group(1):
            try:
                if match.group(2):
                    number = int(entity_body, 16)
                else:
                    number = int(entity_body, 10)
                # Numeric character references in the 80-9F range are typically
                # interpreted by browsers as representing the characters mapped
                # to bytes 80-9F in the Windows-1252 encoding. For more info
                # see: https://en.wikipedia.org/wiki/ISO/IEC_8859-1#Similar_character_sets
                if 0x80 <= number <= 0x9F:
                    return bytes((number,)).decode("cp1252")
            except ValueError:
                number = None
        else:
            if entity_body in keep:
                return match.group(0)
            number = html.entities.name2codepoint.get(entity_body)
        if number is not None:
            try:
                return chr(number)
            except (ValueError, OverflowError):
                pass

        return "" if remove_illegal else match.group(0)

    return ENT_RE.sub(_convert_entity, _str_to_unicode(text, encoding))


######################################################################


class TweetTokenizer(TokenizerI):
    r"""
    Tokenizer for tweets.

        >>> from nltk.tokenize import TweetTokenizer
        >>> tknzr = TweetTokenizer()
        >>> s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
        >>> tknzr.tokenize(s0) # doctest: +NORMALIZE_WHITESPACE
        ['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', '<--']

    Examples using the `strip_handles` and `reduce_len` parameters:

        >>> tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
        >>> s1 = '@remy: This is waaaaayyyy too much for you!!!!!!'
        >>> tknzr.tokenize(s1)
        [':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!']
    """

    # Values used to lazily compile WORD_RE and PHONE_WORD_RE,
    # which are the core tokenizing regexes.
    _WORD_RE = None
    _PHONE_WORD_RE = None

    ######################################################################

    def __init__(
        self,
        preserve_case=True,
        reduce_len=False,
        strip_handles=False,
        match_phone_numbers=True,
    ):
        """
        Create a `TweetTokenizer` instance with settings for use in the
        `tokenize` method.

        :param preserve_case: Flag indicating whether to preserve the casing
            (capitalisation) of text used in the `tokenize` method. Defaults
            to True.
        :type preserve_case: bool
        :param reduce_len: Flag indicating whether to replace repeated
            character sequences of length 3 or greater with sequences of
            length 3. Defaults to False.
        :type reduce_len: bool
        :param strip_handles: Flag indicating whether to remove Twitter
            handles from text used in the `tokenize` method. Defaults to
            False.
        :type strip_handles: bool
        :param match_phone_numbers: Flag indicating whether the `tokenize`
            method should look for phone numbers. Defaults to True.
        :type match_phone_numbers: bool
        """
        self.preserve_case = preserve_case
        self.reduce_len = reduce_len
        self.strip_handles = strip_handles
        self.match_phone_numbers = match_phone_numbers

    def tokenize(self, text: str) -> List[str]:
        """Tokenize the input text.

        :param text: the string to be tokenized
        :type text: str
        :rtype: list(str)
        :return: a tokenized list of strings; if `preserve_case=False`, all
            tokens except emoticons are downcased.
        """
        # Fix HTML character entities:
        text = _replace_html_entities(text)
        # Remove username handles
        if self.strip_handles:
            text = remove_handles(text)
        # Normalize word lengthening
        if self.reduce_len:
            text = reduce_lengthening(text)
        # Shorten problematic sequences of characters
        safe_text = HANG_RE.sub(r"\1\1\1", text)
        # Recognise phone numbers during tokenization
        if self.match_phone_numbers:
            words = self.PHONE_WORD_RE.findall(safe_text)
        else:
            words = self.WORD_RE.findall(safe_text)
        # Possibly alter the case, but avoid changing emoticons like :D into :d:
        if not self.preserve_case:
            words = list(
                map((lambda x: x if EMOTICON_RE.search(x) else x.lower()), words)
            )
        return words

    @property
    def WORD_RE(self) -> "regex.Pattern":
        """Core TweetTokenizer regex"""
        # Compiles the regex for this and all future instantiations of TweetTokenizer.
        if not type(self)._WORD_RE:
            type(self)._WORD_RE = regex.compile(
                f"({'|'.join(REGEXPS)})",
                regex.VERBOSE | regex.I | regex.UNICODE,
            )
        return type(self)._WORD_RE

    @property
    def PHONE_WORD_RE(self) -> "regex.Pattern":
        """Secondary core TweetTokenizer regex"""
        # Compiles the regex for this and all future instantiations of TweetTokenizer.
        if not type(self)._PHONE_WORD_RE:
            type(self)._PHONE_WORD_RE = regex.compile(
                f"({'|'.join(REGEXPS_PHONE)})",
                regex.VERBOSE | regex.I | regex.UNICODE,
            )
        return type(self)._PHONE_WORD_RE
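# A minimal usage sketch (illustrative; not run as part of the doctests):
# with the default match_phone_numbers=True, a US-style phone number
# survives tokenization as a single token:
#
#     >>> TweetTokenizer().tokenize("Call 555-123-4567 now")
#     ['Call', '555-123-4567', 'now']
#
# With match_phone_numbers=False, the same string falls through to the
# generic number/word patterns and is broken into several tokens.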
######################################################################
# Normalization Functions
######################################################################


def reduce_lengthening(text):
    """
    Replace repeated character sequences of length 3 or greater with
    sequences of length 3.
    """
    pattern = regex.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1\1", text)


def remove_handles(text):
    """
    Remove Twitter username handles from text.
    """
    # Substitute handles with ' ' to ensure that text on either side of a
    # removed handle is tokenized correctly.
    return HANDLES_RE.sub(" ", text)


######################################################################
# Tokenization Function
######################################################################


def casual_tokenize(
    text,
    preserve_case=True,
    reduce_len=False,
    strip_handles=False,
    match_phone_numbers=True,
):
    """
    Convenience function for wrapping the tokenizer.
    """
    return TweetTokenizer(
        preserve_case=preserve_case,
        reduce_len=reduce_len,
        strip_handles=strip_handles,
        match_phone_numbers=match_phone_numbers,
    ).tokenize(text)


###############################################################################
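# A quick sketch of the convenience wrapper (illustrative only):
#
#     >>> from nltk.tokenize.casual import casual_tokenize
#     >>> casual_tokenize("@remy Sooooo coool!!", strip_handles=True, reduce_len=True)
#     ['Sooo', 'coool', '!', '!']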