# Replacement PSBaseParser methods that work around tokenizer problems in
# certain pdfminer.six releases; call patch_psparser() to install them.

import functools
from typing import Tuple, Union

import pdfminer
from pdfminer.psparser import (
    END_KEYWORD,
    KWD,
    PSEOF,
    PSBaseParser,
    PSBaseParserToken,
    PSKeyword,
    log,
)

# Captured at import time so the replacement seek() can still delegate to the
# library's own implementation after the class attribute has been overwritten.
factory_seek = PSBaseParser.seek


# NOTE: functools.wraps copies the wrapped method's metadata (name, docstring,
# ...) onto each replacement, so the functions below are documented with
# comments rather than docstrings.
@functools.wraps(PSBaseParser.seek)
def seek(self: PSBaseParser, pos: int) -> None:
    # Run the stock seek, then clear the end-of-file flag that the patched
    # nexttoken() sets once the input is exhausted.
    factory_seek(self, pos)
    self.eof = False


@functools.wraps(PSBaseParser._parse_keyword)
def _parse_keyword(self, s: bytes, i: int) -> int:
    # Accumulate keyword bytes from ``s`` starting at offset ``i`` and return
    # the offset at which parsing should resume.
    terminator = END_KEYWORD.search(s, i)
    if not terminator:
        # No end-of-keyword match in this buffer: keep everything seen so far
        # and report the whole buffer as consumed without emitting a token.
        self._curtoken += s[i:]
        return len(s)

    stop = terminator.start(0)
    self._curtoken += s[i:stop]

    token: Union[bool, PSKeyword]
    if self._curtoken in (b"true", b"false"):
        # The two boolean keywords become Python bools.
        token = self._curtoken == b"true"
    else:
        token = KWD(self._curtoken)

    self._add_token(token)
    # Hand control back to the main parsing state.
    self._parse1 = self._parse_main
    return stop


@functools.wraps(PSBaseParser.nexttoken)
def nexttoken(self) -> Tuple[int, PSBaseParserToken]:
    # Return the next queued token, parsing more input as required.
    # Raises PSEOF once the input is exhausted and no token is left.
    if self.eof:
        # EOF was already seen on an earlier call whose raise was postponed.
        raise PSEOF("Unexpected EOF")

    while not self._tokens:
        try:
            self.fillbuf()
            self.charpos = self._parse1(self.buf, self.charpos)
        except PSEOF:
            # EOF arrived, possibly in the middle of a token: feed the current
            # parser state a trailing newline so it can finish that token, and
            # remember EOF so the *next* call raises instead.
            self.charpos = self._parse1(b"\n", 0)
            self.eof = True
            if self._tokens:
                # A token was completed; the loop condition now ends the loop.
                continue
            # Nothing was pending after all: propagate the original PSEOF.
            raise

    token = self._tokens.pop(0)
    log.debug("nexttoken: %r", token)
    return token


def patch_psparser():
    """Install the replacement PSBaseParser methods on affected releases.

    The patched methods avoid dropping tokens at EOF (pdfminer.six releases
    before 20231228) and splitting tokens at buffer boundaries (releases
    20231228 and 20240706).
    """
    # Presumes the bug is fixed in releases after 20240706.
    # NOTE(review): this is a plain string comparison; it assumes
    # pdfminer.__version__ is a date-style "YYYYMMDD" string -- confirm.
    if pdfminer.__version__ > "20240706":
        return

    PSBaseParser.seek = seek
    PSBaseParser._parse_keyword = _parse_keyword
    PSBaseParser.nexttoken = nexttoken
Memory