Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: fix the fix to #884 to fix #1025 #1030

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -22,6 +22,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
- `RecursionError` when corrupt PDF specifies a recursive /Pages object ([#998](https://github.com/pdfminer/pdfminer.six/pull/998))
- `TypeError` when corrupt PDF specifies text-positioning operators with invalid values ([#1000](https://github.com/pdfminer/pdfminer.six/pull/1000))
- inline image parsing fails when stream data contains "EI\n" ([#1008](https://github.com/pdfminer/pdfminer.six/issues/1008))
- `PSBaseParser` can't handle tokens split across end of buffer ([#1025](https://github.com/pdfminer/pdfminer.six/issues/1025))

### Removed

24 changes: 18 additions & 6 deletions pdfminer/psparser.py
Original file line number Diff line number Diff line change
@@ -169,6 +169,7 @@ class PSBaseParser:

def __init__(self, fp: BinaryIO) -> None:
self.fp = fp
self.eof = False
self.seek(0)

def __repr__(self) -> str:
@@ -204,6 +205,7 @@ def seek(self, pos: int) -> None:
self._curtoken = b""
self._curtokenpos = 0
self._tokens: List[Tuple[int, PSBaseParserToken]] = []
self.eof = False

def fillbuf(self) -> None:
if self.charpos < len(self.buf):
@@ -398,11 +400,8 @@ def _parse_keyword(self, s: bytes, i: int) -> int:
j = m.start(0)
self._curtoken += s[i:j]
else:
# Use the rest of the stream if no non-keyword character is found. This
# can happen if the keyword is the final bytes of the stream
# (https://github.com/pdfminer/pdfminer.six/issues/884).
j = len(s)
self._curtoken += s[i:]
return len(s)
if self._curtoken == b"true":
token: Union[bool, PSKeyword] = True
elif self._curtoken == b"false":
@@ -502,9 +501,22 @@ def _parse_hexstring(self, s: bytes, i: int) -> int:
return j

def nexttoken(self) -> Tuple[int, PSBaseParserToken]:
if self.eof:
# EOF was already reached on an earlier call, which still returned the
# final token; raise the deferred PSEOF now.
raise PSEOF("Unexpected EOF")
while not self._tokens:
self.fillbuf()
self.charpos = self._parse1(self.buf, self.charpos)
try:
self.fillbuf()
self.charpos = self._parse1(self.buf, self.charpos)
except PSEOF:
# If we hit EOF in the middle of a token, try to parse
# it by tacking on whitespace, and delay raising PSEOF
# until next time around
self.charpos = self._parse1(b"\n", 0)
self.eof = True
# No partial token was pending, so this is a genuine EOF: re-raise.
if not self._tokens:
raise
token = self._tokens.pop(0)
log.debug("nexttoken: %r", token)
return token
307 changes: 305 additions & 2 deletions tests/test_pdfminer_psparser.py
Original file line number Diff line number Diff line change
@@ -155,5 +155,308 @@ def test_3(self):
See: https://github.com/pdfminer/pdfminer.six/issues/884
"""
parser = PSBaseParser(BytesIO(b"Do"))
parser._parse_keyword(b"Do", 0)
assert parser._tokens == [(0, KWD(b"Do"))]
pos, token = parser.nexttoken()
assert token == KWD(b"Do")


BIGDATA = b"""/CIDInit /ProcSet findresource begin\r
12 dict begin\r
begincmap\r
/CIDSystemInfo\r
3 dict dup begin\r
/Registry (Adobe) def\r
/Ordering (SI-*Times New Roman-4498) def\r
/Supplement 0 def\r
end def\r
/CMapName /Adobe-SI-*Times New Roman-4498-0 def\r
/CMapType 2 def\r
1 begincodespacerange\r
<0000> <FFFF>\r
endcodespacerange\r
100 beginbfchar\r
<0000> <FFFD>\r
<0001> <006F>\r
<0002> <0065>\r
<0003> <0073>\r
<0004> <006E>\r
<0005> <003A>\r
<0006> <0065>\r
<0007> <0069>\r
<0008> <0069>\r
<0009> <006C>\r
<000A> <006C>\r
<000B> <006E>\r
<000C> <006E0067>\r
<000D> <002E>\r
<000E> <0054>\r
<000F> <0064>\r
<0010> <006E0067>\r
<0011> <003A>\r
<0012> <0048>\r
<0013> <0050>\r
<0014> <0062>\r
<0015> <0063>\r
<0016> <0065>\r
<0017> <0067>\r
<0018> <0067>\r
<0019> <0069>\r
<001A> <0069>\r
<001B> <006C>\r
<001C> <006E>\r
<001D> <0072>\r
<001E> <0072>\r
<001F> <0074>\r
<0020> <0022>\r
<0021> <0028002C004C002900650074>\r
<0022> <002B006C003A002E>\r
<0023> <002D006C00720022>\r
<0024> <002D006C00720022>\r
<0025> <002D006E>\r
<0026> <002D0072006F>\r
<0027> <002D0074006C>\r
<0028> <002E>\r
<0029> <002E>\r
<002A> <002E>\r
<002B> <002E>\r
<002C> <002E>\r
<002D> <0036006F002E00530074006C>\r
<002E> <0039>\r
<002F> <003A>\r
<0030> <003A>\r
<0031> <003A>\r
<0032> <003A>\r
<0033> <003A0029>\r
<0034> <003A002C>\r
<0035> <003A002C>\r
<0036> <0043002E004F002E002E002E>\r
<0037> <0044002E0043004B>\r
<0038> <00440065006F002E004A>\r
<0039> <00440075006E>\r
<003A> <0046>\r
<003B> <0046006F>\r
<003C> <0046006F004A>\r
<003D> <0046006F0068004B006F0069>\r
<003E> <0046006F0072>\r
<003F> <0049>\r
<0040> <004A>\r
<0041> <004B>\r
<0042> <004B>\r
<0043> <004B>\r
<0044> <004D>\r
<0045> <004D005F0039>\r
<0046> <0050>\r
<0047> <0050>\r
<0048> <0050>\r
<0049> <0052>\r
<004A> <0053>\r
<004B> <0053>\r
<004C> <00530074>\r
<004D> <0054>\r
<004E> <0054006F>\r
<004F> <005C>\r
<0050> <00610072>\r
<0051> <0062>\r
<0052> <0062>\r
<0053> <0063>\r
<0054> <0063>\r
<0055> <0063002E>\r
<0056> <0063002E>\r
<0057> <00630065>\r
<0058> <006300650064002E>\r
<0059> <006300650064002E>\r
<005A> <00630069>\r
<005B> <00630074>\r
<005C> <00630075>\r
<005D> <0064>\r
<005E> <0064>\r
<005F> <0064>\r
<0060> <0064003A002C>\r
<0061> <00640069>\r
<0062> <0065>\r
<0063> <0065>\r
endbfchar\r
100 beginbfchar\r
<0064> <0065>\r
<0065> <0065002C>\r
<0066> <0065002C0065006F002E002E>\r
<0067> <0065006F002E002E>\r
<0068> <00650070006F>\r
<0069> <00650072>\r
<006A> <00650072>\r
<006B> <00650074>\r
<006C> <00660075>\r
<006D> <006600750065>\r
<006E> <0067>\r
<006F> <0068>\r
<0070> <0068>\r
<0071> <0068>\r
<0072> <0068005F003A0029>\r
<0073> <00680065>\r
<0074> <00680065006F002E0064>\r
<0075> <0068006F0063002E004B>\r
<0076> <0069>\r
<0077> <0069>\r
<0078> <0069>\r
<0079> <0069>\r
<007A> <0069>\r
<007B> <0069>\r
<007C> <0069>\r
<007D> <0069>\r
<007E> <0069006F>\r
<007F> <0069006F002E002E>\r
<0080> <00690074>\r
<0081> <006C>\r
<0082> <006C>\r
<0083> <006C>\r
<0084> <006C0065>\r
<0085> <006D>\r
<0086> <006D>\r
<0087> <006D>\r
<0088> <006D00610072>\r
<0089> <006D00650074>\r
<008A> <006E>\r
<008B> <006E>\r
<008C> <006E002E>\r
<008D> <006E005F0039>\r
<008E> <006E0065>\r
<008F> <006E006B003C003E>\r
<0090> <006E006F002E0064002E>\r
<0091> <006E00730074>\r
<0092> <006F>\r
<0093> <006F>\r
<0094> <006F>\r
<0095> <006F>\r
<0096> <006F>\r
<0097> <006F>\r
<0098> <006F>\r
<0099> <006F002E002E>\r
<009A> <006F002E002E>\r
<009B> <006F002E002E>\r
<009C> <006F002E0064>\r
<009D> <006F002E0065>\r
<009E> <006F002E006E>\r
<009F> <006F002E006E>\r
<00A0> <006F002E006E>\r
<00A1> <006F002E006E0074>\r
<00A2> <006F002E006E00750073>\r
<00A3> <006F002E0070>\r
<00A4> <006F002E0072>\r
<00A5> <006F002E0072>\r
<00A6> <006F002E00720072>\r
<00A7> <006F002E0077>\r
<00A8> <006F004A>\r
<00A9> <006F004A>\r
<00AA> <006F004A>\r
<00AB> <006F0064>\r
<00AC> <006F0065>\r
<00AD> <006F006C>\r
<00AE> <006F0073>\r
<00AF> <006F0073>\r
<00B0> <006F0074>\r
<00B1> <006F00A5>\r
<00B2> <006F00A5>\r
<00B3> <0070>\r
<00B4> <0070>\r
<00B5> <0070003C003E>\r
<00B6> <0070003C003E>\r
<00B7> <00700065>\r
<00B8> <00700072>\r
<00B9> <0072>\r
<00BA> <0072>\r
<00BB> <0072>\r
<00BC> <0072>\r
<00BD> <0072>\r
<00BE> <0072>\r
<00BF> <0072>\r
<00C0> <0072>\r
<00C1> <0072>\r
<00C2> <0072>\r
<00C3> <0072>\r
<00C4> <0072>\r
<00C5> <007200270039>\r
<00C6> <0072002E>\r
<00C7> <0072005C>\r
endbfchar\r
49 beginbfchar\r
<00C8> <0072006F0064>\r
<00C9> <00720072006D>\r
<00CA> <00720072006D0065>\r
<00CB> <007200740068>\r
<00CC> <00720075>\r
<00CD> <0072007A006F>\r
<00CE> <0073>\r
<00CF> <0073>\r
<00D0> <0073>\r
<00D1> <0073>\r
<00D2> <0073002E>\r
<00D3> <00730065>\r
<00D4> <00730065>\r
<00D5> <0073006F>\r
<00D6> <007300750062>\r
<00D7> <007300750062>\r
<00D8> <0074>\r
<00D9> <0074>\r
<00DA> <0074>\r
<00DB> <0074>\r
<00DC> <0074>\r
<00DD> <0074005C>\r
<00DE> <007400680065>\r
<00DF> <0074006D0065006E0074>\r
<00E0> <0074006F>\r
<00E1> <00740072>\r
<00E2> <00740074>\r
<00E3> <0075>\r
<00E4> <0075006E>\r
<00E5> <0075006E0064>\r
<00E6> <0076>\r
<00E7> <0076>\r
<00E8> <0076>\r
<00E9> <0077>\r
<00EA> <0077>\r
<00EB> <00770068006F>\r
<00EC> <0077006F002E002E>\r
<00ED> <0077006F002E00B10065>\r
<00EE> <0078>\r
<00EF> <00A5>\r
<00F0> <00B00027003B0039>\r
<00F1> <FFFD>\r
<00F2> <FFFD>\r
<00F3> <FFFD>\r
<00F4> <0020>\r
<00F5> <0009>\r
<00F6> <000A>\r
<00F7> <00A0>\r
<00F8> <00AD>\r
endbfchar\r
endcmap\r
CMapName currentdict /CMap defineresource pop\r
end\r
end"""
# NOTE: BIGDATA deliberately omits the final CRLF so that the data ends on a
# bare keyword; this also verifies that the fix for #884 is not re-broken.


def test_issue_1025():
    """Regression test for streams with a token that crosses a buffer boundary.

    Tokenizes BIGDATA to exhaustion and checks that:

    * the ``beginbfchar`` keyword starting at offset 4093 (the token this
      test relies on to cross the buffer boundary) is returned intact,
    * all three ``beginbfchar`` keywords are found, and
    * the two trailing ``end`` keywords are both returned, the last of
      which is not followed by any whitespace.

    See: https://github.com/pdfminer/pdfminer.six/issues/1025
    """
    parser = PSBaseParser(BytesIO(BIGDATA))
    beginbfchar = KWD(b"beginbfchar")
    end = KWD(b"end")
    tokens = []
    boundary_token_seen = False
    while True:
        # Keep the try body minimal: only nexttoken() is expected to raise.
        try:
            pos, token = parser.nexttoken()
        except PSEOF:
            break
        # Make sure we are really testing the problem!
        if pos == 4093:
            assert token is beginbfchar
            boundary_token_seen = True
        tokens.append(token)
    # Without this, the position check above would pass vacuously if the
    # fixture changed and no token started at the expected offset any more.
    assert boundary_token_seen
    # we should get "beginbfchar" 3 times (including the broken one)
    assert sum(1 for token in tokens if token is beginbfchar) == 3
    # we should get both "end" at the end
    assert tokens[-1] == end
    assert tokens[-2] == tokens[-1]