Skip to content

Commit

Permalink
Support zipped jpegs (#938)
Browse files Browse the repository at this point in the history
  • Loading branch information
pietermarsman authored Jan 16, 2024
1 parent f428846 commit e20e6af
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 9 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

## [Unreleased]

### Added

- Support for zipped JPEGs ([#938](https://github.com/pdfminer/pdfminer.six/pull/938))

### Fixed

- Resolving mediabox and pdffont ([#834](https://github.com/pdfminer/pdfminer.six/pull/834))
Expand Down
16 changes: 7 additions & 9 deletions pdfminer/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,10 +104,10 @@ def export_image(self, image: LTImage) -> str:

filters = image.stream.get_filters()

if len(filters) == 1 and filters[0][0] in LITERALS_DCT_DECODE:
if filters[-1][0] in LITERALS_DCT_DECODE:
name = self._save_jpeg(image)

elif len(filters) == 1 and filters[0][0] in LITERALS_JPX_DECODE:
elif filters[-1][0] in LITERALS_JPX_DECODE:
name = self._save_jpeg2000(image)

elif self._is_jbig2_iamge(image):
Expand All @@ -132,8 +132,7 @@ def export_image(self, image: LTImage) -> str:

def _save_jpeg(self, image: LTImage) -> str:
"""Save a JPEG encoded image"""
raw_data = image.stream.get_rawdata()
assert raw_data is not None
data = image.stream.get_data()

name, path = self._create_unique_image_name(image, ".jpg")
with open(path, "wb") as fp:
Expand All @@ -143,20 +142,19 @@ def _save_jpeg(self, image: LTImage) -> str:
except ImportError:
raise ImportError(PIL_ERROR_MESSAGE)

ifp = BytesIO(raw_data)
ifp = BytesIO(data)
i = Image.open(ifp)
i = ImageChops.invert(i)
i = i.convert("RGB")
i.save(fp, "JPEG")
else:
fp.write(raw_data)
fp.write(data)

return name

def _save_jpeg2000(self, image: LTImage) -> str:
"""Save a JPEG 2000 encoded image"""
raw_data = image.stream.get_rawdata()
assert raw_data is not None
data = image.stream.get_data()

name, path = self._create_unique_image_name(image, ".jp2")
with open(path, "wb") as fp:
Expand All @@ -169,7 +167,7 @@ def _save_jpeg2000(self, image: LTImage) -> str:
# that I have tried cannot open the file. However,
# open and saving with PIL produces a file that
# seems to be easily opened by other programs
ifp = BytesIO(raw_data)
ifp = BytesIO(data)
i = Image.open(ifp)
i.save(fp, "JPEG2000")
return name
Expand Down
6 changes: 6 additions & 0 deletions tests/test_tools_pdf2txt.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,3 +177,9 @@ def test_contrib_matplotlib(self):
def test_nonfree_cmp_itext_logo(self):
"""Test a pdf with Type3 font"""
run("nonfree/cmp_itext_logo.pdf")

def test_contrib_issue_495_pdfobjref(self):
"""Test extracting a zipped JPEG image from a pdf"""
filepath = absolute_sample_path("contrib/issue_495_pdfobjref.pdf")
image_files = self.extract_images(filepath)
assert image_files[0].endswith("jpg")

0 comments on commit e20e6af

Please sign in to comment.