From c02236b90b2e1d43d574217c925da7b8d4ead756 Mon Sep 17 00:00:00 2001 From: Gilles Dartiguelongue Date: Sat, 5 Aug 2023 11:44:20 +0200 Subject: [PATCH] Fix #495: resolve params in PDFStream.get_filters Some PDF documents use indirect references to store filter params. Resolve them to allow proper extraction of content. ``` In [1]: import pdfplumber In [2]: doc = pdfplumber.open("bill.pdf") In [3]: s = doc.images[0]['stream'] In [4]: s.get_data() --------------------------------------------------------------------------- TypeError Traceback (most recent call last) Cell In[4], line 1 ----> 1 s.get_data() File ~/.local/share/virtualenvs/pdfreader/lib/python3.11/site-packages/pdfminer/pdftypes.py:396, in PDFStream.get_data(self) 394 def get_data(self) -> bytes: 395 if self.data is None: --> 396 self.decode() 397 assert self.data is not None 398 return self.data File ~/.local/share/virtualenvs/pdfreader/lib/python3.11/site-packages/pdfminer/pdftypes.py:373, in PDFStream.decode(self) 371 raise PDFNotImplementedError("Unsupported filter: %r" % f) 372 # apply predictors --> 373 if params and "Predictor" in params: 374 pred = int_value(params["Predictor"]) 375 if pred == 1: 376 # no predictor TypeError: argument of type 'PDFObjRef' is not iterable In [5]: s.get_filters() Out[5]: [(/'FlateDecode', )] ``` --- CHANGELOG.md | 1 + pdfminer/pdftypes.py | 16 ++++++++++------ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3ffbe882..a597516b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). 
- Color "convenience operators" now (per spec) also set color space ([#794](https://github.com/pdfminer/pdfminer.six/pull/794)) - `ValueError` when extracting images, due to breaking changes in Pillow ([#827](https://github.com/pdfminer/pdfminer.six/pull/827)) - Small typo's and issues in the documentation ([#828](https://github.com/pdfminer/pdfminer.six/pull/828)) +- Resolve filter parameters ([#495](https://github.com/pdfminer/pdfminer.six/issues/495)) ### Deprecated diff --git a/pdfminer/pdftypes.py b/pdfminer/pdftypes.py index d7a2f412..58672b41 100644 --- a/pdfminer/pdftypes.py +++ b/pdfminer/pdftypes.py @@ -306,13 +306,17 @@ def get_filters(self) -> List[Tuple[Any, Any]]: if settings.STRICT and len(params) != len(filters): raise PDFException("Parameters len filter mismatch") # resolve filter if possible - _filters = [] - for fltr in filters: - if hasattr(fltr, "resolve"): - fltr = fltr.resolve()[0] - _filters.append(fltr) + _filters = [ + fltr.resolve()[0] if hasattr(fltr, "resolve") else fltr + for fltr in filters + ] + # resolve params if possible + _params = [ + param.resolve() if hasattr(param, "resolve") else param + for param in params + ] # return list solves https://github.com/pdfminer/pdfminer.six/issues/15 - return list(zip(_filters, params)) + return list(zip(_filters, _params)) def decode(self) -> None: assert self.data is None and self.rawdata is not None, str(