Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Fix #495: resolve params in PDFStream.get_filters
Some PDF documents store a stream's filter parameters as an indirect object reference instead of an inline dictionary. Resolve these references in `PDFStream.get_filters` so that the stream content can be decoded and extracted properly.

Without the fix, decoding such a stream fails because the parameters are still an unresolved `PDFObjRef`:

```
In [1]: import pdfplumber

In [2]: doc = pdfplumber.open("bill.pdf")

In [3]: s = doc.images[0]['stream']

In [4]: s.get_data()
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[4], line 1
----> 1 s.get_data()

File ~/.local/share/virtualenvs/pdfreader/lib/python3.11/site-packages/pdfminer/pdftypes.py:396, in PDFStream.get_data(self)
    394 def get_data(self) -> bytes:
    395     if self.data is None:
--> 396         self.decode()
    397     assert self.data is not None
    398     return self.data

File ~/.local/share/virtualenvs/pdfreader/lib/python3.11/site-packages/pdfminer/pdftypes.py:373, in PDFStream.decode(self)
    371         raise PDFNotImplementedError("Unsupported filter: %r" % f)
    372     # apply predictors
--> 373     if params and "Predictor" in params:
    374         pred = int_value(params["Predictor"])
    375         if pred == 1:
    376             # no predictor

TypeError: argument of type 'PDFObjRef' is not iterable

In [5]: s.get_filters()
Out[5]: [(/'FlateDecode', <PDFObjRef:21>)]
```
- Loading branch information