diff --git a/.github/workflows/actions.yml b/.github/workflows/actions.yml index 25186bae..47887ea8 100644 --- a/.github/workflows/actions.yml +++ b/.github/workflows/actions.yml @@ -20,9 +20,9 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout code - uses: actions/checkout@v2 + uses: actions/checkout@v4 - name: Set up Python ${{ env.default-python }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ env.default-python }} - name: Upgrade pip, Install nox @@ -38,9 +38,9 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout code - uses: actions/checkout@v2 + uses: actions/checkout@v4 - name: Set up Python ${{ env.default-python }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ env.default-python }} - name: Upgrade pip, Install nox @@ -56,9 +56,9 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout code - uses: actions/checkout@v2 + uses: actions/checkout@v4 - name: Set up Python ${{ env.default-python }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ env.default-python }} - name: Upgrade pip, Install nox @@ -75,12 +75,12 @@ jobs: strategy: matrix: os: [ ubuntu-latest ] - python-version: [ "3.6", "3.7", "3.8", "3.9", "3.10" ] + python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ] steps: - name: Checkout code - uses: actions/checkout@v2 + uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - name: Determine pip cache directory @@ -88,7 +88,7 @@ jobs: run: | echo "::set-output name=dir::$(pip cache dir)" - name: Cache pip cache - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: ${{ steps.pip-cache.outputs.dir }} key: ${{ runner.os }}-pip${{ matrix.python-version }} @@ -105,9 +105,9 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout code - uses: actions/checkout@v2 + uses: actions/checkout@v4 - name: Set up Python ${{ env.default-python }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ env.default-python }} - name: Upgrade pip and install nox @@ -129,19 +129,9 @@ jobs: - build-docs steps: - name: Checkout code - uses: actions/checkout@v2 + uses: actions/checkout@v4 - name: Install dependencies run: python -m pip install wheel - - name: Set version - run: | - if [[ "${{ github.ref }}" == "refs/tags/"* ]] - then - VERSION=$(echo "${{ github.ref }}" | sed -e 's,.*/\(.*\),\1,' | sed -e 's/^v//') - else - VERSION=$(date +%Y%m%d).$(date +%H%M%S) - fi - echo ${VERSION} - sed -i "s/__VERSION__/${VERSION}/g" pdfminer/__init__.py - name: Build package run: python setup.py sdist bdist_wheel - name: Generate changelog @@ -161,4 +151,4 @@ jobs: body_path: ${{ github.workspace }}-CHANGELOG.md files: | dist/*.tar.gz - dist/*.whl \ No newline at end of file + dist/*.whl diff --git a/.gitignore b/.gitignore index b155fbbd..c1642e11 100644 --- a/.gitignore +++ b/.gitignore @@ -26,3 +26,4 @@ Pipfile.lock .vscode/ pyproject.toml poetry.lock +.eggs diff --git a/CHANGELOG.md b/CHANGELOG.md index a597516b..4a3e95e5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,15 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ## [Unreleased] +### Fixed + +- Resolve filter parameters ([#495](https://github.com/pdfminer/pdfminer.six/issues/495)) + +## [20231228] + +### Removed +- Support for Python 3.6 and 3.7 ([#921](https://github.com/pdfminer/pdfminer.six/pull/921)) + ### Added - Output converter for the hOCR format ([#651](https://github.com/pdfminer/pdfminer.six/pull/651)) @@ -14,6 +23,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ### Fixed +- Broken CI/CD pipeline by setting upper version limit for black, mypy, pip and setuptools ([#921](https://github.com/pdfminer/pdfminer.six/pull/921)) +- `flake8` failures ([#921](https://github.com/pdfminer/pdfminer.six/pull/921)) - `ValueError` when bmp images with 1 bit channel are decoded ([#773](https://github.com/pdfminer/pdfminer.six/issues/773)) - `ValueError` when trying to decrypt empty metadata values ([#766](https://github.com/pdfminer/pdfminer.six/issues/766)) - Sphinx errors during building of documentation ([#760](https://github.com/pdfminer/pdfminer.six/pull/760)) @@ -23,12 +34,20 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). - Color "convenience operators" now (per spec) also set color space ([#794](https://github.com/pdfminer/pdfminer.six/pull/794)) - `ValueError` when extracting images, due to breaking changes in Pillow ([#827](https://github.com/pdfminer/pdfminer.six/pull/827)) - Small typo's and issues in the documentation ([#828](https://github.com/pdfminer/pdfminer.six/pull/828)) -- Resolve filter parameters ([#495](https://github.com/pdfminer/pdfminer.six/issues/495)) +- Ignore non-Unicode cmaps in TrueType fonts ([#806](https://github.com/pdfminer/pdfminer.six/pull/806)) + +### Changed + +- Using non-hardcoded version string and setuptools-git-versioning to enable installation from source and building on Python 3.12 ([#922](https://github.com/pdfminer/pdfminer.six/issues/922)) ### Deprecated - Usage of `if __name__ == "__main__"` where it was only intended for testing purposes ([#756](https://github.com/pdfminer/pdfminer.six/pull/756)) +### Removed + +- Support for Python 3.6 and 3.7 because they are end-of-life ([#923](https://github.com/pdfminer/pdfminer.six/pull/923)) + ## [20220524] ### Fixed diff --git a/README.md b/README.md index 182ffcf8..0015bb08 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ Features How to use ---------- -* Install Python 3.6 or newer. +* Install Python 3.8 or newer. * Install pdfminer.six. `pip install pdfminer.six` diff --git a/noxfile.py b/noxfile.py index f55bbadb..52995e1b 100644 --- a/noxfile.py +++ b/noxfile.py @@ -3,13 +3,13 @@ import nox -PYTHON_ALL_VERSIONS = ["3.6", "3.7", "3.8", "3.9", "3.10"] +PYTHON_ALL_VERSIONS = ["3.8", "3.9", "3.10", "3.11", "3.12"] PYTHON_MODULES = ["pdfminer", "tools", "tests", "noxfile.py", "setup.py"] @nox.session def format(session): - session.install("black") + session.install("black<23") # Format files locally with black, but only check in cicd if "CI" in os.environ: session.run("black", "--check", *PYTHON_MODULES) @@ -25,7 +25,7 @@ def lint(session): @nox.session def types(session): - session.install("mypy") + session.install("mypy<1") session.run( "mypy", "--install-types", @@ -37,12 +37,16 @@ def types(session): @nox.session(python=PYTHON_ALL_VERSIONS) def tests(session): + session.install("pip") + session.install("setuptools") session.install("-e", ".[dev]") session.run("pytest") @nox.session def docs(session): + session.install("pip") + session.install("setuptools") session.install("-e", ".[docs]") session.run( "python", "-m", "sphinx", "-b", "html", "docs/source", "docs/build/html" diff --git a/pdfminer/__init__.py b/pdfminer/__init__.py index e8e5221f..5bd4d50a 100644 --- a/pdfminer/__init__.py +++ b/pdfminer/__init__.py @@ -1,4 +1,10 @@ -__version__ = "__VERSION__" # auto replaced with tag in github actions +from importlib.metadata import version, PackageNotFoundError + +try: + __version__ = version("pdfminer.six") +except PackageNotFoundError: + # package is not installed, return default + __version__ = "0.0" if __name__ == "__main__": print(__version__) diff --git a/pdfminer/cmapdb.py b/pdfminer/cmapdb.py index 01306ed2..f0c43ab7 100644 --- a/pdfminer/cmapdb.py +++ b/pdfminer/cmapdb.py @@ -195,15 +195,20 @@ def add_cid2unichr(self, cid: int, code: Union[PSLiteral, bytes, int]) -> None: if isinstance(code, PSLiteral): # Interpret as an Adobe glyph name. assert isinstance(code.name, str) - self.cid2unichr[cid] = name2unicode(code.name) + unichr = name2unicode(code.name) elif isinstance(code, bytes): # Interpret as UTF-16BE. - self.cid2unichr[cid] = code.decode("UTF-16BE", "ignore") + unichr = code.decode("UTF-16BE", "ignore") elif isinstance(code, int): - self.cid2unichr[cid] = chr(code) + unichr = chr(code) else: raise TypeError(code) + # A0 = non-breaking space, some weird fonts can have a collision on a cid here. + if unichr == "\u00A0" and self.cid2unichr.get(cid) == " ": + return + self.cid2unichr[cid] = unichr + class PyCMap(CMap): def __init__(self, name: str, module: Any) -> None: diff --git a/pdfminer/image.py b/pdfminer/image.py index 61c2673e..d72a10cd 100644 --- a/pdfminer/image.py +++ b/pdfminer/image.py @@ -8,7 +8,7 @@ from typing import Literal except ImportError: # Literal was introduced in Python 3.8 - from typing_extensions import Literal # type: ignore[misc] + from typing_extensions import Literal # type: ignore[assignment] from .jbig2 import JBIG2StreamReader, JBIG2StreamWriter from .layout import LTImage diff --git a/pdfminer/pdffont.py b/pdfminer/pdffont.py index 13629c77..63826b96 100644 --- a/pdfminer/pdffont.py +++ b/pdfminer/pdffont.py @@ -755,7 +755,11 @@ def create_unicode_map(self) -> FileUnicodeMap: ) char2gid: Dict[int, int] = {} # Only supports subtable type 0, 2 and 4. - for (_1, _2, st_offset) in subtables: + for (platform_id, encoding_id, st_offset) in subtables: + # Skip non-Unicode cmaps. + # https://docs.microsoft.com/en-us/typography/opentype/spec/cmap + if not (platform_id == 0 or (platform_id == 3 and encoding_id in [1, 10])): + continue fp.seek(base_offset + st_offset) (fmttype, fmtlen, fmtlang) = cast( Tuple[int, int, int], struct.unpack(">HHH", fp.read(6)) @@ -824,6 +828,8 @@ def create_unicode_map(self) -> FileUnicodeMap: char2gid[c] = (c + idd) & 0xFFFF else: assert False, str(("Unhandled", fmttype)) + if not char2gid: + raise TrueTypeFont.CMapNotFound # create unicode map unicode_map = FileUnicodeMap() for (char, gid) in char2gid.items(): diff --git a/samples/contrib/issue-791-non-unicode-cmap.pdf b/samples/contrib/issue-791-non-unicode-cmap.pdf new file mode 100644 index 00000000..8595bd6f Binary files /dev/null and b/samples/contrib/issue-791-non-unicode-cmap.pdf differ diff --git a/setup.py b/setup.py index 8f257c3f..516e6af6 100644 --- a/setup.py +++ b/setup.py @@ -1,24 +1,23 @@ -import sys from pathlib import Path - from setuptools import setup -from os import path - -sys.path.append(str(Path(__file__).parent)) -import pdfminer as package # noqa: E402 -with open(path.join(path.abspath(path.dirname(__file__)), "README.md")) as f: +root_dir = Path(__file__).parent +with open(root_dir / "README.md", "rt") as f: readme = f.read() setup( name="pdfminer.six", - version=package.__version__, + setuptools_git_versioning={ + "enabled": True, + }, + setup_requires=["setuptools-git-versioning<2"], packages=["pdfminer"], package_data={"pdfminer": ["cmap/*.pickle.gz", "py.typed"]}, install_requires=[ "charset-normalizer >= 2.0.0", "cryptography >= 36.0.0", 'typing_extensions; python_version < "3.8"', + 'importlib_metadata; python_version < "3.8"', ], extras_require={ "dev": ["pytest", "nox", "black", "mypy == 0.931"], @@ -45,10 +44,11 @@ python_requires=">=3.6", classifiers=[ "Programming Language :: Python", - "Programming Language :: Python :: 3.6", - "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3 :: Only", "Development Status :: 5 - Production/Stable", "Environment :: Console", diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/test_converter.py b/tests/test_converter.py index 5bd560e9..17d280cb 100644 --- a/tests/test_converter.py +++ b/tests/test_converter.py @@ -1,11 +1,11 @@ import io from tempfile import TemporaryFile -from helpers import absolute_sample_path from pdfminer.converter import PDFLayoutAnalyzer, PDFConverter from pdfminer.high_level import extract_pages from pdfminer.layout import LTChar, LTContainer, LTRect, LTLine, LTCurve from pdfminer.pdfinterp import PDFGraphicState +from tests.helpers import absolute_sample_path class TestPaintPath: @@ -173,7 +173,7 @@ def get_types(path): # they all have shape 'ml' not 'mlh' ml_pdf = extract_pages("samples/contrib/pr-00530-ml-lines.pdf") ml_pdf_page = list(ml_pdf)[0] - assert sum(type(item) == LTLine for item in ml_pdf_page) == 6 + assert sum(type(item) is LTLine for item in ml_pdf_page) == 6 def _get_analyzer(self): analyzer = PDFLayoutAnalyzer(None) diff --git a/tests/test_font_size.py b/tests/test_font_size.py index fca808c3..cac5b753 100644 --- a/tests/test_font_size.py +++ b/tests/test_font_size.py @@ -1,6 +1,6 @@ -from helpers import absolute_sample_path from pdfminer.high_level import extract_pages from pdfminer.layout import LTChar, LTTextBox +from tests.helpers import absolute_sample_path def test_font_size(): diff --git a/tests/test_highlevel_extracttext.py b/tests/test_highlevel_extracttext.py index 684ef279..ad9d78c4 100644 --- a/tests/test_highlevel_extracttext.py +++ b/tests/test_highlevel_extracttext.py @@ -1,8 +1,8 @@ import unittest -from helpers import absolute_sample_path -from pdfminer.high_level import extract_text, extract_pages +from pdfminer.high_level import extract_pages, extract_text from pdfminer.layout import LAParams, LTTextContainer +from tests.helpers import absolute_sample_path def run_with_string(sample_path, laparams=None): @@ -54,6 +54,7 @@ def run_with_file(sample_path): "contrib/issue_566_test_1.pdf": "ISSUE Date:2019-4-25 Buyer:黎荣", "contrib/issue_566_test_2.pdf": "甲方:中国饮料有限公司(盖章)", "contrib/issue-625-identity-cmap.pdf": "Termin płatności: 2021-05-03", + "contrib/issue-791-non-unicode-cmap.pdf": "Peněžní prostředky na účtech", } @@ -140,6 +141,11 @@ def test_issue_625_identity_cmap(self): self.assertEqual(lines[6], test_strings[test_file]) + def test_issue_791_non_unicode_cmap(self): + test_file = "contrib/issue-791-non-unicode-cmap.pdf" + s = run_with_file(test_file) + self.assertEqual(s.strip(), test_strings[test_file]) + class TestExtractPages(unittest.TestCase): def _get_test_file_path(self): diff --git a/tests/test_layout.py b/tests/test_layout.py index fd393a4e..85058cf3 100644 --- a/tests/test_layout.py +++ b/tests/test_layout.py @@ -10,7 +10,7 @@ LTTextBoxVertical, ) from pdfminer.utils import Plane -from helpers import absolute_sample_path +from tests.helpers import absolute_sample_path class TestGroupTextLines(unittest.TestCase): diff --git a/tests/test_pdfdocument.py b/tests/test_pdfdocument.py index 3c1f2430..c57126fb 100644 --- a/tests/test_pdfdocument.py +++ b/tests/test_pdfdocument.py @@ -2,10 +2,10 @@ import pytest -from helpers import absolute_sample_path from pdfminer.pdfdocument import PDFDocument, PDFNoPageLabels from pdfminer.pdfparser import PDFParser from pdfminer.pdftypes import PDFObjectNotFound, dict_value, int_value +from tests.helpers import absolute_sample_path class TestPdfDocument(object): diff --git a/tests/test_pdfpage.py b/tests/test_pdfpage.py index 0d3109f1..c3fe86c2 100644 --- a/tests/test_pdfpage.py +++ b/tests/test_pdfpage.py @@ -1,7 +1,7 @@ -from helpers import absolute_sample_path from pdfminer.pdfdocument import PDFDocument from pdfminer.pdfpage import PDFPage from pdfminer.pdfparser import PDFParser +from tests.helpers import absolute_sample_path class TestPdfPage(object): diff --git a/tests/test_tools_dumppdf.py b/tests/test_tools_dumppdf.py index 84e3111c..971c3d07 100644 --- a/tests/test_tools_dumppdf.py +++ b/tests/test_tools_dumppdf.py @@ -2,8 +2,8 @@ import pytest -from helpers import absolute_sample_path -from tempfilepath import TemporaryFilePath +from tests.helpers import absolute_sample_path +from tests.tempfilepath import TemporaryFilePath from tools import dumppdf diff --git a/tests/test_tools_pdf2txt.py b/tests/test_tools_pdf2txt.py index abd53074..f6eeefcf 100644 --- a/tests/test_tools_pdf2txt.py +++ b/tests/test_tools_pdf2txt.py @@ -1,11 +1,11 @@ +import filecmp import os from shutil import rmtree from tempfile import mkdtemp -import filecmp import tools.pdf2txt as pdf2txt -from helpers import absolute_sample_path -from tempfilepath import TemporaryFilePath +from tests.helpers import absolute_sample_path +from tests.tempfilepath import TemporaryFilePath def run(sample_path, options=None): diff --git a/tests/test_utils.py b/tests/test_utils.py index 062a9733..160b02b4 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -2,15 +2,15 @@ import pytest -from helpers import absolute_sample_path from pdfminer.layout import LTComponent from pdfminer.utils import ( - open_filename, Plane, - shorten_str, - format_int_roman, format_int_alpha, + format_int_roman, + open_filename, + shorten_str, ) +from tests.helpers import absolute_sample_path class TestOpenFilename: