Skip to content

Commit

Permalink
Simplify pdf_utils.py by removing class, just functions
Browse files Browse the repository at this point in the history
  • Loading branch information
pietermarsman committed Jun 27, 2024
1 parent 971f402 commit 067f94f
Show file tree
Hide file tree
Showing 6 changed files with 109 additions and 100 deletions.
17 changes: 11 additions & 6 deletions fuzzing/extract_text_fuzzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,33 +2,38 @@

import atheris

from fuzzing.fuzz_helpers import EnhancedFuzzedDataProvider
from fuzzing.fuzzed_data_provider import PdfminerFuzzedDataProvider

with atheris.instrument_imports():
from fuzzing.pdf_utils import PDFValidator, prepare_pdfminer_fuzzing
from fuzzing.utils import (
prepare_pdfminer_fuzzing,
is_valid_byte_stream,
generate_layout_parameters,
should_ignore_error,
)
from pdfminer.high_level import extract_text

from pdfminer.psparser import PSException


def fuzz_one_input(data: bytes) -> None:
if not PDFValidator.is_valid_byte_stream(data):
if not is_valid_byte_stream(data):
# Not worth continuing with this test case
return

fdp = EnhancedFuzzedDataProvider(data)
fdp = PdfminerFuzzedDataProvider(data)

try:
extract_text(
fdp.ConsumeMemoryFile(),
maxpages=fdp.ConsumeIntInRange(0, 10),
page_numbers=fdp.ConsumeOptionalIntList(10, 0, 10),
laparams=PDFValidator.generate_layout_parameters(fdp),
laparams=generate_layout_parameters(fdp),
)
except (AssertionError, PSException):
return
except Exception as e:
if PDFValidator.should_ignore_error(e):
if should_ignore_error(e):
return
raise e

Expand Down
17 changes: 11 additions & 6 deletions fuzzing/extract_text_to_fp_fuzzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,15 @@

import atheris

from fuzzing.fuzz_helpers import EnhancedFuzzedDataProvider
from fuzzing.fuzzed_data_provider import PdfminerFuzzedDataProvider

with atheris.instrument_imports():
from .pdf_utils import PDFValidator, prepare_pdfminer_fuzzing
from fuzzing.utils import (
prepare_pdfminer_fuzzing,
is_valid_byte_stream,
generate_layout_parameters,
should_ignore_error,
)
from pdfminer.high_level import extract_text_to_fp
from pdfminer.psparser import PSException

Expand All @@ -15,19 +20,19 @@


def fuzz_one_input(data: bytes) -> None:
if not PDFValidator.is_valid_byte_stream(data):
if not is_valid_byte_stream(data):
# Not worth continuing with this test case
return

fdp = EnhancedFuzzedDataProvider(data)
fdp = PdfminerFuzzedDataProvider(data)

try:
with fdp.ConsumeMemoryFile(all_data=False) as f_in, io.BytesIO() as f_out:
extract_text_to_fp(
f_in,
f_out,
output_type=fdp.PickValueInList(available_output_formats),
laparams=PDFValidator.generate_layout_parameters(fdp),
laparams=generate_layout_parameters(fdp),
maxpages=fdp.ConsumeIntInRange(0, 10),
page_numbers=fdp.ConsumeOptionalIntList(10, 0, 10),
scale=fdp.ConsumeFloatInRange(0.0, 2.0),
Expand All @@ -38,7 +43,7 @@ def fuzz_one_input(data: bytes) -> None:
except (AssertionError, PSException):
return
except Exception as e:
if PDFValidator.should_ignore_error(e):
if should_ignore_error(e):
return
raise e

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from atheris import FuzzedDataProvider


class EnhancedFuzzedDataProvider(FuzzedDataProvider): # type: ignore[misc]
class PdfminerFuzzedDataProvider(FuzzedDataProvider): # type: ignore[misc]
def ConsumeRandomBytes(self) -> bytes:
int_range = self.ConsumeIntInRange(0, self.remaining_bytes())
return bytes(self.ConsumeBytes(int_range))
Expand Down
17 changes: 11 additions & 6 deletions fuzzing/page_extraction_fuzzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,25 @@
import atheris
import sys

from fuzzing.fuzz_helpers import EnhancedFuzzedDataProvider
from fuzzing.fuzzed_data_provider import PdfminerFuzzedDataProvider

with atheris.instrument_imports():
from .pdf_utils import PDFValidator, prepare_pdfminer_fuzzing
from fuzzing.utils import (
prepare_pdfminer_fuzzing,
is_valid_byte_stream,
generate_layout_parameters,
should_ignore_error,
)
from pdfminer.high_level import extract_pages
from pdfminer.psparser import PSException


def fuzz_one_input(data: bytes) -> None:
if not PDFValidator.is_valid_byte_stream(data):
if not is_valid_byte_stream(data):
# Not worth continuing with this test case
return

fdp = EnhancedFuzzedDataProvider(data)
fdp = PdfminerFuzzedDataProvider(data)

try:
with fdp.ConsumeMemoryFile() as f:
Expand All @@ -24,13 +29,13 @@ def fuzz_one_input(data: bytes) -> None:
f,
maxpages=fdp.ConsumeIntInRange(0, 10),
page_numbers=fdp.ConsumeOptionalIntList(10, 0, 10),
laparams=PDFValidator.generate_layout_parameters(fdp),
laparams=generate_layout_parameters(fdp),
)
)
except (AssertionError, PSException):
return
except Exception as e:
if PDFValidator.should_ignore_error(e):
if should_ignore_error(e):
return
raise e

Expand Down
81 changes: 0 additions & 81 deletions fuzzing/pdf_utils.py

This file was deleted.

75 changes: 75 additions & 0 deletions fuzzing/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
"""
Utilities shared across the various PDF fuzzing harnesses
"""
import logging
from typing import Optional

import atheris

from pdfminer.layout import LAParams
from pdfminer.psparser import PSException

PDF_MAGIC_BYTES = b"%PDF-"

# Substrings of the messages of exceptions that pdfminer raises explicitly but that
# do not inherit from PSException
_EXPLICIT_EXCEPTION_MESSAGES = [
"Unsupported",
"duplicate labels",
"AcroForm",
"SASLPrep",
"Invalid",
]


def prepare_pdfminer_fuzzing() -> None:
    """
    Quiet the "pdfminer" logger before fuzzing starts.

    Raises the logger's threshold to CRITICAL so that records below that level
    are not emitted by it.
    """
    pdfminer_logger = logging.getLogger("pdfminer")
    pdfminer_logger.setLevel(logging.CRITICAL)


def should_ignore_error(e: Exception) -> bool:
    """
    Decide whether a raised exception counts as one pdfminer raises on purpose.

    :param e: The exception to check
    :return: True when ``e`` is a PSException or when its message contains one of
        the substrings in _EXPLICIT_EXCEPTION_MESSAGES (the caller should then
        ignore it); False when it should be re-raised
    """
    # Anything in the PSException hierarchy is an expected parser failure.
    if isinstance(e, PSException):
        return True

    # Otherwise fall back to matching on known message fragments.
    message = str(e)
    for known_fragment in _EXPLICIT_EXCEPTION_MESSAGES:
        if known_fragment in message:
            return True
    return False


@atheris.instrument_func  # type: ignore[misc]
def generate_layout_parameters(
    fdp: atheris.FuzzedDataProvider,
) -> Optional[LAParams]:
    """
    Build layout-analysis parameters from fuzzer-provided data.

    :param fdp: The fuzzed data provider that every value is consumed from
    :return: None when the first consumed boolean is true, otherwise an LAParams
        instance populated with consumed values
    """
    # First decision: run without any layout parameters at all.
    if fdp.ConsumeBool():
        return None

    # boxes_flow is either left unset or drawn from the range [-1.0, 1.0].
    boxes_flow: Optional[float] = (
        fdp.ConsumeFloatInRange(-1.0, 1.0) if fdp.ConsumeBool() else None
    )

    # The remaining values are consumed in the same order as the keyword
    # arguments below, so the mapping from input bytes to parameters is stable.
    line_overlap = fdp.ConsumeFloat()
    char_margin = fdp.ConsumeFloat()
    line_margin = fdp.ConsumeFloat()
    word_margin = fdp.ConsumeFloat()
    detect_vertical = fdp.ConsumeBool()
    all_texts = fdp.ConsumeBool()

    return LAParams(
        line_overlap=line_overlap,
        char_margin=char_margin,
        line_margin=line_margin,
        word_margin=word_margin,
        boxes_flow=boxes_flow,
        detect_vertical=detect_vertical,
        all_texts=all_texts,
    )


@atheris.instrument_func  # type: ignore[misc]
def is_valid_byte_stream(data: bytes) -> bool:
    """Cheap pre-filter deciding whether an input is worth processing further.

    :param data: The raw byte stream to inspect
    :return: True only when the data starts with PDF_MAGIC_BYTES and contains
        the bytes ``/Root`` somewhere; False otherwise
    """
    has_pdf_header = data.startswith(PDF_MAGIC_BYTES)
    return has_pdf_header and b"/Root" in data

0 comments on commit 067f94f

Please sign in to comment.