Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FEAT: Enhance PDFConverter to support text injection into existing PDFs #641

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 78 additions & 35 deletions doc/code/converters/pdf_converter.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -30,22 +30,7 @@
"execution_count": null,
"id": "1",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'__type__': 'TextTarget', '__module__': 'pyrit.prompt_target.text_target'}: user: https://airtstorageaccountdev.blob.core.windows.net/dbdata/prompt-memory-entries/urls/1736273802673024.pdf\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1m\u001b[34muser: https://airtstorageaccountdev.blob.core.windows.net/dbdata/prompt-memory-entries/urls/1736273802673024.pdf\n"
]
}
],
"outputs": [],
"source": [
"import pathlib\n",
"\n",
Expand All @@ -69,7 +54,9 @@
"}\n",
"\n",
"# Load the YAML template for the PDF generation\n",
"template_path = pathlib.Path(DATASETS_PATH) / \"prompt_converters\" / \"pdf_converters\" / \"red_teaming_application_template.yaml\"\n",
"template_path = (\n",
" pathlib.Path(DATASETS_PATH) / \"prompt_converters\" / \"pdf_converters\" / \"red_teaming_application_template.yaml\"\n",
")\n",
"if not template_path.exists():\n",
" raise FileNotFoundError(f\"Template file not found: {template_path}\")\n",
"\n",
Expand Down Expand Up @@ -115,22 +102,7 @@
"execution_count": null,
"id": "3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'__type__': 'TextTarget', '__module__': 'pyrit.prompt_target.text_target'}: user: https://airtstorageaccountdev.blob.core.windows.net/dbdata/prompt-memory-entries/urls/1736273817760662.pdf\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1m\u001b[34muser: https://airtstorageaccountdev.blob.core.windows.net/dbdata/prompt-memory-entries/urls/1736273817760662.pdf\n"
]
}
],
"outputs": [],
"source": [
"# Define a simple string prompt (no templates)\n",
"prompt = \"This is a simple test string for PDF generation. No templates here!\"\n",
Expand Down Expand Up @@ -161,10 +133,81 @@
"await orchestrator.print_conversations_async() # type: ignore"
]
},
{
"cell_type": "markdown",
"id": "4",
"metadata": {
"lines_to_next_cell": 0
},
"source": [
"# Modify Existing PDF with Injection Items"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4",
"id": "5",
"metadata": {},
"outputs": [],
"source": [
"from pyrit.common.path import get_default_dbdata_path\n",
"from pyrit.common import initialize_pyrit, IN_MEMORY\n",
"from pyrit.prompt_converter import PDFConverter\n",
"from pyrit.prompt_target import TextTarget\n",
"from pyrit.orchestrator import PromptSendingOrchestrator\n",
"from io import BytesIO\n",
"\n",
"initialize_pyrit(memory_db_type=IN_MEMORY)\n",
"\n",
"dbdata_path = get_default_dbdata_path()\n",
"cv_pdf_path = dbdata_path / \"volkan_kutal_cv.pdf\" # Dynamically resolve the CV PDF path\n",
"\n",
"# Example: Load an existing PDF\n",
"with open(cv_pdf_path, \"rb\") as pdf_file:\n",
" existing_pdf = BytesIO(pdf_file.read())\n",
"\n",
"# Define injection items\n",
"injection_items = [\n",
" {\"page\": 0, \"x\": 50, \"y\": 700, \"text\": \"Injected Text\", \"font_size\": 12, \"font\": \"Helvetica\", \"font_color\": (255, 0, 0)}, # Red text\n",
" {\"page\": 1, \"x\": 100, \"y\": 600, \"text\": \"Confidential\", \"font_size\": 10, \"font\": \"Helvetica\", \"font_color\": (0, 0, 255)} # Blue text\n",
"]\n",
"\n",
"# Define a simple string prompt (no templates)\n",
"prompt = \"This is a simple test string for PDF generation. No templates here!\"\n",
"\n",
"# Initialize the TextTarget (mock target for testing)\n",
"prompt_target = TextTarget()\n",
"\n",
"# Initialize the PDFConverter with the existing PDF and injection items\n",
"pdf_converter = PDFConverter(\n",
" prompt_template=None, # No template provided\n",
" font_type=\"Arial\",\n",
" font_size=12,\n",
" page_width=210,\n",
" page_height=297,\n",
" existing_pdf=existing_pdf, # Provide the existing PDF\n",
" injection_items=injection_items, # Provide the injection items\n",
")\n",
"\n",
"# Define the list of prompts as strings (required by PromptSendingOrchestrator)\n",
"prompts = [prompt]\n",
"\n",
"# Initialize the orchestrator\n",
"orchestrator = PromptSendingOrchestrator(\n",
" objective_target=prompt_target,\n",
" prompt_converters=[pdf_converter],\n",
" verbose=False,\n",
")\n",
"\n",
"# Run the orchestrator to modify the PDF and inspect the result\n",
"await orchestrator.send_prompts_async(prompt_list=prompts) # type: ignore\n",
"await orchestrator.print_conversations_async() # type: ignore"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6",
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -191,7 +234,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
"version": "3.11.9"
}
},
"nbformat": 4,
Expand Down
72 changes: 72 additions & 0 deletions doc/code/converters/pdf_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,5 +122,77 @@
await orchestrator.send_prompts_async(prompt_list=prompts) # type: ignore
await orchestrator.print_conversations_async() # type: ignore

# %% [markdown]
# # Modify Existing PDF with Injection Items
# %%
from pyrit.common.path import get_default_dbdata_path
from pyrit.common import initialize_pyrit, IN_MEMORY
from pyrit.prompt_converter import PDFConverter
from pyrit.prompt_target import TextTarget
from pyrit.orchestrator import PromptSendingOrchestrator
from io import BytesIO

initialize_pyrit(memory_db_type=IN_MEMORY)

dbdata_path = get_default_dbdata_path()
cv_pdf_path = dbdata_path / "volkan_kutal_cv.pdf" # Dynamically resolve the CV PDF path
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We might have to add a basic PDF to the repo for this?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1, under assets maybe


# Example: Load an existing PDF
with open(cv_pdf_path, "rb") as pdf_file:
existing_pdf = BytesIO(pdf_file.read())

# Define injection items
injection_items = [
{
"page": 0,
"x": 50,
"y": 700,
"text": "Injected Text",
"font_size": 12,
"font": "Helvetica",
"font_color": (255, 0, 0),
}, # Red text
{
"page": 1,
"x": 100,
"y": 600,
"text": "Confidential",
"font_size": 10,
"font": "Helvetica",
"font_color": (0, 0, 255),
}, # Blue text
]

# Define a simple string prompt (no templates)
prompt = "This is a simple test string for PDF generation. No templates here!"

# Initialize the TextTarget (mock target for testing)
prompt_target = TextTarget()

# Initialize the PDFConverter with the existing PDF and injection items
pdf_converter = PDFConverter(
prompt_template=None, # No template provided
font_type="Arial",
font_size=12,
page_width=210,
page_height=297,
existing_pdf=existing_pdf, # Provide the existing PDF
injection_items=injection_items, # Provide the injection items
)

# Define the list of prompts as strings (required by PromptSendingOrchestrator)
prompts = [prompt]

# Initialize the orchestrator
orchestrator = PromptSendingOrchestrator(
objective_target=prompt_target,
prompt_converters=[pdf_converter],
verbose=False,
)

# Run the orchestrator to modify the PDF and inspect the result
await orchestrator.send_prompts_async(prompt_list=prompts) # type: ignore
await orchestrator.print_conversations_async() # type: ignore

# %%
orchestrator.dispose_db_engine()
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ dependencies = [
"pyodbc>=5.1.0",
"pycountry>=24.6.1",
"python-dotenv>=1.0.1",
"pypdf==5.1.0",
"segno>=1.6.1",
"scikit-learn>=1.4.2",
"scipy>=1.14.1",
Expand Down
105 changes: 100 additions & 5 deletions pyrit/prompt_converter/pdf_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,21 +4,24 @@
from pyrit.common.logger import logger
from io import BytesIO
import ast
from typing import Optional
from typing import Optional, List, Dict

from fpdf import FPDF
from pypdf import PdfReader, PdfWriter, PageObject
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we have to update pyproject.toml to add this dependency and remove fpdf

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hello Roman,

Depending on the features we want, we need both dependencies: pypdf is used for reading and modifying existing PDFs (e.g., CVs), while fpdf is for creating PDFs from scratch. This allows us to create custom templates, as mentioned in the issue.

By keeping both, we would support:

  • Templated PDF Generation: YAML-based templates with SeedPrompt for dynamic data injection.
  • Non-Templated PDF Generation: Directly converting plain string prompts into PDFs.
  • Reading and Modifying PDFs: Supporting existing PDFs and post-creation edits.

I'd prefer to keep both dependencies since it allows us to handle not only CVs but also generate custom cover letters from scratch. It enables us to test injecting malicious input into both CVs and cover letters.

For example, we could let an LLM generate cover letters for us and then test how injecting malicious inputs into these cover letters might trick an AI recruiter, not just through CV injections but also through targeted manipulations of the cover letter.

Wdyt?


from pyrit.models import PromptDataType, SeedPrompt, data_serializer_factory
from pyrit.prompt_converter import PromptConverter, ConverterResult


class PDFConverter(PromptConverter):
"""
Converts a text prompt into a PDF file. Supports two modes:
Converts a text prompt into a PDF file. Supports various modes:
1. **Template-Based Generation**: If a `SeedPrompt` is provided, dynamic data can be injected into the
template using the `SeedPrompt.render_template_value` method, and the resulting content is converted to a PDF.
2. **Direct Text-Based Generation**: If no template is provided, the raw string prompt is converted directly
into a PDF.
3. **Modify Existing PDFs** (Overlay approach): Enables injecting text into existing PDFs at specified
coordinates, merging a new "overlay layer" onto the original PDF.

Args:
prompt_template (Optional[SeedPrompt], optional): A `SeedPrompt` object representing a template.
Expand All @@ -35,23 +38,29 @@ def __init__(
prompt_template: Optional[SeedPrompt] = None,
font_type: Optional[str] = "Arial",
font_size: Optional[int] = 12,
font_color: Optional[tuple] = (255, 255, 255),
KutalVolkan marked this conversation as resolved.
Show resolved Hide resolved
page_width: Optional[int] = 210,
page_height: Optional[int] = 297,
column_width: Optional[int] = 0,
row_height: Optional[int] = 10,
existing_pdf: Optional[BytesIO] = None,
KutalVolkan marked this conversation as resolved.
Show resolved Hide resolved
injection_items: Optional[List[Dict]] = None,
) -> None:
self._prompt_template = prompt_template
self._font_type = font_type
self._font_size = font_size
self._font_color = font_color
KutalVolkan marked this conversation as resolved.
Show resolved Hide resolved
self._page_width = page_width
self._page_height = page_height
self._column_width = column_width
self._row_height = row_height
self._existing_pdf = existing_pdf
self._injection_items = injection_items
KutalVolkan marked this conversation as resolved.
Show resolved Hide resolved

async def convert_async(self, *, prompt: str, input_type: PromptDataType = "text") -> ConverterResult:
"""
Converts the given prompt into a PDF. If a template is provided, it injects the prompt into the template,
otherwise, it generates a simple PDF with the prompt as the content.
otherwise, it generates a simple PDF with the prompt as the content. Further it can modify existing PDFs.

Args:
prompt (str): The prompt to be embedded in the PDF.
Expand All @@ -66,8 +75,11 @@ async def convert_async(self, *, prompt: str, input_type: PromptDataType = "text
# Step 1: Prepare content
content = self._prepare_content(prompt)

# Step 2: Generate PDF
pdf_bytes = self._generate_pdf(content)
# Step 2: Generate or modify the PDF (Overlay, if existing PDF)
if self._existing_pdf:
pdf_bytes = self._modify_existing_pdf()
else:
pdf_bytes = self._generate_pdf(content)

# Step 3: Serialize PDF
pdf_serializer = await self._serialize_pdf(pdf_bytes, content)
Expand Down Expand Up @@ -144,6 +156,89 @@ def _generate_pdf(self, content: str) -> BytesIO:
pdf_bytes.seek(0)
return pdf_bytes

def _modify_existing_pdf(self) -> BytesIO:
"""
Creates an overlay for each injection item and merges it onto the corresponding page
in the existing PDF.
"""
if not self._existing_pdf or not self._injection_items:
raise ValueError("Existing PDF and injection items are required for modification.")

reader = PdfReader(self._existing_pdf)
writer = PdfWriter()

for page_number, page in enumerate(reader.pages):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Some readers just read content once and then essentially don't keep the content in memory. Does this one work that way? If so, this will stop working if you call it twice. Just asking since it's not obvious to me.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The PdfReader loads the cross-reference table and essential metadata into memory during initialization, which allows for repeated access to reader.pages.

This behavior was verified through testing, which confirmed that reader.pages provides consistent results across multiple accesses.

def test_pdf_reader_repeated_access():
    # Create a simple PDF in memory
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    pdf.cell(200, 10, txt="Test Content", ln=True)
    pdf_bytes = BytesIO()
    pdf.output(pdf_bytes)
    pdf_bytes.seek(0)

    # Use PdfReader to read the PDF
    reader = PdfReader(pdf_bytes)
    first_access = reader.pages[0].extract_text()
    second_access = reader.pages[0].extract_text()

    # Assertions to verify behavior
    assert first_access == second_access, "Repeated access should return consistent data"

# For each item that belongs on this page, create and merge an overlay
for item in self._injection_items:
if item.get("page", 0) == page_number:
x = item.get("x", 10)
y = item.get("y", 10)
Comment on lines +174 to +175
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is 10 just an arbitrary location you chose or does it mean something?

Copy link
Contributor Author

@KutalVolkan KutalVolkan Jan 19, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The 10 is a default offset to avoid placing text at the edge of the page (0,0). It ensures better readability and positioning. While it serves a practical purpose, the choice of 10 is somewhat arbitrary and can be adjusted as needed.

text = item.get("text", "")
font = item.get("font", self._font_type)
font_size = item.get("font_size", self._font_size)
font_color = item.get("font_color", self._font_color)

# Create an overlay page and merge it
overlay_page = self._inject_text_into_page(page, x, y, text, font, font_size, font_color)
KutalVolkan marked this conversation as resolved.
Show resolved Hide resolved
page.merge_page(overlay_page)

writer.add_page(page)
KutalVolkan marked this conversation as resolved.
Show resolved Hide resolved

output_pdf = BytesIO()
writer.write(output_pdf)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

where is that written to?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The writer.write(output_pdf) method writes the contents of the PDF into the BytesIO object, output_pdf. A BytesIO object is an in-memory stream that mimics file-like behavior but stores its data in the system's RAM instead of writing it to disk.

output_pdf.seek(0)
return output_pdf
KutalVolkan marked this conversation as resolved.
Show resolved Hide resolved

def _inject_text_into_page(
    self, page: PageObject, x: float, y: float, text: str, font: str, font_size: int, font_color: tuple
) -> PageObject:
    """
    Generates a single-page PDF "overlay" that places `text` at coordinates (x, y).
    Returns that overlay page as a PageObject, which can then be merged onto the base page.

    The overlay is created with the same width and height as `page` (taken from its MediaBox).
    `page` itself is only read here; merging is left to the caller.

    Args:
        page (PageObject): The base page whose MediaBox determines the overlay size.
        x (float): Horizontal text position in points, measured from the left edge.
        y (float): Vertical text position in points, measured from the bottom edge.
        text (str): The text to draw on the overlay.
        font (str): Font family name passed to FPDF.
        font_size (int): Font size passed to FPDF.
        font_color (tuple): Text color as an (r, g, b) tuple.

    Returns:
        PageObject: The first (and only) page of the generated overlay PDF.

    Raises:
        ValueError: If `x` or `y` is NaN, negative, or larger than the page dimension,
            or if `font_color` cannot be unpacked into exactly three components.
    """
    import math  # local import so this method does not depend on a file-level import being added

    # Determine page size from the original page's MediaBox
    # NOTE(review): only the MediaBox width/height are used; if a MediaBox does not start at (0, 0)
    # the merged text may end up shifted relative to the visible page - confirm against real inputs.
    page_width = float(page.mediabox[2] - page.mediabox[0])
    page_height = float(page.mediabox[3] - page.mediabox[1])

    # Out-of-Bounds Checks (x first, then y). NaN is rejected explicitly because it compares
    # False against both limits and would otherwise slip through to FPDF.
    for axis, value, limit, dimension in (("x", x, page_width, "width"), ("y", y, page_height, "height")):
        if math.isnan(value):
            message = f"{axis}_pos is not a number and therefore out of bounds: {axis}={value}"
        elif value < 0:
            message = f"{axis}_pos is less than 0 and therefore out of bounds: {axis}={value}"
        elif value > limit:
            message = (
                f"{axis}_pos exceeds page {dimension} and is out of bounds: "
                f"{axis}={value}, page_{dimension}={limit}"
            )
        else:
            continue
        logger.error(message)
        raise ValueError(message)

    # Validate the color before building the overlay so a bad value fails fast with a clear message
    try:
        r, g, b = font_color
    except (TypeError, ValueError) as exc:
        message = f"font_color must be an (r, g, b) tuple, got: {font_color!r}"
        logger.error(message)
        raise ValueError(message) from exc

    # Create a small overlay PDF in memory, sized in points to match the base page
    overlay_pdf = FPDF(unit="pt", format=(page_width, page_height))
    overlay_pdf.add_page()

    # Set font and color
    overlay_pdf.set_font(font, size=font_size)
    overlay_pdf.set_text_color(r, g, b)

    # FPDF places (0, 0) at the top-left corner, while (x, y) is given relative to the
    # bottom-left corner, so the vertical coordinate is flipped.
    overlay_pdf.set_xy(x, page_height - y)

    # Insert the text
    overlay_pdf.cell(0, 0, text)

    # Round-trip the overlay through an in-memory buffer to obtain a pypdf PageObject
    overlay_buffer = BytesIO()
    overlay_pdf.output(overlay_buffer)
    overlay_buffer.seek(0)

    overlay_reader = PdfReader(overlay_buffer)
    return overlay_reader.pages[0]

async def _serialize_pdf(self, pdf_bytes: BytesIO, content: str):
"""
Serializes the generated PDF using a data serializer.
Expand Down
Loading