-
Notifications
You must be signed in to change notification settings - Fork 399
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
FEAT: Enhance PDFConverter to support text injection into existing PDFs #641
base: main
Are you sure you want to change the base?
Changes from all commits
b5ee6c3
a4bfd30
47412c6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,21 +4,24 @@ | |
from pyrit.common.logger import logger | ||
from io import BytesIO | ||
import ast | ||
from typing import Optional | ||
from typing import Optional, List, Dict | ||
|
||
from fpdf import FPDF | ||
from pypdf import PdfReader, PdfWriter, PageObject | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we have to update pyproject.toml to add this dependency and remove fpdf There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hello Roman, Depending on the features we want, we need both dependencies, By keeping both, we would support:
I'd prefer to keep both dependencies since it allows us to handle not only CVs but also generate custom cover letters from scratch. It enables us to test injecting malicious input into both CVs and cover letters. For example, we could let an LLM generate cover letters for us and then test how injecting malicious inputs into these cover letters might trick an AI recruiter, not just through CV injections but also through targeted manipulations of the cover letter. Wdyt? |
||
|
||
from pyrit.models import PromptDataType, SeedPrompt, data_serializer_factory | ||
from pyrit.prompt_converter import PromptConverter, ConverterResult | ||
|
||
|
||
class PDFConverter(PromptConverter): | ||
""" | ||
Converts a text prompt into a PDF file. Supports two modes: | ||
Converts a text prompt into a PDF file. Supports various modes: | ||
1. **Template-Based Generation**: If a `SeedPrompt` is provided, dynamic data can be injected into the | ||
template using the `SeedPrompt.render_template_value` method, and the resulting content is converted to a PDF. | ||
2. **Direct Text-Based Generation**: If no template is provided, the raw string prompt is converted directly | ||
into a PDF. | ||
3. **Modify Existing PDFs** (Overlay approach): Enables injecting text into existing PDFs at specified | ||
coordinates, merging a new "overlay layer" onto the original PDF. | ||
|
||
Args: | ||
prompt_template (Optional[SeedPrompt], optional): A `SeedPrompt` object representing a template. | ||
|
@@ -35,23 +38,29 @@ def __init__( | |
prompt_template: Optional[SeedPrompt] = None, | ||
font_type: Optional[str] = "Arial", | ||
font_size: Optional[int] = 12, | ||
font_color: Optional[tuple] = (255, 255, 255), | ||
KutalVolkan marked this conversation as resolved.
Show resolved
Hide resolved
|
||
page_width: Optional[int] = 210, | ||
page_height: Optional[int] = 297, | ||
column_width: Optional[int] = 0, | ||
row_height: Optional[int] = 10, | ||
existing_pdf: Optional[BytesIO] = None, | ||
KutalVolkan marked this conversation as resolved.
Show resolved
Hide resolved
|
||
injection_items: Optional[List[Dict]] = None, | ||
) -> None: | ||
self._prompt_template = prompt_template | ||
self._font_type = font_type | ||
self._font_size = font_size | ||
self._font_color = font_color | ||
KutalVolkan marked this conversation as resolved.
Show resolved
Hide resolved
|
||
self._page_width = page_width | ||
self._page_height = page_height | ||
self._column_width = column_width | ||
self._row_height = row_height | ||
self._existing_pdf = existing_pdf | ||
self._injection_items = injection_items | ||
KutalVolkan marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
async def convert_async(self, *, prompt: str, input_type: PromptDataType = "text") -> ConverterResult: | ||
""" | ||
Converts the given prompt into a PDF. If a template is provided, it injects the prompt into the template, | ||
otherwise, it generates a simple PDF with the prompt as the content. | ||
otherwise, it generates a simple PDF with the prompt as the content. Further it can modify existing PDFs. | ||
|
||
Args: | ||
prompt (str): The prompt to be embedded in the PDF. | ||
|
@@ -66,8 +75,11 @@ async def convert_async(self, *, prompt: str, input_type: PromptDataType = "text | |
# Step 1: Prepare content | ||
content = self._prepare_content(prompt) | ||
|
||
# Step 2: Generate PDF | ||
pdf_bytes = self._generate_pdf(content) | ||
# Step 2: Generate or modify the PDF (Overlay, if existing PDF) | ||
if self._existing_pdf: | ||
pdf_bytes = self._modify_existing_pdf() | ||
else: | ||
pdf_bytes = self._generate_pdf(content) | ||
|
||
# Step 3: Serialize PDF | ||
pdf_serializer = await self._serialize_pdf(pdf_bytes, content) | ||
|
@@ -144,6 +156,89 @@ def _generate_pdf(self, content: str) -> BytesIO: | |
pdf_bytes.seek(0) | ||
return pdf_bytes | ||
|
||
def _modify_existing_pdf(self) -> BytesIO: | ||
""" | ||
Creates an overlay for each injection item and merges it onto the corresponding page | ||
in the existing PDF. | ||
""" | ||
if not self._existing_pdf or not self._injection_items: | ||
raise ValueError("Existing PDF and injection items are required for modification.") | ||
|
||
reader = PdfReader(self._existing_pdf) | ||
writer = PdfWriter() | ||
|
||
for page_number, page in enumerate(reader.pages): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Some readers just read content once and then essentially don't keep the content in memory. Does this one work that way? If so, this will stop working if you call it twice. Just asking since it's not obvious to me. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The This behavior was verified through testing, which confirmed that def test_pdf_reader_repeated_access():
# Create a simple PDF in memory
pdf = FPDF()
pdf.add_page()
pdf.set_font("Arial", size=12)
pdf.cell(200, 10, txt="Test Content", ln=True)
pdf_bytes = BytesIO()
pdf.output(pdf_bytes)
pdf_bytes.seek(0)
# Use PdfReader to read the PDF
reader = PdfReader(pdf_bytes)
first_access = reader.pages[0].extract_text()
second_access = reader.pages[0].extract_text()
# Assertions to verify behavior
assert first_access == second_access, "Repeated access should return consistent data" |
||
# For each item that belongs on this page, create and merge an overlay | ||
for item in self._injection_items: | ||
if item.get("page", 0) == page_number: | ||
x = item.get("x", 10) | ||
y = item.get("y", 10) | ||
Comment on lines
+174
to
+175
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is 10 just an arbitrary location you chose or does it mean something? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The |
||
text = item.get("text", "") | ||
font = item.get("font", self._font_type) | ||
font_size = item.get("font_size", self._font_size) | ||
font_color = item.get("font_color", self._font_color) | ||
|
||
# Create an overlay page and merge it | ||
overlay_page = self._inject_text_into_page(page, x, y, text, font, font_size, font_color) | ||
KutalVolkan marked this conversation as resolved.
Show resolved
Hide resolved
|
||
page.merge_page(overlay_page) | ||
|
||
writer.add_page(page) | ||
KutalVolkan marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
output_pdf = BytesIO() | ||
writer.write(output_pdf) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. where is that written to? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The |
||
output_pdf.seek(0) | ||
return output_pdf | ||
KutalVolkan marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
def _inject_text_into_page( | ||
self, page: PageObject, x: float, y: float, text: str, font: str, font_size: int, font_color: tuple | ||
) -> PageObject: | ||
""" | ||
Generates a single-page PDF "overlay" that places `text` at coordinates (x, y). | ||
Returns that overlay page as a PageObject, which can then be merged onto the base page. | ||
""" | ||
# Determine page size from the original page's MediaBox | ||
page_width = float(page.mediabox[2] - page.mediabox[0]) | ||
page_height = float(page.mediabox[3] - page.mediabox[1]) | ||
|
||
# Out-of-Bounds Checks | ||
if x < 0: | ||
logger.error(f"x_pos is less than 0 and therefore out of bounds: x={x}") | ||
raise ValueError(f"x_pos is less than 0 and therefore out of bounds: x={x}") | ||
if x > page_width: | ||
logger.error(f"x_pos exceeds page width and is out of bounds: x={x}, page_width={page_width}") | ||
raise ValueError(f"x_pos exceeds page width and is out of bounds: x={x}, page_width={page_width}") | ||
if y < 0: | ||
logger.error(f"y_pos is less than 0 and therefore out of bounds: y={y}") | ||
raise ValueError(f"y_pos is less than 0 and therefore out of bounds: y={y}") | ||
if y > page_height: | ||
logger.error(f"y_pos exceeds page height and is out of bounds: y={y}, page_height={page_height}") | ||
raise ValueError(f"y_pos exceeds page height and is out of bounds: y={y}, page_height={page_height}") | ||
|
||
# Create a small overlay PDF in memory | ||
overlay_pdf = FPDF(unit="pt", format=(page_width, page_height)) | ||
overlay_pdf.add_page() | ||
|
||
# Set font | ||
overlay_pdf.set_font(font, size=font_size) | ||
r, g, b = font_color | ||
overlay_pdf.set_text_color(r, g, b) | ||
|
||
# Position text: FPDF starts (0,0) at top-left, so (x, y) from bottom-left | ||
# means we do "set_xy(x, page_height - y)" if your coordinates assume bottom-left origin | ||
overlay_pdf.set_xy(x, page_height - y) | ||
|
||
# Insert the text | ||
overlay_pdf.cell(0, 0, text) | ||
|
||
# Convert overlay PDF to a PageObject | ||
overlay_buffer = BytesIO() | ||
overlay_pdf.output(overlay_buffer) | ||
overlay_buffer.seek(0) | ||
|
||
overlay_reader = PdfReader(overlay_buffer) | ||
overlay_page = overlay_reader.pages[0] | ||
KutalVolkan marked this conversation as resolved.
Show resolved
Hide resolved
|
||
return overlay_page | ||
|
||
async def _serialize_pdf(self, pdf_bytes: BytesIO, content: str): | ||
""" | ||
Serializes the generated PDF using a data serializer. | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We might have to add a basic PDF to the repo for this?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
+1, under
assets
maybe