Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat/CR-1815-Convert PDF into Excel for Table Data Verification #77

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added convert_pdf_to_excel/.DS_Store
Binary file not shown.
Binary file not shown.
149 changes: 149 additions & 0 deletions convert_pdf_to_excel/convert_pdf_to_excel/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
<?xml version="1.0" encoding="UTF-8"?>
<project
xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.testsigma.addons</groupId>
<artifactId>convert_pdf_to_excel</artifactId>
<version>1.0.1</version>
<packaging>jar</packaging>

<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.source>11</maven.compiler.source>
<maven.compiler.target>11</maven.compiler.target>
<testsigma.sdk.version>1.2.13_cloud</testsigma.sdk.version>
<junit.jupiter.version>5.8.0-M1</junit.jupiter.version>
<testsigma.addon.maven.plugin>1.0.0</testsigma.addon.maven.plugin>
<maven.source.plugin.version>3.2.1</maven.source.plugin.version>
<lombok.version>1.18.20</lombok.version>

</properties>

<dependencies>
<dependency>
<groupId>com.testsigma</groupId>
<artifactId>testsigma-java-sdk</artifactId>
<version>${testsigma.sdk.version}</version>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>${lombok.version}</version>
<optional>true</optional>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-api</artifactId>
<version>${junit.jupiter.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.testng</groupId>
<artifactId>testng</artifactId>
<version>6.14.3</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.seleniumhq.selenium/selenium-java -->
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>4.14.1</version>
</dependency>
<!-- https://mvnrepository.com/artifact/io.appium/java-client -->
<dependency>
<groupId>io.appium</groupId>
<artifactId>java-client</artifactId>
<version>9.0.0</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-annotations</artifactId>
<version>2.13.0</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.14.0</version>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>3.0.1</version>
</dependency>
<!-- Apache POI -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>5.2.3</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>5.2.3</version>
</dependency>
<dependency>
<groupId>technology.tabula</groupId>
<artifactId>tabula</artifactId>
<version>1.0.4</version>
</dependency>

</dependencies>
<build>
<finalName>convert_pdf_to_excel</finalName>
<plugins>
<!--
  Single maven-shade-plugin declaration. The plugin was previously declared twice
  (once without configuration, once with); Maven requires plugin declarations to be
  unique per groupId:artifactId, so both are merged into this one.
-->
<plugin>
    <groupId>org.apache.maven.plugins</groupId>
    <artifactId>maven-shade-plugin</artifactId>
    <version>3.2.4</version>
    <executions>
        <execution>
            <phase>package</phase>
            <goals>
                <goal>shade</goal>
            </goals>
            <configuration>
                <!-- Drop dependency classes that the addon does not reference. -->
                <minimizeJar>true</minimizeJar>
                <filters>
                    <filter>
                        <artifact>*:*</artifact>
                        <!-- Strip signature files of signed dependencies; they no longer
                             match the repackaged jar contents. -->
                        <excludes>
                            <exclude>META-INF/*.SF</exclude>
                            <exclude>META-INF/*.DSA</exclude>
                            <exclude>META-INF/*.RSA</exclude>
                        </excludes>
                    </filter>
                </filters>
            </configuration>
        </execution>
    </executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-source-plugin</artifactId>
<version>${maven.source.plugin.version}</version>
<executions>
<execution>
<id>attach-sources</id>
<goals>
<goal>jar</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
package com.testsigma.addons.web;

import com.testsigma.sdk.ApplicationType;
import com.testsigma.sdk.Result;
import com.testsigma.sdk.WebAction;
import com.testsigma.sdk.annotation.Action;
import com.testsigma.sdk.annotation.TestData;
import com.testsigma.sdk.annotation.RunTimeData;
import lombok.Data;
import org.apache.pdfbox.Loader;
import org.apache.poi.ss.usermodel.*;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import technology.tabula.*;
import technology.tabula.Table;
import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;

import java.io.*;
import java.net.URL;
import java.nio.file.*;
import java.util.List;

@Data
@Action(actionText = "PDF: Convert PDF file at pdf-file-path to Excel and save as output-file-name, storing path in runtime-variable variable-name",
description = "Extracts content from PDF and converts to Excel, storing path in a runtime variable and it will work in local execution",
applicationType = ApplicationType.WEB)
public class PdfToExcelConverter extends WebAction {

@TestData(reference = "pdf-file-path")
private com.testsigma.sdk.TestData pdfFilePath;

@TestData(reference = "output-file-name")
private com.testsigma.sdk.TestData outputFileName;

@TestData(reference = "variable-name", isRuntimeVariable = true)
private com.testsigma.sdk.TestData runtimeVariable;

@RunTimeData
private com.testsigma.sdk.RunTimeData runTimeData;

@Override
protected Result execute() {
Result result = Result.SUCCESS;
try {
String pdfPath = String.valueOf(pdfFilePath.getValue());
String fileName = String.valueOf(outputFileName.getValue());

File pdfFile;
Path pdfParentPath;

if (pdfPath.startsWith("http://") || pdfPath.startsWith("https://")) {
// If the path is a URL, download the PDF and use its filename for output
pdfFile = downloadPdf(pdfPath);
pdfParentPath = pdfFile.toPath().getParent(); // Get parent of the downloaded temp file
} else {
// Else it's a local file
pdfFile = new File(pdfPath);
if (!pdfFile.exists()) {
throw new IllegalArgumentException("PDF file does not exist: " + pdfPath);
}
pdfParentPath = pdfFile.toPath().getParent(); // Get parent of local file
}

// Ensure .xlsx extension
String outputFileNameWithExtension = fileName.endsWith(".xlsx") ? fileName : fileName + ".xlsx";

// Combine parent path with the filename to ensure it's in the same directory
Path outputPath = pdfParentPath.resolve(outputFileNameWithExtension);
String absoluteOutputPath = outputPath.toAbsolutePath().toString();

// Load PDF and extract data
ObjectExtractor extractor = new ObjectExtractor(Loader.loadPDF(pdfFile));
SpreadsheetExtractionAlgorithm algo = new SpreadsheetExtractionAlgorithm();

// Extract tables from the first page
Page page = extractor.extract(1); // First page
List<Table> tables = algo.extract(page);

// Create Excel workbook
Workbook workbook = new XSSFWorkbook();
Sheet sheet = workbook.createSheet("PDF Data");

// Fill the sheet with table data
int rowIndex = 0;
for (Table table : tables) {
for (List<RectangularTextContainer> row : table.getRows()) {
Row excelRow = sheet.createRow(rowIndex++);
int colIndex = 0;
for (RectangularTextContainer cell : row) {
Cell excelCell = excelRow.createCell(colIndex++);
excelCell.setCellValue(cell.getText());
}
}
}

// Write to Excel file
try (FileOutputStream fileOut = new FileOutputStream(absoluteOutputPath)) {
workbook.write(fileOut);
}

// Store output path in runtime variable
runTimeData.setKey(String.valueOf(runtimeVariable.getValue()));
runTimeData.setValue(absoluteOutputPath);

logger.info("PDF converted successfully: " + absoluteOutputPath);
setSuccessMessage("Successfully converted PDF to Excel at " + absoluteOutputPath);

// Delete the downloaded file in case of URL
if (pdfPath.startsWith("http://") || pdfPath.startsWith("https://")) {
pdfFile.delete();
}

} catch (Exception e) {
logger.warn("PDF conversion failed: " + e);
setErrorMessage("PDF conversion error: " + e.getMessage());
result = Result.FAILED;
}
return result;
}

private File downloadPdf(String pdfUrl) throws IOException {
// Download PDF file from URL and save locally
URL url = new URL(pdfUrl);
String fileName = Paths.get(url.getPath()).getFileName().toString();
File tempFile = File.createTempFile("downloaded-", fileName);
try (InputStream in = url.openStream();
OutputStream out = new FileOutputStream(tempFile)) {
byte[] buffer = new byte[1024];
int bytesRead;
while ((bytesRead = in.read(buffer)) != -1) {
out.write(buffer, 0, bytesRead);
}
}
return tempFile;
}
}
Loading