Skip to content

Commit

Permalink
Use PDFBox 3 as default text converter; fallback to 1.8 in case of errors
Browse files Browse the repository at this point in the history

Issue: #4449
  • Loading branch information
buchen committed Jan 6, 2025
1 parent b898083 commit 78a48b0
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 33 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,7 @@ public void execute(@Named(IServiceConstants.ACTIVE_PART) MPart part,

StringBuilder textBuilder = new StringBuilder();
textBuilder.append("```").append("\n");
textBuilder.append("PDFBox Version: ")
.append(inputFile.getPDFBoxVersion().toString()) //
textBuilder.append("PDFBox Version: ").append(inputFile.getPDFBoxVersion()) //
.append("\n");
textBuilder.append("Portfolio Performance Version: ")
.append(PortfolioPlugin.getDefault().getBundle().getVersion().toString()) //
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import org.eclipse.core.runtime.IProgressMonitor;

import name.abuchen.portfolio.Messages;
import name.abuchen.portfolio.PortfolioLog;
import name.abuchen.portfolio.datatransfer.Extractor;
import name.abuchen.portfolio.datatransfer.Extractor.Item;
import name.abuchen.portfolio.datatransfer.SecurityCache;
Expand Down Expand Up @@ -164,6 +165,27 @@ public Map<Extractor, List<Item>> run(IProgressMonitor monitor, Map<File, List<E
}
}

if (!extracted)
{
inputFile.convertLegacyPDFtoText();
for (Extractor extractor : extractors)
{
List<Item> items = extractor.extract(securityCache, inputFile, warnings);

if (!items.isEmpty())
{
extracted = true;
itemsByExtractor.computeIfAbsent(extractor, e -> new ArrayList<Item>()).addAll(items);
break;
}
}

if (extracted)
{
PortfolioLog.info("PDF successfully imported with PDFBox 1.8.x " + inputFile.getName()); //$NON-NLS-1$
}
}

if (!extracted)
{
Predicate<? super Exception> isNotUnsupportedOperation = e -> !(e instanceof UnsupportedOperationException);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,14 @@
import java.util.List;
import java.util.Scanner;

import org.apache.pdfbox.exceptions.CryptographyException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.osgi.framework.FrameworkUtil;
import org.osgi.framework.Version;

import name.abuchen.portfolio.datatransfer.Extractor;
import name.abuchen.portfolio.pdfbox1.PDFBox1Adapter;
import name.abuchen.portfolio.pdfbox3.PDFBox3Adapter;

public class PDFInputFile extends Extractor.InputFile
{
private String text;
private String version;

public PDFInputFile(File file)
{
Expand All @@ -27,8 +24,7 @@ public PDFInputFile(File file)
/* protected */ PDFInputFile(File file, String extractedText)
{
this(file);
this.text = extractedText;
this.text = withoutHorizontalWhitespace(extractedText);
this.text = sanitize(extractedText);
}

public static List<Extractor.InputFile> loadTestCase(Class<?> testCase, String... filenames)
Expand Down Expand Up @@ -62,37 +58,32 @@ public String getText()
return text;
}

public Version getPDFBoxVersion()
public String getPDFBoxVersion()
{
return FrameworkUtil.getBundle(PDDocument.class).getVersion();
return version;
}

public void convertPDFtoText() throws IOException
{
try (PDDocument document = PDDocument.load(getFile()))
{
boolean isProtected = document.isEncrypted();
if (isProtected)
{
document.decrypt(""); //$NON-NLS-1$
document.setAllSecurityToBeRemoved(true);
}

PDFTextStripper textStripper = new PDFTextStripper();
textStripper.setSortByPosition(true);
text = textStripper.getText(document);

text = withoutHorizontalWhitespace(text);
}
catch (CryptographyException e)
{
throw new IOException(e);
}
var adapter = new PDFBox3Adapter();

text = sanitize(adapter.convertToText(getFile()));
version = adapter.getPDFBoxVersion();
}

/**
 * Converts the PDF file to text using the {@link PDFBox1Adapter} instead of
 * the {@link PDFBox3Adapter}.
 * <p>
 * The text returned by the adapter is passed through {@code sanitize} and
 * stored in {@code text}; {@code version} is set to the PDFBox version
 * reported by the adapter.
 *
 * @throws IOException
 *             declared on this method; presumably raised by the adapter
 *             when the file cannot be read or parsed (verify against
 *             {@code PDFBox1Adapter.convertToText})
 */
public void convertLegacyPDFtoText() throws IOException
{
var adapter = new PDFBox1Adapter();

// both fields are updated together so that the reported version matches
// the adapter that produced the text
text = sanitize(adapter.convertToText(getFile()));
version = adapter.getPDFBoxVersion();
}

private String withoutHorizontalWhitespace(String s)
@SuppressWarnings("nls")
private String sanitize(String s)
{
// replace horizontal whitespace characters by normal whitespace
return s.replaceAll("\\h", " "); //$NON-NLS-1$ //$NON-NLS-2$
// without carriage returns
return s.replaceAll("\\h", " ").replace("\r", "");
}
}

0 comments on commit 78a48b0

Please sign in to comment.