From fb1f238a25e4be680ab92ae5684583857c33ddbe Mon Sep 17 00:00:00 2001 From: Tilman Hausherr Date: Tue, 17 Dec 2024 16:32:00 +0100 Subject: [PATCH] TIKA-2342: support PDFBox IgnoreContentStreamSpaceGlyphs; add test; remove dead code line --- .../org/apache/tika/parser/pdf/PDFParser.java | 18 ++++++++++ .../tika/parser/pdf/PDFParserConfig.java | 23 ++++++++++++ .../apache/tika/parser/pdf/PDFParserTest.java | 33 +++++++++++++++++- .../testContentStreamSpaceGlyphs.pdf | Bin 0 -> 805 bytes 4 files changed, 73 insertions(+), 1 deletion(-) create mode 100644 tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/test-documents/testContentStreamSpaceGlyphs.pdf diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java index 77ccb9231a..13d09e6040 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java @@ -759,6 +759,24 @@ public boolean isSuppressDuplicateOverlappingText() { return defaultConfig.isSuppressDuplicateOverlappingText(); } + /** + * If true, the parser should ignore spaces in the content stream and rely purely on the + * algorithm to determine where word breaks are (PDFBOX-3774). This can improve text extraction + * results where the content stream is sorted by position and has text overlapping spaces, but + * could cause some word breaks to not be added to the output. By default this is disabled. 
+ */ + @Field + public void setIgnoreContentStreamSpaceGlyphs(boolean v) { + defaultConfig.setIgnoreContentStreamSpaceGlyphs(v); + } + + /** + * @see #setIgnoreContentStreamSpaceGlyphs(boolean) + */ + public boolean isIgnoreContentStreamSpaceGlyphs() { + return defaultConfig.isIgnoreContentStreamSpaceGlyphs(); + } + /** * If true, the parser should try to remove duplicated * text over the same region. This is needed for some diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java index d1f93fc410..af6213ba9b 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java @@ -65,6 +65,9 @@ public ImageType getImageType() { // True if we let PDFBox remove duplicate overlapping text: private boolean suppressDuplicateOverlappingText = false; + // True if we let PDFBox ignore spaces in the content stream and rely purely on the algorithm: + private boolean ignoreContentStreamSpaceGlyphs = false; + // True if we extract annotation text ourselves // (workaround for PDFBOX-1143): private boolean extractAnnotationText = true; @@ -223,6 +226,8 @@ public void configure(PDF2XHTML pdf2XHTML) { pdf2XHTML.setDropThreshold(dropThreshold); } pdf2XHTML.setSuppressDuplicateOverlappingText(isSuppressDuplicateOverlappingText()); + // TODO TIKA-2342 activate after PDFBox release + //pdf2XHTML.setIgnoreContentStreamSpaceGlyphs(isIgnoreContentStreamSpaceGlyphs()); } /** @@ -404,6 +409,24 @@ public void setSuppressDuplicateOverlappingText(boolean suppressDuplicateOverlap 
userConfigured.add("suppressDuplicateOverlappingText"); } + /** + * @see #setIgnoreContentStreamSpaceGlyphs(boolean) + */ + public boolean isIgnoreContentStreamSpaceGlyphs() { + return ignoreContentStreamSpaceGlyphs; + } + + /** + * If true, the parser should ignore spaces in the content stream and rely purely on the + * algorithm to determine where word breaks are (PDFBOX-3774). This can improve text extraction + * results where the content stream is sorted by position and has text overlapping spaces, but + * could cause some word breaks to not be added to the output. By default this is disabled. + */ + public void setIgnoreContentStreamSpaceGlyphs(boolean ignoreContentStreamSpaceGlyphs) { + this.ignoreContentStreamSpaceGlyphs = ignoreContentStreamSpaceGlyphs; + userConfigured.add("ignoreContentStreamSpaceGlyphs"); + } + /** * @see #setExtractAnnotationText(boolean) */ diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index e3cfe54156..b4c22d0613 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -465,6 +465,37 @@ public void testDuplicateOverlappingText() throws Exception { } + // TODO TIKA-2342 activate after PDFBox release + // @Test + public void testIgnoreContentStreamSpaceGlyphs() throws Exception { + PDFParser parser = new PDFParser(); + // Default is false (keep spaces, don't sort): + XMLResult r = getXML("testContentStreamSpaceGlyphs.pdf", parser); + assertContains("( )overlap", r.xml); + + 
parser.getPDFParserConfig().setIgnoreContentStreamSpaceGlyphs(true); + r = getXML("testContentStreamSpaceGlyphs.pdf", parser); + assertContains("( )overlap", r.xml); + parser.getPDFParserConfig().setSortByPosition(true); + r = getXML("testContentStreamSpaceGlyphs.pdf", parser); + assertContains("( overlap )", r.xml); + + //now try with autodetect + ParseContext context = new ParseContext(); + PDFParserConfig config = new PDFParserConfig(); + context.set(PDFParserConfig.class, config); + r = getXML("testContentStreamSpaceGlyphs.pdf", context); + // Default is false (keep spaces, don't sort): + assertContains("( )overlap", r.xml); + + config.setIgnoreContentStreamSpaceGlyphs(true); + r = getXML("testContentStreamSpaceGlyphs.pdf", context); + assertContains("( )overlap", r.xml); + config.setSortByPosition(true); + r = getXML("testContentStreamSpaceGlyphs.pdf", context); + assertContains("( overlap )", r.xml); + } + @Test public void testSortByPosition() throws Exception { PDFParser parser = new PDFParser(); @@ -499,7 +530,7 @@ public void testSortByPosition() throws Exception { config.setSortByPosition(true); context.set(PDFParserConfig.class, config); - stream = getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf"); + //stream = getResourceAsStream("/test-documents/testPDFTwoTextBoxes.pdf"); content = getText("testPDFTwoTextBoxes.pdf", new Metadata(), context); content = content.replaceAll("\\s+", " "); // Column text is now interleaved: diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/test-documents/testContentStreamSpaceGlyphs.pdf b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/test-documents/testContentStreamSpaceGlyphs.pdf new file mode 100644 index 0000000000000000000000000000000000000000..1a73dc4e2c1c5d53a466259125d1dde5c9b48b26 GIT binary patch literal 805 zcmY!laBZ^4=fsl4ocwey{jk)c;>`R! 
z1$~fe{eZ;u)M5oApz5UAIVGt@3i@t2i6yBn zsmb{%sUS_oB}J);xx8EzbBehhUMe>Z*$}|t`Pj%bz`)!vh{N;YrH+dM3S37rvb0)K z3U{^@ST^z)1sRwy&rHc%mY$H6lfYuGzh7sohWbW^WfBbcL&cu+f*b-d7Um>NOeYx` z8pC|&pOh6`lFOyk zZGN;qzi*q_g4Wfw$#(^}Y|eYu^w;zs^POA5?bA=q@A=JK$sEy>-H9da3=J>?&lng` zLHYS53ZSGC#HH`)q7ZFkWM*VwWbABeW@u<>V(4PvX6)wV=4xbaX5eID=xE_$r$AUq zEG$kUf>P7C^n)|2QWXphffnSYq*f?I8z>kW0F{I*L>mIFHUu)UMHfn{N>2E3{>1qM zM>t)2eCGK0%sJyTN5^N*nw~nI9)3v%)g?@mu=u&SB(bQZq6p{&QxiijRaIAiH!c7? CX&n&& literal 0 HcmV?d00001