From bfe0aa05193857142ed4b2af3e971eeb51cdcd16 Mon Sep 17 00:00:00 2001 From: iadcode Date: Thu, 27 Jan 2022 11:03:02 +1100 Subject: [PATCH 1/3] Add option to set path to custom tika config file --- .../elasticsearch/crawler/fs/settings/Fs.java | 28 ++++- .../crawler/fs/tika/TikaInstance.java | 112 ++++++++++-------- 2 files changed, 87 insertions(+), 53 deletions(-) diff --git a/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java b/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java index 5f4da1199..fdfd5bfaf 100644 --- a/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java +++ b/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java @@ -58,6 +58,7 @@ public class Fs { private Ocr ocr = new Ocr(); private ByteSizeValue ignoreAbove = null; private boolean followSymlinks = false; + private String tikaConfigPath = null; public static Builder builder() { return new Builder(); @@ -91,6 +92,7 @@ public static class Builder { private Ocr ocr = new Ocr(); private ByteSizeValue ignoreAbove = null; private boolean followSymlinks = false; + private String tikaConfigPath = null; public Builder setUrl(String url) { this.url = url; @@ -246,10 +248,16 @@ public Builder setFollowSymlinks(boolean followSymlinks) { return this; } + public Builder setTikaConfigPath(String tikaConfigPath) { + this.tikaConfigPath = tikaConfigPath; + return this; + } + public Fs build() { return new Fs(url, updateRate, includes, excludes, filters, jsonSupport, filenameAsId, addFilesize, removeDeleted, addAsInnerObject, storeSource, indexedChars, indexContent, attributesSupport, rawMetadata, - checksum, xmlSupport, indexFolders, langDetect, continueOnError, ocr, ignoreAbove, followSymlinks); + checksum, xmlSupport, indexFolders, langDetect, continueOnError, ocr, ignoreAbove, followSymlinks, + tikaConfigPath); } } @@ -260,7 +268,8 @@ public Fs( ) { private Fs(String url, TimeValue updateRate, List includes, List excludes, List filters, boolean jsonSupport, boolean filenameAsId, boolean addFilesize, boolean removeDeleted, boolean addAsInnerObject, boolean storeSource, Percentage indexedChars, boolean indexContent, boolean attributesSupport, boolean rawMetadata, String checksum, boolean xmlSupport, - boolean indexFolders, boolean langDetect, boolean continueOnError, Ocr ocr, ByteSizeValue ignoreAbove, boolean followSymlinks) { + boolean indexFolders, boolean langDetect, boolean continueOnError, Ocr ocr, ByteSizeValue ignoreAbove, boolean followSymlinks, + String tikaConfigPath) { this.url = url; this.updateRate = updateRate; this.includes = includes; @@ -284,6 +293,7 @@ private Fs(String url, TimeValue updateRate, List includes, List this.ocr = ocr; this.ignoreAbove = ignoreAbove; this.followSymlinks = followSymlinks; + this.tikaConfigPath = tikaConfigPath; } public String getUrl() { @@ -486,6 +496,14 @@ public void setFollowSymlinks(boolean followSymlinks) { this.followSymlinks = followSymlinks; } + public String getTikaConfigPath() { + return tikaConfigPath; + } + + public void setTikaConfigPath(String tikaConfigPath) { + this.tikaConfigPath = tikaConfigPath; + } + @Override public boolean equals(Object o) { if (this == o) return true; @@ -513,14 +531,15 @@ public boolean equals(Object o) { Objects.equals(indexedChars, fs.indexedChars) && Objects.equals(checksum, fs.checksum) && Objects.equals(ocr, fs.ocr) && - Objects.equals(ignoreAbove, fs.ignoreAbove); + Objects.equals(ignoreAbove, fs.ignoreAbove) && + Objects.equals(tikaConfigPath, fs.tikaConfigPath); } @Override public int hashCode() { return Objects.hash(url, updateRate, includes, excludes, filters, jsonSupport, filenameAsId, addFilesize, removeDeleted, addAsInnerObject, storeSource, indexContent, indexedChars, attributesSupport, rawMetadata, xmlSupport, - checksum, indexFolders, langDetect, continueOnError, ocr, ignoreAbove, followSymlinks); + checksum, indexFolders, langDetect, continueOnError, ocr, ignoreAbove, followSymlinks, tikaConfigPath); } @Override @@ -548,6 +567,7 @@ public String toString() { ", ocr=" + ocr + ", ignoreAbove=" + ignoreAbove + ", followSymlinks=" + followSymlinks + + ", tikaConfigPath=" + tikaConfigPath + '}'; } } diff --git a/tika/src/main/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaInstance.java b/tika/src/main/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaInstance.java index 097a4906d..159fcb585 100644 --- a/tika/src/main/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaInstance.java +++ b/tika/src/main/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaInstance.java @@ -25,6 +25,7 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.apache.tika.config.ServiceLoader; +import org.apache.tika.config.TikaConfig; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.exception.TikaException; import org.apache.tika.exception.WriteLimitReachedException; @@ -43,6 +44,7 @@ import org.apache.tika.sax.WriteOutContentHandler; import org.xml.sax.SAXException; +import java.io.File; import java.io.IOException; import java.io.InputStream; import java.util.Arrays; @@ -81,63 +83,75 @@ private static void initTika(Fs fs) { private static void initParser(Fs fs) { if (parser == null) { - PDFParser pdfParser = new PDFParser(); - DefaultParser defaultParser; - TesseractOCRParser ocrParser; - - // To solve https://issues.apache.org/jira/browse/TIKA-3364 - // PDF content might be extracted multiple times. - pdfParser.getPDFParserConfig().setExtractBookmarksText(false); - - if (ocrActivated) { - logger.debug("OCR is activated."); - ocrParser = new TesseractOCRParser(); - if (fs.getOcr().getPath() != null) { - logger.debug("Tesseract Path set to [{}].", fs.getOcr().getPath()); - ocrParser.setTesseractPath(fs.getOcr().getPath()); - } - if (fs.getOcr().getDataPath() != null) { - logger.debug("Tesseract Data Path set to [{}].", fs.getOcr().getDataPath()); - ocrParser.setTessdataPath(fs.getOcr().getDataPath()); - } + if (fs.getTikaConfigPath() != null && (new File(fs.getTikaConfigPath())).exists()) { + logger.info("Using custom tika configuration."); + TikaConfig config = null; try { - if (ocrParser.hasTesseract()) { - logger.debug("OCR strategy for PDF documents is [{}] and tesseract was found.", fs.getOcr().getPdfStrategy()); - pdfParser.setOcrStrategy(fs.getOcr().getPdfStrategy()); - } else { - logger.debug("But Tesseract is not installed so we won't run OCR."); + config = new TikaConfig(fs.getTikaConfigPath()); + } catch (TikaException|IOException|SAXException e) { + e.printStackTrace(); + } + + parser = new AutoDetectParser(config); + } else { + PDFParser pdfParser = new PDFParser(); + DefaultParser defaultParser; + TesseractOCRParser ocrParser; + + // To solve https://issues.apache.org/jira/browse/TIKA-3364 + // PDF content might be extracted multiple times. + pdfParser.getPDFParserConfig().setExtractBookmarksText(false); + + if (ocrActivated) { + logger.debug("OCR is activated."); + ocrParser = new TesseractOCRParser(); + if (fs.getOcr().getPath() != null) { + logger.debug("Tesseract Path set to [{}].", fs.getOcr().getPath()); + ocrParser.setTesseractPath(fs.getOcr().getPath()); + } + if (fs.getOcr().getDataPath() != null) { + logger.debug("Tesseract Data Path set to [{}].", fs.getOcr().getDataPath()); + ocrParser.setTessdataPath(fs.getOcr().getDataPath()); + } + try { + if (ocrParser.hasTesseract()) { + logger.debug("OCR strategy for PDF documents is [{}] and tesseract was found.", fs.getOcr().getPdfStrategy()); + pdfParser.setOcrStrategy(fs.getOcr().getPdfStrategy()); + } else { + logger.debug("But Tesseract is not installed so we won't run OCR."); + ocrActivated = false; + pdfParser.setOcrStrategy("no_ocr"); + } + } catch (TikaConfigException e) { + logger.debug("Tesseract is not correctly set up so we won't run OCR. Error is: {}", e.getMessage()); + logger.debug("Fullstack trace error for Tesseract", e); ocrActivated = false; pdfParser.setOcrStrategy("no_ocr"); } - } catch (TikaConfigException e) { - logger.debug("Tesseract is not correctly set up so we won't run OCR. Error is: {}", e.getMessage()); - logger.debug("Fullstack trace error for Tesseract", e); - ocrActivated = false; - pdfParser.setOcrStrategy("no_ocr"); } - } - if (ocrActivated) { - logger.info("OCR is enabled. This might slowdown the process."); - // We are excluding the pdf parser as we built one that we want to use instead. - defaultParser = new DefaultParser( - MediaTypeRegistry.getDefaultRegistry(), - new ServiceLoader(), - Collections.singletonList(PDFParser.class)); - } else { - logger.info("OCR is disabled."); - TesseractOCRConfig config = context.get(TesseractOCRConfig.class); - if (config != null) { - config.setSkipOcr(true); + if (ocrActivated) { + logger.info("OCR is enabled. This might slowdown the process."); + // We are excluding the pdf parser as we built one that we want to use instead. + defaultParser = new DefaultParser( + MediaTypeRegistry.getDefaultRegistry(), + new ServiceLoader(), + Collections.singletonList(PDFParser.class)); + } else { + logger.info("OCR is disabled."); + TesseractOCRConfig config = context.get(TesseractOCRConfig.class); + if (config != null) { + config.setSkipOcr(true); + } + // We are excluding the pdf parser as we built one that we want to use instead + // and the OCR Parser as it's explicitly disabled. + defaultParser = new DefaultParser( + MediaTypeRegistry.getDefaultRegistry(), + new ServiceLoader(), + Arrays.asList(PDFParser.class, TesseractOCRParser.class)); } - // We are excluding the pdf parser as we built one that we want to use instead - // and the OCR Parser as it's explicitly disabled. - defaultParser = new DefaultParser( - MediaTypeRegistry.getDefaultRegistry(), - new ServiceLoader(), - Arrays.asList(PDFParser.class, TesseractOCRParser.class)); + parser = new AutoDetectParser(defaultParser, pdfParser); } - parser = new AutoDetectParser(defaultParser, pdfParser); } } From 88baf3a3248509d2439598d09e28a95292883a3c Mon Sep 17 00:00:00 2001 From: iadcode Date: Mon, 7 Feb 2022 16:25:40 +1100 Subject: [PATCH 2/3] Add Tika Config Path Tests and Documentation --- docs/source/admin/fs/local-fs.rst | 36 ++++++++ .../FsCrawlerTestTikaConfigPathIT.java | 85 +++++++++++++++++++ .../test_tika_config_bad_path/test.html | 15 ++++ .../test_tika_config_bad_path/test.xhtml | 17 ++++ .../samples/test_tika_config_path/test.html | 15 ++++ .../samples/test_tika_config_path/test.xhtml | 17 ++++ .../elasticsearch/crawler/fs/settings/Fs.java | 6 +- .../fs/settings/FsSettingsParserTest.java | 1 + .../src/main/resources/documents/test.xhtml | 17 ++++ .../main/resources/documents/tikaConfig.xml | 15 ++++ .../crawler/fs/tika/TikaDocParserTest.java | 38 ++++++++- 11 files changed, 256 insertions(+), 6 deletions(-) create mode 100644 integration-tests/it-common/src/main/java/fr/pilato/elasticsearch/crawler/fs/test/integration/elasticsearch/FsCrawlerTestTikaConfigPathIT.java create mode 100644 integration-tests/it-common/src/main/resources-binary/samples/test_tika_config_bad_path/test.html create mode 100644 integration-tests/it-common/src/main/resources-binary/samples/test_tika_config_bad_path/test.xhtml create mode 100644 integration-tests/it-common/src/main/resources-binary/samples/test_tika_config_path/test.html create mode 100644 integration-tests/it-common/src/main/resources-binary/samples/test_tika_config_path/test.xhtml create mode 100644 test-documents/src/main/resources/documents/test.xhtml create mode 100644 test-documents/src/main/resources/documents/tikaConfig.xml diff --git a/docs/source/admin/fs/local-fs.rst b/docs/source/admin/fs/local-fs.rst index 49b0701bc..5652df43a 100644 --- a/docs/source/admin/fs/local-fs.rst +++ b/docs/source/admin/fs/local-fs.rst @@ -56,6 +56,8 @@ Here is a list of Local FS settings (under ``fs.`` prefix)`: +----------------------------+-----------------------+---------------------------------+ | ``fs.follow_symlinks`` | ``false`` | `Follow Symlinks`_ | +----------------------------+-----------------------+---------------------------------+ +| ``fs.tika_config_path`` | ``null`` | `Tika Config Path`_ | ++----------------------------+-----------------------+---------------------------------+ .. _root-directory: @@ -762,3 +764,37 @@ If you want FSCrawler to follow the symbolic links, you need to be explicit abou name: "test" fs: follow_symlink: true + +Tika Config Path +^^^^^^^^^^^^^^^^ + +.. versionadded:: 2.10 + +If you want to override the default tika parser configuration, you can set the path to a custom tika +configuration file, which will be used instead. + +.. code:: yaml + + name: "test" + fs: + tika_config_path: '/path/to/tikaConfig.xml' + +An example tika config file is shown below. See `apache documentation `__ for more information. + +.. code:: xml + + + + + + + + + + + + + application/xhtml+xml + + + diff --git a/integration-tests/it-common/src/main/java/fr/pilato/elasticsearch/crawler/fs/test/integration/elasticsearch/FsCrawlerTestTikaConfigPathIT.java b/integration-tests/it-common/src/main/java/fr/pilato/elasticsearch/crawler/fs/test/integration/elasticsearch/FsCrawlerTestTikaConfigPathIT.java new file mode 100644 index 000000000..03a1edac7 --- /dev/null +++ b/integration-tests/it-common/src/main/java/fr/pilato/elasticsearch/crawler/fs/test/integration/elasticsearch/FsCrawlerTestTikaConfigPathIT.java @@ -0,0 +1,85 @@ +/* + * Licensed to David Pilato (the "Author") under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. Author licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package fr.pilato.elasticsearch.crawler.fs.test.integration.elasticsearch; + +import fr.pilato.elasticsearch.crawler.fs.client.ESMatchQuery; +import fr.pilato.elasticsearch.crawler.fs.client.ESSearchRequest; +import fr.pilato.elasticsearch.crawler.fs.settings.Fs; +import fr.pilato.elasticsearch.crawler.fs.test.integration.AbstractFsCrawlerITCase; +import fr.pilato.elasticsearch.crawler.fs.tika.TikaInstance; +import java.io.File; +import java.io.InputStream; +import org.apache.commons.io.FileUtils; +import org.junit.Test; + +/** + * Test tika config path crawler setting + */ +public class FsCrawlerTestTikaConfigPathIT extends AbstractFsCrawlerITCase { + + @Test + public void test_tika_config_path() throws Exception { + TikaInstance.reloadTika(); + InputStream tikaConfigIS = getClass().getResourceAsStream("/documents/tikaConfig.xml"); + File tikaConfigFile = File.createTempFile("tikaConfigTestFile", ".xml"); + FileUtils.copyInputStreamToFile(tikaConfigIS, tikaConfigFile); + + Fs fs = startCrawlerDefinition() + .setTikaConfigPath(tikaConfigFile.getPath()) + .build(); + startCrawler(getCrawlerName(), fs, endCrawlerDefinition(getCrawlerName()), null); + + countTestHelper(new ESSearchRequest().withIndex(getCrawlerName()), 2L, null); + countTestHelper(new ESSearchRequest() + .withIndex(getCrawlerName()) + .withESQuery(new ESMatchQuery("content", "Tika")), 2L, null); + // HTML parsed as TXT will contain all tags in content + // XHTML parsed as XML will remove tags from content + countTestHelper(new ESSearchRequest() + .withIndex(getCrawlerName()) + .withESQuery(new ESMatchQuery("content", "div")), 1L, null); + countTestHelper(new ESSearchRequest() + .withIndex(getCrawlerName()) + .withESQuery(new ESMatchQuery("meta.title", "Test Tika title")), 0L, null); + + tikaConfigFile.delete(); + } + + @Test + public void test_tika_config_bad_path() throws Exception { + TikaInstance.reloadTika(); + Fs fs = startCrawlerDefinition() + .setTikaConfigPath("/bad/path.xml") + .build(); + startCrawler(getCrawlerName(), fs, endCrawlerDefinition(getCrawlerName()), null); + + countTestHelper(new ESSearchRequest().withIndex(getCrawlerName()), 2L, null); + countTestHelper(new ESSearchRequest() + .withIndex(getCrawlerName()) + .withESQuery(new ESMatchQuery("content", "example")), 2L, null); + // Both XHTML and HTML parsed as HTML will put information in meta, not in content + countTestHelper(new ESSearchRequest() + .withIndex(getCrawlerName()) + .withESQuery(new ESMatchQuery("content", "Tika")), 0L, null); + countTestHelper(new ESSearchRequest() + .withIndex(getCrawlerName()) + .withESQuery(new ESMatchQuery("meta.title", "Test Tika title")), 2L, null); + } +} diff --git a/integration-tests/it-common/src/main/resources-binary/samples/test_tika_config_bad_path/test.html b/integration-tests/it-common/src/main/resources-binary/samples/test_tika_config_bad_path/test.html new file mode 100644 index 000000000..14ae83f91 --- /dev/null +++ b/integration-tests/it-common/src/main/resources-binary/samples/test_tika_config_bad_path/test.html @@ -0,0 +1,15 @@ + + + + Test Tika title + + + +

This is an example of HTML

+
+

This is a separate line

+ + + diff --git a/integration-tests/it-common/src/main/resources-binary/samples/test_tika_config_bad_path/test.xhtml b/integration-tests/it-common/src/main/resources-binary/samples/test_tika_config_bad_path/test.xhtml new file mode 100644 index 000000000..8ab5fe1ba --- /dev/null +++ b/integration-tests/it-common/src/main/resources-binary/samples/test_tika_config_bad_path/test.xhtml @@ -0,0 +1,17 @@ + + + + + + Test Tika title + + + +

This is an example of XHTML

+
+

This is a separate line

+ + + diff --git a/integration-tests/it-common/src/main/resources-binary/samples/test_tika_config_path/test.html b/integration-tests/it-common/src/main/resources-binary/samples/test_tika_config_path/test.html new file mode 100644 index 000000000..14ae83f91 --- /dev/null +++ b/integration-tests/it-common/src/main/resources-binary/samples/test_tika_config_path/test.html @@ -0,0 +1,15 @@ + + + + Test Tika title + + + +

This is an example of HTML

+
+

This is a separate line

+ + + diff --git a/integration-tests/it-common/src/main/resources-binary/samples/test_tika_config_path/test.xhtml b/integration-tests/it-common/src/main/resources-binary/samples/test_tika_config_path/test.xhtml new file mode 100644 index 000000000..8ab5fe1ba --- /dev/null +++ b/integration-tests/it-common/src/main/resources-binary/samples/test_tika_config_path/test.xhtml @@ -0,0 +1,17 @@ + + + + + + Test Tika title + + + +

This is an example of XHTML

+
+

This is a separate line

+ + + diff --git a/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java b/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java index fdfd5bfaf..23dfa0b7c 100644 --- a/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java +++ b/settings/src/main/java/fr/pilato/elasticsearch/crawler/fs/settings/Fs.java @@ -249,8 +249,8 @@ public Builder setFollowSymlinks(boolean followSymlinks) { } public Builder setTikaConfigPath(String tikaConfigPath) { - this.tikaConfigPath = tikaConfigPath; - return this; + this.tikaConfigPath = tikaConfigPath; + return this; } public Fs build() { @@ -567,7 +567,7 @@ public String toString() { ", ocr=" + ocr + ", ignoreAbove=" + ignoreAbove + ", followSymlinks=" + followSymlinks + - ", tikaConfigPath=" + tikaConfigPath + + ", tikaConfigPath='" + tikaConfigPath + '\'' + '}'; } } diff --git a/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsSettingsParserTest.java b/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsSettingsParserTest.java index 95d466d17..b2c3337b4 100644 --- a/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsSettingsParserTest.java +++ b/settings/src/test/java/fr/pilato/elasticsearch/crawler/fs/settings/FsSettingsParserTest.java @@ -142,6 +142,7 @@ private void defaultSettingsTester(FsSettings settings) { assertThat(settings.getFs().getIncludes(), nullValue()); assertThat(settings.getFs().getExcludes(), contains("*/~*")); assertThat(settings.getFs().getIndexedChars(), nullValue()); + assertThat(settings.getFs().getTikaConfigPath(), nullValue()); assertThat(settings.getFs().getUpdateRate(), is(TimeValue.timeValueMinutes(15))); assertThat(settings.getFs().getUrl(), is("/tmp/es")); assertThat(settings.getFs().isAddFilesize(), is(true)); diff --git a/test-documents/src/main/resources/documents/test.xhtml b/test-documents/src/main/resources/documents/test.xhtml new file mode 100644 index 000000000..be440f4e5 --- /dev/null +++ b/test-documents/src/main/resources/documents/test.xhtml @@ -0,0 +1,17 @@ + + + + + + Test Tika title + + + +

This is an example of XHTML

+
+

This is a separate line

+ + + diff --git a/test-documents/src/main/resources/documents/tikaConfig.xml b/test-documents/src/main/resources/documents/tikaConfig.xml new file mode 100644 index 000000000..f88b306f4 --- /dev/null +++ b/test-documents/src/main/resources/documents/tikaConfig.xml @@ -0,0 +1,15 @@ + + + + + + + + + + + + application/xhtml+xml + + + \ No newline at end of file diff --git a/tika/src/test/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaDocParserTest.java b/tika/src/test/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaDocParserTest.java index 2cb713f38..b13730a37 100644 --- a/tika/src/test/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaDocParserTest.java +++ b/tika/src/test/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaDocParserTest.java @@ -23,10 +23,10 @@ import fr.pilato.elasticsearch.crawler.fs.settings.Fs; import fr.pilato.elasticsearch.crawler.fs.settings.FsSettings; import fr.pilato.elasticsearch.crawler.fs.settings.Ocr; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; +import java.io.File; +import java.net.URISyntaxException; +import org.apache.commons.io.FileUtils; import org.apache.tika.exception.TikaConfigException; -import org.apache.tika.parser.external.ExternalParser; import org.apache.tika.parser.ocr.TesseractOCRParser; import org.junit.BeforeClass; import org.junit.Test; @@ -667,6 +667,38 @@ public void testOcr() throws IOException, TikaConfigException { } } + @Test + public void testCustomTikaConfig() throws IOException, URISyntaxException { + InputStream tikaConfigIS = getClass().getResourceAsStream("/documents/tikaConfig.xml"); + File tikaConfigFile = File.createTempFile("tikaConfigTestFile", ".xml"); + FileUtils.copyInputStreamToFile(tikaConfigIS, tikaConfigFile); + + FsSettings fsSettings = FsSettings.builder(getCurrentTestName()) + .setFs(Fs.builder().setTikaConfigPath(tikaConfigFile.getPath()).build()) + .build(); + + // Test that default parser for HTML is HTML parser + Doc doc = extractFromFile("test.html"); + assertThat(doc.getContent(), not(containsString("Test Tika title"))); + assertThat(doc.getContent(), containsString("This second part of the text is in Page 2")); + + // Test HTML parser is never used, TXT parser used instead + doc = extractFromFile("test.html", fsSettings); + assertThat(doc.getContent(), containsString("Test Tika title")); + + // Test that default parser for XHTML is HTML parser + doc = extractFromFile("test.xhtml"); + assertThat(doc.getContent(), not(containsString("Test Tika title"))); + assertThat(doc.getContent(), containsString("This is an example of XHTML")); + + // Test XML parser is used to parse XHTML + doc = extractFromFile("test.xhtml", fsSettings); + assertThat(doc.getContent(), containsString("Test Tika title")); + assertThat(doc.getContent(), not(containsString("Test Tika title"))); + + tikaConfigFile.delete(); + } + @Test public void testShiftJisEncoding() throws IOException { Doc doc = extractFromFile("issue-400-shiftjis.txt"); From 197b09bd03ad6fe8bd19e23bb6ec6973bf0fe0e8 Mon Sep 17 00:00:00 2001 From: iadcode Date: Fri, 25 Feb 2022 10:17:38 +1100 Subject: [PATCH 3/3] Review Changes - Changed link to dynamic for Tika Configuration apache documentation - Moved tika configuration file(s) - Added early fail for tika config file not found - Updated exception handling --- docs/source/admin/fs/local-fs.rst | 4 +-- docs/source/conf.py | 2 ++ .../FsCrawlerTestTikaConfigPathIT.java | 34 ++----------------- .../test_tika_config_bad_path/test.html | 15 -------- .../test_tika_config_bad_path/test.xhtml | 17 ---------- .../config}/tikaConfig.xml | 0 .../src/main/resources/config/tikaConfig.xml | 15 ++++++++ .../crawler/fs/tika/TikaInstance.java | 10 ++++-- .../crawler/fs/tika/TikaDocParserTest.java | 17 +++++----- 9 files changed, 37 insertions(+), 77 deletions(-) delete mode 100644 integration-tests/it-common/src/main/resources-binary/samples/test_tika_config_bad_path/test.html delete mode 100644 integration-tests/it-common/src/main/resources-binary/samples/test_tika_config_bad_path/test.xhtml rename {test-documents/src/main/resources/documents => integration-tests/it-common/src/main/resources-binary/samples/test_tika_config_path/config}/tikaConfig.xml (100%) create mode 100644 test-documents/src/main/resources/config/tikaConfig.xml diff --git a/docs/source/admin/fs/local-fs.rst b/docs/source/admin/fs/local-fs.rst index 5652df43a..b5c866ae9 100644 --- a/docs/source/admin/fs/local-fs.rst +++ b/docs/source/admin/fs/local-fs.rst @@ -739,7 +739,7 @@ such as ``MD5`` or ``SHA-1``. .. note:: You MUST set ``index_content`` to true to allow this feature to work. Nevertheless you MAY set ``indexed_chars`` to 0 if you do not need any content in the index. - + You MUST NOT set ``json_support`` or ``xml_support`` to allow this feature to work also. .. code:: yaml @@ -779,7 +779,7 @@ configuration file, which will be used instead. fs: tika_config_path: '/path/to/tikaConfig.xml' -An example tika config file is shown below. See `apache documentation `__ for more information. +An example tika config file is shown below. See |Tika_configuring|_ for more information. .. code:: xml diff --git a/docs/source/conf.py b/docs/source/conf.py index de9726af0..667b4c209 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -229,6 +229,7 @@ def read_version(full_version=True): .. |ES| replace:: Elasticsearch .. |Tika_format| replace:: Tika .. |Tika_version| replace:: Tika {fmt_tika_version} +.. |Tika_configuring| replace:: Configuring Tika .. |ESHL_version6| replace:: Elasticsearch Rest Client {fmt_es_version6} .. |ESHL_version7| replace:: Elasticsearch Rest Client {fmt_es_version7} .. |Tiff_version| replace:: jai-imageio-core:{fmt_tiff_version} @@ -244,6 +245,7 @@ def read_version(full_version=True): .. _ES: https://www.elastic.co/products/elasticsearch .. _Tika_format: https://tika.apache.org/{fmt_tika_version}/formats.html#Supported_Document_Formats .. _Tika_version: https://tika.apache.org/{fmt_tika_version}/ +.. _Tika_configuring: https://tika.apache.org/{fmt_tika_version}/configuring.html .. _ESHL_version6: https://www.elastic.co/guide/en/elasticsearch/client/java-rest/current/index.html .. _ESHL_version7: https://www.elastic.co/guide/en/elasticsearch/client/java-rest/current/index.html .. _Tiff_version: https://repo1.maven.org/maven2/com/github/jai-imageio/jai-imageio-core/{fmt_tiff_version}/ diff --git a/integration-tests/it-common/src/main/java/fr/pilato/elasticsearch/crawler/fs/test/integration/elasticsearch/FsCrawlerTestTikaConfigPathIT.java b/integration-tests/it-common/src/main/java/fr/pilato/elasticsearch/crawler/fs/test/integration/elasticsearch/FsCrawlerTestTikaConfigPathIT.java index 03a1edac7..8599108a9 100644 --- a/integration-tests/it-common/src/main/java/fr/pilato/elasticsearch/crawler/fs/test/integration/elasticsearch/FsCrawlerTestTikaConfigPathIT.java +++ b/integration-tests/it-common/src/main/java/fr/pilato/elasticsearch/crawler/fs/test/integration/elasticsearch/FsCrawlerTestTikaConfigPathIT.java @@ -23,10 +23,6 @@ import fr.pilato.elasticsearch.crawler.fs.client.ESSearchRequest; import fr.pilato.elasticsearch.crawler.fs.settings.Fs; import fr.pilato.elasticsearch.crawler.fs.test.integration.AbstractFsCrawlerITCase; -import fr.pilato.elasticsearch.crawler.fs.tika.TikaInstance; -import java.io.File; -import java.io.InputStream; -import org.apache.commons.io.FileUtils; import org.junit.Test; /** @@ -36,13 +32,9 @@ public class FsCrawlerTestTikaConfigPathIT extends AbstractFsCrawlerITCase { @Test public void test_tika_config_path() throws Exception { - TikaInstance.reloadTika(); - InputStream tikaConfigIS = getClass().getResourceAsStream("/documents/tikaConfig.xml"); - File tikaConfigFile = File.createTempFile("tikaConfigTestFile", ".xml"); - FileUtils.copyInputStreamToFile(tikaConfigIS, tikaConfigFile); - Fs fs = startCrawlerDefinition() - .setTikaConfigPath(tikaConfigFile.getPath()) + .setTikaConfigPath(currentTestResourceDir.resolve("config/tikaConfig.xml").toString()) + .addExclude("/config/*") .build(); startCrawler(getCrawlerName(), fs, endCrawlerDefinition(getCrawlerName()), null); @@ -58,28 +50,6 @@ public void test_tika_config_path() throws Exception { countTestHelper(new ESSearchRequest() .withIndex(getCrawlerName()) .withESQuery(new ESMatchQuery("meta.title", "Test Tika title")), 0L, null); - - tikaConfigFile.delete(); } - @Test - public void test_tika_config_bad_path() throws Exception { - TikaInstance.reloadTika(); - Fs fs = startCrawlerDefinition() - .setTikaConfigPath("/bad/path.xml") - .build(); - startCrawler(getCrawlerName(), fs, endCrawlerDefinition(getCrawlerName()), null); - - countTestHelper(new ESSearchRequest().withIndex(getCrawlerName()), 2L, null); - countTestHelper(new ESSearchRequest() - .withIndex(getCrawlerName()) - .withESQuery(new ESMatchQuery("content", "example")), 2L, null); - // Both XHTML and HTML parsed as HTML will put information in meta, not in content - countTestHelper(new ESSearchRequest() - .withIndex(getCrawlerName()) - .withESQuery(new ESMatchQuery("content", "Tika")), 0L, null); - countTestHelper(new ESSearchRequest() - .withIndex(getCrawlerName()) - .withESQuery(new ESMatchQuery("meta.title", "Test Tika title")), 2L, null); - } } diff --git a/integration-tests/it-common/src/main/resources-binary/samples/test_tika_config_bad_path/test.html b/integration-tests/it-common/src/main/resources-binary/samples/test_tika_config_bad_path/test.html deleted file mode 100644 index 14ae83f91..000000000 --- a/integration-tests/it-common/src/main/resources-binary/samples/test_tika_config_bad_path/test.html +++ /dev/null @@ -1,15 +0,0 @@ - - - - Test Tika title - - - -

This is an example of HTML

-
-

This is a separate line

- - - diff --git a/integration-tests/it-common/src/main/resources-binary/samples/test_tika_config_bad_path/test.xhtml b/integration-tests/it-common/src/main/resources-binary/samples/test_tika_config_bad_path/test.xhtml deleted file mode 100644 index 8ab5fe1ba..000000000 --- a/integration-tests/it-common/src/main/resources-binary/samples/test_tika_config_bad_path/test.xhtml +++ /dev/null @@ -1,17 +0,0 @@ - - - - - - Test Tika title - - - -

This is an example of XHTML

-
-

This is a separate line

- - - diff --git a/test-documents/src/main/resources/documents/tikaConfig.xml b/integration-tests/it-common/src/main/resources-binary/samples/test_tika_config_path/config/tikaConfig.xml similarity index 100% rename from test-documents/src/main/resources/documents/tikaConfig.xml rename to integration-tests/it-common/src/main/resources-binary/samples/test_tika_config_path/config/tikaConfig.xml diff --git a/test-documents/src/main/resources/config/tikaConfig.xml b/test-documents/src/main/resources/config/tikaConfig.xml new file mode 100644 index 000000000..f88b306f4 --- /dev/null +++ b/test-documents/src/main/resources/config/tikaConfig.xml @@ -0,0 +1,15 @@ + + + + + + + + + + + + application/xhtml+xml + + + \ No newline at end of file diff --git a/tika/src/main/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaInstance.java b/tika/src/main/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaInstance.java index 159fcb585..505bce49e 100644 --- a/tika/src/main/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaInstance.java +++ b/tika/src/main/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaInstance.java @@ -83,13 +83,17 @@ private static void initTika(Fs fs) { private static void initParser(Fs fs) { if (parser == null) { - if (fs.getTikaConfigPath() != null && (new File(fs.getTikaConfigPath())).exists()) { - logger.info("Using custom tika configuration."); + if (fs.getTikaConfigPath() != null) { + if (!(new File(fs.getTikaConfigPath())).exists()) { + throw new RuntimeException("Tika configuration file " + fs.getTikaConfigPath() + " not found!"); + } + logger.info("Using custom tika configuration from [{}].", fs.getTikaConfigPath()); TikaConfig config = null; try { config = new TikaConfig(fs.getTikaConfigPath()); } catch (TikaException|IOException|SAXException e) { - e.printStackTrace(); + logger.error("Can not configure Tika: {}", e.getMessage()); + logger.debug("Fullstack trace error for Tika", e); } parser = new AutoDetectParser(config); diff --git a/tika/src/test/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaDocParserTest.java b/tika/src/test/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaDocParserTest.java index b13730a37..fc10f7ca2 100644 --- a/tika/src/test/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaDocParserTest.java +++ b/tika/src/test/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaDocParserTest.java @@ -23,9 +23,9 @@ import fr.pilato.elasticsearch.crawler.fs.settings.Fs; import fr.pilato.elasticsearch.crawler.fs.settings.FsSettings; import fr.pilato.elasticsearch.crawler.fs.settings.Ocr; -import java.io.File; import java.net.URISyntaxException; -import org.apache.commons.io.FileUtils; +import java.nio.file.Files; +import java.nio.file.Path; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.parser.ocr.TesseractOCRParser; import org.junit.BeforeClass; @@ -669,12 +669,15 @@ public void testOcr() throws IOException, TikaConfigException { @Test public void testCustomTikaConfig() throws IOException, URISyntaxException { - InputStream tikaConfigIS = getClass().getResourceAsStream("/documents/tikaConfig.xml"); - File tikaConfigFile = File.createTempFile("tikaConfigTestFile", ".xml"); - FileUtils.copyInputStreamToFile(tikaConfigIS, tikaConfigFile); + InputStream tikaConfigIS = getClass().getResourceAsStream("/config/tikaConfig.xml"); + Path testTikaConfig = rootTmpDir.resolve("tika-config"); + if (Files.notExists(testTikaConfig)) { + Files.createDirectory(testTikaConfig); + } + Files.copy(tikaConfigIS, testTikaConfig.resolve("tikaConfig.xml")); FsSettings fsSettings = FsSettings.builder(getCurrentTestName()) - .setFs(Fs.builder().setTikaConfigPath(tikaConfigFile.getPath()).build()) + .setFs(Fs.builder().setTikaConfigPath(testTikaConfig.resolve("tikaConfig.xml").toString()).build()) .build(); // Test that default parser for HTML is HTML parser @@ -695,8 +698,6 @@ public void testCustomTikaConfig() throws IOException, URISyntaxException { doc = extractFromFile("test.xhtml", fsSettings); assertThat(doc.getContent(), containsString("Test Tika title")); assertThat(doc.getContent(), not(containsString("Test Tika title"))); - - tikaConfigFile.delete(); } @Test