Skip to content

Commit

Permalink
Merge pull request #1367 from iadcode/master
Browse files Browse the repository at this point in the history
Add option to set path to custom tika config file
  • Loading branch information
mergify[bot] authored Mar 17, 2022
2 parents d906840 + a1de7bd commit c16b4c1
Show file tree
Hide file tree
Showing 12 changed files with 300 additions and 56 deletions.
38 changes: 37 additions & 1 deletion docs/source/admin/fs/local-fs.rst
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ Here is a list of Local FS settings (under ``fs.`` prefix):
+----------------------------+-----------------------+---------------------------------+
| ``fs.follow_symlinks`` | ``false`` | `Follow Symlinks`_ |
+----------------------------+-----------------------+---------------------------------+
| ``fs.tika_config_path`` | ``null`` | `Tika Config Path`_ |
+----------------------------+-----------------------+---------------------------------+

.. _root-directory:

Expand Down Expand Up @@ -742,7 +744,7 @@ such as ``MD5`` or ``SHA-1``.
.. note::

You MUST set ``index_content`` to true to allow this feature to work. Nevertheless you MAY set ``indexed_chars`` to 0 if you do not need any content in the index.

You also MUST NOT set ``json_support`` or ``xml_support`` if you want this feature to work.

.. code:: yaml
Expand All @@ -767,3 +769,37 @@ If you want FSCrawler to follow the symbolic links, you need to be explicit abou
name: "test"
fs:
follow_symlinks: true
Tika Config Path
^^^^^^^^^^^^^^^^

.. versionadded:: 2.10

If you want to override the default Tika parser configuration, you can set the path to a custom Tika
configuration file, which will be used instead.

.. code:: yaml
name: "test"
fs:
tika_config_path: '/path/to/tikaConfig.xml'
An example Tika configuration file is shown below. See |Tika_configuring|_ for more information.

.. code:: xml
<?xml version="1.0" encoding="UTF-8"?>
<properties>
<service-loader dynamic="true"/>
<service-loader loadErrorHandler="IGNORE"/>
<parsers>
<!-- Use Default Parser for files, but Default Parser will never use HTML parser -->
<parser class="org.apache.tika.parser.DefaultParser">
<parser-exclude class="org.apache.tika.parser.html.HtmlParser"/>
</parser>
<!-- Use a different parser for XHTML -->
<parser class="org.apache.tika.parser.xml.XMLParser">
<mime>application/xhtml+xml</mime>
</parser>
</parsers>
</properties>
2 changes: 2 additions & 0 deletions docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,7 @@ def read_version(full_version=True):
.. |ES| replace:: Elasticsearch
.. |Tika_format| replace:: Tika
.. |Tika_version| replace:: Tika {fmt_tika_version}
.. |Tika_configuring| replace:: Configuring Tika
.. |ES_version6| replace:: Elasticsearch {fmt_es_version6}
.. |ES_version7| replace:: Elasticsearch {fmt_es_version7}
.. |Tiff_version| replace:: jai-imageio-core:{fmt_tiff_version}
Expand All @@ -239,6 +240,7 @@ def read_version(full_version=True):
.. _ES: https://www.elastic.co/products/elasticsearch
.. _Tika_format: https://tika.apache.org/{fmt_tika_version}/formats.html#Supported_Document_Formats
.. _Tika_version: https://tika.apache.org/{fmt_tika_version}/
.. _Tika_configuring: https://tika.apache.org/{fmt_tika_version}/configuring.html
.. _ES_version6: https://www.elastic.co/guide/en/elasticsearch/reference/current/index.html
.. _ES_version7: https://www.elastic.co/guide/en/elasticsearch/reference/current/index.html
.. _Tiff_version: https://repo1.maven.org/maven2/com/github/jai-imageio/jai-imageio-core/{fmt_tiff_version}/
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
/*
* Licensed to David Pilato (the "Author") under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Author licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package fr.pilato.elasticsearch.crawler.fs.test.integration.elasticsearch;

import fr.pilato.elasticsearch.crawler.fs.client.ESMatchQuery;
import fr.pilato.elasticsearch.crawler.fs.client.ESSearchRequest;
import fr.pilato.elasticsearch.crawler.fs.settings.Fs;
import fr.pilato.elasticsearch.crawler.fs.test.integration.AbstractFsCrawlerITCase;
import org.junit.Test;

/**
 * Integration test for the {@code tika_config_path} crawler setting.
 * <p>
 * The crawler is started with the Tika configuration file located at
 * {@code config/tikaConfig.xml} under the test resource directory, and the
 * index named after the crawler is then queried.
 */
public class FsCrawlerTestTikaConfigPathIT extends AbstractFsCrawlerITCase {

    /**
     * Starts a crawler that uses the custom Tika configuration and runs the
     * count checks against the crawler's index.
     *
     * @throws Exception if any of the helper calls fails
     */
    @Test
    public void test_tika_config_path() throws Exception {
        // Path of the custom Tika configuration shipped with the test resources
        String customTikaConfig = currentTestResourceDir.resolve("config/tikaConfig.xml").toString();

        // The "/config/*" exclusion keeps the configuration file itself out of the crawl
        Fs fs = startCrawlerDefinition()
                .setTikaConfigPath(customTikaConfig)
                .addExclude("/config/*")
                .build();
        startCrawler(getCrawlerName(), fs, endCrawlerDefinition(getCrawlerName()), null);

        // Unfiltered request on the crawler index, checked against a count of 2
        countTestHelper(new ESSearchRequest().withIndex(getCrawlerName()), 2L, null);

        assertMatchCount("content", "Tika", 2L);

        // HTML parsed as TXT will contain all tags in content
        // XHTML parsed as XML will remove tags from content
        assertMatchCount("content", "div", 1L);

        assertMatchCount("meta.title", "Test Tika title", 0L);
    }

    /**
     * Builds a match query on the crawler index and passes it to
     * {@code countTestHelper} together with the given count.
     *
     * @param field        name of the field to match on
     * @param text         text to match
     * @param expectedHits count handed over to {@code countTestHelper}
     * @throws Exception if {@code countTestHelper} fails
     */
    private void assertMatchCount(String field, String text, long expectedHits) throws Exception {
        countTestHelper(new ESSearchRequest()
                .withIndex(getCrawlerName())
                .withESQuery(new ESMatchQuery(field, text)), expectedHits, null);
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
<?xml version="1.0" encoding="UTF-8"?>
<properties>
<service-loader dynamic="true"/>
<service-loader loadErrorHandler="IGNORE"/>
<parsers>
<!-- Default Parser, but never use HTML parser -->
<parser class="org.apache.tika.parser.DefaultParser">
<parser-exclude class="org.apache.tika.parser.html.HtmlParser"/>
</parser>
<!-- Use a different parser for XHTML -->
<parser class="org.apache.tika.parser.xml.XMLParser">
<mime>application/xhtml+xml</mime>
</parser>
</parsers>
</properties>
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
<html
xmlns:o="urn:schemas-microsoft-com:office:office"
xmlns="http://www.w3.org/TR/REC-html40">

<head>
<title>Test Tika title</title>
</head>

<body>
<p>This is an example of HTML</p>
<div />
<p>This is a separate line</p>
</body>

</html>
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html>
<html
xmlns:o="urn:schemas-microsoft-com:office:office"
xmlns="http://www.w3.org/TR/REC-html40">

<head>
<title>Test Tika title</title>
</head>

<body>
<p>This is an example of XHTML</p>
<div />
<p>This is a separate line</p>
</body>

</html>
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ public class Fs {
private Ocr ocr = new Ocr();
private ByteSizeValue ignoreAbove = null;
private boolean followSymlinks = false;
private String tikaConfigPath = null;

public static Builder builder() {
return new Builder();
Expand Down Expand Up @@ -91,6 +92,7 @@ public static class Builder {
private Ocr ocr = new Ocr();
private ByteSizeValue ignoreAbove = null;
private boolean followSymlinks = false;
private String tikaConfigPath = null;

public Builder setUrl(String url) {
this.url = url;
Expand Down Expand Up @@ -246,10 +248,16 @@ public Builder setFollowSymlinks(boolean followSymlinks) {
return this;
}

/**
 * Records the path of the custom Tika configuration file on this builder.
 *
 * @param tikaConfigPath path to the Tika configuration file, stored as given
 * @return this builder, so that calls can be chained
 */
public Builder setTikaConfigPath(final String tikaConfigPath) {
    this.tikaConfigPath = tikaConfigPath;
    return this;
}

public Fs build() {
return new Fs(url, updateRate, includes, excludes, filters, jsonSupport, filenameAsId, addFilesize,
removeDeleted, addAsInnerObject, storeSource, indexedChars, indexContent, attributesSupport, rawMetadata,
checksum, xmlSupport, indexFolders, langDetect, continueOnError, ocr, ignoreAbove, followSymlinks);
checksum, xmlSupport, indexFolders, langDetect, continueOnError, ocr, ignoreAbove, followSymlinks,
tikaConfigPath);
}
}

Expand All @@ -260,7 +268,8 @@ public Fs( ) {
private Fs(String url, TimeValue updateRate, List<String> includes, List<String> excludes, List<String> filters, boolean jsonSupport,
boolean filenameAsId, boolean addFilesize, boolean removeDeleted, boolean addAsInnerObject, boolean storeSource,
Percentage indexedChars, boolean indexContent, boolean attributesSupport, boolean rawMetadata, String checksum, boolean xmlSupport,
boolean indexFolders, boolean langDetect, boolean continueOnError, Ocr ocr, ByteSizeValue ignoreAbove, boolean followSymlinks) {
boolean indexFolders, boolean langDetect, boolean continueOnError, Ocr ocr, ByteSizeValue ignoreAbove, boolean followSymlinks,
String tikaConfigPath) {
this.url = url;
this.updateRate = updateRate;
this.includes = includes;
Expand All @@ -284,6 +293,7 @@ private Fs(String url, TimeValue updateRate, List<String> includes, List<String>
this.ocr = ocr;
this.ignoreAbove = ignoreAbove;
this.followSymlinks = followSymlinks;
this.tikaConfigPath = tikaConfigPath;
}

public String getUrl() {
Expand Down Expand Up @@ -486,6 +496,14 @@ public void setFollowSymlinks(boolean followSymlinks) {
this.followSymlinks = followSymlinks;
}

/**
 * Returns the path of the custom Tika configuration file.
 *
 * @return the stored path; {@code null} is the field's initial value
 */
public String getTikaConfigPath() {
    return this.tikaConfigPath;
}

/**
 * Stores the path of the custom Tika configuration file.
 *
 * @param tikaConfigPath path to the Tika configuration file, stored as given
 */
public void setTikaConfigPath(final String tikaConfigPath) {
    this.tikaConfigPath = tikaConfigPath;
}

@Override
public boolean equals(Object o) {
if (this == o) return true;
Expand Down Expand Up @@ -513,14 +531,15 @@ public boolean equals(Object o) {
Objects.equals(indexedChars, fs.indexedChars) &&
Objects.equals(checksum, fs.checksum) &&
Objects.equals(ocr, fs.ocr) &&
Objects.equals(ignoreAbove, fs.ignoreAbove);
Objects.equals(ignoreAbove, fs.ignoreAbove) &&
Objects.equals(tikaConfigPath, fs.tikaConfigPath);
}

@Override
public int hashCode() {
return Objects.hash(url, updateRate, includes, excludes, filters, jsonSupport, filenameAsId, addFilesize,
removeDeleted, addAsInnerObject, storeSource, indexContent, indexedChars, attributesSupport, rawMetadata, xmlSupport,
checksum, indexFolders, langDetect, continueOnError, ocr, ignoreAbove, followSymlinks);
checksum, indexFolders, langDetect, continueOnError, ocr, ignoreAbove, followSymlinks, tikaConfigPath);
}

@Override
Expand Down Expand Up @@ -548,6 +567,7 @@ public String toString() {
", ocr=" + ocr +
", ignoreAbove=" + ignoreAbove +
", followSymlinks=" + followSymlinks +
", tikaConfigPath='" + tikaConfigPath + '\'' +
'}';
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ private void defaultSettingsTester(FsSettings settings) {
assertThat(settings.getFs().getIncludes(), nullValue());
assertThat(settings.getFs().getExcludes(), contains("*/~*"));
assertThat(settings.getFs().getIndexedChars(), nullValue());
assertThat(settings.getFs().getTikaConfigPath(), nullValue());
assertThat(settings.getFs().getUpdateRate(), is(TimeValue.timeValueMinutes(15)));
assertThat(settings.getFs().getUrl(), is("/tmp/es"));
assertThat(settings.getFs().isAddFilesize(), is(true));
Expand Down
15 changes: 15 additions & 0 deletions test-documents/src/main/resources/config/tikaConfig.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
<?xml version="1.0" encoding="UTF-8"?>
<properties>
<service-loader dynamic="true"/>
<service-loader loadErrorHandler="IGNORE"/>
<parsers>
<!-- Default Parser, but never use HTML parser -->
<parser class="org.apache.tika.parser.DefaultParser">
<parser-exclude class="org.apache.tika.parser.html.HtmlParser"/>
</parser>
<!-- Use a different parser for XHTML -->
<parser class="org.apache.tika.parser.xml.XMLParser">
<mime>application/xhtml+xml</mime>
</parser>
</parsers>
</properties>
17 changes: 17 additions & 0 deletions test-documents/src/main/resources/documents/test.xhtml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html>
<html
xmlns:o="urn:schemas-microsoft-com:office:office"
xmlns="http://www.w3.org/TR/REC-html40">

<head>
<title>Test Tika title</title>
</head>

<body>
<p>This is an example of XHTML</p>
<div />
<p>This is a separate line</p>
</body>

</html>
Loading

0 comments on commit c16b4c1

Please sign in to comment.