From 2e09f0d016961d8e44a34c14c377ebbf4dc1fdb4 Mon Sep 17 00:00:00 2001 From: tallison Date: Tue, 22 Oct 2024 07:19:19 -0400 Subject: [PATCH] TIKA-4330 -- Add a MetadataListFilter --- .../java/org/apache/tika/cli/TikaCLI.java | 4 +- .../java/org/apache/tika/gui/TikaGUI.java | 14 +++-- .../org/apache/tika/config/TikaConfig.java | 11 ++++ .../CompositeMetadataListFilter.java | 58 +++++++++++++++++++ .../listfilter/MetadataListFilter.java | 52 +++++++++++++++++ .../metadata/listfilter/NoOpListFilter.java | 28 +++++++++ .../org/apache/tika/pipes/PipesServer.java | 39 ++++++++++--- .../AttachmentCountingListFilter.java | 33 +++++++++++ .../apache/tika/pipes/PipesClientTest.java | 17 ++++++ .../test-documents/mock/embedded.xml | 53 +++++++++++++++++ .../resource/RecursiveMetadataResource.java | 6 +- 11 files changed, 300 insertions(+), 15 deletions(-) create mode 100644 tika-core/src/main/java/org/apache/tika/metadata/listfilter/CompositeMetadataListFilter.java create mode 100644 tika-core/src/main/java/org/apache/tika/metadata/listfilter/MetadataListFilter.java create mode 100644 tika-core/src/main/java/org/apache/tika/metadata/listfilter/NoOpListFilter.java create mode 100644 tika-core/src/test/java/org/apache/tika/metadata/listfilter/AttachmentCountingListFilter.java create mode 100644 tika-core/src/test/resources/test-documents/mock/embedded.xml diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java index 4aa5361bae..aa087910a6 100644 --- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java +++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java @@ -499,7 +499,9 @@ private void handleRecursiveJson(URL url, OutputStream output) throws IOExceptio } JsonMetadataList.setPrettyPrinting(prettyPrint); try (Writer writer = getOutputWriter(output, encoding)) { - JsonMetadataList.toJson(handler.getMetadataList(), writer); + List metadataList = handler.getMetadataList(); + metadataList = 
config.getMetadataListFilter().filter(metadataList); + JsonMetadataList.toJson(metadataList, writer); } } diff --git a/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java b/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java index 41ed93232e..d314b472cb 100644 --- a/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java +++ b/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java @@ -38,6 +38,7 @@ import java.nio.file.Files; import java.util.Arrays; import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.Set; import javax.swing.Box; @@ -152,9 +153,11 @@ public class TikaGUI extends JFrame implements ActionListener, HyperlinkListener * File chooser. */ private final JFileChooser chooser = new JFileChooser(); + private final TikaConfig tikaConfig; - public TikaGUI(Parser parser) { + public TikaGUI(Parser parser, TikaConfig tikaConfig) { super("Apache Tika"); + this.tikaConfig = tikaConfig; setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE); addMenuBar(); @@ -198,8 +201,9 @@ public static void main(String[] args) throws Exception { UIManager.setLookAndFeel(UIManager.getSystemLookAndFeelClassName()); final TikaConfig finalConfig = config; SwingUtilities.invokeLater(() -> new TikaGUI( - new DigestingParser(new AutoDetectParser(finalConfig), new CommonsDigester(MAX_MARK, CommonsDigester.DigestAlgorithm.MD5, CommonsDigester.DigestAlgorithm.SHA256), - false)).setVisible(true)); + new DigestingParser(new AutoDetectParser(finalConfig), + new CommonsDigester(MAX_MARK, CommonsDigester.DigestAlgorithm.MD5, CommonsDigester.DigestAlgorithm.SHA256), + false), finalConfig).setVisible(true)); } private void addMenuBar() { @@ -374,7 +378,9 @@ private void handleStream(InputStream input, Metadata md) throws Exception { wrapper.parse(input, recursiveParserWrapperHandler, new Metadata(), new ParseContext()); StringWriter jsonBuffer = new StringWriter(); JsonMetadataList.setPrettyPrinting(true); - 
JsonMetadataList.toJson(recursiveParserWrapperHandler.getMetadataList(), jsonBuffer); + List metadataList = recursiveParserWrapperHandler.getMetadataList(); + metadataList = tikaConfig.getMetadataListFilter().filter(metadataList); + JsonMetadataList.toJson(metadataList, jsonBuffer); setText(json, jsonBuffer.toString()); } layout.show(cards, "metadata"); diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java index e68ad10d65..63c72bfef5 100644 --- a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java +++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java @@ -62,6 +62,8 @@ import org.apache.tika.language.translate.Translator; import org.apache.tika.metadata.filter.MetadataFilter; import org.apache.tika.metadata.filter.NoOpFilter; +import org.apache.tika.metadata.listfilter.MetadataListFilter; +import org.apache.tika.metadata.listfilter.NoOpListFilter; import org.apache.tika.mime.MediaType; import org.apache.tika.mime.MediaTypeRegistry; import org.apache.tika.mime.MimeTypeException; @@ -104,6 +106,7 @@ public class TikaConfig { private final EncodingDetector encodingDetector; private final Renderer renderer; private final MetadataFilter metadataFilter; + private final MetadataListFilter metadataListFilter; private final AutoDetectParserConfig autoDetectParserConfig; private static int MAX_JSON_STRING_FIELD_LENGTH = DEFAULT_MAX_JSON_STRING_FIELD_LENGTH; @@ -177,6 +180,7 @@ private TikaConfig(Element element, ServiceLoader loader) throws TikaException, this.translator = translatorLoader.loadOverall(element, mimeTypes, loader); this.executorService = executorLoader.loadOverall(element, mimeTypes, loader); this.metadataFilter = MetadataFilter.load(element, true); + this.metadataListFilter = MetadataListFilter.load(element, true); this.autoDetectParserConfig = AutoDetectParserConfig.load(element); this.serviceLoader = loader; setMaxJsonStringFieldLength(element); 
@@ -205,6 +209,7 @@ public TikaConfig(ClassLoader loader) throws MimeTypeException, IOException { this.translator = getDefaultTranslator(serviceLoader); this.executorService = getDefaultExecutorService(); this.metadataFilter = new NoOpFilter(); + this.metadataListFilter = new NoOpListFilter(); this.autoDetectParserConfig = AutoDetectParserConfig.DEFAULT; TIMES_INSTANTIATED.incrementAndGet(); } @@ -251,6 +256,7 @@ public TikaConfig() throws TikaException, IOException { this.translator = getDefaultTranslator(serviceLoader); this.executorService = getDefaultExecutorService(); this.metadataFilter = new NoOpFilter(); + this.metadataListFilter = new NoOpListFilter(); this.autoDetectParserConfig = AutoDetectParserConfig.DEFAULT; } else { ServiceLoader tmpServiceLoader = new ServiceLoader(); @@ -278,6 +284,7 @@ public TikaConfig() throws TikaException, IOException { this.executorService = executorLoader.loadOverall(element, mimeTypes, serviceLoader); this.metadataFilter = MetadataFilter.load(element, true); + this.metadataListFilter = MetadataListFilter.load(element, true); this.autoDetectParserConfig = AutoDetectParserConfig.load(element); setMaxJsonStringFieldLength(element); } catch (SAXException e) { @@ -629,6 +636,10 @@ public MetadataFilter getMetadataFilter() { return metadataFilter; } + public MetadataListFilter getMetadataListFilter() { + return metadataListFilter; + } + public AutoDetectParserConfig getAutoDetectParserConfig() { return autoDetectParserConfig; } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/listfilter/CompositeMetadataListFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/listfilter/CompositeMetadataListFilter.java new file mode 100644 index 0000000000..cede25bd52 --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/metadata/listfilter/CompositeMetadataListFilter.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.metadata.listfilter; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; + +public class CompositeMetadataListFilter extends MetadataListFilter { + + //no longer final to allow for no arg initialization during serialization + private List filters; + + public CompositeMetadataListFilter() { + filters = new ArrayList<>(); + } + public CompositeMetadataListFilter(List filters) { + this.filters = new ArrayList<>(filters); //defensive copy: setFilters() clears this list, which must not touch (or fail on) the caller's possibly immutable list + } + + public void setFilters(List filters) { + this.filters.clear(); + this.filters.addAll(filters); + } + + public List getFilters() { + return filters; + } + + @Override + public List filter(List metadataList) throws TikaException { + for (MetadataListFilter filter : filters) { + metadataList = filter.filter(metadataList); + } + return metadataList; + } + + @Override + public String toString() { + return "CompositeMetadataListFilter{" + "filters=" + filters + '}'; + } +} diff --git a/tika-core/src/main/java/org/apache/tika/metadata/listfilter/MetadataListFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/listfilter/MetadataListFilter.java new file mode 100644 index 0000000000..93021da7c3 --- /dev/null +++ 
b/tika-core/src/main/java/org/apache/tika/metadata/listfilter/MetadataListFilter.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.metadata.listfilter; + +import java.io.IOException; +import java.io.Serializable; +import java.util.List; + +import org.w3c.dom.Element; + +import org.apache.tika.config.ConfigBase; +import org.apache.tika.exception.TikaConfigException; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.filter.MetadataFilter; + +public abstract class MetadataListFilter extends ConfigBase implements Serializable { + /** + * Loads the metadata list filter from the config file; list-level counterpart of {@link MetadataFilter}. + * @param root element that is handed to buildComposite to look up "metadataListFilters" + * @return the composite built from the "metadataListFilter" entries, or a NoOpListFilter if allowMissing is true and the element could not be found + * @throws TikaConfigException if building fails for any reason other than a tolerated missing element + * @throws IOException propagated from buildComposite + */ + public static MetadataListFilter load(Element root, boolean allowMissing) throws TikaConfigException, + IOException { + try { + return buildComposite("metadataListFilters", CompositeMetadataListFilter.class, + "metadataListFilter", MetadataListFilter.class, root); + } catch (TikaConfigException e) { + if (allowMissing && e.getMessage().contains("could not find metadataListFilters")) { + 
return new NoOpListFilter(); + } + throw e; + } + } + public abstract List filter(List metadataList) throws TikaException; +} diff --git a/tika-core/src/main/java/org/apache/tika/metadata/listfilter/NoOpListFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/listfilter/NoOpListFilter.java new file mode 100644 index 0000000000..68654e4f2c --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/metadata/listfilter/NoOpListFilter.java @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.metadata.listfilter; + +import java.util.List; + +import org.apache.tika.metadata.Metadata; + +public class NoOpListFilter extends MetadataListFilter { + @Override + public List filter(List metadataList) { + return metadataList; + } +} diff --git a/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java b/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java index dffb7c9ce2..e339b619fe 100644 --- a/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java +++ b/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java @@ -58,6 +58,8 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.metadata.filter.MetadataFilter; +import org.apache.tika.metadata.listfilter.MetadataListFilter; +import org.apache.tika.metadata.listfilter.NoOpListFilter; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.DigestingParser; @@ -400,11 +402,8 @@ private void emitParseData(FetchEmitTuple t, MetadataListAndEmbeddedBytes parseD long start = System.currentTimeMillis(); String stack = getContainerStacktrace(t, parseData.getMetadataList()); //we need to apply the metadata filter after we pull out the stacktrace - MetadataFilter filter = t.getParseContext().get(MetadataFilter.class); - if (filter == null) { - filter = tikaConfig.getMetadataFilter(); - } - filterMetadata(filter, parseData.getMetadataList()); + filterMetadata(t, parseData.getMetadataList()); + filterMetadataList(t, parseData); ParseContext parseContext = t.getParseContext(); FetchEmitTuple.ON_PARSE_EXCEPTION onParseException = t.getOnParseException(); EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig = parseContext.get(EmbeddedDocumentBytesConfig.class); @@ -437,16 +436,35 @@ private void emitParseData(FetchEmitTuple t, MetadataListAndEmbeddedBytes parseD } } - private void filterMetadata(MetadataFilter metadataFilter, List 
metadataList) { + private void filterMetadata(FetchEmitTuple t, List metadataList) { + MetadataFilter filter = t.getParseContext().get(MetadataFilter.class); + if (filter == null) { + filter = tikaConfig.getMetadataFilter(); + } for (Metadata m : metadataList) { try { - metadataFilter.filter(m); + filter.filter(m); } catch (TikaException e) { LOG.warn("failed to filter metadata", e); } } } + private void filterMetadataList(FetchEmitTuple t, MetadataListAndEmbeddedBytes parseData) { + MetadataListFilter filter = t.getParseContext().get(MetadataListFilter.class); + if (filter == null) { + filter = tikaConfig.getMetadataListFilter(); + } + if (filter instanceof NoOpListFilter) { + return; + } + try { + parseData.filter(filter); + } catch (TikaException e) { + LOG.warn("failed to filter metadata list", e); + } + } + private Fetcher getFetcher(FetchEmitTuple t) { try { return fetcherManager.getFetcher(t.getFetchKey().getFetcherName()); @@ -830,7 +848,8 @@ private void write(STATUS status) { } static class MetadataListAndEmbeddedBytes { - final List metadataList; + + List metadataList; final Optional embeddedDocumentBytesHandler; public MetadataListAndEmbeddedBytes(List metadataList, @@ -843,6 +862,10 @@ public List getMetadataList() { return metadataList; } + public void filter(MetadataListFilter filter) throws TikaException { + metadataList = filter.filter(metadataList); + } + public EmbeddedDocumentBytesHandler getEmbeddedDocumentBytesHandler() { return embeddedDocumentBytesHandler.get(); } diff --git a/tika-core/src/test/java/org/apache/tika/metadata/listfilter/AttachmentCountingListFilter.java b/tika-core/src/test/java/org/apache/tika/metadata/listfilter/AttachmentCountingListFilter.java new file mode 100644 index 0000000000..daa68c9280 --- /dev/null +++ b/tika-core/src/test/java/org/apache/tika/metadata/listfilter/AttachmentCountingListFilter.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license 
agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.metadata.listfilter; + +import java.util.List; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; + +public class AttachmentCountingListFilter extends MetadataListFilter { + @Override + public List filter(List metadataList) throws TikaException { + if (metadataList == null || metadataList.isEmpty()) { + return metadataList; + } + metadataList.get(0).set("X-TIKA:attachment_count", Integer.toString(metadataList.size() - 1)); + return metadataList; + } +} diff --git a/tika-core/src/test/java/org/apache/tika/pipes/PipesClientTest.java b/tika-core/src/test/java/org/apache/tika/pipes/PipesClientTest.java index 13b0dc312c..35d52fc4a6 100644 --- a/tika-core/src/test/java/org/apache/tika/pipes/PipesClientTest.java +++ b/tika-core/src/test/java/org/apache/tika/pipes/PipesClientTest.java @@ -32,6 +32,9 @@ import org.apache.tika.metadata.filter.CompositeMetadataFilter; import org.apache.tika.metadata.filter.MetadataFilter; import org.apache.tika.metadata.filter.MockUpperCaseFilter; +import org.apache.tika.metadata.listfilter.AttachmentCountingListFilter; +import org.apache.tika.metadata.listfilter.CompositeMetadataListFilter; +import org.apache.tika.metadata.listfilter.MetadataListFilter; import 
org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.emitter.EmitKey; import org.apache.tika.pipes.fetcher.FetchKey; @@ -76,4 +79,18 @@ public void testMetadataFilter() throws IOException, InterruptedException { Metadata metadata = pipesResult.getEmitData().getMetadataList().get(0); Assertions.assertEquals("TESTOVERLAPPINGTEXT.PDF", metadata.get("resourceName")); } + + @Test + public void testMetadataListFilter() throws IOException, InterruptedException { + ParseContext parseContext = new ParseContext(); + MetadataListFilter metadataFilter = new CompositeMetadataListFilter(List.of(new AttachmentCountingListFilter())); + parseContext.set(MetadataListFilter.class, metadataFilter); + PipesResult pipesResult = pipesClient.process( + new FetchEmitTuple("mock/embedded.xml", new FetchKey(fetcherName, "mock/embedded.xml"), + new EmitKey(), new Metadata(), parseContext, FetchEmitTuple.ON_PARSE_EXCEPTION.SKIP)); + Assertions.assertNotNull(pipesResult.getEmitData().getMetadataList()); + Assertions.assertEquals(5, pipesResult.getEmitData().getMetadataList().size()); + Metadata metadata = pipesResult.getEmitData().getMetadataList().get(0); + Assertions.assertEquals(4, Integer.parseInt(metadata.get("X-TIKA:attachment_count"))); + } } diff --git a/tika-core/src/test/resources/test-documents/mock/embedded.xml b/tika-core/src/test/resources/test-documents/mock/embedded.xml new file mode 100644 index 0000000000..c75c2fce6b --- /dev/null +++ b/tika-core/src/test/resources/test-documents/mock/embedded.xml @@ -0,0 +1,53 @@ + + + + + + + Nikolai Lobachevsky + main_content + + + <mock> + <metadata action="add" name="author">embeddedAuthor</metadata> + <write element="p">some_embedded_content</write> + </mock> + + + <mock> + <metadata action="add" name="author">embeddedAuthor</metadata> + <write element="p">some_embedded_content</write> + </mock> + + + <mock> + <metadata action="add" name="author">embeddedAuthor</metadata> + <write element="p">some_embedded_content</write> + 
</mock> + + + <mock> + <metadata action="add" name="author">embeddedAuthor</metadata> + <write element="p">some_embedded_content</write> + </mock> + + + \ No newline at end of file diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java index a180ddfba1..8b26a672a2 100644 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java @@ -19,6 +19,7 @@ import static org.apache.tika.server.core.resource.TikaResource.fillMetadata; import static org.apache.tika.server.core.resource.TikaResource.fillParseContext; +import static org.apache.tika.server.core.resource.TikaResource.getConfig; import java.io.InputStream; import java.util.List; @@ -40,6 +41,7 @@ import org.slf4j.LoggerFactory; import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.listfilter.MetadataListFilter; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.RecursiveParserWrapper; @@ -84,8 +86,8 @@ public static List parseMetadata(InputStream is, Metadata metadata, Mu //we shouldn't get here? LOG.error("something went seriously wrong", e); } - - return handler.getMetadataList(); + MetadataListFilter metadataListFilter = context.get(MetadataListFilter.class, getConfig().getMetadataListFilter()); + return metadataListFilter.filter(handler.getMetadataList()); } static HandlerConfig buildHandlerConfig(MultivaluedMap httpHeaders, String handlerTypeName, HandlerConfig.PARSE_MODE parseMode) {