Skip to content

Commit

Permalink
TIKA-4330 -- Add a MetadataListFilter
Browse files Browse the repository at this point in the history
  • Loading branch information
tballison committed Oct 22, 2024
1 parent 56f56f3 commit 2e09f0d
Show file tree
Hide file tree
Showing 11 changed files with 300 additions and 15 deletions.
4 changes: 3 additions & 1 deletion tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
Original file line number Diff line number Diff line change
Expand Up @@ -499,7 +499,9 @@ private void handleRecursiveJson(URL url, OutputStream output) throws IOExceptio
}
JsonMetadataList.setPrettyPrinting(prettyPrint);
try (Writer writer = getOutputWriter(output, encoding)) {
JsonMetadataList.toJson(handler.getMetadataList(), writer);
List<Metadata> metadataList = handler.getMetadataList();
metadataList = config.getMetadataListFilter().filter(metadataList);
JsonMetadataList.toJson(metadataList, writer);
}
}

Expand Down
14 changes: 10 additions & 4 deletions tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
import java.nio.file.Files;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import javax.swing.Box;
Expand Down Expand Up @@ -152,9 +153,11 @@ public class TikaGUI extends JFrame implements ActionListener, HyperlinkListener
* File chooser.
*/
private final JFileChooser chooser = new JFileChooser();
private final TikaConfig tikaConfig;

public TikaGUI(Parser parser) {
public TikaGUI(Parser parser, TikaConfig tikaConfig) {
super("Apache Tika");
this.tikaConfig = tikaConfig;
setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);

addMenuBar();
Expand Down Expand Up @@ -198,8 +201,9 @@ public static void main(String[] args) throws Exception {
UIManager.setLookAndFeel(UIManager.getSystemLookAndFeelClassName());
final TikaConfig finalConfig = config;
SwingUtilities.invokeLater(() -> new TikaGUI(
new DigestingParser(new AutoDetectParser(finalConfig), new CommonsDigester(MAX_MARK, CommonsDigester.DigestAlgorithm.MD5, CommonsDigester.DigestAlgorithm.SHA256),
false)).setVisible(true));
new DigestingParser(new AutoDetectParser(finalConfig),
new CommonsDigester(MAX_MARK, CommonsDigester.DigestAlgorithm.MD5, CommonsDigester.DigestAlgorithm.SHA256),
false), finalConfig).setVisible(true));
}

private void addMenuBar() {
Expand Down Expand Up @@ -374,7 +378,9 @@ private void handleStream(InputStream input, Metadata md) throws Exception {
wrapper.parse(input, recursiveParserWrapperHandler, new Metadata(), new ParseContext());
StringWriter jsonBuffer = new StringWriter();
JsonMetadataList.setPrettyPrinting(true);
JsonMetadataList.toJson(recursiveParserWrapperHandler.getMetadataList(), jsonBuffer);
List<Metadata> metadataList = recursiveParserWrapperHandler.getMetadataList();
metadataList = tikaConfig.getMetadataListFilter().filter(metadataList);
JsonMetadataList.toJson(metadataList, jsonBuffer);
setText(json, jsonBuffer.toString());
}
layout.show(cards, "metadata");
Expand Down
11 changes: 11 additions & 0 deletions tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@
import org.apache.tika.language.translate.Translator;
import org.apache.tika.metadata.filter.MetadataFilter;
import org.apache.tika.metadata.filter.NoOpFilter;
import org.apache.tika.metadata.listfilter.MetadataListFilter;
import org.apache.tika.metadata.listfilter.NoOpListFilter;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MediaTypeRegistry;
import org.apache.tika.mime.MimeTypeException;
Expand Down Expand Up @@ -104,6 +106,7 @@ public class TikaConfig {
private final EncodingDetector encodingDetector;
private final Renderer renderer;
private final MetadataFilter metadataFilter;
private final MetadataListFilter metadataListFilter;
private final AutoDetectParserConfig autoDetectParserConfig;

private static int MAX_JSON_STRING_FIELD_LENGTH = DEFAULT_MAX_JSON_STRING_FIELD_LENGTH;
Expand Down Expand Up @@ -177,6 +180,7 @@ private TikaConfig(Element element, ServiceLoader loader) throws TikaException,
this.translator = translatorLoader.loadOverall(element, mimeTypes, loader);
this.executorService = executorLoader.loadOverall(element, mimeTypes, loader);
this.metadataFilter = MetadataFilter.load(element, true);
this.metadataListFilter = MetadataListFilter.load(element, true);
this.autoDetectParserConfig = AutoDetectParserConfig.load(element);
this.serviceLoader = loader;
setMaxJsonStringFieldLength(element);
Expand Down Expand Up @@ -205,6 +209,7 @@ public TikaConfig(ClassLoader loader) throws MimeTypeException, IOException {
this.translator = getDefaultTranslator(serviceLoader);
this.executorService = getDefaultExecutorService();
this.metadataFilter = new NoOpFilter();
this.metadataListFilter = new NoOpListFilter();
this.autoDetectParserConfig = AutoDetectParserConfig.DEFAULT;
TIMES_INSTANTIATED.incrementAndGet();
}
Expand Down Expand Up @@ -251,6 +256,7 @@ public TikaConfig() throws TikaException, IOException {
this.translator = getDefaultTranslator(serviceLoader);
this.executorService = getDefaultExecutorService();
this.metadataFilter = new NoOpFilter();
this.metadataListFilter = new NoOpListFilter();
this.autoDetectParserConfig = AutoDetectParserConfig.DEFAULT;
} else {
ServiceLoader tmpServiceLoader = new ServiceLoader();
Expand Down Expand Up @@ -278,6 +284,7 @@ public TikaConfig() throws TikaException, IOException {
this.executorService =
executorLoader.loadOverall(element, mimeTypes, serviceLoader);
this.metadataFilter = MetadataFilter.load(element, true);
this.metadataListFilter = MetadataListFilter.load(element, true);
this.autoDetectParserConfig = AutoDetectParserConfig.load(element);
setMaxJsonStringFieldLength(element);
} catch (SAXException e) {
Expand Down Expand Up @@ -629,6 +636,10 @@ public MetadataFilter getMetadataFilter() {
return metadataFilter;
}

public MetadataListFilter getMetadataListFilter() {
return metadataListFilter;
}

public AutoDetectParserConfig getAutoDetectParserConfig() {
return autoDetectParserConfig;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.metadata.listfilter;

import java.util.ArrayList;
import java.util.List;

import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;

/**
 * A {@link MetadataListFilter} that runs a sequence of delegate filters in order.
 * The list returned by each delegate is handed to the next one, and the list
 * returned by the last delegate is the overall result.
 */
public class CompositeMetadataListFilter extends MetadataListFilter {

    //no longer final to allow for no arg initialization during serialization
    private List<MetadataListFilter> filters;

    /**
     * Creates a composite with no delegate filters; {@link #filter(List)} then
     * returns its argument unchanged.
     */
    public CompositeMetadataListFilter() {
        filters = new ArrayList<>();
    }

    /**
     * Creates a composite backed by the given list of delegate filters.
     *
     * @param filters the delegates to apply, in iteration order
     */
    public CompositeMetadataListFilter(List<MetadataListFilter> filters) {
        this.filters = filters;
    }

    /**
     * Replaces the delegate filters with a copy of the given list.
     *
     * @param filters the new delegates to apply, in iteration order
     */
    public void setFilters(List<MetadataListFilter> filters) {
        // Copy the argument into a fresh list instead of clearing the current one
        // in place: clearing first would empty the argument when the caller passes
        // the list returned by getFilters(), and would throw if this instance had
        // been constructed with an unmodifiable list.
        this.filters = new ArrayList<>(filters);
    }

    /**
     * @return the delegate filters currently held by this composite
     */
    public List<MetadataListFilter> getFilters() {
        return filters;
    }

    /**
     * Applies every delegate in order, chaining each result into the next delegate.
     *
     * @param metadataList the list to filter
     * @return the list returned by the last delegate, or {@code metadataList}
     *         itself if there are no delegates
     * @throws TikaException if any delegate throws it
     */
    @Override
    public List<Metadata> filter(List<Metadata> metadataList) throws TikaException {
        for (MetadataListFilter filter : filters) {
            metadataList = filter.filter(metadataList);
        }
        return metadataList;
    }

    @Override
    public String toString() {
        return "CompositeMetadataListFilter{" + "filters=" + filters + '}';
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.metadata.listfilter;

import java.io.IOException;
import java.io.Serializable;
import java.util.List;

import org.w3c.dom.Element;

import org.apache.tika.config.ConfigBase;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.filter.MetadataFilter;

/**
 * Base class for filters that operate on a whole list of {@link Metadata} objects
 * at once, in contrast to {@link MetadataFilter}, which is applied to a single
 * {@link Metadata} object at a time.
 */
public abstract class MetadataListFilter extends ConfigBase implements Serializable {

    /**
     * Loads the metadata list filter(s) from the <code>metadataListFilters</code>
     * element of the config.
     *
     * @param root the config element to read from
     * @param allowMissing if <code>true</code>, a {@link NoOpListFilter} is returned when the
     *                     config does not contain a <code>metadataListFilters</code> element;
     *                     if <code>false</code>, the underlying exception is rethrown
     * @return the configured composite filter, or a {@link NoOpListFilter} as described above
     * @throws TikaConfigException if the config cannot be turned into a filter
     * @throws IOException declared for callers; propagated from the config loading
     */
    public static MetadataListFilter load(Element root, boolean allowMissing)
            throws TikaConfigException, IOException {
        try {
            // The item class must be MetadataListFilter: the items inside
            // <metadataListFilters> are list filters, not per-object MetadataFilters.
            return buildComposite("metadataListFilters", CompositeMetadataListFilter.class,
                    "metadataListFilter", MetadataListFilter.class, root);
        } catch (TikaConfigException e) {
            // getMessage() may be null; only a "missing element" failure is tolerated.
            String message = e.getMessage();
            if (allowMissing && message != null &&
                    message.contains("could not find metadataListFilters")) {
                return new NoOpListFilter();
            }
            throw e;
        }
    }

    /**
     * Filters the given list of metadata objects.
     *
     * @param metadataList the list to filter
     * @return the filtered list; implementations may return the same instance
     * @throws TikaException if filtering fails
     */
    public abstract List<Metadata> filter(List<Metadata> metadataList) throws TikaException;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.metadata.listfilter;

import java.util.List;

import org.apache.tika.metadata.Metadata;

/**
 * A {@link MetadataListFilter} that performs no filtering at all.
 */
public class NoOpListFilter extends MetadataListFilter {

    /**
     * Hands the list straight back to the caller.
     *
     * @param metadataList the list to "filter"
     * @return the very same {@code metadataList} instance, untouched
     */
    @Override
    public List<Metadata> filter(final List<Metadata> metadataList) {
        return metadataList;
    }
}
39 changes: 31 additions & 8 deletions tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.filter.MetadataFilter;
import org.apache.tika.metadata.listfilter.MetadataListFilter;
import org.apache.tika.metadata.listfilter.NoOpListFilter;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.DigestingParser;
Expand Down Expand Up @@ -400,11 +402,8 @@ private void emitParseData(FetchEmitTuple t, MetadataListAndEmbeddedBytes parseD
long start = System.currentTimeMillis();
String stack = getContainerStacktrace(t, parseData.getMetadataList());
//we need to apply the metadata filter after we pull out the stacktrace
MetadataFilter filter = t.getParseContext().get(MetadataFilter.class);
if (filter == null) {
filter = tikaConfig.getMetadataFilter();
}
filterMetadata(filter, parseData.getMetadataList());
filterMetadata(t, parseData.getMetadataList());
filterMetadataList(t, parseData);
ParseContext parseContext = t.getParseContext();
FetchEmitTuple.ON_PARSE_EXCEPTION onParseException = t.getOnParseException();
EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig = parseContext.get(EmbeddedDocumentBytesConfig.class);
Expand Down Expand Up @@ -437,16 +436,35 @@ private void emitParseData(FetchEmitTuple t, MetadataListAndEmbeddedBytes parseD
}
}

private void filterMetadata(MetadataFilter metadataFilter, List<Metadata> metadataList) {
private void filterMetadata(FetchEmitTuple t, List<Metadata> metadataList) {
MetadataFilter filter = t.getParseContext().get(MetadataFilter.class);
if (filter == null) {
filter = tikaConfig.getMetadataFilter();
}
for (Metadata m : metadataList) {
try {
metadataFilter.filter(m);
filter.filter(m);
} catch (TikaException e) {
LOG.warn("failed to filter metadata", e);
}
}
}

/**
 * Applies a {@link MetadataListFilter} to the parse data. A filter found in the
 * tuple's parse context takes precedence; otherwise the one from the
 * {@code tikaConfig} is used. Nothing is done for a {@link NoOpListFilter}, and a
 * {@link TikaException} thrown while filtering is logged rather than propagated.
 *
 * @param t the tuple whose parse context may carry a per-request filter
 * @param parseData the parse result whose metadata list is filtered in place
 */
private void filterMetadataList(FetchEmitTuple t, MetadataListAndEmbeddedBytes parseData) {
    MetadataListFilter listFilter = t.getParseContext().get(MetadataListFilter.class);
    listFilter = (listFilter != null) ? listFilter : tikaConfig.getMetadataListFilter();

    if (!(listFilter instanceof NoOpListFilter)) {
        try {
            parseData.filter(listFilter);
        } catch (TikaException e) {
            LOG.warn("failed to filter metadata list", e);
        }
    }
}

private Fetcher getFetcher(FetchEmitTuple t) {
try {
return fetcherManager.getFetcher(t.getFetchKey().getFetcherName());
Expand Down Expand Up @@ -830,7 +848,8 @@ private void write(STATUS status) {
}

static class MetadataListAndEmbeddedBytes {
final List<Metadata> metadataList;

List<Metadata> metadataList;
final Optional<EmbeddedDocumentBytesHandler> embeddedDocumentBytesHandler;

public MetadataListAndEmbeddedBytes(List<Metadata> metadataList,
Expand All @@ -843,6 +862,10 @@ public List<Metadata> getMetadataList() {
return metadataList;
}

public void filter(MetadataListFilter filter) throws TikaException {
metadataList = filter.filter(metadataList);
}

public EmbeddedDocumentBytesHandler getEmbeddedDocumentBytesHandler() {
return embeddedDocumentBytesHandler.get();
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.metadata.listfilter;

import java.util.List;

import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;

/**
 * A {@link MetadataListFilter} that records on the first metadata object how many
 * further objects follow it in the list, under the key
 * <code>X-TIKA:attachment_count</code>.
 */
public class AttachmentCountingListFilter extends MetadataListFilter {

    /**
     * Sets <code>X-TIKA:attachment_count</code> on the first element to the list
     * size minus one. A {@code null} or empty list is returned as is.
     *
     * @param metadataList the list to annotate
     * @return the same {@code metadataList} instance that was passed in
     * @throws TikaException declared by the parent contract
     */
    @Override
    public List<Metadata> filter(List<Metadata> metadataList) throws TikaException {
        if (metadataList != null && !metadataList.isEmpty()) {
            // every entry after the first is counted
            // NOTE(review): presumably the first entry is the container document - confirm
            int trailingEntries = metadataList.size() - 1;
            Metadata first = metadataList.get(0);
            first.set("X-TIKA:attachment_count", Integer.toString(trailingEntries));
        }
        return metadataList;
    }
}
Loading

0 comments on commit 2e09f0d

Please sign in to comment.