From d32891d98f7a36e84c87a2bcf3ca72d69b4af3af Mon Sep 17 00:00:00 2001 From: ruwi-next <144923150+ruwi-next@users.noreply.github.com> Date: Thu, 31 Oct 2024 18:09:03 +0000 Subject: [PATCH] [TIKA-4337] Fix XPS regressions (#2028) * [TIKA-4337] Fix XPS exceptions Add missing indices check Only parse glyph advance * Add GlyphRun split, VisualBrush, improved y check (cherry picked from commit ce6a3a650418c27e11e6a8df79afe3a285e71521) --- .../ooxml/xps/XPSPageContentHandler.java | 157 ++++++++++++++---- 1 file changed, 127 insertions(+), 30 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSPageContentHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSPageContentHandler.java index 41a4b029b3..e6464639ff 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSPageContentHandler.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xps/XPSPageContentHandler.java @@ -53,6 +53,8 @@ class XPSPageContentHandler extends DefaultHandler { private static final String CANVAS = "Canvas"; private static final String CLIP = "Clip"; private static final String NULL_CLIP = "NULL_CLIP"; + private static final String VISUAL_BRUSH = "VisualBrush"; + private static final String TRANSFORM = "Transform"; private static final String UNICODE_STRING = "UnicodeString"; private static final String ORIGIN_X = "OriginX"; private static final String ORIGIN_Y = "OriginY"; @@ -84,6 +86,9 @@ class XPSPageContentHandler extends DefaultHandler { // The threshold for the horizontal distance between glyph runs to insert a whitespace, measured in em private static final float WHITESPACE_THRESHOLD = 0.3f; + // The threshold for the horizontal distance between glyphs to split that glyph run into two, measured in em + private static final float SPLIT_THRESHOLD = 1.0f; + // The threshold for the vertical distance between glyph runs to be considered on the same row, measured in em private static final float ROW_COMBINE_THRESHOLD = 0.5f; @@ -147,6 +152,15 @@ public void startElement(String uri, String localName, String qName, Attributes canvasStack.push(clip); } return; + } else if (VISUAL_BRUSH.equals(localName)) { + // Also push visual brush transform onto stack as this will move children + String transform = getVal(TRANSFORM, atts); + if (transform == null) { + canvasStack.push(NULL_CLIP); + } else { + canvasStack.push(transform); + } + return; } else if (PATH.equals(localName)) { //for now just grab them and dump them at the end of the page. String url = getVal(NAVIGATE_URI, atts); @@ -165,7 +179,7 @@ public void startElement(String uri, String localName, String qName, Attributes Float originX = null; Float originY = null; String unicodeString = null; - int bidilevel = 1; + Integer bidilevel = null; List indices = null; float fontSize = 0; String fontUri = null; @@ -208,37 +222,37 @@ public void startElement(String uri, String localName, String qName, Attributes if (unicodeString != null) { originX = (originX == null) ? Integer.MIN_VALUE : originX; originY = (originY == null) ? Integer.MAX_VALUE : originY; - String currentCanvasClip = (canvasStack.size() > 0) ? canvasStack.peek() : NULL_CLIP; - List runs = canvases.get(currentCanvasClip); + StringBuilder canvasStringBuilder = new StringBuilder(); + for (String s : canvasStack) { + canvasStringBuilder.append(s); + canvasStringBuilder.append(';'); + } + String canvasCombined = canvasStringBuilder.toString(); + List runs = canvases.get(canvasCombined); if (runs == null) { runs = new ArrayList<>(); } + if (indices == null) { + indices = new ArrayList<>(); + } runs.add(new GlyphRun(name, originY, originX, unicodeString, bidilevel, indices, fontSize, fontUri)); - canvases.put(currentCanvasClip, runs); + canvases.put(canvasCombined, runs); } } // Parses a indices string into a list of GlyphIndex - private static List parseIndicesString(String indicesString) throws SAXException { + private List parseIndicesString(String indicesString) throws SAXException { try { ArrayList indices = new ArrayList<>(); for (String indexString : indicesString.split(";", -1)) { - if (indexString.isEmpty()) { - indices.add(new GlyphIndex(0, 0.0f)); - continue; - } - int commaIndex = indexString.indexOf(','); - if (commaIndex == -1) { - int glyphIndex = Integer.parseInt(indexString); - indices.add(new GlyphIndex(glyphIndex, 0.0f)); + // We only want to extract the advance which will be the second comma separated value + String[] commaSplit = indexString.split(",", -1); + if (commaSplit.length < 2) { + indices.add(new GlyphIndex(0.0f)); } else { - int glyphIndex = 0; - if (commaIndex > 0) { - glyphIndex = Integer.parseInt(indexString.substring(0, commaIndex)); - } // Advance is measured in hundreths so divide by 100 - float advance = Float.parseFloat(indexString.substring(commaIndex + 1)) / 100.0f; - indices.add(new GlyphIndex(glyphIndex, advance)); + float advance = Float.parseFloat(commaSplit[1]) / 100.0f; + indices.add(new GlyphIndex(advance)); } } return indices; @@ -322,6 +336,7 @@ private final void writePage() throws SAXException { } private void writeRow(List row) throws SAXException { + row = splitRow(row); sortRow(row); xhml.startElement(P); @@ -340,6 +355,62 @@ private void writeRow(List row) throws SAXException { xhml.endElement(P); } + // Returns a new list of glyph runs in a row after splitting any runs with large advances into multiple runs + // This fixes issues where a single run has a large advance and text visually is placed in that gap so should be read in a different order + private static List splitRow(List row) { + List newRuns = new ArrayList<>(); + for (int j = 0; j < row.size(); j++) { + GlyphRun run = row.get(j); + // TODO: Implement splitting for RTL too + if (run.direction != GlyphRun.DIRECTION.LTR) { + newRuns.add(run); + continue; + } + float width = 0f; + for (int i = 0; i < run.indices.size() - 1; i++) { + GlyphIndex index = run.indices.get(i); + if (index.advance == 0.0f) { + if (i == 0) { + // If this is the first glyph use hard coded estimate + width += ESTIMATE_GLYPH_WIDTH; + } else { + // If advance is 0.0 it is probably the last glyph in the run, we don't know how wide it is so we use the average of the previous widths as an estimate + width += width / i; + } + } else { + width += index.advance; + } + if (index.advance > SPLIT_THRESHOLD) { + newRuns.add(new GlyphRun( + run.name, + run.originY, + run.originX, + run.unicodeString.substring(0, i + 1), + null, + run.indices.subList(0, i + 1), + run.fontSize, + run.fontUri + )); + run.indices.set(i, new GlyphIndex(0.0f)); + run = new GlyphRun( + run.name, + run.originY, + run.originX + width * run.fontSize, + run.unicodeString.substring(i + 1, run.unicodeString.length()), + null, + run.indices.subList(i + 1, run.indices.size()), + run.fontSize, + run.fontUri + ); + i = 0; + width = 0f; + } + } + newRuns.add(run); + } + return newRuns; + } + private static void sortRow(List row) { boolean allRTL = true; for (GlyphRun run : row) { @@ -374,11 +445,10 @@ private List> buildRows(List glyphRuns) { continue; } else { boolean addedNewRow = false; - //can rely on the last row having the highest y - List row = rows.get(rows.size() - 1); - GlyphRun lastRun = row.get(row.size() - 1); - float averageFontSize = (glyphRun.fontSize + lastRun.fontSize) / 2f; - if (Math.abs(glyphRun.originY - lastRun.originY) < averageFontSize * ROW_COMBINE_THRESHOLD) { + List row = findClosestRowVertically(rows, glyphRun.originY); + GlyphRun firstRun = row.get(0); + float averageFontSize = (glyphRun.fontSize + firstRun.fontSize) / 2f; + if (Math.abs(glyphRun.originY - firstRun.originY) < averageFontSize * ROW_COMBINE_THRESHOLD) { row.add(glyphRun); } else { row = new ArrayList<>(); @@ -400,6 +470,30 @@ private List> buildRows(List glyphRuns) { return rows; } + // Search to find the closest row vertically to the y, if rows is empty returns null + private List findClosestRowVertically(List> rows, float y) { + List best = null; + float bestDistance = Float.POSITIVE_INFINITY; + // Loop backwards since normally XPS files are in order so we will match the last element + // TODO: This could be optimised using a binary search since we know rows is sorted + for (int i = rows.size() - 1; i >= 0; i--) { + List row = rows.get(i); + if (row.size() == 0) { + continue; + } + float distance = Math.abs(row.get(row.size() - 1).originY - y); + // There is nothing better than 0 + if (distance == 0f) { + return row; + } + if (distance < bestDistance) { + best = row; + bestDistance = distance; + } + } + return best; + } + final static class GlyphRun { //TODO: use name in conjunction with Frag information @@ -475,18 +569,21 @@ private float width() { } final static class GlyphIndex { - // The index of the glyph in the font - private final int index; + // TODO: Parse other elements of GlyphIndex + + // private int index; + // private int clusterCodeUnitCount; + // private int clusterGlyphCount; + // The placement of the glyph that follows relative to the origin of the current glyph. Measured as a multiple of the fonts em-size. // Should be multiplied by the font em-size to get a value that can be compared across GlyphRuns // Will be zero for the last glpyh in a glyph run private final float advance; + // private float uOffset; + // private float vOffset; - private GlyphIndex(int index, float advance) { - this.index = index; + private GlyphIndex(float advance) { this.advance = advance; } - } - }