Skip to content

Commit

Permalink
[TIKA-4337] Fix XPS regressions (#2028)
Browse files Browse the repository at this point in the history
* [TIKA-4337] Fix XPS exceptions

Add missing indices check

Only parse glyph advance

* Add GlyphRun split, VisualBrush, improved y check

(cherry picked from commit ce6a3a6)
  • Loading branch information
ruwi-next authored and tballison committed Oct 31, 2024
1 parent 57c8974 commit d32891d
Showing 1 changed file with 127 additions and 30 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ class XPSPageContentHandler extends DefaultHandler {
private static final String CANVAS = "Canvas";
private static final String CLIP = "Clip";
private static final String NULL_CLIP = "NULL_CLIP";
private static final String VISUAL_BRUSH = "VisualBrush";
private static final String TRANSFORM = "Transform";
private static final String UNICODE_STRING = "UnicodeString";
private static final String ORIGIN_X = "OriginX";
private static final String ORIGIN_Y = "OriginY";
Expand Down Expand Up @@ -84,6 +86,9 @@ class XPSPageContentHandler extends DefaultHandler {
// The threshold for the horizontal distance between glyph runs to insert a whitespace, measured in em
private static final float WHITESPACE_THRESHOLD = 0.3f;

// The threshold for the horizontal distance between glyphs to split that glyph run into two, measured in em
private static final float SPLIT_THRESHOLD = 1.0f;

// The threshold for the vertical distance between glyph runs to be considered on the same row, measured in em
private static final float ROW_COMBINE_THRESHOLD = 0.5f;

Expand Down Expand Up @@ -147,6 +152,15 @@ public void startElement(String uri, String localName, String qName, Attributes
canvasStack.push(clip);
}
return;
} else if (VISUAL_BRUSH.equals(localName)) {
// Also push visual brush transform onto stack as this will move children
String transform = getVal(TRANSFORM, atts);
if (transform == null) {
canvasStack.push(NULL_CLIP);
} else {
canvasStack.push(transform);
}
return;
} else if (PATH.equals(localName)) {
//for now just grab them and dump them at the end of the page.
String url = getVal(NAVIGATE_URI, atts);
Expand All @@ -165,7 +179,7 @@ public void startElement(String uri, String localName, String qName, Attributes
Float originX = null;
Float originY = null;
String unicodeString = null;
int bidilevel = 1;
Integer bidilevel = null;
List<GlyphIndex> indices = null;
float fontSize = 0;
String fontUri = null;
Expand Down Expand Up @@ -208,37 +222,37 @@ public void startElement(String uri, String localName, String qName, Attributes
if (unicodeString != null) {
originX = (originX == null) ? Integer.MIN_VALUE : originX;
originY = (originY == null) ? Integer.MAX_VALUE : originY;
String currentCanvasClip = (canvasStack.size() > 0) ? canvasStack.peek() : NULL_CLIP;
List<GlyphRun> runs = canvases.get(currentCanvasClip);
StringBuilder canvasStringBuilder = new StringBuilder();
for (String s : canvasStack) {
canvasStringBuilder.append(s);
canvasStringBuilder.append(';');
}
String canvasCombined = canvasStringBuilder.toString();
List<GlyphRun> runs = canvases.get(canvasCombined);
if (runs == null) {
runs = new ArrayList<>();
}
if (indices == null) {
indices = new ArrayList<>();
}
runs.add(new GlyphRun(name, originY, originX, unicodeString, bidilevel, indices, fontSize, fontUri));
canvases.put(currentCanvasClip, runs);
canvases.put(canvasCombined, runs);
}
}

// Parses a indices string into a list of GlyphIndex
private static List<GlyphIndex> parseIndicesString(String indicesString) throws SAXException {
private List<GlyphIndex> parseIndicesString(String indicesString) throws SAXException {
try {
ArrayList<GlyphIndex> indices = new ArrayList<>();
for (String indexString : indicesString.split(";", -1)) {
if (indexString.isEmpty()) {
indices.add(new GlyphIndex(0, 0.0f));
continue;
}
int commaIndex = indexString.indexOf(',');
if (commaIndex == -1) {
int glyphIndex = Integer.parseInt(indexString);
indices.add(new GlyphIndex(glyphIndex, 0.0f));
// We only want to extract the advance which will be the second comma separated value
String[] commaSplit = indexString.split(",", -1);
if (commaSplit.length < 2) {
indices.add(new GlyphIndex(0.0f));
} else {
int glyphIndex = 0;
if (commaIndex > 0) {
glyphIndex = Integer.parseInt(indexString.substring(0, commaIndex));
}
// Advance is measured in hundreths so divide by 100
float advance = Float.parseFloat(indexString.substring(commaIndex + 1)) / 100.0f;
indices.add(new GlyphIndex(glyphIndex, advance));
float advance = Float.parseFloat(commaSplit[1]) / 100.0f;
indices.add(new GlyphIndex(advance));
}
}
return indices;
Expand Down Expand Up @@ -322,6 +336,7 @@ private final void writePage() throws SAXException {
}

private void writeRow(List<GlyphRun> row) throws SAXException {
row = splitRow(row);
sortRow(row);

xhml.startElement(P);
Expand All @@ -340,6 +355,62 @@ private void writeRow(List<GlyphRun> row) throws SAXException {
xhml.endElement(P);
}

// Returns a new list of glyph runs in a row after splitting any runs with large advances into multiple runs
// This fixes issues where a single run has a large advance and text visually is placed in that gap so should be read in a different order
private static List<GlyphRun> splitRow(List<GlyphRun> row) {
List<GlyphRun> newRuns = new ArrayList<>();
for (int j = 0; j < row.size(); j++) {
GlyphRun run = row.get(j);
// TODO: Implement splitting for RTL too
if (run.direction != GlyphRun.DIRECTION.LTR) {
newRuns.add(run);
continue;
}
float width = 0f;
for (int i = 0; i < run.indices.size() - 1; i++) {
GlyphIndex index = run.indices.get(i);
if (index.advance == 0.0f) {
if (i == 0) {
// If this is the first glyph use hard coded estimate
width += ESTIMATE_GLYPH_WIDTH;
} else {
// If advance is 0.0 it is probably the last glyph in the run, we don't know how wide it is so we use the average of the previous widths as an estimate
width += width / i;
}
} else {
width += index.advance;
}
if (index.advance > SPLIT_THRESHOLD) {
newRuns.add(new GlyphRun(
run.name,
run.originY,
run.originX,
run.unicodeString.substring(0, i + 1),
null,
run.indices.subList(0, i + 1),
run.fontSize,
run.fontUri
));
run.indices.set(i, new GlyphIndex(0.0f));
run = new GlyphRun(
run.name,
run.originY,
run.originX + width * run.fontSize,
run.unicodeString.substring(i + 1, run.unicodeString.length()),
null,
run.indices.subList(i + 1, run.indices.size()),
run.fontSize,
run.fontUri
);
i = 0;
width = 0f;
}
}
newRuns.add(run);
}
return newRuns;
}

private static void sortRow(List<GlyphRun> row) {
boolean allRTL = true;
for (GlyphRun run : row) {
Expand Down Expand Up @@ -374,11 +445,10 @@ private List<List<GlyphRun>> buildRows(List<GlyphRun> glyphRuns) {
continue;
} else {
boolean addedNewRow = false;
//can rely on the last row having the highest y
List<GlyphRun> row = rows.get(rows.size() - 1);
GlyphRun lastRun = row.get(row.size() - 1);
float averageFontSize = (glyphRun.fontSize + lastRun.fontSize) / 2f;
if (Math.abs(glyphRun.originY - lastRun.originY) < averageFontSize * ROW_COMBINE_THRESHOLD) {
List<GlyphRun> row = findClosestRowVertically(rows, glyphRun.originY);
GlyphRun firstRun = row.get(0);
float averageFontSize = (glyphRun.fontSize + firstRun.fontSize) / 2f;
if (Math.abs(glyphRun.originY - firstRun.originY) < averageFontSize * ROW_COMBINE_THRESHOLD) {
row.add(glyphRun);
} else {
row = new ArrayList<>();
Expand All @@ -400,6 +470,30 @@ private List<List<GlyphRun>> buildRows(List<GlyphRun> glyphRuns) {
return rows;
}

// Search to find the closest row vertically to the y, if rows is empty returns null
private List<GlyphRun> findClosestRowVertically(List<List<GlyphRun>> rows, float y) {
List<GlyphRun> best = null;
float bestDistance = Float.POSITIVE_INFINITY;
// Loop backwards since normally XPS files are in order so we will match the last element
// TODO: This could be optimised using a binary search since we know rows is sorted
for (int i = rows.size() - 1; i >= 0; i--) {
List<GlyphRun> row = rows.get(i);
if (row.size() == 0) {
continue;
}
float distance = Math.abs(row.get(row.size() - 1).originY - y);
// There is nothing better than 0
if (distance == 0f) {
return row;
}
if (distance < bestDistance) {
best = row;
bestDistance = distance;
}
}
return best;
}

final static class GlyphRun {

//TODO: use name in conjunction with Frag information
Expand Down Expand Up @@ -475,18 +569,21 @@ private float width() {
}

final static class GlyphIndex {
// The index of the glyph in the font
private final int index;
// TODO: Parse other elements of GlyphIndex

// private int index;
// private int clusterCodeUnitCount;
// private int clusterGlyphCount;

// The placement of the glyph that follows relative to the origin of the current glyph. Measured as a multiple of the fonts em-size.
// Should be multiplied by the font em-size to get a value that can be compared across GlyphRuns
// Will be zero for the last glpyh in a glyph run
private final float advance;
// private float uOffset;
// private float vOffset;

private GlyphIndex(int index, float advance) {
this.index = index;
private GlyphIndex(float advance) {
this.advance = advance;
}

}

}

0 comments on commit d32891d

Please sign in to comment.