Skip to content

Commit

Permalink
[TIKA-4315] Fix XPS whitespace not being emitted (#1970)
Browse files Browse the repository at this point in the history
* TIKA-4315: Fix xps whitespace

* Calc XPS whitespace

(cherry picked from commit e137d04)
  • Loading branch information
ruwi-next authored and tballison committed Oct 24, 2024
1 parent 6bc7f2a commit f6fe845
Show file tree
Hide file tree
Showing 4 changed files with 183 additions and 45 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ class XPSPageContentHandler extends DefaultHandler {
private static final String BIDI_LEVEL = "BidiLevel";
private static final String INDICES = "Indices";
private static final String NAME = "Name";
private static final String FONT_RENDERING_EM_SIZE = "FontRenderingEmSize";
private static final String FONT_URI = "FontUri";
private static final String PATH = "Path";
private static final String NAVIGATE_URI = "FixedPage.NavigateUri";
private static final String IMAGE_SOURCE = "ImageSource";
Expand All @@ -73,6 +75,18 @@ class XPSPageContentHandler extends DefaultHandler {
private static final String P = "p";
private static final String HREF = "href";
private static final String A = "a";

// Single space character written between glyph runs when a horizontal gap is detected
private static final char[] SPACE = new char[]{' '};

// Estimated width of a glyph when better information is not available, measured in em
private static final float ESTIMATE_GLYPH_WIDTH = 0.5f;

// Minimum horizontal distance between two glyph runs for a whitespace to be inserted
// between them, measured in em
private static final float WHITESPACE_THRESHOLD = 0.3f;

// Maximum vertical distance between two glyph runs for them to be considered part of
// the same row, measured in em
private static final float ROW_COMBINE_THRESHOLD = 0.5f;

//sort based on y coordinate of first element in each row
//this requires every row to have at least one element
private static Comparator<? super List<GlyphRun>> ROW_SORTER =
Expand All @@ -84,6 +98,18 @@ class XPSPageContentHandler extends DefaultHandler {
}
return 0;
};
//orders glyph runs by increasing left edge, i.e. left-to-right reading order
private static Comparator<GlyphRun> LTR_SORTER =
        (first, second) -> Float.compare(first.left(), second.left());
//orders glyph runs by decreasing left edge, i.e. right-to-left reading order
private static Comparator<GlyphRun> RTL_SORTER =
        (first, second) -> Float.compare(second.left(), first.left());
private final XHTMLContentHandler xhml;
private final Map<String, Metadata> embeddedInfos;
//path in zip file for an image rendered on this page
Expand All @@ -93,7 +119,7 @@ class XPSPageContentHandler extends DefaultHandler {
//buffer for the glyph runs within a given canvas
//in insertion order
private Map<String, List<GlyphRun>> canvases = new LinkedHashMap<>();
private Set<String> urls = new LinkedHashSet();
private Set<String> urls = new LinkedHashSet<String>();
private Stack<String> canvasStack = new Stack<>();

public XPSPageContentHandler(XHTMLContentHandler xhtml, Map<String, Metadata> embeddedInfos) {
Expand Down Expand Up @@ -140,7 +166,9 @@ public void startElement(String uri, String localName, String qName, Attributes
Float originY = null;
String unicodeString = null;
int bidilevel = 1;
String indicesString = null;
List<GlyphIndex> indices = null;
float fontSize = 0;
String fontUri = null;

for (int i = 0; i < atts.getLength(); i++) {
String lName = atts.getLocalName(i);
Expand All @@ -149,28 +177,32 @@ public void startElement(String uri, String localName, String qName, Attributes

if (ORIGIN_X.equals(lName) && value.length() > 0) {
try {
originX = Float.parseFloat(atts.getValue(i));
originX = Float.parseFloat(value);
} catch (NumberFormatException e) {
throw new SAXException(e);
}
} else if (ORIGIN_Y.equals(lName) && value.length() > 0) {
try {
originY = Float.parseFloat(atts.getValue(i));
originY = Float.parseFloat(value);
} catch (NumberFormatException e) {
throw new SAXException(e);
}
} else if (UNICODE_STRING.equals(lName)) {
unicodeString = atts.getValue(i);
} else if (BIDI_LEVEL.equals(lName) && value.length() > 0) {
try {
bidilevel = Integer.parseInt(atts.getValue(i));
bidilevel = Integer.parseInt(value);
} catch (NumberFormatException e) {
throw new SAXException(e);
}
} else if (INDICES.equals(lName)) {
indicesString = atts.getValue(i);
indices = parseIndicesString(value);
} else if (NAME.equals(lName)) {
name = value;
} else if (FONT_RENDERING_EM_SIZE.equals(lName)) {
fontSize = Float.parseFloat(value);
} else if (FONT_URI.equals(lName)) {
fontUri = value;
}
}
if (unicodeString != null) {
Expand All @@ -181,10 +213,38 @@ public void startElement(String uri, String localName, String qName, Attributes
if (runs == null) {
runs = new ArrayList<>();
}
runs.add(new GlyphRun(name, originY, originX, unicodeString, bidilevel, indicesString));
runs.add(new GlyphRun(name, originY, originX, unicodeString, bidilevel, indices, fontSize, fontUri));
canvases.put(currentCanvasClip, runs);
}
}

/**
 * Parses the value of a Glyphs {@code Indices} attribute into a list of {@link GlyphIndex}.
 * <p>
 * Entries are separated by {@code ';'}. Each entry has the form
 * {@code [(ClusterMapping)]GlyphIndex[,AdvanceWidth[,uOffset[,vOffset]]]} and any of the
 * components may be left empty. Only the glyph index and the advance width are kept;
 * a component that is absent or empty is recorded as 0.
 *
 * @param indicesString the raw attribute value
 * @return one {@link GlyphIndex} per {@code ';'}-separated entry, in the order they appear
 * @throws SAXException if a glyph index or an advance width is present but is not a valid number
 */
private static List<GlyphIndex> parseIndicesString(String indicesString) throws SAXException {
    List<GlyphIndex> indices = new ArrayList<>();
    try {
        for (String entry : indicesString.split(";", -1)) {
            String spec = entry.trim();
            // Drop an optional leading cluster mapping such as "(2:1)"; it does not
            // contribute to the glyph index or the advance.
            if (spec.startsWith("(")) {
                int close = spec.indexOf(')');
                if (close >= 0) {
                    spec = spec.substring(close + 1);
                }
            }
            // Fields are: glyph index, advance width, uOffset, vOffset.
            // The offsets (if any) are ignored.
            String[] fields = spec.split(",", -1);
            String glyphField = fields[0].trim();
            String advanceField = fields.length > 1 ? fields[1].trim() : "";

            int glyphIndex = glyphField.isEmpty() ? 0 : Integer.parseInt(glyphField);
            // Advance is measured in hundredths of an em, so divide by 100
            float advance =
                    advanceField.isEmpty() ? 0.0f : Float.parseFloat(advanceField) / 100.0f;
            indices.add(new GlyphIndex(glyphIndex, advance));
        }
    } catch (NumberFormatException e) {
        throw new SAXException(e);
    }
    return indices;
}

@Override
Expand Down Expand Up @@ -234,7 +294,6 @@ private final void writePage() throws SAXException {
}

for (Map.Entry<String, List<GlyphRun>> e : canvases.entrySet()) {
String clip = e.getKey();
List<GlyphRun> runs = e.getValue();
if (runs.size() == 0) {
continue;
Expand Down Expand Up @@ -263,36 +322,45 @@ private final void writePage() throws SAXException {
}

/**
 * Writes a single row of glyph runs as one paragraph element.
 * <p>
 * The row is first put into reading order by {@code sortRow}. A single space is then
 * written between two consecutive runs whenever the horizontal gap between them exceeds
 * {@code WHITESPACE_THRESHOLD} times the average font size of the two runs.
 *
 * @param row the glyph runs that make up the row; sorted in place
 * @throws SAXException if the underlying handler fails
 */
private void writeRow(List<GlyphRun> row) throws SAXException {
    sortRow(row);

    xhml.startElement(P);
    GlyphRun previous = null;
    for (GlyphRun run : row) {
        if (previous != null) {
            // The row may be sorted left-to-right or right-to-left, so the previous run
            // can sit on either side of the current one. Take whichever signed distance
            // is larger: for non-overlapping runs that is the actual gap, and for a
            // left-to-right row with non-negative run widths it leads to the same
            // decision as run.left() - previous.right().
            float gap = Math.max(run.left() - previous.right(),
                    previous.left() - run.right());
            float averageFontSize = (run.fontSize + previous.fontSize) / 2f;
            if (gap > averageFontSize * WHITESPACE_THRESHOLD) {
                xhml.ignorableWhitespace(SPACE, 0, SPACE.length);
            }
        }
        xhml.characters(run.unicodeString);
        previous = run;
    }
    xhml.endElement(P);
}

/**
 * Sorts the runs of a row in place into reading order.
 * <p>
 * Runs that contain only whitespace are ignored when deciding the direction. If no
 * remaining run is LTR the row is sorted with RTL_SORTER, otherwise with LTR_SORTER.
 *
 * @param row the glyph runs of one row
 */
private static void sortRow(List<GlyphRun> row) {
    // Start from the assumption that the whole row is RTL
    Comparator<GlyphRun> order = RTL_SORTER;
    for (GlyphRun candidate : row) {
        boolean blank = candidate.unicodeString.trim().isEmpty();
        if (!blank && candidate.direction == GlyphRun.DIRECTION.LTR) {
            // A single non-blank LTR run is enough to sort the row left to right
            order = LTR_SORTER;
            break;
        }
    }
    row.sort(order);
}

//returns a List of rows (where a row is a list of glyphruns)
//the List is sorted in increasing order of the first y of each row
private List<List<GlyphRun>> buildRows(List<GlyphRun> glyphRuns) {
Expand All @@ -308,9 +376,9 @@ private List<List<GlyphRun>> buildRows(List<GlyphRun> glyphRuns) {
boolean addedNewRow = false;
//can rely on the last row having the highest y
List<GlyphRun> row = rows.get(rows.size() - 1);
//0.5 is a purely heuristic/magical number that should be derived
//from the data, not made up. TODO: fix this
if (Math.abs(glyphRun.originY - row.get(0).originY) < 0.5) {
GlyphRun lastRun = row.get(row.size() - 1);
float averageFontSize = (glyphRun.fontSize + lastRun.fontSize) / 2f;
if (Math.abs(glyphRun.originY - lastRun.originY) < averageFontSize * ROW_COMBINE_THRESHOLD) {
row.add(glyphRun);
} else {
row = new ArrayList<>();
Expand Down Expand Up @@ -339,19 +407,23 @@ final static class GlyphRun {
private final String name;
private final float originY;
private final float originX;
//not currently used, but could be used for bidi text calculations
private final String unicodeString;
private final String indicesString;
//not used yet
private final List<GlyphIndex> indices;
private final DIRECTION direction;
//not currently used, but could be used for width calculations
// The font's em-size, taken from the FontRenderingEmSize attribute
private final float fontSize;
// Not used currently
private final String fontUri;

private GlyphRun(String name, float originY, float originX, String unicodeString,
Integer bidiLevel, String indicesString) {
Integer bidiLevel, List<GlyphIndex> indices, float fontSize, String fontUri) {
this.name = name;
this.unicodeString = unicodeString;
this.originY = originY;
this.originX = originX;
this.fontSize = fontSize;
this.fontUri = fontUri;
this.indices = indices;
if (bidiLevel == null) {
direction = DIRECTION.LTR;
} else {
Expand All @@ -361,12 +433,60 @@ private GlyphRun(String name, float originY, float originX, String unicodeString
direction = DIRECTION.RTL;
}
}
this.indicesString = indicesString;
}

private enum DIRECTION {
LTR, RTL
}

/**
 * @return the x coordinate of the left edge of this run
 */
private float left() {
    // An LTR run starts at its origin; an RTL run extends to the left of its origin
    return direction == DIRECTION.LTR ? originX : originX - width();
}

/**
 * @return the x coordinate of the right edge of this run
 */
private float right() {
    // An LTR run extends to the right of its origin; an RTL run ends at its origin
    return direction == DIRECTION.LTR ? originX + width() : originX;
}

/**
 * Estimates the width of this run.
 * <p>
 * The width is the sum of the glyph advances (in em) multiplied by the font size.
 * Glyphs without an advance are estimated: the first glyph with
 * {@code ESTIMATE_GLYPH_WIDTH}, later ones with the average width accumulated so far.
 * If the run carries no glyph indices at all, every character of the text is estimated
 * with {@code ESTIMATE_GLYPH_WIDTH}.
 *
 * @return the estimated width of the run
 */
private float width() {
    if (indices == null || indices.isEmpty()) {
        // No per-glyph information is available (e.g. the Indices attribute was absent),
        // so fall back to the hard coded estimate for each character.
        int glyphCount = unicodeString == null ? 0 : unicodeString.length();
        return glyphCount * ESTIMATE_GLYPH_WIDTH * fontSize;
    }
    float width = 0.0f;
    for (int i = 0; i < indices.size(); i++) {
        float advance = indices.get(i).advance;
        if (advance == 0.0) {
            if (i == 0) {
                // If this is the first glyph use hard coded estimate
                width += ESTIMATE_GLYPH_WIDTH;
            } else {
                // If advance is 0.0 it is probably the last glyph in the run. We don't
                // know how wide it is, so use the average of the previous widths.
                width += width / i;
            }
        } else {
            width += advance;
        }
    }
    return width * fontSize;
}
}

/**
 * Immutable pair of a glyph's index in its font and the advance to the next glyph.
 */
static final class GlyphIndex {
    /** The index of the glyph in the font. */
    private final int index;
    /**
     * The placement of the glyph that follows, relative to the origin of this glyph,
     * measured as a multiple of the font's em-size. Multiply by the font's em-size to
     * get a value that can be compared across GlyphRuns. Zero when no advance is known,
     * which is typically the case for the last glyph in a glyph run.
     */
    private final float advance;

    private GlyphIndex(int glyphIndex, float glyphAdvance) {
        index = glyphIndex;
        advance = glyphAdvance;
    }
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,7 @@ public void testBasic() throws Exception {
assertContains("<p>Attachment Test</p>", content);
assertContains("<div class=\"canvas\"><p>Different", content);

//I'd want this to be "tika content", but copy+paste in Windows yields tikacontent
assertContains("tikacontent", content);
assertContains("tika content", content);


assertEquals("image/jpeg", metadataList.get(1).get(Metadata.CONTENT_TYPE));
Expand Down Expand Up @@ -104,14 +103,14 @@ public void testXPSWithDataDescriptor() throws Exception {
XPSParserTest.class.getResource("/test-documents/testXPSWithDataDescriptor.xps")
.toURI());
//test both path and stream based
List<Metadata> metadataList = getRecursiveMetadata(path, true);
List<Metadata> metadataList = getRecursiveMetadata(path);
assertEquals(2, metadataList.size());
assertContains("This is my XPS document test",
metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));

ByteArrayOutputStream bos = new ByteArrayOutputStream();
Files.copy(path, bos);
metadataList = getRecursiveMetadata(new ByteArrayInputStream(bos.toByteArray()), true);
metadataList = getRecursiveMetadata(new ByteArrayInputStream(bos.toByteArray()), false);
assertEquals(2, metadataList.size());
assertContains("This is my XPS document test",
metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
Expand All @@ -125,17 +124,36 @@ public void testOpenXPSWithDataDescriptor() throws Exception {
Path path = Paths.get(
XPSParserTest.class.getResource("/test-documents/testXPSWithDataDescriptor2.xps")
.toURI());
List<Metadata> metadataList = getRecursiveMetadata(path, true);
List<Metadata> metadataList = getRecursiveMetadata(path);
assertEquals(2, metadataList.size());
assertContains("How was I supposed to know",
metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));

ByteArrayOutputStream bos = new ByteArrayOutputStream();
Files.copy(path, bos);
metadataList = getRecursiveMetadata(new ByteArrayInputStream(bos.toByteArray()), true);
metadataList = getRecursiveMetadata(new ByteArrayInputStream(bos.toByteArray()), false);
assertEquals(2, metadataList.size());
assertContains("How was I supposed to know",
metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
}

// Checks that whitespace between spreadsheet cell contents is emitted
@Test
public void testSpreadsheetXPS() throws Exception {
    Path xpsFile =
            Paths.get(XPSParserTest.class.getResource("/test-documents/testXLSX.xps").toURI());
    List<Metadata> extracted = getRecursiveMetadata(xpsFile);
    String content = extracted.get(0).get(TikaCoreProperties.TIKA_CONTENT);
    String[] expectedPhrases = {"abcd efg", "foo bar baz", "spaced out"};
    for (String phrase : expectedPhrases) {
        assertContains(phrase, content);
    }
}

// Checks that words and spacing survive extraction from a text document
@Test
public void testTextDocumentXPS() throws Exception {
    Path xpsFile =
            Paths.get(XPSParserTest.class.getResource("/test-documents/test_text.xps").toURI());
    List<Metadata> extracted = getRecursiveMetadata(xpsFile);
    String content = extracted.get(0).get(TikaCoreProperties.TIKA_CONTENT);
    String[] expectedPhrases =
            {"Rainbow", "Large font size", "Parts of this are in italics and bold."};
    for (String phrase : expectedPhrases) {
        assertContains(phrase, content);
    }
}
}
Binary file not shown.
Binary file not shown.

0 comments on commit f6fe845

Please sign in to comment.