From d0da536443bea800e69e8df798ac52b650d0c42c Mon Sep 17 00:00:00 2001 From: Bert Frees Date: Wed, 11 Apr 2018 11:07:35 +0200 Subject: [PATCH] Extract generic chunker step from html-chunker The generic part is just a dumb, HTML unaware chunker that chunks based on a (HTML specific) style sheet and preserved the structure of the input. The generic step is followed by a "finalize" step that cleans up the structure of the resulting chunks. See https://github.com/daisy/pipeline-scripts/issues/123 --- .../src/main/resources/xml/xproc/chunker.xpl | 47 +++++++ .../main/resources/xml/xproc/html-chunker.xpl | 48 +++++-- .../src/main/resources/xml/xslt/chunker.xsl | 116 +++++++++++++++++ .../xml/xslt/html-chunker-break-points.xsl | 22 ++++ .../xml/xslt/html-chunker-finalize.xsl | 46 +++++++ .../main/resources/xml/xslt/html-chunker.xsl | 120 ------------------ 6 files changed, 266 insertions(+), 133 deletions(-) create mode 100644 html-utils/src/main/resources/xml/xproc/chunker.xpl create mode 100644 html-utils/src/main/resources/xml/xslt/chunker.xsl create mode 100755 html-utils/src/main/resources/xml/xslt/html-chunker-break-points.xsl create mode 100755 html-utils/src/main/resources/xml/xslt/html-chunker-finalize.xsl delete mode 100755 html-utils/src/main/resources/xml/xslt/html-chunker.xsl diff --git a/html-utils/src/main/resources/xml/xproc/chunker.xpl b/html-utils/src/main/resources/xml/xproc/chunker.xpl new file mode 100644 index 0000000..9cb06b0 --- /dev/null +++ b/html-utils/src/main/resources/xml/xproc/chunker.xpl @@ -0,0 +1,47 @@ + + + + +

Break a document into smaller parts.

+
+ + + + +

An XSLT style sheet that specifies the break + points. For each node that should be put in its own chunk, the style sheet must + contain a template in the `is-chunk` mode that matches this node and returns + `true()`.

+
+
+ + + + + + + + + + + + + + + + + + + + + + + + +
diff --git a/html-utils/src/main/resources/xml/xproc/html-chunker.xpl b/html-utils/src/main/resources/xml/xproc/html-chunker.xpl index 679e134..a274e6f 100644 --- a/html-utils/src/main/resources/xml/xproc/html-chunker.xpl +++ b/html-utils/src/main/resources/xml/xproc/html-chunker.xpl @@ -2,22 +2,44 @@ + version="1.0" + name="main"> + + +

Break an HTML document into smaller parts based on + its structure.

+
- - - + + + + + + + + + + + + + - - - - - - - - - + + + + + + + + + + + + + +
diff --git a/html-utils/src/main/resources/xml/xslt/chunker.xsl b/html-utils/src/main/resources/xml/xslt/chunker.xsl new file mode 100644 index 0000000..6920160 --- /dev/null +++ b/html-utils/src/main/resources/xml/xslt/chunker.xsl @@ -0,0 +1,116 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Unable to resolve link to '' + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/html-utils/src/main/resources/xml/xslt/html-chunker-break-points.xsl b/html-utils/src/main/resources/xml/xslt/html-chunker-break-points.xsl new file mode 100755 index 0000000..409925f --- /dev/null +++ b/html-utils/src/main/resources/xml/xslt/html-chunker-break-points.xsl @@ -0,0 +1,22 @@ + + + + + + + + + + + + + + diff --git a/html-utils/src/main/resources/xml/xslt/html-chunker-finalize.xsl b/html-utils/src/main/resources/xml/xslt/html-chunker-finalize.xsl new file mode 100755 index 0000000..47cbe89 --- /dev/null +++ b/html-utils/src/main/resources/xml/xslt/html-chunker-finalize.xsl @@ -0,0 +1,46 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + <xsl:apply-templates select="@*"/> + <xsl:value-of select="($title,string(.))[1]"/> + + + + diff --git a/html-utils/src/main/resources/xml/xslt/html-chunker.xsl b/html-utils/src/main/resources/xml/xslt/html-chunker.xsl deleted file mode 100755 index 425d6bd..0000000 --- a/html-utils/src/main/resources/xml/xslt/html-chunker.xsl +++ /dev/null @@ -1,120 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - <xsl:apply-templates select="@*"/> - <xsl:sequence select="$title"/> - - - - - - - - - Unable to resolve link to '' - - - - - - - - - - - - - - - - - -
- -
-
-
-
-
- - - - - - - - - - -
- -
-
-
- -
- -
-
-
-
-
- - -