-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Extract generic chunker step from html-chunker
The generic part is just a dumb, HTML-unaware chunker that chunks based on an (HTML-specific) style sheet and preserves the structure of the input. The generic step is followed by a "finalize" step that cleans up the structure of the resulting chunks. See daisy/pipeline-scripts#123
- Loading branch information
Showing
6 changed files
with
266 additions
and
133 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
<?xml version="1.0" encoding="UTF-8"?> | ||
<p:declare-step type="px:chunker" | ||
xmlns:p="http://www.w3.org/ns/xproc" | ||
xmlns:px="http://www.daisy.org/ns/pipeline/xproc" | ||
xmlns:xsl="http://www.w3.org/1999/XSL/Transform" | ||
exclude-inline-prefixes="#all" | ||
version="1.0" | ||
name="main"> | ||
|
||
<p:documentation> | ||
<p xmlns="http://www.w3.org/1999/xhtml">Break a document into smaller parts.</p> | ||
</p:documentation> | ||
|
||
<p:input port="source"/> | ||
<p:option name="stylesheet" required="true"> | ||
<p:documentation> | ||
<p xmlns="http://www.w3.org/1999/xhtml">An XSLT style sheet that specifies the break | ||
points. For each node that should be put in its own chunk, the style sheet must | ||
contain a template in the `is-chunk` mode that matches this node and returns | ||
`true()`.</p> | ||
</p:documentation> | ||
</p:option> | ||
<!-- The result sequence is taken from the secondary port of the "xslt" step, i.e. the documents the compiled style sheet writes with xsl:result-document. --> | ||
<p:output port="result" sequence="true"> | ||
<p:pipe step="xslt" port="secondary"/> | ||
</p:output> | ||
|
||
<!-- "Compile" the generic chunker: replace the `$stylesheet` placeholder in the xsl:include of chunker.xsl with the URI passed in the `stylesheet` option. | ||
     The `replace` option is itself an XPath expression, so the URI is wrapped in double quotes to form a string literal. The quotes must be written as | ||
     &quot; entities because the attribute value is delimited by double quotes. --> | ||
<p:string-replace match="/xsl:stylesheet/xsl:include/@href[.='$stylesheet']" name="compile"> | ||
    <p:input port="source"> | ||
        <p:document href="../xslt/chunker.xsl"/> | ||
    </p:input> | ||
    <p:with-option name="replace" select="concat('&quot;',$stylesheet,'&quot;')"/> | ||
</p:string-replace> | ||
|
||
<!-- Run the compiled chunker style sheet (output of the "compile" step) on the source document of this step. No parameters are passed. --> | ||
<p:xslt name="xslt"> | ||
<p:input port="source"> | ||
<p:pipe step="main" port="source"/> | ||
</p:input> | ||
<p:input port="stylesheet"> | ||
<p:pipe step="compile" port="result"/> | ||
</p:input> | ||
<p:input port="parameters"> | ||
<p:empty/> | ||
</p:input> | ||
</p:xslt> | ||
<!-- Discard the primary output of the transformation; only its secondary port is connected to the "result" port of this step. --> | ||
<p:sink/> | ||
|
||
</p:declare-step> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
<?xml version="1.0" encoding="utf-8"?> | ||
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" | ||
xmlns:xs="http://www.w3.org/2001/XMLSchema" | ||
xmlns:f="http://www.daisy.org/ns/pipeline/internal-functions" | ||
version="2.0" | ||
exclude-result-prefixes="#all"> | ||
|
||
<!-- Placeholder: the literal href `$stylesheet` is what the px:chunker step matches and replaces with the URI of the style sheet that defines the `is-chunk` templates. --> | ||
<xsl:include href="$stylesheet"/> | ||
|
||
<xsl:output method="xhtml" indent="yes"/> | ||
|
||
<!-- Root of the input document, kept in a global so it stays reachable while the context item is a chunk. --> | ||
<xsl:variable name="doc" select="/"/> | ||
|
||
<!-- Index of elements by their id or xml:id attribute; used to find the chunk that contains a link target. --> | ||
<xsl:key name="ids" match="*" use="@id|@xml:id"/> | ||
|
||
<!-- All chunks, as a sequence of document nodes, produced by applying the `chunking` mode to the root element. --> | ||
<xsl:variable name="chunks" as="document-node()*"> | ||
<xsl:apply-templates select="/*" mode="chunking"/> | ||
</xsl:variable> | ||
|
||
<!-- generate-id() of every chunk, in the same order as $chunks; f:chunk-name looks up a chunk's position in this list. --> | ||
<xsl:variable name="chunks-ids" as="xs:string*" select="$chunks/generate-id()"/> | ||
|
||
<!-- Builds the file name of a chunk from the base URI of the input's root element: everything up to the last path segment is dropped | ||
     and "-N" is inserted before the extension, where N is the 1-based position of the chunk in $chunks. | ||
     NOTE(review): if the base URI has no extension the regex does not match and the URI is returned unchanged; confirm this cannot occur. --> | ||
<xsl:function name="f:chunk-name"> | ||
    <xsl:param name="chunk" as="document-node()"/> | ||
    <xsl:variable name="chunk-position" select="index-of($chunks-ids,generate-id($chunk))"/> | ||
    <xsl:variable name="source-uri" select="base-uri($doc/*)"/> | ||
    <xsl:sequence select="replace($source-uri,'.*?([^/]+)(\.[^.]+)$',concat('$1-',$chunk-position,'$2'))"/> | ||
</xsl:function> | ||
|
||
<!-- Entry point: writes every chunk to its own result document. The target URI is the base URI of the input's root element | ||
     with its last path segment replaced by the chunk's file name. The chunk content is processed in the default mode, | ||
     which rewrites fragment links and copies everything else. --> | ||
<xsl:template match="/"> | ||
    <xsl:for-each select="$chunks"> | ||
        <xsl:variable name="chunk-uri" select="replace(base-uri($doc/*),'([^/]+)$',f:chunk-name(.))"/> | ||
        <xsl:result-document href="{$chunk-uri}"> | ||
            <xsl:apply-templates select="/*"/> | ||
        </xsl:result-document> | ||
    </xsl:for-each> | ||
</xsl:template> | ||
|
||
<!-- Rewrites a same-document link ("#id") so that it still resolves after chunking. | ||
     If the target is in the current chunk, or cannot be found at all, the href is left unchanged (a message is emitted in the latter case). | ||
     Otherwise the file name of the chunk that contains the target is prepended. | ||
     Chunks are compared by node identity (`is`), not with `=`, which would compare the string values of the two documents | ||
     and treat distinct chunks with identical text as the same chunk. | ||
     If the id occurs in more than one chunk, the current chunk is preferred, then the first match, | ||
     so that f:chunk-name always receives exactly one document node. --> | ||
<xsl:template match="@href[starts-with(.,'#')]"> | ||
    <xsl:variable name="refid" select="substring(.,2)" as="xs:string"/> | ||
    <xsl:variable name="this-chunk" as="document-node()" select="/"/> | ||
    <xsl:variable name="candidates" as="document-node()*" select="$chunks[key('ids',$refid,.)]"/> | ||
    <xsl:variable name="refchunk" as="document-node()?" select="($candidates[. is $this-chunk],$candidates)[1]"/> | ||
    <xsl:if test="empty($refchunk)"> | ||
        <xsl:message>Unable to resolve link to '<xsl:value-of select="."/>'</xsl:message> | ||
    </xsl:if> | ||
    <xsl:attribute name="href" | ||
                   select="if (empty($refchunk) or $refchunk is $this-chunk) then . | ||
                           else concat(f:chunk-name($refchunk),.)" | ||
    /> | ||
</xsl:template> | ||
|
||
<!-- Identity template: copies every attribute and node that no more specific template handles. --> | ||
<xsl:template match="@*|node()"> | ||
<xsl:copy> | ||
<xsl:apply-templates select="@*|node()"/> | ||
</xsl:copy> | ||
</xsl:template> | ||
|
||
<!-- Splits the content of an element into chunks. | ||
     The element children and non-whitespace text children are grouped into adjacent runs according to whether they are, or contain, a node for which f:is-chunk is true. | ||
     - In a run with chunk nodes, each child that is itself a chunk becomes one chunk, wrapped in copies of its ancestors; each other child is processed recursively in this mode. | ||
     - A run without chunk nodes becomes a single chunk, wrapped in copies of its ancestors. | ||
     Whitespace-only text nodes, comments and processing instructions are not selected and therefore do not end up in any chunk. --> | ||
<xsl:template mode="chunking" match="*"> | ||
<xsl:variable name="this" select="."/> | ||
<xsl:for-each-group select="text()[normalize-space()]|*" | ||
group-adjacent="exists(descendant-or-self::*[f:is-chunk(.)])"> | ||
<xsl:choose> | ||
<xsl:when test="current-grouping-key()"> | ||
<xsl:for-each select="current-group()"> | ||
<xsl:choose> | ||
<xsl:when test="f:is-chunk(.)"> | ||
<xsl:call-template name="copy-ancestors"> | ||
<xsl:with-param name="nodes" select="."/> | ||
<xsl:with-param name="original-parent" select="$this"/> | ||
</xsl:call-template> | ||
</xsl:when> | ||
<xsl:otherwise> | ||
<!-- Not a chunk itself but contains one: descend. --> | ||
<xsl:apply-templates mode="#current" select="."/> | ||
</xsl:otherwise> | ||
</xsl:choose> | ||
</xsl:for-each> | ||
</xsl:when> | ||
<xsl:otherwise> | ||
<xsl:call-template name="copy-ancestors"> | ||
<xsl:with-param name="nodes" select="current-group()"/> | ||
<xsl:with-param name="original-parent" select="$this"/> | ||
</xsl:call-template> | ||
</xsl:otherwise> | ||
</xsl:choose> | ||
</xsl:for-each-group> | ||
</xsl:template> | ||
|
||
<!-- Wraps $nodes in a shallow copy (same local name, namespace and attributes) of $original-parent, then repeats this | ||
     for every further element ancestor. The result is a document node that holds $nodes inside the same element hierarchy | ||
     they had in the input. --> | ||
<xsl:template name="copy-ancestors"> | ||
    <xsl:param name="nodes" as="node()*" required="yes"/> | ||
    <xsl:param name="original-parent" as="element()" required="yes"/> | ||
    <xsl:variable name="next-ancestor" as="element()?" select="$original-parent/parent::*"/> | ||
    <xsl:variable name="shell"> | ||
        <xsl:element name="{local-name($original-parent)}" namespace="{namespace-uri($original-parent)}"> | ||
            <xsl:sequence select="$original-parent/@*"/> | ||
            <xsl:sequence select="$nodes"/> | ||
        </xsl:element> | ||
    </xsl:variable> | ||
    <xsl:choose> | ||
        <!-- Reached the root element: the wrapped tree is complete. --> | ||
        <xsl:when test="empty($next-ancestor)"> | ||
            <xsl:sequence select="$shell"/> | ||
        </xsl:when> | ||
        <xsl:otherwise> | ||
            <xsl:call-template name="copy-ancestors"> | ||
                <xsl:with-param name="nodes" select="$shell"/> | ||
                <xsl:with-param name="original-parent" select="$next-ancestor"/> | ||
            </xsl:call-template> | ||
        </xsl:otherwise> | ||
    </xsl:choose> | ||
</xsl:template> | ||
|
||
<!-- Returns whether $node must be put in its own chunk, by applying the templates of the `is-chunk` mode to it. --> | ||
<xsl:function name="f:is-chunk" as="xs:boolean"> | ||
<xsl:param name="node" as="node()"/> | ||
<xsl:apply-templates mode="is-chunk" select="$node"/> | ||
</xsl:function> | ||
|
||
<!-- Fallback for the `is-chunk` mode: a node is not a chunk unless a template with a higher priority says so. --> | ||
<xsl:template mode="is-chunk" priority="-100" match="node()" as="xs:boolean"> | ||
<xsl:sequence select="false()"/> | ||
</xsl:template> | ||
|
||
</xsl:stylesheet> |
22 changes: 22 additions & 0 deletions
22
html-utils/src/main/resources/xml/xslt/html-chunker-break-points.xsl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
<?xml version="1.0" encoding="utf-8"?> | ||
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" | ||
xmlns:xs="http://www.w3.org/2001/XMLSchema" | ||
xmlns="http://www.w3.org/1999/xhtml" | ||
xmlns:epub="http://www.idpf.org/2007/ops" | ||
xpath-default-namespace="http://www.w3.org/1999/xhtml" | ||
version="2.0"> | ||
|
||
<!-- Break points for HTML: every section directly under body is a chunk, except a `bodymatter` section that has child sections; | ||
     for such a section each of its child sections is a chunk instead. --> | ||
<xsl:template mode="is-chunk" | ||
match="/html/body/section[not(epub:has-type(.,'bodymatter') and child::section)]| | ||
/html/body/section[epub:has-type(.,'bodymatter')]/section" | ||
as="xs:boolean"> | ||
<xsl:sequence select="true()"/> | ||
</xsl:template> | ||
|
||
<!-- Returns true if $type is one of the whitespace-separated tokens of the epub:type attribute of $element, | ||
     and false otherwise (including when the attribute is absent). --> | ||
<xsl:function name="epub:has-type" as="xs:boolean"> | ||
    <xsl:param name="element" as="element()"/> | ||
    <xsl:param name="type" as="xs:string"/> | ||
    <xsl:sequence select="some $token in tokenize($element/@epub:type,'\s+') satisfies $token eq $type"/> | ||
</xsl:function> | ||
|
||
</xsl:stylesheet> |
46 changes: 46 additions & 0 deletions
46
html-utils/src/main/resources/xml/xslt/html-chunker-finalize.xsl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
<?xml version="1.0" encoding="utf-8"?> | ||
<xsl:stylesheet xmlns="http://www.w3.org/1999/xhtml" | ||
xmlns:xsl="http://www.w3.org/1999/XSL/Transform" | ||
xmlns:xs="http://www.w3.org/2001/XMLSchema" | ||
xmlns:epub="http://www.idpf.org/2007/ops" | ||
xmlns:tts="http://www.daisy.org/ns/pipeline/tts" | ||
xpath-default-namespace="http://www.w3.org/1999/xhtml" | ||
exclude-result-prefixes="#all" | ||
version="2.0"> | ||
|
||
<!-- Second document of the default collection. NOTE(review): presumably the original, un-chunked HTML document; confirm against the calling step. --> | ||
<xsl:variable name="source" select="collection()[2]"/> | ||
<!-- The head element of that document; it is inserted into every finalized chunk. --> | ||
<xsl:variable name="head" select="$source/html/head"/> | ||
|
||
<!-- Identity template: copies every attribute and node that no more specific template handles. --> | ||
<xsl:template match="@*|node()"> | ||
<xsl:copy> | ||
<xsl:apply-templates select="@*|node()"/> | ||
</xsl:copy> | ||
</xsl:template> | ||
|
||
<!-- Rebuilds the root element of a chunk: keeps its attributes (except xml:base) and namespace nodes, inserts the head taken from $source, | ||
     then processes the remaining children. The text of the first h1 of the chunk, or else of the first h2, is passed down as the | ||
     tunnel parameter `title`. --> | ||
<xsl:template match="/html"> | ||
<xsl:copy> | ||
<xsl:copy-of select="(@* except @xml:base) | namespace::*"/> | ||
<xsl:apply-templates select="$head"> | ||
<xsl:with-param name="title" select="(((//h1)[1])/string(.),((//h2)[1])/string(.))[1]" tunnel="yes"/> | ||
</xsl:apply-templates> | ||
<xsl:apply-templates select="node() except head"/> | ||
</xsl:copy> | ||
</xsl:template> | ||
|
||
<!-- Unwraps the section children of body: the body keeps its own tts:* attributes, receives the attributes of its section children, | ||
     and the content of those sections becomes the content of body. --> | ||
<xsl:template match="body"> | ||
<xsl:copy> | ||
<!-- TODO: try to not "depend" on the TTS namespace here --> | ||
<xsl:copy-of select="@tts:*|section/@*"/> | ||
<xsl:apply-templates select="section/node()"/> | ||
</xsl:copy> | ||
</xsl:template> | ||
|
||
<!-- Writes the title element: uses the tunnel parameter `title` when it is supplied, otherwise the text of the matched title element. --> | ||
<xsl:template match="title"> | ||
<xsl:param name="title" tunnel="yes" as="xs:string?"/> | ||
<title> | ||
<xsl:apply-templates select="@*"/> | ||
<xsl:value-of select="($title,string(.))[1]"/> | ||
</title> | ||
</xsl:template> | ||
|
||
</xsl:stylesheet> |
Oops, something went wrong.