Skip to content

Commit

Permalink
Extract generic chunker step from html-chunker
Browse files Browse the repository at this point in the history
The generic part is just a dumb, HTML-unaware chunker that chunks
based on an (HTML-specific) style sheet and preserves the structure of
the input. The generic step is followed by a "finalize" step that
cleans up the structure of the resulting chunks.

See daisy/pipeline-scripts#123
  • Loading branch information
bertfrees committed Oct 1, 2018
1 parent e6d15b5 commit d0da536
Show file tree
Hide file tree
Showing 6 changed files with 266 additions and 133 deletions.
47 changes: 47 additions & 0 deletions html-utils/src/main/resources/xml/xproc/chunker.xpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
<?xml version="1.0" encoding="UTF-8"?>
<p:declare-step type="px:chunker"
                xmlns:p="http://www.w3.org/ns/xproc"
                xmlns:px="http://www.daisy.org/ns/pipeline/xproc"
                xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
                exclude-inline-prefixes="#all"
                version="1.0"
                name="main">

    <p:documentation>
        <p xmlns="http://www.w3.org/1999/xhtml">Break a document into smaller parts.</p>
    </p:documentation>

    <p:input port="source"/>
    <p:option name="stylesheet" required="true">
        <p:documentation>
            <p xmlns="http://www.w3.org/1999/xhtml">An XSLT style sheet that specifies the break
            points. For each node that should be put in its own chunk, the style sheet must
            contain a template in the `is-chunk` mode that matches this node and returns
            `true()`.</p>
        </p:documentation>
    </p:option>
    <!-- The chunks are the secondary (xsl:result-document) outputs of the XSLT step below. -->
    <p:output port="result" sequence="true">
        <p:pipe step="xslt" port="secondary"/>
    </p:output>

    <!--
        "Compile" the generic chunker style sheet: substitute the placeholder value '$stylesheet'
        of its xsl:include/@href with the URI given in the "stylesheet" option.

        The "replace" option of p:string-replace is itself an XPath expression, so the URI is
        turned into a string literal by wrapping it in double quotes. Double quotes inside the
        URI are escaped by doubling them (XPath 2.0 string literal syntax), otherwise such a
        value would terminate the literal early and produce an invalid or different expression.
        NOTE(review): replace() requires an XPath 2.0 capable XProc processor; html-chunker.xpl
        already relies on XPath 2.0 through resolve-uri().
    -->
    <p:string-replace match="/xsl:stylesheet/xsl:include/@href[.='$stylesheet']" name="compile">
        <p:input port="source">
            <p:document href="../xslt/chunker.xsl"/>
        </p:input>
        <p:with-option name="replace"
                       select="concat('&quot;',replace($stylesheet,'&quot;','&quot;&quot;'),'&quot;')"/>
    </p:string-replace>

    <!-- Run the compiled style sheet on the input document. -->
    <p:xslt name="xslt">
        <p:input port="source">
            <p:pipe step="main" port="source"/>
        </p:input>
        <p:input port="stylesheet">
            <p:pipe step="compile" port="result"/>
        </p:input>
        <p:input port="parameters">
            <p:empty/>
        </p:input>
    </p:xslt>
    <!-- The primary result of the XSLT step is discarded; only the secondary documents are used. -->
    <p:sink/>

</p:declare-step>
48 changes: 35 additions & 13 deletions html-utils/src/main/resources/xml/xproc/html-chunker.xpl
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,44 @@
<p:declare-step type="px:html-chunker"
xmlns:p="http://www.w3.org/ns/xproc"
xmlns:px="http://www.daisy.org/ns/pipeline/xproc"
xmlns:html="http://www.w3.org/1999/xhtml"
exclude-inline-prefixes="#all"
version="1.0">
version="1.0"
name="main">

<p:documentation>
<p xmlns="http://www.w3.org/1999/xhtml">Break a HTML document into smaller parts based on
its structure.</p>
</p:documentation>

<p:input port="source"/>
<p:output port="result" sequence="true">
<p:pipe step="xslt" port="secondary"/>
</p:output>
<p:output port="result" sequence="true"/>

<p:import href="chunker.xpl"/>

<!-- Remove the head before chunking. The finalize style sheet used below reads the head
     from the second source document, which is the unmodified input of this step. -->
<p:delete match="/html:html/html:head"/>

<!-- Generic chunking, driven by the HTML specific break point style sheet.
     NOTE(review): the inline <this/> document presumably only serves as context for
     resolve-uri(), so that the relative path is resolved against this file; confirm. -->
<px:chunker>
<p:with-option name="stylesheet" select="resolve-uri('../xslt/html-chunker-break-points.xsl')">
<p:inline>
<this/>
</p:inline>
</p:with-option>
</px:chunker>

<p:xslt name="xslt">
<p:input port="stylesheet">
<p:document href="../xslt/html-chunker.xsl"/>
</p:input>
<p:input port="parameters">
<p:empty/>
</p:input>
</p:xslt>
<p:sink/>
<!-- Clean up every chunk individually. Each XSLT invocation receives two source documents:
     the current chunk followed by the original input of this step. -->
<p:for-each name="chunks">
<p:xslt>
<p:input port="source">
<p:pipe step="chunks" port="current"/>
<p:pipe step="main" port="source"/>
</p:input>
<p:input port="stylesheet">
<p:document href="../xslt/html-chunker-finalize.xsl"/>
</p:input>
<p:input port="parameters">
<p:empty/>
</p:input>
</p:xslt>
</p:for-each>

</p:declare-step>
116 changes: 116 additions & 0 deletions html-utils/src/main/resources/xml/xslt/chunker.xsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
<?xml version="1.0" encoding="utf-8"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns:f="http://www.daisy.org/ns/pipeline/internal-functions"
version="2.0"
exclude-result-prefixes="#all">

<xsl:include href="$stylesheet"/>

<xsl:output method="xhtml" indent="yes"/>

<!-- Root of the input document. Kept in a global variable so that it can still be reached
     while the context node is inside one of the generated chunk documents. -->
<xsl:variable name="doc" select="/"/>

<!-- Look up elements by the value of their id or xml:id attribute. -->
<xsl:key name="ids" match="*" use="@id|@xml:id"/>

<!-- All chunks, one document node per chunk, produced by the "chunking" mode. -->
<xsl:variable name="chunks" as="document-node()*">
<xsl:apply-templates select="/*" mode="chunking"/>
</xsl:variable>

<!-- generate-id() of every chunk; f:chunk-name uses it to find the position of a chunk. -->
<xsl:variable name="chunks-ids" as="xs:string*" select="$chunks/generate-id()"/>

<!--
    Compute the file name (without directory) of a chunk.

    The name is derived from the last path segment of the base URI of the input document by
    inserting "-N" before the extension, where N is the 1-based position of the chunk in
    $chunks, e.g. "book.xhtml" becomes "book-3.xhtml".

    The last path segment is isolated first, so that a dot in a directory name can never be
    mistaken for the start of the extension, and the extension is optional, so that a file
    name without extension ("book") still yields a distinct name per chunk ("book-3").

    $chunk: one of the document nodes in $chunks
-->
<xsl:function name="f:chunk-name" as="xs:string">
    <xsl:param name="chunk" as="document-node()"/>
    <xsl:variable name="file-name" as="xs:string?"
                  select="tokenize(base-uri($doc/*),'/')[last()]"/>
    <!-- $1 = name without extension, $2 = extension including the dot (possibly absent) -->
    <xsl:sequence
        select="replace($file-name,
                        '^(.+?)(\.[^.]*)?$',
                        concat('$1-',index-of($chunks-ids,generate-id($chunk)),'$2'))"/>
</xsl:function>

<!--
    Entry point: write every chunk to its own result document, located next to the input
    document and named according to f:chunk-name.
-->
<xsl:template match="/">
    <!-- Base URI of the input without its last path segment. The chunk name is appended with
         concat() instead of being passed as the replacement argument of replace(), because
         "$" and "\" have a special meaning in a replacement string and a file name that
         contains them would raise an error or be altered. -->
    <xsl:variable name="base-dir" as="xs:string"
                  select="replace(base-uri($doc/*),'[^/]+$','')"/>
    <xsl:for-each select="$chunks">
        <xsl:result-document href="{concat($base-dir,f:chunk-name(.))}">
            <xsl:apply-templates select="/*"/>
        </xsl:result-document>
    </xsl:for-each>
</xsl:template>

<!--
    Fix internal links ("#id"): when the target element ended up in another chunk than the
    link itself, prefix the fragment with the file name of that chunk. Links that can not be
    resolved are reported with xsl:message and left unchanged.
-->
<xsl:template match="@href[starts-with(.,'#')]">
    <xsl:variable name="refid" select="substring(.,2)" as="xs:string"/>
    <!-- The chunk that contains the target. If the id occurs in several chunks the first one
         is used, so that f:chunk-name always receives exactly one document node. -->
    <xsl:variable name="refchunk" as="document-node()?"
                  select="$chunks[key('ids',$refid,.)][1]"/>
    <xsl:if test="empty($refchunk)">
        <xsl:message>Unable to resolve link to '<xsl:value-of select="."/>'</xsl:message>
    </xsl:if>
    <!-- Compare by node identity ("is"). A general comparison ("=") would atomize both
         documents and compare their complete text content, which is slow and wrongly treats
         two different chunks with the same text as the same chunk. The slash is put in
         parentheses on the right hand side because "/ is ..." would be parsed as the path
         expression "/is". -->
    <xsl:attribute name="href"
                   select="if (empty($refchunk) or $refchunk is (/)) then .
                           else concat(f:chunk-name($refchunk),.)"/>
</xsl:template>

<!-- Identity transformation: copy everything that is not handled by a more specific rule. -->
<xsl:template match="@*|node()">
    <xsl:copy>
        <xsl:apply-templates select="@*, node()"/>
    </xsl:copy>
</xsl:template>

<!--
    Split the content of the matched element into chunks.

    Only child elements and text nodes that are not whitespace-only are selected, other child
    nodes are not carried over. Adjacent children are grouped by whether they are, or contain,
    an element for which f:is-chunk returns true.
-->
<xsl:template mode="chunking" match="*">
<!-- remember the matched element: the context item changes inside the loops below -->
<xsl:variable name="this" select="."/>
<xsl:for-each-group select="text()[normalize-space()]|*"
group-adjacent="exists(descendant-or-self::*[f:is-chunk(.)])">
<xsl:choose>
<!-- group of elements that are, or contain, break points: handle them one by one -->
<xsl:when test="current-grouping-key()">
<xsl:for-each select="current-group()">
<xsl:choose>
<!-- the element itself is a break point: it becomes a chunk of its own -->
<xsl:when test="f:is-chunk(.)">
<xsl:call-template name="copy-ancestors">
<xsl:with-param name="nodes" select="."/>
<xsl:with-param name="original-parent" select="$this"/>
</xsl:call-template>
</xsl:when>
<!-- the break points are deeper down: recurse into the element -->
<xsl:otherwise>
<xsl:apply-templates mode="#current" select="."/>
</xsl:otherwise>
</xsl:choose>
</xsl:for-each>
</xsl:when>
<!-- run of adjacent nodes without any break point: together they form one chunk -->
<xsl:otherwise>
<xsl:call-template name="copy-ancestors">
<xsl:with-param name="nodes" select="current-group()"/>
<xsl:with-param name="original-parent" select="$this"/>
</xsl:call-template>
</xsl:otherwise>
</xsl:choose>
</xsl:for-each-group>
</xsl:template>

<!--
    Wrap the given nodes in a copy (name, namespace and attributes) of $original-parent, then
    repeat this for every further ancestor element, so that the result is a document with the
    same ancestor structure as the input.

    $nodes:           the content to be wrapped
    $original-parent: the element of the input document that contained the content
-->
<xsl:template name="copy-ancestors">
    <xsl:param name="nodes" as="node()*" required="yes"/>
    <xsl:param name="original-parent" as="element()" required="yes"/>
    <xsl:variable name="grandparent" as="element()?" select="$original-parent/parent::*"/>
    <xsl:variable name="wrapped">
        <xsl:element name="{local-name($original-parent)}"
                     namespace="{namespace-uri($original-parent)}">
            <xsl:sequence select="$original-parent/@*, $nodes"/>
        </xsl:element>
    </xsl:variable>
    <xsl:choose>
        <!-- more ancestors to go: wrap the result once more -->
        <xsl:when test="exists($grandparent)">
            <xsl:call-template name="copy-ancestors">
                <xsl:with-param name="original-parent" select="$grandparent"/>
                <xsl:with-param name="nodes" select="$wrapped"/>
            </xsl:call-template>
        </xsl:when>
        <!-- the root element has been reached: return the document -->
        <xsl:otherwise>
            <xsl:sequence select="$wrapped"/>
        </xsl:otherwise>
    </xsl:choose>
</xsl:template>

<!--
    Whether a node is a break point, i.e. must be put in a chunk of its own. The decision is
    delegated to the templates in the "is-chunk" mode.
-->
<xsl:function name="f:is-chunk" as="xs:boolean">
    <xsl:param name="node" as="node()"/>
    <xsl:variable name="answer" as="xs:boolean">
        <xsl:apply-templates select="$node" mode="is-chunk"/>
    </xsl:variable>
    <xsl:sequence select="$answer"/>
</xsl:function>

<!-- Fallback with a very low priority: a node is not a break point unless another template
     in the "is-chunk" mode says so. -->
<xsl:template match="node()" mode="is-chunk" priority="-100" as="xs:boolean">
    <xsl:sequence select="false()"/>
</xsl:template>

</xsl:stylesheet>
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
<?xml version="1.0" encoding="utf-8"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns="http://www.w3.org/1999/xhtml"
xmlns:epub="http://www.idpf.org/2007/ops"
xpath-default-namespace="http://www.w3.org/1999/xhtml"
version="2.0">

<!-- Break point: a top-level section, unless it is a "bodymatter" section that has child
     sections. -->
<xsl:template match="/html/body/section[not(epub:has-type(.,'bodymatter') and child::section)]"
              mode="is-chunk"
              as="xs:boolean">
    <xsl:sequence select="true()"/>
</xsl:template>

<!-- Break point: a child section of a top-level "bodymatter" section. -->
<xsl:template match="/html/body/section[epub:has-type(.,'bodymatter')]/section"
              mode="is-chunk"
              as="xs:boolean">
    <xsl:sequence select="true()"/>
</xsl:template>

<!--
    Whether the whitespace separated epub:type attribute of $element contains the token $type.
    Returns false when the attribute is absent.
-->
<xsl:function name="epub:has-type" as="xs:boolean">
    <xsl:param name="element" as="element()"/>
    <xsl:param name="type" as="xs:string"/>
    <xsl:variable name="types" as="xs:string*" select="tokenize($element/@epub:type,'\s+')"/>
    <xsl:sequence select="$type = $types"/>
</xsl:function>

</xsl:stylesheet>
46 changes: 46 additions & 0 deletions html-utils/src/main/resources/xml/xslt/html-chunker-finalize.xsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
<?xml version="1.0" encoding="utf-8"?>
<xsl:stylesheet xmlns="http://www.w3.org/1999/xhtml"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns:epub="http://www.idpf.org/2007/ops"
xmlns:tts="http://www.daisy.org/ns/pipeline/tts"
xpath-default-namespace="http://www.w3.org/1999/xhtml"
exclude-result-prefixes="#all"
version="2.0">

<!-- Second document of the default collection.
     NOTE(review): presumably the second document on the source port of p:xslt, which
     html-chunker.xpl connects to the original, unchunked input; confirm. -->
<xsl:variable name="source" select="collection()[2]"/>
<!-- The head element of that document; it is processed once for every chunk. -->
<xsl:variable name="head" select="$source/html/head"/>

<!-- Identity transformation: copy everything that is not handled by a more specific rule. -->
<xsl:template match="@*|node()">
    <xsl:copy>
        <xsl:apply-templates select="@*, node()"/>
    </xsl:copy>
</xsl:template>

<!--
    Root element of a chunk: copy it without xml:base, insert the processed $head and then
    process the remaining content. The text of the first h1 in the chunk, or else of the
    first h2, is handed down as tunnel parameter "title".
-->
<xsl:template match="/html">
    <xsl:variable name="chunk-title" as="xs:string?"
                  select="(((//h1)[1])/string(.),((//h2)[1])/string(.))[1]"/>
    <xsl:copy>
        <xsl:copy-of select="(@* except @xml:base) | namespace::*"/>
        <xsl:apply-templates select="$head">
            <xsl:with-param name="title" tunnel="yes" select="$chunk-title"/>
        </xsl:apply-templates>
        <xsl:apply-templates select="node() except head"/>
    </xsl:copy>
</xsl:template>

<!--
    Unwrap the section children of body: their attributes are moved to the body element and
    their content becomes the content of body. Of the attributes of body itself only those in
    the TTS namespace are kept.
-->
<xsl:template match="body">
    <xsl:copy>
        <!-- TODO: try to not "depend" on the TTS namespace here -->
        <xsl:copy-of select="@tts:*"/>
        <xsl:copy-of select="section/@*"/>
        <xsl:apply-templates select="section/node()"/>
    </xsl:copy>
</xsl:template>

<!--
    Replace the text of the title element with the tunnel parameter "title" when it is
    present, otherwise keep the original text.
-->
<xsl:template match="title">
    <xsl:param name="title" tunnel="yes" as="xs:string?"/>
    <xsl:variable name="text" as="xs:string"
                  select="if (exists($title)) then $title else string(.)"/>
    <title>
        <xsl:apply-templates select="@*"/>
        <xsl:value-of select="$text"/>
    </title>
</xsl:template>

</xsl:stylesheet>
Loading

0 comments on commit d0da536

Please sign in to comment.