From 7dabb92db98920d61a73e1d8ed47e64095bf55a1 Mon Sep 17 00:00:00 2001 From: Henry Date: Mon, 27 May 2024 09:55:37 -0700 Subject: [PATCH] Content iterator (#82) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Mickaël Menu --- .github/workflows/build.yml | 2 +- .github/workflows/release.yml | 2 +- go.mod | 17 +- go.sum | 39 +- pkg/archive/archive_zip.go | 68 ++- pkg/content/content.go | 53 +++ pkg/content/element/attributes.go | 71 +++ pkg/content/element/element.go | 257 +++++++++++ pkg/content/element/text_role.go | 43 ++ pkg/content/iterator/html.go | 167 +++++++ pkg/content/iterator/html_converter.go | 600 +++++++++++++++++++++++++ pkg/content/iterator/iterator.go | 73 +++ pkg/content/iterator/publication.go | 156 +++++++ pkg/fetcher/fetcher_archive.go | 49 +- pkg/fetcher/fetcher_archive_test.go | 52 +-- pkg/fetcher/fetcher_file.go | 4 + pkg/fetcher/resource.go | 12 + pkg/fetcher/resource_bytes.go | 9 +- pkg/internal/extensions/generics.go | 33 ++ pkg/internal/util/css.go | 122 +++++ pkg/internal/util/css_test.go | 53 +++ pkg/manifest/contributor.go | 13 +- pkg/manifest/link.go | 42 ++ pkg/manifest/link_test.go | 48 +- pkg/manifest/locator.go | 31 +- pkg/manifest/manifest.go | 81 ++++ pkg/manifest/manifest_test.go | 139 ++++++ pkg/manifest/metadata.go | 8 +- pkg/manifest/properties.go | 13 +- pkg/manifest/properties_test.go | 4 +- pkg/mediatype/mediatype_of.go | 14 +- pkg/mediatype/sniffer.go | 35 +- pkg/parser/epub/deobfuscator.go | 148 +++--- pkg/parser/epub/deobfuscator_test.go | 4 +- pkg/parser/epub/parser.go | 6 +- pkg/parser/epub/positions_service.go | 4 +- pkg/parser/pdf/parser.go | 7 +- pkg/parser/pdf/parser_metadata.go | 13 +- pkg/pub/publication.go | 28 +- pkg/pub/service.go | 1 + pkg/pub/service_content.go | 101 +++++ pkg/streamer/a11y_infer_test.go | 2 +- 42 files changed, 2391 insertions(+), 233 deletions(-) create mode 100644 pkg/content/content.go create mode 100644 
pkg/content/element/attributes.go create mode 100644 pkg/content/element/element.go create mode 100644 pkg/content/element/text_role.go create mode 100644 pkg/content/iterator/html.go create mode 100644 pkg/content/iterator/html_converter.go create mode 100644 pkg/content/iterator/iterator.go create mode 100644 pkg/content/iterator/publication.go create mode 100644 pkg/internal/util/css.go create mode 100644 pkg/internal/util/css_test.go create mode 100644 pkg/pub/service_content.go diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 2821d962..ea303847 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -16,7 +16,7 @@ jobs: - name: Set up Go uses: actions/setup-go@v2 with: - go-version: 1.18 + go-version: 1.21 - name: Build run: go build -v ./... diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 485a5119..d004e69e 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -18,7 +18,7 @@ jobs: - run: git fetch --force --tags - uses: actions/setup-go@v3 with: - go-version: '>=1.20.0' + go-version: '>=1.21.0' cache: true - uses: goreleaser/goreleaser-action@v4 with: diff --git a/go.mod b/go.mod index 6766cf20..a8797d41 100644 --- a/go.mod +++ b/go.mod @@ -1,13 +1,14 @@ module github.com/readium/go-toolkit -go 1.18 +go 1.21 require ( github.com/agext/regexp v1.3.0 + github.com/andybalholm/cascadia v1.3.2 github.com/deckarep/golang-set v1.7.1 github.com/gorilla/mux v1.7.4 github.com/opds-community/libopds2-go v0.0.0-20170628075933-9c163cf60f6e - github.com/pdfcpu/pdfcpu v0.3.13 + github.com/pdfcpu/pdfcpu v0.5.0 github.com/pkg/errors v0.9.1 github.com/readium/xmlquery v0.0.0-20230106230237-8f493145aef4 github.com/relvacode/iso8601 v1.1.0 @@ -18,8 +19,8 @@ require ( github.com/stretchr/testify v1.7.0 github.com/trimmer-io/go-xmp v1.0.0 github.com/urfave/negroni v1.0.0 - golang.org/x/net v0.7.0 - golang.org/x/text v0.7.0 + golang.org/x/net v0.10.0 + golang.org/x/text 
v0.12.0 ) require ( @@ -29,8 +30,8 @@ require ( github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect github.com/gopherjs/gopherjs v0.0.0-20190910122728-9d188e94fb99 // indirect github.com/hashicorp/hcl v1.0.0 // indirect - github.com/hhrutter/lzw v0.0.0-20190829144645-6f07a24e8650 // indirect - github.com/hhrutter/tiff v0.0.0-20190829141212-736cae8d0bc7 // indirect + github.com/hhrutter/lzw v1.0.0 // indirect + github.com/hhrutter/tiff v1.0.1 // indirect github.com/inconshreveable/mousetrap v1.0.1 // indirect github.com/magiconair/properties v1.8.5 // indirect github.com/mitchellh/mapstructure v1.4.1 // indirect @@ -40,8 +41,8 @@ require ( github.com/spf13/cast v1.3.1 // indirect github.com/spf13/jwalterweatherman v1.1.0 // indirect github.com/subosito/gotenv v1.2.0 // indirect - golang.org/x/image v0.5.0 // indirect - golang.org/x/sys v0.5.0 // indirect + golang.org/x/image v0.11.0 // indirect + golang.org/x/sys v0.8.0 // indirect gopkg.in/ini.v1 v1.62.0 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect diff --git a/go.sum b/go.sum index 80dd7650..3e78ec78 100644 --- a/go.sum +++ b/go.sum @@ -41,6 +41,8 @@ github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03 github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= github.com/agext/regexp v1.3.0 h1:6+9tp+S41TU48gFNV47bX+pp1q7WahGofw6JccmsCDs= github.com/agext/regexp v1.3.0/go.mod h1:6phv1gViOJXWcTfpxOi9VMS+MaSAo+SUDf7do3ur1HA= +github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss= +github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU= github.com/antchfx/xpath v1.2.1 h1:qhp4EW6aCOVr5XIkT+l6LJ9ck/JsUH/yyauNgTQkBF8= github.com/antchfx/xpath v1.2.1/go.mod h1:i54GszH55fYfBmoZXapTHN8T8tkcHfRgLyVwwqzXNcs= github.com/antihax/optional v1.0.0/go.mod h1:uupD/76wgC+ih3iEmQUL+0Ugr19nfwCT1kdvxnR2qWY= @@ -171,11 
+173,10 @@ github.com/hashicorp/logutils v1.0.0/go.mod h1:QIAnNjmIWmVIIkWDTG1z5v++HQmx9WQRO github.com/hashicorp/mdns v1.0.0/go.mod h1:tL+uN++7HEJ6SQLQ2/p+z2pH24WQKWjBPkE0mNTz8vQ= github.com/hashicorp/memberlist v0.1.3/go.mod h1:ajVTdAv/9Im8oMAAj5G31PhhMCZJV2pPBoIllUwCN7I= github.com/hashicorp/serf v0.8.2/go.mod h1:6hOLApaqBFA1NXqRQAsxw9QxuDEvNxSQRwA/JwenrHc= -github.com/hhrutter/lzw v0.0.0-20190827003112-58b82c5a41cc/go.mod h1:yJBvOcu1wLQ9q9XZmfiPfur+3dQJuIhYQsMGLYcItZk= -github.com/hhrutter/lzw v0.0.0-20190829144645-6f07a24e8650 h1:1yY/RQWNSBjJe2GDCIYoLmpWVidrooriUr4QS/zaATQ= -github.com/hhrutter/lzw v0.0.0-20190829144645-6f07a24e8650/go.mod h1:yJBvOcu1wLQ9q9XZmfiPfur+3dQJuIhYQsMGLYcItZk= -github.com/hhrutter/tiff v0.0.0-20190829141212-736cae8d0bc7 h1:o1wMw7uTNyA58IlEdDpxIrtFHTgnvYzA8sCQz8luv94= -github.com/hhrutter/tiff v0.0.0-20190829141212-736cae8d0bc7/go.mod h1:WkUxfS2JUu3qPo6tRld7ISb8HiC0gVSU91kooBMDVok= +github.com/hhrutter/lzw v1.0.0 h1:laL89Llp86W3rRs83LvKbwYRx6INE8gDn0XNb1oXtm0= +github.com/hhrutter/lzw v1.0.0/go.mod h1:2HC6DJSn/n6iAZfgM3Pg+cP1KxeWc3ezG8bBqW5+WEo= +github.com/hhrutter/tiff v1.0.1 h1:MIus8caHU5U6823gx7C6jrfoEvfSTGtEFRiM8/LOzC0= +github.com/hhrutter/tiff v1.0.1/go.mod h1:zU/dNgDm0cMIa8y8YwcYBeuEEveI4B0owqHyiPpJPHc= github.com/ianlancetaylor/demangle v0.0.0-20181102032728-5e5cf60278f6/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= github.com/ianlancetaylor/demangle v0.0.0-20200824232613-28f6c0f3b639/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= github.com/inconshreveable/mousetrap v1.0.1 h1:U3uMjPSQEBMNp1lFxmllqCPM6P5u/Xq7Pgzkat/bFNc= @@ -213,8 +214,8 @@ github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3Rllmb github.com/opds-community/libopds2-go v0.0.0-20170628075933-9c163cf60f6e h1:kjurmIVxVypqhb5CUAG9jLhYL1TLsUE47KfoEm7cdlE= github.com/opds-community/libopds2-go v0.0.0-20170628075933-9c163cf60f6e/go.mod h1:U/OpXIq9O6FgLfzvun31PZt8iIlbG93BieaxjOEIAd0= github.com/pascaldekloe/goe 
v0.0.0-20180627143212-57f6aae5913c/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc= -github.com/pdfcpu/pdfcpu v0.3.13 h1:VFon2Yo1PJt+sA57vPAeXWGLSZ7Ux3Jl4h02M0+s3dg= -github.com/pdfcpu/pdfcpu v0.3.13/go.mod h1:UJc5xsXg0fpmjp1zOPdyYcAQArc/Zf3V0nv5URe+9fg= +github.com/pdfcpu/pdfcpu v0.5.0 h1:F3wC4bwPbaJM+RPgm1D0Q4SAUwxElw7BhwNvL3iPgDo= +github.com/pdfcpu/pdfcpu v0.5.0/go.mod h1:UPcHdWcMw1V6Bo5tcWHd3jZfkG8cwUwrJkQOlB6o+7g= github.com/pelletier/go-toml v1.9.3 h1:zeC5b1GviRUyKYd6OJPvBU/mcVDVoL1OhT17FCt5dSQ= github.com/pelletier/go-toml v1.9.3/go.mod h1:u1nR/EPcESfeI/szUZKdtJ0xRNbUoANCkoOuaOx1Y+c= github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= @@ -305,9 +306,8 @@ golang.org/x/exp v0.0.0-20200207192155-f17229e696bd/go.mod h1:J/WKrq2StrnmMY6+EH golang.org/x/exp v0.0.0-20200224162631-6cc2880d07d6/go.mod h1:3jZMyOhIsHpP37uCMkUooju7aAi5cS1Q23tOzKc+0MU= golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js= golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= -golang.org/x/image v0.0.0-20190823064033-3a9bac650e44/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= -golang.org/x/image v0.5.0 h1:5JMiNunQeQw++mMOz48/ISeNu3Iweh/JaZU8ZLqHRrI= -golang.org/x/image v0.5.0/go.mod h1:FVC7BI/5Ym8R25iw5OLsgshdUBbT1h5jZTpA+mvAdZ4= +golang.org/x/image v0.11.0 h1:ds2RoQvBvYTiJkwpSFDwCcDFNX7DqjL2WsUgTNk0Ooo= +golang.org/x/image v0.11.0/go.mod h1:bglhjqbqVuEb9e9+eNR45Jfu7D+T4Qan+NhQk8Ck2P8= golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= golang.org/x/lint v0.0.0-20190301231843-5614ed5bae6f/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= @@ -332,6 +332,7 @@ golang.org/x/mod v0.4.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.4.1/go.mod 
h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20181023162649-9b4f9f5ad519/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= @@ -370,8 +371,10 @@ golang.org/x/net v0.0.0-20210316092652-d523dce5a7f4/go.mod h1:RBQZq4jEuRlivfhVLd golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96bSt6lcn1PtDYWL6XObtHCRCNQM= golang.org/x/net v0.0.0-20220127200216-cd36cc0744dd/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= -golang.org/x/net v0.7.0 h1:rJrUqqhjsgNp7KqAIc25s9pZnjU7TUcSY7HcVZjdn1g= -golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns= +golang.org/x/net v0.10.0 h1:X2//UzNDwYmtCLn7To6G58Wr6f5ahEAQgKNzv9Y951M= +golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= @@ -396,6 +399,7 @@ golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync 
v0.0.0-20201207232520-09787c993a3a/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sys v0.0.0-20180823144017-11551d06cbcc/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20181026203630-95b1ffbd15a5/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= @@ -442,10 +446,14 @@ golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.5.0 h1:MUK/U/4lj1t1oPg0HfuXDN/Z1wv31ZJ/YcPiGccS4DU= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.8.0 h1:EBmGv8NaZBZTWvrbjNoL6HVt+IVy3QDQpJs7VRIw3tU= +golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= +golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY= golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod 
h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= @@ -454,8 +462,10 @@ golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.4/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= -golang.org/x/text v0.7.0 h1:4BRB4x83lYWy72KwLD/qYDuTu7q9PjSagHvijDw7cLo= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= +golang.org/x/text v0.12.0 h1:k+n5B8goJNdU7hSvEtMUz3d1Q6D/XW4COJSJR6fN0mc= +golang.org/x/text v0.12.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= @@ -511,6 +521,7 @@ golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4f golang.org/x/tools v0.1.0/go.mod h1:xkSsbof2nBLbhDlRMhhhyNLN/zl3eTqcnHD5viDpcZ0= golang.org/x/tools v0.1.2/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors 
v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= diff --git a/pkg/archive/archive_zip.go b/pkg/archive/archive_zip.go index 7fbe5898..57bbd17f 100644 --- a/pkg/archive/archive_zip.go +++ b/pkg/archive/archive_zip.go @@ -31,20 +31,20 @@ func (e gozipArchiveEntry) CompressedLength() uint64 { return e.file.CompressedSize64 } +// This is a special mode to minimize the number of reads from the underlying reader. +// It's especially useful when trying to stream the ZIP from a remote file, e.g. +// cloud storage. It's only enabled when trying to read the entire file and compression +// is enabled. Care needs to be taken to cover every edge case. +func (e gozipArchiveEntry) couldMinimizeReads() bool { + return e.minimizeReads && e.CompressedLength() > 0 +} + func (e gozipArchiveEntry) Read(start int64, end int64) ([]byte, error) { if end < start { return nil, errors.New("range not satisfiable") } - minimizeReads := false - if e.CompressedLength() > 0 && e.minimizeReads && start == 0 && end == 0 { - // This is a special mode to minimize the number of reads from the underlying reader. - // It's especially useful when trying to stream the ZIP from a remote file, e.g. - // cloud storage. It's only enabled when trying to read the entire file and compression - // is enabled. Maybe at some point it should be enabled for range reads as well, but - // that's something that depends on the usecase, size of the file etc. so it's off for now. 
- minimizeReads = true - } + minimizeReads := e.couldMinimizeReads() var f io.Reader var err error @@ -70,13 +70,10 @@ func (e gozipArchiveEntry) Read(start int64, end int64) ([]byte, error) { } frdr := flate.NewReader(bytes.NewReader(compressedData)) defer frdr.Close() - data := make([]byte, e.file.UncompressedSize64) - _, err = io.ReadFull(frdr, data) - if err != nil { - return nil, err - } - return data, nil - } else if start == 0 && end == 0 { + f = frdr + } + + if start == 0 && end == 0 { data := make([]byte, e.file.UncompressedSize64) _, err := io.ReadFull(f, data) if err != nil { @@ -90,23 +87,48 @@ func (e gozipArchiveEntry) Read(start int64, end int64) ([]byte, error) { return nil, err } } - data := make([]byte, end-start+1) - n, err := f.Read(data) + data := make([]byte, min(end-start+1, int64(e.file.UncompressedSize64))) + _, err = io.ReadFull(f, data) if err != nil { return nil, err } - return data[:n], nil + return data, nil } func (e gozipArchiveEntry) Stream(w io.Writer, start int64, end int64) (int64, error) { if end < start { return -1, errors.New("range not satisfiable") } - f, err := e.file.Open() - if err != nil { - return -1, err + + minimizeReads := e.couldMinimizeReads() && start == 0 && end == 0 + + var f io.Reader + var err error + if minimizeReads { + f, err = e.file.OpenRaw() + if err != nil { + return -1, err + } + } else { + rc, err := e.file.Open() + if err != nil { + return -1, err + } + defer rc.Close() + f = rc + } + + if minimizeReads { + compressedData := make([]byte, e.file.CompressedSize64) + _, err := io.ReadFull(f, compressedData) + if err != nil { + return -1, err + } + frdr := flate.NewReader(bytes.NewReader(compressedData)) + defer frdr.Close() + f = frdr } - defer f.Close() + if start == 0 && end == 0 { return io.Copy(w, f) } diff --git a/pkg/content/content.go b/pkg/content/content.go new file mode 100644 index 00000000..ffed15f4 --- /dev/null +++ b/pkg/content/content.go @@ -0,0 +1,53 @@ +package content + +import ( + 
"strings" + + "github.com/readium/go-toolkit/pkg/content/element" + "github.com/readium/go-toolkit/pkg/content/iterator" +) + +type Content interface { + Text(separator *string) (string, error) // Extracts the full raw text, or returns null if no text content can be found. + Iterator() iterator.Iterator // Creates a new iterator for this content. + Elements() ([]element.Element, error) // Returns all the elements as a list. +} + +// Extracts the full raw text, or returns null if no text content can be found. +func ContentText(content Content, separator *string) (string, error) { + sep := "\n" + if separator != nil { + sep = *separator + } + var sb strings.Builder + els, err := content.Elements() + if err != nil { + return "", err + } + for _, el := range els { + if txel, ok := el.(element.TextualElement); ok { + txt := txel.Text() + if txt != "" { + sb.WriteString(txel.Text()) + sb.WriteString(sep) + } + } + } + return strings.TrimSuffix(sb.String(), sep), nil +} + +func ContentElements(content Content) ([]element.Element, error) { + var elements []element.Element + it := content.Iterator() + for { + hasNext, err := it.HasNext() + if err != nil { + return nil, err + } + if !hasNext { + break + } + elements = append(elements, it.Next()) + } + return elements, nil +} diff --git a/pkg/content/element/attributes.go b/pkg/content/element/attributes.go new file mode 100644 index 00000000..5c4de9d7 --- /dev/null +++ b/pkg/content/element/attributes.go @@ -0,0 +1,71 @@ +package element + +type AttributeKey string + +const AcessibilityLabelAttributeKey AttributeKey = "accessibilityLabel" +const LanguageAttributeKey AttributeKey = "language" + +// An attribute is an arbitrary key-value metadata pair. +type Attribute[T any] struct { + Key AttributeKey + Value T +} + +func NewAttribute(key AttributeKey, value any) Attribute[any] { + return Attribute[any]{ + Key: key, + Value: value, + } +} + +// An object associated with a list of attributes. 
+type AttributesHolder struct { + attributes []Attribute[any] // Associated list of attributes. +} + +func NewAttributesHolder(attributes []Attribute[any]) AttributesHolder { + return AttributesHolder{ + attributes: attributes, + } +} + +func (ah AttributesHolder) Language() string { + v := ah.GetFirst(LanguageAttributeKey) + if v != nil { + return v.Value.(string) + } + return "" +} + +func (ah AttributesHolder) AccessibilityLabel() string { + v := ah.GetFirst(AcessibilityLabelAttributeKey) + if v != nil { + return v.Value.(string) + } + return "" +} + +// Gets the first attribute with the given [key]. +func (ah AttributesHolder) GetFirst(key AttributeKey) *Attribute[any] { + for _, at := range ah.attributes { + if at.Key == key { + return &at + } + } + return nil +} + +// Gets all the attributes with the given [key]. +func (ah AttributesHolder) Get(key AttributeKey) AttributesHolder { + var result []Attribute[any] + for _, at := range ah.attributes { + if at.Key == key { + result = append(result, at) + } + } + return NewAttributesHolder(result) +} + +func (ah AttributesHolder) Attributes() AttributesHolder { + return NewAttributesHolder(ah.attributes) +} diff --git a/pkg/content/element/element.go b/pkg/content/element/element.go new file mode 100644 index 00000000..1f0ebc82 --- /dev/null +++ b/pkg/content/element/element.go @@ -0,0 +1,257 @@ +package element + +import ( + "encoding/json" + "strings" + + "github.com/readium/go-toolkit/pkg/manifest" +) + +// Note: We can't embed structs/interfaces in the interfaces otherwise they become +// "non-basic", meaning we then can't use them as returns for other interfaces like +// [Iterator], where it's the return type of many of the functions. Maybe we should +// rethink this approach with all the interfaces later when not copying the kotlin. + +// Represents a single semantic content element part of a publication. 
+type Element interface { + // AttributesHolder + Language() string + AccessibilityLabel() string + Attributes() AttributesHolder + + Locator() manifest.Locator // Locator targeting this element in the Publication. +} + +func ElementToMap(e Element) map[string]interface{} { + res := make(map[string]interface{}) + res["locator"] = e.Locator() + if l := e.Language(); l != "" { + res["language"] = l + } + if l := e.AccessibilityLabel(); l != "" { + res["accessibilityLabel"] = l + } + return res +} + +// An element which can be represented as human-readable text. +type TextualElement interface { + // AttributesHolder + Language() string + AccessibilityLabel() string + Attributes() AttributesHolder + + // Element + Locator() manifest.Locator // Locator targeting this element in the Publication. + + Text() string // Human-readable text representation for this element. +} + +// An element referencing an embedded external resource. +type EmbeddedElement interface { + // AttributesHolder + Language() string + AccessibilityLabel() string + Attributes() AttributesHolder + + // Element + Locator() manifest.Locator // Locator targeting this element in the Publication. + + EmbeddedLink() manifest.Link // Referenced resource in the publication. +} + +// An audio clip. 
+type AudioElement struct { + locator manifest.Locator + embeddedLink manifest.Link + AttributesHolder +} + +// Implements Element +func (e AudioElement) Locator() manifest.Locator { + return e.locator +} + +// Implements EmbeddedElement +func (e AudioElement) EmbeddedLink() manifest.Link { + e.embeddedLink.Href = strings.TrimPrefix(e.embeddedLink.Href, "/") + return e.embeddedLink +} + +// Implements TextualElement +func (e AudioElement) Text() string { + return e.AccessibilityLabel() +} + +func (e AudioElement) MarshalJSON() ([]byte, error) { + res := ElementToMap(e) + res["text"] = e.Text() + res["link"] = e.EmbeddedLink() + res["@type"] = "Video" + return json.Marshal(res) +} + +func NewAudioElement(locator manifest.Locator, embeddedLink manifest.Link, attributes []Attribute[any]) AudioElement { + return AudioElement{ + AttributesHolder: AttributesHolder{ + attributes: attributes, + }, + locator: locator, + embeddedLink: embeddedLink, + } +} + +// A video clip. +type VideoElement struct { + locator manifest.Locator + embeddedLink manifest.Link + AttributesHolder +} + +// Implements Element +func (e VideoElement) Locator() manifest.Locator { + return e.locator +} + +// Implements EmbeddedElement +func (e VideoElement) EmbeddedLink() manifest.Link { + e.embeddedLink.Href = strings.TrimPrefix(e.embeddedLink.Href, "/") + return e.embeddedLink +} + +// Implements TextualElement +func (e VideoElement) Text() string { + return e.AccessibilityLabel() +} + +func (e VideoElement) MarshalJSON() ([]byte, error) { + res := ElementToMap(e) + res["text"] = e.Text() + res["link"] = e.EmbeddedLink() + res["@type"] = "Video" + return json.Marshal(res) +} + +func NewVideoElement(locator manifest.Locator, embeddedLink manifest.Link, attributes []Attribute[any]) VideoElement { + return VideoElement{ + AttributesHolder: AttributesHolder{ + attributes: attributes, + }, + locator: locator, + embeddedLink: embeddedLink, + } +} + +// A bitmap image. 
+// The caption is a short piece of text associated with the image. +type ImageElement struct { + locator manifest.Locator + embeddedLink manifest.Link + caption string + AttributesHolder +} + +// Implements Element +func (e ImageElement) Locator() manifest.Locator { + return e.locator +} + +// Implements EmbeddedElement +func (e ImageElement) EmbeddedLink() manifest.Link { + e.embeddedLink.Href = strings.TrimPrefix(e.embeddedLink.Href, "/") + return e.embeddedLink +} + +// Implements TextualElement +func (e ImageElement) Text() string { + if e.caption != "" { + // The caption might be a better text description than the accessibility label, when available. + return e.caption + } + return e.AccessibilityLabel() +} + +func (e ImageElement) MarshalJSON() ([]byte, error) { + res := ElementToMap(e) + res["text"] = e.Text() + res["link"] = e.EmbeddedLink() + res["@type"] = "Image" + return json.Marshal(res) +} + +func NewImageElement(locator manifest.Locator, embeddedLink manifest.Link, caption string, attributes []Attribute[any]) ImageElement { + return ImageElement{ + AttributesHolder: AttributesHolder{ + attributes: attributes, + }, + caption: caption, + locator: locator, + embeddedLink: embeddedLink, + } +} + +// Ranged portion of text with associated attributes. +type TextSegment struct { + AttributesHolder // Attributes associated with this segment, e.g. language. + Locator manifest.Locator // Locator to the segment of text. + Text string // Text in the segment. +} + +// A text element. 
+type TextElement struct { + AttributesHolder + locator manifest.Locator + role TextRole + segments []TextSegment +} + +// Implements TextualElement +func (e TextElement) Text() string { + var sb strings.Builder + for _, v := range e.segments { + sb.WriteString(v.Text) + } + return sb.String() +} + +// Implements Element +func (e TextElement) Locator() manifest.Locator { + return e.locator +} + +func (e TextElement) Role() TextRole { + return e.role +} + +func (e TextElement) MarshalJSON() ([]byte, error) { + res := ElementToMap(e) + res["role"] = e.role.Role() + textElements := make([]interface{}, len(e.segments)) + for i, s := range e.segments { + te := map[string]interface{}{ + "locator": s.Locator, + "text": s.Text, + } + if l := s.Language(); l != "" { + te["language"] = l + } + if l := s.AccessibilityLabel(); l != "" { + te["accessibilityLabel"] = l + } + textElements[i] = te + } + res["text"] = textElements + res["@type"] = "Text" + return json.Marshal(res) +} + +func NewTextElement(locator manifest.Locator, role TextRole, segments []TextSegment, attributes []Attribute[any]) TextElement { + return TextElement{ + AttributesHolder: AttributesHolder{ + attributes: attributes, + }, + locator: locator, + role: role, + segments: segments, + } +} diff --git a/pkg/content/element/text_role.go b/pkg/content/element/text_role.go new file mode 100644 index 00000000..c4783717 --- /dev/null +++ b/pkg/content/element/text_role.go @@ -0,0 +1,43 @@ +package element + +import ( + "fmt" + "net/url" +) + +// Represents a purpose of an element in the broader context of the document. +type TextRole interface { + Role() string +} + +// Title of a section. +type Heading struct { + Level int // Heading importance, 1 being the highest. +} + +func (h Heading) Role() string { + return fmt.Sprintf("heading-%d", h.Level) +} + +// Normal body of content. +type Body struct{} + +func (b Body) Role() string { + return "body" +} + +// A footnote at the bottom of a document. 
+type Footnote struct{} + +func (f Footnote) Role() string { + return "footnote" +} + +type Quote struct { + ReferenceURL *url.URL // URL to the source for this quote. + ReferenceTitle string // Name of the source for this quote. +} + +func (q Quote) Role() string { + return "quote" +} diff --git a/pkg/content/iterator/html.go b/pkg/content/iterator/html.go new file mode 100644 index 00000000..bf388fab --- /dev/null +++ b/pkg/content/iterator/html.go @@ -0,0 +1,167 @@ +package iterator + +import ( + "strings" + + "github.com/andybalholm/cascadia" + "github.com/pkg/errors" + "github.com/readium/go-toolkit/pkg/content/element" + "github.com/readium/go-toolkit/pkg/fetcher" + "github.com/readium/go-toolkit/pkg/manifest" + "github.com/readium/go-toolkit/pkg/mediatype" + "golang.org/x/net/html" + "golang.org/x/net/html/atom" +) + +type HTMLContentIterator struct { + resource fetcher.Resource + locator manifest.Locator + BeforeMaxLength int // Locators will contain a `before` context of up to this amount of characters. + + currentElement *ElementWithDelta + currentIndex *int + parsedElements *ParsedElements +} + +// Iterates an HTML [resource], starting from the given [locator]. +// If you want to start mid-resource, the [locator] must contain a `cssSelector` key in its [Locator.Locations] object. +// If you want to start from the end of the resource, the [locator] must have a `progression` of 1.0. 
+// NewHTML creates a content iterator over the HTML [resource], positioned at
+// [locator]. BeforeMaxLength defaults to 50.
+func NewHTML(resource fetcher.Resource, locator manifest.Locator) *HTMLContentIterator {
+	return &HTMLContentIterator{
+		resource:        resource,
+		locator:         locator,
+		BeforeMaxLength: 50,
+	}
+}
+
+// HTMLFactory returns a factory that builds an [HTMLContentIterator] for
+// resources whose link media type matches HTML or XHTML, and nil otherwise.
+func HTMLFactory() ResourceContentIteratorFactory {
+	return func(resource fetcher.Resource, locator manifest.Locator) Iterator {
+		if resource.Link().MediaType().Matches(&mediatype.HTML, &mediatype.XHTML) {
+			return NewHTML(resource, locator)
+		}
+		return nil
+	}
+}
+
+// HasPrevious reports whether an element precedes the current position.
+// When it returns true, the element is stored for the following Previous() call.
+func (it *HTMLContentIterator) HasPrevious() (bool, error) {
+	// An element already computed for this direction has not been consumed yet.
+	if it.currentElement != nil && it.currentElement.Delta == -1 {
+		return true, nil
+	}
+
+	elements, err := it.elements()
+	if err != nil {
+		return false, err
+	}
+	index := elements.StartIndex
+	if it.currentIndex != nil {
+		index = *it.currentIndex
+	}
+	index--
+
+	if index < 0 || index >= len(elements.Elements) {
+		return false, nil
+	}
+
+	it.currentIndex = &index
+	it.currentElement = &ElementWithDelta{
+		El:    elements.Elements[index],
+		Delta: -1,
+	}
+	return true, nil
+}
+
+// Previous returns the element computed by the preceding HasPrevious() call.
+// Panics if no such element is pending.
+func (it *HTMLContentIterator) Previous() element.Element {
+	if it.currentElement == nil || it.currentElement.Delta != -1 {
+		panic("Previous() in HTMLContentIterator called without a previous call to HasPrevious()")
+	}
+	el := it.currentElement.El
+	it.currentElement = nil
+	return el
+}
+
+// HasNext reports whether an element follows the current position.
+// When it returns true, the element is stored for the following Next() call.
+func (it *HTMLContentIterator) HasNext() (bool, error) {
+	// An element already computed for this direction has not been consumed yet.
+	if it.currentElement != nil && it.currentElement.Delta == 1 {
+		return true, nil
+	}
+
+	elements, err := it.elements()
+	if err != nil {
+		return false, err
+	}
+	// Start one before StartIndex so that the first element returned is the one at StartIndex.
+	index := elements.StartIndex - 1
+	if it.currentIndex != nil {
+		index = *it.currentIndex
+	}
+	index++
+
+	if index < 0 || index >= len(elements.Elements) {
+		return false, nil
+	}
+
+	it.currentIndex = &index
+	it.currentElement = &ElementWithDelta{
+		El:    elements.Elements[index],
+		Delta: 1,
+	}
+	return true, nil
+}
+
+// Next returns the element computed by the preceding HasNext() call.
+// Panics if no such element is pending.
+func (it *HTMLContentIterator) Next() element.Element {
+	if it.currentElement == nil || it.currentElement.Delta != 1 {
+		panic("Next() in HTMLContentIterator called without a previous call to HasNext()")
+	}
+	el := it.currentElement.El
+	it.currentElement = nil
+	return el
+}
+
+// elements returns the parsed elements of the resource, parsing it on first use.
+func (it *HTMLContentIterator) elements() (*ParsedElements, error) {
+	if it.parsedElements == nil {
+		elements, err := it.parseElements()
+		if err != nil {
+			return nil, err
+		}
+		it.parsedElements = elements
+	}
+	return it.parsedElements, nil
+}
+
+// parseElements reads the resource as a string, parses it as HTML (scripting
+// disabled) and converts the content of its body into a list of elements.
+// When the locator carries a CSS selector that matches a node of the body,
+// that node is used as the start element.
+func (it *HTMLContentIterator) parseElements() (*ParsedElements, error) {
+	raw, rerr := it.resource.ReadAsString()
+	if rerr != nil {
+		return nil, errors.Wrap(rerr, "failed reading HTML string of "+it.resource.Link().Href)
+	}
+
+	document, err := html.ParseWithOptions(
+		strings.NewReader(raw),
+		html.ParseOptionEnableScripting(false),
+	)
+	if err != nil {
+		return nil, errors.Wrap(err, "failed parsing HTML of "+it.resource.Link().Href)
+	}
+
+	body := childOfType(document, atom.Body, true)
+	if body == nil {
+		return nil, errors.New("HTML of " + it.resource.Link().Href + " doesn't have a <body>")
+	}
+
+	contentConverter := HTMLConverter{
+		baseLocator:     it.locator,
+		beforeMaxLength: it.BeforeMaxLength,
+	}
+	if sel := it.locator.Locations.CSSSelector(); sel != "" {
+		c, err := cascadia.Parse(sel)
+		if err != nil {
+			return nil, errors.Wrapf(err, "failed parsing CSS selector \"%s\" of locator for %s", sel, it.locator.Href)
+		}
+		if find := cascadia.Query(body, c); find != nil {
+			contentConverter.startElement = find
+		}
+	}
+
+	// Traverse the document's HTML
+	TraverseNode(&contentConverter, body)
+
+	res := contentConverter.Result()
+	return &res, nil
+}
diff --git a/pkg/content/iterator/html_converter.go b/pkg/content/iterator/html_converter.go
new file mode 100644
index 00000000..d7c92636
--- /dev/null
+++ b/pkg/content/iterator/html_converter.go
@@ -0,0 +1,600 @@
+package iterator
+
+import (
+	"net/url"
+	"strings"
+	"unicode"
+	"unicode/utf8"
+
+	"github.com/readium/go-toolkit/pkg/content/element"
+	iutil "github.com/readium/go-toolkit/pkg/internal/util"
+	"github.com/readium/go-toolkit/pkg/manifest"
+	"github.com/readium/go-toolkit/pkg/util"
+	"golang.org/x/net/html"
+	"golang.org/x/net/html/atom"
+)
+
+// Holds the result of parsing the HTML resource into a list of [element.Element].
+// The [startIndex] will be calculated from the element matched by the base [locator], if possible. Defaults to 0.
+type ParsedElements struct {
+	Elements   []element.Element
+	StartIndex int
+}
+
+// trimText splits [text] into its leading whitespace, trimmed content and
+// trailing whitespace. The leading whitespace is appended (once) to [before]
+// to form Before, the trimmed content becomes Highlight and the trailing
+// whitespace becomes After, in document order.
+func trimText(text string, before *string) manifest.Text {
+	var prefix string
+	if before != nil {
+		prefix = *before
+	}
+
+	// Trim rune-wise so that multi-byte characters are never split, and so
+	// that whitespace-only input yields an empty highlight instead of an
+	// invalid slice expression.
+	withoutLeading := strings.TrimLeftFunc(text, unicode.IsSpace)
+	leading := text[:len(text)-len(withoutLeading)]
+	highlight := strings.TrimRightFunc(withoutLeading, unicode.IsSpace)
+	trailing := withoutLeading[len(highlight):]
+
+	return manifest.Text{
+		Before:    prefix + leading,
+		Highlight: highlight,
+		After:     trailing,
+	}
+}
+
+// onlySpace reports whether every rune of [s] is whitespace (true for "").
+func onlySpace(s string) bool {
+	for _, runeValue := range s {
+		if !unicode.IsSpace(runeValue) {
+			return false
+		}
+	}
+	return true
+}
+
+// getAttr returns the value of the first attribute of [n] whose key is [key],
+// or "" when there is none.
+func getAttr(n *html.Node, key string) string {
+	for _, attr := range n.Attr {
+		if attr.Key == key {
+			return attr.Val
+		}
+	}
+	return ""
+}
+
+// srcRelativeToHref resolves the "src" attribute of [n] against [base].
+// Returns nil when [n] is nil or has no (or an empty) "src" attribute.
+func srcRelativeToHref(n *html.Node, base string) *string {
+	if n == nil {
+		return nil
+	}
+
+	if v := getAttr(n, "src"); v != "" {
+		// NOTE(review): the error of String() is deliberately ignored here;
+		// whatever string it produced is returned as-is.
+		h, _ := util.NewHREF(v, base).String()
+		return &h
+	}
+	return nil
+}
+
+// Get child elements of a certain type, with a maximum depth.
+func childrenOfType(doc *html.Node, typ atom.Atom, depth uint) (children []*html.Node) {
+	var visit func(*html.Node, uint)
+	visit = func(n *html.Node, remaining uint) {
+		if n.Type == html.ElementNode && n.DataAtom == typ {
+			children = append(children, n)
+		}
+		if remaining == 0 {
+			// Depth budget exhausted: do not look at the children.
+			return
+		}
+		for c := n.FirstChild; c != nil; c = c.NextSibling {
+			visit(c, remaining-1)
+		}
+	}
+	visit(doc, depth)
+	return
+}
+
+// Get the first or last element of a certain type, in document order.
+// Returns nil when [doc] is nil or contains no such element.
+func childOfType(doc *html.Node, typ atom.Atom, first bool) *html.Node {
+	if doc == nil {
+		return nil
+	}
+
+	var found *html.Node
+	// visit returns true once the traversal must stop, so that a match for
+	// first == true is not overwritten by a later match in a sibling subtree.
+	var visit func(*html.Node) bool
+	visit = func(n *html.Node) bool {
+		if n.Type == html.ElementNode && n.DataAtom == typ {
+			found = n
+			if first {
+				return true
+			}
+		}
+		for c := n.FirstChild; c != nil; c = c.NextSibling {
+			if visit(c) {
+				return true
+			}
+		}
+		return false
+	}
+	visit(doc)
+	return found
+}
+
+// Everything from this list except "device":
+// https://github.com/jhy/jsoup/blob/0b10d516ed8f907f8fb4acb9a0806137a8988d45/src/main/java/org/jsoup/parser/Tag.java#L243
+var inlineTags = map[atom.Atom]struct{}{
+	atom.Object:   {},
+	atom.Base:     {},
+	atom.Font:     {},
+	atom.Tt:       {},
+	atom.I:        {},
+	atom.B:        {},
+	atom.U:        {},
+	atom.Big:      {},
+	atom.Small:    {},
+	atom.Em:       {},
+	atom.Strong:   {},
+	atom.Dfn:      {},
+	atom.Code:     {},
+	atom.Samp:     {},
+	atom.Kbd:      {},
+	atom.Var:      {},
+	atom.Cite:     {},
+	atom.Abbr:     {},
+	atom.Time:     {},
+	atom.Acronym:  {},
+	atom.Mark:     {},
+	atom.Ruby:     {},
+	atom.Rt:       {},
+	atom.Rp:       {},
+	atom.Rtc:      {},
+	atom.A:        {},
+	atom.Img:      {},
+	atom.Br:       {},
+	atom.Wbr:      {},
+	atom.Map:      {},
+	atom.Q:        {},
+	atom.Sub:      {},
+	atom.Sup:      {},
+	atom.Bdo:      {},
+	atom.Iframe:   {},
+	atom.Embed:    {},
+	atom.Span:     {},
+	atom.Input:    {},
+	atom.Select:   {},
+	atom.Textarea: {},
+	atom.Label:    {},
+	atom.Button:   {},
+	atom.Optgroup: {},
+	atom.Option:   {},
+	atom.Legend:   {},
+	atom.Datalist: {},
+	atom.Keygen:   {},
+	atom.Output:   {},
+	atom.Progress: {},
+	atom.Meter:    {},
+	atom.Area:     {},
+	atom.Param:    {},
+	atom.Source:   {},
+	atom.Track:    {},
+	atom.Summary:  {},
+	atom.Command:  {},
+	atom.Basefont: {},
+	atom.Bgsound:  {},
+	atom.Menuitem: {},
+	atom.Data:     {},
+	atom.Bdi:      {},
+	atom.S:        {},
+	atom.Strike:   {},
+	atom.Nobr:     {},
+	atom.Rb:       {},
+}
+
+// Not inline = is block
+// A nil node is reported as not inline.
+func isInlineTag(n *html.Node) bool {
+	if n == nil {
+		return false
+	}
+	_, ok := inlineTags[n.DataAtom]
+	return ok
+}
+
+// nodeLanguage returns the language declared on [n] or on its closest
+// ancestor, looking at the "lang" attribute first and then at "xml:lang".
+// Returns nil when no node in the chain declares a language.
+func nodeLanguage(n *html.Node) *string {
+	for cur := n; cur != nil; cur = cur.Parent {
+		if l := getAttr(cur, "lang"); l != "" {
+			return &l
+		}
+		// NOTE(review): assumes the HTML parser keeps the prefixed name as the
+		// attribute key for regular (non-foreign) elements — confirm.
+		if l := getAttr(cur, "xml:lang"); l != "" {
+			return &l
+		}
+	}
+	return nil
+}
+
+// From JSoup: https://github.com/jhy/jsoup/blob/1762412a28fa7b08ccf71d93fc4c98dc73086e03/src/main/java/org/jsoup/internal/StringUtil.java#L233
+// Slightly differing definition of what a whitespace character is.
+// Appends [text] to [accum] with every run of whitespace collapsed into a
+// single ' ', dropping zero-width spaces and soft hyphens. When
+// [stripLeading] is set, whitespace before the first kept character is skipped.
+func appendNormalizedWhitespace(accum *strings.Builder, text string, stripLeading bool) {
+	var lastWasWhite, reachedNonWhite bool
+	for _, t := range text {
+		if unicode.IsSpace(t) {
+			if (stripLeading && !reachedNonWhite) || lastWasWhite {
+				continue
+			}
+			accum.WriteRune(' ')
+			lastWasWhite = true
+		} else if t != 8203 && t != 173 { // zero width sp, soft hyphen
+			accum.WriteRune(t)
+			lastWasWhite = false
+			reachedNonWhite = true
+		}
+	}
+}
+
+type NodeVisitor interface {
+	Head(n *html.Node, depth int) // Callback for when a node is first visited.
+	Tail(n *html.Node, depth int) // Callback for when a node is last visited, after all of its descendants have been visited.
+}
+
+// Start a depth-first traverse of the root and all of its descendants.
+// This implementation does not use recursion, so a deep DOM does not risk blowing the stack.
+// From JSoup: https://github.com/jhy/jsoup/blob/1762412a28fa7b08ccf71d93fc4c98dc73086e03/src/main/java/org/jsoup/select/NodeTraversor.java#L20
+// NOTE: Unlike the JSoup implementation, we expect any implementor of NodeVisitor to be read-only, because it simplifies implementation
+func TraverseNode(visitor NodeVisitor, root *html.Node) {
+	node := root
+	depth := 0
+
+	for node != nil {
+		visitor.Head(node, depth) // visit current node
+
+		// DON'T check if removed or replaced
+
+		if node.FirstChild != nil { // descend
+			node = node.FirstChild
+			depth++
+			continue
+		}
+
+		// Leaf reached: close every node that has no further sibling.
+		for node.NextSibling == nil && depth > 0 {
+			visitor.Tail(node, depth) // when no more siblings, ascend
+			node = node.Parent
+			depth--
+		}
+		visitor.Tail(node, depth)
+		if node == root {
+			break
+		}
+		node = node.NextSibling
+	}
+}
+
+// breadcrumbData is a block element on the breadcrumbs stack together with
+// the CSS selector computed for it.
+type breadcrumbData struct {
+	node        *html.Node
+	cssSelector string
+}
+
+// Note that this whole thing is based off of JSoup's NodeVisitor and NodeTraverser classes
+// https://jsoup.org/apidocs/org/jsoup/select/NodeVisitor.html
+// https://jsoup.org/apidocs/org/jsoup/select/NodeTraversor.html
+type HTMLConverter struct {
+	baseLocator     manifest.Locator
+	startElement    *html.Node
+	beforeMaxLength int
+
+	elements   []element.Element
+	startIndex int
+
+	segmentsAcc       []element.TextSegment // Segments accumulated for the current element.
+	textAcc           strings.Builder       // Text since the beginning of the current segment, after coalescing whitespaces.
+	wholeRawTextAcc   *string               // Text content since the beginning of the resource, including whitespaces.
+	elementRawTextAcc string                // Text content since the beginning of the current element, including whitespaces.
+	rawTextAcc        string                // Text content since the beginning of the current segment, including whitespaces.
+	currentLanguage   *string               // Language of the current segment.
+
+	breadcrumbs []breadcrumbData // LIFO stack of the current element's block ancestors.
+}
+
+// Result returns the elements collected so far. The start index is the
+// number of elements when the base locator has a progression of 1 (start
+// from the end of the resource), otherwise the index computed from the start
+// element (0 by default).
+func (c *HTMLConverter) Result() ParsedElements {
+	startIndex := c.startIndex
+	// Compare the pointed-to value: comparing the pointer itself with the
+	// address of a local can never be true.
+	if p := c.baseLocator.Locations.Progression; p != nil && *p == 1.0 {
+		startIndex = len(c.elements)
+	}
+	return ParsedElements{
+		Elements:   c.elements,
+		StartIndex: startIndex,
+	}
+}
+
+// Implements NodeTraversor
+// Head is called when a node is entered. Block elements flush the pending
+// text and are pushed on the breadcrumbs; br flushes the pending text; img,
+// audio and video elements are emitted as image/audio/video elements.
+func (c *HTMLConverter) Head(n *html.Node, depth int) {
+	if n.Type != html.ElementNode {
+		return
+	}
+
+	isBlock := !isInlineTag(n)
+	var cssSelector string
+	if isBlock {
+		// Calculate CSS selector now because we'll definitely need it
+		cssSelector = iutil.CSSSelector(n)
+
+		// Flush text
+		c.flushText()
+
+		// Add blocks to breadcrumbs
+		c.breadcrumbs = append(c.breadcrumbs, breadcrumbData{
+			node:        n,
+			cssSelector: cssSelector,
+		})
+	}
+
+	switch n.DataAtom {
+	case atom.Br:
+		c.flushText()
+	case atom.Img, atom.Audio, atom.Video:
+		c.flushText()
+
+		if !isBlock {
+			// Not computed above for inline elements.
+			cssSelector = iutil.CSSSelector(n)
+		}
+		elementLocator := manifest.Locator{
+			Href:  c.baseLocator.Href,
+			Type:  c.baseLocator.Type,
+			Title: c.baseLocator.Title,
+			Text:  c.baseLocator.Text,
+			Locations: manifest.Locations{
+				OtherLocations: map[string]interface{}{
+					// Stored as a plain string, like the text element and segment locators do.
+					"cssSelector": cssSelector,
+				},
+			},
+		}
+
+		if n.DataAtom == atom.Img {
+			if href := srcRelativeToHref(n, c.baseLocator.Href); href != nil {
+				atlist := []element.Attribute[any]{}
+				alt := getAttr(n, "alt")
+				if alt == "" {
+					// Try fallback to title if no alt
+					alt = getAttr(n, "title")
+				}
+				if alt != "" {
+					atlist = append(atlist, element.NewAttribute(element.AcessibilityLabelAttributeKey, alt))
+				}
+				c.elements = append(c.elements, element.NewImageElement(
+					elementLocator,
+					manifest.Link{
+						Href: *href,
+					},
+					"", // FIXME: Get the caption from figcaption
+					atlist,
+				))
+			}
+		} else if link := c.mediaLink(n); link != nil { // Audio or Video
+			if n.DataAtom == atom.Audio {
+				c.elements = append(c.elements, element.NewAudioElement(
+					elementLocator,
+					*link,
+					[]element.Attribute[any]{},
+				))
+			} else {
+				c.elements = append(c.elements, element.NewVideoElement(
+					elementLocator,
+					*link,
+					[]element.Attribute[any]{},
+				))
+			}
+		}
+	}
+
+	if isBlock {
+		c.flushText()
+	}
+}
+
+// mediaLink returns the link of an audio or video element: its own "src"
+// when present, otherwise the first direct "source" child having a "src",
+// with the remaining ones as alternates. Returns nil when none is found.
+func (c *HTMLConverter) mediaLink(n *html.Node) *manifest.Link {
+	if href := srcRelativeToHref(n, c.baseLocator.Href); href != nil {
+		return &manifest.Link{Href: *href}
+	}
+
+	sourceNodes := childrenOfType(n, atom.Source, 1)
+	// Zero length with preallocated capacity: the links are appended below,
+	// so a non-zero length would leave empty links at the front.
+	sources := make([]manifest.Link, 0, len(sourceNodes))
+	for _, source := range sourceNodes {
+		src := srcRelativeToHref(source, c.baseLocator.Href)
+		if src == nil {
+			continue
+		}
+		l := manifest.Link{
+			Href: *src,
+		}
+		if typ := getAttr(source, "type"); typ != "" {
+			l.Type = typ
+		}
+		sources = append(sources, l)
+	}
+	if len(sources) == 0 {
+		return nil
+	}
+
+	link := &sources[0]
+	if len(sources) > 1 {
+		link.Alternates = sources[1:]
+	}
+	return link
+}
+
+// sameLanguage reports whether both languages are unset or hold the same value.
+func sameLanguage(a, b *string) bool {
+	if a == nil || b == nil {
+		return a == b
+	}
+	return *a == *b
+}
+
+// Implements NodeTraversor
+// Tail is called when a node is left. Non-blank text nodes are accumulated
+// into the current segment (a new segment is started when the language
+// changes); block elements flush the pending text and are popped from the
+// breadcrumbs.
+func (c *HTMLConverter) Tail(n *html.Node, depth int) {
+	switch {
+	case n.Type == html.TextNode && !onlySpace(n.Data):
+		language := nodeLanguage(n)
+		// Compare by value: nodeLanguage returns a fresh pointer on every call.
+		if !sameLanguage(c.currentLanguage, language) {
+			c.flushSegment()
+			c.currentLanguage = language
+		}
+
+		c.rawTextAcc += n.Data
+
+		// Avoid doubling the space when the accumulated text already ends with one.
+		acc := c.textAcc.String()
+		stripLeading := len(acc) > 0 && acc[len(acc)-1] == ' '
+		appendNormalizedWhitespace(&c.textAcc, n.Data, stripLeading)
+	case n.Type == html.ElementNode && !isInlineTag(n): // Is block
+		if len(c.breadcrumbs) > 0 && c.breadcrumbs[len(c.breadcrumbs)-1].node != n {
+			// TODO, should we panic? Kotlin does assert(breadcrumbs.last() == node) which throws
+			panic("HTMLConverter: breadcrumbs mismatch")
+		}
+		c.flushText()
+		if len(c.breadcrumbs) > 0 {
+			c.breadcrumbs = c.breadcrumbs[:len(c.breadcrumbs)-1]
+		}
+	}
+}
+
+// textRoleOf returns the role of the text contained in the block [el]:
+// footnote, heading (h1-h6), quote (blockquote/q) or body by default.
+func textRoleOf(el *html.Node) element.TextRole {
+	for _, at := range el.Attr {
+		if at.Val != "footnote" {
+			continue
+		}
+		// NOTE(review): the prefixed key is accepted as well, assuming the
+		// HTML parser keeps "epub:type" as the attribute key — confirm.
+		if (at.Namespace == "http://www.idpf.org/2007/ops" && at.Key == "type") ||
+			(at.Namespace == "" && at.Key == "epub:type") {
+			return element.Footnote{}
+		}
+	}
+
+	switch el.DataAtom {
+	case atom.H1:
+		return element.Heading{Level: 1}
+	case atom.H2:
+		return element.Heading{Level: 2}
+	case atom.H3:
+		return element.Heading{Level: 3}
+	case atom.H4:
+		return element.Heading{Level: 4}
+	case atom.H5:
+		return element.Heading{Level: 5}
+	case atom.H6:
+		return element.Heading{Level: 6}
+	case atom.Blockquote, atom.Q:
+		quote := element.Quote{}
+		for _, at := range el.Attr {
+			if at.Key == "cite" {
+				// An unparsable URL leaves the reference URL unset.
+				quote.ReferenceURL, _ = url.Parse(at.Val)
+			}
+			if at.Key == "title" {
+				quote.ReferenceTitle = at.Val
+			}
+		}
+		return quote
+	}
+	return element.Body{}
+}
+
+// flushText closes the current segment and, when segments were accumulated,
+// emits them as one text element located at the innermost block ancestor.
+func (c *HTMLConverter) flushText() {
+	c.flushSegment()
+
+	var crumb *breadcrumbData
+	if len(c.breadcrumbs) > 0 {
+		crumb = &c.breadcrumbs[len(c.breadcrumbs)-1]
+	}
+
+	// The first time the start element is the innermost block, remember the
+	// index of the next element to be added.
+	if c.startIndex == 0 && c.startElement != nil && crumb != nil && crumb.node == c.startElement {
+		c.startIndex = len(c.elements)
+	}
+
+	if len(c.segmentsAcc) == 0 {
+		// Raw text that produced no segment must not leak into the next element.
+		c.elementRawTextAcc = ""
+		return
+	}
+
+	// Trim the end of the last segment's text to get a cleaner output for the TextElement.
+	// Only whitespaces between the segments are meaningful.
+	last := &c.segmentsAcc[len(c.segmentsAcc)-1]
+	last.Text = strings.TrimRightFunc(last.Text, unicode.IsSpace)
+
+	var bestRole element.TextRole = element.Body{}
+	locations := manifest.Locations{
+		OtherLocations: map[string]interface{}{},
+	}
+	if crumb != nil {
+		bestRole = textRoleOf(crumb.node)
+		if crumb.cssSelector != "" {
+			locations.OtherLocations["cssSelector"] = crumb.cssSelector
+		}
+	}
+
+	before := &c.segmentsAcc[0].Locator.Text.Before
+	c.elements = append(c.elements, element.NewTextElement(
+		manifest.Locator{
+			Href:      c.baseLocator.Href,
+			Type:      c.baseLocator.Type,
+			Title:     c.baseLocator.Title,
+			Locations: locations,
+			Text:      trimText(c.elementRawTextAcc, before),
+		},
+		bestRole,
+		c.segmentsAcc,
+		nil,
+	))
+	c.elementRawTextAcc = ""
+	// Fresh slice: the previous one now belongs to the emitted element.
+	c.segmentsAcc = []element.TextSegment{}
+}
+
+// tailOnRuneBoundary returns at most the last [maxBytes] bytes of [s],
+// shortened as needed so that the result starts on a rune boundary.
+func tailOnRuneBoundary(s string, maxBytes int) string {
+	if maxBytes <= 0 {
+		return ""
+	}
+	if maxBytes >= len(s) {
+		return s
+	}
+	start := len(s) - maxBytes
+	for start < len(s) && !utf8.RuneStart(s[start]) {
+		start++
+	}
+	return s[start:]
+}
+
+// flushSegment turns the text accumulated since the last flush into a text
+// segment (if any), then moves the raw text into the element-wide and
+// resource-wide accumulators and resets the segment accumulators.
+func (c *HTMLConverter) flushSegment() {
+	text := c.textAcc.String()
+
+	if len(text) > 0 {
+		if len(c.segmentsAcc) == 0 {
+			// First segment of the element: drop the leading whitespace and
+			// keep at most one trailing whitespace rune.
+			leftTrimmed := strings.TrimLeftFunc(text, unicode.IsSpace)
+
+			var whitespaceSuffix string
+			if r, _ := utf8.DecodeLastRuneInString(leftTrimmed); unicode.IsSpace(r) {
+				whitespaceSuffix = string(r)
+			}
+
+			text = strings.TrimSpace(text) + whitespaceSuffix
+		}
+
+		var before *string
+		if c.wholeRawTextAcc != nil {
+			last := tailOnRuneBoundary(*c.wholeRawTextAcc, c.beforeMaxLength)
+			before = &last
+		}
+		seg := element.TextSegment{
+			Locator: manifest.Locator{
+				Href:  c.baseLocator.Href,
+				Type:  c.baseLocator.Type,
+				Title: c.baseLocator.Title,
+				Locations: manifest.Locations{
+					// TODO fix: needs to use baseLocator locations too!
+					OtherLocations: map[string]interface{}{},
+				},
+				Text: trimText(c.rawTextAcc, before),
+			},
+			Text: text,
+		}
+		if len(c.breadcrumbs) > 0 {
+			if lastCrumb := c.breadcrumbs[len(c.breadcrumbs)-1]; lastCrumb.cssSelector != "" {
+				seg.Locator.Locations.OtherLocations["cssSelector"] = lastCrumb.cssSelector
+			}
+		}
+		if c.currentLanguage != nil {
+			// NOTE(review): the attribute value is the *string itself, not the
+			// string it points to — confirm this is what consumers expect.
+			seg.AttributesHolder = element.NewAttributesHolder([]element.Attribute[any]{
+				element.NewAttribute(element.LanguageAttributeKey, c.currentLanguage),
+			})
+		}
+		c.segmentsAcc = append(c.segmentsAcc, seg)
+	}
+
+	if c.rawTextAcc != "" {
+		if c.wholeRawTextAcc != nil {
+			(*c.wholeRawTextAcc) += c.rawTextAcc
+		} else {
+			ns := strings.Clone(c.rawTextAcc)
+			c.wholeRawTextAcc = &ns
+		}
+		// Feed the element-wide accumulator read by flushText.
+		c.elementRawTextAcc += c.rawTextAcc
+	}
+	c.rawTextAcc = ""
+	c.textAcc.Reset()
+}
diff --git a/pkg/content/iterator/iterator.go b/pkg/content/iterator/iterator.go
new file mode 100644
index 00000000..0f1eb6fd
--- /dev/null
+++ b/pkg/content/iterator/iterator.go
@@ -0,0 +1,73 @@
+package iterator
+
+import "github.com/readium/go-toolkit/pkg/content/element"
+
+// Iterates through a list of [Element] items asynchronously.
+// [hasNext] and [hasPrevious] refer to the last element computed by a previous call to any of both methods.
+// TODO: It's based on a kotlin iterator, maybe we can make this more of something for go?
+type Iterator interface {
+	HasNext() (bool, error)     // Returns true if the iterator has a next element
+	Next() element.Element      // Retrieves the element computed by a preceding call to [hasNext]. Panics if [hasNext] was not invoked.
+	HasPrevious() (bool, error) // Returns true if the iterator has a previous element
+	Previous() element.Element  // Retrieves the element computed by a preceding call to [hasPrevious]. Panics if [hasPrevious] was not invoked.
+}
+
+// Moves to the next item and returns it, or nil if we reached the end.
+func ItNextOrNil(it Iterator) (element.Element, error) {
+	hasNext, err := it.HasNext()
+	if err != nil || !hasNext {
+		// err is nil here when the end was simply reached.
+		return nil, err
+	}
+	return it.Next(), nil
+}
+
+// Steps back to the previous item and returns it, or nil once the beginning is reached.
+func ItPreviousOrNil(it Iterator) (element.Element, error) {
+	hasPrevious, err := it.HasPrevious()
+	if err != nil || !hasPrevious {
+		// err is nil here when the beginning was simply reached.
+		return nil, err
+	}
+	return it.Previous(), nil
+}
+
+// Pairs the [Iterator] of a resource with the [index] of that resource in the reading order.
+type IndexedIterator struct {
+	index    int
+	iterator Iterator
+}
+
+// NextContentIn returns the next element when moving in [direction], or nil
+// when the wrapped iterator has nothing left that way.
+func (it *IndexedIterator) NextContentIn(direction Direction) (element.Element, error) {
+	if direction != Foward {
+		return ItPreviousOrNil(it.iterator)
+	}
+	return ItNextOrNil(it.iterator)
+}
+
+// Direction of a move through the content.
+type Direction int8
+
+const (
+	Foward   Direction = 1
+	Backward Direction = -1
+)
+
+// Delta is the direction expressed as an index offset (a plain cast).
+func (d Direction) Delta() int {
+	return int(d)
+}
+
+// An [Element] obtained through [hasPrevious] or [hasNext], together with the direction of that move.
+type ElementInDirection struct {
+	El  element.Element
+	Dir Direction
+}
+
+// [Element] loaded with [hasPrevious] or [hasNext], associated with the move delta.
+type ElementWithDelta struct {
+	El    element.Element
+	Delta int
+}
diff --git a/pkg/content/iterator/publication.go b/pkg/content/iterator/publication.go
new file mode 100644
index 00000000..035bc277
--- /dev/null
+++ b/pkg/content/iterator/publication.go
@@ -0,0 +1,156 @@
+package iterator
+
+import (
+	"github.com/readium/go-toolkit/pkg/content/element"
+	"github.com/readium/go-toolkit/pkg/fetcher"
+	"github.com/readium/go-toolkit/pkg/manifest"
+)
+
+// Builds an [Iterator] for a resource and locator, or returns nil when the factory does not handle it.
+type ResourceContentIteratorFactory = func(fetcher.Resource, manifest.Locator) Iterator
+
+// Iterates over the content of a whole publication by chaining the iterators
+// of the resources of its reading order.
+type PublicationContentIterator struct {
+	manifest                         manifest.Manifest
+	fetcher                          fetcher.Fetcher
+	startLocator                     *manifest.Locator
+	resourceContentIteratorFactories []ResourceContentIteratorFactory
+
+	_currentIterator *IndexedIterator    // Lazily created, see currentIterator().
+	currentElement   *ElementInDirection // Result of the last HasPrevious()/HasNext() call.
+}
+
+// TODO maybe wrap manifest/fetcher in something that doesn't depend on pub package
+func NewPublicationContent(manifest manifest.Manifest, fetcher fetcher.Fetcher, startLocator *manifest.Locator, resourceContentIteratorFactories []ResourceContentIteratorFactory) *PublicationContentIterator {
+	pci := new(PublicationContentIterator)
+	pci.manifest = manifest
+	pci.fetcher = fetcher
+	pci.startLocator = startLocator
+	pci.resourceContentIteratorFactories = resourceContentIteratorFactories
+	return pci
+}
+
+// HasPrevious moves backward and reports whether an element was found;
+// the element is then available through Previous().
+func (it *PublicationContentIterator) HasPrevious() (bool, error) {
+	found, err := it.nextIn(Backward)
+	if err != nil {
+		return false, err
+	}
+	it.currentElement = found
+	return found != nil, nil
+}
+
+// Previous returns the element found by the last HasPrevious() call.
+func (it *PublicationContentIterator) Previous() element.Element {
+	if cur := it.currentElement; cur != nil && cur.Dir == Backward {
+		return cur.El
+	}
+	panic("Previous() in PublicationContentIterator called without successful call to HasPrevious() first") // TODO should this be a panic?
+}
+
+// HasNext moves forward and reports whether an element was found;
+// the element is then available through Next().
+func (it *PublicationContentIterator) HasNext() (bool, error) {
+	found, err := it.nextIn(Foward)
+	if err != nil {
+		return false, err
+	}
+	it.currentElement = found
+	return found != nil, nil
+}
+
+// Next returns the element found by the last HasNext() call.
+func (it *PublicationContentIterator) Next() element.Element {
+	if cur := it.currentElement; cur != nil && cur.Dir == Foward {
+		return cur.El
+	}
+	panic("Next() in PublicationContentIterator called without successful call to HasNext() first") // TODO should this be a panic?
+}
+
+// nextIn returns the next element in [direction], switching to the iterator
+// of the neighbouring resources whenever the current one is exhausted.
+// Returns nil when no iterator has anything left in that direction.
+func (it *PublicationContentIterator) nextIn(direction Direction) (*ElementInDirection, error) {
+	for {
+		iterator := it.currentIterator()
+		if iterator == nil {
+			return nil, nil
+		}
+
+		content, err := iterator.NextContentIn(direction)
+		if err != nil {
+			return nil, err
+		}
+		if content != nil {
+			return &ElementInDirection{
+				El:  content,
+				Dir: direction,
+			}, nil
+		}
+
+		// Current resource exhausted: continue with the next one, if any.
+		following := it.nextIteratorIn(direction, iterator.index)
+		if following == nil {
+			return nil, nil
+		}
+		it._currentIterator = following
+	}
+}
+
+// Gives the [Iterator] of the current [Resource] in the reading order, creating the initial one on first use.
+func (it *PublicationContentIterator) currentIterator() *IndexedIterator {
+	if it._currentIterator != nil {
+		return it._currentIterator
+	}
+	it._currentIterator = it.initialIterator()
+	return it._currentIterator
+}
+
+// Returns the first iterator starting at [startLocator] or the beginning of the publication.
+// Returns nil when the reading order is empty or when no resource from the
+// starting one onwards has a content iterator.
+func (it *PublicationContentIterator) initialIterator() *IndexedIterator {
+	if len(it.manifest.ReadingOrder) == 0 {
+		// Nothing to iterate over; indexing the reading order would panic.
+		return nil
+	}
+
+	var index int
+	var ii *IndexedIterator
+	if it.startLocator != nil {
+		// When the href is not found, start from the first resource.
+		if i := it.manifest.ReadingOrder.IndexOfFirstWithHref(it.startLocator.Href); i > 0 {
+			index = i
+		}
+		ii = it.loadIteratorAt(index, *it.startLocator)
+	} else {
+		ii = it.loadIteratorAtProgression(index, 0)
+	}
+
+	if ii == nil {
+		return it.nextIteratorIn(Foward, index)
+	}
+	return ii
+}
+
+// Returns the next resource iterator in the given [direction], starting from [fromIndex]
+// Resources for which no iterator can be created are skipped. Returns nil
+// once the bounds of the reading order are reached.
+func (it *PublicationContentIterator) nextIteratorIn(direction Direction, fromIndex int) *IndexedIterator {
+	// Moving backward enters the resource from its end.
+	var progression float64
+	if direction == Backward {
+		progression = 1
+	}
+
+	step := direction.Delta()
+	for index := fromIndex + step; index >= 0 && index < len(it.manifest.ReadingOrder); index += step {
+		if ii := it.loadIteratorAtProgression(index, progression); ii != nil {
+			return ii
+		}
+	}
+	return nil
+}
+
+// Loads the iterator at the given [index] in the reading order.
+// The [locator] will be used to compute the starting [Locator] for the iterator.
+// Returns nil when [index] is out of bounds or when no factory handles the
+// resource, in which case the resource is closed.
+func (it *PublicationContentIterator) loadIteratorAt(index int, locator manifest.Locator) *IndexedIterator {
+	if index < 0 || index >= len(it.manifest.ReadingOrder) {
+		return nil
+	}
+	link := it.manifest.ReadingOrder[index]
+	resource := it.fetcher.Get(link)
+
+	for _, factory := range it.resourceContentIteratorFactories {
+		if res := factory(resource, locator); res != nil {
+			return &IndexedIterator{index, res}
+		}
+	}
+
+	// No iterator took the resource: release it instead of leaking it.
+	resource.Close()
+	return nil
+}
+
+// Loads the iterator at the given [index] in the reading order.
+// The [progression] will be used to build a locator and call [loadIteratorAt].
+func (it *PublicationContentIterator) loadIteratorAtProgression(index int, progression float64) *IndexedIterator { + link := it.manifest.ReadingOrder[index] + locator := it.manifest.LocatorFromLink(link) + if locator == nil { + return nil + } + locator.Locations.Progression = &progression + return it.loadIteratorAt(index, *locator) +} diff --git a/pkg/fetcher/fetcher_archive.go b/pkg/fetcher/fetcher_archive.go index c84147b1..7df687b9 100644 --- a/pkg/fetcher/fetcher_archive.go +++ b/pkg/fetcher/fetcher_archive.go @@ -36,16 +36,6 @@ func (f *ArchiveFetcher) Links() (manifest.LinkList, error) { link.Type = mt.String() } } - cl := af.CompressedLength() - if cl == 0 { - cl = af.Length() - } - link.Properties.Add(manifest.Properties{ - "https://readium.org/webpub-manifest/properties#archive": manifest.Properties{ - "entryLength": cl, - "isEntryCompressed": af.CompressedLength() > 0, - }, - }) links = append(links, link) } return links, nil @@ -57,10 +47,25 @@ func (f *ArchiveFetcher) Get(link manifest.Link) Resource { if err != nil { return NewFailureResource(link, NotFound(err)) } - return &entryResource{ + + // Compute archive properties + cl := entry.CompressedLength() + if cl == 0 { + cl = entry.Length() + } + + er := &entryResource{ link: link, entry: entry, + properties: manifest.Properties{ + "https://readium.org/webpub-manifest/properties#archive": map[string]interface{}{ + "entryLength": cl, + "isEntryCompressed": entry.CompressedLength() > 0, + }, + }, } + + return er } // Close implements Fetcher @@ -90,8 +95,9 @@ func NewArchiveFetcherFromPathWithFactory(path string, factory archive.ArchiveFa // Resource from archive entry type entryResource struct { - link manifest.Link - entry archive.Entry + link manifest.Link + entry archive.Entry + properties manifest.Properties } // File implements Resource @@ -104,21 +110,16 @@ func (r *entryResource) Close() { // Nothing needs to be done at the moment } +// Link implements Resource func (r *entryResource) Link() 
manifest.Link { - cl := r.entry.CompressedLength() - if cl == 0 { - cl = r.entry.Length() - } - r.link.Properties.Add(manifest.Properties{ - "https://readium.org/webpub-manifest/properties#archive": manifest.Properties{ - "entryLength": cl, - "isEntryCompressed": r.entry.CompressedLength() > 0, - }, - }) - return r.link } +// Properties implements Resource +func (r *entryResource) Properties() manifest.Properties { + return r.properties +} + // Read implements Resource func (r *entryResource) Read(start int64, end int64) ([]byte, *ResourceError) { data, err := r.entry.Read(start, end) diff --git a/pkg/fetcher/fetcher_archive_test.go b/pkg/fetcher/fetcher_archive_test.go index 72198030..e3024df5 100644 --- a/pkg/fetcher/fetcher_archive_test.go +++ b/pkg/fetcher/fetcher_archive_test.go @@ -15,20 +15,30 @@ func withArchiveFetcher(t *testing.T, callback func(a *ArchiveFetcher)) { } func TestArchiveFetcherLinks(t *testing.T) { - makeTestLink := func(href string, typ string, entryLength uint64, isCompressed bool) manifest.Link { - return manifest.Link{ + makeTestLink := func(href string, typ string, entryLength uint64, isCompressed bool) struct { + manifest.Link + manifest.Properties + } { + l := manifest.Link{ Href: href, Type: typ, - Properties: manifest.Properties{ - "https://readium.org/webpub-manifest/properties#archive": manifest.Properties{ - "entryLength": entryLength, - "isEntryCompressed": isCompressed, - }, + } + p := manifest.Properties{ + "https://readium.org/webpub-manifest/properties#archive": map[string]interface{}{ + "entryLength": entryLength, + "isEntryCompressed": isCompressed, }, } + return struct { + manifest.Link + manifest.Properties + }{l, p} } - mustContain := manifest.LinkList{ + mustContain := []struct { + manifest.Link + manifest.Properties + }{ makeTestLink("/mimetype", "", 20, false), makeTestLink("/EPUB/cover.xhtml", "application/xhtml+xml", 259, true), makeTestLink("/EPUB/css/epub.css", "text/css", 595, true), @@ -45,7 +55,12 @@ func 
TestArchiveFetcherLinks(t *testing.T) { links, err := a.Links() assert.Nil(t, err) - assert.ElementsMatch(t, mustContain, links) + mustLinks := make([]manifest.Link, len(mustContain)) + for i, l := range mustContain { + assert.Equal(t, l.Properties, a.Get(l.Link).Properties()) + mustLinks[i] = l.Link + } + assert.ElementsMatch(t, mustLinks, links) }) } @@ -128,25 +143,10 @@ func TestArchiveFetcherAddsProperties(t *testing.T) { withArchiveFetcher(t, func(a *ArchiveFetcher) { resource := a.Get(manifest.Link{Href: "/EPUB/css/epub.css"}) assert.Equal(t, manifest.Properties{ - "https://readium.org/webpub-manifest/properties#archive": manifest.Properties{ - "entryLength": uint64(595), - "isEntryCompressed": true, - }, - }, resource.Link().Properties) - }) -} - -func TestArchiveFetcherOriginalPropertiesKept(t *testing.T) { - withArchiveFetcher(t, func(a *ArchiveFetcher) { - resource := a.Get(manifest.Link{Href: "/EPUB/css/epub.css", Properties: manifest.Properties{ - "other": "property", - }}) - assert.Equal(t, manifest.Properties{ - "other": "property", - "https://readium.org/webpub-manifest/properties#archive": manifest.Properties{ + "https://readium.org/webpub-manifest/properties#archive": map[string]interface{}{ "entryLength": uint64(595), "isEntryCompressed": true, }, - }, resource.Link().Properties) + }, resource.Properties()) }) } diff --git a/pkg/fetcher/fetcher_file.go b/pkg/fetcher/fetcher_file.go index 431052a8..6c57339c 100644 --- a/pkg/fetcher/fetcher_file.go +++ b/pkg/fetcher/fetcher_file.go @@ -128,6 +128,10 @@ func (r *FileResource) Link() manifest.Link { return r.link } +func (r *FileResource) Properties() manifest.Properties { + return manifest.Properties{} +} + // Close implements Resource func (r *FileResource) Close() { if r.file != nil { diff --git a/pkg/fetcher/resource.go b/pkg/fetcher/resource.go index f4a5d540..aba81a11 100644 --- a/pkg/fetcher/resource.go +++ b/pkg/fetcher/resource.go @@ -38,6 +38,10 @@ type Resource interface { // It might be 
modified by the [Resource] to include additional metadata, e.g. the `Content-Type` HTTP header in [Link.Type]. Link() manifest.Link + // Returns the properties associated with the resource. + // This is opened for extensions. + Properties() manifest.Properties + // Returns data length from metadata if available, or calculated from reading the bytes otherwise. // This value must be treated as a hint, as it might not reflect the actual bytes length. To get the real length, you need to read the whole resource. Length() (int64, *ResourceError) @@ -265,6 +269,10 @@ func (r FailureResource) Link() manifest.Link { return r.link } +func (r FailureResource) Properties() manifest.Properties { + return manifest.Properties{} +} + // Length implements Resource func (r FailureResource) Length() (int64, *ResourceError) { return 0, r.ex @@ -323,6 +331,10 @@ func (r ProxyResource) Link() manifest.Link { return r.Res.Link() } +func (r ProxyResource) Properties() manifest.Properties { + return r.Res.Properties() +} + // Length implements Resource func (r ProxyResource) Length() (int64, *ResourceError) { return r.Res.Length() diff --git a/pkg/fetcher/resource_bytes.go b/pkg/fetcher/resource_bytes.go index f4e896ba..ddbfd59b 100644 --- a/pkg/fetcher/resource_bytes.go +++ b/pkg/fetcher/resource_bytes.go @@ -29,6 +29,11 @@ func (r *BytesResource) Link() manifest.Link { return r.link } +// Properties implements Resource +func (r *BytesResource) Properties() manifest.Properties { + return manifest.Properties{} +} + // Length implements Resource func (r *BytesResource) Length() (int64, *ResourceError) { bin, err := r.Read(0, 0) @@ -63,7 +68,7 @@ func (r *BytesResource) Read(start int64, end int64) ([]byte, *ResourceError) { end = length } - return r._bytes[start:end], nil + return r._bytes[start : end+1], nil } // Stream implements Resource @@ -79,7 +84,7 @@ func (r *BytesResource) Stream(w io.Writer, start int64, end int64) (int64, *Res if start == 0 && end == 0 { buff = 
bytes.NewBuffer(r._bytes) } else { - buff = bytes.NewBuffer(r._bytes[start:end]) + buff = bytes.NewBuffer(r._bytes[start : end+1]) } n, err := io.Copy(w, buff) if err != nil { diff --git a/pkg/internal/extensions/generics.go b/pkg/internal/extensions/generics.go index 09150717..0ce61b8f 100644 --- a/pkg/internal/extensions/generics.go +++ b/pkg/internal/extensions/generics.go @@ -1,5 +1,7 @@ package extensions +import "encoding/json" + func Pointer[T any](val T) *T { return &val } @@ -47,3 +49,34 @@ func AddToSet(s []string, e string) []string { } return s } + +func DeduplicateAndMarshalJSON[T any](s []T) ([]json.RawMessage, error) { + if len(s) == 0 { + // Shortcut if slice is empty + return []json.RawMessage{}, nil + } + if len(s) == 1 { + // Shortcut if only one element in slice + bin, err := json.Marshal(s[0]) + if err != nil { + return nil, err + } + return []json.RawMessage{bin}, nil + } + output := make([]json.RawMessage, 0, len(s)) + seen := make(map[string]struct{}, len(s)) + for _, v := range s { + bin, err := json.Marshal(v) + if err != nil { + return nil, err + } + str := string(bin) + + if _, ok := seen[str]; ok { + continue + } + seen[str] = struct{}{} + output = append(output, bin) + } + return output, nil +} diff --git a/pkg/internal/util/css.go b/pkg/internal/util/css.go new file mode 100644 index 00000000..e665f313 --- /dev/null +++ b/pkg/internal/util/css.go @@ -0,0 +1,122 @@ +package util + +import ( + "fmt" + "strconv" + "strings" + + "github.com/agext/regexp" + "github.com/andybalholm/cascadia" + "github.com/pkg/errors" + "golang.org/x/net/html" + "golang.org/x/net/html/atom" +) + +func escapeChar(r rune) string { + return fmt.Sprintf("\\%x ", int(r)) +} + +var nonIdentifier = regexp.MustCompile("[^a-zA-Z0-9_-]") +var extraSpace = regexp.MustCompile("\\s+") + +// Note - this is a rudimentary implementation +func escapeCSSIdentifier(input string) string { + if len(input) == 0 { + return "" + } + + // Matches CSS non-identifier characters + 
input = nonIdentifier.ReplaceAllStringFunc(input, func(match string) string { + return escapeChar([]rune(match)[0]) + }) + + // If identifier starts with a digit, hyphen + digit or two hyphens, escape it + firstChar := []rune(input)[0] + if firstChar == '-' { + if len(input) > 1 && (input[1] == '-' || (input[1] >= '0' && input[1] <= '9')) { + input = escapeChar(firstChar) + input[1:] + } + } else if firstChar >= '0' && firstChar <= '9' { + input = escapeChar(firstChar) + input[1:] + } + + return input +} + +func getAttr(n *html.Node, key string) string { + for _, attr := range n.Attr { + if attr.Key == key { + return attr.Val + } + } + return "" +} + +// Get a CSS selector that will uniquely select a provided HTML element +// If the element has an ID, returns #id; +// otherwise returns the parent (if any) CSS selector, followed by '>' +// followed by a unique selector for the element (tag.class.class:nth-child(n)). +// Logic copied from JSoup: https://github.com/jhy/jsoup/blob/0b10d516ed8f907f8fb4acb9a0806137a8988d45/src/main/java/org/jsoup/nodes/Element.java#L829 +func CSSSelector(n *html.Node) string { + if n == nil || n.Type != html.ElementNode { + return "" + } + + id := getAttr(n, "id") + if id != "" { + // We're making the big assumption that ID is unique, as would be in good HTML + // TODO investigate if we can assume this in all EPUBs + return "#" + escapeCSSIdentifier(id) + } + + var selector strings.Builder + selector.WriteString( + // Escape tagname + escapeCSSIdentifier(n.Data), + ) + + /* + NOT IMPLEMENTED + Translate HTML namespace ns:tag to CSS namespace syntax ns|tag + // escapeCSSIdentifier(n.Namespace) + "|" + the tag name + */ + + // Add CSS classes to selector + classNames := extraSpace.Split(getAttr(n, "class"), -1) + for _, className := range classNames { + if className == "" { + continue + } + selector.WriteRune('.') + selector.WriteString(escapeCSSIdentifier(className)) + } + + if n.Parent == nil { + // No parent, we're done + return 
selector.String() + } + + if n.Parent.Type == html.ElementNode && n.Parent.DataAtom == atom.Html { + // Parent is the root element, we're done + return selector.String() + } + + s, err := cascadia.Parse(selector.String()) + if err != nil { + panic(errors.Wrap(err, "failed parsing generated CSS selector")) + } + if nodes := cascadia.QueryAll(n.Parent, s); len(nodes) > 1 { + // Figure out the index of this node among its siblings + idx := 1 + for ps := n.PrevSibling; ps != nil; ps = ps.PrevSibling { + if ps.Type == html.ElementNode { + idx++ + } + } + selector.WriteString(":nth-child(") + selector.WriteString(strconv.Itoa(idx)) + selector.WriteRune(')') + } + + return CSSSelector(n.Parent) + " > " + selector.String() +} diff --git a/pkg/internal/util/css_test.go b/pkg/internal/util/css_test.go new file mode 100644 index 00000000..fd963529 --- /dev/null +++ b/pkg/internal/util/css_test.go @@ -0,0 +1,53 @@ +package util + +import ( + "strings" + "testing" + + "github.com/andybalholm/cascadia" + "github.com/stretchr/testify/assert" + "golang.org/x/net/html" +) + +const testDoc = ` + + + + Section IV: FAIRY STORIES—MODERN FANTASTIC TALES + + + +
+
171
+

INTRODUCTORY

+ +

The difficulties of classification are very apparent here, and once more it must be noted that illustrative and practical purposes rather than logical ones are served by the arrangement adopted. The modern fanciful story is here placed next to the real folk story instead of after all the groups of folk products. The Hebrew stories at the beginning belong quite as well, perhaps even better, in Section V, while the stories at the end of Section VI shade off into the more modern types of short tales.

+

The child's natural literature. The world has lost certain secrets as the price of an advancing civilization.

+

Without discussing the limits of the culture-epoch theory of human development as a complete guide in education, it is clear that the young child passes through a period when his mind looks out upon the world in a manner analogous to that of the folk as expressed in their literature.

+
+ +` + +func TestCSSSelector(t *testing.T) { + doc, err := html.Parse(strings.NewReader(testDoc)) + if !assert.NoError(t, err) { + return + } + + qf := func(query string) string { + n := cascadia.Query(doc, cascadia.MustCompile(query)) + if !assert.NotNil(t, n) { + t.FailNow() + } + return CSSSelector(n) + } + + assert.Equal(t, qf("body"), "body") + assert.Equal(t, qf("#pgepubid00498"), "#pgepubid00498") + assert.Equal(t, qf("#Page_171"), "#Page_171") + assert.Equal(t, qf("#pgepubid00498 > h3"), "#pgepubid00498 > h3") + assert.Equal(t, qf("#pgepubid00498 > div.center"), "#pgepubid00498 > div.center") + assert.Equal(t, qf("#pgepubid00498 > p:nth-child(3)"), "#pgepubid00498 > p:nth-child(3)") + assert.Equal(t, qf("#pgepubid00498 > p:nth-child(5)"), "#pgepubid00498 > p:nth-child(5)") + assert.Equal(t, qf("#pgepubid00498 > p:nth-child(4) > span"), "#pgepubid00498 > p:nth-child(4) > span") +} diff --git a/pkg/manifest/contributor.go b/pkg/manifest/contributor.go index 1902990d..3e78852b 100644 --- a/pkg/manifest/contributor.go +++ b/pkg/manifest/contributor.go @@ -4,6 +4,7 @@ import ( "encoding/json" "github.com/pkg/errors" + "github.com/readium/go-toolkit/pkg/internal/extensions" ) // Contributor @@ -153,8 +154,14 @@ func (c Contributors) MarshalJSON() ([]byte, error) { if len(c) == 0 { return []byte("null"), nil } - if len(c) == 1 { - return json.Marshal(c[0]) + + // De-duplicate contributors before marshalling + marshalled, err := extensions.DeduplicateAndMarshalJSON([]Contributor(c)) + if err != nil { + return nil, err + } + if len(marshalled) == 1 { + return json.Marshal(marshalled[0]) } - return json.Marshal([]Contributor(c)) + return json.Marshal(marshalled) } diff --git a/pkg/manifest/link.go b/pkg/manifest/link.go index a2921450..549f8240 100644 --- a/pkg/manifest/link.go +++ b/pkg/manifest/link.go @@ -181,6 +181,48 @@ func (l *Link) UnmarshalJSON(b []byte) error { return nil } +func (l Link) MarshalJSON() ([]byte, error) { + res := make(map[string]interface{}) 
+ res["href"] = l.Href + if l.Type != "" { + res["type"] = l.Type + } + if l.Templated { + res["templated"] = l.Templated + } + if l.Title != "" { + res["title"] = l.Title + } + if len(l.Rels) > 0 { + res["rel"] = l.Rels + } + if len(l.Properties) > 0 { + res["properties"] = l.Properties + } + if l.Height > 0 { + res["height"] = l.Height + } + if l.Width > 0 { + res["width"] = l.Width + } + if l.Bitrate > 0 { + res["bitrate"] = l.Bitrate + } + if l.Duration > 0 { + res["duration"] = l.Duration + } + if len(l.Languages) > 0 { + res["language"] = l.Languages + } + if len(l.Alternates) > 0 { + res["alternate"] = l.Alternates + } + if len(l.Children) > 0 { + res["children"] = l.Children + } + return json.Marshal(res) +} + // Slice of links type LinkList []Link diff --git a/pkg/manifest/link_test.go b/pkg/manifest/link_test.go index 12ad8b43..f34a84b6 100644 --- a/pkg/manifest/link_test.go +++ b/pkg/manifest/link_test.go @@ -66,17 +66,19 @@ func TestLinkUnmarshalFullJSON(t *testing.T) { ] }`), &l)) assert.Equal(t, Link{ - Href: "http://href", - Type: "application/pdf", - Templated: true, - Title: "Link Title", - Rels: []string{"publication", "cover"}, - Properties: map[string]interface{}{"orientation": "landscape"}, - Height: 1024, - Width: 768, - Bitrate: 74.2, - Duration: 45.6, - Languages: []string{"fr"}, + Href: "http://href", + Type: "application/pdf", + Templated: true, + Title: "Link Title", + Rels: []string{"publication", "cover"}, + Properties: Properties{ + "orientation": "landscape", + }, + Height: 1024, + Width: 768, + Bitrate: 74.2, + Duration: 45.6, + Languages: []string{"fr"}, Alternates: []Link{ {Href: "/alternate1"}, {Href: "/alternate2"}, @@ -181,17 +183,19 @@ func TestLinkMinimalJSON(t *testing.T) { func TestLinkFullJSON(t *testing.T) { b, err := json.Marshal(Link{ - Href: "http://href", - Type: "application/pdf", - Templated: true, - Title: "Link Title", - Rels: []string{"publication", "cover"}, - Properties: map[string]interface{}{"orientation": 
"landscape"}, - Height: 1024, - Width: 768, - Bitrate: 74.2, - Duration: 45.6, - Languages: []string{"fr"}, + Href: "http://href", + Type: "application/pdf", + Templated: true, + Title: "Link Title", + Rels: []string{"publication", "cover"}, + Properties: Properties{ + "orientation": "landscape", + }, + Height: 1024, + Width: 768, + Bitrate: 74.2, + Duration: 45.6, + Languages: []string{"fr"}, Alternates: []Link{ {Href: "/alternate1"}, {Href: "/alternate2"}, diff --git a/pkg/manifest/locator.go b/pkg/manifest/locator.go index e4b3a579..9225bab8 100644 --- a/pkg/manifest/locator.go +++ b/pkg/manifest/locator.go @@ -93,9 +93,11 @@ func (l *Locations) UnmarshalJSON(b []byte) error { } func (l Locations) MarshalJSON() ([]byte, error) { - j := l.OtherLocations - if j == nil { - j = make(map[string]interface{}) + j := make(map[string]interface{}) + if l.OtherLocations != nil { + for k, v := range l.OtherLocations { + j[k] = v + } } if len(l.Fragments) > 0 { @@ -114,6 +116,19 @@ func (l Locations) MarshalJSON() ([]byte, error) { return json.Marshal(j) } +// HTML extensions for [Locations] + +func (l Locations) CSSSelector() string { + if v, ok := l.OtherLocations["cssSelector"]; ok { + if s, ok := v.(string); ok { + return s + } + } + return "" +} + +// TODO partialCfi and domRange getters + // Textual context of the locator. // A Locator Text Object contains multiple text fragments, useful to give a context to the [Locator] or for highlights. // https://github.com/readium/architecture/tree/master/models/locators#the-text-object @@ -137,11 +152,11 @@ func TextFromJSON(rawJson map[string]interface{}) (t Text) { // Locator provides a precise location in a publication in a format that can be stored and shared. 
// // There are many different use cases for locators: -// - getting back to the last position in a publication -// - bookmarks -// - highlights & annotations -// - search results -// - human-readable (and shareable) reference in a publication +// - getting back to the last position in a publication +// - bookmarks +// - highlights & annotations +// - search results +// - human-readable (and shareable) reference in a publication // // https://github.com/readium/architecture/tree/master/locators type Locator struct { diff --git a/pkg/manifest/manifest.go b/pkg/manifest/manifest.go index 71c12ba3..529717f8 100644 --- a/pkg/manifest/manifest.go +++ b/pkg/manifest/manifest.go @@ -3,6 +3,7 @@ package manifest import ( "encoding/json" "path" + "strings" "github.com/pkg/errors" "github.com/readium/go-toolkit/pkg/internal/extensions" @@ -55,6 +56,49 @@ func (m Manifest) ConformsTo(profile Profile) bool { return false } +// Finds the first [Link] with the given href in the manifest's links. +// Searches through (in order) the reading order, resources and links recursively following alternate and children links. +// If there's no match, tries again after removing any query parameter and anchor from the given href. 
+func (m Manifest) LinkWithHref(href string) *Link { + var deepLinkWithHref func(ll LinkList, href string) *Link + deepLinkWithHref = func(ll LinkList, href string) *Link { + for _, l := range ll { + if l.Href == href { + return &l + } else { + if link := deepLinkWithHref(l.Alternates, href); link != nil { + return link + } + if link := deepLinkWithHref(l.Children, href); link != nil { + return link + } + } + } + return nil + } + + find := func(href string) *Link { + if l := deepLinkWithHref(m.ReadingOrder, href); l != nil { + return l + } + if l := deepLinkWithHref(m.Resources, href); l != nil { + return l + } + if l := deepLinkWithHref(m.Links, href); l != nil { + return l + } + return nil + } + + if l := find(href); l != nil { + return l + } + if l := find(strings.SplitN(strings.SplitN(href, "#", 2)[0], "?", 2)[0]); l != nil { + return l + } + return nil +} + // Finds the first [Link] with the given relation in the manifest's links. func (m Manifest) LinkWithRel(rel string) *Link { for _, resource := range m.Resources { @@ -115,6 +159,43 @@ func (m Manifest) LinksWithRel(rel string) []Link { return res } +// Creates a new [Locator] object from a [Link] to a resource of this manifest. +// Returns nil if the resource is not found in this manifest. +func (m Manifest) LocatorFromLink(link Link) *Locator { + components := strings.SplitN(link.Href, "#", 2) + href := components[0] + resourceLink := m.LinkWithHref(href) + if resourceLink == nil { + return nil + } + if resourceLink.Type == "" { + return nil + } + var fragment string + if len(components) > 1 { + fragment = components[1] + } + + l := &Locator{ + Href: href, + Type: resourceLink.Type, + Title: resourceLink.Title, + } + + if l.Title == "" { + l.Title = link.Title + } + + if fragment != "" { + l.Locations.Fragments = []string{fragment} + } else { + var p float64 + l.Locations.Progression = &p + } + + return l +} + // Parses a [Manifest] from its RWPM JSON representation. // // TODO log [warnings] ? 
diff --git a/pkg/manifest/manifest_test.go b/pkg/manifest/manifest_test.go index ee9d4ac1..8b78ee72 100644 --- a/pkg/manifest/manifest_test.go +++ b/pkg/manifest/manifest_test.go @@ -350,3 +350,142 @@ func TestManifestHrefResolvedToRootRemotePackage(t *testing.T) { assert.Equal(t, "http://example.com/directory/chap1.html", m.ReadingOrder[0].Href) } + +func TestManifestLocatorFromMinimalLink(t *testing.T) { + manifest := Manifest{ + Metadata: Metadata{ + LocalizedTitle: NewLocalizedStringFromString(""), + }, + ReadingOrder: LinkList{{ + Href: "/href", + Type: "text/html", + Title: "Resource", + }}, + } + + var z float64 + assert.Equal(t, &Locator{ + Href: "/href", + Type: "text/html", + Title: "Resource", + Locations: Locations{ + Progression: &z, + }, + }, manifest.LocatorFromLink(Link{ + Href: "/href", + })) +} + +func TestManifestLocatorFromInside(t *testing.T) { + manifest := Manifest{ + Metadata: Metadata{ + LocalizedTitle: NewLocalizedStringFromString(""), + }, + ReadingOrder: LinkList{{ + Href: "/href1", + Type: "text/html", + }}, + Resources: LinkList{{ + Href: "/href2", + Type: "text/html", + }}, + Links: LinkList{{ + Href: "/href3", + Type: "text/html", + }}, + } + + var z float64 + assert.Equal(t, &Locator{ + Href: "/href1", + Type: "text/html", + Locations: Locations{ + Progression: &z, + }, + }, manifest.LocatorFromLink(Link{ + Href: "/href1", + })) + assert.Equal(t, &Locator{ + Href: "/href2", + Type: "text/html", + Locations: Locations{ + Progression: &z, + }, + }, manifest.LocatorFromLink(Link{ + Href: "/href2", + })) + assert.Equal(t, &Locator{ + Href: "/href3", + Type: "text/html", + Locations: Locations{ + Progression: &z, + }, + }, manifest.LocatorFromLink(Link{ + Href: "/href3", + })) +} + +func TestManifestLocatorFromFullLinkWithFragment(t *testing.T) { + manifest := Manifest{ + Metadata: Metadata{ + LocalizedTitle: NewLocalizedStringFromString(""), + }, + ReadingOrder: LinkList{{ + Href: "/href", + Type: "text/html", + Title: "Resource", + }}, 
+ } + + assert.Equal(t, &Locator{ + Href: "/href", + Type: "text/html", + Title: "Resource", + Locations: Locations{ + Fragments: []string{"page=42"}, + }, + }, manifest.LocatorFromLink(Link{ + Href: "/href#page=42", + Type: "text/xml", + Title: "My link", + })) +} + +func TestManifestLocatorFallbackTitle(t *testing.T) { + manifest := Manifest{ + Metadata: Metadata{ + LocalizedTitle: NewLocalizedStringFromString(""), + }, + ReadingOrder: LinkList{{ + Href: "/href", + Type: "text/html", + }}, + } + assert.Equal(t, &Locator{ + Href: "/href", + Type: "text/html", + Title: "My link", + Locations: Locations{ + Fragments: []string{"page=42"}, + }, + }, manifest.LocatorFromLink(Link{ + Href: "/href#page=42", + Type: "text/xml", + Title: "My link", + })) +} + +func TestManifestLocatorLinkNotFound(t *testing.T) { + manifest := Manifest{ + Metadata: Metadata{ + LocalizedTitle: NewLocalizedStringFromString(""), + }, + ReadingOrder: LinkList{{ + Href: "/href", + Type: "text/html", + }}, + } + assert.Nil(t, manifest.LocatorFromLink(Link{ + Href: "/notfound", + })) +} diff --git a/pkg/manifest/metadata.go b/pkg/manifest/metadata.go index 596d5059..07e9457d 100644 --- a/pkg/manifest/metadata.go +++ b/pkg/manifest/metadata.go @@ -413,9 +413,11 @@ func (m *Metadata) UnmarshalJSON(b []byte) error { } func (m Metadata) MarshalJSON() ([]byte, error) { - j := m.OtherMetadata - if j == nil { - j = make(map[string]interface{}) + j := make(map[string]interface{}) + if m.OtherMetadata != nil { + for k, v := range m.OtherMetadata { + j[k] = v + } } if m.Presentation != nil { diff --git a/pkg/manifest/properties.go b/pkg/manifest/properties.go index 60a0e3fc..3eee516b 100644 --- a/pkg/manifest/properties.go +++ b/pkg/manifest/properties.go @@ -8,7 +8,10 @@ import ( type Properties map[string]interface{} -func (p *Properties) Add(newProperties Properties) Properties { +// Properties should be immutable, therefore these functions have been removed. 
+// The code is left here in case it's useful in a future implementation. + +/*func (p *Properties) Add(newProperties Properties) Properties { if *p == nil { *p = make(Properties) } @@ -18,6 +21,14 @@ func (p *Properties) Add(newProperties Properties) Properties { return *p } +func (p *Properties) Delete(key string) Properties { + if p == nil { + p = &Properties{} + } + delete(*p, key) + return *p +}*/ + func (p *Properties) Get(key string) interface{} { if p != nil { return (*p)[key] diff --git a/pkg/manifest/properties_test.go b/pkg/manifest/properties_test.go index ebb920a1..e5bb5998 100644 --- a/pkg/manifest/properties_test.go +++ b/pkg/manifest/properties_test.go @@ -32,7 +32,7 @@ func TestPropertiesUnmarshalFullJSON(t *testing.T) { }, p) } -func TestPropertiesAddGiven(t *testing.T) { +/*func TestPropertiesAddGiven(t *testing.T) { p2 := Properties{ "other-property1": "value", "other-property2": []interface{}{float64(42)}, @@ -44,7 +44,7 @@ func TestPropertiesAddGiven(t *testing.T) { }, p2.Add(Properties{ "additional": "property", })) -} +}*/ // Presentation-specific properties diff --git a/pkg/mediatype/mediatype_of.go b/pkg/mediatype/mediatype_of.go index 1c1b4bce..4f124bf2 100644 --- a/pkg/mediatype/mediatype_of.go +++ b/pkg/mediatype/mediatype_of.go @@ -9,8 +9,18 @@ import ( // You can register additional sniffers globally by modifying this list. // The sniffers order is important, because some formats are subsets of other formats. var Sniffers = []Sniffer{ - SniffXHTML, SniffHTML, SniffOPDS, SniffLCPLicense, SniffBitmap, - SniffWebpub, SniffW3CWPUB, SniffEPUB, SniffLPF, SniffArchive, SniffPDF, + SniffEPUB, + SniffLPF, + SniffArchive, + SniffPDF, + SniffXHTML, + SniffHTML, + SniffBitmap, + SniffAudio, + SniffOPDS, + SniffLCPLicense, + SniffW3CWPUB, + SniffWebpub, // Note SniffSystem isn't here! 
} diff --git a/pkg/mediatype/sniffer.go b/pkg/mediatype/sniffer.go index 971b9973..1e14646f 100644 --- a/pkg/mediatype/sniffer.go +++ b/pkg/mediatype/sniffer.go @@ -143,6 +143,37 @@ func SniffBitmap(context SnifferContext) *MediaType { return nil } +// Sniffs audio files. +func SniffAudio(context SnifferContext) *MediaType { + if context.HasFileExtension("aac") || context.HasMediaType("audio/aac") { + return &AAC + } + if context.HasFileExtension("aiff") || context.HasMediaType("audio/aiff") { + return &AIFF + } + // TODO flac, m4a + if context.HasFileExtension("mp3") || context.HasMediaType("audio/mpeg") { + return &MP3 + } + if context.HasFileExtension("ogg", "oga") || context.HasMediaType("audio/ogg") { + return &OGG + } + if context.HasFileExtension("opus") || context.HasMediaType("audio/opus") { + return &OPUS + } + if context.HasFileExtension("wav") || context.HasMediaType("audio/wav") { + return &WAV + } + if context.HasFileExtension("webm") || context.HasMediaType("audio/webm") { + // Note: .webm extension could also be a video + return &WEBMAudio + } + + // TODO read magic bytes? + + return nil +} + // Sniffs a Readium Web Publication, protected or not by LCP. func SniffWebpub(context SnifferContext) *MediaType { if context.HasFileExtension("audiobook") || context.HasMediaType("application/audiobook+zip") { @@ -217,8 +248,8 @@ func SniffEPUB(context SnifferContext) *MediaType { // Sniffs a Lightweight Packaging Format (LPF). 
// References: -// - https://www.w3.org/TR/lpf/ -// - https://www.w3.org/TR/pub-manifest/ +// - https://www.w3.org/TR/lpf/ +// - https://www.w3.org/TR/pub-manifest/ func SniffLPF(context SnifferContext) *MediaType { if context.HasFileExtension("lpf") || context.HasMediaType("application/lpf+zip") { return &LPF diff --git a/pkg/parser/epub/deobfuscator.go b/pkg/parser/epub/deobfuscator.go index 4cd5cca0..71271d25 100644 --- a/pkg/parser/epub/deobfuscator.go +++ b/pkg/parser/epub/deobfuscator.go @@ -39,23 +39,22 @@ func (d DeobfuscatingResource) Read(start, end int64) ([]byte, *fetcher.Resource algorithm = penc.Algorithm } - for k, v := range algorithm2length { - if k == algorithm { - data, err := d.ProxyResource.Read(start, end) - if err != nil { - return nil, err - } - var obfuscationKey []byte - switch algorithm { - case "http://ns.adobe.com/pdf/enc#RC": - obfuscationKey = d.getHashKeyAdobe() - default: - shasum := sha1.Sum([]byte(d.identifier)) - obfuscationKey = shasum[:] - } - deobfuscateFont(data, start, obfuscationKey, v) - return data, nil + v, ok := algorithm2length[algorithm] + if ok { + data, err := d.ProxyResource.Read(start, end) + if err != nil { + return nil, err } + var obfuscationKey []byte + switch algorithm { + case "http://ns.adobe.com/pdf/enc#RC": + obfuscationKey = d.getHashKeyAdobe() + default: + shasum := sha1.Sum([]byte(d.identifier)) + obfuscationKey = shasum[:] + } + deobfuscateFont(data, start, obfuscationKey, v) + return data, nil } // Algorithm not in known, so skip deobfuscation @@ -69,74 +68,73 @@ func (d DeobfuscatingResource) Stream(w io.Writer, start int64, end int64) (int6 algorithm = penc.Algorithm } - for k, v := range algorithm2length { - if k == algorithm { - if start >= v { - // We're past the obfuscated part, just proxy it - return d.ProxyResource.Stream(w, start, end) - } + v, ok := algorithm2length[algorithm] + if ok { + if start >= v { + // We're past the obfuscated part, just proxy it + return d.ProxyResource.Stream(w, 
start, end) + } - // Create a pipe to proxy the stream for deobfuscation - pr, pw := io.Pipe() - - // Start piping the resource's stream in a goroutine - go func() { - _, err := d.ProxyResource.Stream(pw, start, end) - if err != nil { - pw.CloseWithError(err) - } else { - pw.Close() - } - }() - - // First, we just read the obfuscated portion (1040 or 1024 first bytes) - obfuscatedPortion := make([]byte, v) - on, err := pr.Read(obfuscatedPortion) - if err != nil && err != io.EOF { - if fre, ok := err.(*fetcher.ResourceError); ok { - return 0, fre - } else { - return 0, fetcher.Other(errors.Wrap(err, "error reading obfuscated portion of font")) - } - } + // Create a pipe to proxy the stream for deobfuscation + pr, pw := io.Pipe() - // Handle filesize <= the obfuscated portion's length or the requested length - atEnd := false - if err == io.EOF || (end != 0 && end <= start+int64(on)) { - obfuscatedPortion = obfuscatedPortion[:on] - atEnd = true - pr.Close() + // Start piping the resource's stream in a goroutine + go func() { + _, err := d.ProxyResource.Stream(pw, start, end) + if err != nil { + pw.CloseWithError(err) + } else { + pw.Close() } - - // Deobfuscate just the obfuscated portion - var obfuscationKey []byte - switch algorithm { - case "http://ns.adobe.com/pdf/enc#RC": - obfuscationKey = d.getHashKeyAdobe() - default: - shasum := sha1.Sum([]byte(d.identifier)) - obfuscationKey = shasum[:] + }() + + // First, we just read the obfuscated portion (1040 or 1024 first bytes) + obfuscatedPortion := make([]byte, v) + on, err := pr.Read(obfuscatedPortion) + if err != nil && err != io.EOF { + if fre, ok := err.(*fetcher.ResourceError); ok { + return 0, fre + } else { + return 0, fetcher.Other(errors.Wrap(err, "error reading obfuscated portion of font")) } - deobfuscateFont(obfuscatedPortion, start, obfuscationKey, v) + } - defer pr.Close() + // Handle filesize <= the obfuscated portion's length or the requested length + atEnd := false + if on < len(obfuscatedPortion) 
|| (end != 0 && end <= start+int64(on)) { + obfuscatedPortion = obfuscatedPortion[:on] + atEnd = true + pr.Close() + } - // And write it to the stream - _, err = w.Write(obfuscatedPortion) - if err != nil { - return 0, fetcher.Other(errors.Wrap(err, "error writing obfuscated portion of font")) - } + // Deobfuscate just the obfuscated portion + var obfuscationKey []byte + switch algorithm { + case "http://ns.adobe.com/pdf/enc#RC": + obfuscationKey = d.getHashKeyAdobe() + default: + shasum := sha1.Sum([]byte(d.identifier)) + obfuscationKey = shasum[:] + } + deobfuscateFont(obfuscatedPortion, start, obfuscationKey, v) - // The rest of the font is not obfuscated, so it's "copied" directly using a 32KB buffer - var wn int64 - if !atEnd { - wn, err = io.Copy(w, pr) - if err != nil { - return 0, fetcher.Other(errors.Wrap(err, "error writing unobfuscated portion of font")) - } + defer pr.Close() + + // And write it to the stream + _, err = w.Write(obfuscatedPortion) + if err != nil { + return 0, fetcher.Other(errors.Wrap(err, "error writing obfuscated portion of font")) + } + + // The rest of the font is not obfuscated, so it's "copied" directly using a 32KB buffer + var wn int64 + if !atEnd { + wn, err = io.Copy(w, pr) + if err != nil { + return 0, fetcher.Other(errors.Wrap(err, "error writing unobfuscated portion of font")) } - return int64(on) + wn, nil } + return int64(on) + wn, nil } // Algorithm not in known, so skip deobfuscation diff --git a/pkg/parser/epub/deobfuscator_test.go b/pkg/parser/epub/deobfuscator_test.go index 0e88753f..ba30d42f 100644 --- a/pkg/parser/epub/deobfuscator_test.go +++ b/pkg/parser/epub/deobfuscator_test.go @@ -28,11 +28,11 @@ func withDeobfuscator(t *testing.T, href string, algorithm string, start, end in Href: href, } if algorithm != "" { - link.Properties.Add(manifest.Properties{ + link.Properties = manifest.Properties{ "encrypted": map[string]interface{}{ "algorithm": algorithm, }, - }) + } } obfu, err := 
NewDeobfuscator(identifier).Transform(ft.Get(link)).Read(start, end) if !assert.Nil(t, err) { diff --git a/pkg/parser/epub/parser.go b/pkg/parser/epub/parser.go index b53002e0..02ba8ee4 100644 --- a/pkg/parser/epub/parser.go +++ b/pkg/parser/epub/parser.go @@ -3,6 +3,7 @@ package epub import ( "github.com/pkg/errors" "github.com/readium/go-toolkit/pkg/asset" + "github.com/readium/go-toolkit/pkg/content/iterator" "github.com/readium/go-toolkit/pkg/fetcher" "github.com/readium/go-toolkit/pkg/manifest" "github.com/readium/go-toolkit/pkg/mediatype" @@ -69,6 +70,9 @@ func (p Parser) Parse(asset asset.PublicationAsset, f fetcher.Fetcher) (*pub.Bui builder := pub.NewServicesBuilder(map[string]pub.ServiceFactory{ pub.PositionsService_Name: PositionsServiceFactory(p.reflowablePositionsStrategy), + pub.ContentService_Name: pub.DefaultContentServiceFactory([]iterator.ResourceContentIteratorFactory{ + iterator.HTMLFactory(), + }), }) return pub.NewBuilder(manifest, ffetcher, builder), nil } @@ -160,7 +164,7 @@ func parseDisplayOptions(fetcher fetcher.Fetcher) (ret map[string]string) { } } - if platform := displayOptionsXml.SelectElement("platform"); platform != nil { + if platform := displayOptionsXml.SelectElement("//platform"); platform != nil { for _, option := range platform.SelectElements("option") { optName := option.SelectAttr("name") optValue := option.InnerText() diff --git a/pkg/parser/epub/positions_service.go b/pkg/parser/epub/positions_service.go index 0da84cff..cbb345e8 100644 --- a/pkg/parser/epub/positions_service.go +++ b/pkg/parser/epub/positions_service.go @@ -171,9 +171,9 @@ type ArchiveEntryLength struct { // PositionCount implements ReflowableStrategy func (l ArchiveEntryLength) PositionCount(resource fetcher.Resource) uint { var length uint64 - props := resource.Link().Properties + props := resource.Properties() if p := props.Get("https://readium.org/webpub-manifest/properties#archive"); p != nil { - if pm, ok := p.(manifest.Properties); ok { + if pm, ok 
:= p.(map[string]interface{}); ok { if el, ok := pm["entryLength"].(uint64); ok { length = el } diff --git a/pkg/parser/pdf/parser.go b/pkg/parser/pdf/parser.go index 34b11e9e..69133ee8 100644 --- a/pkg/parser/pdf/parser.go +++ b/pkg/parser/pdf/parser.go @@ -2,6 +2,7 @@ package pdf import ( "github.com/pdfcpu/pdfcpu/pkg/pdfcpu" + "github.com/pdfcpu/pdfcpu/pkg/pdfcpu/model" "github.com/pdfcpu/pdfcpu/pkg/pdfcpu/validate" "github.com/pkg/errors" "github.com/readium/go-toolkit/pkg/asset" @@ -20,7 +21,7 @@ func NewParser() Parser { func init() { // Disable this feature of pdfcpu - pdfcpu.ConfigPath = "disable" + model.ConfigPath = "disable" } // Parse implements PublicationParser @@ -43,8 +44,8 @@ func (p Parser) Parse(asset asset.PublicationAsset, f fetcher.Fetcher) (*pub.Bui return nil, errors.New("unable to find PDF file: no matching link found") } - conf := pdfcpu.NewDefaultConfiguration() - conf.ValidationMode = pdfcpu.ValidationRelaxed + conf := model.NewDefaultConfiguration() + conf.ValidationMode = model.ValidationRelaxed ctx, err := pdfcpu.Read(fetcher.NewResourceReadSeeker(f.Get(*link)), conf) if err != nil { return nil, errors.Wrap(err, "failed opening PDF") diff --git a/pkg/parser/pdf/parser_metadata.go b/pkg/parser/pdf/parser_metadata.go index 213df989..653daaf4 100644 --- a/pkg/parser/pdf/parser_metadata.go +++ b/pkg/parser/pdf/parser_metadata.go @@ -7,6 +7,7 @@ import ( "strings" "github.com/pdfcpu/pdfcpu/pkg/pdfcpu" + "github.com/pdfcpu/pdfcpu/pkg/pdfcpu/model" "github.com/pkg/errors" "github.com/readium/go-toolkit/pkg/internal/extensions" "github.com/readium/go-toolkit/pkg/manifest" @@ -29,7 +30,7 @@ func loadDecoder(meta pdfcpu.Metadata) (*xmp.Document, []byte, error) { return doc, metabin, nil } -func ParseMetadata(ctx *pdfcpu.Context, link *manifest.Link) (m manifest.Manifest, err error) { +func ParseMetadata(ctx *model.Context, link *manifest.Link) (m manifest.Manifest, err error) { if link != nil { m.ReadingOrder = manifest.LinkList{{ Href: 
strings.TrimPrefix(link.Href, "/"), @@ -45,7 +46,7 @@ func ParseMetadata(ctx *pdfcpu.Context, link *manifest.Link) (m manifest.Manifes m.Metadata.ConformsTo = manifest.Profiles{manifest.ProfilePDF} // hashmaterial := make([]string, 0, 64) - metas, _ := ctx.ExtractMetadata() + metas, _ := pdfcpu.ExtractMetadata(ctx) for _, meta := range metas { doc, _, derr := loadDecoder(meta) if derr != nil { @@ -129,7 +130,7 @@ func ParseXMPMetadata(doc *xmp.Document, metadata *manifest.Metadata) error { return nil } -func ParsePDFMetadata(ctx *pdfcpu.Context, m *manifest.Manifest) error { +func ParsePDFMetadata(ctx *model.Context, m *manifest.Manifest) error { // Page count if ctx.PageCount > 0 && m.Metadata.NumberOfPages == nil { pc := uint(ctx.PageCount) @@ -182,7 +183,7 @@ func ParsePDFMetadata(ctx *pdfcpu.Context, m *manifest.Manifest) error { } // Bookmarks (TOC) - if bookmarks, err := ctx.BookmarksForOutline(); err == nil { + if bookmarks, err := pdfcpu.Bookmarks(ctx); err == nil { rootLink := m.ReadingOrder.FirstWithMediaType(&mediatype.PDF) root := "" if rootLink != nil { @@ -196,8 +197,8 @@ func ParsePDFMetadata(ctx *pdfcpu.Context, m *manifest.Manifest) error { Title: b.Title, Type: mediatype.PDF.String(), } - if len(b.Children) > 0 { - bf(lnk.Children, b.Children) + if len(b.Kids) > 0 { + bf(lnk.Children, b.Kids) } m.TableOfContents = append(m.TableOfContents, lnk) } diff --git a/pkg/pub/publication.go b/pkg/pub/publication.go index 3b842fc5..4d4b5de5 100644 --- a/pkg/pub/publication.go +++ b/pkg/pub/publication.go @@ -3,6 +3,7 @@ package pub import ( "encoding/json" "path" + "strings" "github.com/readium/go-toolkit/pkg/fetcher" "github.com/readium/go-toolkit/pkg/manifest" @@ -22,6 +23,29 @@ func (p Publication) ConformsTo(profile manifest.Profile) bool { return p.Manifest.ConformsTo(profile) } +// Finds the first [Link] with the given href in the publication's links. 
+// Searches through (in order) the reading order, resources and links recursively following alternate and children links. +// If there's no match, tries again after removing any query parameter and anchor from the given href. +func (p Publication) LinkWithHref(href string) *manifest.Link { + return p.Manifest.LinkWithHref(href) +} + +// Finds the first [Link] having the given [rel] in the publication's links. +func (p Publication) LinkWithRel(rel string) *manifest.Link { + return p.Manifest.LinkWithRel(rel) +} + +// Finds all [Link]s having the given [rel] in the publication's links. +func (p Publication) LinksWithRel(rel string) []manifest.Link { + return p.Manifest.LinksWithRel(rel) +} + +// Creates a new [Locator] object from a [Link] to a resource of this publication. +// Returns nil if the resource is not found in this publication. +func (p Publication) LocatorFromLink(link manifest.Link) *manifest.Locator { + return p.Manifest.LocatorFromLink(link) +} + // Returns the RWPM JSON representation for this [Publication]'s manifest, as a string. func (p Publication) JSONManifest() (string, error) { bin, err := json.Marshal(p.Manifest) @@ -89,7 +113,9 @@ func (p Publication) Find(path string) *manifest.Link { } } - link.Href = "/" + link.Href + if !strings.HasPrefix(link.Href, "/") { + link.Href = "/" + link.Href + } return link } diff --git a/pkg/pub/service.go b/pkg/pub/service.go index 8ca17e6a..29b40adf 100644 --- a/pkg/pub/service.go +++ b/pkg/pub/service.go @@ -11,6 +11,7 @@ const ( LocatorService_Name = "LocatorService" PositionsService_Name = "PositionsService" SearchService_Name = "SearchService" + ContentService_Name = "ContentService" ) // Base interface to be implemented by all publication services. 
diff --git a/pkg/pub/service_content.go b/pkg/pub/service_content.go new file mode 100644 index 00000000..5bd2f905 --- /dev/null +++ b/pkg/pub/service_content.go @@ -0,0 +1,101 @@ +package pub + +import ( + "encoding/json" + + "github.com/readium/go-toolkit/pkg/content" + "github.com/readium/go-toolkit/pkg/content/element" + "github.com/readium/go-toolkit/pkg/content/iterator" + "github.com/readium/go-toolkit/pkg/fetcher" + "github.com/readium/go-toolkit/pkg/manifest" +) + +// TODO content iterator special ~readium link + +var ContentLink = manifest.Link{ + Href: "/~readium/content.json", + Type: "application/vnd.readium.content+json", +} + +// TODO uri template or something so we're not just dumping entire content +// progression, href, cssselector, text context + +// ContentService implements Service +// Provides a way to extract the raw [Content] of a [Publication]. +type ContentService interface { + Service + Content(start *manifest.Locator) content.Content // Creates a [Content] starting from the given [start] location. 
+} + +// Implements ContentService +type DefaultContentService struct { + context Context + resourceContentIteratorFactories []iterator.ResourceContentIteratorFactory +} + +func GetForContentService(service ContentService, link manifest.Link) (fetcher.Resource, bool) { + if link.Href != ContentLink.Href { + return nil, false + } + + elements, err := content.ContentElements(service.Content(nil)) + if err != nil { + return fetcher.NewFailureResource(ContentLink, fetcher.Other(err)), false + } + + return fetcher.NewBytesResource(ContentLink, func() []byte { + // Warning: this can be a massive payload since it's the entire content of the publication right now + bin, _ := json.Marshal(elements) + return bin + }), true +} + +func (s DefaultContentService) Close() {} + +func (s DefaultContentService) Links() manifest.LinkList { + return manifest.LinkList{ContentLink} +} + +func (s DefaultContentService) Get(link manifest.Link) (fetcher.Resource, bool) { + return GetForContentService(s, link) +} + +func (s DefaultContentService) Content(start *manifest.Locator) content.Content { + return contentImplementation{ + context: s.context, + start: start, + resourceContentIteratorFactories: s.resourceContentIteratorFactories, + } +} + +type contentImplementation struct { + context Context + start *manifest.Locator + resourceContentIteratorFactories []iterator.ResourceContentIteratorFactory +} + +func (c contentImplementation) Iterator() iterator.Iterator { + return iterator.NewPublicationContent( + c.context.Manifest, + c.context.Fetcher, + c.start, + c.resourceContentIteratorFactories, + ) +} + +func (c contentImplementation) Elements() ([]element.Element, error) { + return content.ContentElements(c) +} + +func (c contentImplementation) Text(separator *string) (string, error) { + return content.ContentText(c, separator) +} + +func DefaultContentServiceFactory(resourceContentIteratorFactories []iterator.ResourceContentIteratorFactory) ServiceFactory { + return func(context 
Context) Service { + return DefaultContentService{ + context: context, + resourceContentIteratorFactories: resourceContentIteratorFactories, + } + } +} diff --git a/pkg/streamer/a11y_infer_test.go b/pkg/streamer/a11y_infer_test.go index f0312eb7..440b7b42 100644 --- a/pkg/streamer/a11y_infer_test.go +++ b/pkg/streamer/a11y_infer_test.go @@ -303,7 +303,7 @@ func TestInferFeaturePageList(t *testing.T) { // "resources" in RWPM) func TestInferFeatureMathML(t *testing.T) { link := newLink(mediatype.HTML, "html") - link.Properties = map[string]interface{}{ + link.Properties = manifest.Properties{ "contains": []string{"mathml"}, } m := manifest.Manifest{