From 2c4e86cdff49bde3fc0ff83f5fdeeb1fef97a448 Mon Sep 17 00:00:00 2001
From: xiaodaigh <zhuojia.dai@gmail.com>
Date: Fri, 28 May 2021 13:33:33 +1000
Subject: [PATCH] another new version

---
 Project.toml        |  5 +++--
 src/TableScraper.jl | 17 ++++++++++++-----
 test/runtests.jl    | 10 +++++++++-
 3 files changed, 24 insertions(+), 8 deletions(-)
diff --git a/Project.toml b/Project.toml
index 48e69ed..133e905 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "TableScraper"
 uuid = "3d876f86-fca9-45cb-9864-7207416dc431"
 authors = ["ZJ <zhuojia.dai@gmail.com>"]
-version = "0.1.1"
+version = "0.1.2"
 
 [deps]
 Cascadia = "54eefc05-d75b-58de-a785-1a3403f0919f"
@@ -18,6 +18,7 @@ julia = "1"
 
 [extras]
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 
 [targets]
-test = ["Test"]
+test = ["Test", "DataFrames"]
diff --git a/src/TableScraper.jl b/src/TableScraper.jl
index 8de6873..1380645 100644
--- a/src/TableScraper.jl
+++ b/src/TableScraper.jl
@@ -11,7 +11,8 @@ include("Tables.jl")
 
 """
     scrape_tables(url)
-    scrape_tables(url, cell_transform)
+    scrape_tables(url, cell_transform[=nodeText])
+    scrape_tables(url, cell_transform[=nodeText], header_transform[=nodeText])
 
 This function will scrape `url` for any WELL-FORMED tables wrapped in `<table>` tags and return
 them in a vector.
@@ -19,11 +20,16 @@ them in a vector.
 # Arguments
 
     - `url`: The URL to look for tables
-    - `cell_transform`: By default, each of the table cells wrapped in `<td>` have transformed by
+    - `cell_transform`: By default, each of the table cells wrapped in `<td>` is transformed by
         the callable (i.e. `Function` or type definition) `cell_transform`. The default
         `cell_transform` is `Cascadia.nodeText` which extracts the node's text. You may wish to use
         `identity` to extract just the cell as a `Gumbo.HTMLNode` type for more advanced processing,
         e.g. `scrape_tables(url, identity)`
+    - `header_transform`: By default, each of the table headers wrapped in `<th>` is transformed by
+        the callable (i.e. `Function` or type definition) `header_transform`. The default
+        `header_transform` is `Cascadia.nodeText` which extracts the node's text. You may wish to
+        use `identity` to extract just the cell as a `Gumbo.HTMLNode` type for more advanced
+        processing, e.g. `scrape_tables(url, identity, identity)`
 
 # Return
 
@@ -33,7 +39,7 @@ The `TableScraper.Table` is a Tables.jl-compatible row-accessible type. So you c
 another Tables.jl compatible type if you wish e.g. `DataFrame.(scrape_tables(url))` will return a
 vector of `DataFrame`s
 """
-function scrape_tables(url, cell_transform=nodeText)::Vector{Table}
+function scrape_tables(url, cell_transform=nodeText, header_transform=nodeText)::Vector{Table}
     result_tables = []
 
     response::HTTP.Messages.Response =
@@ -59,13 +65,14 @@ function scrape_tables(url, cell_transform=nodeText)::Vector{Table}
     for (header, table_elem) in zip(headers, tables_elems)
         for header1 in eachmatch(sel"tr th", table_elem)
             # check the header span
-            if haskey(header1.attributes, "colspan")
+            # you are on your own if you don't use nodeText: colspan is only expanded for nodeText
+            if (nodeText == header_transform) & haskey(header1.attributes, "colspan")
                 colspan = parse(Int, header1.attributes["colspan"])
                 for i in 1:colspan
                     push!(header, nodeText(header1)*"$i")
                 end
             else
-                push!(header, nodeText(header1))
+                push!(header, header_transform(header1))
             end
         end
     end
diff --git a/test/runtests.jl b/test/runtests.jl
index a6813e9..36d62b5 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,6 +1,14 @@
 using TableScraper
+using DataFrames
 using Test
 
 @testset "TableScraper.jl" begin
-    # Write your tests here.
+    table = scrape_tables("https://www.agenas.gov.it/covid19/web/index.php?r=site%2Fprovvedimento&q=010")[1] |> DataFrame;
+
+    @test nrow(table) > 0
+end
+
+@testset "TableScraper.jl goratings" begin
+    table = scrape_tables("https://goratings.org")[2] |> DataFrame;
+    @test nrow(table) > 0
 end