From 2c4e86cdff49bde3fc0ff83f5fdeeb1fef97a448 Mon Sep 17 00:00:00 2001 From: xiaodaigh Date: Fri, 28 May 2021 13:33:33 +1000 Subject: [PATCH] another new version --- Project.toml | 5 +++-- src/TableScraper.jl | 17 ++++++++++++----- test/runtests.jl | 10 +++++++++- 3 files changed, 24 insertions(+), 8 deletions(-) diff --git a/Project.toml b/Project.toml index 48e69ed..133e905 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "TableScraper" uuid = "3d876f86-fca9-45cb-9864-7207416dc431" authors = ["ZJ "] -version = "0.1.1" +version = "0.1.2" [deps] Cascadia = "54eefc05-d75b-58de-a785-1a3403f0919f" @@ -18,6 +18,7 @@ julia = "1" [extras] Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" [targets] -test = ["Test"] +test = ["Test", "DataFrames"] diff --git a/src/TableScraper.jl b/src/TableScraper.jl index 8de6873..1380645 100644 --- a/src/TableScraper.jl +++ b/src/TableScraper.jl @@ -11,7 +11,8 @@ include("Tables.jl") """ scrape_tables(url) - scrape_tables(url, cell_transform) + scrape_tables(url, cell_transform[=nodeText]) + scrape_tables(url, cell_transform[=nodeText], header_transform[=nodeText]) This function will scrape `url` for any WELL-FORMED tables wrapped in `` tags and return them in a vector. @@ -19,11 +20,16 @@ them in a vector. # Arguments - `url`: The URL to look for tables - - `cell_transform`: By default, each of the table cells wrapped in `
` have transformed by + - `cell_transform`: By default, each of the table cells wrapped in `` is transformed by the callable (i.e. `Function` or type definition) `cell_transform`. The default `cell_transform` is `Cascadia.nodeText` which extracts the node's text. You may wish to use `identity` to extract just the cell as a `Gumbo.HTMLNode` type for more advanced processing, e.g. `scrape_tables(url, identity)` + - `header_transform`: By default, each of the table header wrapped in `` is transformed by + the callable (i.e. `Function` or type definition) `header_transform`. The default + `header_transform` is `Cascadia.nodeText` which extracts the node's text. You may wish to + use `identity` to extract just the cell as a `Gumbo.HTMLNode` type for more advanced + processing, e.g. `scrape_tables(url, identity, identity)` # Return @@ -33,7 +39,7 @@ The `TableScraper.Table` is a Tables.jl-compatible row-accessible type. So you c another Tables.jl compatible type if you wish e.g. `DataFrame.(scrape_tables(url))` will return a vector of `DataFrame`s """ -function scrape_tables(url, cell_transform=nodeText)::Vector{Table} +function scrape_tables(url, cell_transform=nodeText, header_transform=nodeText)::Vector{Table} result_tables = [] response::HTTP.Messages.Response = @@ -59,13 +65,14 @@ function scrape_tables(url, cell_transform=nodeText)::Vector{Table} for (header, table_elem) in zip(headers, tables_elems) for header1 in eachmatch(sel"tr th", table_elem) # check the header span - if haskey(header1.attributes, "colspan") + # you are on your won if you don't use nodeText + if (nodeText == header_transform) & haskey(header1.attributes, "colspan") colspan = parse(Int, header1.attributes["colspan"]) for i in 1:colspan push!(header, nodeText(header1)*"$i") end else - push!(header, nodeText(header1)) + push!(header, header_transform(header1)) end end end diff --git a/test/runtests.jl b/test/runtests.jl index a6813e9..36d62b5 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,6 +1,14 @@ using TableScraper +using DataFrames using Test @testset "TableScraper.jl" begin - # Write your tests here. + table = scrape_tables("https://www.agenas.gov.it/covid19/web/index.php?r=site%2Fprovvedimento&q=010")[1] |> DataFrame; + + @test nrow(table) > 0 +end + +@testset "TableScraper.jl goratings" begin + table = scrape_tables("https://goratings.org")[2] |> DataFrame; + @test nrow(table) > 0 end