Skip to content

Commit

Permalink
fix apache waf extract; add test
Browse files Browse the repository at this point in the history
  • Loading branch information
FuhuXia committed Nov 12, 2024
1 parent 1f2b9b1 commit 2e515f6
Show file tree
Hide file tree
Showing 10 changed files with 108 additions and 1 deletion.
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ jobs:
- name: Install dependencies from requirements.txt
run: |
pip install -r requirements.txt
pip install pytest-ckan
pip install -r dev-requirements.txt
- name: Install harvester
run: |
Expand Down
2 changes: 2 additions & 0 deletions ckanext/spatial/harvesters/waf.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,8 @@ def _extract_waf(content, base_url, scraper, results = None, depth=0):
if 'mailto:' in url:
continue
if '..' not in url and url[-1] == '/':
if scraper == 'apache' and url[0] == '/':
continue
new_depth = depth + 1
if depth > 10:
log.info('Max WAF depth reached')
Expand Down
13 changes: 13 additions & 0 deletions ckanext/spatial/tests/waf_extract/html_files/apache-folder.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
<html>
<head>
<title>Index of /apache-folder</title>
</head>
<body>
<h1>Index of /apache-folder</h1>
<pre> <a href="?C=N;O=D">Name</a> <a href="?C=M;O=A">Last modified</a> <a href="?C=S;O=A">Size</a> <a href="?C=D;O=A">Description</a><hr> <a href="/">Parent Directory</a> -
<a href="record-1.xml">record-1.xml</a> 2024-11-07 15:00 356K
<a href="subfolder/">subfolder/</a> 2024-11-12 15:00 -
<hr></pre>
</body></html>
12 changes: 12 additions & 0 deletions ckanext/spatial/tests/waf_extract/html_files/apache-subfolder.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
<html>
<head>
<title>Index of /apache-folder/subfolder</title>
</head>
<body>
<h1>Index of /apache-folder/subfolder</h1>
<pre> <a href="?C=N;O=D">Name</a> <a href="?C=M;O=A">Last modified</a> <a href="?C=S;O=A">Size</a> <a href="?C=D;O=A">Description</a><hr> <a href="/folder/">Parent Directory</a> -
<a href="record-2.xml">record-2.xml</a> 2024-11-07 16:59 182K
<hr></pre>
</body></html>
5 changes: 5 additions & 0 deletions ckanext/spatial/tests/waf_extract/html_files/iis-folder.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
<html><head><title>iis.server - /iis-folder/</title></head><body><H1>iis.server - /iis-folder/</H1><hr>

<pre><A HREF="/">[To Parent Directory]</A><br><br> 11/7/2024 7:20 AM &lt;dir&gt; <A HREF="/iis-folder/subfolder/">subfolder</A><br> 11/7/2024 3:00 PM 168 <A HREF="/iis-folder/record-1.xml">record-1.xml</A><br></pre><hr></body></html>


Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
<html><head><title>iis.server - /iis-folder/subfolder/</title></head><body><H1>iis.server - /iis-folder/subfolder/</H1><hr>

<pre><A HREF="/iis-folder/">[To Parent Directory]</A><br><br> 11/7/2024 4:59 PM 8958 <A HREF="/iis-folder/subfolder/record-2.xml">record-2.xml</A><br></pre><hr></body></html>
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@

<html>
<head><title>Index of /nginx/</title></head>
<body bgcolor="white">
<h1>Index of /nginx/</h1><hr><pre><a href="../">../</a>
<a href="subfolder/">subfolder/</a> 07-Nov-2024 15:00 -
<a href="record-1.xml">record-1.xml</a> 07-Nov-2024 15:00 364868
</pre><hr></body>
</html>
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@

<html>
<head><title>Index of /nginx/subfolder/</title></head>
<body bgcolor="white">
<h1>Index of /nginx/subfolder/</h1><hr><pre><a href="../">../</a>
<a href="record-2.xml">record-2.xml</a> 07-Nov-2024 16:59 186150
</pre><hr></body>
</html>
53 changes: 53 additions & 0 deletions ckanext/spatial/tests/waf_extract/test_waf_scraper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import os

from ckanext.spatial.harvesters.waf import _extract_waf

# Absolute path of the directory that contains this test module.
TEST_DIR = os.path.dirname(os.path.abspath(__file__))
# Directory holding the static HTML directory-listing pages used as fixtures.
HTML_DIR = os.path.join(TEST_DIR, "html_files")

def test_extract_iis(httpserver):
    """Check that ``_extract_waf`` collects the same records for every scraper.

    Despite its name, this test covers the ``iis``, ``nginx`` and ``apache``
    scrapers. For each one, the top-level folder listing is passed in
    directly, while the subfolder listing is served by the ``httpserver``
    fixture so that the extractor can follow the subfolder link.

    Each scraper is expected to yield ``record-1.xml`` and ``record-2.xml``
    with the timestamps listed in ``records_expected``.
    """

    def read_fixture(filename):
        # Return the full text of one static HTML listing from HTML_DIR.
        with open(f"{HTML_DIR}/{filename}", "r") as handle:
            return handle.read()

    # Scraper names double as the prefix of the fixture files and URL paths.
    scrapers = ("iis", "nginx", "apache")

    # Load the static HTML content that stands in for real HTTP responses.
    folder_pages = {
        scraper: read_fixture(f"{scraper}-folder.html") for scraper in scrapers
    }
    subfolder_pages = {
        scraper: read_fixture(f"{scraper}-subfolder.html") for scraper in scrapers
    }

    # Serve the subfolder listing when the extractor traverses into it.
    for scraper in scrapers:
        httpserver.expect_request(
            f"/{scraper}-folder/subfolder/"
        ).respond_with_data(subfolder_pages[scraper])

    # Let it scrape, traverse and extract the content for every scraper
    # before any assertion is evaluated.
    extracted = {
        scraper: _extract_waf(
            folder_pages[scraper],
            httpserver.url_for(f"/{scraper}-folder/"),
            scraper,
        )
        for scraper in scrapers
    }

    records_expected = [
        ('record-1.xml', '2024-11-07 15:00:00'),
        ('record-2.xml', '2024-11-07 16:59:00'),
    ]

    # Compare on (file name, timestamp) only; the host part of each URL
    # depends on the address the httpserver fixture happens to bind to.
    for scraper in scrapers:
        found = sorted(
            [(os.path.basename(r[0]), r[1]) for r in extracted[scraper]]
        )
        assert records_expected == found
2 changes: 2 additions & 0 deletions dev-requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
pytest-ckan
pytest-httpserver

0 comments on commit 2e515f6

Please sign in to comment.