From bc51f9378149009f3f6933bb71f62133e85e21c6 Mon Sep 17 00:00:00 2001 From: viperior Date: Sun, 21 Feb 2021 13:01:55 -0800 Subject: [PATCH 1/9] Restrict crawling to a specific domain using a 3rd parameter --- seaspider.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/seaspider.py b/seaspider.py index de4afff..0062581 100644 --- a/seaspider.py +++ b/seaspider.py @@ -29,7 +29,13 @@ def main(): crawl_result = crawl_target(target_url) print(crawl_result['status_code'], ' ', target_url) soup = bs4.BeautifulSoup(crawl_result['text'], features='html.parser') - links = soup.findAll('a', attrs={'href': re.compile('^https?://')}) + pattern = '^https?://' + + # Apply domain restriction + if len(sys.argv) >= 3: + pattern += sys.argv[2] + + links = soup.findAll('a', attrs={'href': re.compile(pattern)}) print(len(links), ' links detected') for link in links: From 7f2e990a600860277bbabc9ac94aa33291c92257 Mon Sep 17 00:00:00 2001 From: viperior Date: Sun, 21 Feb 2021 13:02:28 -0800 Subject: [PATCH 2/9] Add docstring to main function --- seaspider.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/seaspider.py b/seaspider.py index 0062581..15e6e13 100644 --- a/seaspider.py +++ b/seaspider.py @@ -20,6 +20,8 @@ def crawl_target(target_url): return crawl_result def main(): + """Crawl a given starting URL, collect all links from its HTML, and then + recursively crawl those links, while avoiding duplicate crawls.""" crawl_queue = {} if len(sys.argv) < 2: From a89a06ac0c5f12d1c364b2a0d5f1406ef01f276d Mon Sep 17 00:00:00 2001 From: viperior Date: Sun, 21 Feb 2021 13:19:12 -0800 Subject: [PATCH 3/9] Move primary logic into crawl_from_origin function --- seaspider.py | 67 +++++++++++++++++++++++++++++----------------------- 1 file changed, 38 insertions(+), 29 deletions(-) diff --git a/seaspider.py b/seaspider.py index 15e6e13..5b0e7a0 100644 --- a/seaspider.py +++ b/seaspider.py @@ -12,6 +12,32 @@ def check_url(url): print('\n', url_check_result['status_code'], ' ', url) return url_check_result +def crawl_from_origin(origin_url, domain_restriction=''): + """Crawl a given starting URL, collect all links from its HTML, and then + recursively crawl those links, while avoiding duplicate crawls.""" + crawl_queue = {} + crawl_result = crawl_target(origin_url) + print(crawl_result['status_code'], ' ', origin_url, ' ORIGIN') + soup = bs4.BeautifulSoup(crawl_result['text'], features='html.parser') + pattern = '^https?://' + domain_restriction + links = soup.findAll('a', attrs={'href': re.compile(pattern)}) + print(len(links), ' links detected') + + for link in links: + url = link.get('href') + + if not url in crawl_queue.keys(): + crawl_queue[url] = {} + + for key in crawl_queue.keys(): + print(key) + + progress_bar_label = 'Crawling ' + str(len(crawl_queue)) + ' URLs' + + for key in tqdm(crawl_queue.keys(), desc=progress_bar_label): + crawl_queue[key]['crawl_result'] = check_url(key) + time.sleep(0.1) + def crawl_target(target_url): crawl_result = {} r = requests.get(target_url, headers={'User-Agent': 'Sea'}) @@ -20,39 +46,22 @@ def crawl_target(target_url): return crawl_result def main(): - """Crawl a given starting URL, collect all links from its HTML, and then - recursively crawl those links, while avoiding duplicate crawls.""" - crawl_queue = {} - if len(sys.argv) < 2: print('[ERROR] No target URL supplied. 
Please provide a URL for seaspider to crawl.') else: - target_url = sys.argv[1] - crawl_result = crawl_target(target_url) - print(crawl_result['status_code'], ' ', target_url) - soup = bs4.BeautifulSoup(crawl_result['text'], features='html.parser') - pattern = '^https?://' - - # Apply domain restriction + origin_url = sys.argv[1] + if len(sys.argv) >= 3: - pattern += sys.argv[2] - - links = soup.findAll('a', attrs={'href': re.compile(pattern)}) - print(len(links), ' links detected') - - for link in links: - url = link.get('href') + domain_restriction = sys.argv[2] + crawl_from_origin(origin_url, domain_restriction) + else: + domain_restriction_warning = 'You are about to crawl with domain'\ + 'restriction. Are you sure? (y/n) ' + user_input = input(domain_restriction_warning) - if not url in crawl_queue.keys(): - crawl_queue[url] = {} - - for key in crawl_queue.keys(): - print(key) - - progress_bar_label = 'Crawling ' + str(len(crawl_queue)) + ' URLs' - - for key in tqdm(crawl_queue.keys(), desc=progress_bar_label): - crawl_queue[key]['crawl_result'] = check_url(key) - time.sleep(0.1) + if user_input == 'y': + crawl_from_origin(origin_url) + + print('Ending session...') main() From 2490fa93c2e902ecedb13da16ed5012e56aec53a Mon Sep 17 00:00:00 2001 From: viperior Date: Sun, 21 Feb 2021 13:25:18 -0800 Subject: [PATCH 4/9] Reduce redundancy in function roles --- seaspider.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/seaspider.py b/seaspider.py index 5b0e7a0..9c871b8 100644 --- a/seaspider.py +++ b/seaspider.py @@ -5,19 +5,11 @@ import time from tqdm import tqdm -def check_url(url): - url_check_result = {} - r = requests.get(url, headers={'User-Agent': 'Sea'}) - url_check_result['status_code'] = r.status_code - print('\n', url_check_result['status_code'], ' ', url) - return url_check_result - def crawl_from_origin(origin_url, domain_restriction=''): """Crawl a given starting URL, collect all links from its HTML, and then recursively crawl those links, while avoiding duplicate crawls.""" crawl_queue = {} crawl_result = crawl_target(origin_url) - print(crawl_result['status_code'], ' ', origin_url, ' ORIGIN') soup = bs4.BeautifulSoup(crawl_result['text'], features='html.parser') pattern = '^https?://' + domain_restriction links = soup.findAll('a', attrs={'href': re.compile(pattern)}) @@ -29,19 +21,17 @@ def crawl_from_origin(origin_url, domain_restriction=''): if not url in crawl_queue.keys(): crawl_queue[url] = {} - for key in crawl_queue.keys(): - print(key) - progress_bar_label = 'Crawling ' + str(len(crawl_queue)) + ' URLs' for key in tqdm(crawl_queue.keys(), desc=progress_bar_label): - crawl_queue[key]['crawl_result'] = check_url(key) + crawl_queue[key]['crawl_result'] = crawl_target(key) time.sleep(0.1) def crawl_target(target_url): crawl_result = {} r = requests.get(target_url, headers={'User-Agent': 'Sea'}) crawl_result['status_code'] = r.status_code + print('\n', crawl_result['status_code'], ' ', target_url) crawl_result['text'] = r.text return crawl_result From de3b32f613ec7a9aec34445b22d60b8ce318c1a9 Mon Sep 17 00:00:00 2001 From: viperior Date: Sun, 21 Feb 2021 14:02:19 -0800 Subject: [PATCH 5/9] Add database directory to .gitignore list --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index b6e4761..7b3385b 100644 --- a/.gitignore +++ b/.gitignore @@ -127,3 +127,6 @@ dmypy.json # Pyre type checker .pyre/ + +# Sea Spider custom ignore rules +db/ From 646fc39fa9c30e6137e86e1cf7c0b72cc5f63a3d Mon Sep 17 
00:00:00 2001 From: viperior Date: Sun, 21 Feb 2021 14:13:17 -0800 Subject: [PATCH 6/9] Add initial setup instructions and example using domain restriction to readme --- README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/README.md b/README.md index 3804e13..56c2ec5 100644 --- a/README.md +++ b/README.md @@ -2,9 +2,21 @@ A humble SEO spider and link checker # Usage +## Initial setup ``` pip install -r requirements.txt +python setup.py +``` + +## Basic example +``` python seaspider.py https://google.com ``` ![Usage example: checking all links on a given web page](Usage-example-screen-recording.gif) + +## Example with domain restriction +You can limit crawling to a specific domain by providing a second parameter, the domain name. +``` +python seaspider.py https://google.com google.com +``` From 470eb9d0e17554331537e5cc5fc026408a1780b1 Mon Sep 17 00:00:00 2001 From: viperior Date: Sun, 21 Feb 2021 14:13:39 -0800 Subject: [PATCH 7/9] Add SQLAlchemy to requirements.txt --- requirements.txt | Bin 298 -> 338 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/requirements.txt b/requirements.txt index 577d67f550a6c4a6aed9a022d4b8676c42ba6f72..8b78c232f09170f4ca4032ef7eea70af118f679a 100644 GIT binary patch delta 45 xcmZ3*bct!g8|7ezKn5QMM}{1RWQGieREAuJN(NgXG-S|YFa}~H2IGm(odE%}3M~Kt delta 10 Rcmcb_w2EoMo5>=Kb^sW11LFVy From d397e34b4981e53f7960526d3d2b88d4fec6522d Mon Sep 17 00:00:00 2001 From: viperior Date: Sun, 21 Feb 2021 14:14:31 -0800 Subject: [PATCH 8/9] Add setup script for sqlite database --- setup.py | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 setup.py diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..05064ba --- /dev/null +++ b/setup.py @@ -0,0 +1,44 @@ +import glob +import os +import sqlalchemy + +def clean_up_database(database_file_path): + if len(glob.glob(database_file_path)) > 0: + os.remove(database_file_path) + +def create_new_sqlite_file(database_file_path): + engine = sqlalchemy.create_engine(database_file_path) + metadata = sqlalchemy.MetaData() + + results = sqlalchemy.Table( + 'results', + metadata, + sqlalchemy.Column( + 'id', + sqlalchemy.Integer(), + primary_key=True + ), + sqlalchemy.Column( + 'url', + sqlalchemy.String(2048), + nullable=False + ), + sqlalchemy.Column( + 'last_crawl_timestamp', + sqlalchemy.DateTime() + ), + sqlalchemy.Column( + 'last_crawl_status_code', + sqlalchemy.Integer(), + nullable=False + ) + ) + + metadata.create_all(engine) #Create the table + +def main(): + database_file_path = 'sqlite:///db/seaspider.sqlite' + clean_up_database(database_file_path) + create_new_sqlite_file(database_file_path) + +main() From ccc7408c067ee0851f93b5071a6f358ff344a712 Mon Sep 17 00:00:00 2001 From: viperior Date: Sun, 21 Feb 2021 18:46:45 -0800 Subject: [PATCH 9/9] Cache crawl results to JSON files --- .gitignore | 3 +- config-sample.json | 6 ++ find_errors.py | 28 +++++++++ requirements.txt | Bin 338 -> 270 bytes seaspider.py | 140 ++++++++++++++++++++++++++++++++------------- setup.py | 44 -------------- 6 files changed, 135 insertions(+), 86 deletions(-) create mode 100644 config-sample.json create mode 100644 find_errors.py delete mode 100644 setup.py diff --git a/.gitignore b/.gitignore index 7b3385b..3d553d5 100644 --- a/.gitignore +++ b/.gitignore @@ -129,4 +129,5 @@ dmypy.json .pyre/ # Sea Spider custom ignore rules -db/ +data/ +config.json diff --git a/config-sample.json b/config-sample.json new file mode 100644 index 0000000..0992b3d --- 
/dev/null +++ b/config-sample.json @@ -0,0 +1,6 @@ +{ + "allow_outside_starting_domain": false, + "max_crawl_count": 2000, + "max_crawl_depth": 3, + "origin_domain": "example.com" +} \ No newline at end of file diff --git a/find_errors.py b/find_errors.py new file mode 100644 index 0000000..51a0310 --- /dev/null +++ b/find_errors.py @@ -0,0 +1,28 @@ +import glob +import json + +def find_errors(): + ignore_list = ['data/url_id_map.json'] + glob_pattern = 'data/*.json' + item_count = 0 + ok_count = 0 + problem_count = 0 + + for item in glob.glob(glob_pattern): + with open(item, 'r') as infile: + json_data = json.load(infile) + + if 'id' in json_data.keys(): + item_count += 1 + response_code = int(json_data['response_code']) + url = json_data['url'] + + if response_code == 200: + ok_count += 1 + else: + problem_count += 1 + + print(response_code, ' ', url) + + print('Statistics:\nTotal items: ', item_count, '\nHealthy signals: ', \ + ok_count, '\nProblems: ', problem_count) diff --git a/requirements.txt b/requirements.txt index 8b78c232f09170f4ca4032ef7eea70af118f679a..b5a91cefa66a7a2df90e814b18f46040aeaa6305 100644 GIT binary patch delta 10 Rcmcb_)W)%71IGXW delta 73 zcmeBUy2Lc$jeRgfAcGHsBSQ{DGD8MKDnl+qC4(&x8ZziH7z42pkmP0HVkluKWJm$Z Sg5*qq;-(DdK+<61b9DeJNDYJl diff --git a/seaspider.py b/seaspider.py index 9c871b8..a50855f 100644 --- a/seaspider.py +++ b/seaspider.py @@ -1,57 +1,115 @@ import bs4 +import find_errors +import glob +import json import re import requests import sys import time -from tqdm import tqdm - -def crawl_from_origin(origin_url, domain_restriction=''): - """Crawl a given starting URL, collect all links from its HTML, and then - recursively crawl those links, while avoiding duplicate crawls.""" - crawl_queue = {} - crawl_result = crawl_target(origin_url) - soup = bs4.BeautifulSoup(crawl_result['text'], features='html.parser') - pattern = '^https?://' + domain_restriction + +def crawl_recursively(url, depth=1): + url = url.split('#', 1)[0] + max_crawl_depth = get_config_value('max_crawl_depth') + + if depth <= max_crawl_depth: + crawl_target(url) + url_id = get_url_id(url) + + with open('data/' + str(url_id) + '.json') as crawl_file: + crawl_json = json.load(crawl_file) + + crawl_html = crawl_json['text'] + links = extract_links_from_html(crawl_html) + + for link in links: + crawl_recursively(link, depth + 1) + +def crawl_target(url): + url_id = get_url_id(url) + crawl_file_name_pattern = 'data/' + str(url_id) + '.json' + crawl_file_exists = len(glob.glob(crawl_file_name_pattern)) > 0 + + if not crawl_file_exists: + print('Crawling: ', url) + r = requests.get(url, headers={'User-Agent': 'Sea'}) + crawl_result = { + "id": url_id, + "url": url, + "response_code": r.status_code, + "timestamp_float": time.time(), + "text": r.text + } + + with open(crawl_file_name_pattern, 'w') as outfile: + json.dump(crawl_result, outfile, indent=4) + +def extract_links_from_html(html): + allow_outside_starting_domain = get_config_value('allow_outside_starting_domain') + origin_domain = get_config_value('origin_domain') + soup = bs4.BeautifulSoup(html, features='html.parser') + pattern = '^https?://' + + if not allow_outside_starting_domain: + pattern += origin_domain + links = soup.findAll('a', attrs={'href': re.compile(pattern)}) - print(len(links), ' links detected') + links_list = [] for link in links: url = link.get('href') - - if not url in crawl_queue.keys(): - crawl_queue[url] = {} + links_list.append(url) - progress_bar_label = 'Crawling ' + str(len(crawl_queue)) + ' URLs' + return 
links_list - for key in tqdm(crawl_queue.keys(), desc=progress_bar_label): - crawl_queue[key]['crawl_result'] = crawl_target(key) - time.sleep(0.1) +def get_max_url_id(): + if len(glob.glob('data/url_id_map.json')) > 0: + with open('data/url_id_map.json') as url_id_map_file: + url_id_map = json.load(url_id_map_file) -def crawl_target(target_url): - crawl_result = {} - r = requests.get(target_url, headers={'User-Agent': 'Sea'}) - crawl_result['status_code'] = r.status_code - print('\n', crawl_result['status_code'], ' ', target_url) - crawl_result['text'] = r.text - return crawl_result + max_id = 0 -def main(): - if len(sys.argv) < 2: - print('[ERROR] No target URL supplied. Please provide a URL for seaspider to crawl.') + for url_id in url_id_map.keys(): + if int(url_id) > max_id: + max_id = int(url_id) + + return max_id else: - origin_url = sys.argv[1] + return 0 + +def get_url_id(url): + if len(glob.glob('data/url_id_map.json')) > 0: + with open('data/url_id_map.json', 'r') as url_id_map_file: + url_id_map = json.load(url_id_map_file) + + for url_id in url_id_map.keys(): + if url_id_map[url_id]['url'] == url: + return url_id - if len(sys.argv) >= 3: - domain_restriction = sys.argv[2] - crawl_from_origin(origin_url, domain_restriction) - else: - domain_restriction_warning = 'You are about to crawl with domain'\ - 'restriction. Are you sure? (y/n) ' - user_input = input(domain_restriction_warning) - - if user_input == 'y': - crawl_from_origin(origin_url) - - print('Ending session...') - + new_url_id = get_max_url_id() + 1 + register_new_url_id(new_url_id, url) + return new_url_id + +def get_config_value(key): + with open('config.json', 'r') as config_file: + config_json = json.load(config_file) + + return config_json[key] + +def register_new_url_id(id, url): + if len(glob.glob('data/url_id_map.json')) > 0: + with open('data/url_id_map.json', 'r') as url_id_map_file: + url_id_map = json.load(url_id_map_file) + else: + url_id_map = {} + + url_id_map[id] = {'url': url} + + with open('data/url_id_map.json', 'w') as url_id_map_file: + json.dump(url_id_map, url_id_map_file, indent=4) + +def main(): + origin_url = 'https://' + get_config_value('origin_domain') + crawl_recursively(origin_url) + find_errors.find_errors() + main() diff --git a/setup.py b/setup.py deleted file mode 100644 index 05064ba..0000000 --- a/setup.py +++ /dev/null @@ -1,44 +0,0 @@ -import glob -import os -import sqlalchemy - -def clean_up_database(database_file_path): - if len(glob.glob(database_file_path)) > 0: - os.remove(database_file_path) - -def create_new_sqlite_file(database_file_path): - engine = sqlalchemy.create_engine(database_file_path) - metadata = sqlalchemy.MetaData() - - results = sqlalchemy.Table( - 'results', - metadata, - sqlalchemy.Column( - 'id', - sqlalchemy.Integer(), - primary_key=True - ), - sqlalchemy.Column( - 'url', - sqlalchemy.String(2048), - nullable=False - ), - sqlalchemy.Column( - 'last_crawl_timestamp', - sqlalchemy.DateTime() - ), - sqlalchemy.Column( - 'last_crawl_status_code', - sqlalchemy.Integer(), - nullable=False - ) - ) - - metadata.create_all(engine) #Create the table - -def main(): - database_file_path = 'sqlite:///db/seaspider.sqlite' - clean_up_database(database_file_path) - create_new_sqlite_file(database_file_path) - -main()
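
A note for anyone trying the series out: after patch 9 the crawler is driven entirely by `config.json` (git-ignored by the updated `.gitignore`) instead of command-line arguments, and crawl results are cached as `data/<id>.json` files, while the README added in patch 6 still documents the earlier `setup.py`/CLI workflow. The sketch below is a hypothetical bootstrap for the post-patch-9 layout, not part of the patches: the `example.com` origin is a placeholder, and the explicit `data/` directory creation reflects that `crawl_target()` and `register_new_url_id()` open files under `data/` without creating the directory themselves.

```python
# Hypothetical bootstrap for the post-patch-9 workflow. Assumes it runs from the
# repository root; "example.com" is a placeholder origin domain.
import json
import os
import shutil

# seaspider.get_config_value() reads config.json, which is git-ignored,
# so start from the sample shipped in patch 9.
shutil.copy('config-sample.json', 'config.json')

with open('config.json') as config_file:
    config = json.load(config_file)
config['origin_domain'] = 'example.com'  # assumption: the site you actually want to crawl
with open('config.json', 'w') as config_file:
    json.dump(config, config_file, indent=4)

# Crawl results (data/<id>.json) and data/url_id_map.json are written here.
os.makedirs('data', exist_ok=True)

# Then run the crawler, which also prints the find_errors summary at the end:
#   python seaspider.py
```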
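
The domain restriction introduced in patch 1 and carried into patch 9's `extract_links_from_html()` works by concatenating the raw domain onto `'^https?://'` and filtering anchor tags with that regex. The snippet below is a self-contained illustration of that filter using the same pattern construction as `seaspider.py`; the HTML fragment and the `example.com` restriction are made-up inputs for demonstration only.

```python
# Minimal sketch of the domain-restricted link extraction from patches 1 and 9.
# The HTML snippet and the "example.com" restriction are illustrative assumptions.
import re

import bs4

html = """
<a href="https://example.com/a">in-domain, absolute URL</a>
<a href="https://other.org/b">out-of-domain URL</a>
<a href="/relative">relative URL (never matched by the ^https?:// anchor)</a>
"""

domain_restriction = 'example.com'           # sys.argv[2] in patch 1, origin_domain in patch 9
pattern = '^https?://' + domain_restriction  # same pattern construction as seaspider.py

soup = bs4.BeautifulSoup(html, features='html.parser')
links = soup.findAll('a', attrs={'href': re.compile(pattern)})
print([link.get('href') for link in links])  # -> ['https://example.com/a']
```

Because the domain is spliced into the regex verbatim and only a prefix of the href has to match, the dot in `example.com` behaves as an "any character" wildcard, so hrefs such as `https://example.community/...` would also pass the filter; wrapping the domain in `re.escape()` would tighten it if that ever matters.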