Merge branch 'feature/domain-restriction' into develop
Showing 6 changed files with 152 additions and 37 deletions.
@@ -127,3 +127,7 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+
+# Sea Spider custom ignore rules
+data/
+config.json
@@ -0,0 +1,6 @@
+{
+    "allow_outside_starting_domain": false,
+    "max_crawl_count": 2000,
+    "max_crawl_depth": 3,
+    "origin_domain": "example.com"
+}
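These settings drive the domain restriction introduced by this merge: when allow_outside_starting_domain is false, extracted links are filtered down to origin_domain, and max_crawl_depth bounds the recursion in crawl_recursively() (last diff below). A minimal sketch of how the filter pattern is built from this config, assuming config.json sits in the working directory; the sample URLs are illustrative only:

import json
import re

# Load the settings shown above (assumes config.json is in the working directory).
with open('config.json', 'r') as config_file:
    config = json.load(config_file)

# Same pattern construction as extract_links_from_html() in the last diff below.
pattern = '^https?://'
if not config['allow_outside_starting_domain']:
    pattern += config['origin_domain']

# Illustrative URLs: only the on-domain link matches the restricted pattern.
for href in ['https://example.com/about', 'https://other.example.org/page']:
    print(href, '->', bool(re.match(pattern, href)))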
@@ -0,0 +1,28 @@
+import glob
+import json
+
+def find_errors():
+    ignore_list = ['data/url_id_map.json']
+    glob_pattern = 'data/*.json'
+    item_count = 0
+    ok_count = 0
+    problem_count = 0
+
+    for item in glob.glob(glob_pattern):
+        with open(item, 'r') as infile:
+            json_data = json.load(infile)
+
+        if 'id' in json_data.keys():
+            item_count += 1
+            response_code = int(json_data['response_code'])
+            url = json_data['url']
+
+            if response_code == 200:
+                ok_count += 1
+            else:
+                problem_count += 1
+
+            print(response_code, ' ', url)
+
+    print('Statistics:\nTotal items: ', item_count, '\nHealthy signals: ', \
+        ok_count, '\nProblems: ', problem_count)
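The find_errors() report is invoked from the crawler's main() once the crawl finishes (last diff below): it scans every data/*.json crawl record, counts records that carry an id key, and tallies 200 versus non-200 response codes before printing summary statistics. A minimal standalone sketch, assuming the module above is importable and using an illustrative record in the shape that crawl_target() writes:

import json
import os

import find_errors

# Write one illustrative crawl record in the shape crawl_target() produces
# (the id, URL and contents here are made up for the example).
os.makedirs('data', exist_ok=True)
record = {
    "id": 1,
    "url": "https://example.com/missing",
    "response_code": 404,
    "timestamp_float": 0.0,
    "text": ""
}
with open('data/1.json', 'w') as outfile:
    json.dump(record, outfile, indent=4)

# Prints the record's status line and the closing statistics.
find_errors.find_errors()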
Binary file not shown.
@@ -1,50 +1,115 @@
 import bs4
+import find_errors
+import glob
 import json
 import re
 import requests
 import sys
 import time
 from tqdm import tqdm

-def check_url(url):
-    url_check_result = {}
-    r = requests.get(url, headers={'User-Agent': 'Sea'})
-    url_check_result['status_code'] = r.status_code
-    print('\n', url_check_result['status_code'], ' ', url)
-    return url_check_result

-def crawl_target(target_url):
-    crawl_result = {}
-    r = requests.get(target_url, headers={'User-Agent': 'Sea'})
-    crawl_result['status_code'] = r.status_code
-    crawl_result['text'] = r.text
-    return crawl_result

-def main():
-    crawl_queue = {}
+def crawl_recursively(url, depth=1):
+    url = url.split('#', 1)[0]
+    max_crawl_depth = get_config_value('max_crawl_depth')

+    if depth <= max_crawl_depth:
+        crawl_target(url)
+        url_id = get_url_id(url)

-    if len(sys.argv) < 2:
-        print('[ERROR] No target URL supplied. Please provide a URL for seaspider to crawl.')
-    else:
-        target_url = sys.argv[1]
-        crawl_result = crawl_target(target_url)
-        print(crawl_result['status_code'], ' ', target_url)
-        soup = bs4.BeautifulSoup(crawl_result['text'], features='html.parser')
-        links = soup.findAll('a', attrs={'href': re.compile('^https?://')})
-        print(len(links), ' links detected')
+        with open('data/' + str(url_id) + '.json') as crawl_file:
+            crawl_json = json.load(crawl_file)

+        crawl_html = crawl_json['text']
+        links = extract_links_from_html(crawl_html)

         for link in links:
             url = link.get('href')

             if not url in crawl_queue.keys():
                 crawl_queue[url] = {}
+            crawl_recursively(link, depth + 1)

+def crawl_target(url):
+    url_id = get_url_id(url)
+    crawl_file_name_pattern = 'data/' + str(url_id) + '.json'
+    crawl_file_exists = len(glob.glob(crawl_file_name_pattern)) > 0

+    if not crawl_file_exists:
+        print('Crawling: ', url)
+        r = requests.get(url, headers={'User-Agent': 'Sea'})
+        crawl_result = {
+            "id": url_id,
+            "url": url,
+            "response_code": r.status_code,
+            "timestamp_float": time.time(),
+            "text": r.text
+        }

+        with open(crawl_file_name_pattern, 'w') as outfile:
+            json.dump(crawl_result, outfile, indent=4)

+def extract_links_from_html(html):
+    allow_outside_starting_domain = get_config_value('allow_outside_starting_domain')
+    origin_domain = get_config_value('origin_domain')
+    soup = bs4.BeautifulSoup(html, features='html.parser')
+    pattern = '^https?://'

+    if not allow_outside_starting_domain:
+        pattern += origin_domain

+    links = soup.findAll('a', attrs={'href': re.compile(pattern)})
+    links_list = []

+    for link in links:
+        url = link.get('href')
+        links_list.append(url)

+    return links_list

+def get_max_url_id():
+    if len(glob.glob('data/url_id_map.json')) > 0:
+        with open('data/url_id_map.json') as url_id_map_file:
+            url_id_map = json.load(url_id_map_file)

-        for key in crawl_queue.keys():
-            print(key)
+        max_id = 0

-        progress_bar_label = 'Crawling ' + str(len(crawl_queue)) + ' URLs'
+        for url_id in url_id_map.keys():
+            if int(url_id) > max_id:
+                max_id = int(url_id)

+        return max_id
+    else:
+        return 0

+def get_url_id(url):
+    if len(glob.glob('data/url_id_map.json')) > 0:
+        with open('data/url_id_map.json', 'r') as url_id_map_file:
+            url_id_map = json.load(url_id_map_file)

+        for url_id in url_id_map.keys():
+            if url_id_map[url_id]['url'] == url:
+                return url_id

+    new_url_id = get_max_url_id() + 1
+    register_new_url_id(new_url_id, url)
+    return new_url_id

+def get_config_value(key):
+    with open('config.json', 'r') as config_file:
+        config_json = json.load(config_file)

+    return config_json[key]

+def register_new_url_id(id, url):
+    if len(glob.glob('data/url_id_map.json')) > 0:
+        with open('data/url_id_map.json', 'r') as url_id_map_file:
+            url_id_map = json.load(url_id_map_file)
+    else:
+        url_id_map = {}

+    url_id_map[id] = {'url': url}

+    with open('data/url_id_map.json', 'w') as url_id_map_file:
+        json.dump(url_id_map, url_id_map_file, indent=4)

+def main():
+    origin_url = 'https://' + get_config_value('origin_domain')
+    crawl_recursively(origin_url)
+    find_errors.find_errors()

-        for key in tqdm(crawl_queue.keys(), desc=progress_bar_label):
-            crawl_queue[key]['crawl_result'] = check_url(key)
-            time.sleep(0.1)

 main()
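The ID bookkeeping above stores each crawl as data/<id>.json and persists the URL-to-ID mapping in data/url_id_map.json via register_new_url_id(); note that integer IDs become string keys once round-tripped through JSON, which is why get_max_url_id() casts them back with int(). A minimal sketch of the map's shape and of the lookup that get_url_id() performs, with illustrative contents:

import json
import os

# Illustrative map in the shape register_new_url_id() writes.
os.makedirs('data', exist_ok=True)
url_id_map = {
    "1": {"url": "https://example.com"},
    "2": {"url": "https://example.com/about"}
}

with open('data/url_id_map.json', 'w') as url_id_map_file:
    json.dump(url_id_map, url_id_map_file, indent=4)

# Mirrors the lookup loop in get_url_id(): scan the map for a matching URL,
# falling back to None instead of registering a new ID.
def lookup_url_id(url):
    with open('data/url_id_map.json') as url_id_map_file:
        url_id_map = json.load(url_id_map_file)

    for url_id in url_id_map.keys():
        if url_id_map[url_id]['url'] == url:
            return url_id

    return None

print(lookup_url_id('https://example.com/about'))  # prints 2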