From bc51f9378149009f3f6933bb71f62133e85e21c6 Mon Sep 17 00:00:00 2001 From: viperior Date: Sun, 21 Feb 2021 13:01:55 -0800 Subject: [PATCH 1/9] Restrict crawling to a specific domain using a 3rd parameter --- seaspider.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/seaspider.py b/seaspider.py index de4afff..0062581 100644 --- a/seaspider.py +++ b/seaspider.py @@ -29,7 +29,13 @@ def main(): crawl_result = crawl_target(target_url) print(crawl_result['status_code'], ' ', target_url) soup = bs4.BeautifulSoup(crawl_result['text'], features='html.parser') - links = soup.findAll('a', attrs={'href': re.compile('^https?://')}) + pattern = '^https?://' + + # Apply domain restriction + if len(sys.argv) >= 3: + pattern += sys.argv[2] + + links = soup.findAll('a', attrs={'href': re.compile(pattern)}) print(len(links), ' links detected') for link in links: From 7f2e990a600860277bbabc9ac94aa33291c92257 Mon Sep 17 00:00:00 2001 From: viperior Date: Sun, 21 Feb 2021 13:02:28 -0800 Subject: [PATCH 2/9] Add docstring to main function --- seaspider.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/seaspider.py b/seaspider.py index 0062581..15e6e13 100644 --- a/seaspider.py +++ b/seaspider.py @@ -20,6 +20,8 @@ def crawl_target(target_url): return crawl_result def main(): + """Crawl a given starting URL, collect all links from its HTML, and then + recursively crawl those links, while avoiding duplicate crawls.""" crawl_queue = {} if len(sys.argv) < 2: From a89a06ac0c5f12d1c364b2a0d5f1406ef01f276d Mon Sep 17 00:00:00 2001 From: viperior Date: Sun, 21 Feb 2021 13:19:12 -0800 Subject: [PATCH 3/9] Move primary logic into crawl_from_origin function --- seaspider.py | 67 +++++++++++++++++++++++++++++----------------------- 1 file changed, 38 insertions(+), 29 deletions(-) diff --git a/seaspider.py b/seaspider.py index 15e6e13..5b0e7a0 100644 --- a/seaspider.py +++ b/seaspider.py @@ -12,6 +12,32 @@ def check_url(url): print('\n', url_check_result['status_code'], ' ', url) return url_check_result +def crawl_from_origin(origin_url, domain_restriction=''): + """Crawl a given starting URL, collect all links from its HTML, and then + recursively crawl those links, while avoiding duplicate crawls.""" + crawl_queue = {} + crawl_result = crawl_target(origin_url) + print(crawl_result['status_code'], ' ', origin_url, ' ORIGIN') + soup = bs4.BeautifulSoup(crawl_result['text'], features='html.parser') + pattern = '^https?://' + domain_restriction + links = soup.findAll('a', attrs={'href': re.compile(pattern)}) + print(len(links), ' links detected') + + for link in links: + url = link.get('href') + + if not url in crawl_queue.keys(): + crawl_queue[url] = {} + + for key in crawl_queue.keys(): + print(key) + + progress_bar_label = 'Crawling ' + str(len(crawl_queue)) + ' URLs' + + for key in tqdm(crawl_queue.keys(), desc=progress_bar_label): + crawl_queue[key]['crawl_result'] = check_url(key) + time.sleep(0.1) + def crawl_target(target_url): crawl_result = {} r = requests.get(target_url, headers={'User-Agent': 'Sea'}) @@ -20,39 +46,22 @@ def crawl_target(target_url): return crawl_result def main(): - """Crawl a given starting URL, collect all links from its HTML, and then - recursively crawl those links, while avoiding duplicate crawls.""" - crawl_queue = {} - if len(sys.argv) < 2: print('[ERROR] No target URL supplied. 
Please provide a URL for seaspider to crawl.') else: - target_url = sys.argv[1] - crawl_result = crawl_target(target_url) - print(crawl_result['status_code'], ' ', target_url) - soup = bs4.BeautifulSoup(crawl_result['text'], features='html.parser') - pattern = '^https?://' - - # Apply domain restriction + origin_url = sys.argv[1] + if len(sys.argv) >= 3: - pattern += sys.argv[2] - - links = soup.findAll('a', attrs={'href': re.compile(pattern)}) - print(len(links), ' links detected') - - for link in links: - url = link.get('href') + domain_restriction = sys.argv[2] + crawl_from_origin(origin_url, domain_restriction) + else: + domain_restriction_warning = 'You are about to crawl with domain'\ + 'restriction. Are you sure? (y/n) ' + user_input = input(domain_restriction_warning) - if not url in crawl_queue.keys(): - crawl_queue[url] = {} - - for key in crawl_queue.keys(): - print(key) - - progress_bar_label = 'Crawling ' + str(len(crawl_queue)) + ' URLs' - - for key in tqdm(crawl_queue.keys(), desc=progress_bar_label): - crawl_queue[key]['crawl_result'] = check_url(key) - time.sleep(0.1) + if user_input == 'y': + crawl_from_origin(origin_url) + + print('Ending session...') main() From 2490fa93c2e902ecedb13da16ed5012e56aec53a Mon Sep 17 00:00:00 2001 From: viperior Date: Sun, 21 Feb 2021 13:25:18 -0800 Subject: [PATCH 4/9] Reduce redundancy in function roles --- seaspider.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/seaspider.py b/seaspider.py index 5b0e7a0..9c871b8 100644 --- a/seaspider.py +++ b/seaspider.py @@ -5,19 +5,11 @@ import time from tqdm import tqdm -def check_url(url): - url_check_result = {} - r = requests.get(url, headers={'User-Agent': 'Sea'}) - url_check_result['status_code'] = r.status_code - print('\n', url_check_result['status_code'], ' ', url) - return url_check_result - def crawl_from_origin(origin_url, domain_restriction=''): """Crawl a given starting URL, collect all links from its HTML, and then recursively crawl those links, while avoiding duplicate crawls.""" crawl_queue = {} crawl_result = crawl_target(origin_url) - print(crawl_result['status_code'], ' ', origin_url, ' ORIGIN') soup = bs4.BeautifulSoup(crawl_result['text'], features='html.parser') pattern = '^https?://' + domain_restriction links = soup.findAll('a', attrs={'href': re.compile(pattern)}) @@ -29,19 +21,17 @@ def crawl_from_origin(origin_url, domain_restriction=''): if not url in crawl_queue.keys(): crawl_queue[url] = {} - for key in crawl_queue.keys(): - print(key) - progress_bar_label = 'Crawling ' + str(len(crawl_queue)) + ' URLs' for key in tqdm(crawl_queue.keys(), desc=progress_bar_label): - crawl_queue[key]['crawl_result'] = check_url(key) + crawl_queue[key]['crawl_result'] = crawl_target(key) time.sleep(0.1) def crawl_target(target_url): crawl_result = {} r = requests.get(target_url, headers={'User-Agent': 'Sea'}) crawl_result['status_code'] = r.status_code + print('\n', crawl_result['status_code'], ' ', target_url) crawl_result['text'] = r.text return crawl_result From de3b32f613ec7a9aec34445b22d60b8ce318c1a9 Mon Sep 17 00:00:00 2001 From: viperior Date: Sun, 21 Feb 2021 14:02:19 -0800 Subject: [PATCH 5/9] Add database directory to .gitignore list --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index b6e4761..7b3385b 100644 --- a/.gitignore +++ b/.gitignore @@ -127,3 +127,6 @@ dmypy.json # Pyre type checker .pyre/ + +# Sea Spider custom ignore rules +db/ From 646fc39fa9c30e6137e86e1cf7c0b72cc5f63a3d Mon Sep 17 
00:00:00 2001 From: viperior Date: Sun, 21 Feb 2021 14:13:17 -0800 Subject: [PATCH 6/9] Add initial setup instructions and example using domain restriction to readme --- README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/README.md b/README.md index 3804e13..56c2ec5 100644 --- a/README.md +++ b/README.md @@ -2,9 +2,21 @@ A humble SEO spider and link checker # Usage +## Initial setup ``` pip install -r requirements.txt +python setup.py +``` + +## Basic example +``` python seaspider.py https://google.com ``` ![Usage example: checking all links on a given web page](Usage-example-screen-recording.gif) + +## Example with domain restriction +You can limit crawling to a specific domain by providing a second parameter, the domain name. +``` +python seaspider.py https://google.com google.com +``` From 470eb9d0e17554331537e5cc5fc026408a1780b1 Mon Sep 17 00:00:00 2001 From: viperior Date: Sun, 21 Feb 2021 14:13:39 -0800 Subject: [PATCH 7/9] Add SQLAlchemy to requirements.txt --- requirements.txt | Bin 298 -> 338 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/requirements.txt b/requirements.txt index 577d67f550a6c4a6aed9a022d4b8676c42ba6f72..8b78c232f09170f4ca4032ef7eea70af118f679a 100644 GIT binary patch delta 45 xcmZ3*bct!g8|7ezKn5QMM}{1RWQGieREAuJN(NgXG-S|YFa}~H2IGm(odE%}3M~Kt delta 10 Rcmcb_w2EoMo5>=Kb^sW11LFVy From d397e34b4981e53f7960526d3d2b88d4fec6522d Mon Sep 17 00:00:00 2001 From: viperior Date: Sun, 21 Feb 2021 14:14:31 -0800 Subject: [PATCH 8/9] Add setup script for sqlite database --- setup.py | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 setup.py diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..05064ba --- /dev/null +++ b/setup.py @@ -0,0 +1,44 @@ +import glob +import os +import sqlalchemy + +def clean_up_database(database_file_path): + if len(glob.glob(database_file_path)) > 0: + os.remove(database_file_path) + +def create_new_sqlite_file(database_file_path): + engine = sqlalchemy.create_engine(database_file_path) + metadata = sqlalchemy.MetaData() + + results = sqlalchemy.Table( + 'results', + metadata, + sqlalchemy.Column( + 'id', + sqlalchemy.Integer(), + primary_key=True + ), + sqlalchemy.Column( + 'url', + sqlalchemy.String(2048), + nullable=False + ), + sqlalchemy.Column( + 'last_crawl_timestamp', + sqlalchemy.DateTime() + ), + sqlalchemy.Column( + 'last_crawl_status_code', + sqlalchemy.Integer(), + nullable=False + ) + ) + + metadata.create_all(engine) #Create the table + +def main(): + database_file_path = 'sqlite:///db/seaspider.sqlite' + clean_up_database(database_file_path) + create_new_sqlite_file(database_file_path) + +main() From ccc7408c067ee0851f93b5071a6f358ff344a712 Mon Sep 17 00:00:00 2001 From: viperior Date: Sun, 21 Feb 2021 18:46:45 -0800 Subject: [PATCH 9/9] Cache crawl results to JSON files --- .gitignore | 3 +- config-sample.json | 6 ++ find_errors.py | 28 +++++++++ requirements.txt | Bin 338 -> 270 bytes seaspider.py | 140 ++++++++++++++++++++++++++++++++------------- setup.py | 44 -------------- 6 files changed, 135 insertions(+), 86 deletions(-) create mode 100644 config-sample.json create mode 100644 find_errors.py delete mode 100644 setup.py diff --git a/.gitignore b/.gitignore index 7b3385b..3d553d5 100644 --- a/.gitignore +++ b/.gitignore @@ -129,4 +129,5 @@ dmypy.json .pyre/ # Sea Spider custom ignore rules -db/ +data/ +config.json diff --git a/config-sample.json b/config-sample.json new file mode 100644 index 0000000..0992b3d --- 
/dev/null +++ b/config-sample.json @@ -0,0 +1,6 @@ +{ + "allow_outside_starting_domain": false, + "max_crawl_count": 2000, + "max_crawl_depth": 3, + "origin_domain": "example.com" +} \ No newline at end of file diff --git a/find_errors.py b/find_errors.py new file mode 100644 index 0000000..51a0310 --- /dev/null +++ b/find_errors.py @@ -0,0 +1,28 @@ +import glob +import json + +def find_errors(): + ignore_list = ['data/url_id_map.json'] + glob_pattern = 'data/*.json' + item_count = 0 + ok_count = 0 + problem_count = 0 + + for item in glob.glob(glob_pattern): + with open(item, 'r') as infile: + json_data = json.load(infile) + + if 'id' in json_data.keys(): + item_count += 1 + response_code = int(json_data['response_code']) + url = json_data['url'] + + if response_code == 200: + ok_count += 1 + else: + problem_count += 1 + + print(response_code, ' ', url) + + print('Statistics:\nTotal items: ', item_count, '\nHealthy signals: ', \ + ok_count, '\nProblems: ', problem_count) diff --git a/requirements.txt b/requirements.txt index 8b78c232f09170f4ca4032ef7eea70af118f679a..b5a91cefa66a7a2df90e814b18f46040aeaa6305 100644 GIT binary patch delta 10 Rcmcb_)W)%71IGXW delta 73 zcmeBUy2Lc$jeRgfAcGHsBSQ{DGD8MKDnl+qC4(&x8ZziH7z42pkmP0HVkluKWJm$Z Sg5*qq;-(DdK+<61b9DeJNDYJl diff --git a/seaspider.py b/seaspider.py index 9c871b8..a50855f 100644 --- a/seaspider.py +++ b/seaspider.py @@ -1,57 +1,115 @@ import bs4 +import find_errors +import glob +import json import re import requests import sys import time -from tqdm import tqdm - -def crawl_from_origin(origin_url, domain_restriction=''): - """Crawl a given starting URL, collect all links from its HTML, and then - recursively crawl those links, while avoiding duplicate crawls.""" - crawl_queue = {} - crawl_result = crawl_target(origin_url) - soup = bs4.BeautifulSoup(crawl_result['text'], features='html.parser') - pattern = '^https?://' + domain_restriction + +def crawl_recursively(url, depth=1): + url = url.split('#', 1)[0] + max_crawl_depth = get_config_value('max_crawl_depth') + + if depth <= max_crawl_depth: + crawl_target(url) + url_id = get_url_id(url) + + with open('data/' + str(url_id) + '.json') as crawl_file: + crawl_json = json.load(crawl_file) + + crawl_html = crawl_json['text'] + links = extract_links_from_html(crawl_html) + + for link in links: + crawl_recursively(link, depth + 1) + +def crawl_target(url): + url_id = get_url_id(url) + crawl_file_name_pattern = 'data/' + str(url_id) + '.json' + crawl_file_exists = len(glob.glob(crawl_file_name_pattern)) > 0 + + if not crawl_file_exists: + print('Crawling: ', url) + r = requests.get(url, headers={'User-Agent': 'Sea'}) + crawl_result = { + "id": url_id, + "url": url, + "response_code": r.status_code, + "timestamp_float": time.time(), + "text": r.text + } + + with open(crawl_file_name_pattern, 'w') as outfile: + json.dump(crawl_result, outfile, indent=4) + +def extract_links_from_html(html): + allow_outside_starting_domain = get_config_value('allow_outside_starting_domain') + origin_domain = get_config_value('origin_domain') + soup = bs4.BeautifulSoup(html, features='html.parser') + pattern = '^https?://' + + if not allow_outside_starting_domain: + pattern += origin_domain + links = soup.findAll('a', attrs={'href': re.compile(pattern)}) - print(len(links), ' links detected') + links_list = [] for link in links: url = link.get('href') - - if not url in crawl_queue.keys(): - crawl_queue[url] = {} + links_list.append(url) - progress_bar_label = 'Crawling ' + str(len(crawl_queue)) + ' URLs' + return 
links_list - for key in tqdm(crawl_queue.keys(), desc=progress_bar_label): - crawl_queue[key]['crawl_result'] = crawl_target(key) - time.sleep(0.1) +def get_max_url_id(): + if len(glob.glob('data/url_id_map.json')) > 0: + with open('data/url_id_map.json') as url_id_map_file: + url_id_map = json.load(url_id_map_file) -def crawl_target(target_url): - crawl_result = {} - r = requests.get(target_url, headers={'User-Agent': 'Sea'}) - crawl_result['status_code'] = r.status_code - print('\n', crawl_result['status_code'], ' ', target_url) - crawl_result['text'] = r.text - return crawl_result + max_id = 0 -def main(): - if len(sys.argv) < 2: - print('[ERROR] No target URL supplied. Please provide a URL for seaspider to crawl.') + for url_id in url_id_map.keys(): + if int(url_id) > max_id: + max_id = int(url_id) + + return max_id else: - origin_url = sys.argv[1] + return 0 + +def get_url_id(url): + if len(glob.glob('data/url_id_map.json')) > 0: + with open('data/url_id_map.json', 'r') as url_id_map_file: + url_id_map = json.load(url_id_map_file) + + for url_id in url_id_map.keys(): + if url_id_map[url_id]['url'] == url: + return url_id - if len(sys.argv) >= 3: - domain_restriction = sys.argv[2] - crawl_from_origin(origin_url, domain_restriction) - else: - domain_restriction_warning = 'You are about to crawl with domain'\ - 'restriction. Are you sure? (y/n) ' - user_input = input(domain_restriction_warning) - - if user_input == 'y': - crawl_from_origin(origin_url) - - print('Ending session...') - + new_url_id = get_max_url_id() + 1 + register_new_url_id(new_url_id, url) + return new_url_id + +def get_config_value(key): + with open('config.json', 'r') as config_file: + config_json = json.load(config_file) + + return config_json[key] + +def register_new_url_id(id, url): + if len(glob.glob('data/url_id_map.json')) > 0: + with open('data/url_id_map.json', 'r') as url_id_map_file: + url_id_map = json.load(url_id_map_file) + else: + url_id_map = {} + + url_id_map[id] = {'url': url} + + with open('data/url_id_map.json', 'w') as url_id_map_file: + json.dump(url_id_map, url_id_map_file, indent=4) + +def main(): + origin_url = 'https://' + get_config_value('origin_domain') + crawl_recursively(origin_url) + find_errors.find_errors() + main() diff --git a/setup.py b/setup.py deleted file mode 100644 index 05064ba..0000000 --- a/setup.py +++ /dev/null @@ -1,44 +0,0 @@ -import glob -import os -import sqlalchemy - -def clean_up_database(database_file_path): - if len(glob.glob(database_file_path)) > 0: - os.remove(database_file_path) - -def create_new_sqlite_file(database_file_path): - engine = sqlalchemy.create_engine(database_file_path) - metadata = sqlalchemy.MetaData() - - results = sqlalchemy.Table( - 'results', - metadata, - sqlalchemy.Column( - 'id', - sqlalchemy.Integer(), - primary_key=True - ), - sqlalchemy.Column( - 'url', - sqlalchemy.String(2048), - nullable=False - ), - sqlalchemy.Column( - 'last_crawl_timestamp', - sqlalchemy.DateTime() - ), - sqlalchemy.Column( - 'last_crawl_status_code', - sqlalchemy.Integer(), - nullable=False - ) - ) - - metadata.create_all(engine) #Create the table - -def main(): - database_file_path = 'sqlite:///db/seaspider.sqlite' - clean_up_database(database_file_path) - create_new_sqlite_file(database_file_path) - -main()
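
A note for anyone trying the series out: after patch 9 the crawler is driven entirely by `config.json` (git-ignored by the updated `.gitignore`) instead of command-line arguments, and crawl results are cached as `data/<id>.json` files, while the README added in patch 6 still documents the earlier `setup.py`/CLI workflow. The sketch below is a hypothetical bootstrap for the post-patch-9 layout, not part of the patches: the `example.com` origin is a placeholder, and the explicit `data/` directory creation reflects that `crawl_target()` and `register_new_url_id()` open files under `data/` without creating the directory themselves.

```python
# Hypothetical bootstrap for the post-patch-9 workflow. Assumes it runs from the
# repository root; "example.com" is a placeholder origin domain.
import json
import os
import shutil

# seaspider.get_config_value() reads config.json, which is git-ignored,
# so start from the sample shipped in patch 9.
shutil.copy('config-sample.json', 'config.json')

with open('config.json') as config_file:
    config = json.load(config_file)
config['origin_domain'] = 'example.com'  # assumption: the site you actually want to crawl
with open('config.json', 'w') as config_file:
    json.dump(config, config_file, indent=4)

# Crawl results (data/<id>.json) and data/url_id_map.json are written here.
os.makedirs('data', exist_ok=True)

# Then run the crawler, which also prints the find_errors summary at the end:
#   python seaspider.py
```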
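
The domain restriction introduced in patch 1 and carried into patch 9's `extract_links_from_html()` works by concatenating the raw domain onto `'^https?://'` and filtering anchor tags with that regex. The snippet below is a self-contained illustration of that filter using the same pattern construction as `seaspider.py`; the HTML fragment and the `example.com` restriction are made-up inputs for demonstration only.

```python
# Minimal sketch of the domain-restricted link extraction from patches 1 and 9.
# The HTML snippet and the "example.com" restriction are illustrative assumptions.
import re

import bs4

html = """
<a href="https://example.com/a">in-domain, absolute URL</a>
<a href="https://other.org/b">out-of-domain URL</a>
<a href="/relative">relative URL (never matched by the ^https?:// anchor)</a>
"""

domain_restriction = 'example.com'           # sys.argv[2] in patch 1, origin_domain in patch 9
pattern = '^https?://' + domain_restriction  # same pattern construction as seaspider.py

soup = bs4.BeautifulSoup(html, features='html.parser')
links = soup.findAll('a', attrs={'href': re.compile(pattern)})
print([link.get('href') for link in links])  # -> ['https://example.com/a']
```

Because the domain is spliced into the regex verbatim and only a prefix of the href has to match, the dot in `example.com` behaves as an "any character" wildcard, so hrefs such as `https://example.community/...` would also pass the filter; wrapping the domain in `re.escape()` would tighten it if that ever matters.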