Merge branch 'feature/domain-restriction' into develop
Showing 6 changed files with 152 additions and 37 deletions.
@@ -127,3 +127,7 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+
+# Sea Spider custom ignore rules
+data/
+config.json
@@ -0,0 +1,6 @@
+{
+    "allow_outside_starting_domain": false,
+    "max_crawl_count": 2000,
+    "max_crawl_depth": 3,
+    "origin_domain": "example.com"
+}
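These settings drive the domain restriction introduced by this merge: when allow_outside_starting_domain is false, extracted links are filtered down to origin_domain, and max_crawl_depth bounds the recursion in crawl_recursively() (last diff below). A minimal sketch of how the filter pattern is built from this config, assuming config.json sits in the working directory; the sample URLs are illustrative only:

import json
import re

# Load the settings shown above (assumes config.json is in the working directory).
with open('config.json', 'r') as config_file:
    config = json.load(config_file)

# Same pattern construction as extract_links_from_html() in the last diff below.
pattern = '^https?://'
if not config['allow_outside_starting_domain']:
    pattern += config['origin_domain']

# Illustrative URLs: only the on-domain link matches the restricted pattern.
for href in ['https://example.com/about', 'https://other.example.org/page']:
    print(href, '->', bool(re.match(pattern, href)))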
@@ -0,0 +1,28 @@
+import glob
+import json
+
+def find_errors():
+    ignore_list = ['data/url_id_map.json']
+    glob_pattern = 'data/*.json'
+    item_count = 0
+    ok_count = 0
+    problem_count = 0
+
+    for item in glob.glob(glob_pattern):
+        with open(item, 'r') as infile:
+            json_data = json.load(infile)
+
+        if 'id' in json_data.keys():
+            item_count += 1
+            response_code = int(json_data['response_code'])
+            url = json_data['url']
+
+            if response_code == 200:
+                ok_count += 1
+            else:
+                problem_count += 1
+
+            print(response_code, ' ', url)
+
+    print('Statistics:\nTotal items: ', item_count, '\nHealthy signals: ', \
+        ok_count, '\nProblems: ', problem_count)
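The find_errors() report is invoked from the crawler's main() once the crawl finishes (last diff below): it scans every data/*.json crawl record, counts records that carry an id key, and tallies 200 versus non-200 response codes before printing summary statistics. A minimal standalone sketch, assuming the module above is importable and using an illustrative record in the shape that crawl_target() writes:

import json
import os

import find_errors

# Write one illustrative crawl record in the shape crawl_target() produces
# (the id, URL and contents here are made up for the example).
os.makedirs('data', exist_ok=True)
record = {
    "id": 1,
    "url": "https://example.com/missing",
    "response_code": 404,
    "timestamp_float": 0.0,
    "text": ""
}
with open('data/1.json', 'w') as outfile:
    json.dump(record, outfile, indent=4)

# Prints the record's status line and the closing statistics.
find_errors.find_errors()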
Binary file not shown.
@@ -1,50 +1,115 @@
 import bs4
+import find_errors
+import glob
 import json
 import re
 import requests
 import sys
 import time
 from tqdm import tqdm

-def check_url(url):
-    url_check_result = {}
-    r = requests.get(url, headers={'User-Agent': 'Sea'})
-    url_check_result['status_code'] = r.status_code
-    print('\n', url_check_result['status_code'], ' ', url)
-    return url_check_result

-def crawl_target(target_url):
-    crawl_result = {}
-    r = requests.get(target_url, headers={'User-Agent': 'Sea'})
-    crawl_result['status_code'] = r.status_code
-    crawl_result['text'] = r.text
-    return crawl_result

-def main():
-    crawl_queue = {}
+def crawl_recursively(url, depth=1):
+    url = url.split('#', 1)[0]
+    max_crawl_depth = get_config_value('max_crawl_depth')

+    if depth <= max_crawl_depth:
+        crawl_target(url)
+        url_id = get_url_id(url)

-    if len(sys.argv) < 2:
-        print('[ERROR] No target URL supplied. Please provide a URL for seaspider to crawl.')
-    else:
-        target_url = sys.argv[1]
-        crawl_result = crawl_target(target_url)
-        print(crawl_result['status_code'], ' ', target_url)
-        soup = bs4.BeautifulSoup(crawl_result['text'], features='html.parser')
-        links = soup.findAll('a', attrs={'href': re.compile('^https?://')})
-        print(len(links), ' links detected')
+        with open('data/' + str(url_id) + '.json') as crawl_file:
+            crawl_json = json.load(crawl_file)

+        crawl_html = crawl_json['text']
+        links = extract_links_from_html(crawl_html)

         for link in links:
             url = link.get('href')

             if not url in crawl_queue.keys():
                 crawl_queue[url] = {}
+            crawl_recursively(link, depth + 1)

+def crawl_target(url):
+    url_id = get_url_id(url)
+    crawl_file_name_pattern = 'data/' + str(url_id) + '.json'
+    crawl_file_exists = len(glob.glob(crawl_file_name_pattern)) > 0

+    if not crawl_file_exists:
+        print('Crawling: ', url)
+        r = requests.get(url, headers={'User-Agent': 'Sea'})
+        crawl_result = {
+            "id": url_id,
+            "url": url,
+            "response_code": r.status_code,
+            "timestamp_float": time.time(),
+            "text": r.text
+        }

+        with open(crawl_file_name_pattern, 'w') as outfile:
+            json.dump(crawl_result, outfile, indent=4)

+def extract_links_from_html(html):
+    allow_outside_starting_domain = get_config_value('allow_outside_starting_domain')
+    origin_domain = get_config_value('origin_domain')
+    soup = bs4.BeautifulSoup(html, features='html.parser')
+    pattern = '^https?://'

+    if not allow_outside_starting_domain:
+        pattern += origin_domain

+    links = soup.findAll('a', attrs={'href': re.compile(pattern)})
+    links_list = []

+    for link in links:
+        url = link.get('href')
+        links_list.append(url)

+    return links_list

+def get_max_url_id():
+    if len(glob.glob('data/url_id_map.json')) > 0:
+        with open('data/url_id_map.json') as url_id_map_file:
+            url_id_map = json.load(url_id_map_file)

-        for key in crawl_queue.keys():
-            print(key)
+        max_id = 0

-        progress_bar_label = 'Crawling ' + str(len(crawl_queue)) + ' URLs'
+        for url_id in url_id_map.keys():
+            if int(url_id) > max_id:
+                max_id = int(url_id)

+        return max_id
+    else:
+        return 0

+def get_url_id(url):
+    if len(glob.glob('data/url_id_map.json')) > 0:
+        with open('data/url_id_map.json', 'r') as url_id_map_file:
+            url_id_map = json.load(url_id_map_file)

+        for url_id in url_id_map.keys():
+            if url_id_map[url_id]['url'] == url:
+                return url_id

+    new_url_id = get_max_url_id() + 1
+    register_new_url_id(new_url_id, url)
+    return new_url_id

+def get_config_value(key):
+    with open('config.json', 'r') as config_file:
+        config_json = json.load(config_file)

+    return config_json[key]

+def register_new_url_id(id, url):
+    if len(glob.glob('data/url_id_map.json')) > 0:
+        with open('data/url_id_map.json', 'r') as url_id_map_file:
+            url_id_map = json.load(url_id_map_file)
+    else:
+        url_id_map = {}

+    url_id_map[id] = {'url': url}

+    with open('data/url_id_map.json', 'w') as url_id_map_file:
+        json.dump(url_id_map, url_id_map_file, indent=4)

+def main():
+    origin_url = 'https://' + get_config_value('origin_domain')
+    crawl_recursively(origin_url)
+    find_errors.find_errors()

-        for key in tqdm(crawl_queue.keys(), desc=progress_bar_label):
-            crawl_queue[key]['crawl_result'] = check_url(key)
-            time.sleep(0.1)

 main()
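The ID bookkeeping above stores each crawl as data/<id>.json and persists the URL-to-ID mapping in data/url_id_map.json via register_new_url_id(); note that integer IDs become string keys once round-tripped through JSON, which is why get_max_url_id() casts them back with int(). A minimal sketch of the map's shape and of the lookup that get_url_id() performs, with illustrative contents:

import json
import os

# Illustrative map in the shape register_new_url_id() writes.
os.makedirs('data', exist_ok=True)
url_id_map = {
    "1": {"url": "https://example.com"},
    "2": {"url": "https://example.com/about"}
}

with open('data/url_id_map.json', 'w') as url_id_map_file:
    json.dump(url_id_map, url_id_map_file, indent=4)

# Mirrors the lookup loop in get_url_id(): scan the map for a matching URL,
# falling back to None instead of registering a new ID.
def lookup_url_id(url):
    with open('data/url_id_map.json') as url_id_map_file:
        url_id_map = json.load(url_id_map_file)

    for url_id in url_id_map.keys():
        if url_id_map[url_id]['url'] == url:
            return url_id

    return None

print(lookup_url_id('https://example.com/about'))  # prints 2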