Skip to content

Commit

Permalink
Merge branch 'feature/domain-restriction' into develop
Browse files Browse the repository at this point in the history
  • Loading branch information
viperior committed Feb 22, 2021
2 parents 9ae19f5 + ccc7408 commit 2956800
Show file tree
Hide file tree
Showing 6 changed files with 152 additions and 37 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -127,3 +127,7 @@ dmypy.json

# Pyre type checker
.pyre/

# Sea Spider custom ignore rules
data/
config.json
12 changes: 12 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,21 @@
A humble SEO spider and link checker

# Usage
## Initial setup
```
pip install -r requirements.txt
python setup.py
```

## Basic example
```
python seaspider.py https://google.com
```

![Usage example: checking all links on a given web page](Usage-example-screen-recording.gif)

## Example with domain restriction
You can limit crawling to a specific domain by providing a second parameter, the domain name.
```
python seaspider.py https://google.com google.com
```
6 changes: 6 additions & 0 deletions config-sample.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"allow_outside_starting_domain": false,
"max_crawl_count": 2000,
"max_crawl_depth": 3,
"origin_domain": "example.com"
}
28 changes: 28 additions & 0 deletions find_errors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import glob
import json

def find_errors():
    """Summarise the HTTP status of the crawl records stored in ``data/``.

    Every ``data/*.json`` file that is not on the ignore list is loaded.
    A JSON object that carries an ``id`` key is treated as a crawl record:
    its response code and URL are printed, and it is counted as healthy
    (status 200) or as a problem (any other status). Totals are printed
    at the end. Nothing is returned.
    """
    import os  # local import: only needed to normalise paths for comparison

    # The URL-to-id map lives beside the crawl records but is not one of them.
    ignore_list = ['data/url_id_map.json']
    ignored_paths = {os.path.normpath(path) for path in ignore_list}
    glob_pattern = 'data/*.json'
    item_count = 0
    ok_count = 0
    problem_count = 0

    # Sorted so the report order does not depend on directory listing order.
    for item in sorted(glob.glob(glob_pattern)):
        if os.path.normpath(item) in ignored_paths:
            continue

        with open(item, 'r') as infile:
            json_data = json.load(infile)

        # Only crawl records carry an 'id'; skip any other JSON document.
        if not isinstance(json_data, dict) or 'id' not in json_data:
            continue

        item_count += 1
        response_code = int(json_data['response_code'])
        url = json_data['url']

        if response_code == 200:
            ok_count += 1
        else:
            problem_count += 1

        # NOTE(review): this reports every record, healthy or not - confirm
        # that is intended rather than listing problems only.
        print(response_code, ' ', url)

    print('Statistics:\nTotal items: ', item_count, '\nHealthy signals: ',
          ok_count, '\nProblems: ', problem_count)
Binary file modified requirements.txt
Binary file not shown.
139 changes: 102 additions & 37 deletions seaspider.py
Original file line number Diff line number Diff line change
@@ -1,50 +1,115 @@
import bs4
import find_errors
import glob
import json
import re
import requests
import sys
import time
from tqdm import tqdm

def check_url(url):
    """Request *url* and report the HTTP status code it answers with.

    A GET request is sent with the ``Sea`` user agent. The status code is
    printed next to the URL and returned in a dict under ``status_code``.
    """
    response = requests.get(url, headers={'User-Agent': 'Sea'})
    status_code = response.status_code
    print('\n', status_code, ' ', url)
    return {'status_code': status_code}

def crawl_target(target_url):
    """Download *target_url* and return its status code and body text.

    A GET request is sent with the ``Sea`` user agent. The returned dict
    has the keys ``status_code`` and ``text``.
    """
    response = requests.get(target_url, headers={'User-Agent': 'Sea'})
    return {
        'status_code': response.status_code,
        'text': response.text,
    }

def main():
    """Create an empty, function-local crawl queue.

    The queue is neither returned nor stored anywhere.
    """
    pending = dict()
def crawl_recursively(url, depth=1):
    """Crawl *url* and follow the links found on it, depth first.

    The fragment part of the URL (everything from the first ``#``) is
    dropped before crawling. The page is fetched through ``crawl_target``,
    its stored result is read back from ``data/<url id>.json``, and every
    link extracted from the stored HTML is crawled at ``depth + 1``.
    Recursion stops once ``depth`` exceeds the ``max_crawl_depth`` config
    value.

    Args:
        url: URL of the page to crawl.
        depth: Depth of this page in the crawl; the default is 1.
    """
    url = url.split('#', 1)[0]
    max_crawl_depth = get_config_value('max_crawl_depth')

    if depth > max_crawl_depth:
        return

    crawl_target(url)
    url_id = get_url_id(url)

    with open('data/' + str(url_id) + '.json') as crawl_file:
        crawl_json = json.load(crawl_file)

    # extract_links_from_html returns plain URL strings, so each one can be
    # passed straight back into the crawler.
    for link in extract_links_from_html(crawl_json['text']):
        crawl_recursively(link, depth + 1)

def crawl_target(url, timeout=10):
    """Fetch *url* once and store the result as ``data/<url id>.json``.

    If a result file for the URL's id already exists, nothing is done.
    Otherwise the page is requested with the ``Sea`` user agent and a JSON
    file holding the id, URL, response code, fetch timestamp and body text
    is written.

    Args:
        url: URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout a single unresponsive host would stall the crawl
            indefinitely.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    url_id = get_url_id(url)
    crawl_file_name = 'data/' + str(url_id) + '.json'

    # A stored result means this URL was crawled before; do not refetch.
    if glob.glob(crawl_file_name):
        return

    print('Crawling: ', url)
    r = requests.get(url, headers={'User-Agent': 'Sea'}, timeout=timeout)
    crawl_result = {
        "id": url_id,
        "url": url,
        "response_code": r.status_code,
        "timestamp_float": time.time(),
        "text": r.text,
    }

    with open(crawl_file_name, 'w') as outfile:
        json.dump(crawl_result, outfile, indent=4)

def extract_links_from_html(html):
    """Return the absolute http(s) link targets found in *html*.

    Only ``<a>`` elements whose ``href`` starts with ``http://`` or
    ``https://`` are considered. Unless the ``allow_outside_starting_domain``
    config value is truthy, the host must additionally be exactly the
    configured ``origin_domain``.

    Args:
        html: HTML source of a page.

    Returns:
        A list of ``href`` strings in document order.
    """
    allow_outside_starting_domain = get_config_value('allow_outside_starting_domain')
    soup = bs4.BeautifulSoup(html, features='html.parser')
    pattern = '^https?://'

    if not allow_outside_starting_domain:
        origin_domain = get_config_value('origin_domain')
        # Escape the domain so '.' is literal, and require the host to end
        # right after it; otherwise 'example.com' would also accept
        # 'exampleXcom' or 'example.com.evil.org'.
        pattern += re.escape(origin_domain) + r'(?=[/:?#]|$)'

    links = soup.findAll('a', attrs={'href': re.compile(pattern)})
    return [link.get('href') for link in links]

def get_max_url_id():
    """Return the highest URL id recorded in ``data/url_id_map.json``.

    Returns:
        The largest id as an int, or 0 when the map file does not exist or
        holds no positive ids.
    """
    if not glob.glob('data/url_id_map.json'):
        return 0

    with open('data/url_id_map.json') as url_id_map_file:
        url_id_map = json.load(url_id_map_file)

    # JSON object keys are strings; convert before comparing. The leading 0
    # keeps the result at 0 for an empty map.
    return max([0] + [int(url_id) for url_id in url_id_map])

def get_url_id(url):
    """Return the numeric id for *url*, registering a new one if needed.

    The id is looked up in ``data/url_id_map.json``. When the URL is not
    listed (or the map file does not exist yet) the next free id is
    registered for it.

    Args:
        url: URL to look up.

    Returns:
        The id as an int, for known and newly registered URLs alike.
    """
    if glob.glob('data/url_id_map.json'):
        with open('data/url_id_map.json', 'r') as url_id_map_file:
            url_id_map = json.load(url_id_map_file)

        for url_id, entry in url_id_map.items():
            if entry['url'] == url:
                # Keys come back from JSON as strings; convert so callers
                # always receive the same type as for a fresh id.
                return int(url_id)

    new_url_id = get_max_url_id() + 1
    register_new_url_id(new_url_id, url)
    return new_url_id

def get_config_value(key):
    """Look up *key* in ``config.json`` and return its value.

    The file is read from the current working directory on every call.
    A missing key raises ``KeyError``.
    """
    with open('config.json', 'r') as handle:
        settings = json.loads(handle.read())

    return settings[key]

def register_new_url_id(id, url):
    """Record that *id* belongs to *url* in ``data/url_id_map.json``.

    The existing map is loaded if the file is present, the entry is added
    or replaced, and the whole map is written back.

    Args:
        id: Id to register (the name shadows the builtin, but is kept
            because it is part of the signature).
        url: URL the id stands for.
    """
    map_path = 'data/url_id_map.json'

    if glob.glob(map_path):
        with open(map_path, 'r') as url_id_map_file:
            url_id_map = json.load(url_id_map_file)
    else:
        url_id_map = {}

    # Keys loaded from JSON are strings. Storing an int key next to them
    # would let the same id be written twice, so normalise to str.
    url_id_map[str(id)] = {'url': url}

    with open(map_path, 'w') as url_id_map_file:
        json.dump(url_id_map, url_id_map_file, indent=4)

def main():
    """Crawl the configured origin domain, then report the findings.

    The start URL is ``https://`` followed by the ``origin_domain`` config
    value. After the recursive crawl finishes, ``find_errors.find_errors``
    prints the status summary of the stored results.
    """
    origin_url = 'https://' + get_config_value('origin_domain')
    crawl_recursively(origin_url)
    find_errors.find_errors()


main()

0 comments on commit 2956800

Please sign in to comment.