Merge branch 'feature/mvp' into develop
viperior committed Feb 21, 2021
2 parents 05b8aa4 + f74dad3 commit a87c884
Showing 4 changed files with 58 additions and 0 deletions.
8 changes: 8 additions & 0 deletions README.md
@@ -1,2 +1,10 @@
# sea-spider
A humble SEO spider and link checker

# Usage
```
pip install -r requirements.txt
python seaspider.py https://google.com
```

![Usage example: checking all links on a given web page](Usage-example-screen-recording.gif)
Binary file added Usage-example-screen-recording.gif
Binary file added requirements.txt
Binary file not shown.
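GitHub renders this requirements.txt as binary (most likely because it was saved with a non-UTF-8 encoding such as UTF-16), so its contents are not shown. Judging from the third-party imports in seaspider.py below, a plausible plain-text equivalent would be:

```
beautifulsoup4
requests
tqdm
```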
50 changes: 50 additions & 0 deletions seaspider.py
@@ -0,0 +1,50 @@
import re
import sys
import time

import bs4
import requests
from tqdm import tqdm


def check_url(url):
    """Request a URL and record its HTTP status code."""
    url_check_result = {}
    r = requests.get(url, headers={'User-Agent': 'Sea'})
    url_check_result['status_code'] = r.status_code
    print('\n', url_check_result['status_code'], ' ', url)
    return url_check_result


def crawl_target(target_url):
    """Fetch the target page and return its status code and HTML body."""
    crawl_result = {}
    r = requests.get(target_url, headers={'User-Agent': 'Sea'})
    crawl_result['status_code'] = r.status_code
    crawl_result['text'] = r.text
    return crawl_result


def main():
    crawl_queue = {}

    if len(sys.argv) < 2:
        print('[ERROR] No target URL supplied. Please provide a URL for seaspider to crawl.')
    else:
        target_url = sys.argv[1]
        crawl_result = crawl_target(target_url)
        print(crawl_result['status_code'], ' ', target_url)

        # Parse the page and collect all absolute http(s) links.
        soup = bs4.BeautifulSoup(crawl_result['text'], features='html.parser')
        links = soup.find_all('a', attrs={'href': re.compile('^https?://')})
        print(len(links), ' links detected')

        # Deduplicate URLs by using the queue dict's keys as a set.
        for link in links:
            url = link.get('href')

            if url not in crawl_queue:
                crawl_queue[url] = {}

        for key in crawl_queue:
            print(key)

        progress_bar_label = 'Crawling ' + str(len(crawl_queue)) + ' URLs'

        # Check each queued URL, pausing briefly between requests
        # to be polite to the target server.
        for key in tqdm(crawl_queue.keys(), desc=progress_bar_label):
            crawl_queue[key]['crawl_result'] = check_url(key)
            time.sleep(0.1)


main()
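Taken together, the script fetches the target page, lists every unique absolute http(s) link it finds, and then re-requests each one with a short delay. An illustrative session, based on the print statements above (hypothetical URLs and output, not an actual run):

```
$ python seaspider.py https://example.com
200   https://example.com
2   links detected
https://www.iana.org/domains/example
https://www.example.net/
Crawling 2 URLs:  50%|█████     | 1/2
 200   https://www.iana.org/domains/example
 200   https://www.example.net/
Crawling 2 URLs: 100%|██████████| 2/2
```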
