Merge branch 'feature/mvp' into develop

viperior · Feb 21, 2021 · a87c884 · a87c884
2 parents 05b8aa4 + f74dad3
commit a87c884
Show file tree

Hide file tree

Showing 4 changed files with 58 additions and 0 deletions.
diff --git a/README.md b/README.md
@@ -1,2 +1,10 @@
 # sea-spider
 A humble SEO spider and link checker
+
+# Usage
+```
+pip install -r requirements.txt
+python seaspider.py https://google.com
+```
+
+![Usage example: checking all links on a given web page](Usage-example-screen-recording.gif)
diff --git a/Usage-example-screen-recording.gif b/Usage-example-screen-recording.gif
diff --git a/requirements.txt b/requirements.txt
diff --git a/seaspider.py b/seaspider.py
@@ -0,0 +1,50 @@
+import bs4
+import re
+import requests
+import sys
+import time
+from tqdm import tqdm
+
+def check_url(url):
+    """GET `url`, print its HTTP status code, and return it in a dict."""
+    response = requests.get(url, headers={'User-Agent': 'Sea'})
+    outcome = {'status_code': response.status_code}
+    print('\n', outcome['status_code'], ' ', url)
+    return outcome
+
+def crawl_target(target_url):
+    """GET `target_url`; return a dict with its 'status_code' and 'text'."""
+    response = requests.get(target_url, headers={'User-Agent': 'Sea'})
+    status, body = response.status_code, response.text
+    page = {'status_code': status, 'text': body}
+    return page
+
+def main():
+    """Check every absolute link found on the page given as argv[1].
+
+    Fetches the target page, collects the unique http(s) hrefs of its
+    <a> tags, prints them, then requests each one behind a progress bar.
+    """
+    if len(sys.argv) < 2:
+        print('[ERROR] No target URL supplied. Please provide a URL for seaspider to crawl.')
+        return
+
+    target_url = sys.argv[1]
+    crawl_result = crawl_target(target_url)
+    print(crawl_result['status_code'], ' ', target_url)
+    soup = bs4.BeautifulSoup(crawl_result['text'], features='html.parser')
+    links = soup.find_all('a', attrs={'href': re.compile('^https?://')})
+    print(len(links), ' links detected')
+
+    # One entry per distinct URL, kept in first-seen order.
+    crawl_queue = {link.get('href'): {} for link in links}
+    for url in crawl_queue:
+        print(url)
+
+    progress_bar_label = 'Crawling ' + str(len(crawl_queue)) + ' URLs'
+    for url in tqdm(crawl_queue, desc=progress_bar_label):
+        crawl_queue[url]['crawl_result'] = check_url(url)
+        time.sleep(0.1)  # brief pause between requests
+
+if __name__ == '__main__':
+    main()