gems_crawler.py
__author__ = 'vidit'

import math
import urllib.request

from bs4 import BeautifulSoup

# rubygems.org arranges all gems alphabetically; each letter has multiple
# pages with 30 gems listed per page, and the header of every page shows the
# total number of gems for that letter.
list_of_gems = {}

for alphabet in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
    # Read the total gem count for this letter from the first page's header.
    page = urllib.request.urlopen("https://rubygems.org/gems?letter=" + alphabet).read()
    soup1 = BeautifulSoup(page, 'html.parser')
    total = int(soup1.find("p").contents[3].get_text())
    pages = math.ceil(total / 30)

    for i in range(1, pages + 1):
        page_let = urllib.request.urlopen(
            "https://rubygems.org/gems?letter=" + alphabet + "&page=" + str(i)).read()
        soup2 = BeautifulSoup(page_let, 'html.parser')
        for a in soup2.find_all('a', {"class": "gems__gem"}):
            # Download count, e.g. "1,234,567" -> 1234567.
            key = a.find('p', {"class": "gems__gem__downloads__count"}).contents[0]
            key = int(str(key).strip().replace(",", ""))
            # Gem name taken from the entry's heading.
            value = str(a.div.h2.contents[0]).strip()
            # Keyed by download count, so gems that share a count overwrite each other.
            list_of_gems.update({key: value})

for j in sorted(list_of_gems):
    print(list_of_gems[j] + " was downloaded " + str(j))
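The step most likely to break is reading the per-letter total from the header paragraph, since it relies on `contents[3]` being the node that holds the count. The snippet below is a minimal, offline sketch of that parsing step; the sample HTML is a made-up stand-in, not the real rubygems.org markup, which may differ.

from bs4 import BeautifulSoup

# Hypothetical header paragraph standing in for a rubygems.org letter page.
sample_header = """
<p>Displaying
  <b>gems 1 - 30</b>
  of
  <b>1,234</b>
  in total
</p>
"""

soup = BeautifulSoup(sample_header, "html.parser")
p = soup.find("p")
# contents[3] is the fourth child of <p>: text, <b>, text, <b>1,234</b>, text.
# The crawler assumes the real page has the count at this same position.
total_tag = p.contents[3]
total = int(total_tag.get_text().replace(",", ""))  # strip thousands separators defensively
print(total)  # 1234

If the site's markup changes, `contents[3]` will point at a different node and the `int()` conversion will fail, so this is the first place to check when the crawler stops working.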