-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathscraper.py
65 lines (53 loc) · 2.05 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import scraperwiki
import requests
from bs4 import BeautifulSoup
import re
def add_operator(mcc, mnc, brand, operator, status, country, country_code, db):
    """Validate one operator record and append it to ``db``.

    Args:
        mcc: Mobile country code; exactly three digits once surrounding
            whitespace is removed.
        mnc: Mobile network code; two or three digits once surrounding
            whitespace is removed.
        brand: Brand name, stored unchanged.
        operator: Operator name, stored unchanged.
        status: Status text, stored unchanged.
        country: Country name, stored unchanged.
        country_code: Country code; may only consist of ``A``-``Z``, ``/``
            and ``-`` (an empty string is accepted).
        db: List that receives the new record as a dict with the keys
            ``mccmnc``, ``brand``, ``operator``, ``country``, ``status``
            and ``countryCode``.

    Raises:
        AssertionError: If ``mcc``, ``mnc`` or ``country_code`` does not
            match the expected format.
    """
    # Surrounding whitespace (for instance a newline at the end of a table
    # cell's text) is not part of the code.  `$` also matches just before a
    # trailing newline, so without stripping such a value would pass the
    # check and the newline would end up inside the 'mccmnc' key.
    mcc = mcc.strip()
    mnc = mnc.strip()

    # Raw strings: '\d' in a plain string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    assert re.match(r'^\d{3}$', mcc), 'invalid MCC: %r' % (mcc,)
    assert re.match(r'^\d{2,3}$', mnc), 'invalid MNC: %r' % (mnc,)
    assert re.match(r'^[A-Z/-]*$', country_code), \
        'invalid country code: %r' % (country_code,)

    db.append({
        'mccmnc': mcc + mnc,
        'brand': brand,
        'operator': operator,
        'country': country,
        'status': status,
        'countryCode': country_code,
    })
def _strip_citation(text):
    """Return ``text`` without '[citation needed]' markers and outer whitespace."""
    return text.replace('[citation needed]', '').strip()


def scan_table(table, country, country_code, db):
    """Read the operator rows of one MCC/MNC table and add them to ``db``.

    The first ``tr`` of the table is treated as the header; its first five
    ``th`` cells must read MCC, MNC, Brand, Operator and Status.  Every
    following row with an MCC and a single, known MNC is handed to
    ``add_operator`` with its status lower-cased.

    Args:
        table: Table element exposing ``find_all`` (rows and cells are looked
            up with ``find_all`` and read through their ``text`` attribute).
        country: Country name recorded with every operator of this table.
        country_code: Country code recorded with every operator of this table.
        db: List that collects the operator records.

    Raises:
        IndexError: If the table has no rows, or the header row has fewer
            than five ``th`` cells.
        AssertionError: If a header cell does not carry the expected title
            (or if ``add_operator`` rejects a value).
    """
    rows = table.find_all('tr')
    header_cells = rows.pop(0).find_all('th')
    expected_titles = (u'MCC', u'MNC', u'Brand', u'Operator', u'Status')
    for position, title in enumerate(expected_titles):
        # Compare stripped text: cell text may carry surrounding whitespace
        # such as a trailing newline.
        found = header_cells[position].text.strip()
        assert found == title, \
            'unexpected header %r at column %d' % (found, position)

    for row in rows:
        cells = row.find_all('td')
        if len(cells) < 5:
            # Rows without all five data cells (e.g. a repeated header row)
            # cannot describe an operator; skip them instead of failing
            # with IndexError.
            continue

        mcc = cells[0].text.strip()
        mnc = cells[1].text.strip()
        brand = _strip_citation(cells[2].text)
        operator = _strip_citation(cells[3].text)
        # Parenthesised remarks are dropped from the status as well.
        status = re.sub(r'\([^)]*\)', '', _strip_citation(cells[4].text)).strip()

        if not mcc or not mnc or '?' in mnc:
            # Missing or unknown codes cannot form a key.
            continue
        if '-' in mnc:
            # TODO: mnc range
            continue

        add_operator(mcc, mnc, brand, operator, status.lower(), country, country_code, db)
def contains_headline(tag):
    """Tell whether ``tag`` has a descendant carrying the ``mw-headline`` class.

    Used as a filter when searching for the heading that precedes a table.
    """
    headline = tag.find(class_='mw-headline')
    if headline is None:
        return False
    return True
def main():
    """Scrape the Wikipedia "Mobile country code" page into the scraperwiki store.

    Every table that has a ``th`` cell reading ``MCC`` is paired with the
    closest preceding sibling that contains an ``mw-headline`` element.  The
    headline text is split on ' - ' into the country name and an optional
    country code, and the table is passed to ``scan_table``.  All collected
    records are then saved with ``mccmnc`` as the unique key.

    Raises:
        requests.RequestException: If the page cannot be fetched in time or
            the server answers with an HTTP error status.
        AssertionError: If a headline splits into more than two parts, or a
            table fails the checks in ``scan_table``.
    """
    db = []

    # Without a timeout a stalled connection would block the run forever;
    # raise_for_status() keeps an HTTP error page from being parsed as data.
    response = requests.get(
        'https://en.wikipedia.org/wiki/Mobile_country_code', timeout=60)
    response.raise_for_status()

    # NOTE(review): the page is HTML but is parsed with the 'xml' parser;
    # kept as-is because switching parsers may change how tags are matched.
    soup = BeautifulSoup(response.text, 'xml')

    for th in soup.find_all('th', text='MCC'):
        table = th.find_parent('table')
        heading = table.find_previous_sibling(contains_headline)
        headline = heading.find(class_='mw-headline')
        title_parts = ''.join(headline.find_all(text=True)).split(' - ')
        assert (len(title_parts) == 1) or (len(title_parts) == 2)
        country = title_parts.pop(0)
        # Whatever remains after the country name (nothing, or one part).
        country_code = ''.join(title_parts)
        scan_table(table, country, country_code, db)

    scraperwiki.sqlite.save(unique_keys=['mccmnc'], data=db)
# Run the scraper only when this file is executed as a script, not on import.
if '__main__' == __name__:
    main()