-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwikipediaPage.py
128 lines (116 loc) · 3.76 KB
/
wikipediaPage.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
import sys
class wikipediaPage():
def __init__(self, url):
"""
init methode
"""
self.__url = url
self.__title = ""
self.__rawPageContent = ""
self.__allLinksList = []
# calls the needed methodes
self.getRawPageContent()
self.getAllLinksInPage()
self.getPageTitle()
def getRawPageContent(self):
"""
Gets the raw content of the page
"""
req = Request(
url=self.__url,
headers={'User-Agent': ' Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20100101 Firefox/12.0'})
handler = urlopen(req)
with handler as response:
soup = BeautifulSoup(response, 'html.parser')
self.__rawPageContent = soup
self.__url = handler.geturl() # sets the correct adress
def getAllLinksInPage(self):
"""
uses beautifusoup to retreive all the links in a page
"""
soup = self.__rawPageContent
# selects all the pragraphs
for x in soup.find_all(True, {'class': [
'IPA', 'external text',
'external', 'internal',
'citation book', 'plainlinks',
'nowrap', 'portal', 'image',
'navbox', 'infobox_v3',
'infobox_v2', 'infobox'
]}):
x.decompose()
for x in soup.find_all(True, {'id': [
'footer', 'toc',
'mw-panel', 'mw-head',
'catlinks'
]}):
x.decompose()
# all paragraphs
all_links = soup.select('p a[href^="/wiki"]')
# seests all the table
all_links.extend(soup.select('table[class^="wikitable"] a[href^="/wiki"]'))
# ul lists
all_links.extend(soup.select('ul a[href^="/wiki"]'))
# ol lists
all_links.extend(soup.select('ol a[href^="/wiki"]'))
for idx, val in enumerate(all_links):#remouves all the "null" titles. TOOD : maney don't leave this in
if(val.contents[0] == ""):
del all_links[-1]
self.__allLinksList = all_links
def getPageTitle(self):
"""
Gets the page title of the page as a string
"""
temp = str(self.__rawPageContent.find('h1').contents[0])
temp = temp.replace('<i>', '')
temp = temp.replace('</i>', '')
self.__title = temp
def getFirstSentence(self):
"""
Gets the firsqt sentence of the page
"""
text = self.__rawPageContent.find(
'p', attrs={'class': None}).get_text()
text = text.partition('.')[0] + '.'
return text
def printLinkNamesConsole(self):
"""
Print in the console all the link options
"""
for idx, val in enumerate(self.__allLinksList):
print(idx, ":", val.contents[0])
def printFirstTenLinkNamesConsole(self):
"""
Print in the console the 10 first link names
"""
if len(self.__allLinksList) > 10:
for i in range(10):
print(i, ":", self.__allLinksList[i].contents[0])
else:
self.printLinkNamesConsole()
def getTitle(self):
"""
use getPageTitle() insteed
"""
return self.__title
def getOnlyLinksListJS(self):
"""
returns the links as a list
used in JS side
"""
temp = []
for idx, val in enumerate(self.__allLinksList):
temp.append(val.contents[0])
return temp
def getUrl(self):
"""
gets the page url
"""
return self.__url
def getRawList(self):
"""
gets the page url
"""
return self.__allLinksList