-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvenueyacc.py
137 lines (106 loc) · 2.67 KB
/
venueyacc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# Module: venueyacc.py
# Builds the PLY lexer and parser used to extract venue (stadium) details.
import ply.yacc as yacc
import ply.lex as lex
# Token names declared to PLY. Several of them (WHITESPACE, GARBAGE and the
# DIV/STYLE tags) have lexer rules in this file that return nothing.
tokens = [
    # Heading that opens the stadium section.
    'BEGINVENUE',
    # Table structure and anchor tags.
    'OPENBODY',
    'CLOSEBODY',
    'OPENROW',
    'CLOSEROW',
    'OPENDATA',
    'CLOSEDATA',
    'OPENHEAD',
    'CLOSEHEAD',
    'OPENHREF',
    'CLOSEHREF',
    # Text content and the remaining markup.
    'CONTENT',
    'GARBAGE',
    'WHITESPACE',
    'OPENDIV',
    'CLOSEDIV',
    'OPENSTYLE',
    'CLOSESTYLE',
]
# Lexer rule: emit BEGINVENUE for the "Stadiums" section heading.
# The string below is the token's regex (PLY reads it from the docstring).
# It is a raw string so that `\s` is a regex escape rather than an invalid
# Python string escape; the pattern text itself is unchanged.
def t_BEGINVENUE(t):
    r'<h3><span\sclass="mw-headline"\sid="Stadiums">Stadiums</span></h3>'
    return t
# Lexer rule: emit OPENBODY for an opening <tbody ...> tag.
# The raw string is the token's regex (read by PLY), not documentation.
def t_OPENBODY(t):
    r'<tbody.*?>'
    return t
# Lexer rule: emit CLOSEBODY for a literal </tbody> tag.
# The raw string is the token's regex (read by PLY), not documentation.
def t_CLOSEBODY(t):
    r'</tbody>'
    return t
# Lexer rule: emit OPENROW for an opening <tr ...> tag.
# The raw string is the token's regex (read by PLY), not documentation.
def t_OPENROW(t):
    r'<tr.*?>'
    return t
# Lexer rule: emit CLOSEROW for a literal </tr> tag.
# The raw string is the token's regex (read by PLY), not documentation.
def t_CLOSEROW(t):
    r'</tr>'
    return t
# Lexer rule: emit OPENDATA for an opening <td ...> tag.
# The raw string is the token's regex (read by PLY), not documentation.
def t_OPENDATA(t):
    r'<td.*?>'
    return t
# Lexer rule: emit CLOSEDATA for a closing </td> tag.
# Unlike the other closing-tag rules in this file, the pattern tolerates
# extra characters between '</td' and '>'.
# The raw string is the token's regex (read by PLY), not documentation.
def t_CLOSEDATA(t):
    r'</td.*?>'
    return t
# Lexer rule: emit OPENHEAD for an opening <th ...> tag.
# NOTE(review): the pattern also matches '<thead ...>' - confirm that is
# acceptable for the pages being parsed.
# The raw string is the token's regex (read by PLY), not documentation.
def t_OPENHEAD(t):
    r'<th.*?>'
    return t
# Lexer rule: emit CLOSEHEAD for a literal </th> tag.
# The raw string is the token's regex (read by PLY), not documentation.
def t_CLOSEHEAD(t):
    r'</th>'
    return t
# Lexer rule: emit OPENHREF for an opening anchor tag.
# The pattern requires a space after '<a', so a bare '<a>' is not matched.
# The raw string is the token's regex (read by PLY), not documentation.
def t_OPENHREF(t):
    r'<a .*?>'
    return t
# Lexer rule: emit CLOSEHREF for a literal </a> tag.
# The raw string is the token's regex (read by PLY), not documentation.
def t_CLOSEHREF(t):
    r'</a>'
    return t
# Lexer rule: match a run of spaces and discard it.
# Nothing is returned, so PLY emits no token for the match. Being defined
# before t_CONTENT, this rule claims spaces found at the start of a token.
# The raw string is the token's regex (read by PLY), written as a raw
# literal like the other rules; the pattern text is unchanged.
def t_WHITESPACE(t):
    r'[ ]+'
# Lexer rule: emit CONTENT for a run of ASCII letters, digits, commas and
# spaces. Any other character ends the run.
# The raw string is the token's regex (read by PLY), written as a raw
# literal like the other rules; the pattern text is unchanged.
def t_CONTENT(t):
    r'[A-Za-z0-9, ]+'
    return t
# Lexer rule: match an opening <div ...> tag and discard it (nothing is
# returned, so PLY emits no token).
# The raw string is the token's regex (read by PLY), written as a raw
# literal like the other rules; the pattern text is unchanged.
def t_OPENDIV(t):
    r'<div[^>]*>'
# Lexer rule: match a closing </div> tag and discard it (nothing is
# returned, so PLY emits no token).
# The raw string is the token's regex (read by PLY), written as a raw
# literal like the other rules; the pattern text is unchanged.
def t_CLOSEDIV(t):
    r'</div[^>]*>'
# Lexer rule: match an opening <style ...> tag and discard it (nothing is
# returned, so PLY emits no token).
# The raw string is the token's regex (read by PLY), written as a raw
# literal like the other rules; the pattern text is unchanged.
def t_OPENSTYLE(t):
    r'<style[^>]*>'
# Lexer rule: match a closing </style> tag and discard it (nothing is
# returned, so PLY emits no token).
# The raw string is the token's regex (read by PLY), written as a raw
# literal like the other rules; the pattern text is unchanged.
def t_CLOSESTYLE(t):
    r'</style[^>]*>'
# Lexer rule: catch-all for any other '<...>' tag. Nothing is returned, so
# the match is discarded. It is defined after the specific tag rules, which
# therefore get the first chance to match.
# The raw string is the token's regex (read by PLY), not documentation.
def t_GARBAGE(t):
    r'<.*?>'
# Lexer rule: consume a run of newlines and advance the lexer's line counter
# by the number of newline characters matched. No token is emitted.
# The raw string is the rule's regex (read by PLY), not documentation.
def t_newline(t):
    r'\n+'
    newline_run = t.value
    t.lexer.lineno += len(newline_run)
# Lexer error handler: skip exactly one character of input that no rule
# matched, without reporting it, so that scanning continues.
def t_error(t):
    active_lexer = t.lexer
    active_lexer.skip(1)
# Module-level result store: filled by p_columns while parsing and rebound to
# a fresh dict at the start of every getVenueList() call.
venue = dict()
# grammar to find Venue table
def p_init(p):
    # Top-level rule: arbitrary leading tokens (`before`), the Stadiums heading,
    # skippable text/anchors, then one <tbody> whose rows are parsed by `rows`.
    # No `start` symbol is declared in the visible code, so this - the first
    # grammar rule defined - is presumably PLY's default start symbol.
    # The string below is the grammar production read by PLY, not documentation.
    '''init : before BEGINVENUE skip OPENBODY rows CLOSEBODY'''
# skipping unwanted content
def p_before(p):
    # Right-recursive "zero or more" rule that consumes text, anchor, header,
    # row and data-cell tokens that precede the Stadiums heading. The last
    # alternative is empty and ends the recursion.
    # NOTE(review): OPENBODY/CLOSEBODY are not listed here, so a <tbody> tag
    # ahead of the heading would be a syntax error - confirm this is intended.
    # The string below is the grammar read by PLY (kept byte-identical).
    '''before : CONTENT before
| OPENHREF before
| CLOSEHREF before
| OPENHEAD before
| CLOSEHEAD before
| OPENROW before
| CLOSEROW before
| OPENDATA before
| CLOSEDATA before
| '''
# grammar to handle table rows of stadium table
def p_rows(p):
    # Zero or more table rows, in one of three accepted shapes:
    #   1. a row of exactly three <th> cells, each holding one CONTENT token;
    #   2. a row that starts with a <th> cell (skippable contents) followed by
    #      the data cells described by `columns`;
    #   3. a row consisting only of the `columns` data cells.
    # The empty alternative ends the recursion. No action code runs here; the
    # data is stored by p_columns.
    # The string below is the grammar read by PLY (kept byte-identical).
    '''rows : OPENROW OPENHEAD CONTENT CLOSEHEAD OPENHEAD CONTENT CLOSEHEAD OPENHEAD CONTENT CLOSEHEAD CLOSEROW rows
| OPENROW OPENHEAD skip CLOSEHEAD columns CLOSEROW rows
| OPENROW columns CLOSEROW rows
| '''
# grammar to extract stadium details and store it
def p_columns(p):
    # Two data cells: the first holds linked text, the second starts with a
    # CONTENT token followed by skippable tokens. The linked text becomes the
    # key and the second cell's CONTENT the value in the module-level `venue`.
    # The string below is the grammar production read by PLY, not documentation.
    '''columns : OPENDATA OPENHREF CONTENT CLOSEHREF CLOSEDATA OPENDATA CONTENT skip CLOSEDATA'''
    if len(p) != 10:
        return
    linked_text, cell_text = p[3], p[7]
    venue[linked_text] = cell_text
def p_skip(p):
    # Right-recursive "zero or more" rule that consumes text and anchor tokens
    # whose values are not needed. The empty alternative ends the recursion.
    # The string below is the grammar read by PLY (kept byte-identical).
    '''skip : CONTENT skip
| OPENHREF skip
| CLOSEHREF skip
| '''
# Parser error handler: syntax errors are deliberately ignored (nothing is
# reported or raised), so parsing is best-effort.
def p_error(p):
    return None
def getVenueList(path='fifa.html'):
    """Parse an HTML page and return the venue table it contains.

    Builds the lexer and parser from the rules in this module, reads the
    file as UTF-8 and parses its text. Each call starts from an empty
    module-level ``venue`` dict, and the returned object is that same dict.

    Args:
        path: HTML file to parse. Defaults to ``'fifa.html'``, the file the
            function originally always read.

    Returns:
        dict mapping the linked text of each parsed row's first data cell to
        the first CONTENT token of its second data cell, as stored by
        ``p_columns``. Syntax errors are ignored by ``p_error``, so the
        result may be empty or partial.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    global venue
    venue = {}
    # Build the lexer and parser before creating any other locals: PLY
    # collects its rules from the calling namespace.
    lexer = lex.lex()
    parser = yacc.yacc()
    # `with` closes the file even if reading fails.
    with open(path, 'r', encoding='utf-8') as f:
        data = f.read()
    # Pass the lexer explicitly rather than relying on PLY's "most recently
    # created lexer" default.
    parser.parse(data, lexer=lexer)
    return venue