-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnationyacc.py
142 lines (113 loc) · 3.3 KB
/
nationyacc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import ply.yacc as yacc
from urllib.request import Request, urlopen
import ply.lex as lex
# Token names handed to PLY (both the lexer and the parser read this list).
# GARBAGE, ABBR and CLOSEABBR have lexer rules in this file that never return
# a token; BRACKETS, CLOSEBRACKETS, VC and C have no lexer rule in this file.
tokens = [
    # Headline that marks the start of the squad section.
    'BEGINSQUAD',
    # Table structure tags.
    'OPENBODY', 'CLOSEBODY',
    'OPENROW', 'CLOSEROW',
    'OPENDATA', 'CLOSEDATA',
    'OPENHEAD', 'CLOSEHEAD',
    'OPENHREF', 'CLOSEHREF',
    # Text and everything else.
    'CONTENT', 'GARBAGE',
    'BRACKETS', 'CLOSEBRACKETS',
    'ABBR', 'CLOSEABBR',
    'VC', 'C',
]
def t_BEGINSQUAD(t):
    r'''<span\sclass="mw-headline"\sid="Current_squad">Current\ssquad</span>'''
    # The string above is this token's regular expression (PLY reads it from
    # the function's docstring), not documentation.  It is a raw string so
    # that ``\s`` reaches the regex engine untouched instead of being an
    # invalid Python string escape; the pattern text itself is unchanged.
    #
    # Matches the "Current squad" headline span.  The grammar's ``init`` rule
    # uses this token as the boundary between the ignored page prefix and the
    # squad table.
    return t
def t_OPENBODY(t):
    r'<tbody.*?>'
    # Regex above: an opening tag that starts with ``<tbody`` and runs,
    # non-greedily, to the first ``>``.  The token is passed to the parser.
    return t
def t_CLOSEBODY(t):
    r'</tbody>'
    # Regex above: the literal closing ``</tbody>`` tag.  The token is passed
    # to the parser.
    return t
def t_OPENROW(t):
    r'<tr.*?>'
    # Regex above: an opening tag that starts with ``<tr`` (attributes
    # allowed), up to the first ``>``.  The token is passed to the parser.
    return t
def t_CLOSEROW(t):
    r'</tr>'
    # Regex above: the literal closing ``</tr>`` tag.  The token is passed to
    # the parser.
    return t
def t_OPENDATA(t):
    r'<td.*?>'
    # Regex above: an opening tag that starts with ``<td`` (attributes
    # allowed), up to the first ``>``.  The token is passed to the parser.
    return t
def t_CLOSEDATA(t):
    r'</td.*?>'
    # Regex above: a closing tag that starts with ``</td``, up to the first
    # ``>``.  The token is passed to the parser.
    return t
def t_OPENHEAD(t):
    r'<th.*?>'
    # Regex above: an opening tag that starts with ``<th`` (attributes
    # allowed), up to the first ``>``.  Note that the prefix match also
    # accepts other tag names beginning with "th", e.g. ``<thead>``.
    return t
def t_CLOSEHEAD(t):
    r'</th>'
    # Regex above: the literal closing ``</th>`` tag.  The token is passed to
    # the parser.
    return t
def t_ABBR(t):
    r'<abbr.*?>'
    # Regex above: an opening ``<abbr ...>`` tag.  Returning nothing makes the
    # lexer drop the match, so the parser never sees it.  This rule is defined
    # before t_OPENHREF, whose ``<a.*?>`` pattern would otherwise also match
    # ``<abbr ...>``.
    return None
def t_CLOSEABBR(t):
    r'</abbr>'
    # Regex above: the literal closing ``</abbr>`` tag.  Returning nothing
    # makes the lexer drop the match, so the parser never sees it.
    return None
def t_OPENHREF(t):
    r'<a.*?>'
    # Regex above: an opening tag that starts with ``<a``, up to the first
    # ``>``.  This is a prefix match, so it accepts any tag name beginning
    # with "a" that an earlier rule has not already claimed.
    return t
def t_CLOSEHREF(t):
    r'</a>'
    # Regex above: the literal closing ``</a>`` tag.  The token is passed to
    # the parser.
    return t
def t_CONTENT(t):
    r'''[A-Za-z0-9ñáćéíøæóúÁÉÜÚÑÓÍïüšëčýÿûã,#&;\(\) \-]+'''
    # The string above is this token's regular expression (PLY reads it from
    # the function's docstring), not documentation.  It is a raw string so
    # that ``\(``, ``\)`` and ``\-`` are not invalid Python string escapes;
    # the pattern text itself is unchanged.
    #
    # Matches a run of text characters: ASCII letters and digits, the listed
    # accented letters, the punctuation ``, # & ; ( ) -`` and the space
    # character.  Because space is in the class, a run of spaces on its own
    # also becomes a CONTENT token.
    return t
def t_WHITESPACE(t):
    '''[ ]+'''
    # Regex above: one or more space characters.  Returning nothing makes the
    # lexer drop the match.
    # NOTE(review): t_CONTENT is defined earlier and its character class also
    # contains a space, so this rule looks unreachable -- confirm.
    return None
def t_GARBAGE(t):
    r'<.*?>'
    # Regex above: any ``<...>`` tag, up to the first ``>``.  It is defined
    # after every specific tag rule, so it only catches tags none of them
    # matched.  Returning nothing makes the lexer drop the match.
    return None
def t_newline(t):
    r'\n+'
    # Regex above: one or more newline characters.  The match consists only
    # of newlines, so its length is the number of lines to advance the
    # lexer's line counter by.  No token is returned.
    newline_count = len(t.value)
    t.lexer.lineno += newline_count
def t_error(t):
    # Lexer error hook: when no rule matches at the current position, step
    # over exactly one character and let lexing continue.
    lexer = t.lexer
    lexer.skip(1)
# Accumulator for the values p_columns extracts during a parse.
# getCurrentSquad rebinds this name to a fresh list on every call and
# returns that list.
squad_list = []
def p_start(p):
    '''start : init'''
    # Grammar rule (PLY reads it from the docstring): the start symbol simply
    # delegates to ``init``.  No semantic action is needed.
def p_init(p):
    '''init : before BEGINSQUAD skip OPENBODY rows CLOSEBODY'''
    # Grammar rule (PLY reads it from the docstring).  Shape of the input the
    # parser accepts: an ignored prefix (``before``), the BEGINSQUAD
    # headline, skippable text/links, then one ``<tbody>`` whose rows are
    # parsed by ``rows``.  No semantic action is needed here.
def p_before(p):
    '''before : CONTENT before
| OPENBODY before
| CLOSEBODY before
| OPENHEAD before
| CLOSEHEAD before
| OPENDATA before
| CLOSEDATA before
| OPENHREF before
| CLOSEHREF before
| OPENROW before
| CLOSEROW before
| '''
    # Grammar rule (PLY reads it from the docstring; the text is kept exactly
    # as it was).  ``before`` is a possibly empty sequence of any of the
    # listed tokens.  BEGINSQUAD is not among them, so the sequence ends at
    # the squad headline.  Nothing is recorded for this part of the input.
def p_rows(p):
    '''rows : OPENROW OPENHEAD CONTENT CLOSEHEAD OPENHEAD CONTENT CLOSEHEAD OPENHEAD CONTENT CLOSEHEAD OPENHEAD CONTENT CLOSEHEAD OPENHEAD CONTENT CLOSEHEAD OPENHEAD CONTENT CLOSEHEAD OPENHEAD CONTENT CLOSEHEAD CLOSEROW rows
| OPENROW columns CLOSEROW rows
| '''
    # Grammar rule (PLY reads it from the docstring; the text is kept exactly
    # as it was).  ``rows`` is a possibly empty sequence of table rows, each
    # of which is either
    #   * a row of seven header cells, each holding a single CONTENT token, or
    #   * a row whose cells are described by ``columns``.
    # Extraction happens in p_columns; nothing is recorded here.
def p_columns(p):
    '''columns : OPENDATA skip CLOSEDATA OPENDATA skip CLOSEDATA OPENHEAD OPENHREF CONTENT CLOSEHREF CLOSEHEAD OPENDATA skip CLOSEDATA OPENDATA skip CLOSEDATA OPENDATA skip CLOSEDATA OPENDATA skip CLOSEDATA
| OPENDATA skip CLOSEDATA OPENDATA skip CLOSEDATA OPENHEAD OPENHREF CONTENT CLOSEHREF CONTENT skip CLOSEHEAD OPENDATA skip CLOSEDATA OPENDATA skip CLOSEDATA OPENDATA skip CLOSEDATA OPENDATA skip CLOSEDATA
| OPENDATA CLOSEDATA'''
    # Grammar rule (PLY reads it from the docstring; the text is kept exactly
    # as it was).  Three row layouts are accepted:
    #   1. two data cells, a header cell holding a link, four data cells;
    #   2. the same, with extra text after the link inside the header cell;
    #   3. a single empty data cell.
    # Layout 3 has only two right-hand-side symbols, so len(p) is 3 and there
    # is nothing to record.
    if len(p) <= 3:
        return
    # In layouts 1 and 2 the ninth symbol is the CONTENT token inside the
    # link; its text is what gets collected.
    link_text = p[9]
    squad_list.append(link_text)
def p_skip(p):
    '''skip : CONTENT skip
| OPENHREF skip
| CLOSEHREF skip
| '''
    # Grammar rule (PLY reads it from the docstring; the text is kept exactly
    # as it was).  ``skip`` is a possibly empty run of text and link
    # open/close tokens whose contents are ignored.
def p_error(p):
    # Parser error hook: syntax errors are deliberately ignored (nothing is
    # printed or raised).  ``p`` is the offending token as supplied by the
    # parser; it is not inspected.
    return None
# Build the lexer and parser, dump the token stream, and parse a saved page
def getCurrentSquad(fname):
    """Parse a saved HTML page and return the values collected from it.

    Reads ``<fname>.html`` (UTF-8), builds a fresh PLY lexer and parser from
    this module's rules, writes one line per lexer token to ``<fname>.txt``
    (UTF-8, overwritten), then parses the page.

    Args:
        fname: Path of the saved page without the ``.html`` extension.

    Returns:
        The module-level ``squad_list``, rebound to a new list at the start
        of the call and filled by the ``columns`` grammar rule during parsing.

    Raises:
        OSError: If the input cannot be read or the token dump cannot be
            written.
    """
    global squad_list
    squad_list = []

    lexer = lex.lex()
    parser = yacc.yacc()

    # Context managers make sure both files are closed even when reading,
    # lexing or writing raises.
    with open(fname + ".html", 'r', encoding='utf-8') as html_file:
        data = html_file.read()

    # First pass: tokenize only, to produce the token dump next to the input.
    lexer.input(data)
    with open(fname + ".txt", 'w', encoding='utf-8') as token_dump:
        for tok in lexer:
            token_dump.write(str(tok) + '\n')

    # Second pass: the actual parse; the grammar actions fill squad_list.
    parser.parse(data)
    return squad_list
# print(getCurrentSquad("Australia_men%27s_national_soccer_team"))