DataCleaning.py
from ast import literal_eval
import re
import requests
from bs4 import BeautifulSoup
import time


def nyt_css_class(css_class):
    """
    This is a support function for the "get_nyt_review_text" function; it allows the BS4 parser to only extract desired text data.

    Args:
        css_class [bs4 class_ element]: This arg takes the css element passed to the keyword argument.

    Returns:
        boolean: Returns True if the argument matches, and False otherwise.
    """
    # Return relevant text fields:
    return css_class is not None and "css" in css_class


def get_nyt_review_text(url, re_cls=re.compile('class="(.*?)"')):
    """
    This function returns the text from the NYT movie review at the "url" input.

    Args:
        url [string]: The url for a NYT movie review.
        re_cls [re.compile obj]: The compiled regex used to determine the different css classes existing in a bs4 paragraph object.

    Returns:
        string: The text from a NYT movie review.
    """
    # Set up BeautifulSoup:
    session = requests.Session()
    session.headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.1.2222.33 Safari/537.36",
        "Accept-Encoding": "*",
        "Connection": "keep-alive"
    }
    req = session.get(url)
    soup = BeautifulSoup(req.text, 'html.parser')
    paragraphs = soup.find_all('p', class_=nyt_css_class)
    # Convert paragraphs to a list for processing:
    paragraphs_lst = list(paragraphs)
    # Get the value of "class" for each separate "paragraph" object:
    paragraphs_cls = [re_cls.findall(str(cls))[0] for cls in paragraphs_lst]
    # The review text makes up the majority of the text on a review page, and is associated with the
    # class that has the most objects associated with it:
    cls_counts = dict()
    for i in paragraphs_cls:
        cls_counts[i] = cls_counts.get(i, 0) + 1
    review_cls = max(cls_counts, key=cls_counts.get)
    review_text = ""
    for i in range(len(paragraphs)):
        if paragraphs_cls[i] == review_cls:
            review_text = review_text + paragraphs[i].get_text()
    time.sleep(4)  # to reduce load
    return review_text
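

# A minimal usage sketch for get_nyt_review_text, left commented out because it makes a
# live network request; the URL below is a hypothetical placeholder, not one used in
# this project:
#
# sample_url = "https://www.nytimes.com/2019/01/01/movies/example-review.html"
# print(get_nyt_review_text(sample_url)[:500])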


def get_omdb_awards(df_row, award_col="Awards"):
    """This function returns new, separated columns with integer values for the awards
    a movie has won, from the "Awards" column found in the OMDB API response data.

    Args:
        df_row (pandas DataFrame row): A single row from a pandas DataFrame, containing
            a column with string data about the awards a particular movie has been
            nominated for, or won.
        award_col (str): The name of the column containing the Awards string data.

    Returns:
        [int]: Six integers are returned, containing values for the number of Oscar wins,
            Oscar nominations, Emmy wins, Emmy nominations, total wins, and total
            nominations, respectively.
    """
    x = str(df_row[award_col])
    if "." in x:
        # Strings like "Won 2 Oscars. Another 5 wins & 3 nominations." split into an
        # award-specific sentence (x[0]) and a totals sentence (x[1]):
        x = x.split(".")
        oscar_wins = int(re.findall(r"\d+", x[0])[0]) if "Won" in x[0] and "Oscar" in x[0] else 0
        oscar_noms = int(re.findall(r"\d+", x[0])[0]) if "Nominated" in x[0] and "Oscar" in x[0] else 0
        emmy_wins = int(re.findall(r"\d+", x[0])[0]) if "Won" in x[0] and "Emmy" in x[0] else 0
        emmy_noms = int(re.findall(r"\d+", x[0])[0]) if "Nominated" in x[0] and "Emmy" in x[0] else 0
        total_wins = int(re.findall(r"\d+(?= win)", x[1])[0]) if "win" in x[1] else 0
        total_noms = int(re.findall(r"\d+(?= nomination)", x[1])[0]) if "nomination" in x[1] else 0
    else:
        oscar_wins, oscar_noms, emmy_wins, emmy_noms = 0, 0, 0, 0
        total_wins = int(re.findall(r"\d+(?= win)", x)[0]) if "win" in x else 0
        total_noms = int(re.findall(r"\d+(?= nomination)", x)[0]) if "nomination" in x else 0
    return oscar_wins, oscar_noms, emmy_wins, emmy_noms, total_wins, total_noms
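

# A minimal sketch of applying get_omdb_awards row-wise with pandas; the toy DataFrame
# and its "Awards" strings below are illustrative samples in the OMDb style, not data
# from this project (pandas is only needed for this demo, not for the functions above).
if __name__ == "__main__":
    import pandas as pd

    demo_df = pd.DataFrame({
        "Title": ["Movie A", "Movie B"],
        "Awards": ["Won 2 Oscars. Another 5 wins & 3 nominations.", "N/A"],
    })
    # result_type="expand" turns the returned 6-tuple into six columns:
    awards = demo_df.apply(get_omdb_awards, axis=1, result_type="expand")
    awards.columns = ["OscarWins", "OscarNoms", "EmmyWins", "EmmyNoms", "TotalWins", "TotalNoms"]
    print(demo_df.join(awards))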