-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathIMDB_Scrap_Data.py
95 lines (61 loc) · 1.91 KB
/
IMDB_Scrap_Data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#!/usr/bin/env python
# coding: utf-8
# In[1]:
import requests
from bs4 import BeautifulSoup
from tqdm import tnrange
# In[3]:
# Parallel accumulators: position k in every list describes the same movie.
# Each name is bound to its own, distinct empty list.
movie_name, cast, ids = [], [], []
genre, runtime, ratings = [], [], []
def split_cast(lists):
    """Return the cast names listed after the 'Stars:' marker.

    Args:
        lists: Sequence of objects exposing a ``.text`` attribute (the caller
            passes the ``<p>`` tags of one list item). Only the third
            element, ``lists[2]``, is inspected.

    Returns:
        A list of names with surrounding whitespace removed, in the order
        they appear. An empty list is returned when there are fewer than
        three elements, when the 'Stars:' marker is absent, or when nothing
        follows the marker.
    """
    marker = 'Stars:'
    if len(lists) < 3:
        return []
    text = lists[2].text
    idx = text.find(marker)
    if idx == -1:
        # Without this guard, find()'s -1 would make the slice below start
        # at an arbitrary offset and return unrelated text as "cast".
        return []
    # Names are spread over several lines; drop the newlines before
    # splitting on the commas that separate them.
    tail = text[idx + len(marker):].replace('\n', '').strip()
    # Skip empty segments (e.g. from a trailing comma or an empty tail).
    return [name.strip() for name in tail.split(',') if name.strip()]
# Fetch pages 1-10 of the IMDb list and append one entry per movie to each of
# the parallel module-level lists (movie_name, ids, genre, runtime, ratings,
# cast).
for i in tnrange(1, 11):
    # A timeout keeps a stalled connection from blocking the run forever, and
    # raise_for_status() stops an HTTP error page from being parsed as if it
    # were a page of movies.
    html = requests.get(
        "https://www.imdb.com/list/ls006266261/?sort=user_rating,desc&st_dt=&mode=detail&page=" + str(i),
        timeout=30,
    )
    html.raise_for_status()
    soup = BeautifulSoup(html.content, "html5lib")
    for movie in soup.select('div.lister-item-content'):
        # The first link inside the item supplies both the title text and
        # the href from which the id is taken.
        title_link = movie.find("a")
        movie_name.append(title_link.text)
        # Third '/'-separated piece of the href; presumably the IMDb title
        # id (href shaped like '/title/<id>/...') - TODO confirm.
        ids.append(title_link['href'].split('/')[2])
        # Remove all whitespace from the genre text, then split on commas.
        genre.append("".join(movie.find('span', 'genre').text.split()).split(","))
        # Only the first whitespace-separated token is converted to a number.
        # NOTE(review): find() returns None when a span is missing, so an
        # item without genre/runtime/rating raises AttributeError here.
        runtime.append(int(movie.find('span', 'runtime').text.split()[0]))
        ratings.append(float(movie.find('span', 'ipl-rating-star__rating').text.split()[0]))
        cast.append(split_cast(movie.findAll("p")))
# In[4]:
# Combine the parallel per-movie lists into one dict per movie.
# zip() walks the lists in lockstep, which replaces manual indexing; the
# key order matches the order in which the fields were originally assigned.
final_data = [
    {
        'id': movie_id,
        'movie_name': title,
        'cast': movie_cast,
        'rating': movie_rating,
        'runtime': movie_runtime,
        'genre': movie_genre,
    }
    for movie_id, title, movie_cast, movie_rating, movie_runtime, movie_genre in zip(
        ids, movie_name, cast, ratings, runtime, genre
    )
]
# In[5]:
from pymongo import MongoClient
# Connect once to the MongoDB server on localhost:27017. A second,
# argument-less MongoClient() was previously created first and immediately
# overwritten, leaving an unused client that was never closed.
client = MongoClient('mongodb://localhost:27017/')
# Handles for the 'IMDB' database and its 'movies_data' collection.
db = client['IMDB']
collection = db['movies_data']
# In[6]:
# Store all movie documents with one bulk write instead of one server round
# trip per document. `collection` is the same 'movies_data' collection that
# `db.movies_data` refers to.
# insert_many() rejects an empty list, so skip the call when there is
# nothing to insert.
if final_data:
    collection.insert_many(final_data)
# In[ ]:
# Print every document currently stored in the collection.
for document in collection.find():
    print(document)