Project2-api.py
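"""Download images from Unsplash search results.

Note: unsplash.com/napi appears to be the endpoint used by the Unsplash website
itself rather than the documented developer API, so it may change or rate-limit
without notice.
"""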
from httpx import get
import os


def get_response_for(keyword, results_per_page, page=1):
    # Query the Unsplash search endpoint and return the parsed JSON,
    # or None if the request did not succeed.
    url = f'https://unsplash.com/napi/search/photos?page={page}&per_page={results_per_page}&query={keyword}&xp=semantic-search%3Aexperiment'
    response = get(url)
    if response.status_code == 200:
        return response.json()
    return None
def get_image_url(data):
    # Collect the raw image URL of every non-premium result.
    results = data['results']
    img_urls = [x['urls']['raw'] for x in results if x['premium'] is False]
    img_urls = [x.split('?')[0] for x in img_urls]  # drop query params to get canonical image URLs
    return img_urls
def download_images(img_urls, max_download, dest_dir='images_from_api_method', tag=''):
    # Download at most max_download images into dest_dir and
    # return the number that were saved successfully.
    successfully_downloaded_images = 0
    for url in img_urls:
        if successfully_downloaded_images >= max_download:
            break
        response = get(url)
        if response.status_code != 200:
            continue  # skip images that fail to download
        file_name = url.split('/')[-1]
        if not os.path.exists(dest_dir):
            os.makedirs(dest_dir)
        with open(f'{dest_dir}/{tag}{file_name}.jpeg', 'wb') as f:
            f.write(response.content)
        successfully_downloaded_images += 1
    return successfully_downloaded_images
def scrape(keyword, num_of_results):
    # Page through the search results until num_of_results images have been
    # downloaded or the API stops returning results.
    start_page = 1
    success_count = 0
    while success_count < num_of_results:
        data = get_response_for(keyword, results_per_page=20, page=start_page)
        max_download = num_of_results - success_count
        if data and data['results']:
            img_urls = get_image_url(data)
            successful_downloads = download_images(img_urls, max_download, tag=keyword)
            success_count += successful_downloads
            start_page += 1
        else:
            print('Error: no data returned')
            break
if __name__ == '__main__':
    # Download x images for a given search term.
    # For x = 2, page 1 suffices; for x = 200, page 1 most definitely does not.
    # data = get_response_for('dolphins', 3)
    # print(get_image_url(data))
    scrape('microphone', 10)