From aa1f01254407ba49617e834850f82d6e4ce4eedc Mon Sep 17 00:00:00 2001 From: Joeclinton1 <48254978+Joeclinton1@users.noreply.github.com> Date: Wed, 5 Feb 2020 23:56:38 +0100 Subject: [PATCH 01/31] Fixed issue with links not being found Google recently changed the way they present the image data, and so the links were no longer being scraped. I figured out how to get the image urls with the new system and made the appropriate changes so it would work. Unfortunately, google no longer provides file format data so I had to try and retrieve it from the url of the image, which does not work in some cases. --- .../google_images_download.py | 45 +++++++++++-------- 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/google_images_download/google_images_download.py b/google_images_download/google_images_download.py index fd89a3a9..4eebe057 100755 --- a/google_images_download/google_images_download.py +++ b/google_images_download/google_images_download.py @@ -6,6 +6,7 @@ # Import Libraries import sys +import ast version = (3, 0) cur_version = sys.version_info if cur_version >= version: # If the Current Version of Python is 3.0 or above @@ -271,15 +272,18 @@ def get_all_tabs(self,page): #Format the object in readable format def format_object(self,object): + data = object[1] + main = data[3] + info = data[9] formatted_object = {} - formatted_object['image_format'] = object['ity'] - formatted_object['image_height'] = object['oh'] - formatted_object['image_width'] = object['ow'] - formatted_object['image_link'] = object['ou'] - formatted_object['image_description'] = object['pt'] - formatted_object['image_host'] = object['rh'] - formatted_object['image_source'] = object['ru'] - formatted_object['image_thumbnail_url'] = object['tu'] + formatted_object['image_height'] = main[2] + formatted_object['image_width'] = main[1] + formatted_object['image_link'] = main[0] + formatted_object['image_format']=main[0][-1*(len(main[0])-main[0].rfind(".")-1):] + formatted_object['image_description'] = info['2003'][3] + formatted_object['image_host'] = info['183836587'][0] + formatted_object['image_source'] = info['2003'][2] + formatted_object['image_thumbnail_url'] = data[2][0] return formatted_object @@ -482,7 +486,7 @@ def create_directories(self,main_directory, dir_name,thumbnail,thumbnail_only): try: if not os.path.exists(main_directory): os.makedirs(main_directory) - time.sleep(0.2) + time.sleep(0.15) path = (dir_name) sub_directory = os.path.join(main_directory, path) if not os.path.exists(sub_directory): @@ -740,24 +744,29 @@ def _get_next_item(self,s): # Getting all links with the help of '_images_get_next_image' + def _get_image_objects(self,s): + start_line = s.find("AF_initDataCallback({key: \\'ds:2\\'") - 10 + start_object = s.find('[', start_line + 1) + end_object = s.find('', start_object + 1) - 4 + object_raw = str(s[start_object:end_object]) + object_decode = bytes(object_raw, "utf-8").decode("unicode_escape") + image_objects = json.loads(object_decode)[31][0][12][2] + return image_objects + def _get_all_items(self,page,main_directory,dir_name,limit,arguments): items = [] abs_path = [] errorCount = 0 i = 0 count = 1 + image_objects = self._get_image_objects(page) while count < limit+1: - object, end_content = self._get_next_item(page) - if object == "no_links": + if len(image_objects) == 0: + print("no_links") break - elif object == "": - page = page[end_content:] - elif arguments['offset'] and count < int(arguments['offset']): - count += 1 - page = page[end_content:] else: #format the item 
for readability - object = self.format_object(object) + object = self.format_object(image_objects[i]) if arguments['metadata']: if not arguments["silent_mode"]: print("\nImage Metadata: " + str(object)) @@ -784,8 +793,6 @@ def _get_all_items(self,page,main_directory,dir_name,limit,arguments): #delay param if arguments['delay']: time.sleep(int(arguments['delay'])) - - page = page[end_content:] i += 1 if count < limit: print("\n\nUnfortunately all " + str( From 66f69d670db194e1d6b50c70a0940379ff963409 Mon Sep 17 00:00:00 2001 From: Joe Clinton <48254978+Joeclinton1@users.noreply.github.com> Date: Sun, 9 Feb 2020 16:53:23 +0100 Subject: [PATCH 02/31] Fixed None type By filtering out the image objects which had data[0]==2, I have removed the null items and it will no longer give the error: "TypeError: 'NoneType' object is not subscriptable". --- .../google_images_download.py | 47 +++++++++++-------- 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/google_images_download/google_images_download.py b/google_images_download/google_images_download.py index fd89a3a9..5447de5f 100755 --- a/google_images_download/google_images_download.py +++ b/google_images_download/google_images_download.py @@ -271,15 +271,18 @@ def get_all_tabs(self,page): #Format the object in readable format def format_object(self,object): + data = object[1] + main = data[3] + info = data[9] formatted_object = {} - formatted_object['image_format'] = object['ity'] - formatted_object['image_height'] = object['oh'] - formatted_object['image_width'] = object['ow'] - formatted_object['image_link'] = object['ou'] - formatted_object['image_description'] = object['pt'] - formatted_object['image_host'] = object['rh'] - formatted_object['image_source'] = object['ru'] - formatted_object['image_thumbnail_url'] = object['tu'] + formatted_object['image_height'] = main[2] + formatted_object['image_width'] = main[1] + formatted_object['image_link'] = main[0] + formatted_object['image_format']=main[0][-1*(len(main[0])-main[0].rfind(".")-1):] + formatted_object['image_description'] = info['2003'][3] + formatted_object['image_host'] = info['183836587'][0] + formatted_object['image_source'] = info['2003'][2] + formatted_object['image_thumbnail_url'] = data[2][0] return formatted_object @@ -482,7 +485,7 @@ def create_directories(self,main_directory, dir_name,thumbnail,thumbnail_only): try: if not os.path.exists(main_directory): os.makedirs(main_directory) - time.sleep(0.2) + time.sleep(0.15) path = (dir_name) sub_directory = os.path.join(main_directory, path) if not os.path.exists(sub_directory): @@ -740,24 +743,30 @@ def _get_next_item(self,s): # Getting all links with the help of '_images_get_next_image' + def _get_image_objects(self,s): + start_line = s.find("AF_initDataCallback({key: \\'ds:2\\'") - 10 + start_object = s.find('[', start_line + 1) + end_object = s.find('', start_object + 1) - 4 + object_raw = str(s[start_object:end_object]) + object_decode = bytes(object_raw, "utf-8").decode("unicode_escape") + image_objects = json.loads(object_decode)[31][0][12][2] + image_objects = [x for x in image_objects if x[0]==1] + return image_objects + def _get_all_items(self,page,main_directory,dir_name,limit,arguments): items = [] abs_path = [] errorCount = 0 i = 0 count = 1 - while count < limit+1: - object, end_content = self._get_next_item(page) - if object == "no_links": + image_objects = self._get_image_objects(page) + while count < limit+1 and i Date: Sun, 9 Feb 2020 17:08:49 +0100 Subject: [PATCH 03/31] Update 
google_images_download.py --- google_images_download/google_images_download.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/google_images_download/google_images_download.py b/google_images_download/google_images_download.py index 4eebe057..bbceba8c 100755 --- a/google_images_download/google_images_download.py +++ b/google_images_download/google_images_download.py @@ -751,6 +751,7 @@ def _get_image_objects(self,s): object_raw = str(s[start_object:end_object]) object_decode = bytes(object_raw, "utf-8").decode("unicode_escape") image_objects = json.loads(object_decode)[31][0][12][2] + image_objects = [x for x in image_objects if x[0]==1] return image_objects def _get_all_items(self,page,main_directory,dir_name,limit,arguments): @@ -760,7 +761,7 @@ def _get_all_items(self,page,main_directory,dir_name,limit,arguments): i = 0 count = 1 image_objects = self._get_image_objects(page) - while count < limit+1: + while count < limit+1 and i < len(image_objects): Date: Mon, 10 Feb 2020 08:02:02 +0100 Subject: [PATCH 04/31] Fix more none type errors This system is not very flexible; it seems Google does not keep the same positions of target items, so sometimes it doesn't work. I added a try-except just in case there are more problems. --- .../google_images_download.py | 25 +++++++++++-------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/google_images_download/google_images_download.py b/google_images_download/google_images_download.py index bbceba8c..d7cf91e6 100755 --- a/google_images_download/google_images_download.py +++ b/google_images_download/google_images_download.py @@ -6,7 +6,6 @@ # Import Libraries import sys -import ast version = (3, 0) cur_version = sys.version_info if cur_version >= version: # If the Current Version of Python is 3.0 or above @@ -275,15 +274,21 @@ def format_object(self,object): data = object[1] main = data[3] info = data[9] + if info is None: + info = data[11] formatted_object = {} - formatted_object['image_height'] = main[2] - formatted_object['image_width'] = main[1] - formatted_object['image_link'] = main[0] - formatted_object['image_format']=main[0][-1*(len(main[0])-main[0].rfind(".")-1):] - formatted_object['image_description'] = info['2003'][3] - formatted_object['image_host'] = info['183836587'][0] - formatted_object['image_source'] = info['2003'][2] - formatted_object['image_thumbnail_url'] = data[2][0] + try: + formatted_object['image_height'] = main[2] + formatted_object['image_width'] = main[1] + formatted_object['image_link'] = main[0] + formatted_object['image_format']=main[0][-1*(len(main[0])-main[0].rfind(".")-1):] + formatted_object['image_description'] = info['2003'][3] + formatted_object['image_host'] = info['183836587'][0] + formatted_object['image_source'] = info['2003'][2] + formatted_object['image_thumbnail_url'] = data[2][0] + except Exception as e: + print(e) + return None return formatted_object @@ -1015,5 +1020,3 @@ def main(): if __name__ == "__main__": main() - -# In[ ]: From ef577fc0f7a8558073a9a8bc227fdaca0136b2ab Mon Sep 17 00:00:00 2001 From: Alexey Voinov Date: Sat, 14 Mar 2020 22:38:18 +0100 Subject: [PATCH 05/31] Fix download of >100 items It is based on a patch by https://github.com/Joeclinton1, but for some reason we get an escaped string when getting the results page directly (limit < 101) and an unescaped one when getting the results page using selenium. This is not the most elegant solution, but it works for me.
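In sketch form the two extraction paths look roughly like this (a minimal, hypothetical helper: it assumes the find('') calls in these patches search for the closing '</script>' tag, as the PATCH 12/31 message below confirms, and it drops the original's -10/-11 offset bookkeeping):

    import json

    def get_image_objects(page):
        # Escaped variant: what urllib returns when the page is fetched directly.
        start_line = page.find("AF_initDataCallback({key: \\'ds:1\\'")
        escaped = start_line != -1
        if not escaped:
            # Unescaped variant: what selenium's page_source returns.
            start_line = page.find("AF_initDataCallback({key: 'ds:1'")
        start_object = page.find('[', start_line + 1)
        end_object = page.find('</script>', start_object + 1) - 4
        data = page[start_object:end_object]
        if escaped:
            # Only the directly fetched page needs the unicode_escape round-trip.
            data = bytes(data, "utf-8").decode("unicode_escape")
        image_objects = json.loads(data)[31][0][12][2]
        return [x for x in image_objects if x[0] == 1]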
--- google_images_download/google_images_download.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/google_images_download/google_images_download.py b/google_images_download/google_images_download.py index d7cf91e6..3dfc609a 100755 --- a/google_images_download/google_images_download.py +++ b/google_images_download/google_images_download.py @@ -750,11 +750,17 @@ def _get_next_item(self,s): # Getting all links with the help of '_images_get_next_image' def _get_image_objects(self,s): - start_line = s.find("AF_initDataCallback({key: \\'ds:2\\'") - 10 - start_object = s.find('[', start_line + 1) - end_object = s.find('', start_object + 1) - 4 - object_raw = str(s[start_object:end_object]) - object_decode = bytes(object_raw, "utf-8").decode("unicode_escape") + start_line = s.find("AF_initDataCallback({key: \\'ds:1\\'") - 10 + if start_line == -11: + start_line = s.find("AF_initDataCallback({key: 'ds:1'") - 10 + start_object = s.find('[', start_line + 1) + end_object = s.find('', start_object + 1) - 4 + object_decode = str(s[start_object:end_object]) + else: + start_object = s.find('[', start_line + 1) + end_object = s.find('', start_object + 1) - 4 + object_raw = str(s[start_object:end_object]) + object_decode = bytes(object_raw, "utf-8").decode("unicode_escape") image_objects = json.loads(object_decode)[31][0][12][2] image_objects = [x for x in image_objects if x[0]==1] return image_objects From 90e52a4a35c3861fddcc751439064909b166da7d Mon Sep 17 00:00:00 2001 From: Alexey Voinov Date: Tue, 24 Mar 2020 09:02:20 +0100 Subject: [PATCH 06/31] Intercept ajax calls --- .../google_images_download.py | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/google_images_download/google_images_download.py b/google_images_download/google_images_download.py index 3dfc609a..de39e04a 100755 --- a/google_images_download/google_images_download.py +++ b/google_images_download/google_images_download.py @@ -184,6 +184,38 @@ def download_extended_page(self,url,chromedriver): # Open the link browser.get(url) + browser.execute_script(""" + (function(XHR){ + "use strict"; + var open = XHR.prototype.open; + var send = XHR.prototype.send; + var data = []; + + XHR.prototype.open = function(method, url, async, user, pass) { + this._url = url; + open.call(this, method, url, async, user, pass); + } + + XHR.prototype.send = function(data) { + var self = this; + var url = this._url; + + function stateChanged() { + if (self.readyState == 4) { + console.log("data available for: " + url) + XHR.prototype._data.push(self.response); + } + } + if (url.includes("/batchexecute?")) { + this.addEventListener("readystatechange", stateChanged, false); + } + send.call(this, data); + }; + + XHR.prototype._data = []; + })(XMLHttpRequest); + """) + time.sleep(1) print("Getting you a lot of images. 
This may take a few moments...") @@ -207,6 +239,8 @@ def download_extended_page(self,url,chromedriver): time.sleep(0.5) source = browser.page_source #page source + ajax = browser.execute_script("return XMLHttpRequest.prototype._data") + #close the browser browser.close() From 7db9a4608f584ae6925e3ebff001579dce284b39 Mon Sep 17 00:00:00 2001 From: Alexey Voinov Date: Tue, 24 Mar 2020 20:51:51 +0100 Subject: [PATCH 07/31] Decode data from ajax calls --- .../google_images_download.py | 68 ++++++++++--------- 1 file changed, 37 insertions(+), 31 deletions(-) diff --git a/google_images_download/google_images_download.py b/google_images_download/google_images_download.py index de39e04a..bac53f00 100755 --- a/google_images_download/google_images_download.py +++ b/google_images_download/google_images_download.py @@ -127,6 +127,28 @@ class googleimagesdownload: def __init__(self): pass + def _extract_data_pack(self, page): + start_line = page.find("AF_initDataCallback({key: \\'ds:1\\'") - 10 + start_object = page.find('[', start_line + 1) + end_object = page.find('', start_object + 1) - 4 + object_raw = str(page[start_object:end_object]) + return bytes(object_raw, "utf-8").decode("unicode_escape") + + def _extract_data_pack_extended(self, page): + start_line = page.find("AF_initDataCallback({key: 'ds:1'") - 10 + start_object = page.find('[', start_line + 1) + end_object = page.find('', start_object + 1) - 4 + return str(page[start_object:end_object]) + + def _extract_data_pack_ajax(self, data): + lines = data.split('\n') + return json.loads(lines[3] + lines[4])[0][2] + + def _image_objects_from_pack(self, data): + image_objects = json.loads(data)[31][0][12][2] + image_objects = [x for x in image_objects if x[0]==1] + return image_objects + # Downloading entire Web Document (Raw Page Content) def download_page(self,url): version = (3, 0) @@ -138,7 +160,7 @@ def download_page(self,url): req = urllib.request.Request(url, headers=headers) resp = urllib.request.urlopen(req) respData = str(resp.read()) - return respData + return self._image_objects_from_pack(self._extract_data_pack(respData)), self.get_all_tabs(respData) except Exception as e: print("Could not open URL. Please check your internet connection and/or ssl settings \n" "If you are using proxy, make sure your proxy settings is configured correctly") @@ -154,7 +176,7 @@ def download_page(self,url): context = ssl._create_unverified_context() response = urlopen(req, context=context) page = response.read() - return page + return self._image_objects_from_pack(self._extract_data_pack(page)), self.get_all_tabs(page) except: print("Could not open URL. 
Please check your internet connection and/or ssl settings \n" "If you are using proxy, make sure your proxy settings is configured correctly") @@ -239,12 +261,16 @@ def download_extended_page(self,url,chromedriver): time.sleep(0.5) source = browser.page_source #page source - ajax = browser.execute_script("return XMLHttpRequest.prototype._data") + images = self._image_objects_from_pack(self._extract_data_pack_extended(source)) + + ajax_data = browser.execute_script("return XMLHttpRequest.prototype._data") + for chunk in ajax_data: + images += self._image_objects_from_pack(self._extract_data_pack_ajax(chunk)) #close the browser browser.close() - return source + return images, self.get_all_tabs(source) #Correcting the escape characters for python2 @@ -781,31 +807,12 @@ def _get_next_item(self,s): final_object = "" return final_object, end_object - - # Getting all links with the help of '_images_get_next_image' - def _get_image_objects(self,s): - start_line = s.find("AF_initDataCallback({key: \\'ds:1\\'") - 10 - if start_line == -11: - start_line = s.find("AF_initDataCallback({key: 'ds:1'") - 10 - start_object = s.find('[', start_line + 1) - end_object = s.find('', start_object + 1) - 4 - object_decode = str(s[start_object:end_object]) - else: - start_object = s.find('[', start_line + 1) - end_object = s.find('', start_object + 1) - 4 - object_raw = str(s[start_object:end_object]) - object_decode = bytes(object_raw, "utf-8").decode("unicode_escape") - image_objects = json.loads(object_decode)[31][0][12][2] - image_objects = [x for x in image_objects if x[0]==1] - return image_objects - - def _get_all_items(self,page,main_directory,dir_name,limit,arguments): + def _get_all_items(self,image_objects,main_directory,dir_name,limit,arguments): items = [] abs_path = [] errorCount = 0 i = 0 count = 1 - image_objects = self._get_image_objects(page) while count < limit+1 and i Date: Wed, 25 Mar 2020 09:53:58 +0100 Subject: [PATCH 08/31] Get image format from Content-Type returned by server --- .../google_images_download.py | 48 ++++++++++++------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/google_images_download/google_images_download.py b/google_images_download/google_images_download.py index bac53f00..e5242f62 100755 --- a/google_images_download/google_images_download.py +++ b/google_images_download/google_images_download.py @@ -667,29 +667,43 @@ def download_image(self,image_url,image_format,main_directory,dir_name,count,pri response = urlopen(req, None, timeout) data = response.read() + info = response.info() response.close() - extensions = [".jpg", ".jpeg", ".gif", ".png", ".bmp", ".svg", ".webp", ".ico"] - # keep everything after the last '/' - image_name = str(image_url[(image_url.rfind('/')) + 1:]) - if format: - if not image_format or image_format != format: - download_status = 'fail' - download_message = "Wrong image format returned. Skipping..." - return_image_name = '' - absolute_path = '' - return download_status, download_message, return_image_name, absolute_path - - if image_format == "" or not image_format or "." 
+ image_format not in extensions: + qmark = image_url.rfind('?') + if qmark == -1: + qmark = len(image_url) + slash = image_url.rfind('/', 0, qmark) + 1 + image_name = str(image_url[slash:qmark]).lower() + + type = info.get_content_type() + if type == "image/jpeg" or type == "image/jpg": + if not image_name.endswith(".jpg") and not image_name.endswith(".jpeg"): + image_name += ".jpg" + elif type == "image/png": + if not image_name.endswith(".png"): + image_name += ".png" + elif type == "image/webp": + if not image_name.endswith(".webp"): + image_name += ".webp" + elif type == "image/gif": + if not image_name.endswith(".gif"): + image_name += ".gif" + elif type == "image/bmp" or type == "image/x-windows-bmp": + if not image_name.endswith(".bmp"): + image_name += ".bmp" + elif type == "image/x-icon" or type == "image/vnd.microsoft.icon": + if not image_name.endswith(".ico"): + image_name += ".ico" + elif type == "image/svg+xml": + if not image_name.endswith(".svg"): + image_name += ".svg" + else: download_status = 'fail' - download_message = "Invalid or missing image format. Skipping..." + download_message = "Invalid image format '" + type + "'. Skipping..." return_image_name = '' absolute_path = '' return download_status, download_message, return_image_name, absolute_path - elif image_name.lower().find("." + image_format) < 0: - image_name = image_name + "." + image_format - else: - image_name = image_name[:image_name.lower().find("." + image_format) + (len(image_format) + 1)] # prefix name in image if prefix: From 068712b4681bddcdc305b8824789361d576793ef Mon Sep 17 00:00:00 2001 From: Joe Clinton <48254978+Joeclinton1@users.noreply.github.com> Date: Wed, 25 Mar 2020 12:47:31 +0100 Subject: [PATCH 09/31] changed start_line (ds:2 to ds:1) --- google_images_download/google_images_download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/google_images_download/google_images_download.py b/google_images_download/google_images_download.py index bbceba8c..345f4c87 100755 --- a/google_images_download/google_images_download.py +++ b/google_images_download/google_images_download.py @@ -745,7 +745,7 @@ def _get_next_item(self,s): # Getting all links with the help of '_images_get_next_image' def _get_image_objects(self,s): - start_line = s.find("AF_initDataCallback({key: \\'ds:2\\'") - 10 + start_line = s.find("AF_initDataCallback({key: \\'ds:1\\'") - 10 start_object = s.find('[', start_line + 1) end_object = s.find('', start_object + 1) - 4 object_raw = str(s[start_object:end_object]) From d8dd8a984282065396696322d6c10e7a32ea1cec Mon Sep 17 00:00:00 2001 From: Alexey Voinov Date: Wed, 17 Jun 2020 16:09:09 +0200 Subject: [PATCH 10/31] google changed their format a little. 
again --- google_images_download/google_images_download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/google_images_download/google_images_download.py b/google_images_download/google_images_download.py index e5242f62..b24953e0 100755 --- a/google_images_download/google_images_download.py +++ b/google_images_download/google_images_download.py @@ -130,7 +130,7 @@ def __init__(self): def _extract_data_pack(self, page): start_line = page.find("AF_initDataCallback({key: \\'ds:1\\'") - 10 start_object = page.find('[', start_line + 1) - end_object = page.find('', start_object + 1) - 4 + end_object = page.find('', start_object + 1) - 5 object_raw = str(page[start_object:end_object]) return bytes(object_raw, "utf-8").decode("unicode_escape") From 620e7f54ea29631e5e7d8a2b82a16a40932bf42a Mon Sep 17 00:00:00 2001 From: Joe Clinton Date: Sat, 27 Jun 2020 15:58:24 +0200 Subject: [PATCH 11/31] removed unused get_next_item function --- .../google_images_download.py | 407 ++++++++++-------- 1 file changed, 237 insertions(+), 170 deletions(-) diff --git a/google_images_download/google_images_download.py b/google_images_download/google_images_download.py index b24953e0..4a6cf9eb 100755 --- a/google_images_download/google_images_download.py +++ b/google_images_download/google_images_download.py @@ -6,6 +6,7 @@ # Import Libraries import sys + version = (3, 0) cur_version = sys.version_info if cur_version >= version: # If the Current Version of Python is 3.0 or above @@ -15,6 +16,7 @@ from urllib.parse import quote import http.client from http.client import IncompleteRead, BadStatusLine + http.client._MAXHEADERS = 1000 else: # If the Current Version of Python is 2.x import urllib2 @@ -23,6 +25,7 @@ from urllib import quote import httplib from httplib import IncompleteRead, BadStatusLine + httplib._MAXHEADERS = 1000 import time # Importing the time library to check the time of code execution import os @@ -39,8 +42,9 @@ "exact_size", "aspect_ratio", "type", "time", "time_range", "delay", "url", "single_image", "output_directory", "image_directory", "no_directory", "proxy", "similar_images", "specific_site", "print_urls", "print_size", "print_paths", "metadata", "extract_metadata", "socket_timeout", - "thumbnail", "thumbnail_only", "language", "prefix", "chromedriver", "related_images", "safe_search", "no_numbering", - "offset", "no_download","save_source","silent_mode","ignore_urls"] + "thumbnail", "thumbnail_only", "language", "prefix", "chromedriver", "related_images", "safe_search", + "no_numbering", + "offset", "no_download", "save_source", "silent_mode", "ignore_urls"] def user_input(): @@ -52,7 +56,7 @@ def user_input(): if object_check['config_file'] != '': records = [] json_file = json.load(open(config_file_check[0].config_file)) - for record in range(0,len(json_file['Records'])): + for record in range(0, len(json_file['Records'])): arguments = {} for i in args_list: arguments[i] = None @@ -64,57 +68,104 @@ def user_input(): # Taking command line arguments from users parser = argparse.ArgumentParser() parser.add_argument('-k', '--keywords', help='delimited list input', type=str, required=False) - parser.add_argument('-kf', '--keywords_from_file', help='extract list of keywords from a text file', type=str, required=False) - parser.add_argument('-sk', '--suffix_keywords', help='comma separated additional words added after to main keyword', type=str, required=False) - parser.add_argument('-pk', '--prefix_keywords', help='comma separated additional words added before main keyword', 
type=str, required=False) + parser.add_argument('-kf', '--keywords_from_file', help='extract list of keywords from a text file', type=str, + required=False) + parser.add_argument('-sk', '--suffix_keywords', + help='comma separated additional words added after to main keyword', type=str, + required=False) + parser.add_argument('-pk', '--prefix_keywords', + help='comma separated additional words added before main keyword', type=str, required=False) parser.add_argument('-l', '--limit', help='delimited list input', type=str, required=False) parser.add_argument('-f', '--format', help='download images with specific format', type=str, required=False, choices=['jpg', 'gif', 'png', 'bmp', 'svg', 'webp', 'ico']) parser.add_argument('-u', '--url', help='search with google image URL', type=str, required=False) - parser.add_argument('-x', '--single_image', help='downloading a single image from URL', type=str, required=False) - parser.add_argument('-o', '--output_directory', help='download images in a specific main directory', type=str, required=False) - parser.add_argument('-i', '--image_directory', help='download images in a specific sub-directory', type=str, required=False) - parser.add_argument('-n', '--no_directory', default=False, help='download images in the main directory but no sub-directory', action="store_true") - parser.add_argument('-d', '--delay', help='delay in seconds to wait between downloading two images', type=int, required=False) + parser.add_argument('-x', '--single_image', help='downloading a single image from URL', type=str, + required=False) + parser.add_argument('-o', '--output_directory', help='download images in a specific main directory', type=str, + required=False) + parser.add_argument('-i', '--image_directory', help='download images in a specific sub-directory', type=str, + required=False) + parser.add_argument('-n', '--no_directory', default=False, + help='download images in the main directory but no sub-directory', action="store_true") + parser.add_argument('-d', '--delay', help='delay in seconds to wait between downloading two images', type=int, + required=False) parser.add_argument('-co', '--color', help='filter on color', type=str, required=False, - choices=['red', 'orange', 'yellow', 'green', 'teal', 'blue', 'purple', 'pink', 'white', 'gray', 'black', 'brown']) + choices=['red', 'orange', 'yellow', 'green', 'teal', 'blue', 'purple', 'pink', 'white', + 'gray', 'black', 'brown']) parser.add_argument('-ct', '--color_type', help='filter on color', type=str, required=False, choices=['full-color', 'black-and-white', 'transparent']) parser.add_argument('-r', '--usage_rights', help='usage rights', type=str, required=False, - choices=['labeled-for-reuse-with-modifications','labeled-for-reuse','labeled-for-noncommercial-reuse-with-modification','labeled-for-nocommercial-reuse']) + choices=['labeled-for-reuse-with-modifications', 'labeled-for-reuse', + 'labeled-for-noncommercial-reuse-with-modification', + 'labeled-for-nocommercial-reuse']) parser.add_argument('-s', '--size', help='image size', type=str, required=False, - choices=['large','medium','icon','>400*300','>640*480','>800*600','>1024*768','>2MP','>4MP','>6MP','>8MP','>10MP','>12MP','>15MP','>20MP','>40MP','>70MP']) - parser.add_argument('-es', '--exact_size', help='exact image resolution "WIDTH,HEIGHT"', type=str, required=False) + choices=['large', 'medium', 'icon', '>400*300', '>640*480', '>800*600', '>1024*768', '>2MP', + '>4MP', '>6MP', '>8MP', '>10MP', '>12MP', '>15MP', '>20MP', '>40MP', '>70MP']) + 
parser.add_argument('-es', '--exact_size', help='exact image resolution "WIDTH,HEIGHT"', type=str, + required=False) parser.add_argument('-t', '--type', help='image type', type=str, required=False, - choices=['face','photo','clipart','line-drawing','animated']) + choices=['face', 'photo', 'clipart', 'line-drawing', 'animated']) parser.add_argument('-w', '--time', help='image age', type=str, required=False, - choices=['past-24-hours','past-7-days','past-month','past-year']) - parser.add_argument('-wr', '--time_range', help='time range for the age of the image. should be in the format {"time_min":"MM/DD/YYYY","time_max":"MM/DD/YYYY"}', type=str, required=False) - parser.add_argument('-a', '--aspect_ratio', help='comma separated additional words added to keywords', type=str, required=False, + choices=['past-24-hours', 'past-7-days', 'past-month', 'past-year']) + parser.add_argument('-wr', '--time_range', + help='time range for the age of the image. should be in the format {"time_min":"MM/DD/YYYY","time_max":"MM/DD/YYYY"}', + type=str, required=False) + parser.add_argument('-a', '--aspect_ratio', help='comma separated additional words added to keywords', type=str, + required=False, choices=['tall', 'square', 'wide', 'panoramic']) - parser.add_argument('-si', '--similar_images', help='downloads images very similar to the image URL you provide', type=str, required=False) - parser.add_argument('-ss', '--specific_site', help='downloads images that are indexed from a specific website', type=str, required=False) - parser.add_argument('-p', '--print_urls', default=False, help="Print the URLs of the images", action="store_true") - parser.add_argument('-ps', '--print_size', default=False, help="Print the size of the images on disk", action="store_true") - parser.add_argument('-pp', '--print_paths', default=False, help="Prints the list of absolute paths of the images",action="store_true") - parser.add_argument('-m', '--metadata', default=False, help="Print the metadata of the image", action="store_true") - parser.add_argument('-e', '--extract_metadata', default=False, help="Dumps all the logs into a text file", action="store_true") - parser.add_argument('-st', '--socket_timeout', default=False, help="Connection timeout waiting for the image to download", type=float) - parser.add_argument('-th', '--thumbnail', default=False, help="Downloads image thumbnail along with the actual image", action="store_true") - parser.add_argument('-tho', '--thumbnail_only', default=False, help="Downloads only thumbnail without downloading actual images", action="store_true") - parser.add_argument('-la', '--language', default=False, help="Defines the language filter. 
The search results are authomatically returned in that language", type=str, required=False, - choices=['Arabic','Chinese (Simplified)','Chinese (Traditional)','Czech','Danish','Dutch','English','Estonian','Finnish','French','German','Greek','Hebrew','Hungarian','Icelandic','Italian','Japanese','Korean','Latvian','Lithuanian','Norwegian','Portuguese','Polish','Romanian','Russian','Spanish','Swedish','Turkish']) - parser.add_argument('-pr', '--prefix', default=False, help="A word that you would want to prefix in front of each image name", type=str, required=False) + parser.add_argument('-si', '--similar_images', + help='downloads images very similar to the image URL you provide', type=str, required=False) + parser.add_argument('-ss', '--specific_site', help='downloads images that are indexed from a specific website', + type=str, required=False) + parser.add_argument('-p', '--print_urls', default=False, help="Print the URLs of the images", + action="store_true") + parser.add_argument('-ps', '--print_size', default=False, help="Print the size of the images on disk", + action="store_true") + parser.add_argument('-pp', '--print_paths', default=False, + help="Prints the list of absolute paths of the images", action="store_true") + parser.add_argument('-m', '--metadata', default=False, help="Print the metadata of the image", + action="store_true") + parser.add_argument('-e', '--extract_metadata', default=False, help="Dumps all the logs into a text file", + action="store_true") + parser.add_argument('-st', '--socket_timeout', default=False, + help="Connection timeout waiting for the image to download", type=float) + parser.add_argument('-th', '--thumbnail', default=False, + help="Downloads image thumbnail along with the actual image", action="store_true") + parser.add_argument('-tho', '--thumbnail_only', default=False, + help="Downloads only thumbnail without downloading actual images", action="store_true") + parser.add_argument('-la', '--language', default=False, + help="Defines the language filter. 
The search results are authomatically returned in that language", + type=str, required=False, + choices=['Arabic', 'Chinese (Simplified)', 'Chinese (Traditional)', 'Czech', 'Danish', + 'Dutch', 'English', 'Estonian', 'Finnish', 'French', 'German', 'Greek', 'Hebrew', + 'Hungarian', 'Icelandic', 'Italian', 'Japanese', 'Korean', 'Latvian', 'Lithuanian', + 'Norwegian', 'Portuguese', 'Polish', 'Romanian', 'Russian', 'Spanish', 'Swedish', + 'Turkish']) + parser.add_argument('-pr', '--prefix', default=False, + help="A word that you would want to prefix in front of each image name", type=str, + required=False) parser.add_argument('-px', '--proxy', help='specify a proxy address and port', type=str, required=False) - parser.add_argument('-cd', '--chromedriver', help='specify the path to chromedriver executable in your local machine', type=str, required=False) - parser.add_argument('-ri', '--related_images', default=False, help="Downloads images that are similar to the keyword provided", action="store_true") - parser.add_argument('-sa', '--safe_search', default=False, help="Turns on the safe search filter while searching for images", action="store_true") - parser.add_argument('-nn', '--no_numbering', default=False, help="Allows you to exclude the default numbering of images", action="store_true") + parser.add_argument('-cd', '--chromedriver', + help='specify the path to chromedriver executable in your local machine', type=str, + required=False) + parser.add_argument('-ri', '--related_images', default=False, + help="Downloads images that are similar to the keyword provided", action="store_true") + parser.add_argument('-sa', '--safe_search', default=False, + help="Turns on the safe search filter while searching for images", action="store_true") + parser.add_argument('-nn', '--no_numbering', default=False, + help="Allows you to exclude the default numbering of images", action="store_true") parser.add_argument('-of', '--offset', help="Where to start in the fetched links", type=str, required=False) - parser.add_argument('-nd', '--no_download', default=False, help="Prints the URLs of the images and/or thumbnails without downloading them", action="store_true") - parser.add_argument('-iu', '--ignore_urls', default=False, help="delimited list input of image urls/keywords to ignore", type=str) - parser.add_argument('-sil', '--silent_mode', default=False, help="Remains silent. Does not print notification messages on the terminal", action="store_true") - parser.add_argument('-is', '--save_source', help="creates a text file containing a list of downloaded images along with source page url", type=str, required=False) + parser.add_argument('-nd', '--no_download', default=False, + help="Prints the URLs of the images and/or thumbnails without downloading them", + action="store_true") + parser.add_argument('-iu', '--ignore_urls', default=False, + help="delimited list input of image urls/keywords to ignore", type=str) + parser.add_argument('-sil', '--silent_mode', default=False, + help="Remains silent. 
Does not print notification messages on the terminal", + action="store_true") + parser.add_argument('-is', '--save_source', + help="creates a text file containing a list of downloaded images along with source page url", + type=str, required=False) args = parser.parse_args() arguments = vars(args) @@ -146,17 +197,18 @@ def _extract_data_pack_ajax(self, data): def _image_objects_from_pack(self, data): image_objects = json.loads(data)[31][0][12][2] - image_objects = [x for x in image_objects if x[0]==1] + image_objects = [x for x in image_objects if x[0] == 1] return image_objects # Downloading entire Web Document (Raw Page Content) - def download_page(self,url): + def download_page(self, url): version = (3, 0) cur_version = sys.version_info if cur_version >= version: # If the Current Version of Python is 3.0 or above try: headers = {} - headers['User-Agent'] = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36" + headers[ + 'User-Agent'] = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36" req = urllib.request.Request(url, headers=headers) resp = urllib.request.urlopen(req) respData = str(resp.read()) @@ -168,7 +220,8 @@ def download_page(self,url): else: # If the Current Version of Python is 2.x try: headers = {} - headers['User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17" + headers[ + 'User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17" req = urllib2.Request(url, headers=headers) try: response = urllib2.urlopen(req) @@ -183,9 +236,8 @@ def download_page(self,url): sys.exit() return "Page Not found" - # Download Page for more than 100 images - def download_extended_page(self,url,chromedriver): + def download_extended_page(self, url, chromedriver): from selenium import webdriver from selenium.webdriver.common.keys import Keys if sys.version_info[0] < 3: @@ -260,35 +312,33 @@ def download_extended_page(self,url,chromedriver): print("Reached end of Page.") time.sleep(0.5) - source = browser.page_source #page source + source = browser.page_source # page source images = self._image_objects_from_pack(self._extract_data_pack_extended(source)) ajax_data = browser.execute_script("return XMLHttpRequest.prototype._data") for chunk in ajax_data: images += self._image_objects_from_pack(self._extract_data_pack_ajax(chunk)) - #close the browser + # close the browser browser.close() return images, self.get_all_tabs(source) - - #Correcting the escape characters for python2 - def replace_with_byte(self,match): + # Correcting the escape characters for python2 + def replace_with_byte(self, match): return chr(int(match.group(0)[1:], 8)) - def repair(self,brokenjson): + def repair(self, brokenjson): invalid_escape = re.compile(r'\\[0-7]{1,3}') # up to 3 digits for byte values up to FF return invalid_escape.sub(self.replace_with_byte, brokenjson) - # Finding 'Next Image' from the given raw page - def get_next_tab(self,s): + def get_next_tab(self, s): start_line = s.find('class="dtviD"') if start_line == -1: # If no links are found then give an error! 
end_quote = 0 link = "no_tabs" - return link,'',end_quote + return link, '', end_quote else: start_line = s.find('class="dtviD"') start_content = s.find('href="', start_line + 1) @@ -307,16 +357,15 @@ def get_next_tab(self,s): if chars_end == -1: updated_item_name = (url_item_name[chars + 5:]).replace("+", " ") else: - updated_item_name = (url_item_name[chars+5:chars_end]).replace("+", " ") + updated_item_name = (url_item_name[chars + 5:chars_end]).replace("+", " ") return url_item, updated_item_name, end_content - # Getting all links with the help of '_images_get_next_image' - def get_all_tabs(self,page): + def get_all_tabs(self, page): tabs = {} while True: - item,item_name,end_content = self.get_next_tab(page) + item, item_name, end_content = self.get_next_tab(page) if item == "no_tabs": break else: @@ -328,9 +377,8 @@ def get_all_tabs(self,page): page = page[end_content:] return tabs - - #Format the object in readable format - def format_object(self,object): + # Format the object in readable format + def format_object(self, object): data = object[1] main = data[3] info = data[9] @@ -341,7 +389,7 @@ def format_object(self,object): formatted_object['image_height'] = main[2] formatted_object['image_width'] = main[1] formatted_object['image_link'] = main[0] - formatted_object['image_format']=main[0][-1*(len(main[0])-main[0].rfind(".")-1):] + formatted_object['image_format'] = main[0][-1 * (len(main[0]) - main[0].rfind(".") - 1):] formatted_object['image_description'] = info['2003'][3] formatted_object['image_host'] = info['183836587'][0] formatted_object['image_source'] = info['2003'][2] @@ -351,9 +399,8 @@ def format_object(self,object): return None return formatted_object - - #function to download single image - def single_image(self,image_url): + # function to download single image + def single_image(self, image_url): main_directory = "downloads" extensions = (".jpg", ".gif", ".png", ".bmp", ".svg", ".webp", ".ico") url = image_url @@ -391,14 +438,15 @@ def single_image(self,image_url): print("completed ====> " + image_name.encode('raw_unicode_escape').decode('utf-8')) return - def similar_images(self,similar_images): + def similar_images(self, similar_images): version = (3, 0) cur_version = sys.version_info if cur_version >= version: # If the Current Version of Python is 3.0 or above try: searchUrl = 'https://www.google.com/searchbyimage?site=search&sa=X&image_url=' + similar_images headers = {} - headers['User-Agent'] = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36" + headers[ + 'User-Agent'] = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36" req1 = urllib.request.Request(searchUrl, headers=headers) resp1 = urllib.request.urlopen(req1) @@ -420,7 +468,8 @@ def similar_images(self,similar_images): try: searchUrl = 'https://www.google.com/searchbyimage?site=search&sa=X&image_url=' + similar_images headers = {} - headers['User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17" + headers[ + 'User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17" req1 = urllib2.Request(searchUrl, headers=headers) resp1 = urllib2.urlopen(req1) @@ -435,16 +484,23 @@ def similar_images(self,similar_images): l3 = content.find('/search?sa=X&q=') l4 = content.find(';', l3 + 19) urll2 = content[l3 + 19:l4] - return(urll2) + return (urll2) except: return "Cloud not connect to 
Google Images endpoint" - #Building URL parameters - def build_url_parameters(self,arguments): + # Building URL parameters + def build_url_parameters(self, arguments): if arguments['language']: lang = "&lr=" - lang_param = {"Arabic":"lang_ar","Chinese (Simplified)":"lang_zh-CN","Chinese (Traditional)":"lang_zh-TW","Czech":"lang_cs","Danish":"lang_da","Dutch":"lang_nl","English":"lang_en","Estonian":"lang_et","Finnish":"lang_fi","French":"lang_fr","German":"lang_de","Greek":"lang_el","Hebrew":"lang_iw ","Hungarian":"lang_hu","Icelandic":"lang_is","Italian":"lang_it","Japanese":"lang_ja","Korean":"lang_ko","Latvian":"lang_lv","Lithuanian":"lang_lt","Norwegian":"lang_no","Portuguese":"lang_pt","Polish":"lang_pl","Romanian":"lang_ro","Russian":"lang_ru","Spanish":"lang_es","Swedish":"lang_sv","Turkish":"lang_tr"} - lang_url = lang+lang_param[arguments['language']] + lang_param = {"Arabic": "lang_ar", "Chinese (Simplified)": "lang_zh-CN", + "Chinese (Traditional)": "lang_zh-TW", "Czech": "lang_cs", "Danish": "lang_da", + "Dutch": "lang_nl", "English": "lang_en", "Estonian": "lang_et", "Finnish": "lang_fi", + "French": "lang_fr", "German": "lang_de", "Greek": "lang_el", "Hebrew": "lang_iw ", + "Hungarian": "lang_hu", "Icelandic": "lang_is", "Italian": "lang_it", "Japanese": "lang_ja", + "Korean": "lang_ko", "Latvian": "lang_lv", "Lithuanian": "lang_lt", "Norwegian": "lang_no", + "Portuguese": "lang_pt", "Polish": "lang_pl", "Romanian": "lang_ro", "Russian": "lang_ru", + "Spanish": "lang_es", "Swedish": "lang_sv", "Turkish": "lang_tr"} + lang_url = lang + lang_param[arguments['language']] else: lang_url = '' @@ -463,14 +519,34 @@ def build_url_parameters(self,arguments): built_url = "&tbs=" counter = 0 - params = {'color':[arguments['color'],{'red':'ic:specific,isc:red', 'orange':'ic:specific,isc:orange', 'yellow':'ic:specific,isc:yellow', 'green':'ic:specific,isc:green', 'teal':'ic:specific,isc:teel', 'blue':'ic:specific,isc:blue', 'purple':'ic:specific,isc:purple', 'pink':'ic:specific,isc:pink', 'white':'ic:specific,isc:white', 'gray':'ic:specific,isc:gray', 'black':'ic:specific,isc:black', 'brown':'ic:specific,isc:brown'}], - 'color_type':[arguments['color_type'],{'full-color':'ic:color', 'black-and-white':'ic:gray','transparent':'ic:trans'}], - 'usage_rights':[arguments['usage_rights'],{'labeled-for-reuse-with-modifications':'sur:fmc','labeled-for-reuse':'sur:fc','labeled-for-noncommercial-reuse-with-modification':'sur:fm','labeled-for-nocommercial-reuse':'sur:f'}], - 'size':[arguments['size'],{'large':'isz:l','medium':'isz:m','icon':'isz:i','>400*300':'isz:lt,islt:qsvga','>640*480':'isz:lt,islt:vga','>800*600':'isz:lt,islt:svga','>1024*768':'visz:lt,islt:xga','>2MP':'isz:lt,islt:2mp','>4MP':'isz:lt,islt:4mp','>6MP':'isz:lt,islt:6mp','>8MP':'isz:lt,islt:8mp','>10MP':'isz:lt,islt:10mp','>12MP':'isz:lt,islt:12mp','>15MP':'isz:lt,islt:15mp','>20MP':'isz:lt,islt:20mp','>40MP':'isz:lt,islt:40mp','>70MP':'isz:lt,islt:70mp'}], - 'type':[arguments['type'],{'face':'itp:face','photo':'itp:photo','clipart':'itp:clipart','line-drawing':'itp:lineart','animated':'itp:animated'}], - 'time':[arguments['time'],{'past-24-hours':'qdr:d','past-7-days':'qdr:w','past-month':'qdr:m','past-year':'qdr:y'}], - 'aspect_ratio':[arguments['aspect_ratio'],{'tall':'iar:t','square':'iar:s','wide':'iar:w','panoramic':'iar:xw'}], - 'format':[arguments['format'],{'jpg':'ift:jpg','gif':'ift:gif','png':'ift:png','bmp':'ift:bmp','svg':'ift:svg','webp':'webp','ico':'ift:ico','raw':'ift:craw'}]} + params = {'color': 
[arguments['color'], {'red': 'ic:specific,isc:red', 'orange': 'ic:specific,isc:orange', + 'yellow': 'ic:specific,isc:yellow', 'green': 'ic:specific,isc:green', + 'teal': 'ic:specific,isc:teel', 'blue': 'ic:specific,isc:blue', + 'purple': 'ic:specific,isc:purple', 'pink': 'ic:specific,isc:pink', + 'white': 'ic:specific,isc:white', 'gray': 'ic:specific,isc:gray', + 'black': 'ic:specific,isc:black', 'brown': 'ic:specific,isc:brown'}], + 'color_type': [arguments['color_type'], + {'full-color': 'ic:color', 'black-and-white': 'ic:gray', 'transparent': 'ic:trans'}], + 'usage_rights': [arguments['usage_rights'], + {'labeled-for-reuse-with-modifications': 'sur:fmc', 'labeled-for-reuse': 'sur:fc', + 'labeled-for-noncommercial-reuse-with-modification': 'sur:fm', + 'labeled-for-nocommercial-reuse': 'sur:f'}], + 'size': [arguments['size'], + {'large': 'isz:l', 'medium': 'isz:m', 'icon': 'isz:i', '>400*300': 'isz:lt,islt:qsvga', + '>640*480': 'isz:lt,islt:vga', '>800*600': 'isz:lt,islt:svga', + '>1024*768': 'visz:lt,islt:xga', '>2MP': 'isz:lt,islt:2mp', '>4MP': 'isz:lt,islt:4mp', + '>6MP': 'isz:lt,islt:6mp', '>8MP': 'isz:lt,islt:8mp', '>10MP': 'isz:lt,islt:10mp', + '>12MP': 'isz:lt,islt:12mp', '>15MP': 'isz:lt,islt:15mp', '>20MP': 'isz:lt,islt:20mp', + '>40MP': 'isz:lt,islt:40mp', '>70MP': 'isz:lt,islt:70mp'}], + 'type': [arguments['type'], {'face': 'itp:face', 'photo': 'itp:photo', 'clipart': 'itp:clipart', + 'line-drawing': 'itp:lineart', 'animated': 'itp:animated'}], + 'time': [arguments['time'], {'past-24-hours': 'qdr:d', 'past-7-days': 'qdr:w', 'past-month': 'qdr:m', + 'past-year': 'qdr:y'}], + 'aspect_ratio': [arguments['aspect_ratio'], + {'tall': 'iar:t', 'square': 'iar:s', 'wide': 'iar:w', 'panoramic': 'iar:xw'}], + 'format': [arguments['format'], + {'jpg': 'ift:jpg', 'gif': 'ift:gif', 'png': 'ift:png', 'bmp': 'ift:bmp', 'svg': 'ift:svg', + 'webp': 'webp', 'ico': 'ift:ico', 'raw': 'ift:craw'}]} for key, value in params.items(): if value[0] is not None: ext_param = value[1][value[0]] @@ -482,13 +558,12 @@ def build_url_parameters(self,arguments): else: built_url = built_url + ',' + ext_param counter += 1 - built_url = lang_url+built_url+exact_size+time_range + built_url = lang_url + built_url + exact_size + time_range return built_url - - #building main search URL - def build_search_url(self,search_term,params,url,similar_images,specific_site,safe_search): - #check safe_search + # building main search URL + def build_search_url(self, search_term, params, url, similar_images, specific_site, safe_search): + # check safe_search safe_search_string = "&safe=active" # check the args and choose the URL if url: @@ -499,20 +574,21 @@ def build_search_url(self,search_term,params,url,similar_images,specific_site,sa url = 'https://www.google.com/search?q=' + keywordem + '&espv=2&biw=1366&bih=667&site=webhp&source=lnms&tbm=isch&sa=X&ei=XosDVaCXD8TasATItgE&ved=0CAcQ_AUoAg' elif specific_site: url = 'https://www.google.com/search?q=' + quote( - search_term.encode('utf-8')) + '&as_sitesearch=' + specific_site + '&espv=2&biw=1366&bih=667&site=webhp&source=lnms&tbm=isch' + params + '&sa=X&ei=XosDVaCXD8TasATItgE&ved=0CAcQ_AUoAg' + search_term.encode( + 'utf-8')) + '&as_sitesearch=' + specific_site + '&espv=2&biw=1366&bih=667&site=webhp&source=lnms&tbm=isch' + params + '&sa=X&ei=XosDVaCXD8TasATItgE&ved=0CAcQ_AUoAg' else: url = 'https://www.google.com/search?q=' + quote( - search_term.encode('utf-8')) + '&espv=2&biw=1366&bih=667&site=webhp&source=lnms&tbm=isch' + params + 
'&sa=X&ei=XosDVaCXD8TasATItgE&ved=0CAcQ_AUoAg' + search_term.encode( + 'utf-8')) + '&espv=2&biw=1366&bih=667&site=webhp&source=lnms&tbm=isch' + params + '&sa=X&ei=XosDVaCXD8TasATItgE&ved=0CAcQ_AUoAg' - #safe search check + # safe search check if safe_search: url = url + safe_search_string return url - - #measures the file size - def file_size(self,file_path): + # measures the file size + def file_size(self, file_path): if os.path.isfile(file_path): file_info = os.stat(file_path) size = file_info.st_size @@ -522,8 +598,8 @@ def file_size(self,file_path): size /= 1024.0 return size - #keywords from file - def keywords_from_file(self,file_name): + # keywords from file + def keywords_from_file(self, file_name): search_keyword = [] with codecs.open(file_name, 'r', encoding='utf-8-sig') as f: if '.csv' in file_name: @@ -545,7 +621,7 @@ def keywords_from_file(self,file_name): return search_keyword # make directories - def create_directories(self,main_directory, dir_name,thumbnail,thumbnail_only): + def create_directories(self, main_directory, dir_name, thumbnail, thumbnail_only): dir_name_thumbnail = dir_name + " - thumbnail" # make a search keyword directory try: @@ -575,13 +651,13 @@ def create_directories(self,main_directory, dir_name,thumbnail,thumbnail_only): pass return - # Download Image thumbnails - def download_image_thumbnail(self,image_url,main_directory,dir_name,return_image_name,print_urls,socket_timeout,print_size,no_download,save_source,img_src,ignore_urls): + def download_image_thumbnail(self, image_url, main_directory, dir_name, return_image_name, print_urls, + socket_timeout, print_size, no_download, save_source, img_src, ignore_urls): if print_urls or no_download: print("Image URL: " + image_url) if no_download: - return "success","Printed url without downloading" + return "success", "Printed url without downloading" try: req = Request(image_url, headers={ "User-Agent": "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"}) @@ -604,7 +680,7 @@ def download_image_thumbnail(self,image_url,main_directory,dir_name,return_image output_file.close() if save_source: list_path = main_directory + "/" + save_source + ".txt" - list_file = open(list_path,'a') + list_file = open(list_path, 'a') list_file.write(path + '\t' + img_src + '\n') list_file.close() except OSError as e: @@ -642,9 +718,10 @@ def download_image_thumbnail(self,image_url,main_directory,dir_name,return_image download_message = "IOError on an image...trying next one..." 
+ " Error: " + str(e) return download_status, download_message - # Download Images - def download_image(self,image_url,image_format,main_directory,dir_name,count,print_urls,socket_timeout,prefix,print_size,no_numbering,no_download,save_source,img_src,silent_mode,thumbnail_only,format,ignore_urls): + def download_image(self, image_url, image_format, main_directory, dir_name, count, print_urls, socket_timeout, + prefix, print_size, no_numbering, no_download, save_source, img_src, silent_mode, thumbnail_only, + format, ignore_urls): if not silent_mode: if print_urls or no_download: print("Image URL: " + image_url) @@ -654,7 +731,7 @@ def download_image(self,image_url,image_format,main_directory,dir_name,count,pri if thumbnail_only: return "success", "Skipping image download...", str(image_url[(image_url.rfind('/')) + 1:]), image_url if no_download: - return "success","Printed url without downloading",None,image_url + return "success", "Printed url without downloading", None, image_url try: req = Request(image_url, headers={ "User-Agent": "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"}) @@ -722,7 +799,7 @@ def download_image(self,image_url,image_format,main_directory,dir_name,count,pri output_file.close() if save_source: list_path = main_directory + "/" + save_source + ".txt" - list_file = open(list_path,'a') + list_file = open(list_path, 'a') list_file.write(path + '\t' + img_src + '\n') list_file.close() absolute_path = os.path.abspath(path) @@ -732,7 +809,7 @@ def download_image(self,image_url,image_format,main_directory,dir_name,count,pri return_image_name = '' absolute_path = '' - #return image name back to calling method to use it for thumbnail downloads + # return image name back to calling method to use it for thumbnail downloads download_status = 'success' download_message = "Completed Image ====> " + prefix + str(count) + "." + image_name return_image_name = prefix + str(count) + "." + image_name @@ -753,7 +830,7 @@ def download_image(self,image_url,image_format,main_directory,dir_name,count,pri download_message = "URLError on an image...trying next one..." + " Error: " + str(e) return_image_name = '' absolute_path = '' - + except BadStatusLine as e: download_status = 'fail' download_message = "BadStatusLine on an image...trying next one..." + " Error: " + str(e) @@ -790,63 +867,43 @@ def download_image(self,image_url,image_format,main_directory,dir_name,count,pri return_image_name = '' absolute_path = '' - return download_status,download_message,return_image_name,absolute_path - - - # Finding 'Next Image' from the given raw page - def _get_next_item(self,s): - start_line = s.find('rg_meta notranslate') - if start_line == -1: # If no links are found then give an error! 
- end_quote = 0 - link = "no_links" - return link, end_quote - else: - start_line = s.find('class="rg_meta notranslate">') - start_object = s.find('{', start_line + 1) - end_object = s.find('', start_object + 1) - object_raw = str(s[start_object:end_object]) - #remove escape characters based on python version - version = (3, 0) - cur_version = sys.version_info - if cur_version >= version: #python3 - try: - object_decode = bytes(object_raw, "utf-8").decode("unicode_escape") - final_object = json.loads(object_decode) - except: - final_object = "" - else: #python2 - try: - final_object = (json.loads(self.repair(object_raw))) - except: - final_object = "" - return final_object, end_object + return download_status, download_message, return_image_name, absolute_path - def _get_all_items(self,image_objects,main_directory,dir_name,limit,arguments): + def _get_all_items(self, image_objects, main_directory, dir_name, limit, arguments): items = [] abs_path = [] errorCount = 0 i = 0 count = 1 - while count < limit+1 and i" + " Item name = " + (pky) + (search_keyword[i]) + (sky) + while i < len(search_keyword): # 3.for every main keyword + iteration = "\n" + "Item no.: " + str(i + 1) + " -->" + " Item name = " + (pky) + ( + search_keyword[i]) + (sky) if not arguments["silent_mode"]: print(iteration.encode('raw_unicode_escape').decode('utf-8')) print("Evaluating...") @@ -1004,40 +1063,45 @@ def download_executor(self,arguments): elif arguments['no_directory']: dir_name = '' else: - dir_name = search_term + ('-' + arguments['color'] if arguments['color'] else '') #sub-directory + dir_name = search_term + ( + '-' + arguments['color'] if arguments['color'] else '') # sub-directory if not arguments["no_download"]: - self.create_directories(main_directory,dir_name,arguments['thumbnail'],arguments['thumbnail_only']) #create directories in OS + self.create_directories(main_directory, dir_name, arguments['thumbnail'], + arguments['thumbnail_only']) # create directories in OS - params = self.build_url_parameters(arguments) #building URL with params + params = self.build_url_parameters(arguments) # building URL with params - url = self.build_search_url(search_term,params,arguments['url'],arguments['similar_images'],arguments['specific_site'],arguments['safe_search']) #building main search url + url = self.build_search_url(search_term, params, arguments['url'], arguments['similar_images'], + arguments['specific_site'], + arguments['safe_search']) # building main search url if limit < 101: images, tabs = self.download_page(url) # download page else: - images, tabs = self.download_extended_page(url,arguments['chromedriver']) + images, tabs = self.download_extended_page(url, arguments['chromedriver']) if not arguments["silent_mode"]: if arguments['no_download']: print("Getting URLs without downloading images...") else: print("Starting Download...") - items,errorCount,abs_path = self._get_all_items(images,main_directory,dir_name,limit,arguments) #get all image items and download images + items, errorCount, abs_path = self._get_all_items(images, main_directory, dir_name, limit, + arguments) # get all image items and download images paths[pky + search_keyword[i] + sky] = abs_path - #dumps into a json file + # dumps into a json file if arguments['extract_metadata']: try: if not os.path.exists("logs"): os.makedirs("logs") except OSError as e: print(e) - json_file = open("logs/"+search_keyword[i]+".json", "w") + json_file = open("logs/" + search_keyword[i] + ".json", "w") json.dump(items, json_file, indent=4, 
sort_keys=True)
 json_file.close()
- #Related images
+ # Related images
 if arguments['related_images']:
 print("\nGetting list of related keywords...this may take a few moments")
 for key, value in tabs.items():
@@ -1046,9 +1110,10 @@ def download_executor(self,arguments):
 if limit < 101:
 images, _ = self.download_page(value) # download page
 else:
- images, _ = self.download_extended_page(value,arguments['chromedriver'])
- self.create_directories(main_directory, final_search_term,arguments['thumbnail'],arguments['thumbnail_only'])
- self._get_all_items(images, main_directory, search_term + " - " + key, limit,arguments)
+ images, _ = self.download_extended_page(value, arguments['chromedriver'])
+ self.create_directories(main_directory, final_search_term, arguments['thumbnail'],
+ arguments['thumbnail_only'])
+ self._get_all_items(images, main_directory, search_term + " - " + key, limit, arguments)
 i += 1
 total_errors = total_errors + errorCount
@@ -1056,7 +1121,8 @@
 print("\nErrors: " + str(errorCount) + "\n")
 return paths, total_errors
-#------------- Main Program -------------#
+
+# ------------- Main Program -------------#
 def main():
 records = user_input()
 total_errors = 0
@@ -1068,7 +1134,7 @@ def main():
 response.single_image(arguments['single_image'])
 else: # or download multiple images based on keywords/keyphrase search
 response = googleimagesdownload()
- paths,errors = response.download(arguments) #wrapping response in a variable just for consistency
+ paths, errors = response.download(arguments) # wrapping response in a variable just for consistency
 total_errors = total_errors + errors
 t1 = time.time() # stop the timer
@@ -1078,5 +1144,6 @@
 print("Total errors: " + str(total_errors))
 print("Total time taken: " + str(total_time) + " Seconds")
+
 if __name__ == "__main__":
 main()

From bcb2af34a96ab3fc249ac605f2b951227c4aef6a Mon Sep 17 00:00:00 2001
From: Joe Clinton <48254978+Joeclinton1@users.noreply.github.com>
Date: Sun, 6 Sep 2020 15:39:20 +0100
Subject: [PATCH 12/31] Fixed end_object find code

Previously the end_object for the data pack was found by searching for
'</script>' and then going 4 characters back. However, Google in a recent
update added ', sideChannel: {}});' to the end of the data pack, which
throws that fixed offset off. To fix this, the end_object-finding code now
searches for '</script>' first and then for the first ']' to the left of
that closing script tag. This should be more flexible.
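
For illustration, the new lookup behaves roughly like this (a minimal
sketch; the sample page string below is invented for the example, and the
real method also anchors the search on the "AF_initDataCallback({key: ..."
marker first):

    # The data pack now ends with ", sideChannel: {}});" before the closing
    # script tag, so anchor on the last ']' before '</script>' instead of
    # stepping back a fixed number of characters.
    sample_page = "AF_initDataCallback({key: 'ds:1', data: [[1, 2], [3]], sideChannel: {}});</script>"
    start_object = sample_page.find('[')
    end_object = sample_page.rfind(']', 0, sample_page.find('</script>', start_object + 1)) + 1
    print(sample_page[start_object:end_object])  # prints: [[1, 2], [3]]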
---
 google_images_download/google_images_download.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/google_images_download/google_images_download.py b/google_images_download/google_images_download.py
index 4a6cf9eb..2f2dfa01 100755
--- a/google_images_download/google_images_download.py
+++ b/google_images_download/google_images_download.py
@@ -181,14 +181,14 @@ def __init__(self):
 def _extract_data_pack(self, page):
 start_line = page.find("AF_initDataCallback({key: \\'ds:1\\'") - 10
 start_object = page.find('[', start_line + 1)
- end_object = page.find('</script>', start_object + 1) - 5
+ end_object = page.rfind(']',0,page.find('</script>', start_object + 1)) + 1
 object_raw = str(page[start_object:end_object])
 return bytes(object_raw, "utf-8").decode("unicode_escape")

 def _extract_data_pack_extended(self, page):
 start_line = page.find("AF_initDataCallback({key: 'ds:1'") - 10
 start_object = page.find('[', start_line + 1)
- end_object = page.find('</script>', start_object + 1) - 4
+ end_object = page.rfind(']',0,page.find('</script>', start_object + 1)) + 1
 return str(page[start_object:end_object])

 def _extract_data_pack_ajax(self, data):
@@ -196,6 +196,7 @@ def _extract_data_pack_ajax(self, data):
 return json.loads(lines[3] + lines[4])[0][2]

 def _image_objects_from_pack(self, data):
+ print(data)
 image_objects = json.loads(data)[31][0][12][2]
 image_objects = [x for x in image_objects if x[0] == 1]
 return image_objects
@@ -214,6 +215,7 @@ def download_page(self, url):
 respData = str(resp.read())
 return self._image_objects_from_pack(self._extract_data_pack(respData)), self.get_all_tabs(respData)
 except Exception as e:
+ print(e)
 print("Could not open URL. Please check your internet connection and/or ssl settings \n"
 "If you are using proxy, make sure your proxy settings is configured correctly")
 sys.exit()
@@ -264,16 +266,13 @@ def download_extended_page(self, url, chromedriver):
 var open = XHR.prototype.open;
 var send = XHR.prototype.send;
 var data = [];
-
 XHR.prototype.open = function(method, url, async, user, pass) {
 this._url = url;
 open.call(this, method, url, async, user, pass);
 }
-
 XHR.prototype.send = function(data) {
 var self = this;
 var url = this._url;
-
 function stateChanged() {
 if (self.readyState == 4) {
 console.log("data available for: " + url)
@@ -285,7 +284,6 @@ def download_extended_page(self, url, chromedriver):
 }
 send.call(this, data);
 };
-
 XHR.prototype._data = [];
 })(XMLHttpRequest);
 """)

From 58a190b584bd690c6eaab745d519179a11c7c484 Mon Sep 17 00:00:00 2001
From: Joe Clinton <48254978+Joeclinton1@users.noreply.github.com>
Date: Sun, 6 Sep 2020 16:05:35 +0100
Subject: [PATCH 13/31] Improved exception handling

Previously, if the data unpacking failed, the user was told that the URL
could not be opened, which is the wrong exception. I fixed this by
splitting the data unpacking and the URL opening into separate parts so
that each can have its own exception. This should make it easier to
identify what has gone wrong.
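
In outline, download_page now separates the two failure modes like this
(a simplified sketch, not the exact method body; the unpack and get_tabs
parameters stand in for the real _image_objects_from_pack /
_extract_data_pack and get_all_tabs calls):

    import sys
    from urllib.request import Request, urlopen

    def download_page_sketch(url, headers, unpack, get_tabs):
        # Failure mode 1: the request itself (connection, SSL, proxy).
        try:
            resp_data = str(urlopen(Request(url, headers=headers)).read())
        except Exception:
            print("Could not open URL. Please check your internet connection and/or ssl settings")
            sys.exit()
        # Failure mode 2: the page opened fine, but the data-pack layout
        # changed, so unpacking the image objects fails.
        try:
            return unpack(resp_data), get_tabs(resp_data)
        except Exception as e:
            print(e)
            print("Image objects data unpacking failed.")
            sys.exit()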
---
 google_images_download/google_images_download.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/google_images_download/google_images_download.py b/google_images_download/google_images_download.py
index 2f2dfa01..ab00f783 100755
--- a/google_images_download/google_images_download.py
+++ b/google_images_download/google_images_download.py
@@ -181,7 +181,7 @@ def __init__(self):
 def _extract_data_pack(self, page):
 start_line = page.find("AF_initDataCallback({key: \\'ds:1\\'") - 10
 start_object = page.find('[', start_line + 1)
- end_object = page.rfind(']',0,page.find('</script>', start_object + 1)) + 1
+ end_object = page.rfind(']',0,page.find('</script>', start_object + 1))+1
 object_raw = str(page[start_object:end_object])
 return bytes(object_raw, "utf-8").decode("unicode_escape")
@@ -196,7 +196,6 @@ def _extract_data_pack_ajax(self, data):
 return json.loads(lines[3] + lines[4])[0][2]

 def _image_objects_from_pack(self, data):
- print(data)
 image_objects = json.loads(data)[31][0][12][2]
 image_objects = [x for x in image_objects if x[0] == 1]
 return image_objects
@@ -213,9 +212,7 @@ def download_page(self, url):
 req = urllib.request.Request(url, headers=headers)
 resp = urllib.request.urlopen(req)
 respData = str(resp.read())
- return self._image_objects_from_pack(self._extract_data_pack(respData)), self.get_all_tabs(respData)
- except Exception as e:
- print(e)
+ except:
 print("Could not open URL. Please check your internet connection and/or ssl settings \n"
 "If you are using proxy, make sure your proxy settings is configured correctly")
 sys.exit()
@@ -230,13 +227,18 @@ def download_page(self, url):
 except URLError: # Handling SSL certificate failed
 context = ssl._create_unverified_context()
 response = urlopen(req, context=context)
- page = response.read()
- return self._image_objects_from_pack(self._extract_data_pack(page)), self.get_all_tabs(page)
+ respData = response.read()
 except:
 print("Could not open URL. Please check your internet connection and/or ssl settings \n"
 "If you are using proxy, make sure your proxy settings is configured correctly")
 sys.exit()
 return "Page Not found"
+ try:
+ return self._image_objects_from_pack(self._extract_data_pack(respData)), self.get_all_tabs(respData)
+ except Exception as e:
+ print(e)
+ print('Image objects data unpacking failed. Please leave a comment with the above error at https://github.com/hardikvasa/google-images-download/pull/298')
+ sys.exit()

 # Download Page for more than 100 images
 def download_extended_page(self, url, chromedriver):

From aa817df6701b4c9fd64cea1f517d863a9a558693 Mon Sep 17 00:00:00 2001
From: Joe Clinton
Date: Sun, 31 Jan 2021 16:29:03 +0000
Subject: [PATCH 14/31] Updated user agent to use newer browser.

---
 README.rst | 4 ++--
 docs/arguments.rst | 2 +-
 docs/index.rst | 4 ++--
 google_images_download/google_images_download.py | 9 +++------
 4 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/README.rst b/README.rst
index b1e08f65..14d86228 100644
--- a/README.rst
+++ b/README.rst
@@ -23,7 +23,7 @@ The original creators of the images own the copyrights.
 Images published in the United States are automatically copyrighted by their owners,
 even if they do not explicitly carry a copyright warning.
-You may not reproduce copyright images without their owner's permission,
+You may not reproduce copyright images without their owner'self permission,
 except in "fair use" cases,
-or you could risk running into lawyer's warnings, cease-and-desist letters, and copyright suits.
+or you could risk running into lawyer'self warnings, cease-and-desist letters, and copyright suits. Please be very careful before its usage! Use this script/code only for educational purposes. diff --git a/docs/arguments.rst b/docs/arguments.rst index e18035c9..ead86ab1 100644 --- a/docs/arguments.rst +++ b/docs/arguments.rst @@ -85,7 +85,7 @@ Link to `Documentation Homepage 400*300, >640*480, >800*600, >1024*768, >2MP, >4MP, >6MP, >8MP, >10MP, | | | | >12MP, >15MP, >20MP, >40MP, >70MP` | diff --git a/docs/index.rst b/docs/index.rst index 3e276d00..001fb252 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -127,7 +127,7 @@ Disclaimer Images published in the United States are automatically copyrighted by their owners, even if they do not explicitly carry a copyright warning. - You may not reproduce copyright images without their owner's permission, + You may not reproduce copyright images without their owner'self permission, except in "fair use" cases, - or you could risk running into lawyer's warnings, cease-and-desist letters, and copyright suits. + or you could risk running into lawyer'self warnings, cease-and-desist letters, and copyright suits. Please be very careful before its usage! diff --git a/google_images_download/google_images_download.py b/google_images_download/google_images_download.py index ab00f783..49dcee77 100755 --- a/google_images_download/google_images_download.py +++ b/google_images_download/google_images_download.py @@ -204,11 +204,11 @@ def _image_objects_from_pack(self, data): def download_page(self, url): version = (3, 0) cur_version = sys.version_info + headers = {} + headers[ + 'User-Agent'] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36" if cur_version >= version: # If the Current Version of Python is 3.0 or above try: - headers = {} - headers[ - 'User-Agent'] = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36" req = urllib.request.Request(url, headers=headers) resp = urllib.request.urlopen(req) respData = str(resp.read()) @@ -218,9 +218,6 @@ def download_page(self, url): sys.exit() else: # If the Current Version of Python is 2.x try: - headers = {} - headers[ - 'User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17" req = urllib2.Request(url, headers=headers) try: response = urllib2.urlopen(req) From 2a310f1b3386d63dfd0661e45c513ac2e4e38f2b Mon Sep 17 00:00:00 2001 From: explosion1206 <40578236+estuhr1206@users.noreply.github.com> Date: Tue, 25 May 2021 02:26:21 -0400 Subject: [PATCH 15/31] Add files via upload --- google_images_download.py | 1152 +++++++++++++++++++++++++++++++++++++ 1 file changed, 1152 insertions(+) create mode 100644 google_images_download.py diff --git a/google_images_download.py b/google_images_download.py new file mode 100644 index 00000000..51bb251b --- /dev/null +++ b/google_images_download.py @@ -0,0 +1,1152 @@ +#!/usr/bin/env python +# In[ ]: +# coding: utf-8 + +###### Searching and Downloading Google Images to the local disk ###### + +# Import Libraries +import sys + +version = (3, 0) +cur_version = sys.version_info +if cur_version >= version: # If the Current Version of Python is 3.0 or above + import urllib.request + from urllib.request import Request, urlopen + from urllib.request import URLError, HTTPError + from urllib.parse import quote + import http.client + from http.client import IncompleteRead, BadStatusLine + + 
http.client._MAXHEADERS = 1000 +else: # If the Current Version of Python is 2.x + import urllib2 + from urllib2 import Request, urlopen + from urllib2 import URLError, HTTPError + from urllib import quote + import httplib + from httplib import IncompleteRead, BadStatusLine + + httplib._MAXHEADERS = 1000 +import time # Importing the time library to check the time of code execution +import os +import argparse +import ssl +import datetime +import json +import re +import codecs +import socket + +args_list = ["keywords", "keywords_from_file", "prefix_keywords", "suffix_keywords", + "limit", "format", "color", "color_type", "usage_rights", "size", + "exact_size", "aspect_ratio", "type", "time", "time_range", "delay", "url", "single_image", + "output_directory", "image_directory", "no_directory", "proxy", "similar_images", "specific_site", + "print_urls", "print_size", "print_paths", "metadata", "extract_metadata", "socket_timeout", + "thumbnail", "thumbnail_only", "language", "prefix", "chromedriver", "related_images", "safe_search", + "no_numbering", + "offset", "no_download", "save_source", "silent_mode", "ignore_urls"] + + +def user_input(): + config = argparse.ArgumentParser() + config.add_argument('-cf', '--config_file', help='config file name', default='', type=str, required=False) + config_file_check = config.parse_known_args() + object_check = vars(config_file_check[0]) + + if object_check['config_file'] != '': + records = [] + json_file = json.load(open(config_file_check[0].config_file)) + for record in range(0, len(json_file['Records'])): + arguments = {} + for i in args_list: + arguments[i] = None + for key, value in json_file['Records'][record].items(): + arguments[key] = value + records.append(arguments) + records_count = len(records) + else: + # Taking command line arguments from users + parser = argparse.ArgumentParser() + parser.add_argument('-k', '--keywords', help='delimited list input', type=str, required=False) + parser.add_argument('-kf', '--keywords_from_file', help='extract list of keywords from a text file', type=str, + required=False) + parser.add_argument('-sk', '--suffix_keywords', + help='comma separated additional words added after to main keyword', type=str, + required=False) + parser.add_argument('-pk', '--prefix_keywords', + help='comma separated additional words added before main keyword', type=str, required=False) + parser.add_argument('-l', '--limit', help='delimited list input', type=str, required=False) + parser.add_argument('-f', '--format', help='download images with specific format', type=str, required=False, + choices=['jpg', 'gif', 'png', 'bmp', 'svg', 'webp', 'ico']) + parser.add_argument('-u', '--url', help='search with google image URL', type=str, required=False) + parser.add_argument('-x', '--single_image', help='downloading a single image from URL', type=str, + required=False) + parser.add_argument('-o', '--output_directory', help='download images in a specific main directory', type=str, + required=False) + parser.add_argument('-i', '--image_directory', help='download images in a specific sub-directory', type=str, + required=False) + parser.add_argument('-n', '--no_directory', default=False, + help='download images in the main directory but no sub-directory', action="store_true") + parser.add_argument('-d', '--delay', help='delay in seconds to wait between downloading two images', type=int, + required=False) + parser.add_argument('-co', '--color', help='filter on color', type=str, required=False, + choices=['red', 'orange', 'yellow', 'green', 
'teal', 'blue', 'purple', 'pink', 'white', + 'gray', 'black', 'brown']) + parser.add_argument('-ct', '--color_type', help='filter on color', type=str, required=False, + choices=['full-color', 'black-and-white', 'transparent']) + parser.add_argument('-r', '--usage_rights', help='usage rights', type=str, required=False, + choices=['labeled-for-reuse-with-modifications', 'labeled-for-reuse', + 'labeled-for-noncommercial-reuse-with-modification', + 'labeled-for-nocommercial-reuse']) + parser.add_argument('-s', '--size', help='image size', type=str, required=False, + choices=['large', 'medium', 'icon', '>400*300', '>640*480', '>800*600', '>1024*768', '>2MP', + '>4MP', '>6MP', '>8MP', '>10MP', '>12MP', '>15MP', '>20MP', '>40MP', '>70MP']) + parser.add_argument('-es', '--exact_size', help='exact image resolution "WIDTH,HEIGHT"', type=str, + required=False) + parser.add_argument('-t', '--type', help='image type', type=str, required=False, + choices=['face', 'photo', 'clipart', 'line-drawing', 'animated']) + parser.add_argument('-w', '--time', help='image age', type=str, required=False, + choices=['past-24-hours', 'past-7-days', 'past-month', 'past-year']) + parser.add_argument('-wr', '--time_range', + help='time range for the age of the image. should be in the format {"time_min":"MM/DD/YYYY","time_max":"MM/DD/YYYY"}', + type=str, required=False) + parser.add_argument('-a', '--aspect_ratio', help='comma separated additional words added to keywords', type=str, + required=False, + choices=['tall', 'square', 'wide', 'panoramic']) + parser.add_argument('-si', '--similar_images', + help='downloads images very similar to the image URL you provide', type=str, required=False) + parser.add_argument('-ss', '--specific_site', help='downloads images that are indexed from a specific website', + type=str, required=False) + parser.add_argument('-p', '--print_urls', default=False, help="Print the URLs of the images", + action="store_true") + parser.add_argument('-ps', '--print_size', default=False, help="Print the size of the images on disk", + action="store_true") + parser.add_argument('-pp', '--print_paths', default=False, + help="Prints the list of absolute paths of the images", action="store_true") + parser.add_argument('-m', '--metadata', default=False, help="Print the metadata of the image", + action="store_true") + parser.add_argument('-e', '--extract_metadata', default=False, help="Dumps all the logs into a text file", + action="store_true") + parser.add_argument('-st', '--socket_timeout', default=False, + help="Connection timeout waiting for the image to download", type=float) + parser.add_argument('-th', '--thumbnail', default=False, + help="Downloads image thumbnail along with the actual image", action="store_true") + parser.add_argument('-tho', '--thumbnail_only', default=False, + help="Downloads only thumbnail without downloading actual images", action="store_true") + parser.add_argument('-la', '--language', default=False, + help="Defines the language filter. 
The search results are authomatically returned in that language", + type=str, required=False, + choices=['Arabic', 'Chinese (Simplified)', 'Chinese (Traditional)', 'Czech', 'Danish', + 'Dutch', 'English', 'Estonian', 'Finnish', 'French', 'German', 'Greek', 'Hebrew', + 'Hungarian', 'Icelandic', 'Italian', 'Japanese', 'Korean', 'Latvian', 'Lithuanian', + 'Norwegian', 'Portuguese', 'Polish', 'Romanian', 'Russian', 'Spanish', 'Swedish', + 'Turkish']) + parser.add_argument('-pr', '--prefix', default=False, + help="A word that you would want to prefix in front of each image name", type=str, + required=False) + parser.add_argument('-px', '--proxy', help='specify a proxy address and port', type=str, required=False) + parser.add_argument('-cd', '--chromedriver', + help='specify the path to chromedriver executable in your local machine', type=str, + required=False) + parser.add_argument('-ri', '--related_images', default=False, + help="Downloads images that are similar to the keyword provided", action="store_true") + parser.add_argument('-sa', '--safe_search', default=False, + help="Turns on the safe search filter while searching for images", action="store_true") + parser.add_argument('-nn', '--no_numbering', default=False, + help="Allows you to exclude the default numbering of images", action="store_true") + parser.add_argument('-of', '--offset', help="Where to start in the fetched links", type=str, required=False) + parser.add_argument('-nd', '--no_download', default=False, + help="Prints the URLs of the images and/or thumbnails without downloading them", + action="store_true") + parser.add_argument('-iu', '--ignore_urls', default=False, + help="delimited list input of image urls/keywords to ignore", type=str) + parser.add_argument('-sil', '--silent_mode', default=False, + help="Remains silent. 
Does not print notification messages on the terminal", + action="store_true") + parser.add_argument('-is', '--save_source', + help="creates a text file containing a list of downloaded images along with source page url", + type=str, required=False) + + args = parser.parse_args() + arguments = vars(args) + records = [] + records.append(arguments) + return records + + +class googleimagesdownload: + def __init__(self): + pass + + def _extract_data_pack(self, page): + start_line = page.find("AF_initDataCallback({key: \\'ds:1\\'") - 10 + start_object = page.find('[', start_line + 1) + end_object = page.rfind(']',0,page.find('', start_object + 1))+1 + object_raw = str(page[start_object:end_object]) + return bytes(object_raw, "utf-8").decode("unicode_escape") + + def _extract_data_pack_extended(self, page): + start_line = page.find("AF_initDataCallback({key: 'ds:1'") - 10 + start_object = page.find('[', start_line + 1) + end_object = page.rfind(']',0,page.find('', start_object + 1)) + 1 + return str(page[start_object:end_object]) + + def _extract_data_pack_ajax(self, data): + lines = data.split('\n') + return json.loads(lines[3] + lines[4])[0][2] + + def _image_objects_from_pack(self, data): + image_objects = json.loads(data)[31][0][12][2] + image_objects = [x for x in image_objects if x[0] == 1] + return image_objects + + # Downloading entire Web Document (Raw Page Content) + def download_page(self, url): + version = (3, 0) + cur_version = sys.version_info + headers = {} + headers[ + 'User-Agent'] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36" + if cur_version >= version: # If the Current Version of Python is 3.0 or above + try: + req = urllib.request.Request(url, headers=headers) + resp = urllib.request.urlopen(req) + respData = str(resp.read()) + except: + print("Could not open URL. Please check your internet connection and/or ssl settings \n" + "If you are using proxy, make sure your proxy settings is configured correctly") + sys.exit() + else: # If the Current Version of Python is 2.x + try: + req = urllib2.Request(url, headers=headers) + try: + response = urllib2.urlopen(req) + except URLError: # Handling SSL certificate failed + context = ssl._create_unverified_context() + response = urlopen(req, context=context) + respData = response.read() + except: + print("Could not open URL. Please check your internet connection and/or ssl settings \n" + "If you are using proxy, make sure your proxy settings is configured correctly") + sys.exit() + return "Page Not found" + try: + return self._image_objects_from_pack(self._extract_data_pack(respData)), self.get_all_tabs(respData) + except Exception as e: + print(e) + print('Image objects data unpacking failed. Please leave a comment with the above error at https://github.com/hardikvasa/google-images-download/pull/298') + sys.exit() + + # Download Page for more than 100 images + def download_extended_page(self, url, chromedriver): + from selenium import webdriver + from selenium.webdriver.common.keys import Keys + if sys.version_info[0] < 3: + reload(sys) + sys.setdefaultencoding('utf8') + options = webdriver.ChromeOptions() + options.add_argument('--no-sandbox') + options.add_argument("--headless") + + try: + browser = webdriver.Chrome(chromedriver, chrome_options=options) + except Exception as e: + print("Looks like we cannot locate the path the 'chromedriver' (use the '--chromedriver' " + "argument to specify the path to the executable.) 
or google chrome browser is not " + "installed on your machine (exception: %s)" % e) + sys.exit() + browser.set_window_size(1024, 768) + + # Open the link + browser.get(url) + browser.execute_script(""" + (function(XHR){ + "use strict"; + var open = XHR.prototype.open; + var send = XHR.prototype.send; + var data = []; + XHR.prototype.open = function(method, url, async, user, pass) { + this._url = url; + open.call(this, method, url, async, user, pass); + } + XHR.prototype.send = function(data) { + var self = this; + var url = this._url; + function stateChanged() { + if (self.readyState == 4) { + console.log("data available for: " + url) + XHR.prototype._data.push(self.response); + } + } + if (url.includes("/batchexecute?")) { + this.addEventListener("readystatechange", stateChanged, false); + } + send.call(this, data); + }; + XHR.prototype._data = []; + })(XMLHttpRequest); + """) + + time.sleep(1) + print("Getting you a lot of images. This may take a few moments...") + + element = browser.find_element_by_tag_name("body") + # Scroll down + for i in range(30): + element.send_keys(Keys.PAGE_DOWN) + time.sleep(0.3) + + try: + browser.find_element_by_id("smb").click() + for i in range(50): + element.send_keys(Keys.PAGE_DOWN) + time.sleep(0.3) # bot id protection + except: + for i in range(10): + element.send_keys(Keys.PAGE_DOWN) + time.sleep(0.3) # bot id protection + + print("Reached end of Page.") + time.sleep(0.5) + + source = browser.page_source # page source + images = self._image_objects_from_pack(self._extract_data_pack_extended(source)) + + ajax_data = browser.execute_script("return XMLHttpRequest.prototype._data") + for chunk in ajax_data: + images += self._image_objects_from_pack(self._extract_data_pack_ajax(chunk)) + + # close the browser + browser.close() + + return images, self.get_all_tabs(source) + + # Correcting the escape characters for python2 + def replace_with_byte(self, match): + return chr(int(match.group(0)[1:], 8)) + + def repair(self, brokenjson): + invalid_escape = re.compile(r'\\[0-7]{1,3}') # up to 3 digits for byte values up to FF + return invalid_escape.sub(self.replace_with_byte, brokenjson) + + # Finding 'Next Image' from the given raw page + def get_next_tab(self, s): + start_line = s.find('class="dtviD"') + if start_line == -1: # If no links are found then give an error! 
+ end_quote = 0 + link = "no_tabs" + return link, '', end_quote + else: + start_line = s.find('class="dtviD"') + start_content = s.find('href="', start_line + 1) + end_content = s.find('">', start_content + 1) + url_item = "https://www.google.com" + str(s[start_content + 6:end_content]) + url_item = url_item.replace('&', '&') + + start_line_2 = s.find('class="dtviD"') + s = s.replace('&', '&') + start_content_2 = s.find(':', start_line_2 + 1) + end_content_2 = s.find('&usg=', start_content_2 + 1) + url_item_name = str(s[start_content_2 + 1:end_content_2]) + + chars = url_item_name.find(',g_1:') + chars_end = url_item_name.find(":", chars + 6) + if chars_end == -1: + updated_item_name = (url_item_name[chars + 5:]).replace("+", " ") + else: + updated_item_name = (url_item_name[chars + 5:chars_end]).replace("+", " ") + + return url_item, updated_item_name, end_content + + # Getting all links with the help of '_images_get_next_image' + def get_all_tabs(self, page): + tabs = {} + while True: + item, item_name, end_content = self.get_next_tab(page) + if item == "no_tabs": + break + else: + if len(item_name) > 100 or item_name == "background-color": + break + else: + tabs[item_name] = item # Append all the links in the list named 'Links' + time.sleep(0.1) # Timer could be used to slow down the request for image downloads + page = page[end_content:] + return tabs + + # Format the object in readable format + def format_object(self, object): + data = object[1] + main = data[3] + info = data[9] + if info is None: + info = data[11] + formatted_object = {} + try: + formatted_object['image_height'] = main[2] + formatted_object['image_width'] = main[1] + formatted_object['image_link'] = main[0] + formatted_object['image_format'] = main[0][-1 * (len(main[0]) - main[0].rfind(".") - 1):] + formatted_object['image_description'] = info['2003'][3] + formatted_object['image_host'] = info['183836587'][0] + formatted_object['image_source'] = info['2003'][2] + formatted_object['image_thumbnail_url'] = data[2][0] + except Exception as e: + print(e) + return None + return formatted_object + + # function to download single image + def single_image(self, image_url): + main_directory = "downloads" + extensions = (".jpg", ".gif", ".png", ".bmp", ".svg", ".webp", ".ico") + url = image_url + try: + os.makedirs(main_directory) + except OSError as e: + if e.errno != 17: + raise + pass + req = Request(url, headers={ + "User-Agent": "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"}) + + response = urlopen(req, None, 10) + data = response.read() + response.close() + + image_name = str(url[(url.rfind('/')) + 1:]) + if '?' 
in image_name: + image_name = image_name[:image_name.find('?')] + # if ".jpg" in image_name or ".gif" in image_name or ".png" in image_name or ".bmp" in image_name or ".svg" in image_name or ".webp" in image_name or ".ico" in image_name: + if any(map(lambda extension: extension in image_name, extensions)): + file_name = main_directory + "/" + image_name + else: + file_name = main_directory + "/" + image_name + ".jpg" + image_name = image_name + ".jpg" + + try: + output_file = open(file_name, 'wb') + output_file.write(data) + output_file.close() + except IOError as e: + raise e + except OSError as e: + raise e + print("completed ====> " + image_name.encode('raw_unicode_escape').decode('utf-8')) + return + + def similar_images(self, similar_images): + version = (3, 0) + cur_version = sys.version_info + if cur_version >= version: # If the Current Version of Python is 3.0 or above + try: + searchUrl = 'https://www.google.com/searchbyimage?site=search&sa=X&image_url=' + similar_images + headers = {} + headers[ + 'User-Agent'] = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36" + + req1 = urllib.request.Request(searchUrl, headers=headers) + resp1 = urllib.request.urlopen(req1) + content = str(resp1.read()) + l1 = content.find('AMhZZ') + l2 = content.find('&', l1) + urll = content[l1:l2] + + newurl = "https://www.google.com/search?tbs=sbi:" + urll + "&site=search&sa=X" + req2 = urllib.request.Request(newurl, headers=headers) + resp2 = urllib.request.urlopen(req2) + l3 = content.find('/search?sa=X&q=') + l4 = content.find(';', l3 + 19) + urll2 = content[l3 + 19:l4] + return urll2 + except: + return "Cloud not connect to Google Images endpoint" + else: # If the Current Version of Python is 2.x + try: + searchUrl = 'https://www.google.com/searchbyimage?site=search&sa=X&image_url=' + similar_images + headers = {} + headers[ + 'User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17" + + req1 = urllib2.Request(searchUrl, headers=headers) + resp1 = urllib2.urlopen(req1) + content = str(resp1.read()) + l1 = content.find('AMhZZ') + l2 = content.find('&', l1) + urll = content[l1:l2] + + newurl = "https://www.google.com/search?tbs=sbi:" + urll + "&site=search&sa=X" + req2 = urllib2.Request(newurl, headers=headers) + resp2 = urllib2.urlopen(req2) + l3 = content.find('/search?sa=X&q=') + l4 = content.find(';', l3 + 19) + urll2 = content[l3 + 19:l4] + return (urll2) + except: + return "Cloud not connect to Google Images endpoint" + + # Building URL parameters + def build_url_parameters(self, arguments): + if arguments['language']: + lang = "&lr=" + lang_param = {"Arabic": "lang_ar", "Chinese (Simplified)": "lang_zh-CN", + "Chinese (Traditional)": "lang_zh-TW", "Czech": "lang_cs", "Danish": "lang_da", + "Dutch": "lang_nl", "English": "lang_en", "Estonian": "lang_et", "Finnish": "lang_fi", + "French": "lang_fr", "German": "lang_de", "Greek": "lang_el", "Hebrew": "lang_iw ", + "Hungarian": "lang_hu", "Icelandic": "lang_is", "Italian": "lang_it", "Japanese": "lang_ja", + "Korean": "lang_ko", "Latvian": "lang_lv", "Lithuanian": "lang_lt", "Norwegian": "lang_no", + "Portuguese": "lang_pt", "Polish": "lang_pl", "Romanian": "lang_ro", "Russian": "lang_ru", + "Spanish": "lang_es", "Swedish": "lang_sv", "Turkish": "lang_tr"} + lang_url = lang + lang_param[arguments['language']] + else: + lang_url = '' + + if arguments['time_range']: + json_acceptable_string = arguments['time_range'].replace("'", "\"") + d 
= json.loads(json_acceptable_string) + time_range = ',cdr:1,cd_min:' + d['time_min'] + ',cd_max:' + d['time_max'] + else: + time_range = '' + + if arguments['exact_size']: + size_array = [x.strip() for x in arguments['exact_size'].split(',')] + exact_size = ",isz:ex,iszw:" + str(size_array[0]) + ",iszh:" + str(size_array[1]) + else: + exact_size = '' + + built_url = "&tbs=" + counter = 0 + params = {'color': [arguments['color'], {'red': 'ic:specific,isc:red', 'orange': 'ic:specific,isc:orange', + 'yellow': 'ic:specific,isc:yellow', 'green': 'ic:specific,isc:green', + 'teal': 'ic:specific,isc:teel', 'blue': 'ic:specific,isc:blue', + 'purple': 'ic:specific,isc:purple', 'pink': 'ic:specific,isc:pink', + 'white': 'ic:specific,isc:white', 'gray': 'ic:specific,isc:gray', + 'black': 'ic:specific,isc:black', 'brown': 'ic:specific,isc:brown'}], + 'color_type': [arguments['color_type'], + {'full-color': 'ic:color', 'black-and-white': 'ic:gray', 'transparent': 'ic:trans'}], + 'usage_rights': [arguments['usage_rights'], + {'labeled-for-reuse-with-modifications': 'sur:fmc', 'labeled-for-reuse': 'sur:fc', + 'labeled-for-noncommercial-reuse-with-modification': 'sur:fm', + 'labeled-for-nocommercial-reuse': 'sur:f'}], + 'size': [arguments['size'], + {'large': 'isz:l', 'medium': 'isz:m', 'icon': 'isz:i', '>400*300': 'isz:lt,islt:qsvga', + '>640*480': 'isz:lt,islt:vga', '>800*600': 'isz:lt,islt:svga', + '>1024*768': 'visz:lt,islt:xga', '>2MP': 'isz:lt,islt:2mp', '>4MP': 'isz:lt,islt:4mp', + '>6MP': 'isz:lt,islt:6mp', '>8MP': 'isz:lt,islt:8mp', '>10MP': 'isz:lt,islt:10mp', + '>12MP': 'isz:lt,islt:12mp', '>15MP': 'isz:lt,islt:15mp', '>20MP': 'isz:lt,islt:20mp', + '>40MP': 'isz:lt,islt:40mp', '>70MP': 'isz:lt,islt:70mp'}], + 'type': [arguments['type'], {'face': 'itp:face', 'photo': 'itp:photo', 'clipart': 'itp:clipart', + 'line-drawing': 'itp:lineart', 'animated': 'itp:animated'}], + 'time': [arguments['time'], {'past-24-hours': 'qdr:d', 'past-7-days': 'qdr:w', 'past-month': 'qdr:m', + 'past-year': 'qdr:y'}], + 'aspect_ratio': [arguments['aspect_ratio'], + {'tall': 'iar:t', 'square': 'iar:s', 'wide': 'iar:w', 'panoramic': 'iar:xw'}], + 'format': [arguments['format'], + {'jpg': 'ift:jpg', 'gif': 'ift:gif', 'png': 'ift:png', 'bmp': 'ift:bmp', 'svg': 'ift:svg', + 'webp': 'webp', 'ico': 'ift:ico', 'raw': 'ift:craw'}]} + for key, value in params.items(): + if value[0] is not None: + ext_param = value[1][value[0]] + # counter will tell if it is first param added or not + if counter == 0: + # add it to the built url + built_url = built_url + ext_param + counter += 1 + else: + built_url = built_url + ',' + ext_param + counter += 1 + built_url = lang_url + built_url + exact_size + time_range + return built_url + + # building main search URL + def build_search_url(self, search_term, params, url, similar_images, specific_site, safe_search): + # check safe_search + safe_search_string = "&safe=active" + # check the args and choose the URL + if url: + url = url + elif similar_images: + print(similar_images) + keywordem = self.similar_images(similar_images) + url = 'https://www.google.com/search?q=' + keywordem + '&espv=2&biw=1366&bih=667&site=webhp&source=lnms&tbm=isch&sa=X&ei=XosDVaCXD8TasATItgE&ved=0CAcQ_AUoAg' + elif specific_site: + url = 'https://www.google.com/search?q=' + quote( + search_term.encode( + 'utf-8')) + '&as_sitesearch=' + specific_site + '&espv=2&biw=1366&bih=667&site=webhp&source=lnms&tbm=isch' + params + '&sa=X&ei=XosDVaCXD8TasATItgE&ved=0CAcQ_AUoAg' + else: + url = 'https://www.google.com/search?q=' + 
quote( + search_term.encode( + 'utf-8')) + '&espv=2&biw=1366&bih=667&site=webhp&source=lnms&tbm=isch' + params + '&sa=X&ei=XosDVaCXD8TasATItgE&ved=0CAcQ_AUoAg' + + # safe search check + if safe_search: + url = url + safe_search_string + + return url + + # measures the file size + def file_size(self, file_path): + if os.path.isfile(file_path): + file_info = os.stat(file_path) + size = file_info.st_size + for x in ['bytes', 'KB', 'MB', 'GB', 'TB']: + if size < 1024.0: + return "%3.1f %s" % (size, x) + size /= 1024.0 + return size + + # keywords from file + def keywords_from_file(self, file_name): + search_keyword = [] + with codecs.open(file_name, 'r', encoding='utf-8-sig') as f: + if '.csv' in file_name: + for line in f: + if line in ['\n', '\r\n']: + pass + else: + search_keyword.append(line.replace('\n', '').replace('\r', '')) + elif '.txt' in file_name: + for line in f: + if line in ['\n', '\r\n']: + pass + else: + search_keyword.append(line.replace('\n', '').replace('\r', '')) + else: + print("Invalid file type: Valid file types are either .txt or .csv \n" + "exiting...") + sys.exit() + return search_keyword + + # make directories + def create_directories(self, main_directory, dir_name, thumbnail, thumbnail_only): + dir_name_thumbnail = dir_name + " - thumbnail" + # make a search keyword directory + try: + if not os.path.exists(main_directory): + os.makedirs(main_directory) + time.sleep(0.15) + path = (dir_name) + sub_directory = os.path.join(main_directory, path) + if not os.path.exists(sub_directory): + os.makedirs(sub_directory) + if thumbnail or thumbnail_only: + sub_directory_thumbnail = os.path.join(main_directory, dir_name_thumbnail) + if not os.path.exists(sub_directory_thumbnail): + os.makedirs(sub_directory_thumbnail) + else: + path = (dir_name) + sub_directory = os.path.join(main_directory, path) + if not os.path.exists(sub_directory): + os.makedirs(sub_directory) + if thumbnail or thumbnail_only: + sub_directory_thumbnail = os.path.join(main_directory, dir_name_thumbnail) + if not os.path.exists(sub_directory_thumbnail): + os.makedirs(sub_directory_thumbnail) + except OSError as e: + if e.errno != 17: + raise + pass + return + + # Download Image thumbnails + def download_image_thumbnail(self, image_url, main_directory, dir_name, return_image_name, print_urls, + socket_timeout, print_size, no_download, save_source, img_src, ignore_urls): + if print_urls or no_download: + print("Image URL: " + image_url) + if no_download: + return "success", "Printed url without downloading" + try: + req = Request(image_url, headers={ + "User-Agent": "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"}) + try: + # timeout time to download an image + if socket_timeout: + timeout = float(socket_timeout) + else: + timeout = 10 + + response = urlopen(req, None, timeout) + data = response.read() + response.close() + + path = main_directory + "/" + dir_name + " - thumbnail" + "/" + return_image_name + + try: + output_file = open(path, 'wb') + output_file.write(data) + output_file.close() + if save_source: + list_path = main_directory + "/" + save_source + ".txt" + list_file = open(list_path, 'a') + list_file.write(path + '\t' + img_src + '\n') + list_file.close() + except OSError as e: + download_status = 'fail' + download_message = "OSError on an image...trying next one..." + " Error: " + str(e) + except IOError as e: + download_status = 'fail' + download_message = "IOError on an image...trying next one..." 
+ " Error: " + str(e) + + download_status = 'success' + download_message = "Completed Image Thumbnail ====> " + return_image_name + + # image size parameter + if print_size: + print("Image Size: " + str(self.file_size(path))) + + except UnicodeEncodeError as e: + download_status = 'fail' + download_message = "UnicodeEncodeError on an image...trying next one..." + " Error: " + str(e) + + except HTTPError as e: # If there is any HTTPError + download_status = 'fail' + download_message = "HTTPError on an image...trying next one..." + " Error: " + str(e) + + except URLError as e: + download_status = 'fail' + download_message = "URLError on an image...trying next one..." + " Error: " + str(e) + + except ssl.CertificateError as e: + download_status = 'fail' + download_message = "CertificateError on an image...trying next one..." + " Error: " + str(e) + + except IOError as e: # If there is any IOError + download_status = 'fail' + download_message = "IOError on an image...trying next one..." + " Error: " + str(e) + return download_status, download_message + + # Download Images + def download_image(self, image_url, image_format, main_directory, dir_name, count, print_urls, socket_timeout, + prefix, print_size, no_numbering, no_download, save_source, img_src, silent_mode, thumbnail_only, + format, ignore_urls): + if not silent_mode: + if print_urls or no_download: + print("Image URL: " + image_url) + if ignore_urls: + if any(url in image_url for url in ignore_urls.split(',')): + return "fail", "Image ignored due to 'ignore url' parameter", None, image_url + if thumbnail_only: + return "success", "Skipping image download...", str(image_url[(image_url.rfind('/')) + 1:]), image_url + if no_download: + return "success", "Printed url without downloading", None, image_url + try: + req = Request(image_url, headers={ + "User-Agent": "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"}) + try: + # timeout time to download an image + if socket_timeout: + timeout = float(socket_timeout) + else: + timeout = 10 + + response = urlopen(req, None, timeout) + data = response.read() + info = response.info() + response.close() + + qmark = image_url.rfind('?') + if qmark == -1: + qmark = len(image_url) + slash = image_url.rfind('/', 0, qmark) + 1 + image_name = str(image_url[slash:qmark]).lower() + + type = info.get_content_type() + if type == "image/jpeg" or type == "image/jpg": + if not image_name.endswith(".jpg") and not image_name.endswith(".jpeg"): + image_name += ".jpg" + elif type == "image/png": + if not image_name.endswith(".png"): + image_name += ".png" + elif type == "image/webp": + if not image_name.endswith(".webp"): + image_name += ".webp" + elif type == "image/gif": + if not image_name.endswith(".gif"): + image_name += ".gif" + elif type == "image/bmp" or type == "image/x-windows-bmp": + if not image_name.endswith(".bmp"): + image_name += ".bmp" + elif type == "image/x-icon" or type == "image/vnd.microsoft.icon": + if not image_name.endswith(".ico"): + image_name += ".ico" + elif type == "image/svg+xml": + if not image_name.endswith(".svg"): + image_name += ".svg" + else: + download_status = 'fail' + download_message = "Invalid image format '" + type + "'. Skipping..." 
+ return_image_name = '' + absolute_path = '' + return download_status, download_message, return_image_name, absolute_path + + # prefix name in image + if prefix: + prefix = prefix + " " + else: + prefix = '' + + if no_numbering: + path = main_directory + "/" + dir_name + "/" + prefix + image_name + else: + path = main_directory + "/" + dir_name + "/" + prefix + str(count) + "." + image_name + + try: + output_file = open(path, 'wb') + output_file.write(data) + output_file.close() + if save_source: + list_path = main_directory + "/" + save_source + ".txt" + list_file = open(list_path, 'a') + list_file.write(path + '\t' + img_src + '\n') + list_file.close() + absolute_path = os.path.abspath(path) + except OSError as e: + download_status = 'fail' + download_message = "OSError on an image...trying next one..." + " Error: " + str(e) + return_image_name = '' + absolute_path = '' + + # return image name back to calling method to use it for thumbnail downloads + download_status = 'success' + download_message = "Completed Image ====> " + prefix + str(count) + "." + image_name + return_image_name = prefix + str(count) + "." + image_name + + # image size parameter + if not silent_mode: + if print_size: + print("Image Size: " + str(self.file_size(path))) + + except UnicodeEncodeError as e: + download_status = 'fail' + download_message = "UnicodeEncodeError on an image...trying next one..." + " Error: " + str(e) + return_image_name = '' + absolute_path = '' + + except URLError as e: + download_status = 'fail' + download_message = "URLError on an image...trying next one..." + " Error: " + str(e) + return_image_name = '' + absolute_path = '' + + except BadStatusLine as e: + download_status = 'fail' + download_message = "BadStatusLine on an image...trying next one..." + " Error: " + str(e) + return_image_name = '' + absolute_path = '' + + except HTTPError as e: # If there is any HTTPError + download_status = 'fail' + download_message = "HTTPError on an image...trying next one..." + " Error: " + str(e) + return_image_name = '' + absolute_path = '' + + except URLError as e: + download_status = 'fail' + download_message = "URLError on an image...trying next one..." + " Error: " + str(e) + return_image_name = '' + absolute_path = '' + + except ssl.CertificateError as e: + download_status = 'fail' + download_message = "CertificateError on an image...trying next one..." + " Error: " + str(e) + return_image_name = '' + absolute_path = '' + + except IOError as e: # If there is any IOError + download_status = 'fail' + download_message = "IOError on an image...trying next one..." + " Error: " + str(e) + return_image_name = '' + absolute_path = '' + + except IncompleteRead as e: + download_status = 'fail' + download_message = "IncompleteReadError on an image...trying next one..." + " Error: " + str(e) + return_image_name = '' + absolute_path = '' + + return download_status, download_message, return_image_name, absolute_path + + def _get_all_items(self, image_objects, main_directory, dir_name, limit, arguments): + items = [] + abs_path = [] + errorCount = 0 + i = 0 + count = 1 + while count < limit + 1 and i < len(image_objects): + if len(image_objects) == 0: + print("no_links") + break + #code added here to attempt to implement offset correctly + #was "count < int(arguments['offset'])" in hardikvasa code, this seems + # to be contrary to the implementation details. 
+ elif arguments['offset'] and count <= int(arguments['offset']): + count += 1 + #page = page[end_content:] + else: + # format the item for readability + object = self.format_object(image_objects[i]) + if arguments['metadata']: + if not arguments["silent_mode"]: + print("\nImage Metadata: " + str(object)) + + # download the images + download_status, download_message, return_image_name, absolute_path = self.download_image( + object['image_link'], object['image_format'], main_directory, dir_name, count, + arguments['print_urls'], arguments['socket_timeout'], arguments['prefix'], arguments['print_size'], + arguments['no_numbering'], arguments['no_download'], arguments['save_source'], + object['image_source'], arguments["silent_mode"], arguments["thumbnail_only"], arguments['format'], + arguments['ignore_urls']) + if not arguments["silent_mode"]: + print(download_message) + if download_status == "success": + + # download image_thumbnails + if arguments['thumbnail'] or arguments["thumbnail_only"]: + download_status, download_message_thumbnail = self.download_image_thumbnail( + object['image_thumbnail_url'], main_directory, dir_name, return_image_name, + arguments['print_urls'], arguments['socket_timeout'], arguments['print_size'], + arguments['no_download'], arguments['save_source'], object['image_source'], + arguments['ignore_urls']) + if not arguments["silent_mode"]: + print(download_message_thumbnail) + + count += 1 + object['image_filename'] = return_image_name + items.append(object) # Append all the links in the list named 'Links' + abs_path.append(absolute_path) + else: + errorCount += 1 + + # delay param + if arguments['delay']: + time.sleep(int(arguments['delay'])) + i += 1 + if count < limit: + print("\n\nUnfortunately all " + str( + limit) + " could not be downloaded because some images were not downloadable. 
" + str( + count - 1) + " is all we got for this search filter!") + return items, errorCount, abs_path + + # Bulk Download + def download(self, arguments): + paths_agg = {} + # for input coming from other python files + if __name__ != "__main__": + # if the calling file contains config_file param + if 'config_file' in arguments: + records = [] + json_file = json.load(open(arguments['config_file'])) + for record in range(0, len(json_file['Records'])): + arguments = {} + for i in args_list: + arguments[i] = None + for key, value in json_file['Records'][record].items(): + arguments[key] = value + records.append(arguments) + total_errors = 0 + for rec in records: + paths, errors = self.download_executor(rec) + for i in paths: + paths_agg[i] = paths[i] + if not arguments["silent_mode"]: + if arguments['print_paths']: + print(paths.encode('raw_unicode_escape').decode('utf-8')) + total_errors = total_errors + errors + return paths_agg, total_errors + # if the calling file contains params directly + else: + paths, errors = self.download_executor(arguments) + for i in paths: + paths_agg[i] = paths[i] + if not arguments["silent_mode"]: + if arguments['print_paths']: + print(paths.encode('raw_unicode_escape').decode('utf-8')) + return paths_agg, errors + # for input coming from CLI + else: + paths, errors = self.download_executor(arguments) + for i in paths: + paths_agg[i] = paths[i] + if not arguments["silent_mode"]: + if arguments['print_paths']: + print(paths.encode('raw_unicode_escape').decode('utf-8')) + return paths_agg, errors + + def download_executor(self, arguments): + paths = {} + errorCount = None + for arg in args_list: + if arg not in arguments: + arguments[arg] = None + ######Initialization and Validation of user arguments + if arguments['keywords']: + search_keyword = [str(item) for item in arguments['keywords'].split(',')] + + if arguments['keywords_from_file']: + search_keyword = self.keywords_from_file(arguments['keywords_from_file']) + + # both time and time range should not be allowed in the same query + if arguments['time'] and arguments['time_range']: + raise ValueError( + 'Either time or time range should be used in a query. Both cannot be used at the same time.') + + # both time and time range should not be allowed in the same query + if arguments['size'] and arguments['exact_size']: + raise ValueError( + 'Either "size" or "exact_size" should be used in a query. 
Both cannot be used at the same time.') + + # both image directory and no image directory should not be allowed in the same query + if arguments['image_directory'] and arguments['no_directory']: + raise ValueError('You can either specify image directory or specify no image directory, not both!') + + # Additional words added to keywords + if arguments['suffix_keywords']: + suffix_keywords = [" " + str(sk) for sk in arguments['suffix_keywords'].split(',')] + else: + suffix_keywords = [''] + + # Additional words added to keywords + if arguments['prefix_keywords']: + prefix_keywords = [str(sk) + " " for sk in arguments['prefix_keywords'].split(',')] + else: + prefix_keywords = [''] + + # Setting limit on number of images to be downloaded + if arguments['limit']: + limit = int(arguments['limit']) + else: + limit = 100 + + if arguments['url']: + current_time = str(datetime.datetime.now()).split('.')[0] + search_keyword = [current_time.replace(":", "_")] + + if arguments['similar_images']: + current_time = str(datetime.datetime.now()).split('.')[0] + search_keyword = [current_time.replace(":", "_")] + + # If single_image or url argument not present then keywords is mandatory argument + if arguments['single_image'] is None and arguments['url'] is None and arguments['similar_images'] is None and \ + arguments['keywords'] is None and arguments['keywords_from_file'] is None: + print('-------------------------------\n' + 'Uh oh! Keywords is a required argument \n\n' + 'Please refer to the documentation on guide to writing queries \n' + 'https://github.com/hardikvasa/google-images-download#examples' + '\n\nexiting!\n' + '-------------------------------') + sys.exit() + + # If this argument is present, set the custom output directory + if arguments['output_directory']: + main_directory = arguments['output_directory'] + else: + main_directory = "downloads" + + # Proxy settings + if arguments['proxy']: + os.environ["http_proxy"] = arguments['proxy'] + os.environ["https_proxy"] = arguments['proxy'] + ######Initialization Complete + total_errors = 0 + for pky in prefix_keywords: # 1.for every prefix keywords + for sky in suffix_keywords: # 2.for every suffix keywords + i = 0 + while i < len(search_keyword): # 3.for every main keyword + iteration = "\n" + "Item no.: " + str(i + 1) + " -->" + " Item name = " + (pky) + ( + search_keyword[i]) + (sky) + if not arguments["silent_mode"]: + print(iteration.encode('raw_unicode_escape').decode('utf-8')) + print("Evaluating...") + else: + print("Downloading images for: " + (pky) + (search_keyword[i]) + (sky) + " ...") + search_term = pky + search_keyword[i] + sky + + if arguments['image_directory']: + dir_name = arguments['image_directory'] + elif arguments['no_directory']: + dir_name = '' + else: + dir_name = search_term + ( + '-' + arguments['color'] if arguments['color'] else '') # sub-directory + + if not arguments["no_download"]: + self.create_directories(main_directory, dir_name, arguments['thumbnail'], + arguments['thumbnail_only']) # create directories in OS + + params = self.build_url_parameters(arguments) # building URL with params + + url = self.build_search_url(search_term, params, arguments['url'], arguments['similar_images'], + arguments['specific_site'], + arguments['safe_search']) # building main search url + + if limit < 101: + images, tabs = self.download_page(url) # download page + else: + images, tabs = self.download_extended_page(url, arguments['chromedriver']) + + if not arguments["silent_mode"]: + if arguments['no_download']: + print("Getting 
URLs without downloading images...") + else: + print("Starting Download...") + items, errorCount, abs_path = self._get_all_items(images, main_directory, dir_name, limit, + arguments) # get all image items and download images + paths[pky + search_keyword[i] + sky] = abs_path + + # dumps into a json file + if arguments['extract_metadata']: + try: + if not os.path.exists("logs"): + os.makedirs("logs") + except OSError as e: + print(e) + json_file = open("logs/" + search_keyword[i] + ".json", "w") + json.dump(items, json_file, indent=4, sort_keys=True) + json_file.close() + + # Related images + if arguments['related_images']: + print("\nGetting list of related keywords...this may take a few moments") + for key, value in tabs.items(): + final_search_term = (search_term + " - " + key) + print("\nNow Downloading - " + final_search_term) + if limit < 101: + images, _ = self.download_page(value) # download page + else: + images, _ = self.download_extended_page(value, arguments['chromedriver']) + self.create_directories(main_directory, final_search_term, arguments['thumbnail'], + arguments['thumbnail_only']) + self._get_all_items(images, main_directory, search_term + " - " + key, limit, arguments) + + i += 1 + total_errors = total_errors + errorCount + if not arguments["silent_mode"]: + print("\nErrors: " + str(errorCount) + "\n") + return paths, total_errors + + +# ------------- Main Program -------------# +def main(): + records = user_input() + total_errors = 0 + t0 = time.time() # start the timer + for arguments in records: + + if arguments['single_image']: # Download Single Image using a URL + response = googleimagesdownload() + response.single_image(arguments['single_image']) + else: # or download multiple images based on keywords/keyphrase search + response = googleimagesdownload() + paths, errors = response.download(arguments) # wrapping response in a variable just for consistency + total_errors = total_errors + errors + + t1 = time.time() # stop the timer + total_time = t1 - t0 # Calculating the total time required to crawl, find and download all the links of 60,000 images + if not arguments["silent_mode"]: + print("\nEverything downloaded!") + print("Total errors: " + str(total_errors)) + print("Total time taken: " + str(total_time) + " Seconds") + + +if __name__ == "__main__": + main() From c17c55d66a49b804bd3b4510efcae8a23233bf0e Mon Sep 17 00:00:00 2001 From: explosion1206 <40578236+estuhr1206@users.noreply.github.com> Date: Tue, 25 May 2021 02:29:51 -0400 Subject: [PATCH 16/31] Delete google_images_download.py just added to wrong directory by accident --- google_images_download.py | 1152 ------------------------------------- 1 file changed, 1152 deletions(-) delete mode 100644 google_images_download.py diff --git a/google_images_download.py b/google_images_download.py deleted file mode 100644 index 51bb251b..00000000 --- a/google_images_download.py +++ /dev/null @@ -1,1152 +0,0 @@ -#!/usr/bin/env python -# In[ ]: -# coding: utf-8 - -###### Searching and Downloading Google Images to the local disk ###### - -# Import Libraries -import sys - -version = (3, 0) -cur_version = sys.version_info -if cur_version >= version: # If the Current Version of Python is 3.0 or above - import urllib.request - from urllib.request import Request, urlopen - from urllib.request import URLError, HTTPError - from urllib.parse import quote - import http.client - from http.client import IncompleteRead, BadStatusLine - - http.client._MAXHEADERS = 1000 -else: # If the Current Version of Python is 2.x - import 
urllib2 - from urllib2 import Request, urlopen - from urllib2 import URLError, HTTPError - from urllib import quote - import httplib - from httplib import IncompleteRead, BadStatusLine - - httplib._MAXHEADERS = 1000 -import time # Importing the time library to check the time of code execution -import os -import argparse -import ssl -import datetime -import json -import re -import codecs -import socket - -args_list = ["keywords", "keywords_from_file", "prefix_keywords", "suffix_keywords", - "limit", "format", "color", "color_type", "usage_rights", "size", - "exact_size", "aspect_ratio", "type", "time", "time_range", "delay", "url", "single_image", - "output_directory", "image_directory", "no_directory", "proxy", "similar_images", "specific_site", - "print_urls", "print_size", "print_paths", "metadata", "extract_metadata", "socket_timeout", - "thumbnail", "thumbnail_only", "language", "prefix", "chromedriver", "related_images", "safe_search", - "no_numbering", - "offset", "no_download", "save_source", "silent_mode", "ignore_urls"] - - -def user_input(): - config = argparse.ArgumentParser() - config.add_argument('-cf', '--config_file', help='config file name', default='', type=str, required=False) - config_file_check = config.parse_known_args() - object_check = vars(config_file_check[0]) - - if object_check['config_file'] != '': - records = [] - json_file = json.load(open(config_file_check[0].config_file)) - for record in range(0, len(json_file['Records'])): - arguments = {} - for i in args_list: - arguments[i] = None - for key, value in json_file['Records'][record].items(): - arguments[key] = value - records.append(arguments) - records_count = len(records) - else: - # Taking command line arguments from users - parser = argparse.ArgumentParser() - parser.add_argument('-k', '--keywords', help='delimited list input', type=str, required=False) - parser.add_argument('-kf', '--keywords_from_file', help='extract list of keywords from a text file', type=str, - required=False) - parser.add_argument('-sk', '--suffix_keywords', - help='comma separated additional words added after to main keyword', type=str, - required=False) - parser.add_argument('-pk', '--prefix_keywords', - help='comma separated additional words added before main keyword', type=str, required=False) - parser.add_argument('-l', '--limit', help='delimited list input', type=str, required=False) - parser.add_argument('-f', '--format', help='download images with specific format', type=str, required=False, - choices=['jpg', 'gif', 'png', 'bmp', 'svg', 'webp', 'ico']) - parser.add_argument('-u', '--url', help='search with google image URL', type=str, required=False) - parser.add_argument('-x', '--single_image', help='downloading a single image from URL', type=str, - required=False) - parser.add_argument('-o', '--output_directory', help='download images in a specific main directory', type=str, - required=False) - parser.add_argument('-i', '--image_directory', help='download images in a specific sub-directory', type=str, - required=False) - parser.add_argument('-n', '--no_directory', default=False, - help='download images in the main directory but no sub-directory', action="store_true") - parser.add_argument('-d', '--delay', help='delay in seconds to wait between downloading two images', type=int, - required=False) - parser.add_argument('-co', '--color', help='filter on color', type=str, required=False, - choices=['red', 'orange', 'yellow', 'green', 'teal', 'blue', 'purple', 'pink', 'white', - 'gray', 'black', 'brown']) - 
parser.add_argument('-ct', '--color_type', help='filter on color', type=str, required=False, - choices=['full-color', 'black-and-white', 'transparent']) - parser.add_argument('-r', '--usage_rights', help='usage rights', type=str, required=False, - choices=['labeled-for-reuse-with-modifications', 'labeled-for-reuse', - 'labeled-for-noncommercial-reuse-with-modification', - 'labeled-for-nocommercial-reuse']) - parser.add_argument('-s', '--size', help='image size', type=str, required=False, - choices=['large', 'medium', 'icon', '>400*300', '>640*480', '>800*600', '>1024*768', '>2MP', - '>4MP', '>6MP', '>8MP', '>10MP', '>12MP', '>15MP', '>20MP', '>40MP', '>70MP']) - parser.add_argument('-es', '--exact_size', help='exact image resolution "WIDTH,HEIGHT"', type=str, - required=False) - parser.add_argument('-t', '--type', help='image type', type=str, required=False, - choices=['face', 'photo', 'clipart', 'line-drawing', 'animated']) - parser.add_argument('-w', '--time', help='image age', type=str, required=False, - choices=['past-24-hours', 'past-7-days', 'past-month', 'past-year']) - parser.add_argument('-wr', '--time_range', - help='time range for the age of the image. should be in the format {"time_min":"MM/DD/YYYY","time_max":"MM/DD/YYYY"}', - type=str, required=False) - parser.add_argument('-a', '--aspect_ratio', help='comma separated additional words added to keywords', type=str, - required=False, - choices=['tall', 'square', 'wide', 'panoramic']) - parser.add_argument('-si', '--similar_images', - help='downloads images very similar to the image URL you provide', type=str, required=False) - parser.add_argument('-ss', '--specific_site', help='downloads images that are indexed from a specific website', - type=str, required=False) - parser.add_argument('-p', '--print_urls', default=False, help="Print the URLs of the images", - action="store_true") - parser.add_argument('-ps', '--print_size', default=False, help="Print the size of the images on disk", - action="store_true") - parser.add_argument('-pp', '--print_paths', default=False, - help="Prints the list of absolute paths of the images", action="store_true") - parser.add_argument('-m', '--metadata', default=False, help="Print the metadata of the image", - action="store_true") - parser.add_argument('-e', '--extract_metadata', default=False, help="Dumps all the logs into a text file", - action="store_true") - parser.add_argument('-st', '--socket_timeout', default=False, - help="Connection timeout waiting for the image to download", type=float) - parser.add_argument('-th', '--thumbnail', default=False, - help="Downloads image thumbnail along with the actual image", action="store_true") - parser.add_argument('-tho', '--thumbnail_only', default=False, - help="Downloads only thumbnail without downloading actual images", action="store_true") - parser.add_argument('-la', '--language', default=False, - help="Defines the language filter. 
The search results are authomatically returned in that language", - type=str, required=False, - choices=['Arabic', 'Chinese (Simplified)', 'Chinese (Traditional)', 'Czech', 'Danish', - 'Dutch', 'English', 'Estonian', 'Finnish', 'French', 'German', 'Greek', 'Hebrew', - 'Hungarian', 'Icelandic', 'Italian', 'Japanese', 'Korean', 'Latvian', 'Lithuanian', - 'Norwegian', 'Portuguese', 'Polish', 'Romanian', 'Russian', 'Spanish', 'Swedish', - 'Turkish']) - parser.add_argument('-pr', '--prefix', default=False, - help="A word that you would want to prefix in front of each image name", type=str, - required=False) - parser.add_argument('-px', '--proxy', help='specify a proxy address and port', type=str, required=False) - parser.add_argument('-cd', '--chromedriver', - help='specify the path to chromedriver executable in your local machine', type=str, - required=False) - parser.add_argument('-ri', '--related_images', default=False, - help="Downloads images that are similar to the keyword provided", action="store_true") - parser.add_argument('-sa', '--safe_search', default=False, - help="Turns on the safe search filter while searching for images", action="store_true") - parser.add_argument('-nn', '--no_numbering', default=False, - help="Allows you to exclude the default numbering of images", action="store_true") - parser.add_argument('-of', '--offset', help="Where to start in the fetched links", type=str, required=False) - parser.add_argument('-nd', '--no_download', default=False, - help="Prints the URLs of the images and/or thumbnails without downloading them", - action="store_true") - parser.add_argument('-iu', '--ignore_urls', default=False, - help="delimited list input of image urls/keywords to ignore", type=str) - parser.add_argument('-sil', '--silent_mode', default=False, - help="Remains silent. 
Does not print notification messages on the terminal", - action="store_true") - parser.add_argument('-is', '--save_source', - help="creates a text file containing a list of downloaded images along with source page url", - type=str, required=False) - - args = parser.parse_args() - arguments = vars(args) - records = [] - records.append(arguments) - return records - - -class googleimagesdownload: - def __init__(self): - pass - - def _extract_data_pack(self, page): - start_line = page.find("AF_initDataCallback({key: \\'ds:1\\'") - 10 - start_object = page.find('[', start_line + 1) - end_object = page.rfind(']',0,page.find('', start_object + 1))+1 - object_raw = str(page[start_object:end_object]) - return bytes(object_raw, "utf-8").decode("unicode_escape") - - def _extract_data_pack_extended(self, page): - start_line = page.find("AF_initDataCallback({key: 'ds:1'") - 10 - start_object = page.find('[', start_line + 1) - end_object = page.rfind(']',0,page.find('', start_object + 1)) + 1 - return str(page[start_object:end_object]) - - def _extract_data_pack_ajax(self, data): - lines = data.split('\n') - return json.loads(lines[3] + lines[4])[0][2] - - def _image_objects_from_pack(self, data): - image_objects = json.loads(data)[31][0][12][2] - image_objects = [x for x in image_objects if x[0] == 1] - return image_objects - - # Downloading entire Web Document (Raw Page Content) - def download_page(self, url): - version = (3, 0) - cur_version = sys.version_info - headers = {} - headers[ - 'User-Agent'] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36" - if cur_version >= version: # If the Current Version of Python is 3.0 or above - try: - req = urllib.request.Request(url, headers=headers) - resp = urllib.request.urlopen(req) - respData = str(resp.read()) - except: - print("Could not open URL. Please check your internet connection and/or ssl settings \n" - "If you are using proxy, make sure your proxy settings is configured correctly") - sys.exit() - else: # If the Current Version of Python is 2.x - try: - req = urllib2.Request(url, headers=headers) - try: - response = urllib2.urlopen(req) - except URLError: # Handling SSL certificate failed - context = ssl._create_unverified_context() - response = urlopen(req, context=context) - respData = response.read() - except: - print("Could not open URL. Please check your internet connection and/or ssl settings \n" - "If you are using proxy, make sure your proxy settings is configured correctly") - sys.exit() - return "Page Not found" - try: - return self._image_objects_from_pack(self._extract_data_pack(respData)), self.get_all_tabs(respData) - except Exception as e: - print(e) - print('Image objects data unpacking failed. Please leave a comment with the above error at https://github.com/hardikvasa/google-images-download/pull/298') - sys.exit() - - # Download Page for more than 100 images - def download_extended_page(self, url, chromedriver): - from selenium import webdriver - from selenium.webdriver.common.keys import Keys - if sys.version_info[0] < 3: - reload(sys) - sys.setdefaultencoding('utf8') - options = webdriver.ChromeOptions() - options.add_argument('--no-sandbox') - options.add_argument("--headless") - - try: - browser = webdriver.Chrome(chromedriver, chrome_options=options) - except Exception as e: - print("Looks like we cannot locate the path the 'chromedriver' (use the '--chromedriver' " - "argument to specify the path to the executable.) 
or google chrome browser is not " - "installed on your machine (exception: %s)" % e) - sys.exit() - browser.set_window_size(1024, 768) - - # Open the link - browser.get(url) - browser.execute_script(""" - (function(XHR){ - "use strict"; - var open = XHR.prototype.open; - var send = XHR.prototype.send; - var data = []; - XHR.prototype.open = function(method, url, async, user, pass) { - this._url = url; - open.call(this, method, url, async, user, pass); - } - XHR.prototype.send = function(data) { - var self = this; - var url = this._url; - function stateChanged() { - if (self.readyState == 4) { - console.log("data available for: " + url) - XHR.prototype._data.push(self.response); - } - } - if (url.includes("/batchexecute?")) { - this.addEventListener("readystatechange", stateChanged, false); - } - send.call(this, data); - }; - XHR.prototype._data = []; - })(XMLHttpRequest); - """) - - time.sleep(1) - print("Getting you a lot of images. This may take a few moments...") - - element = browser.find_element_by_tag_name("body") - # Scroll down - for i in range(30): - element.send_keys(Keys.PAGE_DOWN) - time.sleep(0.3) - - try: - browser.find_element_by_id("smb").click() - for i in range(50): - element.send_keys(Keys.PAGE_DOWN) - time.sleep(0.3) # bot id protection - except: - for i in range(10): - element.send_keys(Keys.PAGE_DOWN) - time.sleep(0.3) # bot id protection - - print("Reached end of Page.") - time.sleep(0.5) - - source = browser.page_source # page source - images = self._image_objects_from_pack(self._extract_data_pack_extended(source)) - - ajax_data = browser.execute_script("return XMLHttpRequest.prototype._data") - for chunk in ajax_data: - images += self._image_objects_from_pack(self._extract_data_pack_ajax(chunk)) - - # close the browser - browser.close() - - return images, self.get_all_tabs(source) - - # Correcting the escape characters for python2 - def replace_with_byte(self, match): - return chr(int(match.group(0)[1:], 8)) - - def repair(self, brokenjson): - invalid_escape = re.compile(r'\\[0-7]{1,3}') # up to 3 digits for byte values up to FF - return invalid_escape.sub(self.replace_with_byte, brokenjson) - - # Finding 'Next Image' from the given raw page - def get_next_tab(self, s): - start_line = s.find('class="dtviD"') - if start_line == -1: # If no links are found then give an error! 
- end_quote = 0 - link = "no_tabs" - return link, '', end_quote - else: - start_line = s.find('class="dtviD"') - start_content = s.find('href="', start_line + 1) - end_content = s.find('">', start_content + 1) - url_item = "https://www.google.com" + str(s[start_content + 6:end_content]) - url_item = url_item.replace('&', '&') - - start_line_2 = s.find('class="dtviD"') - s = s.replace('&', '&') - start_content_2 = s.find(':', start_line_2 + 1) - end_content_2 = s.find('&usg=', start_content_2 + 1) - url_item_name = str(s[start_content_2 + 1:end_content_2]) - - chars = url_item_name.find(',g_1:') - chars_end = url_item_name.find(":", chars + 6) - if chars_end == -1: - updated_item_name = (url_item_name[chars + 5:]).replace("+", " ") - else: - updated_item_name = (url_item_name[chars + 5:chars_end]).replace("+", " ") - - return url_item, updated_item_name, end_content - - # Getting all links with the help of '_images_get_next_image' - def get_all_tabs(self, page): - tabs = {} - while True: - item, item_name, end_content = self.get_next_tab(page) - if item == "no_tabs": - break - else: - if len(item_name) > 100 or item_name == "background-color": - break - else: - tabs[item_name] = item # Append all the links in the list named 'Links' - time.sleep(0.1) # Timer could be used to slow down the request for image downloads - page = page[end_content:] - return tabs - - # Format the object in readable format - def format_object(self, object): - data = object[1] - main = data[3] - info = data[9] - if info is None: - info = data[11] - formatted_object = {} - try: - formatted_object['image_height'] = main[2] - formatted_object['image_width'] = main[1] - formatted_object['image_link'] = main[0] - formatted_object['image_format'] = main[0][-1 * (len(main[0]) - main[0].rfind(".") - 1):] - formatted_object['image_description'] = info['2003'][3] - formatted_object['image_host'] = info['183836587'][0] - formatted_object['image_source'] = info['2003'][2] - formatted_object['image_thumbnail_url'] = data[2][0] - except Exception as e: - print(e) - return None - return formatted_object - - # function to download single image - def single_image(self, image_url): - main_directory = "downloads" - extensions = (".jpg", ".gif", ".png", ".bmp", ".svg", ".webp", ".ico") - url = image_url - try: - os.makedirs(main_directory) - except OSError as e: - if e.errno != 17: - raise - pass - req = Request(url, headers={ - "User-Agent": "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"}) - - response = urlopen(req, None, 10) - data = response.read() - response.close() - - image_name = str(url[(url.rfind('/')) + 1:]) - if '?' 
in image_name: - image_name = image_name[:image_name.find('?')] - # if ".jpg" in image_name or ".gif" in image_name or ".png" in image_name or ".bmp" in image_name or ".svg" in image_name or ".webp" in image_name or ".ico" in image_name: - if any(map(lambda extension: extension in image_name, extensions)): - file_name = main_directory + "/" + image_name - else: - file_name = main_directory + "/" + image_name + ".jpg" - image_name = image_name + ".jpg" - - try: - output_file = open(file_name, 'wb') - output_file.write(data) - output_file.close() - except IOError as e: - raise e - except OSError as e: - raise e - print("completed ====> " + image_name.encode('raw_unicode_escape').decode('utf-8')) - return - - def similar_images(self, similar_images): - version = (3, 0) - cur_version = sys.version_info - if cur_version >= version: # If the Current Version of Python is 3.0 or above - try: - searchUrl = 'https://www.google.com/searchbyimage?site=search&sa=X&image_url=' + similar_images - headers = {} - headers[ - 'User-Agent'] = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36" - - req1 = urllib.request.Request(searchUrl, headers=headers) - resp1 = urllib.request.urlopen(req1) - content = str(resp1.read()) - l1 = content.find('AMhZZ') - l2 = content.find('&', l1) - urll = content[l1:l2] - - newurl = "https://www.google.com/search?tbs=sbi:" + urll + "&site=search&sa=X" - req2 = urllib.request.Request(newurl, headers=headers) - resp2 = urllib.request.urlopen(req2) - l3 = content.find('/search?sa=X&q=') - l4 = content.find(';', l3 + 19) - urll2 = content[l3 + 19:l4] - return urll2 - except: - return "Cloud not connect to Google Images endpoint" - else: # If the Current Version of Python is 2.x - try: - searchUrl = 'https://www.google.com/searchbyimage?site=search&sa=X&image_url=' + similar_images - headers = {} - headers[ - 'User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17" - - req1 = urllib2.Request(searchUrl, headers=headers) - resp1 = urllib2.urlopen(req1) - content = str(resp1.read()) - l1 = content.find('AMhZZ') - l2 = content.find('&', l1) - urll = content[l1:l2] - - newurl = "https://www.google.com/search?tbs=sbi:" + urll + "&site=search&sa=X" - req2 = urllib2.Request(newurl, headers=headers) - resp2 = urllib2.urlopen(req2) - l3 = content.find('/search?sa=X&q=') - l4 = content.find(';', l3 + 19) - urll2 = content[l3 + 19:l4] - return (urll2) - except: - return "Cloud not connect to Google Images endpoint" - - # Building URL parameters - def build_url_parameters(self, arguments): - if arguments['language']: - lang = "&lr=" - lang_param = {"Arabic": "lang_ar", "Chinese (Simplified)": "lang_zh-CN", - "Chinese (Traditional)": "lang_zh-TW", "Czech": "lang_cs", "Danish": "lang_da", - "Dutch": "lang_nl", "English": "lang_en", "Estonian": "lang_et", "Finnish": "lang_fi", - "French": "lang_fr", "German": "lang_de", "Greek": "lang_el", "Hebrew": "lang_iw ", - "Hungarian": "lang_hu", "Icelandic": "lang_is", "Italian": "lang_it", "Japanese": "lang_ja", - "Korean": "lang_ko", "Latvian": "lang_lv", "Lithuanian": "lang_lt", "Norwegian": "lang_no", - "Portuguese": "lang_pt", "Polish": "lang_pl", "Romanian": "lang_ro", "Russian": "lang_ru", - "Spanish": "lang_es", "Swedish": "lang_sv", "Turkish": "lang_tr"} - lang_url = lang + lang_param[arguments['language']] - else: - lang_url = '' - - if arguments['time_range']: - json_acceptable_string = arguments['time_range'].replace("'", "\"") - d 
= json.loads(json_acceptable_string) - time_range = ',cdr:1,cd_min:' + d['time_min'] + ',cd_max:' + d['time_max'] - else: - time_range = '' - - if arguments['exact_size']: - size_array = [x.strip() for x in arguments['exact_size'].split(',')] - exact_size = ",isz:ex,iszw:" + str(size_array[0]) + ",iszh:" + str(size_array[1]) - else: - exact_size = '' - - built_url = "&tbs=" - counter = 0 - params = {'color': [arguments['color'], {'red': 'ic:specific,isc:red', 'orange': 'ic:specific,isc:orange', - 'yellow': 'ic:specific,isc:yellow', 'green': 'ic:specific,isc:green', - 'teal': 'ic:specific,isc:teel', 'blue': 'ic:specific,isc:blue', - 'purple': 'ic:specific,isc:purple', 'pink': 'ic:specific,isc:pink', - 'white': 'ic:specific,isc:white', 'gray': 'ic:specific,isc:gray', - 'black': 'ic:specific,isc:black', 'brown': 'ic:specific,isc:brown'}], - 'color_type': [arguments['color_type'], - {'full-color': 'ic:color', 'black-and-white': 'ic:gray', 'transparent': 'ic:trans'}], - 'usage_rights': [arguments['usage_rights'], - {'labeled-for-reuse-with-modifications': 'sur:fmc', 'labeled-for-reuse': 'sur:fc', - 'labeled-for-noncommercial-reuse-with-modification': 'sur:fm', - 'labeled-for-nocommercial-reuse': 'sur:f'}], - 'size': [arguments['size'], - {'large': 'isz:l', 'medium': 'isz:m', 'icon': 'isz:i', '>400*300': 'isz:lt,islt:qsvga', - '>640*480': 'isz:lt,islt:vga', '>800*600': 'isz:lt,islt:svga', - '>1024*768': 'visz:lt,islt:xga', '>2MP': 'isz:lt,islt:2mp', '>4MP': 'isz:lt,islt:4mp', - '>6MP': 'isz:lt,islt:6mp', '>8MP': 'isz:lt,islt:8mp', '>10MP': 'isz:lt,islt:10mp', - '>12MP': 'isz:lt,islt:12mp', '>15MP': 'isz:lt,islt:15mp', '>20MP': 'isz:lt,islt:20mp', - '>40MP': 'isz:lt,islt:40mp', '>70MP': 'isz:lt,islt:70mp'}], - 'type': [arguments['type'], {'face': 'itp:face', 'photo': 'itp:photo', 'clipart': 'itp:clipart', - 'line-drawing': 'itp:lineart', 'animated': 'itp:animated'}], - 'time': [arguments['time'], {'past-24-hours': 'qdr:d', 'past-7-days': 'qdr:w', 'past-month': 'qdr:m', - 'past-year': 'qdr:y'}], - 'aspect_ratio': [arguments['aspect_ratio'], - {'tall': 'iar:t', 'square': 'iar:s', 'wide': 'iar:w', 'panoramic': 'iar:xw'}], - 'format': [arguments['format'], - {'jpg': 'ift:jpg', 'gif': 'ift:gif', 'png': 'ift:png', 'bmp': 'ift:bmp', 'svg': 'ift:svg', - 'webp': 'webp', 'ico': 'ift:ico', 'raw': 'ift:craw'}]} - for key, value in params.items(): - if value[0] is not None: - ext_param = value[1][value[0]] - # counter will tell if it is first param added or not - if counter == 0: - # add it to the built url - built_url = built_url + ext_param - counter += 1 - else: - built_url = built_url + ',' + ext_param - counter += 1 - built_url = lang_url + built_url + exact_size + time_range - return built_url - - # building main search URL - def build_search_url(self, search_term, params, url, similar_images, specific_site, safe_search): - # check safe_search - safe_search_string = "&safe=active" - # check the args and choose the URL - if url: - url = url - elif similar_images: - print(similar_images) - keywordem = self.similar_images(similar_images) - url = 'https://www.google.com/search?q=' + keywordem + '&espv=2&biw=1366&bih=667&site=webhp&source=lnms&tbm=isch&sa=X&ei=XosDVaCXD8TasATItgE&ved=0CAcQ_AUoAg' - elif specific_site: - url = 'https://www.google.com/search?q=' + quote( - search_term.encode( - 'utf-8')) + '&as_sitesearch=' + specific_site + '&espv=2&biw=1366&bih=667&site=webhp&source=lnms&tbm=isch' + params + '&sa=X&ei=XosDVaCXD8TasATItgE&ved=0CAcQ_AUoAg' - else: - url = 'https://www.google.com/search?q=' + 
quote( - search_term.encode( - 'utf-8')) + '&espv=2&biw=1366&bih=667&site=webhp&source=lnms&tbm=isch' + params + '&sa=X&ei=XosDVaCXD8TasATItgE&ved=0CAcQ_AUoAg' - - # safe search check - if safe_search: - url = url + safe_search_string - - return url - - # measures the file size - def file_size(self, file_path): - if os.path.isfile(file_path): - file_info = os.stat(file_path) - size = file_info.st_size - for x in ['bytes', 'KB', 'MB', 'GB', 'TB']: - if size < 1024.0: - return "%3.1f %s" % (size, x) - size /= 1024.0 - return size - - # keywords from file - def keywords_from_file(self, file_name): - search_keyword = [] - with codecs.open(file_name, 'r', encoding='utf-8-sig') as f: - if '.csv' in file_name: - for line in f: - if line in ['\n', '\r\n']: - pass - else: - search_keyword.append(line.replace('\n', '').replace('\r', '')) - elif '.txt' in file_name: - for line in f: - if line in ['\n', '\r\n']: - pass - else: - search_keyword.append(line.replace('\n', '').replace('\r', '')) - else: - print("Invalid file type: Valid file types are either .txt or .csv \n" - "exiting...") - sys.exit() - return search_keyword - - # make directories - def create_directories(self, main_directory, dir_name, thumbnail, thumbnail_only): - dir_name_thumbnail = dir_name + " - thumbnail" - # make a search keyword directory - try: - if not os.path.exists(main_directory): - os.makedirs(main_directory) - time.sleep(0.15) - path = (dir_name) - sub_directory = os.path.join(main_directory, path) - if not os.path.exists(sub_directory): - os.makedirs(sub_directory) - if thumbnail or thumbnail_only: - sub_directory_thumbnail = os.path.join(main_directory, dir_name_thumbnail) - if not os.path.exists(sub_directory_thumbnail): - os.makedirs(sub_directory_thumbnail) - else: - path = (dir_name) - sub_directory = os.path.join(main_directory, path) - if not os.path.exists(sub_directory): - os.makedirs(sub_directory) - if thumbnail or thumbnail_only: - sub_directory_thumbnail = os.path.join(main_directory, dir_name_thumbnail) - if not os.path.exists(sub_directory_thumbnail): - os.makedirs(sub_directory_thumbnail) - except OSError as e: - if e.errno != 17: - raise - pass - return - - # Download Image thumbnails - def download_image_thumbnail(self, image_url, main_directory, dir_name, return_image_name, print_urls, - socket_timeout, print_size, no_download, save_source, img_src, ignore_urls): - if print_urls or no_download: - print("Image URL: " + image_url) - if no_download: - return "success", "Printed url without downloading" - try: - req = Request(image_url, headers={ - "User-Agent": "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"}) - try: - # timeout time to download an image - if socket_timeout: - timeout = float(socket_timeout) - else: - timeout = 10 - - response = urlopen(req, None, timeout) - data = response.read() - response.close() - - path = main_directory + "/" + dir_name + " - thumbnail" + "/" + return_image_name - - try: - output_file = open(path, 'wb') - output_file.write(data) - output_file.close() - if save_source: - list_path = main_directory + "/" + save_source + ".txt" - list_file = open(list_path, 'a') - list_file.write(path + '\t' + img_src + '\n') - list_file.close() - except OSError as e: - download_status = 'fail' - download_message = "OSError on an image...trying next one..." + " Error: " + str(e) - except IOError as e: - download_status = 'fail' - download_message = "IOError on an image...trying next one..." 
+ " Error: " + str(e) - - download_status = 'success' - download_message = "Completed Image Thumbnail ====> " + return_image_name - - # image size parameter - if print_size: - print("Image Size: " + str(self.file_size(path))) - - except UnicodeEncodeError as e: - download_status = 'fail' - download_message = "UnicodeEncodeError on an image...trying next one..." + " Error: " + str(e) - - except HTTPError as e: # If there is any HTTPError - download_status = 'fail' - download_message = "HTTPError on an image...trying next one..." + " Error: " + str(e) - - except URLError as e: - download_status = 'fail' - download_message = "URLError on an image...trying next one..." + " Error: " + str(e) - - except ssl.CertificateError as e: - download_status = 'fail' - download_message = "CertificateError on an image...trying next one..." + " Error: " + str(e) - - except IOError as e: # If there is any IOError - download_status = 'fail' - download_message = "IOError on an image...trying next one..." + " Error: " + str(e) - return download_status, download_message - - # Download Images - def download_image(self, image_url, image_format, main_directory, dir_name, count, print_urls, socket_timeout, - prefix, print_size, no_numbering, no_download, save_source, img_src, silent_mode, thumbnail_only, - format, ignore_urls): - if not silent_mode: - if print_urls or no_download: - print("Image URL: " + image_url) - if ignore_urls: - if any(url in image_url for url in ignore_urls.split(',')): - return "fail", "Image ignored due to 'ignore url' parameter", None, image_url - if thumbnail_only: - return "success", "Skipping image download...", str(image_url[(image_url.rfind('/')) + 1:]), image_url - if no_download: - return "success", "Printed url without downloading", None, image_url - try: - req = Request(image_url, headers={ - "User-Agent": "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"}) - try: - # timeout time to download an image - if socket_timeout: - timeout = float(socket_timeout) - else: - timeout = 10 - - response = urlopen(req, None, timeout) - data = response.read() - info = response.info() - response.close() - - qmark = image_url.rfind('?') - if qmark == -1: - qmark = len(image_url) - slash = image_url.rfind('/', 0, qmark) + 1 - image_name = str(image_url[slash:qmark]).lower() - - type = info.get_content_type() - if type == "image/jpeg" or type == "image/jpg": - if not image_name.endswith(".jpg") and not image_name.endswith(".jpeg"): - image_name += ".jpg" - elif type == "image/png": - if not image_name.endswith(".png"): - image_name += ".png" - elif type == "image/webp": - if not image_name.endswith(".webp"): - image_name += ".webp" - elif type == "image/gif": - if not image_name.endswith(".gif"): - image_name += ".gif" - elif type == "image/bmp" or type == "image/x-windows-bmp": - if not image_name.endswith(".bmp"): - image_name += ".bmp" - elif type == "image/x-icon" or type == "image/vnd.microsoft.icon": - if not image_name.endswith(".ico"): - image_name += ".ico" - elif type == "image/svg+xml": - if not image_name.endswith(".svg"): - image_name += ".svg" - else: - download_status = 'fail' - download_message = "Invalid image format '" + type + "'. Skipping..." 
- return_image_name = '' - absolute_path = '' - return download_status, download_message, return_image_name, absolute_path - - # prefix name in image - if prefix: - prefix = prefix + " " - else: - prefix = '' - - if no_numbering: - path = main_directory + "/" + dir_name + "/" + prefix + image_name - else: - path = main_directory + "/" + dir_name + "/" + prefix + str(count) + "." + image_name - - try: - output_file = open(path, 'wb') - output_file.write(data) - output_file.close() - if save_source: - list_path = main_directory + "/" + save_source + ".txt" - list_file = open(list_path, 'a') - list_file.write(path + '\t' + img_src + '\n') - list_file.close() - absolute_path = os.path.abspath(path) - except OSError as e: - download_status = 'fail' - download_message = "OSError on an image...trying next one..." + " Error: " + str(e) - return_image_name = '' - absolute_path = '' - - # return image name back to calling method to use it for thumbnail downloads - download_status = 'success' - download_message = "Completed Image ====> " + prefix + str(count) + "." + image_name - return_image_name = prefix + str(count) + "." + image_name - - # image size parameter - if not silent_mode: - if print_size: - print("Image Size: " + str(self.file_size(path))) - - except UnicodeEncodeError as e: - download_status = 'fail' - download_message = "UnicodeEncodeError on an image...trying next one..." + " Error: " + str(e) - return_image_name = '' - absolute_path = '' - - except URLError as e: - download_status = 'fail' - download_message = "URLError on an image...trying next one..." + " Error: " + str(e) - return_image_name = '' - absolute_path = '' - - except BadStatusLine as e: - download_status = 'fail' - download_message = "BadStatusLine on an image...trying next one..." + " Error: " + str(e) - return_image_name = '' - absolute_path = '' - - except HTTPError as e: # If there is any HTTPError - download_status = 'fail' - download_message = "HTTPError on an image...trying next one..." + " Error: " + str(e) - return_image_name = '' - absolute_path = '' - - except URLError as e: - download_status = 'fail' - download_message = "URLError on an image...trying next one..." + " Error: " + str(e) - return_image_name = '' - absolute_path = '' - - except ssl.CertificateError as e: - download_status = 'fail' - download_message = "CertificateError on an image...trying next one..." + " Error: " + str(e) - return_image_name = '' - absolute_path = '' - - except IOError as e: # If there is any IOError - download_status = 'fail' - download_message = "IOError on an image...trying next one..." + " Error: " + str(e) - return_image_name = '' - absolute_path = '' - - except IncompleteRead as e: - download_status = 'fail' - download_message = "IncompleteReadError on an image...trying next one..." + " Error: " + str(e) - return_image_name = '' - absolute_path = '' - - return download_status, download_message, return_image_name, absolute_path - - def _get_all_items(self, image_objects, main_directory, dir_name, limit, arguments): - items = [] - abs_path = [] - errorCount = 0 - i = 0 - count = 1 - while count < limit + 1 and i < len(image_objects): - if len(image_objects) == 0: - print("no_links") - break - #code added here to attempt to implement offset correctly - #was "count < int(arguments['offset'])" in hardikvasa code, this seems - # to be contrary to the implementation details. 
- elif arguments['offset'] and count <= int(arguments['offset']): - count += 1 - #page = page[end_content:] - else: - # format the item for readability - object = self.format_object(image_objects[i]) - if arguments['metadata']: - if not arguments["silent_mode"]: - print("\nImage Metadata: " + str(object)) - - # download the images - download_status, download_message, return_image_name, absolute_path = self.download_image( - object['image_link'], object['image_format'], main_directory, dir_name, count, - arguments['print_urls'], arguments['socket_timeout'], arguments['prefix'], arguments['print_size'], - arguments['no_numbering'], arguments['no_download'], arguments['save_source'], - object['image_source'], arguments["silent_mode"], arguments["thumbnail_only"], arguments['format'], - arguments['ignore_urls']) - if not arguments["silent_mode"]: - print(download_message) - if download_status == "success": - - # download image_thumbnails - if arguments['thumbnail'] or arguments["thumbnail_only"]: - download_status, download_message_thumbnail = self.download_image_thumbnail( - object['image_thumbnail_url'], main_directory, dir_name, return_image_name, - arguments['print_urls'], arguments['socket_timeout'], arguments['print_size'], - arguments['no_download'], arguments['save_source'], object['image_source'], - arguments['ignore_urls']) - if not arguments["silent_mode"]: - print(download_message_thumbnail) - - count += 1 - object['image_filename'] = return_image_name - items.append(object) # Append all the links in the list named 'Links' - abs_path.append(absolute_path) - else: - errorCount += 1 - - # delay param - if arguments['delay']: - time.sleep(int(arguments['delay'])) - i += 1 - if count < limit: - print("\n\nUnfortunately all " + str( - limit) + " could not be downloaded because some images were not downloadable. 
" + str( - count - 1) + " is all we got for this search filter!") - return items, errorCount, abs_path - - # Bulk Download - def download(self, arguments): - paths_agg = {} - # for input coming from other python files - if __name__ != "__main__": - # if the calling file contains config_file param - if 'config_file' in arguments: - records = [] - json_file = json.load(open(arguments['config_file'])) - for record in range(0, len(json_file['Records'])): - arguments = {} - for i in args_list: - arguments[i] = None - for key, value in json_file['Records'][record].items(): - arguments[key] = value - records.append(arguments) - total_errors = 0 - for rec in records: - paths, errors = self.download_executor(rec) - for i in paths: - paths_agg[i] = paths[i] - if not arguments["silent_mode"]: - if arguments['print_paths']: - print(paths.encode('raw_unicode_escape').decode('utf-8')) - total_errors = total_errors + errors - return paths_agg, total_errors - # if the calling file contains params directly - else: - paths, errors = self.download_executor(arguments) - for i in paths: - paths_agg[i] = paths[i] - if not arguments["silent_mode"]: - if arguments['print_paths']: - print(paths.encode('raw_unicode_escape').decode('utf-8')) - return paths_agg, errors - # for input coming from CLI - else: - paths, errors = self.download_executor(arguments) - for i in paths: - paths_agg[i] = paths[i] - if not arguments["silent_mode"]: - if arguments['print_paths']: - print(paths.encode('raw_unicode_escape').decode('utf-8')) - return paths_agg, errors - - def download_executor(self, arguments): - paths = {} - errorCount = None - for arg in args_list: - if arg not in arguments: - arguments[arg] = None - ######Initialization and Validation of user arguments - if arguments['keywords']: - search_keyword = [str(item) for item in arguments['keywords'].split(',')] - - if arguments['keywords_from_file']: - search_keyword = self.keywords_from_file(arguments['keywords_from_file']) - - # both time and time range should not be allowed in the same query - if arguments['time'] and arguments['time_range']: - raise ValueError( - 'Either time or time range should be used in a query. Both cannot be used at the same time.') - - # both time and time range should not be allowed in the same query - if arguments['size'] and arguments['exact_size']: - raise ValueError( - 'Either "size" or "exact_size" should be used in a query. 
Both cannot be used at the same time.') - - # both image directory and no image directory should not be allowed in the same query - if arguments['image_directory'] and arguments['no_directory']: - raise ValueError('You can either specify image directory or specify no image directory, not both!') - - # Additional words added to keywords - if arguments['suffix_keywords']: - suffix_keywords = [" " + str(sk) for sk in arguments['suffix_keywords'].split(',')] - else: - suffix_keywords = [''] - - # Additional words added to keywords - if arguments['prefix_keywords']: - prefix_keywords = [str(sk) + " " for sk in arguments['prefix_keywords'].split(',')] - else: - prefix_keywords = [''] - - # Setting limit on number of images to be downloaded - if arguments['limit']: - limit = int(arguments['limit']) - else: - limit = 100 - - if arguments['url']: - current_time = str(datetime.datetime.now()).split('.')[0] - search_keyword = [current_time.replace(":", "_")] - - if arguments['similar_images']: - current_time = str(datetime.datetime.now()).split('.')[0] - search_keyword = [current_time.replace(":", "_")] - - # If single_image or url argument not present then keywords is mandatory argument - if arguments['single_image'] is None and arguments['url'] is None and arguments['similar_images'] is None and \ - arguments['keywords'] is None and arguments['keywords_from_file'] is None: - print('-------------------------------\n' - 'Uh oh! Keywords is a required argument \n\n' - 'Please refer to the documentation on guide to writing queries \n' - 'https://github.com/hardikvasa/google-images-download#examples' - '\n\nexiting!\n' - '-------------------------------') - sys.exit() - - # If this argument is present, set the custom output directory - if arguments['output_directory']: - main_directory = arguments['output_directory'] - else: - main_directory = "downloads" - - # Proxy settings - if arguments['proxy']: - os.environ["http_proxy"] = arguments['proxy'] - os.environ["https_proxy"] = arguments['proxy'] - ######Initialization Complete - total_errors = 0 - for pky in prefix_keywords: # 1.for every prefix keywords - for sky in suffix_keywords: # 2.for every suffix keywords - i = 0 - while i < len(search_keyword): # 3.for every main keyword - iteration = "\n" + "Item no.: " + str(i + 1) + " -->" + " Item name = " + (pky) + ( - search_keyword[i]) + (sky) - if not arguments["silent_mode"]: - print(iteration.encode('raw_unicode_escape').decode('utf-8')) - print("Evaluating...") - else: - print("Downloading images for: " + (pky) + (search_keyword[i]) + (sky) + " ...") - search_term = pky + search_keyword[i] + sky - - if arguments['image_directory']: - dir_name = arguments['image_directory'] - elif arguments['no_directory']: - dir_name = '' - else: - dir_name = search_term + ( - '-' + arguments['color'] if arguments['color'] else '') # sub-directory - - if not arguments["no_download"]: - self.create_directories(main_directory, dir_name, arguments['thumbnail'], - arguments['thumbnail_only']) # create directories in OS - - params = self.build_url_parameters(arguments) # building URL with params - - url = self.build_search_url(search_term, params, arguments['url'], arguments['similar_images'], - arguments['specific_site'], - arguments['safe_search']) # building main search url - - if limit < 101: - images, tabs = self.download_page(url) # download page - else: - images, tabs = self.download_extended_page(url, arguments['chromedriver']) - - if not arguments["silent_mode"]: - if arguments['no_download']: - print("Getting 
URLs without downloading images...") - else: - print("Starting Download...") - items, errorCount, abs_path = self._get_all_items(images, main_directory, dir_name, limit, - arguments) # get all image items and download images - paths[pky + search_keyword[i] + sky] = abs_path - - # dumps into a json file - if arguments['extract_metadata']: - try: - if not os.path.exists("logs"): - os.makedirs("logs") - except OSError as e: - print(e) - json_file = open("logs/" + search_keyword[i] + ".json", "w") - json.dump(items, json_file, indent=4, sort_keys=True) - json_file.close() - - # Related images - if arguments['related_images']: - print("\nGetting list of related keywords...this may take a few moments") - for key, value in tabs.items(): - final_search_term = (search_term + " - " + key) - print("\nNow Downloading - " + final_search_term) - if limit < 101: - images, _ = self.download_page(value) # download page - else: - images, _ = self.download_extended_page(value, arguments['chromedriver']) - self.create_directories(main_directory, final_search_term, arguments['thumbnail'], - arguments['thumbnail_only']) - self._get_all_items(images, main_directory, search_term + " - " + key, limit, arguments) - - i += 1 - total_errors = total_errors + errorCount - if not arguments["silent_mode"]: - print("\nErrors: " + str(errorCount) + "\n") - return paths, total_errors - - -# ------------- Main Program -------------# -def main(): - records = user_input() - total_errors = 0 - t0 = time.time() # start the timer - for arguments in records: - - if arguments['single_image']: # Download Single Image using a URL - response = googleimagesdownload() - response.single_image(arguments['single_image']) - else: # or download multiple images based on keywords/keyphrase search - response = googleimagesdownload() - paths, errors = response.download(arguments) # wrapping response in a variable just for consistency - total_errors = total_errors + errors - - t1 = time.time() # stop the timer - total_time = t1 - t0 # Calculating the total time required to crawl, find and download all the links of 60,000 images - if not arguments["silent_mode"]: - print("\nEverything downloaded!") - print("Total errors: " + str(total_errors)) - print("Total time taken: " + str(total_time) + " Seconds") - - -if __name__ == "__main__": - main() From 4c5e6a4d52c9c6082ae00e9eb4727e0e8d3a855c Mon Sep 17 00:00:00 2001 From: explosion1206 <40578236+estuhr1206@users.noreply.github.com> Date: Tue, 25 May 2021 02:30:26 -0400 Subject: [PATCH 17/31] Add files via upload --- google_images_download/google_images_download.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/google_images_download/google_images_download.py b/google_images_download/google_images_download.py index 49dcee77..51bb251b 100755 --- a/google_images_download/google_images_download.py +++ b/google_images_download/google_images_download.py @@ -876,6 +876,12 @@ def _get_all_items(self, image_objects, main_directory, dir_name, limit, argumen if len(image_objects) == 0: print("no_links") break + #code added here to attempt to implement offset correctly + #was "count < int(arguments['offset'])" in hardikvasa code, this seems + # to be contrary to the implementation details. 
+            elif arguments['offset'] and count <= int(arguments['offset']):
+                count += 1
+                #page = page[end_content:]
             else:
                 # format the item for readability
                 object = self.format_object(image_objects[i])

From 2f9f80193f4bb7218090a94f3616eadb624bb40b Mon Sep 17 00:00:00 2001
From: Nicolas Grosjean
Date: Wed, 16 Jun 2021 16:06:15 +0200
Subject: [PATCH 18/31] Get more than 400 images

Fix clicking on the "Show more results" button with Selenium.
- The button no longer has the "smb" id
- We need to scroll down further before clicking
---
 google_images_download/google_images_download.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/google_images_download/google_images_download.py b/google_images_download/google_images_download.py
index 51bb251b..1c5e47d6 100755
--- a/google_images_download/google_images_download.py
+++ b/google_images_download/google_images_download.py
@@ -292,12 +292,12 @@ def download_extended_page(self, url, chromedriver):
 
         element = browser.find_element_by_tag_name("body")
         # Scroll down
-        for i in range(30):
+        for i in range(50):
             element.send_keys(Keys.PAGE_DOWN)
             time.sleep(0.3)
 
         try:
-            browser.find_element_by_id("smb").click()
+            browser.find_element_by_xpath('//input[@value="Show more results"]').click()
             for i in range(50):
                 element.send_keys(Keys.PAGE_DOWN)
                 time.sleep(0.3)  # bot id protection

From df2e289aa2e1c0273ebcd67300ecdcf50f5dc90e Mon Sep 17 00:00:00 2001
From: Matthew LeHew
Date: Wed, 30 Jun 2021 10:47:24 -0400
Subject: [PATCH 19/31] Fix JSONDecodeError: Extra Data

This may have been caused by Google changing their Ajax response. Looking at
the response, lines[4] only contained a single number and not any JSON.
Removing it and simply pulling from lines[3] seems to fix the issue. The
problem only manifested when downloading more than 100 images, which required
launching ChromeDriver.
---
 google_images_download/google_images_download.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/google_images_download/google_images_download.py b/google_images_download/google_images_download.py
index 51bb251b..bb941bf2 100755
--- a/google_images_download/google_images_download.py
+++ b/google_images_download/google_images_download.py
@@ -193,7 +193,7 @@ def _extract_data_pack_extended(self, page):
 
     def _extract_data_pack_ajax(self, data):
         lines = data.split('\n')
-        return json.loads(lines[3] + lines[4])[0][2]
+        return json.loads(lines[3])[0][2]
 
     def _image_objects_from_pack(self, data):
         image_objects = json.loads(data)[31][0][12][2]

From a8e28e265b83a63c5c8294bacd044251a9aa4d03 Mon Sep 17 00:00:00 2001
From: Nicolas Grosjean
Date: Wed, 25 Aug 2021 11:52:28 +0200
Subject: [PATCH 20/31] Manage API change

We extracted images from json.loads(data)[31][0]... because json.loads(data)[31]
used to be a list holding a single value. Now json.loads(data)[31] is a list of
two values and we want the last one. Replacing the index 0 with -1 handles the
new case, and also the old one should Google revert this change.
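
For reference, the resulting helper after this change (the magic indices come
from Google's packed JSON and may shift again):

    def _image_objects_from_pack(self, data):
        # Index 31 holds a wrapper list; [-1] takes its last entry, which
        # works whether it has one element (old layout) or two (new layout).
        image_objects = json.loads(data)[31][-1][12][2]
        # Keep only real image records; x[0] == 1 filters out null entries.
        image_objects = [x for x in image_objects if x[0] == 1]
        return image_objects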
---
 google_images_download/google_images_download.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/google_images_download/google_images_download.py b/google_images_download/google_images_download.py
index 51bb251b..0cc3057d 100755
--- a/google_images_download/google_images_download.py
+++ b/google_images_download/google_images_download.py
@@ -196,7 +196,7 @@ def _extract_data_pack_ajax(self, data):
         return json.loads(lines[3] + lines[4])[0][2]
 
     def _image_objects_from_pack(self, data):
-        image_objects = json.loads(data)[31][0][12][2]
+        image_objects = json.loads(data)[31][-1][12][2]
         image_objects = [x for x in image_objects if x[0] == 1]
         return image_objects
 

From 375b6bb6141a658777f87b7bf9f6a8444e07305e Mon Sep 17 00:00:00 2001
From: Nicolas Grosjean
Date: Mon, 20 Sep 2021 17:18:37 +0200
Subject: [PATCH 21/31] Fix time_range argument

The time range feature has changed; I used this tweet thread to fix it:
https://twitter.com/i/events/1174066444029419520. We could rework the
time_range format to avoid changing the "API".
---
 .../google_images_download.py | 21 ++++++++++---------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/google_images_download/google_images_download.py b/google_images_download/google_images_download.py
index 51bb251b..ad8443d1 100755
--- a/google_images_download/google_images_download.py
+++ b/google_images_download/google_images_download.py
@@ -108,7 +108,7 @@ def user_input():
         parser.add_argument('-w', '--time', help='image age', type=str, required=False,
                             choices=['past-24-hours', 'past-7-days', 'past-month', 'past-year'])
         parser.add_argument('-wr', '--time_range',
-                            help='time range for the age of the image. should be in the format {"time_min":"MM/DD/YYYY","time_max":"MM/DD/YYYY"}',
+                            help='time range for the age of the image. should be in the format {"time_min":"YYYY-MM-DD","time_max":"YYYY-MM-DD"}',
                             type=str, required=False)
         parser.add_argument('-a', '--aspect_ratio', help='comma separated additional words added to keywords', type=str,
                             required=False,
@@ -501,13 +501,6 @@ def build_url_parameters(self, arguments):
         else:
             lang_url = ''
 
-        if arguments['time_range']:
-            json_acceptable_string = arguments['time_range'].replace("'", "\"")
-            d = json.loads(json_acceptable_string)
-            time_range = ',cdr:1,cd_min:' + d['time_min'] + ',cd_max:' + d['time_max']
-        else:
-            time_range = ''
-
         if arguments['exact_size']:
             size_array = [x.strip() for x in arguments['exact_size'].split(',')]
             exact_size = ",isz:ex,iszw:" + str(size_array[0]) + ",iszh:" + str(size_array[1])
@@ -555,7 +548,7 @@ def build_url_parameters(self, arguments):
             else:
                 built_url = built_url + ',' + ext_param
                 counter += 1
-        built_url = lang_url + built_url + exact_size + time_range
+        built_url = lang_url + built_url + exact_size
         return built_url
 
     # building main search URL
@@ -1046,6 +1039,14 @@ def download_executor(self, arguments):
         if arguments['proxy']:
             os.environ["http_proxy"] = arguments['proxy']
             os.environ["https_proxy"] = arguments['proxy']
+
+        # Add time range to keywords if asked
+        time_range = ''
+        if arguments['time_range']:
+            json_acceptable_string = arguments['time_range'].replace("'", "\"")
+            d = json.loads(json_acceptable_string)
+            time_range = ' after:' + d['time_min'] + ' before:' + d['time_max']
+
 ######Initialization Complete
         total_errors = 0
         for pky in prefix_keywords:  # 1.for every prefix keywords
@@ -1059,7 +1060,7 @@ def download_executor(self, arguments):
                     print("Evaluating...")
                 else:
                     print("Downloading images for: " + (pky) + (search_keyword[i]) + (sky) + " ...")
-                search_term = pky + search_keyword[i] + sky
+                search_term = pky + search_keyword[i] + sky + time_range
 
                 if arguments['image_directory']:
                     dir_name = arguments['image_directory']

From a0c18fd9386e6f256cc8bbbd3abff523138aee32 Mon Sep 17 00:00:00 2001
From: Nicolas Grosjean
Date: Wed, 22 Sep 2021 16:39:06 +0200
Subject: [PATCH 22/31] Remove time range from directory names

It is not very useful to have the time range expression in the image
directory names.
---
 google_images_download/google_images_download.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/google_images_download/google_images_download.py b/google_images_download/google_images_download.py
index ad8443d1..d9e2ca7b 100755
--- a/google_images_download/google_images_download.py
+++ b/google_images_download/google_images_download.py
@@ -1060,7 +1060,7 @@ def download_executor(self, arguments):
                     print("Evaluating...")
                 else:
                     print("Downloading images for: " + (pky) + (search_keyword[i]) + (sky) + " ...")
-                search_term = pky + search_keyword[i] + sky + time_range
+                search_term = pky + search_keyword[i] + sky
 
                 if arguments['image_directory']:
                     dir_name = arguments['image_directory']
@@ -1076,6 +1076,7 @@ def download_executor(self, arguments):
 
                 params = self.build_url_parameters(arguments)  # building URL with params
 
+                search_term += time_range
                 url = self.build_search_url(search_term, params, arguments['url'], arguments['similar_images'],
                                             arguments['specific_site'],
                                             arguments['safe_search'])  # building main search url

From c773e1c7d12f30c24e793e1963035386764a6248 Mon Sep 17 00:00:00 2001
From: Joe Clinton
Date: Sun, 26 Sep 2021 15:37:58 +0200
Subject: [PATCH 23/31] Fix "None type error" by changing location of
 image_host string in info object.
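
A more defensive variant of the lookup is sketched below. This is an
illustration only, not what the patch applies; it assumes 'info' stays a
dict of stringly-keyed records, as in format_object today:

    # Prefer the new location of the host string; fall back to the old
    # key in case Google reverts the layout.
    try:
        formatted_object['image_host'] = info['2003'][17]
    except (KeyError, IndexError, TypeError):
        formatted_object['image_host'] = info.get('183836587', [''])[0]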
---
 google_images_download/google_images_download.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/google_images_download/google_images_download.py b/google_images_download/google_images_download.py
index e5769978..bbfbd47a 100755
--- a/google_images_download/google_images_download.py
+++ b/google_images_download/google_images_download.py
@@ -388,7 +388,7 @@ def format_object(self, object):
         formatted_object['image_link'] = main[0]
         formatted_object['image_format'] = main[0][-1 * (len(main[0]) - main[0].rfind(".") - 1):]
         formatted_object['image_description'] = info['2003'][3]
-        formatted_object['image_host'] = info['183836587'][0]
+        formatted_object['image_host'] = info['2003'][17]
         formatted_object['image_source'] = info['2003'][2]
         formatted_object['image_thumbnail_url'] = data[2][0]
     except Exception as e:

From 36e5c06681a7b2ae751d288c7fc1f26529846a89 Mon Sep 17 00:00:00 2001
From: Nicolas Grosjean
Date: Wed, 23 Feb 2022 09:12:10 +0100
Subject: [PATCH 24/31] Fix exact_size parameter #11

Update the URL building to the new way of getting the exact image size,
thanks to this article:
https://www.labnol.org/internet/google-image-size-search/26902/
---
 google_images_download/google_images_download.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/google_images_download/google_images_download.py b/google_images_download/google_images_download.py
index bbfbd47a..2920c813 100755
--- a/google_images_download/google_images_download.py
+++ b/google_images_download/google_images_download.py
@@ -501,11 +501,6 @@ def build_url_parameters(self, arguments):
         else:
             lang_url = ''
 
-        if arguments['exact_size']:
-            size_array = [x.strip() for x in arguments['exact_size'].split(',')]
-            exact_size = ",isz:ex,iszw:" + str(size_array[0]) + ",iszh:" + str(size_array[1])
-        else:
-            exact_size = ''
 
         built_url = "&tbs="
         counter = 0
@@ -548,7 +543,7 @@ def build_url_parameters(self, arguments):
             else:
                 built_url = built_url + ',' + ext_param
                 counter += 1
-        built_url = lang_url + built_url + exact_size
+        built_url = lang_url + built_url
         return built_url
 
     # building main search URL
@@ -1047,6 +1042,11 @@ def download_executor(self, arguments):
             d = json.loads(json_acceptable_string)
             time_range = ' after:' + d['time_min'] + ' before:' + d['time_max']
 
+        exact_size = ''
+        if arguments['exact_size']:
+            size_array = [x.strip() for x in arguments['exact_size'].split(',')]
+            exact_size = " imagesize:" + str(size_array[0]) + "x" + str(size_array[1])
+
 ######Initialization Complete
         total_errors = 0
         for pky in prefix_keywords:  # 1.for every prefix keywords
@@ -1076,7 +1076,7 @@ def download_executor(self, arguments):
 
                 params = self.build_url_parameters(arguments)  # building URL with params
 
-                search_term += time_range
+                search_term += time_range + exact_size
                 url = self.build_search_url(search_term, params, arguments['url'], arguments['similar_images'],
                                             arguments['specific_site'],
                                             arguments['safe_search'])  # building main search url

From cf190d8650ba79e63a44f9d1cdf386da1b023258 Mon Sep 17 00:00:00 2001
From: Lex Vorona
Date: Fri, 5 Aug 2022 13:11:58 -0700
Subject: [PATCH 25/31] Support Firefox

---
 .../google_images_download.py | 28 +++++++++++--------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/google_images_download/google_images_download.py b/google_images_download/google_images_download.py
index 2920c813..2fae83f4 100755
--- a/google_images_download/google_images_download.py
+++ b/google_images_download/google_images_download.py
@@ -42,7 +42,7 @@
              "exact_size",
"aspect_ratio", "type", "time", "time_range", "delay", "url", "single_image", "output_directory", "image_directory", "no_directory", "proxy", "similar_images", "specific_site", "print_urls", "print_size", "print_paths", "metadata", "extract_metadata", "socket_timeout", - "thumbnail", "thumbnail_only", "language", "prefix", "chromedriver", "related_images", "safe_search", + "thumbnail", "thumbnail_only", "language", "prefix", "chromedriver", "browser", "related_images", "safe_search", "no_numbering", "offset", "no_download", "save_source", "silent_mode", "ignore_urls"] @@ -148,6 +148,9 @@ def user_input(): parser.add_argument('-cd', '--chromedriver', help='specify the path to chromedriver executable in your local machine', type=str, required=False) + parser.add_argument('-wb', '--browser', + help='Specify which driver to use', type=str, + required=False) parser.add_argument('-ri', '--related_images', default=False, help="Downloads images that are similar to the keyword provided", action="store_true") parser.add_argument('-sa', '--safe_search', default=False, @@ -238,7 +241,7 @@ def download_page(self, url): sys.exit() # Download Page for more than 100 images - def download_extended_page(self, url, chromedriver): + def download_extended_page(self, url, chromedriver, browser): from selenium import webdriver from selenium.webdriver.common.keys import Keys if sys.version_info[0] < 3: @@ -248,13 +251,16 @@ def download_extended_page(self, url, chromedriver): options.add_argument('--no-sandbox') options.add_argument("--headless") - try: - browser = webdriver.Chrome(chromedriver, chrome_options=options) - except Exception as e: - print("Looks like we cannot locate the path the 'chromedriver' (use the '--chromedriver' " - "argument to specify the path to the executable.) or google chrome browser is not " - "installed on your machine (exception: %s)" % e) - sys.exit() + if browser == 'Firefox': + browser = webdriver.Firefox() + else: + try: + browser = webdriver.Chrome(chromedriver, chrome_options=options) + except Exception as e: + print("Looks like we cannot locate the path the 'chromedriver' (use the '--chromedriver' " + "argument to specify the path to the executable.) 
---
 .../google_images_download.py        | 13 +++++++++++--
 tests/test_google_images_download.py | 19 +++++++++++--------
 2 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/google_images_download/google_images_download.py b/google_images_download/google_images_download.py
index 2fae83f4..a5b08eb6 100755
--- a/google_images_download/google_images_download.py
+++ b/google_images_download/google_images_download.py
@@ -6,6 +6,7 @@
 # Import Libraries
 import sys
+import selenium.common.exceptions
 
 version = (3, 0)
 cur_version = sys.version_info
@@ -294,6 +295,14 @@ def download_extended_page(self, url, chromedriver, browser):
                 """)
             time.sleep(1)
+
+            # Bypass "Before you continue" if it appears
+            try:
+                browser.find_element_by_css_selector("[aria-label='Accept all']").click()
+                time.sleep(1)
+            except selenium.common.exceptions.NoSuchElementException:
+                pass
+
         print("Getting you a lot of images. This may take a few moments...")
 
         element = browser.find_element_by_tag_name("body")
@@ -318,8 +327,8 @@ def download_extended_page(self, url, chromedriver, browser):
         source = browser.page_source  # page source
         images = self._image_objects_from_pack(self._extract_data_pack_extended(source))
 
-        ajax_data = browser.execute_script("return XMLHttpRequest.prototype._data")
-        for chunk in ajax_data:
+        ajax_data = browser.execute_script("return XMLHttpRequest.prototype._data")  # I think this is broken
+        for chunk in ajax_data if ajax_data else []:
             images += self._image_objects_from_pack(self._extract_data_pack_ajax(chunk))
 
         # close the browser
diff --git a/tests/test_google_images_download.py b/tests/test_google_images_download.py
index ec62afd0..d16a76bc 100644
--- a/tests/test_google_images_download.py
+++ b/tests/test_google_images_download.py
@@ -15,26 +15,27 @@ def silent_remove_of_file(file):
 
 def test_download_images_to_default_location():
     start_time = time.time()
-    argumnets = {
+    arguments = {
         "keywords": "Polar bears",
-        "limit": 5,
-        "print_urls": False
+        "limit":101,
+        "print_urls": False,
+        "chromedriver": 'C:/Program Files (x86)/chromedriver/chromedriver.exe'
     }
     try:
-        temp = argumnets['output_folder']
+        temp = arguments['output_folder']
     except KeyError:
         pass
     else:
         assert False, "This test checks download to default location yet an output folder was provided"
-    output_folder_path = os.path.join(os.path.realpath('.'), 'downloads', '{}'.format(argumnets['keywords']))
+    output_folder_path = os.path.join(os.path.realpath('.'), 'downloads', '{}'.format(arguments['keywords']))
     if os.path.exists(output_folder_path):
         start_amount_of_files_in_output_folder = len([name for name in os.listdir(output_folder_path) if os.path.isfile(os.path.join(output_folder_path, name)) and os.path.getctime(os.path.join(output_folder_path, name)) < start_time])
     else:
         start_amount_of_files_in_output_folder = 0
     response = google_images_download.googleimagesdownload()
-    response.download(argumnets)
+    response.download(arguments)
     files_modified_after_test_started = [name for name in os.listdir(output_folder_path) if os.path.isfile(os.path.join(output_folder_path, name)) and os.path.getmtime(os.path.join(output_folder_path, name)) > start_time]
     end_amount_of_files_in_output_folder = len(files_modified_after_test_started)
     print(f"Files downloaded by test {__name__}:")
@@ -43,11 +44,13 @@ def test_download_images_to_default_location():
 
     # assert end_amount_of_files_in_output_folder - start_amount_of_files_in_output_folder == argumnets['limit']
-    assert end_amount_of_files_in_output_folder == argumnets['limit']
+    assert end_amount_of_files_in_output_folder == arguments['limit']
 
     print(f"Cleaning up all files downloaded by test {__name__}...")
     for file in files_modified_after_test_started:
         if silent_remove_of_file(os.path.join(output_folder_path, file)):
             print(f"Deleted {os.path.join(output_folder_path, file)}")
         else:
-            print(f"Failed to delete {os.path.join(output_folder_path, file)}")
\ No newline at end of file
+            print(f"Failed to delete {os.path.join(output_folder_path, file)}")
+
+test_download_images_to_default_location()
\ No newline at end of file

From dffca0899d391a4673181fce29284e4f65daca9a Mon Sep 17 00:00:00 2001
From: Ellis Brown
Date: Fri, 23 Sep 2022 23:31:33 +0000
Subject: [PATCH 27/31] fix breaking change due to google's response format
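
The new pack layout nests the image grid several levels deep and wraps
each image object in a single-entry dict. A more defensive descent
would degrade to an empty result instead of raising (a sketch; the
wrapper is illustrative and the index path is the one this patch
hard-codes):

    def _grid_or_empty(image_data):
        # Return the image grid, or [] when Google has shuffled the
        # structure again and the hard-coded path no longer applies.
        try:
            return image_data[56][-1][0][-1][-1][0]
        except (IndexError, KeyError, TypeError):
            return []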
---
 google_images_download/google_images_download.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/google_images_download/google_images_download.py b/google_images_download/google_images_download.py
index a5b08eb6..5985f0e9 100755
--- a/google_images_download/google_images_download.py
+++ b/google_images_download/google_images_download.py
@@ -199,9 +199,18 @@ def _extract_data_pack_ajax(self, data):
         lines = data.split('\n')
         return json.loads(lines[3])[0][2]
 
-    def _image_objects_from_pack(self, data):
-        image_objects = json.loads(data)[31][-1][12][2]
-        image_objects = [x for x in image_objects if x[0] == 1]
+    @staticmethod
+    def _image_objects_from_pack(data):
+        image_data = json.loads(data)
+        # NOTE: google sometimes changes their format, breaking this. set a breakpoint here to find the correct index
+        idx = 56
+        grid = image_data[idx][-1][0][-1][-1][0]
+        image_objects = []
+        for item in grid:
+            obj = list(item[0][0].values())[0]
+            # ads and carousels will be empty
+            if obj:
+                image_objects.append(obj)
         return image_objects
 
     # Downloading entire Web Document (Raw Page Content)

From 3f58a9a99589e17ee5ce718a27cbd56441a1fd49 Mon Sep 17 00:00:00 2001
From: Ellis Brown
Date: Fri, 23 Sep 2022 23:37:03 +0000
Subject: [PATCH 28/31] update error message to point to this PR
---
 google_images_download/google_images_download.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/google_images_download/google_images_download.py b/google_images_download/google_images_download.py
index 5985f0e9..4f1e31d9 100755
--- a/google_images_download/google_images_download.py
+++ b/google_images_download/google_images_download.py
@@ -247,7 +247,7 @@ def download_page(self, url):
             return self._image_objects_from_pack(self._extract_data_pack(respData)), self.get_all_tabs(respData)
         except Exception as e:
             print(e)
-            print('Image objects data unpacking failed. Please leave a comment with the above error at https://github.com/hardikvasa/google-images-download/pull/298')
+            print('Image objects data unpacking failed. Please leave a comment with the above error at https://github.com/Joeclinton1/google-images-download/pull/26')
             sys.exit()
 
     # Download Page for more than 100 images

From 219b850f41de9cdc71531fbbf4cbed2ae5eaf3bf Mon Sep 17 00:00:00 2001
From: Ellis Brown
Date: Sat, 24 Sep 2022 01:32:50 +0000
Subject: [PATCH 29/31] fix chromium downloads
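
With the test parameterized through argparse it can be invoked
directly from the command line, for example (keyword, limit, and
driver path are illustrative):

    python tests/test_google_images_download.py -k "Polar bears" -l 5 \
        -c /usr/local/bin/chromedriver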
---
 .../google_images_download.py        |  5 +++--
 tests/test_google_images_download.py | 20 +++++++++++--------
 2 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/google_images_download/google_images_download.py b/google_images_download/google_images_download.py
index 4f1e31d9..c5d82f24 100755
--- a/google_images_download/google_images_download.py
+++ b/google_images_download/google_images_download.py
@@ -209,8 +209,9 @@ def _image_objects_from_pack(data):
         for item in grid:
             obj = list(item[0][0].values())[0]
             # ads and carousels will be empty
-            if obj:
-                image_objects.append(obj)
+            if not obj or not obj[1]:
+                continue
+            image_objects.append(obj)
         return image_objects
 
     # Downloading entire Web Document (Raw Page Content)
diff --git a/tests/test_google_images_download.py b/tests/test_google_images_download.py
index d16a76bc..e9089945 100644
--- a/tests/test_google_images_download.py
+++ b/tests/test_google_images_download.py
@@ -1,3 +1,4 @@
+import argparse
 from google_images_download import google_images_download
 import os, errno
 import time
@@ -13,14 +14,8 @@ def silent_remove_of_file(file):
     return True
 
-def test_download_images_to_default_location():
+def test_download_images_to_default_location(arguments: dict):
     start_time = time.time()
-    arguments = {
-        "keywords": "Polar bears",
-        "limit":101,
-        "print_urls": False,
-        "chromedriver": 'C:/Program Files (x86)/chromedriver/chromedriver.exe'
-    }
     try:
         temp = arguments['output_folder']
     except KeyError:
         pass
@@ -53,4 +48,13 @@ def test_download_images_to_default_location():
         else:
             print(f"Failed to delete {os.path.join(output_folder_path, file)}")
 
-test_download_images_to_default_location()
\ No newline at end of file
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-k', '--keywords', type=str, help='delimited list input', default="Polar bears")
+    parser.add_argument('-l', '--limit', type=int, help='delimited list input', default=101)
+    parser.add_argument('-u', '--print_urls', action='store_true', help='print the URLs of the images')
+    parser.add_argument('-c', '--chromedriver', type=str, help='path to chromedriver executable in your local machine', default='C:/Program Files (x86)/chromedriver/chromedriver.exe')
+    args = parser.parse_args()
+    print(f"testing with args: {args}")
+
+    test_download_images_to_default_location(vars(args))

From 1421a434c0557e7a71e87732c913b9c8d11a92b7 Mon Sep 17 00:00:00 2001
From: Ellis Brown
Date: Mon, 26 Sep 2022 21:42:39 +0000
Subject: [PATCH 30/31] fix again after new update 9/26
---
 google_images_download/google_images_download.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/google_images_download/google_images_download.py b/google_images_download/google_images_download.py
index c5d82f24..247c8b71 100755
--- a/google_images_download/google_images_download.py
+++ b/google_images_download/google_images_download.py
@@ -203,11 +203,9 @@ def _extract_data_pack_ajax(self, data):
     def _image_objects_from_pack(data):
         image_data = json.loads(data)
         # NOTE: google sometimes changes their format, breaking this. set a breakpoint here to find the correct index
-        idx = 56
-        grid = image_data[idx][-1][0][-1][-1][0]
+        grid = image_data[31][0][12][2]
         image_objects = []
-        for item in grid:
-            obj = list(item[0][0].values())[0]
+        for obj in grid:
             # ads and carousels will be empty
             if not obj or not obj[1]:
                 continue

From 2e117f3043aa404630d4d4129e22066697f8814c Mon Sep 17 00:00:00 2001
From: Ellis Brown
Date: Fri, 30 Sep 2022 05:11:40 +0000
Subject: [PATCH 31/31] revert rollback from 9/26
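
Google has alternated between (at least) these two pack layouts over
the course of this series, so a tolerant parser could try both in
order (a rough sketch; _grid_candidates is a hypothetical helper, and
the unwrap step has to match the layout because the item shapes
differ):

    def _grid_candidates(image_data):
        # Newest layout first: grid items are single-entry dicts whose
        # value holds the image object.
        try:
            grid = image_data[56][-1][0][-1][-1][0]
            return [list(item[0][0].values())[0] for item in grid]
        except (IndexError, KeyError, TypeError):
            pass
        # Older layout: grid entries are the image objects themselves.
        try:
            return image_data[31][0][12][2]
        except (IndexError, KeyError, TypeError):
            return []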
---
 google_images_download/google_images_download.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/google_images_download/google_images_download.py b/google_images_download/google_images_download.py
index 247c8b71..eac2a5bf 100755
--- a/google_images_download/google_images_download.py
+++ b/google_images_download/google_images_download.py
@@ -203,9 +203,10 @@ def _extract_data_pack_ajax(self, data):
     def _image_objects_from_pack(data):
         image_data = json.loads(data)
         # NOTE: google sometimes changes their format, breaking this. set a breakpoint here to find the correct index
-        grid = image_data[31][0][12][2]
+        grid = image_data[56][-1][0][-1][-1][0]
         image_objects = []
-        for obj in grid:
+        for item in grid:
+            obj = list(item[0][0].values())[0]
             # ads and carousels will be empty
             if not obj or not obj[1]:
                 continue