diff --git a/README.rst b/README.rst
index b1e08f65..14d86228 100644
--- a/README.rst
+++ b/README.rst
@@ -23,7 +23,7 @@ The original creators of the images own the copyrights.
 Images published in the United States are automatically copyrighted by their owners,
 even if they do not explicitly carry a copyright warning.
-You may not reproduce copyright images without their owner's permission,
+You may not reproduce copyrighted images without their owner's permission,
 except in "fair use" cases,
-or you could risk running into lawyer's warnings, cease-and-desist letters, and copyright suits.
+or you could risk running into lawyers' warnings, cease-and-desist letters, and copyright suits.
 Please be very careful before its usage!
 Use this script/code only for educational purposes.
diff --git a/docs/arguments.rst b/docs/arguments.rst
index e18035c9..ead86ab1 100644
--- a/docs/arguments.rst
+++ b/docs/arguments.rst
@@ -85,7 +85,7 @@ Link to `Documentation Homepage <https://google-images-download.readthedocs.io>`_
 |            |      | `large, medium, icon, >400*300, >640*480, >800*600, >1024*768, >2MP, >4MP, >6MP, >8MP, >10MP, |
 |            |      | >12MP, >15MP, >20MP, >40MP, >70MP`                                                             |
diff --git a/docs/index.rst b/docs/index.rst
index 3e276d00..001fb252 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -127,7 +127,7 @@ Disclaimer
     Images published in the United States are automatically copyrighted by their owners,
     even if they do not explicitly carry a copyright warning.
-    You may not reproduce copyright images without their owner's permission,
+    You may not reproduce copyrighted images without their owner's permission,
     except in "fair use" cases,
-    or you could risk running into lawyer's warnings, cease-and-desist letters, and copyright suits.
+    or you could risk running into lawyers' warnings, cease-and-desist letters, and copyright suits.
     Please be very careful before its usage!
diff --git a/google_images_download/google_images_download.py b/google_images_download/google_images_download.py
index fd89a3a9..eac2a5bf 100755
--- a/google_images_download/google_images_download.py
+++ b/google_images_download/google_images_download.py
@@ -6,6 +6,8 @@
 # Import Libraries
 import sys
+import selenium.common.exceptions
+
 version = (3, 0)
 cur_version = sys.version_info
 if cur_version >= version:  # If the Current Version of Python is 3.0 or above
@@ -15,6 +17,7 @@
     from urllib.parse import quote
     import http.client
     from http.client import IncompleteRead, BadStatusLine
+    http.client._MAXHEADERS = 1000
 else:  # If the Current Version of Python is 2.x
     import urllib2
@@ -23,6 +26,7 @@
     from urllib import quote
     import httplib
     from httplib import IncompleteRead, BadStatusLine
+    httplib._MAXHEADERS = 1000
 import time  # Importing the time library to check the time of code execution
 import os
@@ -39,8 +43,9 @@
              "exact_size", "aspect_ratio", "type", "time", "time_range", "delay", "url", "single_image",
              "output_directory", "image_directory", "no_directory", "proxy", "similar_images", "specific_site",
              "print_urls", "print_size", "print_paths", "metadata", "extract_metadata", "socket_timeout",
-             "thumbnail", "thumbnail_only", "language", "prefix", "chromedriver", "related_images", "safe_search", "no_numbering",
-             "offset", "no_download","save_source","silent_mode","ignore_urls"]
+             "thumbnail", "thumbnail_only", "language", "prefix", "chromedriver", "browser", "related_images", "safe_search",
+             "no_numbering",
+             "offset", "no_download", "save_source", "silent_mode", "ignore_urls"]


 def user_input():
@@ -52,7 +57,7 @@ def user_input():
     if object_check['config_file'] != '':
         records = []
         json_file = json.load(open(config_file_check[0].config_file))
-        for record in range(0,len(json_file['Records'])):
+        for record in range(0, len(json_file['Records'])):
             arguments = {}
             for i in args_list:
                 arguments[i] = None
@@ -64,57 +69,107 @@ def user_input():
         # Taking command line arguments from users
         parser = argparse.ArgumentParser()
         parser.add_argument('-k', '--keywords', help='delimited list input', type=str, required=False)
-        parser.add_argument('-kf', '--keywords_from_file', help='extract list of keywords from a text file', type=str, required=False)
-        parser.add_argument('-sk', '--suffix_keywords', help='comma separated additional words added after to main keyword', type=str, required=False)
-        parser.add_argument('-pk', '--prefix_keywords', help='comma separated additional words added before main keyword', type=str, required=False)
+        parser.add_argument('-kf', '--keywords_from_file', help='extract list of keywords from a text file', type=str,
+                            required=False)
+        parser.add_argument('-sk', '--suffix_keywords',
+                            help='comma separated additional words added after the main keyword', type=str,
+                            required=False)
+        parser.add_argument('-pk', '--prefix_keywords',
+                            help='comma separated additional words added before the main keyword', type=str,
+                            required=False)
         parser.add_argument('-l', '--limit', help='delimited list input', type=str, required=False)
         parser.add_argument('-f', '--format', help='download images with specific format', type=str, required=False,
                             choices=['jpg', 'gif', 'png', 'bmp', 'svg', 'webp', 'ico'])
         parser.add_argument('-u', '--url', help='search with google image URL', type=str, required=False)
-        parser.add_argument('-x', '--single_image', help='downloading a single image from URL', type=str, required=False)
-        parser.add_argument('-o', '--output_directory', help='download images in a specific main directory', type=str, required=False)
-        parser.add_argument('-i', '--image_directory', help='download images in a specific sub-directory', type=str, required=False)
-        parser.add_argument('-n', '--no_directory', default=False, help='download images in the main directory but no sub-directory', action="store_true")
-        parser.add_argument('-d', '--delay', help='delay in seconds to wait between downloading two images', type=int, required=False)
+        parser.add_argument('-x', '--single_image', help='downloading a single image from URL', type=str,
+                            required=False)
+        parser.add_argument('-o', '--output_directory', help='download images in a specific main directory', type=str,
+                            required=False)
+        parser.add_argument('-i', '--image_directory', help='download images in a specific sub-directory', type=str,
+                            required=False)
+        parser.add_argument('-n', '--no_directory', default=False,
+                            help='download images in the main directory but no sub-directory', action="store_true")
+        parser.add_argument('-d', '--delay', help='delay in seconds to wait between downloading two images', type=int,
+                            required=False)
         parser.add_argument('-co', '--color', help='filter on color', type=str, required=False,
-                            choices=['red', 'orange', 'yellow', 'green', 'teal', 'blue', 'purple', 'pink', 'white', 'gray', 'black', 'brown'])
+                            choices=['red', 'orange', 'yellow', 'green', 'teal', 'blue', 'purple', 'pink', 'white',
+                                     'gray', 'black', 'brown'])
         parser.add_argument('-ct', '--color_type', help='filter on color', type=str, required=False,
                             choices=['full-color', 'black-and-white', 'transparent'])
         parser.add_argument('-r', '--usage_rights', help='usage rights', type=str, required=False,
-                            choices=['labeled-for-reuse-with-modifications','labeled-for-reuse','labeled-for-noncommercial-reuse-with-modification','labeled-for-nocommercial-reuse'])
+                            choices=['labeled-for-reuse-with-modifications', 'labeled-for-reuse',
+                                     'labeled-for-noncommercial-reuse-with-modification',
+                                     'labeled-for-nocommercial-reuse'])
         parser.add_argument('-s', '--size', help='image size', type=str, required=False,
-                            choices=['large','medium','icon','>400*300','>640*480','>800*600','>1024*768','>2MP','>4MP','>6MP','>8MP','>10MP','>12MP','>15MP','>20MP','>40MP','>70MP'])
-        parser.add_argument('-es', '--exact_size', help='exact image resolution "WIDTH,HEIGHT"', type=str, required=False)
+                            choices=['large', 'medium', 'icon', '>400*300', '>640*480', '>800*600', '>1024*768', '>2MP',
+                                     '>4MP', '>6MP', '>8MP', '>10MP', '>12MP', '>15MP', '>20MP', '>40MP', '>70MP'])
+        parser.add_argument('-es', '--exact_size', help='exact image resolution "WIDTH,HEIGHT"', type=str,
+                            required=False)
         parser.add_argument('-t', '--type', help='image type', type=str, required=False,
-                            choices=['face','photo','clipart','line-drawing','animated'])
+                            choices=['face', 'photo', 'clipart', 'line-drawing', 'animated'])
         parser.add_argument('-w', '--time', help='image age', type=str, required=False,
-                            choices=['past-24-hours','past-7-days','past-month','past-year'])
-        parser.add_argument('-wr', '--time_range', help='time range for the age of the image. should be in the format {"time_min":"MM/DD/YYYY","time_max":"MM/DD/YYYY"}', type=str, required=False)
-        parser.add_argument('-a', '--aspect_ratio', help='comma separated additional words added to keywords', type=str, required=False,
+                            choices=['past-24-hours', 'past-7-days', 'past-month', 'past-year'])
+        parser.add_argument('-wr', '--time_range',
+                            help='time range for the age of the image. should be in the format {"time_min":"YYYY-MM-DD","time_max":"YYYY-MM-DD"}',
+                            type=str, required=False)
+        parser.add_argument('-a', '--aspect_ratio', help='comma separated additional words added to keywords', type=str,
+                            required=False,
                             choices=['tall', 'square', 'wide', 'panoramic'])
-        parser.add_argument('-si', '--similar_images', help='downloads images very similar to the image URL you provide', type=str, required=False)
-        parser.add_argument('-ss', '--specific_site', help='downloads images that are indexed from a specific website', type=str, required=False)
-        parser.add_argument('-p', '--print_urls', default=False, help="Print the URLs of the images", action="store_true")
-        parser.add_argument('-ps', '--print_size', default=False, help="Print the size of the images on disk", action="store_true")
-        parser.add_argument('-pp', '--print_paths', default=False, help="Prints the list of absolute paths of the images",action="store_true")
-        parser.add_argument('-m', '--metadata', default=False, help="Print the metadata of the image", action="store_true")
-        parser.add_argument('-e', '--extract_metadata', default=False, help="Dumps all the logs into a text file", action="store_true")
-        parser.add_argument('-st', '--socket_timeout', default=False, help="Connection timeout waiting for the image to download", type=float)
-        parser.add_argument('-th', '--thumbnail', default=False, help="Downloads image thumbnail along with the actual image", action="store_true")
-        parser.add_argument('-tho', '--thumbnail_only', default=False, help="Downloads only thumbnail without downloading actual images", action="store_true")
-        parser.add_argument('-la', '--language', default=False, help="Defines the language filter. The search results are authomatically returned in that language", type=str, required=False,
-                            choices=['Arabic','Chinese (Simplified)','Chinese (Traditional)','Czech','Danish','Dutch','English','Estonian','Finnish','French','German','Greek','Hebrew','Hungarian','Icelandic','Italian','Japanese','Korean','Latvian','Lithuanian','Norwegian','Portuguese','Polish','Romanian','Russian','Spanish','Swedish','Turkish'])
-        parser.add_argument('-pr', '--prefix', default=False, help="A word that you would want to prefix in front of each image name", type=str, required=False)
+        parser.add_argument('-si', '--similar_images',
+                            help='downloads images very similar to the image URL you provide', type=str, required=False)
+        parser.add_argument('-ss', '--specific_site', help='downloads images that are indexed from a specific website',
+                            type=str, required=False)
+        parser.add_argument('-p', '--print_urls', default=False, help="Print the URLs of the images",
+                            action="store_true")
+        parser.add_argument('-ps', '--print_size', default=False, help="Print the size of the images on disk",
+                            action="store_true")
+        parser.add_argument('-pp', '--print_paths', default=False,
+                            help="Prints the list of absolute paths of the images", action="store_true")
+        parser.add_argument('-m', '--metadata', default=False, help="Print the metadata of the image",
+                            action="store_true")
+        parser.add_argument('-e', '--extract_metadata', default=False, help="Dumps all the logs into a text file",
+                            action="store_true")
+        parser.add_argument('-st', '--socket_timeout', default=False,
+                            help="Connection timeout waiting for the image to download", type=float)
+        parser.add_argument('-th', '--thumbnail', default=False,
+                            help="Downloads image thumbnail along with the actual image", action="store_true")
+        parser.add_argument('-tho', '--thumbnail_only', default=False,
+                            help="Downloads only thumbnail without downloading actual images", action="store_true")
+        parser.add_argument('-la', '--language', default=False,
+                            help="Defines the language filter. The search results are automatically returned in that language",
+                            type=str, required=False,
+                            choices=['Arabic', 'Chinese (Simplified)', 'Chinese (Traditional)', 'Czech', 'Danish',
+                                     'Dutch', 'English', 'Estonian', 'Finnish', 'French', 'German', 'Greek', 'Hebrew',
+                                     'Hungarian', 'Icelandic', 'Italian', 'Japanese', 'Korean', 'Latvian', 'Lithuanian',
+                                     'Norwegian', 'Portuguese', 'Polish', 'Romanian', 'Russian', 'Spanish', 'Swedish',
+                                     'Turkish'])
+        parser.add_argument('-pr', '--prefix', default=False,
+                            help="A word that you would want to prefix in front of each image name", type=str,
+                            required=False)
         parser.add_argument('-px', '--proxy', help='specify a proxy address and port', type=str, required=False)
-        parser.add_argument('-cd', '--chromedriver', help='specify the path to chromedriver executable in your local machine', type=str, required=False)
-        parser.add_argument('-ri', '--related_images', default=False, help="Downloads images that are similar to the keyword provided", action="store_true")
-        parser.add_argument('-sa', '--safe_search', default=False, help="Turns on the safe search filter while searching for images", action="store_true")
-        parser.add_argument('-nn', '--no_numbering', default=False, help="Allows you to exclude the default numbering of images", action="store_true")
+        parser.add_argument('-cd', '--chromedriver',
+                            help='specify the path to chromedriver executable in your local machine', type=str,
+                            required=False)
+        parser.add_argument('-wb', '--browser',
+                            help="specify which browser to use: 'Firefox' uses the Firefox webdriver, anything else falls back to chromedriver",
+                            type=str,
+                            required=False)
+        parser.add_argument('-ri', '--related_images', default=False,
+                            help="Downloads images that are similar to the keyword provided", action="store_true")
+        parser.add_argument('-sa', '--safe_search', default=False,
+                            help="Turns on the safe search filter while searching for images", action="store_true")
+        parser.add_argument('-nn', '--no_numbering', default=False,
+                            help="Allows you to exclude the default numbering of images", action="store_true")
         parser.add_argument('-of', '--offset', help="Where to start in the fetched links", type=str, required=False)
-        parser.add_argument('-nd', '--no_download', default=False, help="Prints the URLs of the images and/or thumbnails without downloading them", action="store_true")
-        parser.add_argument('-iu', '--ignore_urls', default=False, help="delimited list input of image urls/keywords to ignore", type=str)
-        parser.add_argument('-sil', '--silent_mode', default=False, help="Remains silent. Does not print notification messages on the terminal", action="store_true")
-        parser.add_argument('-is', '--save_source', help="creates a text file containing a list of downloaded images along with source page url", type=str, required=False)
+        parser.add_argument('-nd', '--no_download', default=False,
+                            help="Prints the URLs of the images and/or thumbnails without downloading them",
+                            action="store_true")
+        parser.add_argument('-iu', '--ignore_urls', default=False,
+                            help="delimited list input of image urls/keywords to ignore", type=str)
+        parser.add_argument('-sil', '--silent_mode', default=False,
+                            help="Remains silent. Does not print notification messages on the terminal",
+                            action="store_true")
+        parser.add_argument('-is', '--save_source',
+                            help="creates a text file containing a list of downloaded images along with source page url",
+                            type=str, required=False)

         args = parser.parse_args()
         arguments = vars(args)
@@ -127,43 +182,76 @@ class googleimagesdownload:
     def __init__(self):
         pass

+    def _extract_data_pack(self, page):
+        start_line = page.find("AF_initDataCallback({key: \\'ds:1\\'") - 10
+        start_object = page.find('[', start_line + 1)
+        end_object = page.rfind(']', 0, page.find('</script>', start_object + 1)) + 1
+        object_raw = str(page[start_object:end_object])
+        return bytes(object_raw, "utf-8").decode("unicode_escape")
+
+    def _extract_data_pack_extended(self, page):
+        start_line = page.find("AF_initDataCallback({key: 'ds:1'") - 10
+        start_object = page.find('[', start_line + 1)
+        end_object = page.rfind(']', 0, page.find('</script>', start_object + 1)) + 1
+        return str(page[start_object:end_object])
+
+    def _extract_data_pack_ajax(self, data):
+        lines = data.split('\n')
+        return json.loads(lines[3])[0][2]
+
+    @staticmethod
+    def _image_objects_from_pack(data):
+        image_data = json.loads(data)
+        # NOTE: google sometimes changes their format, breaking this. set a breakpoint here to find the correct index
+        grid = image_data[56][-1][0][-1][-1][0]
+        image_objects = []
+        for item in grid:
+            obj = list(item[0][0].values())[0]
+            # ads and carousels will be empty
+            if not obj or not obj[1]:
+                continue
+            image_objects.append(obj)
+        return image_objects
+
     # Downloading entire Web Document (Raw Page Content)
-    def download_page(self,url):
+    def download_page(self, url):
         version = (3, 0)
         cur_version = sys.version_info
+        headers = {}
+        headers[
+            'User-Agent'] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36"
         if cur_version >= version:  # If the Current Version of Python is 3.0 or above
             try:
-                headers = {}
-                headers['User-Agent'] = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
                 req = urllib.request.Request(url, headers=headers)
                 resp = urllib.request.urlopen(req)
                 respData = str(resp.read())
-                return respData
-            except Exception as e:
+            except:
                 print("Could not open URL. Please check your internet connection and/or ssl settings \n"
                       "If you are using proxy, make sure your proxy settings is configured correctly")
                 sys.exit()
         else:  # If the Current Version of Python is 2.x
             try:
-                headers = {}
-                headers['User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
                 req = urllib2.Request(url, headers=headers)
                 try:
                     response = urllib2.urlopen(req)
                 except URLError:  # Handling SSL certificate failed
                     context = ssl._create_unverified_context()
                     response = urlopen(req, context=context)
-                page = response.read()
-                return page
+                respData = response.read()
             except:
                 print("Could not open URL. Please check your internet connection and/or ssl settings \n"
                       "If you are using proxy, make sure your proxy settings is configured correctly")
                 sys.exit()
                 return "Page Not found"
-
+        try:
+            return self._image_objects_from_pack(self._extract_data_pack(respData)), self.get_all_tabs(respData)
+        except Exception as e:
+            print(e)
+            print('Image objects data unpacking failed. Please leave a comment with the above error at https://github.com/Joeclinton1/google-images-download/pull/26')
+            sys.exit()

     # Download Page for more than 100 images
-    def download_extended_page(self,url,chromedriver):
+    def download_extended_page(self, url, chromedriver, browser):
         from selenium import webdriver
         from selenium.webdriver.common.keys import Keys
         if sys.version_info[0] < 3:
@@ -173,28 +261,67 @@ def download_extended_page(self,url,chromedriver):
         options.add_argument('--no-sandbox')
         options.add_argument("--headless")

-        try:
-            browser = webdriver.Chrome(chromedriver, chrome_options=options)
-        except Exception as e:
-            print("Looks like we cannot locate the path the 'chromedriver' (use the '--chromedriver' "
-                  "argument to specify the path to the executable.) or google chrome browser is not "
-                  "installed on your machine (exception: %s)" % e)
-            sys.exit()
+        if browser == 'Firefox':
+            browser = webdriver.Firefox()
+        else:
+            try:
+                browser = webdriver.Chrome(chromedriver, chrome_options=options)
+            except Exception as e:
+                print("Looks like we cannot locate the path to the 'chromedriver' (use the '--chromedriver' "
+                      "argument to specify the path to the executable.) or google chrome browser is not "
+                      "installed on your machine (exception: %s)" % e)
+                sys.exit()
         browser.set_window_size(1024, 768)

         # Open the link
         browser.get(url)
+        browser.execute_script("""
+            (function(XHR){
+                "use strict";
+                var open = XHR.prototype.open;
+                var send = XHR.prototype.send;
+                var data = [];
+                XHR.prototype.open = function(method, url, async, user, pass) {
+                    this._url = url;
+                    open.call(this, method, url, async, user, pass);
+                }
+                XHR.prototype.send = function(data) {
+                    var self = this;
+                    var url = this._url;
+                    function stateChanged() {
+                        if (self.readyState == 4) {
+                            console.log("data available for: " + url)
+                            XHR.prototype._data.push(self.response);
+                        }
+                    }
+                    if (url.includes("/batchexecute?")) {
+                        this.addEventListener("readystatechange", stateChanged, false);
+                    }
+                    send.call(this, data);
+                };
+                XHR.prototype._data = [];
+            })(XMLHttpRequest);
+        """)
+        time.sleep(1)
+
+        # Bypass "Before you continue" if it appears
+        try:
+            browser.find_element_by_css_selector("[aria-label='Accept all']").click()
+            time.sleep(1)
+        except selenium.common.exceptions.NoSuchElementException:
+            pass

         print("Getting you a lot of images. This may take a few moments...")

         element = browser.find_element_by_tag_name("body")
         # Scroll down
-        for i in range(30):
+        for i in range(50):
             element.send_keys(Keys.PAGE_DOWN)
             time.sleep(0.3)

         try:
-            browser.find_element_by_id("smb").click()
+            browser.find_element_by_xpath('//input[@value="Show more results"]').click()
             for i in range(50):
                 element.send_keys(Keys.PAGE_DOWN)
                 time.sleep(0.3)  # bot id protection
@@ -206,29 +333,33 @@ def download_extended_page(self,url,chromedriver):
             print("Reached end of Page.")
         time.sleep(0.5)

-        source = browser.page_source #page source
-        #close the browser
-        browser.close()
+        source = browser.page_source  # page source
+        images = self._image_objects_from_pack(self._extract_data_pack_extended(source))
+
+        ajax_data = browser.execute_script("return XMLHttpRequest.prototype._data")  # I think this is broken
+        for chunk in ajax_data if ajax_data else []:
+            images += self._image_objects_from_pack(self._extract_data_pack_ajax(chunk))

-        return source
+        # close the browser
+        browser.close()

+        return images, self.get_all_tabs(source)

-    #Correcting the escape characters for python2
-    def replace_with_byte(self,match):
+    # Correcting the escape characters for python2
+    def replace_with_byte(self, match):
         return chr(int(match.group(0)[1:], 8))

-    def repair(self,brokenjson):
+    def repair(self, brokenjson):
         invalid_escape = re.compile(r'\\[0-7]{1,3}')  # up to 3 digits for byte values up to FF
         return invalid_escape.sub(self.replace_with_byte, brokenjson)

     # Finding 'Next Image' from the given raw page
-    def get_next_tab(self,s):
+    def get_next_tab(self, s):
         start_line = s.find('class="dtviD"')
         if start_line == -1:  # If no links are found then give an error!
             end_quote = 0
             link = "no_tabs"
-            return link,'',end_quote
+            return link, '', end_quote
         else:
             start_line = s.find('class="dtviD"')
             start_content = s.find('href="', start_line + 1)
@@ -247,16 +378,15 @@ def get_next_tab(self,s):
             if chars_end == -1:
                 updated_item_name = (url_item_name[chars + 5:]).replace("+", " ")
             else:
-                updated_item_name = (url_item_name[chars+5:chars_end]).replace("+", " ")
+                updated_item_name = (url_item_name[chars + 5:chars_end]).replace("+", " ")

             return url_item, updated_item_name, end_content

-
     # Getting all links with the help of '_images_get_next_image'
-    def get_all_tabs(self,page):
+    def get_all_tabs(self, page):
         tabs = {}
         while True:
-            item,item_name,end_content = self.get_next_tab(page)
+            item, item_name, end_content = self.get_next_tab(page)
             if item == "no_tabs":
                 break
             else:
@@ -268,23 +398,30 @@ def get_all_tabs(self,page):
                 page = page[end_content:]
         return tabs

-
-    #Format the object in readable format
-    def format_object(self,object):
+    # Format the object in readable format
+    def format_object(self, object):
+        data = object[1]
+        main = data[3]
+        info = data[9]
+        if info is None:
+            info = data[11]
         formatted_object = {}
-        formatted_object['image_format'] = object['ity']
-        formatted_object['image_height'] = object['oh']
-        formatted_object['image_width'] = object['ow']
-        formatted_object['image_link'] = object['ou']
-        formatted_object['image_description'] = object['pt']
-        formatted_object['image_host'] = object['rh']
-        formatted_object['image_source'] = object['ru']
-        formatted_object['image_thumbnail_url'] = object['tu']
+        try:
+            formatted_object['image_height'] = main[2]
+            formatted_object['image_width'] = main[1]
+            formatted_object['image_link'] = main[0]
+            formatted_object['image_format'] = main[0][-1 * (len(main[0]) - main[0].rfind(".") - 1):]
+            formatted_object['image_description'] = info['2003'][3]
+            formatted_object['image_host'] = info['2003'][17]
+            formatted_object['image_source'] = info['2003'][2]
+            formatted_object['image_thumbnail_url'] = data[2][0]
+        except Exception as e:
+            print(e)
+            return None
         return formatted_object

-
-    #function to download single image
-    def single_image(self,image_url):
+    # function to download single image
+    def single_image(self, image_url):
         main_directory = "downloads"
         extensions = (".jpg", ".gif", ".png", ".bmp", ".svg", ".webp", ".ico")
         url = image_url
@@ -322,14 +459,15 @@ def single_image(self,image_url):
             print("completed ====> " + image_name.encode('raw_unicode_escape').decode('utf-8'))
         return

-    def similar_images(self,similar_images):
+    def similar_images(self, similar_images):
         version = (3, 0)
         cur_version = sys.version_info
         if cur_version >= version:  # If the Current Version of Python is 3.0 or above
             try:
                 searchUrl = 'https://www.google.com/searchbyimage?site=search&sa=X&image_url=' + similar_images
                 headers = {}
-                headers['User-Agent'] = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
+                headers[
+                    'User-Agent'] = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"

                 req1 = urllib.request.Request(searchUrl, headers=headers)
                 resp1 = urllib.request.urlopen(req1)
@@ -351,7 +489,8 @@ def similar_images(self,similar_images):
             try:
                 searchUrl = 'https://www.google.com/searchbyimage?site=search&sa=X&image_url=' + similar_images
                 headers = {}
-                headers['User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
+                headers[
+                    'User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"

                 req1 = urllib2.Request(searchUrl, headers=headers)
                 resp1 = urllib2.urlopen(req1)
@@ -366,42 +505,57 @@ def similar_images(self,similar_images):
                 l3 = content.find('/search?sa=X&q=')
                 l4 = content.find(';', l3 + 19)
                 urll2 = content[l3 + 19:l4]
-                return(urll2)
+                return (urll2)
             except:
                 return "Cloud not connect to Google Images endpoint"

-    #Building URL parameters
-    def build_url_parameters(self,arguments):
+    # Building URL parameters
+    def build_url_parameters(self, arguments):
         if arguments['language']:
             lang = "&lr="
-            lang_param = {"Arabic":"lang_ar","Chinese (Simplified)":"lang_zh-CN","Chinese (Traditional)":"lang_zh-TW","Czech":"lang_cs","Danish":"lang_da","Dutch":"lang_nl","English":"lang_en","Estonian":"lang_et","Finnish":"lang_fi","French":"lang_fr","German":"lang_de","Greek":"lang_el","Hebrew":"lang_iw ","Hungarian":"lang_hu","Icelandic":"lang_is","Italian":"lang_it","Japanese":"lang_ja","Korean":"lang_ko","Latvian":"lang_lv","Lithuanian":"lang_lt","Norwegian":"lang_no","Portuguese":"lang_pt","Polish":"lang_pl","Romanian":"lang_ro","Russian":"lang_ru","Spanish":"lang_es","Swedish":"lang_sv","Turkish":"lang_tr"}
-            lang_url = lang+lang_param[arguments['language']]
+            lang_param = {"Arabic": "lang_ar", "Chinese (Simplified)": "lang_zh-CN",
+                          "Chinese (Traditional)": "lang_zh-TW", "Czech": "lang_cs", "Danish": "lang_da",
+                          "Dutch": "lang_nl", "English": "lang_en", "Estonian": "lang_et", "Finnish": "lang_fi",
+                          "French": "lang_fr", "German": "lang_de", "Greek": "lang_el", "Hebrew": "lang_iw ",
+                          "Hungarian": "lang_hu", "Icelandic": "lang_is", "Italian": "lang_it", "Japanese": "lang_ja",
+                          "Korean": "lang_ko", "Latvian": "lang_lv", "Lithuanian": "lang_lt", "Norwegian": "lang_no",
+                          "Portuguese": "lang_pt", "Polish": "lang_pl", "Romanian": "lang_ro", "Russian": "lang_ru",
+                          "Spanish": "lang_es", "Swedish": "lang_sv", "Turkish": "lang_tr"}
+            lang_url = lang + lang_param[arguments['language']]
         else:
             lang_url = ''

-        if arguments['time_range']:
-            json_acceptable_string = arguments['time_range'].replace("'", "\"")
-            d = json.loads(json_acceptable_string)
-            time_range = ',cdr:1,cd_min:' + d['time_min'] + ',cd_max:' + d['time_max']
-        else:
-            time_range = ''
-
-        if arguments['exact_size']:
-            size_array = [x.strip() for x in arguments['exact_size'].split(',')]
-            exact_size = ",isz:ex,iszw:" + str(size_array[0]) + ",iszh:" + str(size_array[1])
-        else:
-            exact_size = ''
-
         built_url = "&tbs="
         counter = 0
-        params = {'color':[arguments['color'],{'red':'ic:specific,isc:red', 'orange':'ic:specific,isc:orange', 'yellow':'ic:specific,isc:yellow', 'green':'ic:specific,isc:green', 'teal':'ic:specific,isc:teel', 'blue':'ic:specific,isc:blue', 'purple':'ic:specific,isc:purple', 'pink':'ic:specific,isc:pink', 'white':'ic:specific,isc:white', 'gray':'ic:specific,isc:gray', 'black':'ic:specific,isc:black', 'brown':'ic:specific,isc:brown'}],
-                  'color_type':[arguments['color_type'],{'full-color':'ic:color', 'black-and-white':'ic:gray','transparent':'ic:trans'}],
-                  'usage_rights':[arguments['usage_rights'],{'labeled-for-reuse-with-modifications':'sur:fmc','labeled-for-reuse':'sur:fc','labeled-for-noncommercial-reuse-with-modification':'sur:fm','labeled-for-nocommercial-reuse':'sur:f'}],
-                  'size':[arguments['size'],{'large':'isz:l','medium':'isz:m','icon':'isz:i','>400*300':'isz:lt,islt:qsvga','>640*480':'isz:lt,islt:vga','>800*600':'isz:lt,islt:svga','>1024*768':'visz:lt,islt:xga','>2MP':'isz:lt,islt:2mp','>4MP':'isz:lt,islt:4mp','>6MP':'isz:lt,islt:6mp','>8MP':'isz:lt,islt:8mp','>10MP':'isz:lt,islt:10mp','>12MP':'isz:lt,islt:12mp','>15MP':'isz:lt,islt:15mp','>20MP':'isz:lt,islt:20mp','>40MP':'isz:lt,islt:40mp','>70MP':'isz:lt,islt:70mp'}],
-                  'type':[arguments['type'],{'face':'itp:face','photo':'itp:photo','clipart':'itp:clipart','line-drawing':'itp:lineart','animated':'itp:animated'}],
-                  'time':[arguments['time'],{'past-24-hours':'qdr:d','past-7-days':'qdr:w','past-month':'qdr:m','past-year':'qdr:y'}],
-                  'aspect_ratio':[arguments['aspect_ratio'],{'tall':'iar:t','square':'iar:s','wide':'iar:w','panoramic':'iar:xw'}],
-                  'format':[arguments['format'],{'jpg':'ift:jpg','gif':'ift:gif','png':'ift:png','bmp':'ift:bmp','svg':'ift:svg','webp':'webp','ico':'ift:ico','raw':'ift:craw'}]}
+        params = {'color': [arguments['color'], {'red': 'ic:specific,isc:red', 'orange': 'ic:specific,isc:orange',
+                                                 'yellow': 'ic:specific,isc:yellow', 'green': 'ic:specific,isc:green',
+                                                 'teal': 'ic:specific,isc:teel', 'blue': 'ic:specific,isc:blue',
+                                                 'purple': 'ic:specific,isc:purple', 'pink': 'ic:specific,isc:pink',
+                                                 'white': 'ic:specific,isc:white', 'gray': 'ic:specific,isc:gray',
+                                                 'black': 'ic:specific,isc:black', 'brown': 'ic:specific,isc:brown'}],
+                  'color_type': [arguments['color_type'],
+                                 {'full-color': 'ic:color', 'black-and-white': 'ic:gray', 'transparent': 'ic:trans'}],
+                  'usage_rights': [arguments['usage_rights'],
+                                   {'labeled-for-reuse-with-modifications': 'sur:fmc', 'labeled-for-reuse': 'sur:fc',
+                                    'labeled-for-noncommercial-reuse-with-modification': 'sur:fm',
+                                    'labeled-for-nocommercial-reuse': 'sur:f'}],
+                  'size': [arguments['size'],
+                           {'large': 'isz:l', 'medium': 'isz:m', 'icon': 'isz:i', '>400*300': 'isz:lt,islt:qsvga',
+                            '>640*480': 'isz:lt,islt:vga', '>800*600': 'isz:lt,islt:svga',
+                            '>1024*768': 'visz:lt,islt:xga', '>2MP': 'isz:lt,islt:2mp', '>4MP': 'isz:lt,islt:4mp',
+                            '>6MP': 'isz:lt,islt:6mp', '>8MP': 'isz:lt,islt:8mp', '>10MP': 'isz:lt,islt:10mp',
+                            '>12MP': 'isz:lt,islt:12mp', '>15MP': 'isz:lt,islt:15mp', '>20MP': 'isz:lt,islt:20mp',
+                            '>40MP': 'isz:lt,islt:40mp', '>70MP': 'isz:lt,islt:70mp'}],
+                  'type': [arguments['type'], {'face': 'itp:face', 'photo': 'itp:photo', 'clipart': 'itp:clipart',
+                                               'line-drawing': 'itp:lineart', 'animated': 'itp:animated'}],
+                  'time': [arguments['time'], {'past-24-hours': 'qdr:d', 'past-7-days': 'qdr:w', 'past-month': 'qdr:m',
+                                               'past-year': 'qdr:y'}],
+                  'aspect_ratio': [arguments['aspect_ratio'],
+                                   {'tall': 'iar:t', 'square': 'iar:s', 'wide': 'iar:w', 'panoramic': 'iar:xw'}],
+                  'format': [arguments['format'],
+                             {'jpg': 'ift:jpg', 'gif': 'ift:gif', 'png': 'ift:png', 'bmp': 'ift:bmp', 'svg': 'ift:svg',
+                              'webp': 'webp', 'ico': 'ift:ico', 'raw': 'ift:craw'}]}
         for key, value in params.items():
             if value[0] is not None:
                 ext_param = value[1][value[0]]
@@ -413,13 +567,12 @@ def build_url_parameters(self,arguments):
                 else:
                     built_url = built_url + ',' + ext_param
                 counter += 1
-        built_url = lang_url+built_url+exact_size+time_range
+        built_url = lang_url + built_url
         return built_url

-
-    #building main search URL
-    def build_search_url(self,search_term,params,url,similar_images,specific_site,safe_search):
-        #check safe_search
+    # building main search URL
+    def build_search_url(self, search_term, params, url, similar_images, specific_site, safe_search):
+        # check safe_search
         safe_search_string = "&safe=active"
         # check the args and choose the URL
         if url:
@@ -430,20 +583,21 @@ def build_search_url(self,search_term,params,url,similar_images,specific_site,safe_search):
             url = 'https://www.google.com/search?q=' + keywordem + '&espv=2&biw=1366&bih=667&site=webhp&source=lnms&tbm=isch&sa=X&ei=XosDVaCXD8TasATItgE&ved=0CAcQ_AUoAg'
         elif specific_site:
             url = 'https://www.google.com/search?q=' + quote(
-                search_term.encode('utf-8')) + '&as_sitesearch=' + specific_site + '&espv=2&biw=1366&bih=667&site=webhp&source=lnms&tbm=isch' + params + '&sa=X&ei=XosDVaCXD8TasATItgE&ved=0CAcQ_AUoAg'
+                search_term.encode(
+                    'utf-8')) + '&as_sitesearch=' + specific_site + '&espv=2&biw=1366&bih=667&site=webhp&source=lnms&tbm=isch' + params + '&sa=X&ei=XosDVaCXD8TasATItgE&ved=0CAcQ_AUoAg'
         else:
             url = 'https://www.google.com/search?q=' + quote(
-                search_term.encode('utf-8')) + '&espv=2&biw=1366&bih=667&site=webhp&source=lnms&tbm=isch' + params + '&sa=X&ei=XosDVaCXD8TasATItgE&ved=0CAcQ_AUoAg'
+                search_term.encode(
+                    'utf-8')) + '&espv=2&biw=1366&bih=667&site=webhp&source=lnms&tbm=isch' + params + '&sa=X&ei=XosDVaCXD8TasATItgE&ved=0CAcQ_AUoAg'

-        #safe search check
+        # safe search check
         if safe_search:
             url = url + safe_search_string

         return url

-
-    #measures the file size
-    def file_size(self,file_path):
+    # measures the file size
+    def file_size(self, file_path):
         if os.path.isfile(file_path):
             file_info = os.stat(file_path)
             size = file_info.st_size
@@ -453,8 +607,8 @@ def file_size(self,file_path):
                 size /= 1024.0
             return size

-    #keywords from file
-    def keywords_from_file(self,file_name):
+    # keywords from file
+    def keywords_from_file(self, file_name):
         search_keyword = []
         with codecs.open(file_name, 'r', encoding='utf-8-sig') as f:
             if '.csv' in file_name:
@@ -476,13 +630,13 @@ def keywords_from_file(self,file_name):
         return search_keyword

     # make directories
-    def create_directories(self,main_directory, dir_name,thumbnail,thumbnail_only):
+    def create_directories(self, main_directory, dir_name, thumbnail, thumbnail_only):
         dir_name_thumbnail = dir_name + " - thumbnail"
         # make a search keyword directory
         try:
             if not os.path.exists(main_directory):
                 os.makedirs(main_directory)
-                time.sleep(0.2)
+                time.sleep(0.15)
                 path = (dir_name)
                 sub_directory = os.path.join(main_directory, path)
                 if not os.path.exists(sub_directory):
@@ -506,13 +660,13 @@ def create_directories(self,main_directory, dir_name,thumbnail,thumbnail_only):
             pass
         return

-
     # Download Image thumbnails
-    def download_image_thumbnail(self,image_url,main_directory,dir_name,return_image_name,print_urls,socket_timeout,print_size,no_download,save_source,img_src,ignore_urls):
+    def download_image_thumbnail(self, image_url, main_directory, dir_name, return_image_name, print_urls,
+                                 socket_timeout, print_size, no_download, save_source, img_src, ignore_urls):
         if print_urls or no_download:
             print("Image URL: " + image_url)
         if no_download:
-            return "success","Printed url without downloading"
+            return "success", "Printed url without downloading"
         try:
             req = Request(image_url, headers={
                 "User-Agent": "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"})
@@ -535,7 +689,7 @@ def download_image_thumbnail(self,image_url,main_directory,dir_name,return_image_name,print_urls,socket_timeout,print_size,no_download,save_source,img_src,ignore_urls):
                     output_file.close()
                     if save_source:
                         list_path = main_directory + "/" + save_source + ".txt"
-                        list_file = open(list_path,'a')
+                        list_file = open(list_path, 'a')
                         list_file.write(path + '\t' + img_src + '\n')
                         list_file.close()
                 except OSError as e:
@@ -573,9 +727,10 @@ def download_image_thumbnail(self,image_url,main_directory,dir_name,return_image_name,print_urls,socket_timeout,print_size,no_download,save_source,img_src,ignore_urls):
             download_message = "IOError on an image...trying next one..." + " Error: " + str(e)

         return download_status, download_message

-
     # Download Images
-    def download_image(self,image_url,image_format,main_directory,dir_name,count,print_urls,socket_timeout,prefix,print_size,no_numbering,no_download,save_source,img_src,silent_mode,thumbnail_only,format,ignore_urls):
+    def download_image(self, image_url, image_format, main_directory, dir_name, count, print_urls, socket_timeout,
+                       prefix, print_size, no_numbering, no_download, save_source, img_src, silent_mode, thumbnail_only,
+                       format, ignore_urls):
         if not silent_mode:
             if print_urls or no_download:
                 print("Image URL: " + image_url)
@@ -585,7 +740,7 @@
         if thumbnail_only:
             return "success", "Skipping image download...", str(image_url[(image_url.rfind('/')) + 1:]), image_url
         if no_download:
-            return "success","Printed url without downloading",None,image_url
+            return "success", "Printed url without downloading", None, image_url
         try:
             req = Request(image_url, headers={
                 "User-Agent": "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"})
@@ -598,29 +753,43 @@
                 response = urlopen(req, None, timeout)
                 data = response.read()
+                info = response.info()
                 response.close()

-                extensions = [".jpg", ".jpeg", ".gif", ".png", ".bmp", ".svg", ".webp", ".ico"]
-                # keep everything after the last '/'
-                image_name = str(image_url[(image_url.rfind('/')) + 1:])
-                if format:
-                    if not image_format or image_format != format:
-                        download_status = 'fail'
-                        download_message = "Wrong image format returned. Skipping..."
-                        return_image_name = ''
-                        absolute_path = ''
-                        return download_status, download_message, return_image_name, absolute_path
-
-                if image_format == "" or not image_format or "." + image_format not in extensions:
+                qmark = image_url.rfind('?')
+                if qmark == -1:
+                    qmark = len(image_url)
+                slash = image_url.rfind('/', 0, qmark) + 1
+                image_name = str(image_url[slash:qmark]).lower()
+
+                type = info.get_content_type()
+                if type == "image/jpeg" or type == "image/jpg":
+                    if not image_name.endswith(".jpg") and not image_name.endswith(".jpeg"):
+                        image_name += ".jpg"
+                elif type == "image/png":
+                    if not image_name.endswith(".png"):
+                        image_name += ".png"
+                elif type == "image/webp":
+                    if not image_name.endswith(".webp"):
+                        image_name += ".webp"
+                elif type == "image/gif":
+                    if not image_name.endswith(".gif"):
+                        image_name += ".gif"
+                elif type == "image/bmp" or type == "image/x-windows-bmp":
+                    if not image_name.endswith(".bmp"):
+                        image_name += ".bmp"
+                elif type == "image/x-icon" or type == "image/vnd.microsoft.icon":
+                    if not image_name.endswith(".ico"):
+                        image_name += ".ico"
+                elif type == "image/svg+xml":
+                    if not image_name.endswith(".svg"):
+                        image_name += ".svg"
+                else:
                     download_status = 'fail'
-                    download_message = "Invalid or missing image format. Skipping..."
+                    download_message = "Invalid image format '" + type + "'. Skipping..."
                     return_image_name = ''
                     absolute_path = ''
                     return download_status, download_message, return_image_name, absolute_path
-                elif image_name.lower().find("." + image_format) < 0:
-                    image_name = image_name + "." + image_format
-                else:
-                    image_name = image_name[:image_name.lower().find("." + image_format) + (len(image_format) + 1)]

                 # prefix name in image
                 if prefix:
@@ -639,7 +808,7 @@
                 output_file.close()
                 if save_source:
                     list_path = main_directory + "/" + save_source + ".txt"
-                    list_file = open(list_path,'a')
+                    list_file = open(list_path, 'a')
                     list_file.write(path + '\t' + img_src + '\n')
                     list_file.close()
                 absolute_path = os.path.abspath(path)
@@ -649,7 +818,7 @@
                 return_image_name = ''
                 absolute_path = ''

-            #return image name back to calling method to use it for thumbnail downloads
+            # return image name back to calling method to use it for thumbnail downloads
             download_status = 'success'
             download_message = "Completed Image ====> " + prefix + str(count) + "." + image_name
             return_image_name = prefix + str(count) + "." + image_name
@@ -670,7 +839,7 @@
             download_message = "URLError on an image...trying next one..." + " Error: " + str(e)
             return_image_name = ''
             absolute_path = ''
-
+
         except BadStatusLine as e:
             download_status = 'fail'
             download_message = "BadStatusLine on an image...trying next one..." + " Error: " + str(e)
@@ -707,70 +876,49 @@
             return_image_name = ''
             absolute_path = ''

-        return download_status,download_message,return_image_name,absolute_path
-
-
-    # Finding 'Next Image' from the given raw page
-    def _get_next_item(self,s):
-        start_line = s.find('rg_meta notranslate')
-        if start_line == -1:  # If no links are found then give an error!
-            end_quote = 0
-            link = "no_links"
-            return link, end_quote
-        else:
-            start_line = s.find('class="rg_meta notranslate">')
-            start_object = s.find('{', start_line + 1)
-            end_object = s.find('</div>', start_object + 1)
-            object_raw = str(s[start_object:end_object])
-            #remove escape characters based on python version
-            version = (3, 0)
-            cur_version = sys.version_info
-            if cur_version >= version:  #python3
-                try:
-                    object_decode = bytes(object_raw, "utf-8").decode("unicode_escape")
-                    final_object = json.loads(object_decode)
-                except:
-                    final_object = ""
-            else:  #python2
-                try:
-                    final_object = (json.loads(self.repair(object_raw)))
-                except:
-                    final_object = ""
-            return final_object, end_object
-
+        return download_status, download_message, return_image_name, absolute_path

-    # Getting all links with the help of '_images_get_next_image'
-    def _get_all_items(self,page,main_directory,dir_name,limit,arguments):
+    def _get_all_items(self, image_objects, main_directory, dir_name, limit, arguments):
         items = []
         abs_path = []
         errorCount = 0
         i = 0
         count = 1
-        while count < limit+1:
-            object, end_content = self._get_next_item(page)
-            if object == "no_links":
+        while count < limit + 1 and i < len(image_objects):
+            if len(image_objects) == 0:
+                print("no_links")
                 break
-            elif object == "":
-                page = page[end_content:]
-            elif arguments['offset'] and count < int(arguments['offset']):
+            # code added here to attempt to implement offset correctly
+            # was "count < int(arguments['offset'])" in hardikvasa code, this seems
+            # to be contrary to the implementation details.
+            elif arguments['offset'] and count <= int(arguments['offset']):
                 count += 1
-                page = page[end_content:]
+                # page = page[end_content:]
             else:
-                #format the item for readability
-                object = self.format_object(object)
+                # format the item for readability
+                object = self.format_object(image_objects[i])
                 if arguments['metadata']:
                     if not arguments["silent_mode"]:
                         print("\nImage Metadata: " + str(object))

-                #download the images
-                download_status,download_message,return_image_name,absolute_path = self.download_image(object['image_link'],object['image_format'],main_directory,dir_name,count,arguments['print_urls'],arguments['socket_timeout'],arguments['prefix'],arguments['print_size'],arguments['no_numbering'],arguments['no_download'],arguments['save_source'],object['image_source'],arguments["silent_mode"],arguments["thumbnail_only"],arguments['format'],arguments['ignore_urls'])
+                # download the images
+                download_status, download_message, return_image_name, absolute_path = self.download_image(
+                    object['image_link'], object['image_format'], main_directory, dir_name, count,
+                    arguments['print_urls'], arguments['socket_timeout'], arguments['prefix'], arguments['print_size'],
+                    arguments['no_numbering'], arguments['no_download'], arguments['save_source'],
+                    object['image_source'], arguments["silent_mode"], arguments["thumbnail_only"], arguments['format'],
+                    arguments['ignore_urls'])
                 if not arguments["silent_mode"]:
                     print(download_message)
                 if download_status == "success":

                     # download image_thumbnails
                     if arguments['thumbnail'] or arguments["thumbnail_only"]:
-                        download_status, download_message_thumbnail = self.download_image_thumbnail(object['image_thumbnail_url'],main_directory,dir_name,return_image_name,arguments['print_urls'],arguments['socket_timeout'],arguments['print_size'],arguments['no_download'],arguments['save_source'],object['image_source'],arguments['ignore_urls'])
+                        download_status, download_message_thumbnail = self.download_image_thumbnail(
+                            object['image_thumbnail_url'], main_directory, dir_name, return_image_name,
+                            arguments['print_urls'], arguments['socket_timeout'], arguments['print_size'],
+                            arguments['no_download'], arguments['save_source'], object['image_source'],
+                            arguments['ignore_urls'])
                         if not arguments["silent_mode"]:
                             print(download_message_thumbnail)

@@ -781,21 +929,18 @@
                 else:
                     errorCount += 1

-                #delay param
+                # delay param
                 if arguments['delay']:
                     time.sleep(int(arguments['delay']))
-
-                page = page[end_content:]
             i += 1
         if count < limit:
             print("\n\nUnfortunately all " + str(
                 limit) + " could not be downloaded because some images were not downloadable. " + str(
-                count-1) + " is all we got for this search filter!")
-        return items,errorCount,abs_path
-
+                count - 1) + " is all we got for this search filter!")
+        return items, errorCount, abs_path

     # Bulk Download
-    def download(self,arguments):
+    def download(self, arguments):
         paths_agg = {}
         # for input coming from other python files
         if __name__ != "__main__":
@@ -819,7 +964,7 @@
                     if arguments['print_paths']:
                         print(paths.encode('raw_unicode_escape').decode('utf-8'))
                 total_errors = total_errors + errors
-            return paths_agg,total_errors
+            return paths_agg, total_errors
         # if the calling file contains params directly
         else:
             paths, errors = self.download_executor(arguments)
@@ -839,7 +984,7 @@
                 print(paths.encode('raw_unicode_escape').decode('utf-8'))
             return paths_agg, errors

-    def download_executor(self,arguments):
+    def download_executor(self, arguments):
         paths = {}
         errorCount = None
         for arg in args_list:
@@ -854,11 +999,13 @@

         # both time and time range should not be allowed in the same query
         if arguments['time'] and arguments['time_range']:
-            raise ValueError('Either time or time range should be used in a query. Both cannot be used at the same time.')
+            raise ValueError(
+                'Either time or time range should be used in a query. Both cannot be used at the same time.')

         # both time and time range should not be allowed in the same query
         if arguments['size'] and arguments['exact_size']:
-            raise ValueError('Either "size" or "exact_size" should be used in a query. Both cannot be used at the same time.')
+            raise ValueError(
+                'Either "size" or "exact_size" should be used in a query. Both cannot be used at the same time.')

         # both image directory and no image directory should not be allowed in the same query
         if arguments['image_directory'] and arguments['no_directory']:
@@ -892,7 +1039,7 @@
         # If single_image or url argument not present then keywords is mandatory argument
         if arguments['single_image'] is None and arguments['url'] is None and arguments['similar_images'] is None and \
-                        arguments['keywords'] is None and arguments['keywords_from_file'] is None:
+                arguments['keywords'] is None and arguments['keywords_from_file'] is None:
             print('-------------------------------\n'
                   'Uh oh! Keywords is a required argument \n\n'
                   'Please refer to the documentation on guide to writing queries \n'
@@ -911,13 +1058,27 @@
         if arguments['proxy']:
             os.environ["http_proxy"] = arguments['proxy']
             os.environ["https_proxy"] = arguments['proxy']
+
+        # Add time range to keywords if asked
+        time_range = ''
+        if arguments['time_range']:
+            json_acceptable_string = arguments['time_range'].replace("'", "\"")
+            d = json.loads(json_acceptable_string)
+            time_range = ' after:' + d['time_min'] + ' before:' + d['time_max']
+
+        exact_size = ''
+        if arguments['exact_size']:
+            size_array = [x.strip() for x in arguments['exact_size'].split(',')]
+            exact_size = " imagesize:" + str(size_array[0]) + "x" + str(size_array[1])
+
         ######Initialization Complete
         total_errors = 0
-        for pky in prefix_keywords:                 # 1.for every prefix keywords
-            for sky in suffix_keywords:             # 2.for every suffix keywords
+        for pky in prefix_keywords:  # 1.for every prefix keywords
+            for sky in suffix_keywords:  # 2.for every suffix keywords
                 i = 0
-                while i < len(search_keyword):      # 3.for every main keyword
-                    iteration = "\n" + "Item no.: " + str(i + 1) + " -->" + " Item name = " + (pky) + (search_keyword[i]) + (sky)
+                while i < len(search_keyword):  # 3.for every main keyword
+                    iteration = "\n" + "Item no.: " + str(i + 1) + " -->" + " Item name = " + (pky) + (
+                        search_keyword[i]) + (sky)
                     if not arguments["silent_mode"]:
                         print(iteration.encode('raw_unicode_escape').decode('utf-8'))
                         print("Evaluating...")
@@ -930,52 +1091,58 @@
                    elif arguments['no_directory']:
                         dir_name = ''
                     else:
-                        dir_name = search_term + ('-' + arguments['color'] if arguments['color'] else '')   #sub-directory
+                        dir_name = search_term + (
+                            '-' + arguments['color'] if arguments['color'] else '')  # sub-directory

                     if not arguments["no_download"]:
-                        self.create_directories(main_directory,dir_name,arguments['thumbnail'],arguments['thumbnail_only'])   #create directories in OS
+                        self.create_directories(main_directory, dir_name, arguments['thumbnail'],
+                                                arguments['thumbnail_only'])  # create directories in OS

-                    params = self.build_url_parameters(arguments)   #building URL with params
+                    params = self.build_url_parameters(arguments)  # building URL with params

-                    url = self.build_search_url(search_term,params,arguments['url'],arguments['similar_images'],arguments['specific_site'],arguments['safe_search'])   #building main search url
+                    search_term += time_range + exact_size
+                    url = self.build_search_url(search_term, params, arguments['url'], arguments['similar_images'],
+                                                arguments['specific_site'],
+                                                arguments['safe_search'])  # building main search url

                     if limit < 101:
-                        raw_html = self.download_page(url)  # download page
+                        images, tabs = self.download_page(url)  # download page
                     else:
-                        raw_html = self.download_extended_page(url,arguments['chromedriver'])
+                        images, tabs = self.download_extended_page(url, arguments['chromedriver'], arguments['browser'])

                     if not arguments["silent_mode"]:
                         if arguments['no_download']:
                             print("Getting URLs without downloading images...")
                         else:
                             print("Starting Download...")
-                    items,errorCount,abs_path = self._get_all_items(raw_html,main_directory,dir_name,limit,arguments)   #get all image items and download images
+                    items, errorCount, abs_path = self._get_all_items(images, main_directory, dir_name, limit,
+                                                                      arguments)  # get all image items and download images
                     paths[pky + search_keyword[i] + sky] = abs_path

-                    #dumps into a json file
+                    # dumps into a json file
                     if arguments['extract_metadata']:
                         try:
                             if not os.path.exists("logs"):
                                 os.makedirs("logs")
                         except OSError as e:
                             print(e)
-                        json_file = open("logs/"+search_keyword[i]+".json", "w")
+                        json_file = open("logs/" + search_keyword[i] + ".json", "w")
                         json.dump(items, json_file, indent=4, sort_keys=True)
                         json_file.close()

-                    #Related images
+                    # Related images
                     if arguments['related_images']:
                         print("\nGetting list of related keywords...this may take a few moments")
-                        tabs = self.get_all_tabs(raw_html)
                         for key, value in tabs.items():
                             final_search_term = (search_term + " - " + key)
                             print("\nNow Downloading - " + final_search_term)
                             if limit < 101:
-                                new_raw_html = self.download_page(value)  # download page
+                                images, _ = self.download_page(value)  # download page
                             else:
-                                new_raw_html = self.download_extended_page(value,arguments['chromedriver'])
-                            self.create_directories(main_directory, final_search_term,arguments['thumbnail'],arguments['thumbnail_only'])
-                            self._get_all_items(new_raw_html, main_directory, search_term + " - " + key, limit,arguments)
+                                images, _ = self.download_extended_page(value, arguments['chromedriver'],
+                                                                        arguments['browser'])
+                            self.create_directories(main_directory, final_search_term, arguments['thumbnail'],
+                                                    arguments['thumbnail_only'])
+                            self._get_all_items(images, main_directory, search_term + " - " + key, limit, arguments)

                     i += 1
                     total_errors = total_errors + errorCount
@@ -983,7 +1150,8 @@
                     print("\nErrors: " + str(errorCount) + "\n")
         return paths, total_errors

-#------------- Main Program -------------#
+
+# ------------- Main Program -------------#
 def main():
     records = user_input()
     total_errors = 0
@@ -995,7 +1163,7 @@ def main():
             response.single_image(arguments['single_image'])
         else:  # or download multiple images based on keywords/keyphrase search
             response = googleimagesdownload()
-            paths,errors = response.download(arguments)   #wrapping response in a variable just for consistency
+            paths, errors = response.download(arguments)  # wrapping response in a variable just for consistency
             total_errors = total_errors + errors

         t1 = time.time()  # stop the timer
@@ -1005,7 +1173,6 @@ def main():
     print("Total errors: " + str(total_errors))
     print("Total time taken: " + str(total_time) + " Seconds")

+
 if __name__ == "__main__":
     main()
-
-# In[ ]:
diff --git a/tests/test_google_images_download.py b/tests/test_google_images_download.py
index ec62afd0..e9089945 100644
--- a/tests/test_google_images_download.py
+++ b/tests/test_google_images_download.py
@@ -1,3 +1,4 @@
+import argparse
 from google_images_download import google_images_download
 import os, errno
 import time
@@ -13,28 +14,23 @@ def silent_remove_of_file(file):
     return True


-def test_download_images_to_default_location():
+def test_download_images_to_default_location(arguments: dict):
     start_time = time.time()
-    argumnets = {
-        "keywords": "Polar bears",
-        "limit": 5,
-        "print_urls": False
-    }
     try:
-        temp = argumnets['output_folder']
+        temp = arguments['output_folder']
     except KeyError:
         pass
     else:
         assert False, "This test checks download to default location yet an output folder was provided"
-    output_folder_path = os.path.join(os.path.realpath('.'), 'downloads', '{}'.format(argumnets['keywords']))
+    output_folder_path = os.path.join(os.path.realpath('.'), 'downloads', '{}'.format(arguments['keywords']))
     if os.path.exists(output_folder_path):
         start_amount_of_files_in_output_folder = len([name for name in os.listdir(output_folder_path) if
                                                       os.path.isfile(os.path.join(output_folder_path, name)) and
                                                       os.path.getctime(os.path.join(output_folder_path, name)) < start_time])
     else:
         start_amount_of_files_in_output_folder = 0
     response = google_images_download.googleimagesdownload()
-    response.download(argumnets)
+    response.download(arguments)
     files_modified_after_test_started = [name for name in os.listdir(output_folder_path) if
                                          os.path.isfile(os.path.join(output_folder_path, name)) and
                                          os.path.getmtime(os.path.join(output_folder_path, name)) > start_time]
     end_amount_of_files_in_output_folder = len(files_modified_after_test_started)
     print(f"Files downloaded by test {__name__}:")
@@ -43,11 +39,22 @@

     # assert end_amount_of_files_in_output_folder - start_amount_of_files_in_output_folder == argumnets['limit']
-    assert end_amount_of_files_in_output_folder == argumnets['limit']
+    assert end_amount_of_files_in_output_folder == arguments['limit']

     print(f"Cleaning up all files downloaded by test {__name__}...")
     for file in files_modified_after_test_started:
         if silent_remove_of_file(os.path.join(output_folder_path, file)):
             print(f"Deleted {os.path.join(output_folder_path, file)}")
         else:
-            print(f"Failed to delete {os.path.join(output_folder_path, file)}")
\ No newline at end of file
+            print(f"Failed to delete {os.path.join(output_folder_path, file)}")
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-k', '--keywords', type=str, help='delimited list input', default="Polar bears")
+    parser.add_argument('-l', '--limit', type=int, help='delimited list input', default=101)
+    parser.add_argument('-u', '--print_urls', action='store_true', help='print the URLs of the images')
+    parser.add_argument('-c', '--chromedriver', type=str, help='path to chromedriver executable in your local machine',
+                        default='C:/Program Files (x86)/chromedriver/chromedriver.exe')
+    args = parser.parse_args()
+    print(f"testing with args: {args}")
+
+    test_download_images_to_default_location(vars(args))
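
For reviewers, a minimal usage sketch of the patched entry points, assuming illustrative values throughout (the keyword, limit, dates, and driver paths below are placeholders, not values taken from this patch). The key behavioural change to keep in mind: build_url_parameters no longer encodes time_range and exact_size into the &tbs= URL parameter; download_executor now appends them to the search term itself as the after:/before: and imagesize:WxH search operators, which is why time_range dates must be given as YYYY-MM-DD.

    # Sketch only: exercises the reworked code paths under assumed values.
    from google_images_download import google_images_download

    response = google_images_download.googleimagesdownload()
    paths, errors = response.download({
        "keywords": "Polar bears",         # illustrative query
        "limit": 120,                      # >100 takes the selenium path via download_extended_page
        "browser": "Firefox",              # new -wb/--browser flag; needs geckodriver on PATH,
                                           # any other value falls back to chromedriver
        "chromedriver": "/usr/local/bin/chromedriver",  # hypothetical path; only consulted on the Chrome path
        "time_range": '{"time_min":"2020-01-01","time_max":"2020-06-30"}',
        "exact_size": "1920,1080",         # appended to the query as imagesize:1920x1080
    })

The same run from the command line, using the new flag:

    python google_images_download/google_images_download.py -k "Polar bears" -l 120 -wb Firefox \
        -wr '{"time_min":"2020-01-01","time_max":"2020-06-30"}' -es 1920,1080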