From 439efe5e3f9228a3e624d28c45f0181687b27786 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Tue, 7 Jan 2025 09:51:06 +0100 Subject: [PATCH 1/5] psycopg2 is definitely required guys... --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 888f97cb..cbbf4ed2 100644 --- a/setup.py +++ b/setup.py @@ -42,7 +42,7 @@ "praw~=7.0", "prawcore~=2.0", "psutil~=5.0", - #"psycopg2~=2.9.0", + "psycopg2~=2.9.0", "pyahocorasick~=1.4.0", "PyMySQL~=1.0", "PyTumblr==0.1.0", From d69a0c3c58d7dba29e9ad477de804cdd09564b95 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Tue, 7 Jan 2025 10:38:10 +0100 Subject: [PATCH 2/5] blip: cleaner error messages (and warnings to admins); filenames for images without metadata (e.g., via imports) --- common/lib/dmi_service_manager.py | 28 +++++++++---------- .../machine_learning/blip2_image_caption.py | 20 +++++++++---- 2 files changed, 29 insertions(+), 19 deletions(-) diff --git a/common/lib/dmi_service_manager.py b/common/lib/dmi_service_manager.py index c88d669f..d4991085 100644 --- a/common/lib/dmi_service_manager.py +++ b/common/lib/dmi_service_manager.py @@ -155,7 +155,7 @@ def send_request_and_wait_for_results(self, service_endpoint, data, wait_period= existing_service = self.check_service_exists() if existing_service: if len(existing_service) > 1: - raise Exception("Multiple services found with the same dataset key.") + raise DmiServiceManagerException("Multiple services found with the same dataset key.") else: existing_service = existing_service[0] if existing_service['status'] == 'complete': @@ -176,7 +176,7 @@ def send_request_and_wait_for_results(self, service_endpoint, data, wait_period= try: resp = requests.post(api_endpoint, json=data, timeout=30) except requests.exceptions.ConnectionError as e : - raise DmiServiceManagerException(f"Unable to connect to DMI Service Manager server: {str(e)}") + raise DsmConnectionError(f"Unable to connect to DMI Service Manager server: {str(e)}") if resp.status_code == 202: # New request successful @@ -186,10 +186,10 @@ def send_request_and_wait_for_results(self, service_endpoint, data, wait_period= resp_json = resp.json() if resp.status_code == 400 and 'key' in resp_json and 'error' in resp_json and resp_json['error'] == f"future_key {resp_json['key']} already exists": # Request already exists; get DMI SM database key - raise Exception(f"Request already exists; check that DMI SM is up to date") + raise DmiServiceManagerException(f"Request already exists; check that DMI SM is up to date") elif resp.status_code == 404: # Could be local vs remote not set correctly - raise DmiServiceManagerException(f"404: {resp.url} not found; DMI Service Manager may not be set up for this service") + raise DsmConnectionError(f"404: {resp.url} not found; DMI Service Manager may not be set up for this service") else: raise DmiServiceManagerException(f"DMI Service Manager error: {str(resp.status_code)}: {str(resp_json)}") except JSONDecodeError: @@ -221,14 +221,14 @@ def send_request_and_wait_for_results(self, service_endpoint, data, wait_period= # Have seen the Service Manager fail particularly when another processor is uploading many consecutive files connection_error += 1 if connection_error > 3: - raise DmiServiceManagerException(f"Unable to connect to DMI Service Manager server: {str(e)}") + raise DsmConnectionError(f"Unable to connect to DMI Service Manager server: {str(e)}") continue if result.status_code != 200 or (result.json and result.json().get('status') != "success"): # Unexpected response from DMI SM connection_error += 1 if connection_error > 3: - raise DmiServiceManagerException(f"Unable to connect to DMI Service Manager server: {str(result.status_code)}: {str(result.json()) if 'json' in result.headers.get('Content-Type', '') else str(result.text)}") + raise DsmConnectionError(f"Unable to connect to DMI Service Manager server: {str(result.status_code)}: {str(result.json()) if 'json' in result.headers.get('Content-Type', '') else str(result.text)}") continue service_status = result.json()["job"] @@ -255,7 +255,7 @@ def send_request_and_wait_for_results(self, service_endpoint, data, wait_period= elif service_status['status'] in ["complete", "error"]: results = json.loads(service_status['results']) if not results: - # This should not be the case is the service was written well (unless the DMI SM crashed?) + # This should not be the case if the service was written well (unless the DMI SM crashed?) #TODO test if timing issue? connection_error += 1 if connection_error > 3: @@ -268,7 +268,7 @@ def send_request_and_wait_for_results(self, service_endpoint, data, wait_period= else: error = results['error'] if "CUDA error: out of memory" in error: - raise DmiServiceManagerException("DMI Service Manager server ran out of memory; try reducing the number of files processed at once or waiting until the server is less busy.") + raise DsmOutOfMemory("DMI Service Manager server ran out of memory; try reducing the number of files processed at once or waiting until the server is less busy.") else: raise DmiServiceManagerException(f"Error {service_endpoint}: " + error) else: @@ -308,14 +308,14 @@ def request_folder_files(self, folder_name): except requests.exceptions.ConnectionError as e: retries += 1 if retries > 3: - raise DmiServiceManagerException(f"Connection Error {e} (retries {retries}) while downloading files from: {folder_name}") + raise DsmConnectionError(f"Connection Error {e} (retries {retries}) while downloading files from: {folder_name}") continue # Check if 4CAT has access to this server if filename_response.status_code == 403: - raise DmiServiceManagerException("403: 4CAT does not have permission to use the DMI Service Manager server") + raise DsmConnectionError("403: 4CAT does not have permission to use the DMI Service Manager server") elif filename_response.status_code in [400, 405]: - raise DmiServiceManagerException(f"400: DMI Service Manager server {filename_response.json()['reason']}") + raise DsmConnectionError(f"400: DMI Service Manager server {filename_response.json()['reason']}") elif filename_response.status_code == 404: # Folder not found; no files return {} @@ -388,9 +388,9 @@ def send_files(self, file_collection_name, results_name, files_to_upload, dir_wi self.processor.dataset.update_status(f"Uploaded {files_uploaded} of {total_files_to_upload} files!") self.processor.dataset.update_progress(files_uploaded / total_files_to_upload) elif response.status_code == 403: - raise DmiServiceManagerException("403: 4CAT does not have permission to use the DMI Service Manager server") + raise DsmConnectionError("403: 4CAT does not have permission to use the DMI Service Manager server") elif response.status_code == 405: - raise DmiServiceManagerException("405: Method not allowed; check DMI Service Manager server address (perhaps http is being used instead of https)") + raise DsmConnectionError("405: Method not allowed; check DMI Service Manager server address (perhaps http is being used instead of https)") else: self.processor.dataset.log(f"Unable to upload file ({response.status_code} - {response.reason}): {upload_file}") @@ -432,7 +432,7 @@ def download_results(self, filenames_to_download, folder_name, local_output_dir, except requests.exceptions.ConnectionError as e: retries += 1 if retries > 3: - raise DmiServiceManagerException(f"Connection Error {e} (retries {retries}) while downloading file: {folder_name}/{filename}") + raise DsmConnectionError(f"Connection Error {e} (retries {retries}) while downloading file: {folder_name}/{filename}") continue files_downloaded += 1 if files_downloaded % 1000 == 0: diff --git a/processors/machine_learning/blip2_image_caption.py b/processors/machine_learning/blip2_image_caption.py index 0287c2b5..5898ec59 100644 --- a/processors/machine_learning/blip2_image_caption.py +++ b/processors/machine_learning/blip2_image_caption.py @@ -6,7 +6,7 @@ from backend.lib.processor import BasicProcessor -from common.lib.dmi_service_manager import DmiServiceManager, DmiServiceManagerException, DsmOutOfMemory +from common.lib.dmi_service_manager import DmiServiceManager, DmiServiceManagerException, DsmOutOfMemory, DsmConnectionError from common.lib.exceptions import ProcessorInterruptedException from common.lib.user_input import UserInput from common.config_manager import config @@ -183,8 +183,15 @@ def process(self): self.dataset.finish_with_error( "DMI Service Manager ran out of memory; Try decreasing the number of images or try again or try again later.") return + except DsmConnectionError as e: + self.dataset.log(str(e)) + self.log.warning(f"DMI Service Manager connection error ({self.dataset.key}): {e}") + self.dataset.finish_with_error("DMI Service Manager connection error; please contact 4CAT admins.") + return except DmiServiceManagerException as e: - self.dataset.finish_with_error(str(e)) + self.dataset.log(str(e)) + self.log.warning(f"BLIP2 Error ({self.dataset.key}): {e}") + self.dataset.finish_with_error(f"Error with BLIP2 model; please contact 4CAT admins.") return # Load the video metadata if available @@ -202,6 +209,9 @@ def process(self): data.update({"url": url}) # using the filename without extension as the key; since that is how the results form their filename image_metadata[".".join(data['filename'].split(".")[:-1])] = data + else: + self.dataset.log("No image metadata found") + self.dataset.update_status("Processing BLIP2 results...") # Download the result files @@ -237,12 +247,12 @@ def map_item(item): :param item: :return: """ - image_metadata = item.get("image_metadata") + image_metadata = item.get("image_metadata", {}) return MappedItem({ "id": item.get("id"), "text": item.get("text"), - # "original_url": image_metadata.get("url", ""), # TODO: does not appear all image datasets are using URL properly... - "image_filename": image_metadata.get("filename", ""), + "image_filename": image_metadata.get("filename", "") if image_metadata else item.get("id"), # fallback to id which is filename + "original_url": image_metadata.get("url", "N/A"), "post_ids": ", ".join([str(post_id) for post_id in image_metadata.get("post_ids", [])]), "from_dataset": image_metadata.get("from_dataset", ""), }) From 15d8969fef818eb3d52a249986e7643d91e293c3 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Tue, 7 Jan 2025 10:45:08 +0100 Subject: [PATCH 3/5] clip: update error handling and filename mapping --- .../machine_learning/clip_categorize_images.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/processors/machine_learning/clip_categorize_images.py b/processors/machine_learning/clip_categorize_images.py index f9be0b08..7682140d 100644 --- a/processors/machine_learning/clip_categorize_images.py +++ b/processors/machine_learning/clip_categorize_images.py @@ -192,8 +192,15 @@ def process(self): self.dataset.finish_with_error( "DMI Service Manager ran out of memory; Try decreasing the number of images or try again or try again later.") return + except DsmConnectionError as e: + self.dataset.log(str(e)) + self.log.warning(f"DMI Service Manager connection error ({self.dataset.key}): {e}") + self.dataset.finish_with_error("DMI Service Manager connection error; please contact 4CAT admins.") + return except DmiServiceManagerException as e: - self.dataset.finish_with_error(str(e)) + self.dataset.log(str(e)) + self.log.warning(f"CLIP Error ({self.dataset.key}): {e}") + self.dataset.finish_with_error(f"Error with CLIP model; please contact 4CAT admins.") return # Load the video metadata if available @@ -227,6 +234,7 @@ def process(self): image_name = ".".join(result_filename.split(".")[:-1]) data = { "id": image_name, + "filename": result_filename, "categories": result_data, "image_metadata": image_metadata.get(image_name, {}) if image_metadata else {}, } @@ -243,7 +251,7 @@ def map_item(item): :param item: :return: """ - image_metadata = item.get("image_metadata") + image_metadata = item.get("image_metadata", {}) # Updates to CLIP output; categories used to be a list of categories, but now is a dict with: {"predictions": [[category_label, precent_float],]} categories = item.get("categories") if type(categories) == list: @@ -263,9 +271,9 @@ def map_item(item): all_cats = {cat[0]: cat[1] for cat in categories} return MappedItem({ "id": item.get("id"), + "image_filename": item.get("filename"), "top_categories": ", ".join([f"{cat[0]}: {100* cat[1]:.2f}%" for cat in top_cats]), - "original_url": image_metadata.get("url", ""), - "image_filename": image_metadata.get("filename", ""), + "original_url": image_metadata.get("url", "N/A"), "post_ids": ", ".join([str(post_id) for post_id in image_metadata.get("post_ids", [])]), "from_dataset": image_metadata.get("from_dataset", ""), **all_cats From 97c5ca53e1f594285649d1b48f80a0ec0bc9faf5 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Tue, 7 Jan 2025 11:07:32 +0100 Subject: [PATCH 4/5] add scikit-learn and scipy libraries back --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index cbbf4ed2..71d2035c 100644 --- a/setup.py +++ b/setup.py @@ -50,8 +50,8 @@ "requests~=2.27", "requests_futures", "scenedetect==0.6.0.3", - #"scikit-learn", - #"scipy==1.10.1", + "scikit-learn", + "scipy==1.10.1", "shapely", "svgwrite~=1.4.0", "tailer", From 8b0423d58832b5ce1a9bc0fabb977d2f547f5a59 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Tue, 7 Jan 2025 11:31:52 +0100 Subject: [PATCH 5/5] remove log statement whoops --- processors/text-analysis/vectorise_by_cat.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/processors/text-analysis/vectorise_by_cat.py b/processors/text-analysis/vectorise_by_cat.py index 434733a6..cdf9a8c6 100644 --- a/processors/text-analysis/vectorise_by_cat.py +++ b/processors/text-analysis/vectorise_by_cat.py @@ -8,7 +8,6 @@ from backend.lib.processor import BasicProcessor from common.lib.helpers import UserInput -from common.config_manager import config __author__ = "Dale Wahl" __credits__ = ["Dale Wahl", "Stijn Peeters"] @@ -136,8 +135,6 @@ def get_category_dataset(dataset): Get the dataset that contains the category column; this should be the dataset above the tokenise-posts dataset """ genealogy = dataset.get_genealogy() - config.with_db() - config.db.log.info(f"Genealogy: {[(gen.key, gen.type) for gen in genealogy]}") # Find parent of tokenise-posts dataset; this dataset will contain the categories related to the tokens extracted from it tokeniser_found = False