From 439efe5e3f9228a3e624d28c45f0181687b27786 Mon Sep 17 00:00:00 2001
From: Dale Wahl <dalewahl@gmail.com>
Date: Tue, 7 Jan 2025 09:51:06 +0100
Subject: [PATCH 1/5] setup.py: re-enable psycopg2 as a required dependency

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 888f97cb..cbbf4ed2 100644
--- a/setup.py
+++ b/setup.py
@@ -42,7 +42,7 @@
 	"praw~=7.0",
 	"prawcore~=2.0",
 	"psutil~=5.0",
-	#"psycopg2~=2.9.0",
+	"psycopg2~=2.9.0",
 	"pyahocorasick~=1.4.0",
 	"PyMySQL~=1.0",
 	"PyTumblr==0.1.0",

From d69a0c3c58d7dba29e9ad477de804cdd09564b95 Mon Sep 17 00:00:00 2001
From: Dale Wahl <dalewahl@gmail.com>
Date: Tue, 7 Jan 2025 10:38:10 +0100
Subject: [PATCH 2/5] blip: cleaner error messages (and warnings to admins);
 filenames for images without metadata (e.g., via imports)

---
 common/lib/dmi_service_manager.py             | 28 +++++++++----------
 .../machine_learning/blip2_image_caption.py   | 20 +++++++++----
 2 files changed, 29 insertions(+), 19 deletions(-)

diff --git a/common/lib/dmi_service_manager.py b/common/lib/dmi_service_manager.py
index c88d669f..d4991085 100644
--- a/common/lib/dmi_service_manager.py
+++ b/common/lib/dmi_service_manager.py
@@ -155,7 +155,7 @@ def send_request_and_wait_for_results(self, service_endpoint, data, wait_period=
         existing_service = self.check_service_exists()
         if existing_service:
             if len(existing_service) > 1:
-                raise Exception("Multiple services found with the same dataset key.")
+                raise DmiServiceManagerException("Multiple services found with the same dataset key.")
             else:
                 existing_service = existing_service[0]
                 if existing_service['status'] == 'complete':
@@ -176,7 +176,7 @@ def send_request_and_wait_for_results(self, service_endpoint, data, wait_period=
             try:
                 resp = requests.post(api_endpoint, json=data, timeout=30)
             except requests.exceptions.ConnectionError as e :
-                raise DmiServiceManagerException(f"Unable to connect to DMI Service Manager server: {str(e)}")
+                raise DsmConnectionError(f"Unable to connect to DMI Service Manager server: {str(e)}")
 
             if resp.status_code == 202:
                 # New request successful
@@ -186,10 +186,10 @@ def send_request_and_wait_for_results(self, service_endpoint, data, wait_period=
                     resp_json = resp.json()
                     if resp.status_code == 400 and 'key' in resp_json and 'error' in resp_json and resp_json['error'] == f"future_key {resp_json['key']} already exists":
                         # Request already exists; get DMI SM database key
-                        raise Exception(f"Request already exists; check that DMI SM is up to date")
+                        raise DmiServiceManagerException(f"Request already exists; check that DMI SM is up to date")
                     elif resp.status_code == 404:
                         # Could be local vs remote not set correctly
-                        raise DmiServiceManagerException(f"404: {resp.url} not found; DMI Service Manager may not be set up for this service")
+                        raise DsmConnectionError(f"404: {resp.url} not found; DMI Service Manager may not be set up for this service")
                     else:
                         raise DmiServiceManagerException(f"DMI Service Manager error: {str(resp.status_code)}: {str(resp_json)}")
                 except JSONDecodeError:
@@ -221,14 +221,14 @@ def send_request_and_wait_for_results(self, service_endpoint, data, wait_period=
                     # Have seen the Service Manager fail particularly when another processor is uploading many consecutive files
                     connection_error += 1
                     if connection_error > 3:
-                        raise DmiServiceManagerException(f"Unable to connect to DMI Service Manager server: {str(e)}")
+                        raise DsmConnectionError(f"Unable to connect to DMI Service Manager server: {str(e)}")
                     continue
 
                 if result.status_code != 200 or (result.json and result.json().get('status') != "success"):
                     # Unexpected response from DMI SM
                     connection_error += 1
                     if connection_error > 3:
-                        raise DmiServiceManagerException(f"Unable to connect to DMI Service Manager server: {str(result.status_code)}: {str(result.json()) if 'json' in result.headers.get('Content-Type', '') else str(result.text)}")
+                        raise DsmConnectionError(f"Unable to connect to DMI Service Manager server: {str(result.status_code)}: {str(result.json()) if 'json' in result.headers.get('Content-Type', '') else str(result.text)}")
                     continue
                 service_status = result.json()["job"]
 
@@ -255,7 +255,7 @@ def send_request_and_wait_for_results(self, service_endpoint, data, wait_period=
                 elif service_status['status'] in ["complete", "error"]:
                     results = json.loads(service_status['results'])
                     if not results:
-                        # This should not be the case is the service was written well (unless the DMI SM crashed?)
+                        # This should not be the case if the service was written well (unless the DMI SM crashed?)
                         #TODO test if timing issue?
                         connection_error += 1
                         if connection_error > 3:
@@ -268,7 +268,7 @@ def send_request_and_wait_for_results(self, service_endpoint, data, wait_period=
                     else:
                         error = results['error']
                         if "CUDA error: out of memory" in error:
-                            raise DmiServiceManagerException("DMI Service Manager server ran out of memory; try reducing the number of files processed at once or waiting until the server is less busy.")
+                            raise DsmOutOfMemory("DMI Service Manager server ran out of memory; try reducing the number of files processed at once or waiting until the server is less busy.")
                         else:
                             raise DmiServiceManagerException(f"Error {service_endpoint}: " + error)
                 else:
@@ -308,14 +308,14 @@ def request_folder_files(self, folder_name):
             except requests.exceptions.ConnectionError as e:
                 retries += 1
                 if retries > 3:
-                    raise DmiServiceManagerException(f"Connection Error {e} (retries {retries}) while downloading files from: {folder_name}")
+                    raise DsmConnectionError(f"Connection Error {e} (retries {retries}) while downloading files from: {folder_name}")
                 continue
 
         # Check if 4CAT has access to this server
         if filename_response.status_code == 403:
-            raise DmiServiceManagerException("403: 4CAT does not have permission to use the DMI Service Manager server")
+            raise DsmConnectionError("403: 4CAT does not have permission to use the DMI Service Manager server")
         elif filename_response.status_code in [400, 405]:
-            raise DmiServiceManagerException(f"400: DMI Service Manager server {filename_response.json()['reason']}")
+            raise DsmConnectionError(f"400: DMI Service Manager server {filename_response.json()['reason']}")
         elif filename_response.status_code == 404:
             # Folder not found; no files
             return {}
@@ -388,9 +388,9 @@ def send_files(self, file_collection_name, results_name, files_to_upload, dir_wi
                         self.processor.dataset.update_status(f"Uploaded {files_uploaded} of {total_files_to_upload} files!")
                     self.processor.dataset.update_progress(files_uploaded / total_files_to_upload)
                 elif response.status_code == 403:
-                    raise DmiServiceManagerException("403: 4CAT does not have permission to use the DMI Service Manager server")
+                    raise DsmConnectionError("403: 4CAT does not have permission to use the DMI Service Manager server")
                 elif response.status_code == 405:
-                    raise DmiServiceManagerException("405: Method not allowed; check DMI Service Manager server address (perhaps http is being used instead of https)")
+                    raise DsmConnectionError("405: Method not allowed; check DMI Service Manager server address (perhaps http is being used instead of https)")
                 else:
                     self.processor.dataset.log(f"Unable to upload file ({response.status_code} - {response.reason}): {upload_file}")
 
@@ -432,7 +432,7 @@ def download_results(self, filenames_to_download, folder_name, local_output_dir,
                 except requests.exceptions.ConnectionError as e:
                     retries += 1
                     if retries > 3:
-                        raise DmiServiceManagerException(f"Connection Error {e} (retries {retries}) while downloading file: {folder_name}/{filename}")
+                        raise DsmConnectionError(f"Connection Error {e} (retries {retries}) while downloading file: {folder_name}/{filename}")
                     continue
             files_downloaded += 1
             if files_downloaded % 1000 == 0:
diff --git a/processors/machine_learning/blip2_image_caption.py b/processors/machine_learning/blip2_image_caption.py
index 0287c2b5..5898ec59 100644
--- a/processors/machine_learning/blip2_image_caption.py
+++ b/processors/machine_learning/blip2_image_caption.py
@@ -6,7 +6,7 @@
 
 
 from backend.lib.processor import BasicProcessor
-from common.lib.dmi_service_manager import DmiServiceManager, DmiServiceManagerException, DsmOutOfMemory
+from common.lib.dmi_service_manager import DmiServiceManager, DmiServiceManagerException, DsmOutOfMemory, DsmConnectionError
 from common.lib.exceptions import ProcessorInterruptedException
 from common.lib.user_input import UserInput
 from common.config_manager import config
@@ -183,8 +183,15 @@ def process(self):
             self.dataset.finish_with_error(
                 "DMI Service Manager ran out of memory; Try decreasing the number of images or try again or try again later.")
             return
+        except DsmConnectionError as e:
+            self.dataset.log(str(e))
+            self.log.warning(f"DMI Service Manager connection error ({self.dataset.key}): {e}")
+            self.dataset.finish_with_error("DMI Service Manager connection error; please contact 4CAT admins.")
+            return
         except DmiServiceManagerException as e:
-            self.dataset.finish_with_error(str(e))
+            self.dataset.log(str(e))
+            self.log.warning(f"BLIP2 Error ({self.dataset.key}): {e}")
+            self.dataset.finish_with_error(f"Error with BLIP2 model; please contact 4CAT admins.")
             return
 
         # Load the video metadata if available
@@ -202,6 +209,9 @@ def process(self):
                         data.update({"url": url})
                         # using the filename without extension as the key; since that is how the results form their filename
                         image_metadata[".".join(data['filename'].split(".")[:-1])] = data
+        else:
+            self.dataset.log("No image metadata found")
+
 
         self.dataset.update_status("Processing BLIP2 results...")
         # Download the result files
@@ -237,12 +247,12 @@ def map_item(item):
         :param item:
         :return:
         """
-        image_metadata = item.get("image_metadata")
+        image_metadata = item.get("image_metadata", {})
         return MappedItem({
             "id": item.get("id"),
             "text": item.get("text"),
-            # "original_url": image_metadata.get("url", ""), # TODO: does not appear all image datasets are using URL properly...
-            "image_filename": image_metadata.get("filename", ""),
+            "image_filename": image_metadata.get("filename", "") if image_metadata else item.get("id"), # fallback to id which is filename
+            "original_url": image_metadata.get("url", "N/A"),
             "post_ids": ", ".join([str(post_id) for post_id in image_metadata.get("post_ids", [])]),
             "from_dataset": image_metadata.get("from_dataset", ""),
         })

From 15d8969fef818eb3d52a249986e7643d91e293c3 Mon Sep 17 00:00:00 2001
From: Dale Wahl <dalewahl@gmail.com>
Date: Tue, 7 Jan 2025 10:45:08 +0100
Subject: [PATCH 3/5] clip: update error handling and filename mapping

---
 .../machine_learning/clip_categorize_images.py   | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/processors/machine_learning/clip_categorize_images.py b/processors/machine_learning/clip_categorize_images.py
index f9be0b08..7682140d 100644
--- a/processors/machine_learning/clip_categorize_images.py
+++ b/processors/machine_learning/clip_categorize_images.py
@@ -192,8 +192,15 @@ def process(self):
             self.dataset.finish_with_error(
                 "DMI Service Manager ran out of memory; Try decreasing the number of images or try again or try again later.")
             return
+        except DsmConnectionError as e:
+            self.dataset.log(str(e))
+            self.log.warning(f"DMI Service Manager connection error ({self.dataset.key}): {e}")
+            self.dataset.finish_with_error("DMI Service Manager connection error; please contact 4CAT admins.")
+            return
         except DmiServiceManagerException as e:
-            self.dataset.finish_with_error(str(e))
+            self.dataset.log(str(e))
+            self.log.warning(f"CLIP Error ({self.dataset.key}): {e}")
+            self.dataset.finish_with_error(f"Error with CLIP model; please contact 4CAT admins.")
             return
 
         # Load the video metadata if available
@@ -227,6 +234,7 @@ def process(self):
                     image_name = ".".join(result_filename.split(".")[:-1])
                     data = {
                         "id": image_name,
+                        "filename": result_filename,
                         "categories": result_data,
                         "image_metadata": image_metadata.get(image_name, {}) if image_metadata else {},
                     }
@@ -243,7 +251,7 @@ def map_item(item):
         :param item:
         :return:
         """
-        image_metadata = item.get("image_metadata")
+        image_metadata = item.get("image_metadata", {})
         # Updates to CLIP output; categories used to be a list of categories, but now is a dict with: {"predictions": [[category_label, precent_float],]}
         categories = item.get("categories")
         if type(categories) == list:
@@ -263,9 +271,9 @@ def map_item(item):
         all_cats = {cat[0]: cat[1] for cat in categories}
         return MappedItem({
             "id": item.get("id"),
+            "image_filename": item.get("filename"),
             "top_categories": ", ".join([f"{cat[0]}: {100* cat[1]:.2f}%" for cat in top_cats]),
-            "original_url": image_metadata.get("url", ""),
-            "image_filename": image_metadata.get("filename", ""),
+            "original_url": image_metadata.get("url", "N/A"),
             "post_ids": ", ".join([str(post_id) for post_id in image_metadata.get("post_ids", [])]),
             "from_dataset": image_metadata.get("from_dataset", ""),
             **all_cats

From 97c5ca53e1f594285649d1b48f80a0ec0bc9faf5 Mon Sep 17 00:00:00 2001
From: Dale Wahl <dalewahl@gmail.com>
Date: Tue, 7 Jan 2025 11:07:32 +0100
Subject: [PATCH 4/5] add scikit-learn and scipy libraries back

---
 setup.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index cbbf4ed2..71d2035c 100644
--- a/setup.py
+++ b/setup.py
@@ -50,8 +50,8 @@
 	"requests~=2.27",
 	"requests_futures",
 	"scenedetect==0.6.0.3",
-	#"scikit-learn",
-	#"scipy==1.10.1",
+	"scikit-learn",
+	"scipy==1.10.1",
 	"shapely",
 	"svgwrite~=1.4.0",
 	"tailer",

From 8b0423d58832b5ce1a9bc0fabb977d2f547f5a59 Mon Sep 17 00:00:00 2001
From: Dale Wahl <dalewahl@gmail.com>
Date: Tue, 7 Jan 2025 11:31:52 +0100
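Subject: [PATCH 5/5] remove leftover debug log statement

Drop the genealogy debug logging that was committed by accident to
vectorise_by_cat.py, along with the config import it relied on.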
---
 processors/text-analysis/vectorise_by_cat.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/processors/text-analysis/vectorise_by_cat.py b/processors/text-analysis/vectorise_by_cat.py
index 434733a6..cdf9a8c6 100644
--- a/processors/text-analysis/vectorise_by_cat.py
+++ b/processors/text-analysis/vectorise_by_cat.py
@@ -8,7 +8,6 @@
 
 from backend.lib.processor import BasicProcessor
 from common.lib.helpers import UserInput
-from common.config_manager import config
 
 __author__ = "Dale Wahl"
 __credits__ = ["Dale Wahl", "Stijn Peeters"]
@@ -136,8 +135,6 @@ def get_category_dataset(dataset):
 		Get the dataset that contains the category column; this should be the dataset above the tokenise-posts dataset
 		"""
 		genealogy = dataset.get_genealogy()
-		config.with_db()
-		config.db.log.info(f"Genealogy: {[(gen.key, gen.type) for gen in genealogy]}")
 
 		# Find parent of tokenise-posts dataset; this dataset will contain the categories related to the tokens extracted from it
 		tokeniser_found = False