Commit

Merge remote-tracking branch 'origin/master'

sal-uva committed Jan 7, 2025
2 parents 82c8817 + 8b0423d commit b6c15de
Showing 5 changed files with 44 additions and 29 deletions.
28 changes: 14 additions & 14 deletions common/lib/dmi_service_manager.py
@@ -155,7 +155,7 @@ def send_request_and_wait_for_results(self, service_endpoint, data, wait_period=
existing_service = self.check_service_exists()
if existing_service:
if len(existing_service) > 1:
raise Exception("Multiple services found with the same dataset key.")
raise DmiServiceManagerException("Multiple services found with the same dataset key.")
else:
existing_service = existing_service[0]
if existing_service['status'] == 'complete':
@@ -176,7 +176,7 @@ def send_request_and_wait_for_results(self, service_endpoint, data, wait_period=
try:
resp = requests.post(api_endpoint, json=data, timeout=30)
except requests.exceptions.ConnectionError as e:
- raise DmiServiceManagerException(f"Unable to connect to DMI Service Manager server: {str(e)}")
+ raise DsmConnectionError(f"Unable to connect to DMI Service Manager server: {str(e)}")

if resp.status_code == 202:
# New request successful
@@ -186,10 +186,10 @@ def send_request_and_wait_for_results(self, service_endpoint, data, wait_period=
resp_json = resp.json()
if resp.status_code == 400 and 'key' in resp_json and 'error' in resp_json and resp_json['error'] == f"future_key {resp_json['key']} already exists":
# Request already exists; get DMI SM database key
raise Exception(f"Request already exists; check that DMI SM is up to date")
raise DmiServiceManagerException(f"Request already exists; check that DMI SM is up to date")
elif resp.status_code == 404:
# Could be local vs remote not set correctly
raise DmiServiceManagerException(f"404: {resp.url} not found; DMI Service Manager may not be set up for this service")
raise DsmConnectionError(f"404: {resp.url} not found; DMI Service Manager may not be set up for this service")
else:
raise DmiServiceManagerException(f"DMI Service Manager error: {str(resp.status_code)}: {str(resp_json)}")
except JSONDecodeError:
@@ -221,14 +221,14 @@ def send_request_and_wait_for_results(self, service_endpoint, data, wait_period=
# Have seen the Service Manager fail particularly when another processor is uploading many consecutive files
connection_error += 1
if connection_error > 3:
raise DmiServiceManagerException(f"Unable to connect to DMI Service Manager server: {str(e)}")
raise DsmConnectionError(f"Unable to connect to DMI Service Manager server: {str(e)}")
continue

if result.status_code != 200 or (result.json and result.json().get('status') != "success"):
# Unexpected response from DMI SM
connection_error += 1
if connection_error > 3:
raise DmiServiceManagerException(f"Unable to connect to DMI Service Manager server: {str(result.status_code)}: {str(result.json()) if 'json' in result.headers.get('Content-Type', '') else str(result.text)}")
raise DsmConnectionError(f"Unable to connect to DMI Service Manager server: {str(result.status_code)}: {str(result.json()) if 'json' in result.headers.get('Content-Type', '') else str(result.text)}")
continue
service_status = result.json()["job"]

@@ -255,7 +255,7 @@ def send_request_and_wait_for_results(self, service_endpoint, data, wait_period=
elif service_status['status'] in ["complete", "error"]:
results = json.loads(service_status['results'])
if not results:
- # This should not be the case is the service was written well (unless the DMI SM crashed?)
+ # This should not be the case if the service was written well (unless the DMI SM crashed?)
#TODO test if timing issue?
connection_error += 1
if connection_error > 3:
@@ -268,7 +268,7 @@ def send_request_and_wait_for_results(self, service_endpoint, data, wait_period=
else:
error = results['error']
if "CUDA error: out of memory" in error:
raise DmiServiceManagerException("DMI Service Manager server ran out of memory; try reducing the number of files processed at once or waiting until the server is less busy.")
raise DsmOutOfMemory("DMI Service Manager server ran out of memory; try reducing the number of files processed at once or waiting until the server is less busy.")
else:
raise DmiServiceManagerException(f"Error {service_endpoint}: " + error)
else:
@@ -308,14 +308,14 @@ def request_folder_files(self, folder_name):
except requests.exceptions.ConnectionError as e:
retries += 1
if retries > 3:
raise DmiServiceManagerException(f"Connection Error {e} (retries {retries}) while downloading files from: {folder_name}")
raise DsmConnectionError(f"Connection Error {e} (retries {retries}) while downloading files from: {folder_name}")
continue

# Check if 4CAT has access to this server
if filename_response.status_code == 403:
raise DmiServiceManagerException("403: 4CAT does not have permission to use the DMI Service Manager server")
raise DsmConnectionError("403: 4CAT does not have permission to use the DMI Service Manager server")
elif filename_response.status_code in [400, 405]:
raise DmiServiceManagerException(f"400: DMI Service Manager server {filename_response.json()['reason']}")
raise DsmConnectionError(f"400: DMI Service Manager server {filename_response.json()['reason']}")
elif filename_response.status_code == 404:
# Folder not found; no files
return {}
@@ -388,9 +388,9 @@ def send_files(self, file_collection_name, results_name, files_to_upload, dir_wi
self.processor.dataset.update_status(f"Uploaded {files_uploaded} of {total_files_to_upload} files!")
self.processor.dataset.update_progress(files_uploaded / total_files_to_upload)
elif response.status_code == 403:
raise DmiServiceManagerException("403: 4CAT does not have permission to use the DMI Service Manager server")
raise DsmConnectionError("403: 4CAT does not have permission to use the DMI Service Manager server")
elif response.status_code == 405:
raise DmiServiceManagerException("405: Method not allowed; check DMI Service Manager server address (perhaps http is being used instead of https)")
raise DsmConnectionError("405: Method not allowed; check DMI Service Manager server address (perhaps http is being used instead of https)")
else:
self.processor.dataset.log(f"Unable to upload file ({response.status_code} - {response.reason}): {upload_file}")

@@ -432,7 +432,7 @@ def download_results(self, filenames_to_download, folder_name, local_output_dir,
except requests.exceptions.ConnectionError as e:
retries += 1
if retries > 3:
raise DmiServiceManagerException(f"Connection Error {e} (retries {retries}) while downloading file: {folder_name}/{filename}")
raise DsmConnectionError(f"Connection Error {e} (retries {retries}) while downloading file: {folder_name}/{filename}")
continue
files_downloaded += 1
if files_downloaded % 1000 == 0:
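Note: this file's changes replace several generic `DmiServiceManagerException` raises with more specific error types. The processor diffs below catch `DsmConnectionError` before the generic `DmiServiceManagerException`, which only works if the specific errors subclass the generic one. A minimal sketch of the hierarchy this implies — the actual definitions live in `common/lib/dmi_service_manager.py` and may differ:

```python
# Sketch only: assumed exception hierarchy for the error types used in this diff.
class DmiServiceManagerException(Exception):
    """Base exception for DMI Service Manager errors."""


class DsmOutOfMemory(DmiServiceManagerException):
    """Raised when the DMI Service Manager server runs out of memory."""


class DsmConnectionError(DmiServiceManagerException):
    """Raised for connection-level failures: timeouts, 403/404/405 responses, etc."""
```

Because both specific errors inherit from `DmiServiceManagerException`, any existing `except DmiServiceManagerException` handler still catches them; processors only need to add a more specific `except` clause first, as the two processor diffs below do.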
20 changes: 15 additions & 5 deletions processors/machine_learning/blip2_image_caption.py
@@ -6,7 +6,7 @@


from backend.lib.processor import BasicProcessor
- from common.lib.dmi_service_manager import DmiServiceManager, DmiServiceManagerException, DsmOutOfMemory
+ from common.lib.dmi_service_manager import DmiServiceManager, DmiServiceManagerException, DsmOutOfMemory, DsmConnectionError
from common.lib.exceptions import ProcessorInterruptedException
from common.lib.user_input import UserInput
from common.config_manager import config
@@ -183,8 +183,15 @@ def process(self):
self.dataset.finish_with_error(
"DMI Service Manager ran out of memory; Try decreasing the number of images or try again or try again later.")
return
+ except DsmConnectionError as e:
+     self.dataset.log(str(e))
+     self.log.warning(f"DMI Service Manager connection error ({self.dataset.key}): {e}")
+     self.dataset.finish_with_error("DMI Service Manager connection error; please contact 4CAT admins.")
+     return
except DmiServiceManagerException as e:
- self.dataset.finish_with_error(str(e))
+ self.dataset.log(str(e))
+ self.log.warning(f"BLIP2 Error ({self.dataset.key}): {e}")
+ self.dataset.finish_with_error(f"Error with BLIP2 model; please contact 4CAT admins.")
return

# Load the video metadata if available
@@ -202,6 +209,9 @@ def process(self):
data.update({"url": url})
# using the filename without extension as the key; since that is how the results form their filename
image_metadata[".".join(data['filename'].split(".")[:-1])] = data
+ else:
+     self.dataset.log("No image metadata found")


self.dataset.update_status("Processing BLIP2 results...")
# Download the result files
@@ -237,12 +247,12 @@ def map_item(item):
:param item:
:return:
"""
image_metadata = item.get("image_metadata")
image_metadata = item.get("image_metadata", {})
return MappedItem({
"id": item.get("id"),
"text": item.get("text"),
# "original_url": image_metadata.get("url", ""), # TODO: does not appear all image datasets are using URL properly...
"image_filename": image_metadata.get("filename", ""),
"image_filename": image_metadata.get("filename", "") if image_metadata else item.get("id"), # fallback to id which is filename
"original_url": image_metadata.get("url", "N/A"),
"post_ids": ", ".join([str(post_id) for post_id in image_metadata.get("post_ids", [])]),
"from_dataset": image_metadata.get("from_dataset", ""),
})
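The `map_item` change above makes image metadata optional. A hypothetical illustration (the `item` dict is invented for the example, not actual 4CAT output): with no `image_metadata` key, the old code called `.get()` on `None` and raised `AttributeError`, while the new code falls back to the item's `id`.

```python
# Hypothetical result item without image metadata.
item = {"id": "cat_photo.jpg", "text": "a cat sitting on a couch"}

# Old: item.get("image_metadata") -> None, so None.get("filename", "") raises AttributeError.
# New: the {} default plus the explicit fallback keep the mapping working.
image_metadata = item.get("image_metadata", {})
image_filename = image_metadata.get("filename", "") if image_metadata else item.get("id")
print(image_filename)  # -> "cat_photo.jpg"
```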
16 changes: 12 additions & 4 deletions processors/machine_learning/clip_categorize_images.py
@@ -192,8 +192,15 @@ def process(self):
self.dataset.finish_with_error(
"DMI Service Manager ran out of memory; Try decreasing the number of images or try again or try again later.")
return
+ except DsmConnectionError as e:
+     self.dataset.log(str(e))
+     self.log.warning(f"DMI Service Manager connection error ({self.dataset.key}): {e}")
+     self.dataset.finish_with_error("DMI Service Manager connection error; please contact 4CAT admins.")
+     return
except DmiServiceManagerException as e:
- self.dataset.finish_with_error(str(e))
+ self.dataset.log(str(e))
+ self.log.warning(f"CLIP Error ({self.dataset.key}): {e}")
+ self.dataset.finish_with_error(f"Error with CLIP model; please contact 4CAT admins.")
return

# Load the video metadata if available
@@ -227,6 +234,7 @@ def process(self):
image_name = ".".join(result_filename.split(".")[:-1])
data = {
"id": image_name,
"filename": result_filename,
"categories": result_data,
"image_metadata": image_metadata.get(image_name, {}) if image_metadata else {},
}
@@ -243,7 +251,7 @@ def map_item(item):
:param item:
:return:
"""
image_metadata = item.get("image_metadata")
image_metadata = item.get("image_metadata", {})
# Updates to CLIP output; categories used to be a list of categories, but is now a dict: {"predictions": [[category_label, percent_float],]}
categories = item.get("categories")
if type(categories) == list:
@@ -263,9 +271,9 @@
all_cats = {cat[0]: cat[1] for cat in categories}
return MappedItem({
"id": item.get("id"),
"image_filename": item.get("filename"),
"top_categories": ", ".join([f"{cat[0]}: {100* cat[1]:.2f}%" for cat in top_cats]),
"original_url": image_metadata.get("url", ""),
"image_filename": image_metadata.get("filename", ""),
"original_url": image_metadata.get("url", "N/A"),
"post_ids": ", ".join([str(post_id) for post_id in image_metadata.get("post_ids", [])]),
"from_dataset": image_metadata.get("from_dataset", ""),
**all_cats
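The partially collapsed `map_item` hunk above normalises two generations of CLIP output. A sketch of the two shapes with invented values (the actual normalisation sits in the elided lines of the hunk, so the helper below is an assumption):

```python
# Old CLIP output: a bare list of [label, score] pairs.
old_categories = [["cat", 0.93], ["dog", 0.05]]

# New CLIP output: the same pairs wrapped in a dict under "predictions".
new_categories = {"predictions": [["cat", 0.93], ["dog", 0.05]]}

def normalise(categories):
    # Mirrors the type check shown in the diff: lists pass through, dicts are unwrapped.
    return categories if type(categories) == list else categories["predictions"]

for cats in (old_categories, new_categories):
    top_cats = normalise(cats)
    print(", ".join(f"{cat[0]}: {100 * cat[1]:.2f}%" for cat in top_cats))
# Both iterations print: cat: 93.00%, dog: 5.00%
```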
3 changes: 0 additions & 3 deletions processors/text-analysis/vectorise_by_cat.py
@@ -8,7 +8,6 @@

from backend.lib.processor import BasicProcessor
from common.lib.helpers import UserInput
- from common.config_manager import config

__author__ = "Dale Wahl"
__credits__ = ["Dale Wahl", "Stijn Peeters"]
@@ -136,8 +135,6 @@ def get_category_dataset(dataset):
Get the dataset that contains the category column; this should be the dataset above the tokenise-posts dataset
"""
genealogy = dataset.get_genealogy()
- config.with_db()
- config.db.log.info(f"Genealogy: {[(gen.key, gen.type) for gen in genealogy]}")

# Find parent of tokenise-posts dataset; this dataset will contain the categories related to the tokens extracted from it
tokeniser_found = False
6 changes: 3 additions & 3 deletions setup.py
@@ -42,16 +42,16 @@
"praw~=7.0",
"prawcore~=2.0",
"psutil~=5.0",
#"psycopg2~=2.9.0",
"psycopg2~=2.9.0",
"pyahocorasick~=1.4.0",
"PyMySQL~=1.0",
"PyTumblr==0.1.0",
"razdel~=0.5",
"requests~=2.27",
"requests_futures",
"scenedetect==0.6.0.3",
#"scikit-learn",
#"scipy==1.10.1",
"scikit-learn",
"scipy==1.10.1",
"shapely",
"svgwrite~=1.4.0",
"tailer",
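The re-enabled dependencies mix specifier styles: `~=` compatible-release pins and one exact `==` pin. A quick illustration of what each accepts, using the `packaging` library (example version numbers invented):

```python
from packaging.specifiers import SpecifierSet

print("2.9.5" in SpecifierSet("~=2.9.0"))    # True:  ~=2.9.0 allows >=2.9.0, <2.10.0
print("2.10.0" in SpecifierSet("~=2.9.0"))   # False: minor version bump is excluded
print("2.99" in SpecifierSet("~=2.27"))      # True:  ~=2.27 allows >=2.27, <3.0
print("1.10.2" in SpecifierSet("==1.10.1"))  # False: exact pin admits only 1.10.1
```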
