Skip to content

Commit

Permalink
fix monitoring
Browse files Browse the repository at this point in the history
  • Loading branch information
kachiann committed Aug 19, 2024
1 parent f42d55a commit 9340177
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 9 deletions.
Binary file added models/DecisionTreeRegressor.pkl
Binary file not shown.
Binary file added models/LinearRegression.pkl
Binary file not shown.
15 changes: 15 additions & 0 deletions monitoring/create_monitoring_datasets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
"""Create reference and production datasets for model monitoring.

Splits the raw hourly bike-sharing data chronologically: the older 70%
of rows becomes the reference (baseline) dataset and the newer 30%
becomes the production dataset used to simulate incoming traffic.
"""

import math

import pandas as pd


def split_reference_production(df, production_share=0.3):
    """Split *df* in row order into (reference, production) frames.

    Row order is preserved (no shuffling), so the split mimics an
    older-data / newer-data time split. The production slice holds the
    last ceil(len(df) * production_share) rows, which matches the sizing
    used by sklearn's train_test_split(test_size=..., shuffle=False),
    so the output is byte-identical to the previous implementation
    without needing the scikit-learn dependency.

    Returns:
        (reference, production): two DataFrames whose concatenation,
        in order, is *df*.
    """
    n_production = math.ceil(len(df) * production_share)
    cut = len(df) - n_production
    return df.iloc[:cut], df.iloc[cut:]


def main():
    """Load the source CSV, split it, and write the two monitoring CSVs."""
    df = pd.read_csv("../project-mlops/data/hour.csv")

    reference_data, production_data = split_reference_production(df)

    # index=False: row positions are meaningless once split; keep files clean.
    reference_data.to_csv("reference_data.csv", index=False)
    production_data.to_csv("production_data.csv", index=False)

    print("Reference and production datasets created successfully.")


if __name__ == "__main__":
    # Guarded so importing this module (e.g. from tests) performs no I/O.
    main()
28 changes: 19 additions & 9 deletions monitoring/evidently_metrics_calculations.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,20 +23,20 @@
rand = random.Random()

create_table_statement = """
DROP TABLE IF EXISTS dummy_metrics;
CREATE TABLE dummy_metrics(
CREATE TABLE IF NOT EXISTS dummy_metrics (
timestamp TIMESTAMP,
prediction_drift FLOAT,
num_drifted_columns INTEGER,
share_missing_values FLOAT
)
"""

reference_data = pd.read_csv("../data/reference.csv")
with open("../models/dec_tre.bin", "rb") as f_in:
# Update the file path to the new location
reference_data = pd.read_csv("../project-mlops/data/reference.csv")
with open("../project-mlops/models/DecisionTreeRegressor.pkl", "rb") as f_in:
model = joblib.load(f_in)

raw_data = pd.read_csv("../data/hour.csv")
raw_data = pd.read_csv("../project-mlops/data/hour.csv")

features = [
"season",
Expand Down Expand Up @@ -70,15 +70,24 @@ def prep_db():
with psycopg.connect(
"host=localhost port=5432 user=postgres password=example", autocommit=True
) as conn:
# Check if the database exists
res = conn.execute("SELECT 1 FROM pg_database WHERE datname='test'")
if not res.fetchall():
logging.info("Database 'test' not found. Creating database.")
conn.execute("CREATE DATABASE test;")
else:
logging.info("Database 'test' already exists.")

# Connect to the database and create the table
with psycopg.connect(
"host=localhost port=5432 dbname=test user=postgres password=example"
) as conn:
conn.execute(create_table_statement)
logging.info("Table 'dummy_metrics' is ready.")
except psycopg.OperationalError as e:
logging.error("OperationalError: %s", str(e))
except Exception as e:
logging.error("Error preparing the database: %s", {e})
logging.error("Error preparing the database: %s", str(e))


@task
Expand Down Expand Up @@ -112,8 +121,9 @@ def calculate_metrics_postgresql(curr):
share_missing_values,
),
)
logging.info("Metrics inserted into database.")
except Exception as e:
logging.error("Error calculating metrics: %s", {e})
logging.error("Error calculating metrics: %s", str(e))


@flow
Expand All @@ -134,9 +144,9 @@ def batch_monitoring_backfill():
if seconds_elapsed < SEND_TIMEOUT:
time.sleep(SEND_TIMEOUT - seconds_elapsed)
last_send += datetime.timedelta(seconds=10)
logging.info("Data sent")
logging.info("Data sent. Waiting for the next iteration.")
except Exception as e:
logging.error("Error in batch monitoring: %s", e)
logging.error("Error in batch monitoring: %s", str(e))


if __name__ == "__main__":
Expand Down

0 comments on commit 9340177

Please sign in to comment.