diff --git a/models/DecisionTreeRegressor.pkl b/models/DecisionTreeRegressor.pkl
new file mode 100644
index 0000000..58ebcbb
Binary files /dev/null and b/models/DecisionTreeRegressor.pkl differ
diff --git a/models/LinearRegression.pkl b/models/LinearRegression.pkl
new file mode 100644
index 0000000..d4a5ac9
Binary files /dev/null and b/models/LinearRegression.pkl differ
diff --git a/monitoring/create_monitoring_datasets.py b/monitoring/create_monitoring_datasets.py
new file mode 100644
index 0000000..eee755d
--- /dev/null
+++ b/monitoring/create_monitoring_datasets.py
@@ -0,0 +1,15 @@
+import pandas as pd
+from sklearn.model_selection import train_test_split
+
+# Load the original dataset
+df = pd.read_csv("../project-mlops/data/hour.csv")
+
+
+# Split the data into reference (older data) and production (newer data)
+reference_data, production_data = train_test_split(df, test_size=0.3, shuffle=False)
+
+# Save the datasets
+reference_data.to_csv("reference_data.csv", index=False)
+production_data.to_csv("production_data.csv", index=False)
+
+print("Reference and production datasets created successfully.")
diff --git a/monitoring/evidently_metrics_calculations.py b/monitoring/evidently_metrics_calculations.py
index 22f6c4f..ee651ca 100644
--- a/monitoring/evidently_metrics_calculations.py
+++ b/monitoring/evidently_metrics_calculations.py
@@ -23,8 +23,7 @@
 rand = random.Random()
 
 create_table_statement = """
-DROP TABLE IF EXISTS dummy_metrics;
-CREATE TABLE dummy_metrics(
+CREATE TABLE IF NOT EXISTS dummy_metrics (
     timestamp TIMESTAMP,
     prediction_drift FLOAT,
     num_drifted_columns INTEGER,
@@ -32,11 +31,12 @@
 )
 """
 
-reference_data = pd.read_csv("../data/reference.csv")
-with open("../models/dec_tre.bin", "rb") as f_in:
+# Update the file path to the new location
+reference_data = pd.read_csv("../project-mlops/data/reference.csv")
+with open("../project-mlops/models/DecisionTreeRegressor.pkl", "rb") as f_in:
     model = joblib.load(f_in)
-raw_data = pd.read_csv("../data/hour.csv")
+raw_data = pd.read_csv("../project-mlops/data/hour.csv")
 
 
 features = [
     "season",
@@ -70,15 +70,24 @@ def prep_db():
         with psycopg.connect(
             "host=localhost port=5432 user=postgres password=example", autocommit=True
         ) as conn:
+            # Check if the database exists
             res = conn.execute("SELECT 1 FROM pg_database WHERE datname='test'")
             if not res.fetchall():
+                logging.info("Database 'test' not found. Creating database.")
                 conn.execute("CREATE DATABASE test;")
+            else:
+                logging.info("Database 'test' already exists.")
+
+        # Connect to the database and create the table
         with psycopg.connect(
             "host=localhost port=5432 dbname=test user=postgres password=example"
         ) as conn:
             conn.execute(create_table_statement)
+            logging.info("Table 'dummy_metrics' is ready.")
+    except psycopg.OperationalError as e:
+        logging.error("OperationalError: %s", str(e))
     except Exception as e:
-        logging.error("Error preparing the database: %s", {e})
+        logging.error("Error preparing the database: %s", str(e))
 
 
 @task
@@ -112,8 +121,9 @@ def calculate_metrics_postgresql(curr):
                 share_missing_values,
             ),
         )
+        logging.info("Metrics inserted into database.")
     except Exception as e:
-        logging.error("Error calculating metrics: %s", {e})
+        logging.error("Error calculating metrics: %s", str(e))
 
 
 @flow
@@ -134,9 +144,9 @@ def batch_monitoring_backfill():
             if seconds_elapsed < SEND_TIMEOUT:
                 time.sleep(SEND_TIMEOUT - seconds_elapsed)
             last_send += datetime.timedelta(seconds=10)
-            logging.info("Data sent")
+            logging.info("Data sent. Waiting for the next iteration.")
     except Exception as e:
-        logging.error("Error in batch monitoring: %s", e)
+        logging.error("Error in batch monitoring: %s", str(e))
 
 
 if __name__ == "__main__":