Commit

Base Step proj
Ziad-El3assal committed Jun 19, 2024
1 parent 5278216 commit ba0ea5e
Showing 11 changed files with 244 additions and 23 deletions.
1 change: 1 addition & 0 deletions conda.yml
@@ -5,6 +5,7 @@ channels:
dependencies:
- mlflow=1.14.1
- pyyaml=5.3.1
- numpy=1.20.1
- hydra-core=1.0.6
- pip=20.3.3
- pip:
75 changes: 58 additions & 17 deletions main.py
@@ -49,22 +49,44 @@ def go(config: DictConfig):
)

if "basic_cleaning" in active_steps:
##################
# Implement here #
##################
pass
_ = mlflow.run(
os.path.join(hydra.utils.get_original_cwd(), "src", "basic_cleaning"),
"main",
parameters={
"input_artifact": "sample.csv:latest",
"output_artifact": "clean_sample.csv",
"output_type": "clean_sample",
"output_description": "Data with outliers and null values removed",
"min_price": config['etl']['min_price'],
"max_price": config['etl']['max_price']
},
)

if "data_check" in active_steps:
##################
# Implement here #
##################
pass
_ = mlflow.run(
os.path.join(hydra.utils.get_original_cwd(), "src", "data_check"),
"main",
parameters={
"csv": "clean_sample.csv:latest",
"ref": "clean_sample.csv:reference",
"kl_threshold": config["data_check"]["kl_threshold"],
"min_price": config['etl']['min_price'],
"max_price": config['etl']['max_price']
},
)


if "data_split" in active_steps:
##################
# Implement here #
##################
pass
_ = mlflow.run(
f"{config['main']['components_repository']}/train_val_test_split",
"main",
parameters={
"input": "clean_sample.csv:latest",
"test_size": config['modeling']['test_size'],
"random_seed": config['modeling']['random_seed'],
"stratify_by": config['modeling']['stratify_by']
},
)

if "train_random_forest" in active_steps:

@@ -79,17 +101,36 @@ def go(config: DictConfig):
##################
# Implement here #
##################

pass
_ = mlflow.run(
os.path.join(hydra.utils.get_original_cwd(), "src", "train_random_forest"),
"main",
parameters={
"trainval_artifact": "trainval_data.csv:latest",
"rf_config": rf_config,
"output_artifact": "random_forest_export",
"random_seed": config["modeling"]["random_seed"],
"val_size": config["modeling"]["val_size"],
"stratify_by": config["modeling"]["stratify_by"],
"max_tfidf_features": config["modeling"]["max_tfidf_features"]
},
)


if "test_regression_model" in active_steps:

##################
# Implement here #
##################

pass
_ = mlflow.run(
f"{config['main']['components_repository']}/test_regression_model",
"main",
parameters={
"mlflow_model": "random_forest_export:prod",
"test_dataset": "test_data.csv:latest"
},
)



if __name__ == "__main__":
go()
go()
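Note on the "rf_config" parameter passed to the train_random_forest step above: it is built earlier in go(), in the part of the hunk not shown here. In the upstream starter project that prelude is typically a JSON dump of the modeling.random_forest section of the Hydra config; the snippet below is a hypothetical reconstruction for orientation only (not taken from this commit), assuming config is the Hydra DictConfig available inside go() and that the section holds flat scalar values:

import json
import os

# Hypothetical sketch of the elided prelude: serialize the random forest
# configuration to a JSON file so the training component can read it from disk.
rf_config = os.path.abspath("rf_config.json")
with open(rf_config, "w+") as fp:
    json.dump(dict(config["modeling"]["random_forest"].items()), fp)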
2 changes: 2 additions & 0 deletions oryx-build-commands.txt
@@ -0,0 +1,2 @@
PlatformWithVersion=Python
BuildCommands=conda env create --file environment.yml --prefix ./venv --quiet
26 changes: 26 additions & 0 deletions src/basic_cleaning/MLproject
@@ -0,0 +1,26 @@
name: basic_cleaning
conda_env: conda.yml

entry_points:
  main:
    parameters:
      input_artifact:
        description: Fully qualified name of the input artifact to clean
        type: string

      output_artifact:
        description: Name for the cleaned output artifact
        type: string

      output_type:
        description: Type of the output artifact
        type: string

      output_description:
        description: Description of the output artifact
        type: string

      min_price:
        description: Minimum price to consider; rows below it are dropped as outliers
        type: float

      max_price:
        description: Maximum price to consider; rows above it are dropped as outliers
        type: float

    command: >-
      python run.py --input_artifact {input_artifact} --output_artifact {output_artifact} --output_type {output_type} --output_description {output_description} --min_price {min_price} --max_price {max_price}
9 changes: 9 additions & 0 deletions src/basic_cleaning/conda.yml
@@ -0,0 +1,9 @@
name: basic_cleaning
channels:
- conda-forge
- defaults
dependencies:
- pip=20.3.3
- numpy=1.20.1
- pip:
- wandb==0.10.31
105 changes: 105 additions & 0 deletions src/basic_cleaning/run.py
@@ -0,0 +1,105 @@
#!/usr/bin/env python
"""
Performs basic cleaning on the data and saves the results in Weights & Biases
"""
import argparse
import logging
import wandb
import pandas as pd


logging.basicConfig(level=logging.INFO, format="%(asctime)-15s %(message)s")
logger = logging.getLogger()


def go(args):

run = wandb.init(job_type="basic_cleaning")
run.config.update(args)

# Download input artifact. This will also log that this script is using this
# particular version of the artifact
# artifact_local_path = run.use_artifact(args.input_artifact).file()

logger.info(f'Downloading artifact {args.input_artifact}')
artifact_local_path = run.use_artifact(args.input_artifact).file()

# Read the data from the artifact
logger.info("Loading DataFrame")
df = pd.read_csv(artifact_local_path)

# Remove outliers
logger.info('Drop outliers and convert datetime')
idx = df['price'].between(args.min_price, args.max_price)
df = df[idx].copy()
df['last_review'] = pd.to_datetime(df['last_review'])

# Remove observations outside of NYC
idx = df['longitude'].between(-74.25, -73.50) & df['latitude'].between(40.5, 41.2)
df = df[idx].copy()

# Save dataframe
logger.info(f'Saving Dataframe {args.output_artifact}')
df.to_csv('clean_sample.csv', index=False)

# Upload Dataframe Artifact to W&B
artifact = wandb.Artifact(
args.output_artifact,
type=args.output_type,
description=args.output_description,
)
artifact.add_file("clean_sample.csv")
run.log_artifact(artifact)


if __name__ == "__main__":

parser = argparse.ArgumentParser(description=" A very basic data cleaning")


parser.add_argument(
"--input_artifact",
type=str,
help="the input artifact name",
required=True
)

parser.add_argument(
"--output_artifact",
type=str,
help="the output artifact name",
required=True
)

parser.add_argument(
"--output_type",
type=str,
help="the output artifact type",
required=True
)

parser.add_argument(
"--output_description",
type=str,
help="the output artifact description",
required=True
)

parser.add_argument(
"--min_price",
type=float,
help="Outlier Min price",
required=True
)

parser.add_argument(
"--max_price",
type=float,
help="Outlier Max price",
required=True
)


args = parser.parse_args()

go(args)
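The two filters above (price range and NYC bounding box) are easy to sanity-check outside the pipeline on a tiny hand-made DataFrame; the snippet below is only an illustrative sketch with made-up values, not part of the commit:

import pandas as pd

# One in-range row, one over-priced row, and one row outside the NYC bounding box.
df = pd.DataFrame({
    "price": [150, 5000, 120],
    "longitude": [-73.95, -73.95, -80.00],
    "latitude": [40.75, 40.75, 40.75],
    "last_review": ["2019-05-01", None, "2019-06-01"],
})

min_price, max_price = 10, 350  # example bounds standing in for etl.min_price / etl.max_price

df = df[df["price"].between(min_price, max_price)].copy()
df["last_review"] = pd.to_datetime(df["last_review"])
idx = df["longitude"].between(-74.25, -73.50) & df["latitude"].between(40.5, 41.2)
df = df[idx].copy()

print(df)  # only the first row survives both filters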
1 change: 1 addition & 0 deletions src/data_check/conda.yml
@@ -5,6 +5,7 @@ channels:
dependencies:
- pandas=1.1.4
- pytest=6.2.2
- numpy=1.20.1
- scipy=1.5.2
- pip=20.3.3
- pip:
11 changes: 11 additions & 0 deletions src/data_check/test_data.py
@@ -63,3 +63,14 @@ def test_similar_neigh_distrib(data: pd.DataFrame, ref_data: pd.DataFrame, kl_th
########################################################
# Implement here test_row_count and test_price_range #
########################################################
def test_row_count(data):
"""
Test that the size of the dataset is in range
"""
assert 15000 < data.shape[0] < 1000000

def test_price_range(data, min_price, max_price):
"""
Test that the price is in range
"""
assert data['price'].between(min_price, max_price).all()
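Both new tests depend on pytest fixtures (data, min_price, max_price) that the data_check step defines in its conftest.py, which is not part of this diff. A minimal sketch of what such a conftest usually looks like, assuming the command-line options forwarded by the step's entry point; this is hypothetical, for orientation only:

import pandas as pd
import pytest
import wandb


def pytest_addoption(parser):
    # Options passed on the pytest command line by the data_check step
    parser.addoption("--csv", action="store")
    parser.addoption("--min_price", action="store")
    parser.addoption("--max_price", action="store")


@pytest.fixture(scope="session")
def data(request):
    # Download the cleaned dataset artifact from W&B and load it as a DataFrame
    run = wandb.init(job_type="data_tests", resume=True)
    local_path = run.use_artifact(request.config.option.csv).file()
    return pd.read_csv(local_path)


@pytest.fixture(scope="session")
def min_price(request):
    return float(request.config.option.min_price)


@pytest.fixture(scope="session")
def max_price(request):
    return float(request.config.option.max_price)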
1 change: 1 addition & 0 deletions src/eda/conda.yml
@@ -6,6 +6,7 @@ dependencies:
- jupyterlab=3.0.12
- seaborn=0.11.1
- pandas=1.2.3
- numpy=1.20.1
- pip=20.3.3
- pandas-profiling=2.11.0
- pyarrow=2.0
1 change: 1 addition & 0 deletions src/train_random_forest/conda.yml
@@ -4,6 +4,7 @@ channels:
- defaults
dependencies:
- pandas=1.1.4
- numpy=1.20.1
- pip=20.3.3
- mlflow=1.14.1
- scikit-learn=0.24.1
35 changes: 29 additions & 6 deletions src/train_random_forest/run.py
@@ -9,6 +9,8 @@
import matplotlib.pyplot as plt

import mlflow
from mlflow.models import infer_signature

import json

import pandas as pd
@@ -54,7 +56,7 @@ def go(args):
######################################
# Use run.use_artifact(...).file() to get the train and validation artifact (args.trainval_artifact)
# and save the returned path in train_local_pat
trainval_local_path = # YOUR CODE HERE
trainval_local_path = run.use_artifact(args.trainval_artifact).file()
######################################

X = pd.read_csv(trainval_local_path)
@@ -76,6 +78,7 @@
######################################
# Fit the pipeline sk_pipe by calling the .fit method on X_train and y_train
# YOUR CODE HERE
sk_pipe.fit(X_train, y_train)
######################################

# Compute r2 and MAE
@@ -99,14 +102,29 @@
# HINT: use mlflow.sklearn.save_model
# YOUR CODE HERE
######################################

signature = infer_signature(X_val, y_pred)
mlflow.sklearn.save_model(
sk_pipe,
path="random_forest_dir",
serialization_format=mlflow.sklearn.SERIALIZATION_FORMAT_CLOUDPICKLE,
signature=signature,
input_example=X_val.iloc[:5],
)
######################################
# Upload the model we just exported to W&B
# HINT: use wandb.Artifact to create an artifact. Use args.output_artifact as artifact name, "model_export" as
# type, provide a description and add rf_config as metadata. Then, use the .add_dir method of the artifact instance
# you just created to add the "random_forest_dir" directory to the artifact, and finally use
# run.log_artifact to log the artifact to the run
# YOUR CODE HERE
logger.info('Create Artifact to store model')
artifact = wandb.Artifact(
args.output_artifact,
type="model_export",
description="Random Forest pipeline export",
)
artifact.add_dir(local_path='random_forest_dir')
run.log_artifact(artifact)
######################################

# Plot feature importance
@@ -117,6 +135,7 @@
run.summary['r2'] = r_squared
# Now log the variable "mae" under the key "mae".
# YOUR CODE HERE
run.summary["mae"] = mae
######################################

# Upload to W&B the feature importance visualization
@@ -158,7 +177,8 @@ def get_inference_pipeline(rf_config, max_tfidf_features):
# Build a pipeline with two steps:
# 1 - A SimpleImputer(strategy="most_frequent") to impute missing values
# 2 - A OneHotEncoder() step to encode the variable
non_ordinal_categorical_preproc = # YOUR CODE HERE
non_ordinal_categorical_preproc = make_pipeline(
SimpleImputer(strategy="most_frequent"),
OneHotEncoder()
)
######################################

# Let's impute the numerical columns to make sure we can handle missing values
@@ -217,8 +237,11 @@ def get_inference_pipeline(rf_config, max_tfidf_features):
# ColumnTransformer instance that we saved in the `preprocessor` variable, and a step called "random_forest"
# with the random forest instance that we just saved in the `random_forest` variable.
# HINT: Use the explicit Pipeline constructor so you can assign the names to the steps, do not use make_pipeline
sk_pipe = # YOUR CODE HERE

sk_pipe = Pipeline(steps=[
("preprocessor", preprocessor),
("random_forest", random_forest),
])

return sk_pipe, processed_features


@@ -277,4 +300,4 @@ def get_inference_pipeline(rf_config, max_tfidf_features):

args = parser.parse_args()

go(args)
go(args)
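Once the step above has run, the exported pipeline can be loaded back for inference with the standard MLflow API. The sketch below assumes the local "random_forest_dir" produced by mlflow.sklearn.save_model (or the downloaded W&B artifact directory) and a cleaned CSV on disk; neither path is prescribed by this commit:

import mlflow.sklearn
import pandas as pd

# Load the full inference pipeline (preprocessing + random forest).
pipe = mlflow.sklearn.load_model("random_forest_dir")

# The pipeline expects the raw cleaned feature columns, i.e. clean_sample.csv without the target.
sample = pd.read_csv("clean_sample.csv").drop(columns=["price"]).head(5)
print(pipe.predict(sample))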
