Commit

Base Step proj
Ziad-El3assal committed Jun 19, 2024
1 parent 5278216 commit ba0ea5e
Showing 11 changed files with 244 additions and 23 deletions.
1 change: 1 addition & 0 deletions conda.yml
@@ -5,6 +5,7 @@ channels:
dependencies:
- mlflow=1.14.1
- pyyaml=5.3.1
- numpy=1.20.1
- hydra-core=1.0.6
- pip=20.3.3
- pip:
75 changes: 58 additions & 17 deletions main.py
@@ -49,22 +49,44 @@ def go(config: DictConfig):
)

if "basic_cleaning" in active_steps:
##################
# Implement here #
##################
pass
_ = mlflow.run(
os.path.join(hydra.utils.get_original_cwd(), "src", "basic_cleaning"),
"main",
parameters={
"input_artifact": "sample.csv:latest",
"output_artifact": "clean_sample.csv",
"output_type": "clean_sample",
"output_description": "Data with outliers and null values removed",
"min_price": config['etl']['min_price'],
"max_price": config['etl']['max_price']
},
)

if "data_check" in active_steps:
##################
# Implement here #
##################
pass
_ = mlflow.run(
os.path.join(hydra.utils.get_original_cwd(), "src", "data_check"),
"main",
parameters={
"csv": "clean_sample.csv:latest",
"ref": "clean_sample.csv:reference",
"kl_threshold": config["data_check"]["kl_threshold"],
"min_price": config['etl']['min_price'],
"max_price": config['etl']['max_price']
},
)


if "data_split" in active_steps:
##################
# Implement here #
##################
pass
_ = mlflow.run(
f"{config['main']['components_repository']}/train_val_test_split",
"main",
parameters={
"input": "clean_sample.csv:latest",
"test_size": config['modeling']['test_size'],
"random_seed": config['modeling']['random_seed'],
"stratify_by": config['modeling']['stratify_by']
},
)

if "train_random_forest" in active_steps:

@@ -79,17 +101,36 @@ def go(config: DictConfig):
##################
# Implement here #
##################

pass
_ = mlflow.run(
os.path.join(hydra.utils.get_original_cwd(), "src", "train_random_forest"),
"main",
parameters={
"trainval_artifact": "trainval_data.csv:latest",
"rf_config": rf_config,
"output_artifact": "random_forest_export",
"random_seed": config["modeling"]["random_seed"],
"val_size": config["modeling"]["val_size"],
"stratify_by": config["modeling"]["stratify_by"],
"max_tfidf_features": config["modeling"]["max_tfidf_features"]
},
)


if "test_regression_model" in active_steps:

##################
# Implement here #
##################

pass
_ = mlflow.run(
f"{config['main']['components_repository']}/test_regression_model",
"main",
parameters={
"mlflow_model": "random_forest_export:prod",
"test_dataset": "test_data.csv:latest"
},
)



if __name__ == "__main__":
go()
go()
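Note on the "rf_config" parameter passed to the train_random_forest step above: it is built earlier in go(), in the part of the hunk not shown here. In the upstream starter project that prelude is typically a JSON dump of the modeling.random_forest section of the Hydra config; the snippet below is a hypothetical reconstruction for orientation only (not taken from this commit), assuming config is the Hydra DictConfig available inside go() and that the section holds flat scalar values:

import json
import os

# Hypothetical sketch of the elided prelude: serialize the random forest
# configuration to a JSON file so the training component can read it from disk.
rf_config = os.path.abspath("rf_config.json")
with open(rf_config, "w+") as fp:
    json.dump(dict(config["modeling"]["random_forest"].items()), fp)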
2 changes: 2 additions & 0 deletions oryx-build-commands.txt
@@ -0,0 +1,2 @@
PlatformWithVersion=Python
BuildCommands=conda env create --file environment.yml --prefix ./venv --quiet
26 changes: 26 additions & 0 deletions src/basic_cleaning/MLproject
@@ -0,0 +1,26 @@
name: basic_cleaning
conda_env: conda.yml

entry_points:
  main:
    parameters:
      input_artifact:
        description: Fully qualified name of the input artifact to clean
        type: string

      output_artifact:
        description: Name for the cleaned output artifact
        type: string

      output_type:
        description: Type of the output artifact
        type: string

      output_description:
        description: Description of the output artifact
        type: string

      min_price:
        description: Minimum price to consider; rows below it are dropped as outliers
        type: float

      max_price:
        description: Maximum price to consider; rows above it are dropped as outliers
        type: float

    command: >-
      python run.py --input_artifact {input_artifact} --output_artifact {output_artifact} --output_type {output_type} --output_description {output_description} --min_price {min_price} --max_price {max_price}
9 changes: 9 additions & 0 deletions src/basic_cleaning/conda.yml
@@ -0,0 +1,9 @@
name: basic_cleaning
channels:
- conda-forge
- defaults
dependencies:
- pip=20.3.3
- numpy=1.20.1
- pip:
- wandb==0.10.31
105 changes: 105 additions & 0 deletions src/basic_cleaning/run.py
@@ -0,0 +1,105 @@
#!/usr/bin/env python
"""
Performs basic cleaning on the data and saves the results in Weights & Biases
"""
import argparse
import logging
import wandb
import pandas as pd


logging.basicConfig(level=logging.INFO, format="%(asctime)-15s %(message)s")
logger = logging.getLogger()


def go(args):

run = wandb.init(job_type="basic_cleaning")
run.config.update(args)

# Download input artifact. This will also log that this script is using this
# particular version of the artifact
# artifact_local_path = run.use_artifact(args.input_artifact).file()

logger.info(f'Downloading artifact {args.input_artifact}')
artifact_local_path = run.use_artifact(args.input_artifact).file()

# Read the data from the artifact
logger.info("Loading DataFrame")
df = pd.read_csv(artifact_local_path)

# Remove outliers
logger.info('Drop outliers and convert datetime')
idx = df['price'].between(args.min_price, args.max_price)
df = df[idx].copy()
df['last_review'] = pd.to_datetime(df['last_review'])

# Remove observations outside of NYC
idx = df['longitude'].between(-74.25, -73.50) & df['latitude'].between(40.5, 41.2)
df = df[idx].copy()

# Save dataframe
logger.info(f'Saving Dataframe {args.output_artifact}')
df.to_csv('clean_sample.csv', index=False)

# Upload Dataframe Artifact to W&B
artifact = wandb.Artifact(
args.output_artifact,
type=args.output_type,
description=args.output_description,
)
artifact.add_file("clean_sample.csv")
run.log_artifact(artifact)


if __name__ == "__main__":

parser = argparse.ArgumentParser(description=" A very basic data cleaning")


parser.add_argument(
"--input_artifact",
type=str,
help="the input artifact name",
required=True
)

parser.add_argument(
"--output_artifact",
type=str,
help="the output artifact name",
required=True
)

parser.add_argument(
"--output_type",
type=str,
help="the output artifact type",
required=True
)

parser.add_argument(
"--output_description",
type=str,
help="the output artifact description",
required=True
)

parser.add_argument(
"--min_price",
type=float,
help="Outlier Min price",
required=True
)

parser.add_argument(
"--max_price",
type=float,
help="Outlier Max price",
required=True
)


args = parser.parse_args()

go(args)
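The two filters above (price range and NYC bounding box) are easy to sanity-check outside the pipeline on a tiny hand-made DataFrame; the snippet below is only an illustrative sketch with made-up values, not part of the commit:

import pandas as pd

# One in-range row, one over-priced row, and one row outside the NYC bounding box.
df = pd.DataFrame({
    "price": [150, 5000, 120],
    "longitude": [-73.95, -73.95, -80.00],
    "latitude": [40.75, 40.75, 40.75],
    "last_review": ["2019-05-01", None, "2019-06-01"],
})

min_price, max_price = 10, 350  # example bounds standing in for etl.min_price / etl.max_price

df = df[df["price"].between(min_price, max_price)].copy()
df["last_review"] = pd.to_datetime(df["last_review"])
idx = df["longitude"].between(-74.25, -73.50) & df["latitude"].between(40.5, 41.2)
df = df[idx].copy()

print(df)  # only the first row survives both filters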
1 change: 1 addition & 0 deletions src/data_check/conda.yml
@@ -5,6 +5,7 @@ channels:
dependencies:
- pandas=1.1.4
- pytest=6.2.2
- numpy=1.20.1
- scipy=1.5.2
- pip=20.3.3
- pip:
11 changes: 11 additions & 0 deletions src/data_check/test_data.py
@@ -63,3 +63,14 @@ def test_similar_neigh_distrib(data: pd.DataFrame, ref_data: pd.DataFrame, kl_th
########################################################
# Implement here test_row_count and test_price_range #
########################################################
def test_row_count(data):
"""
Test that the size of the dataset is in range
"""
assert 15000 < data.shape[0] < 1000000

def test_price_range(data, min_price, max_price):
"""
Test that the price is in range
"""
assert data['price'].between(min_price, max_price).all()
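Both new tests depend on pytest fixtures (data, min_price, max_price) that the data_check step defines in its conftest.py, which is not part of this diff. A minimal sketch of what such a conftest usually looks like, assuming the command-line options forwarded by the step's entry point; this is hypothetical, for orientation only:

import pandas as pd
import pytest
import wandb


def pytest_addoption(parser):
    # Options passed on the pytest command line by the data_check step
    parser.addoption("--csv", action="store")
    parser.addoption("--min_price", action="store")
    parser.addoption("--max_price", action="store")


@pytest.fixture(scope="session")
def data(request):
    # Download the cleaned dataset artifact from W&B and load it as a DataFrame
    run = wandb.init(job_type="data_tests", resume=True)
    local_path = run.use_artifact(request.config.option.csv).file()
    return pd.read_csv(local_path)


@pytest.fixture(scope="session")
def min_price(request):
    return float(request.config.option.min_price)


@pytest.fixture(scope="session")
def max_price(request):
    return float(request.config.option.max_price)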
1 change: 1 addition & 0 deletions src/eda/conda.yml
@@ -6,6 +6,7 @@ dependencies:
- jupyterlab=3.0.12
- seaborn=0.11.1
- pandas=1.2.3
- numpy=1.20.1
- pip=20.3.3
- pandas-profiling=2.11.0
- pyarrow=2.0
1 change: 1 addition & 0 deletions src/train_random_forest/conda.yml
@@ -4,6 +4,7 @@ channels:
- defaults
dependencies:
- pandas=1.1.4
- numpy=1.20.1
- pip=20.3.3
- mlflow=1.14.1
- scikit-learn=0.24.1
35 changes: 29 additions & 6 deletions src/train_random_forest/run.py
@@ -9,6 +9,8 @@
import matplotlib.pyplot as plt

import mlflow
from mlflow.models import infer_signature

import json

import pandas as pd
@@ -54,7 +56,7 @@ def go(args):
######################################
# Use run.use_artifact(...).file() to get the train and validation artifact (args.trainval_artifact)
# and save the returned path in train_local_pat
trainval_local_path = # YOUR CODE HERE
trainval_local_path = run.use_artifact(args.trainval_artifact).file()
######################################

X = pd.read_csv(trainval_local_path)
@@ -76,6 +78,7 @@
######################################
# Fit the pipeline sk_pipe by calling the .fit method on X_train and y_train
# YOUR CODE HERE
sk_pipe.fit(X_train, y_train)
######################################

# Compute r2 and MAE
@@ -99,14 +102,29 @@
# HINT: use mlflow.sklearn.save_model
# YOUR CODE HERE
######################################

signature = infer_signature(X_val, y_pred)
mlflow.sklearn.save_model(
sk_pipe,
path="random_forest_dir",
serialization_format=mlflow.sklearn.SERIALIZATION_FORMAT_CLOUDPICKLE,
signature=signature,
input_example=X_val.iloc[:5],
)
######################################
# Upload the model we just exported to W&B
# HINT: use wandb.Artifact to create an artifact. Use args.output_artifact as artifact name, "model_export" as
# type, provide a description and add rf_config as metadata. Then, use the .add_dir method of the artifact instance
# you just created to add the "random_forest_dir" directory to the artifact, and finally use
# run.log_artifact to log the artifact to the run
# YOUR CODE HERE
logger.info('Create Artifact to store model')
artifact = wandb.Artifact(
args.output_artifact,
type="model_export",
description="Random Forest pipeline export",
)
artifact.add_dir(local_path='random_forest_dir')
run.log_artifact(artifact)
######################################

# Plot feature importance
@@ -117,6 +135,7 @@
run.summary['r2'] = r_squared
# Now log the variable "mae" under the key "mae".
# YOUR CODE HERE
run.summary["mae"] = mae
######################################

# Upload to W&B the feature importance visualization
@@ -158,7 +177,8 @@ def get_inference_pipeline(rf_config, max_tfidf_features):
# Build a pipeline with two steps:
# 1 - A SimpleImputer(strategy="most_frequent") to impute missing values
# 2 - A OneHotEncoder() step to encode the variable
non_ordinal_categorical_preproc = # YOUR CODE HERE
non_ordinal_categorical_preproc = make_pipeline(
SimpleImputer(strategy="most_frequent"),
OneHotEncoder()
)
######################################

# Let's impute the numerical columns to make sure we can handle missing values
@@ -217,8 +237,11 @@ def get_inference_pipeline(rf_config, max_tfidf_features):
# ColumnTransformer instance that we saved in the `preprocessor` variable, and a step called "random_forest"
# with the random forest instance that we just saved in the `random_forest` variable.
# HINT: Use the explicit Pipeline constructor so you can assign the names to the steps, do not use make_pipeline
sk_pipe = # YOUR CODE HERE

sk_pipe = Pipeline(steps=[
("preprocessor", preprocessor),
("random_forest", random_forest),
])

return sk_pipe, processed_features


@@ -277,4 +300,4 @@ def get_inference_pipeline(rf_config, max_tfidf_features):

args = parser.parse_args()

go(args)
go(args)
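Once the step above has run, the exported pipeline can be loaded back for inference with the standard MLflow API. The sketch below assumes the local "random_forest_dir" produced by mlflow.sklearn.save_model (or the downloaded W&B artifact directory) and a cleaned CSV on disk; neither path is prescribed by this commit:

import mlflow.sklearn
import pandas as pd

# Load the full inference pipeline (preprocessing + random forest).
pipe = mlflow.sklearn.load_model("random_forest_dir")

# The pipeline expects the raw cleaned feature columns, i.e. clean_sample.csv without the target.
sample = pd.read_csv("clean_sample.csv").drop(columns=["price"]).head(5)
print(pipe.predict(sample))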
