# Rideindego Config

We use a single global config for the whole project. It is highly recommended to place the config JSON file inside the artifacts folder so that everything can be restored later when syncing the whole artifacts folder.
```json
{
  "name": "rideindego prices",
  "etl": {
    "cache_folder": "cache",
    "raw": {
      "local": "data/raw",
      "remote": "s3://haferml-rideindego/marshall/data/raw",
      "stations": {
        "name": "stations.parquet",
        "local": "data/raw/stations",
        "remote": "s3://haferml-rideindego/marshall/data/raw/stations"
      },
      "trip_data": {
        "source": "https://www.rideindego.com/about/data/",
        "name": "trips.parquet",
        "local": "data/raw/trip_data",
        "remote": "s3://haferml-rideindego/marshall/data/raw/trip_data"
      }
    },
    "transformed": {
      "local": "dataset/etl",
      "remote": "s3://haferml-rideindego/marshall/dataset/etl",
      "stations": {
        "name": "stations.parquet",
        "local": "dataset/etl",
        "remote": "s3://haferml-rideindego/marshall/dataset/etl"
      },
      "trip_data": {
        "name": "trips.parquet",
        "local": "dataset/etl",
        "remote": "s3://haferml-rideindego/marshall/dataset/etl"
      }
    }
  },
  "preprocessing": {
    "dataset": {
      "local": "model/dataset",
      "remote": "s3://haferml-rideindego/marshall/model/dataset",
      "preprocessed": {
        "name": "preprocessed.parquet",
        "local": "model/dataset",
        "remote": "s3://haferml-rideindego/marshall/model/dataset"
      }
    },
    "features": [
      "passholder_type",
      "bike_type",
      "trip_route_category",
      "hour",
      "weekday",
      "month"
    ],
    "targets": [
      "duration"
    ],
    "feature_engineering": {},
    "target_engineering": {}
  },
  "model": {
    "rf": {
      "features": [
        "passholder_type",
        "bike_type",
        "trip_route_category",
        "hour",
        "weekday",
        "month"
      ],
      "targets": [
        "duration"
      ],
      "encoding": {
        "categorical_columns": [
          "passholder_type",
          "bike_type",
          "trip_route_category"
        ]
      },
      "random_state": 42,
      "test_size": 0.3,
      "cv": {
        "folds": 3,
        "verbose": 6,
        "n_jobs": 1,
        "n_iter": 5
      },
      "hyperparameters": {},
      "artifacts": {
        "dataset": {
          "local": "model/dataset",
          "remote": "s3://haferml-rideindego/marshall/model/dataset"
        },
        "model": {
          "name": "model.joblib",
          "local": "model/model",
          "remote": "s3://haferml-rideindego/marshall/model/model"
        },
        "prediction": {
          "local": "prediction",
          "remote": "s3://haferml-rideindego/marshall/prediction"
        },
        "performance": {
          "local": "performance",
          "remote": "s3://haferml-rideindego/marshall/performance"
        }
      }
    }
  }
}
```
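
The scripts below consume this file through `haferml.blend.config.Config`. Here is a minimal sketch of the access pattern, pieced together from the scripts in this project; the `*_absolute` keys, which resolve relative paths against `BASE_FOLDER`, are inferred from how those scripts use them:

```python
import os

from dotenv import load_dotenv
from haferml.blend.config import Config

load_dotenv()  # the scripts expect CONFIG_FILE and BASE_FOLDER in the environment

# load the global config; relative paths are anchored at base_folder
_CONFIG = Config(os.getenv("CONFIG_FILE"), base_folder=os.getenv("BASE_FOLDER"))

# values are addressed by their path in the JSON tree
trip_data_config = _CONFIG.get(["etl", "raw", "trip_data"])
rf_config = _CONFIG[["model", "rf"]]  # list indexing works as well

print(trip_data_config["local"])           # relative path, as written in the config
print(trip_data_config["local_absolute"])  # resolved against BASE_FOLDER (inferred)
```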
## Extract

As a first step, we download all the data files from the official website of [Rideindego](https://www.rideindego.com/about/data/).

The web page contains multiple links to the data files. We first collect all the links using `DataLinkHTMLExtractor`, defined in `utils.fetch`, then run through each link and download the zip file using `DataDownloader`.
```python
import os

import click
from dotenv import load_dotenv
from loguru import logger

from haferml.blend.config import Config
from haferml.sync.local import prepare_folders
from utils.fetch import DataDownloader, DataLinkHTMLExtractor
from utils.fetch import get_page_html as _get_page_html

load_dotenv()


@click.command()
@click.option(
    "-c",
    "--config",
    type=str,
    default=os.getenv("CONFIG_FILE"),
    help="Path to config file",
)
def extract(config):
    """Download the raw trip data archives listed on the Rideindego data page."""
    base_folder = os.getenv("BASE_FOLDER")
    _CONFIG = Config(config, base_folder=base_folder)

    etl_trip_data_config = _CONFIG.get(["etl", "raw", "trip_data"])
    logger.info(f"Using config: {etl_trip_data_config}")

    # create folders
    prepare_folders(etl_trip_data_config["local"], base_folder)

    # extract the data file links from the source page
    source_link = etl_trip_data_config["source"]
    logger.info(f"Will download from {source_link}")
    page = _get_page_html(source_link).get("data", {})
    page_extractor = DataLinkHTMLExtractor(page)
    links = page_extractor.get_data_links()
    logger.info(f"Extracted links from {source_link}: {links}")

    # download the data
    dld = DataDownloader(
        links, data_type="zip", folder=etl_trip_data_config["local_absolute"]
    )
    dld.run()


if __name__ == "__main__":
    extract()
```
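
`utils.fetch` is project-local and its source is not shown here, so the following is a hypothetical sketch of what the two helpers could look like, built with `requests` and BeautifulSoup. Only the class names, `get_data_links`, and the `run` entry point are taken from the usage above; everything else is an assumption:

```python
import os
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup


class DataLinkHTMLExtractor:
    """Hypothetical sketch: pull zip links out of the downloaded page HTML."""

    def __init__(self, page_html):
        self.soup = BeautifulSoup(page_html, "html.parser")

    def get_data_links(self):
        # keep only anchors that point at zip archives
        return [
            a["href"]
            for a in self.soup.find_all("a", href=True)
            if a["href"].endswith(".zip")
        ]


class DataDownloader:
    """Hypothetical sketch: stream each archive into the target folder."""

    def __init__(self, links, data_type="zip", folder="."):
        self.links = links
        self.folder = folder

    def run(self):
        os.makedirs(self.folder, exist_ok=True)
        for link in self.links:
            filename = os.path.basename(urlparse(link).path)
            with requests.get(link, stream=True, timeout=60) as r:
                r.raise_for_status()
                with open(os.path.join(self.folder, filename), "wb") as fh:
                    for chunk in r.iter_content(chunk_size=1 << 20):
                        fh.write(chunk)
```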
## Prediction

To use the model, we reload it from the artifacts saved in the training step. Reloading is straightforward, as the `ModelSet` class in the following example shows.
```python
import os

import click
import joblib
import pandas as pd
from dotenv import load_dotenv
from haferml.blend.config import Config
from haferml.model.pipeline import ModelSetX
from haferml.sync.local import prepare_folders
from loguru import logger

load_dotenv()


class ModelSet(ModelSetX):
    """
    The core of the model, including hyperparameters.
    """

    def __init__(self, config, base_folder):
        super().__init__(config, base_folder)

    def reload(self):
        model_folder = self.artifacts["model"]["local_absolute"]

        logger.info("Reload models")
        self.model = joblib.load(
            os.path.join(
                self.base_folder,
                model_folder,
                self.artifacts["model"]["name"],
            )
        )

    def predict(self, data):
        return self.model.predict(data)


@click.command()
@click.option(
    "-c",
    "--config",
    type=str,
    default=os.getenv("CONFIG_FILE"),
    help="Path to config file",
)
def predict(config):
    base_folder = os.getenv("BASE_FOLDER")
    logger.debug(f"base folder is: {base_folder}")
    logger.debug(f"config: {config}")

    _CONFIG = Config(config, base_folder=base_folder)
    rf_config = _CONFIG[["model", "rf"]]

    # create folders
    prepare_folders(
        _CONFIG[["model", "rf", "artifacts", "model", "local"]],
        base_folder=base_folder,
    )

    # load a sample row from the held-out test set
    dataset_folder = _CONFIG[["model", "rf", "artifacts", "dataset", "local_absolute"]]
    df = pd.read_parquet(
        os.path.join(dataset_folder, "model_X_test.parquet")
    ).sample(1)

    # model
    logger.debug("Prepare modelset and dataset")
    M = ModelSet(config=rf_config, base_folder=base_folder)
    M.reload()
    logger.info(M.predict(df))


if __name__ == "__main__":
    predict()
```
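
The saved artifact is simply a joblib-serialized estimator, so `reload` amounts to a plain `joblib.load`. Here is a self-contained sketch of that round trip, assuming the training step fitted a scikit-learn random forest (suggested by the `rf` config block, but not shown here):

```python
import joblib
import numpy as np
from sklearn.ensemble import RandomForestRegressor

# fit a stand-in model on toy data
rng = np.random.default_rng(42)
X = rng.normal(size=(100, 6))  # six features, matching the config
y = rng.normal(size=100)
model = RandomForestRegressor(random_state=42).fit(X, y)

# what the training step does, in essence
joblib.dump(model, "model.joblib")

# what ModelSet.reload does, in essence
reloaded = joblib.load("model.joblib")
print(reloaded.predict(X[:1]))
```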
## Preprocessing

Preprocessing inherits from `BasePreProcessor` and defines each step as an ordered member function, marked with the `@attributes(order=...)` decorator.
```python
import datetime
import os

import click
import pandas as pd
from dotenv import load_dotenv
from haferml.blend.config import Config
from haferml.preprocess.ingredients import attributes
from haferml.preprocess.pipeline import BasePreProcessor
from haferml.sync.local import prepare_folders
from loguru import logger

logger.info(f"Experiment started at: {datetime.datetime.now()}")
load_dotenv()


def load_data(data_path):
    if data_path.endswith(".parquet"):
        dataframe = pd.read_parquet(data_path)
    else:
        raise ValueError(f"Input path file format is not supported: {data_path}")

    return dataframe


class Preprocess(BasePreProcessor):
    """
    Preprocess the dataset.

    There is very little to preprocess in this example, but we keep the class
    for illustration purposes.
    """

    def __init__(self, config):
        super().__init__(config=config)
        self.feature_cols = self.config["features"]
        self.target_cols = self.config["targets"]

    @attributes(order=1)
    def _drop_unused_columns(self, dataframe):
        # keep only the feature and target columns
        self.dataframe = dataframe[self.feature_cols + self.target_cols]

        return self.dataframe


@click.command()
@click.option(
    "-c",
    "--config",
    type=str,
    default=os.getenv("CONFIG_FILE"),
    help="Path to config file",
)
def preprocess(config):
    base_folder = os.getenv("BASE_FOLDER")

    _CONFIG = Config(config, base_folder=base_folder)

    preprocessed_data_config = _CONFIG[["preprocessing", "dataset", "preprocessed"]]
    transformed_trip_data_config = _CONFIG[["etl", "transformed", "trip_data"]]

    # create folders
    prepare_folders(preprocessed_data_config["local"], base_folder=base_folder)
    prepare_folders(transformed_trip_data_config["local"], base_folder=base_folder)

    # load transformed data
    df = load_data(transformed_trip_data_config["name_absolute"])

    # preprocess
    pr = Preprocess(config=_CONFIG[["preprocessing"]])
    df = pr.run(df)

    # save
    df.to_parquet(preprocessed_data_config["name_absolute"], index=False)
    logger.info(f'Saved preprocessed data to {preprocessed_data_config["name_absolute"]}')

    return df


if __name__ == "__main__":
    preprocess()
```
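
Additional steps can be added as further decorated member functions. The sketch below assumes that `run` executes the decorated methods in ascending `order`, which is inferred from the decorator usage rather than shown here; the `_scale_duration` step and its seconds-to-minutes conversion are purely illustrative:

```python
class PreprocessWithMoreSteps(Preprocess):
    """Hypothetical extension: a second step that runs after the column drop."""

    @attributes(order=2)
    def _scale_duration(self, dataframe):
        # hypothetical: rescale the duration target (units assumed, not confirmed)
        dataframe["duration"] = dataframe["duration"] / 60
        self.dataframe = dataframe

        return self.dataframe
```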