Commit c96fe3d

added more details about tutorials

emptymalei committed May 9, 2021
1 parent 69065fe commit c96fe3d
Showing 10 changed files with 797 additions and 9 deletions.
117 changes: 117 additions & 0 deletions docs/tutorials/rideindego/config.md
# Rideindego Config

We use a single global config for the whole project. We highly recommend placing the config JSON file inside the artifacts folder, so that everything can be restored later by syncing the whole artifacts folder.

```json
{
  "name": "rideindego prices",
  "etl": {
    "cache_folder": "cache",
    "raw": {
      "local": "data/raw",
      "remote": "s3://haferml-rideindego/marshall/data/raw",
      "stations": {
        "name": "stations.parquet",
        "local": "data/raw/stations",
        "remote": "s3://haferml-rideindego/marshall/data/raw/stations"
      },
      "trip_data": {
        "source": "https://www.rideindego.com/about/data/",
        "name": "trips.parquet",
        "local": "data/raw/trip_data",
        "remote": "s3://haferml-rideindego/marshall/data/raw/trip_data"
      }
    },
    "transformed": {
      "local": "dataset/etl",
      "remote": "s3://haferml-rideindego/marshall/dataset/etl",
      "stations": {
        "name": "stations.parquet",
        "local": "dataset/etl",
        "remote": "s3://haferml-rideindego/marshall/dataset/etl"
      },
      "trip_data": {
        "name": "trips.parquet",
        "local": "dataset/etl",
        "remote": "s3://haferml-rideindego/marshall/dataset/etl"
      }
    }
  },
  "preprocessing": {
    "dataset": {
      "local": "model/dataset",
      "remote": "s3://haferml-rideindego/marshall/model/dataset",
      "preprocessed": {
        "name": "preprocessed.parquet",
        "local": "model/dataset",
        "remote": "s3://haferml-rideindego/marshall/model/dataset"
      }
    },
    "features": [
      "passholder_type",
      "bike_type",
      "trip_route_category",
      "hour",
      "weekday",
      "month"
    ],
    "targets": [
      "duration"
    ],
    "feature_engineering": {},
    "target_engineering": {}
  },
  "model": {
    "rf": {
      "features": [
        "passholder_type",
        "bike_type",
        "trip_route_category",
        "hour",
        "weekday",
        "month"
      ],
      "targets": [
        "duration"
      ],
      "encoding": {
        "categorical_columns": [
          "passholder_type",
          "bike_type",
          "trip_route_category"
        ]
      },
      "random_state": 42,
      "test_size": 0.3,
      "cv": {
        "folds": 3,
        "verbose": 6,
        "n_jobs": 1,
        "n_iter": 5
      },
      "hyperparameters": {},
      "artifacts": {
        "dataset": {
          "local": "model/dataset",
          "remote": "s3://haferml-rideindego/marshall/model/dataset"
        },
        "model": {
          "name": "model.joblib",
          "local": "model/model",
          "remote": "s3://haferml-rideindego/marshall/model/model"
        },
        "prediction": {
          "local": "prediction",
          "remote": "s3://haferml-rideindego/marshall/prediction"
        },
        "performance": {
          "local": "performance",
          "remote": "s3://haferml-rideindego/marshall/performance"
        }
      }
    }
  }
}
```
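
The scripts in the following sections all consume this file through the `Config` class from `haferml.blend.config`. As a minimal sketch of the access pattern used below (the exact behavior of `Config` is inferred from that usage): nested sections are addressed with a list of keys, and resolved keys such as `local_absolute` carry paths made absolute against a base folder.

```python
import os

from dotenv import load_dotenv
from haferml.blend.config import Config

load_dotenv()

# BASE_FOLDER and CONFIG_FILE come from the environment, as in the scripts below
_CONFIG = Config(os.getenv("CONFIG_FILE"), base_folder=os.getenv("BASE_FOLDER"))

# Nested sections are addressed with a list of keys
trip_data_config = _CONFIG[["etl", "raw", "trip_data"]]
print(trip_data_config["source"])          # https://www.rideindego.com/about/data/
print(trip_data_config["local"])           # data/raw/trip_data
print(trip_data_config["local_absolute"])  # the same path, resolved under BASE_FOLDER
```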
58 changes: 58 additions & 0 deletions docs/tutorials/rideindego/extract.md
## Extract

As a first step, we download all the data files from the official [Rideindego](https://www.rideindego.com/about/data/) website.

The web page contains multiple links to data files. We first collect all of these links using `DataLinkHTMLExtractor`, defined in `utils.fetch`, and then run through each link and download the corresponding zip file using `DataDownloader`.


```python
import os

import click
from dotenv import load_dotenv
from haferml.blend.config import Config
from haferml.sync.local import prepare_folders
from loguru import logger

from utils.fetch import DataDownloader, DataLinkHTMLExtractor
from utils.fetch import get_page_html as _get_page_html

load_dotenv()


@click.command()
@click.option(
    "-c",
    "--config",
    type=str,
    default=os.getenv("CONFIG_FILE"),
    help="Path to config file",
)
def extract(config):
    base_folder = os.getenv("BASE_FOLDER")
    _CONFIG = Config(config, base_folder=base_folder)

    etl_trip_data_config = _CONFIG.get(["etl", "raw", "trip_data"])
    logger.info(f"Using config: {etl_trip_data_config}")

    # create the local raw-data folders
    prepare_folders(etl_trip_data_config["local"], base_folder)

    # extract the data file links from the download page
    source_link = etl_trip_data_config["source"]
    logger.info(f"Will download from {source_link}")
    page = _get_page_html(source_link).get("data", {})
    page_extractor = DataLinkHTMLExtractor(page)
    links = page_extractor.get_data_links()
    logger.info(f"Extracted links from {source_link}: {links}")

    # download the zip files
    dld = DataDownloader(
        links, data_type="zip", folder=etl_trip_data_config["local_absolute"]
    )
    dld.run()


if __name__ == "__main__":
    extract()
```
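
`get_page_html`, `DataLinkHTMLExtractor`, and `DataDownloader` are part of the project's own `utils.fetch` module and are not reproduced here. For a rough idea of what the extractor does, here is an illustrative sketch only, assuming `requests` and `BeautifulSoup`; the real implementation may differ:

```python
import requests
from bs4 import BeautifulSoup


def get_page_html(url):
    """Illustrative sketch: fetch a page and wrap the HTML as {"data": ...}."""
    response = requests.get(url)
    response.raise_for_status()
    return {"data": response.text}


class DataLinkHTMLExtractor:
    """Illustrative sketch: collect links to zip archives from page HTML."""

    def __init__(self, page_html):
        self.soup = BeautifulSoup(page_html, "html.parser")

    def get_data_links(self):
        # Keep only anchors that point at downloadable zip files
        return [
            a["href"]
            for a in self.soup.find_all("a", href=True)
            if a["href"].endswith(".zip")
        ]
```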
1 change: 1 addition & 0 deletions docs/tutorials/rideindego/index.md
This is a tutorial for rideindego data.

The code for this project is [on GitHub](https://github.com/emptymalei/haferml-tutorials/tree/main/tutorials/rideindego/models/marshall_hafer).

Everything in the artifacts folder can be synced to AWS S3 using the `haferml.sync.aws` module, or uploaded manually with the AWS CLI. The whole project can then be restored by re-downloading the artifacts.
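
For the manual route, the sketch below walks the artifacts folder and uploads each file with `boto3`; the bucket and prefix are the ones used in this tutorial's config, and the local folder name is an assumption:

```python
import os

import boto3


def sync_folder_to_s3(local_folder, bucket, prefix):
    """Upload every file under local_folder to s3://bucket/prefix, keeping relative paths."""
    s3 = boto3.client("s3")
    for root, _, files in os.walk(local_folder):
        for file_name in files:
            local_path = os.path.join(root, file_name)
            key = os.path.relpath(local_path, local_folder).replace(os.sep, "/")
            s3.upload_file(local_path, bucket, f"{prefix}/{key}")


sync_folder_to_s3("artifacts", "haferml-rideindego", "marshall")
```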
85 changes: 85 additions & 0 deletions docs/tutorials/rideindego/prediction.md
## Prediction

To use the model, we reload it from the artifacts saved in the training step. This is straightforward, as the `ModelSet` class in the following example shows.


```python
import os

import click
import joblib
import pandas as pd
from dotenv import load_dotenv
from haferml.blend.config import Config
from haferml.model.pipeline import ModelSetX
from haferml.sync.local import prepare_folders
from loguru import logger

load_dotenv()


class ModelSet(ModelSetX):
    """
    The core of the model including hyperparameters
    """

    def __init__(self, config, base_folder):
        super(ModelSet, self).__init__(config, base_folder)

    def reload(self):
        model_folder = self.artifacts["model"]["local_absolute"]

        logger.info("Reload models")
        self.model = joblib.load(
            os.path.join(
                self.base_folder,
                model_folder,
                self.artifacts["model"]["name"],
            )
        )

    def predict(self, data):
        return self.model.predict(data)


@click.command()
@click.option(
    "-c",
    "--config",
    type=str,
    default=os.getenv("CONFIG_FILE"),
    help="Path to config file",
)
def predict(config):
    base_folder = os.getenv("BASE_FOLDER")
    logger.debug(f"base folder is: {base_folder}")
    logger.debug(f"config: {config}")

    _CONFIG = Config(config, base_folder=base_folder)
    rf_config = _CONFIG[["model", "rf"]]

    # create folders
    prepare_folders(
        _CONFIG[["model", "rf", "artifacts", "model", "local"]],
        base_folder=base_folder,
    )

    # load a sample row from the test set saved during training
    dataset_folder = _CONFIG[["model", "rf", "artifacts", "dataset", "local_absolute"]]
    df = pd.read_parquet(os.path.join(dataset_folder, "model_X_test.parquet")).sample(1)

    # reload the model and predict
    logger.debug("Prepare modelset and dataset")
    M = ModelSet(config=rf_config, base_folder=base_folder)
    M.reload()
    logger.info(M.predict(df))


if __name__ == "__main__":
    predict()
```
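
Since the saved model is a plain joblib artifact, it can also be loaded without `ModelSet`. The sketch below composes the path from the `model.rf.artifacts` section of the config; the exact layout under `BASE_FOLDER` is an assumption:

```python
import os

import joblib
import pandas as pd

base_folder = os.getenv("BASE_FOLDER")

# "model/model" and "model.joblib" come from model.rf.artifacts in the config
model = joblib.load(os.path.join(base_folder, "model/model", "model.joblib"))

# Score one row from the held-out test set saved during training
dataset_folder = os.path.join(base_folder, "model/dataset")
df = pd.read_parquet(os.path.join(dataset_folder, "model_X_test.parquet")).sample(1)
print(model.predict(df))
```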
90 changes: 90 additions & 0 deletions docs/tutorials/rideindego/preprocessing.md
## Preprocessing

Preprocessing is implemented as ordered member functions of a class that inherits from `BasePreProcessor`: each step is registered with the `@attributes(order=...)` decorator, and the steps run in order, as sketched after the code below.


```python
import datetime
import os

import click
import pandas as pd
from dotenv import load_dotenv
from haferml.blend.config import Config
from haferml.preprocess.ingredients import attributes
from haferml.preprocess.pipeline import BasePreProcessor
from haferml.sync.local import prepare_folders
from loguru import logger

logger.info(f"Experiment started at: {datetime.datetime.now()}")
load_dotenv()


def load_data(data_path):
    if data_path.endswith(".parquet"):
        dataframe = pd.read_parquet(data_path)
    else:
        raise ValueError(f"Input path file format is not supported: {data_path}")

    return dataframe


class Preprocess(BasePreProcessor):
    """
    Preprocess the dataset.
    There is very little to preprocess in this example, but we keep this class
    for illustration purposes.
    """

    def __init__(self, config):
        super(Preprocess, self).__init__(config=config)
        self.feature_cols = self.config["features"]
        self.target_cols = self.config["targets"]

    @attributes(order=1)
    def _drop_unused_columns(self, dataframe):
        # keep only the feature and target columns
        self.dataframe = dataframe[self.feature_cols + self.target_cols]
        return self.dataframe


@click.command()
@click.option(
    "-c",
    "--config",
    type=str,
    default=os.getenv("CONFIG_FILE"),
    help="Path to config file",
)
def preprocess(config):
    base_folder = os.getenv("BASE_FOLDER")

    _CONFIG = Config(config, base_folder=base_folder)

    preprocessed_data_config = _CONFIG[["preprocessing", "dataset", "preprocessed"]]
    transformed_trip_data_config = _CONFIG[["etl", "transformed", "trip_data"]]

    # create folders
    prepare_folders(preprocessed_data_config["local"], base_folder=base_folder)
    prepare_folders(transformed_trip_data_config["local"], base_folder=base_folder)

    # load transformed data
    df = load_data(transformed_trip_data_config["name_absolute"])

    # preprocess
    pr = Preprocess(config=_CONFIG[["preprocessing"]])
    df = pr.run(df)

    # save
    df.to_parquet(preprocessed_data_config["name_absolute"], index=False)
    logger.info(f'Saved preprocessed data to {preprocessed_data_config["name_absolute"]}')

    return df


if __name__ == "__main__":
    preprocess()
```
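
To add more steps, define further decorated methods with higher `order` values. A sketch, assuming `run` chains each step's return value into the next; the hour filter itself is made up for illustration:

```python
class PreprocessWithMoreSteps(Preprocess):
    @attributes(order=2)
    def _keep_valid_hours(self, dataframe):
        # Runs after _drop_unused_columns (order=1); assumed chaining behavior
        self.dataframe = dataframe[dataframe["hour"].between(0, 23)]
        return self.dataframe
```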