Commit c96fe3d

added more details about tutorials

emptymalei committed May 9, 2021
1 parent 69065fe commit c96fe3d
Showing 10 changed files with 797 additions and 9 deletions.
117 changes: 117 additions & 0 deletions docs/tutorials/rideindego/config.md
# Rideindego Config

We use a single global config for the whole project. We highly recommend placing the config JSON file inside the artifacts folder, so that everything can be restored later by syncing the whole artifacts folder.

```json
{
  "name": "rideindego prices",
  "etl": {
    "cache_folder": "cache",
    "raw": {
      "local": "data/raw",
      "remote": "s3://haferml-rideindego/marshall/data/raw",
      "stations": {
        "name": "stations.parquet",
        "local": "data/raw/stations",
        "remote": "s3://haferml-rideindego/marshall/data/raw/stations"
      },
      "trip_data": {
        "source": "https://www.rideindego.com/about/data/",
        "name": "trips.parquet",
        "local": "data/raw/trip_data",
        "remote": "s3://haferml-rideindego/marshall/data/raw/trip_data"
      }
    },
    "transformed": {
      "local": "dataset/etl",
      "remote": "s3://haferml-rideindego/marshall/dataset/etl",
      "stations": {
        "name": "stations.parquet",
        "local": "dataset/etl",
        "remote": "s3://haferml-rideindego/marshall/dataset/etl"
      },
      "trip_data": {
        "name": "trips.parquet",
        "local": "dataset/etl",
        "remote": "s3://haferml-rideindego/marshall/dataset/etl"
      }
    }
  },
  "preprocessing": {
    "dataset": {
      "local": "model/dataset",
      "remote": "s3://haferml-rideindego/marshall/model/dataset",
      "preprocessed": {
        "name": "preprocessed.parquet",
        "local": "model/dataset",
        "remote": "s3://haferml-rideindego/marshall/model/dataset"
      }
    },
    "features": [
      "passholder_type",
      "bike_type",
      "trip_route_category",
      "hour",
      "weekday",
      "month"
    ],
    "targets": [
      "duration"
    ],
    "feature_engineering": {},
    "target_engineering": {}
  },
  "model": {
    "rf": {
      "features": [
        "passholder_type",
        "bike_type",
        "trip_route_category",
        "hour",
        "weekday",
        "month"
      ],
      "targets": [
        "duration"
      ],
      "encoding": {
        "categorical_columns": [
          "passholder_type",
          "bike_type",
          "trip_route_category"
        ]
      },
      "random_state": 42,
      "test_size": 0.3,
      "cv": {
        "folds": 3,
        "verbose": 6,
        "n_jobs": 1,
        "n_iter": 5
      },
      "hyperparameters": {},
      "artifacts": {
        "dataset": {
          "local": "model/dataset",
          "remote": "s3://haferml-rideindego/marshall/model/dataset"
        },
        "model": {
          "name": "model.joblib",
          "local": "model/model",
          "remote": "s3://haferml-rideindego/marshall/model/model"
        },
        "prediction": {
          "local": "prediction",
          "remote": "s3://haferml-rideindego/marshall/prediction"
        },
        "performance": {
          "local": "performance",
          "remote": "s3://haferml-rideindego/marshall/performance"
        }
      }
    }
  }
}
```
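
The scripts in the following sections all consume this file through the `Config` class from `haferml.blend.config`. As a minimal sketch of the access pattern used below (the exact behavior of `Config` is inferred from that usage): nested sections are addressed with a list of keys, and resolved keys such as `local_absolute` carry paths made absolute against a base folder.

```python
import os

from dotenv import load_dotenv
from haferml.blend.config import Config

load_dotenv()

# BASE_FOLDER and CONFIG_FILE come from the environment, as in the scripts below
_CONFIG = Config(os.getenv("CONFIG_FILE"), base_folder=os.getenv("BASE_FOLDER"))

# Nested sections are addressed with a list of keys
trip_data_config = _CONFIG[["etl", "raw", "trip_data"]]
print(trip_data_config["source"])          # https://www.rideindego.com/about/data/
print(trip_data_config["local"])           # data/raw/trip_data
print(trip_data_config["local_absolute"])  # the same path, resolved under BASE_FOLDER
```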
58 changes: 58 additions & 0 deletions docs/tutorials/rideindego/extract.md
## Extract

As a first step, we download all the data files from the official [Rideindego](https://www.rideindego.com/about/data/) website.

The web page contains multiple links to data files. We first collect all of these links using `DataLinkHTMLExtractor`, defined in `utils.fetch`, and then run through each link and download the corresponding zip file using `DataDownloader`.


```python
import os

import click
from dotenv import load_dotenv
from haferml.blend.config import Config
from haferml.sync.local import prepare_folders
from loguru import logger

from utils.fetch import DataDownloader, DataLinkHTMLExtractor
from utils.fetch import get_page_html as _get_page_html

load_dotenv()


@click.command()
@click.option(
    "-c",
    "--config",
    type=str,
    default=os.getenv("CONFIG_FILE"),
    help="Path to config file",
)
def extract(config):
    base_folder = os.getenv("BASE_FOLDER")
    _CONFIG = Config(config, base_folder=base_folder)

    etl_trip_data_config = _CONFIG.get(["etl", "raw", "trip_data"])
    logger.info(f"Using config: {etl_trip_data_config}")

    # create the local raw-data folders
    prepare_folders(etl_trip_data_config["local"], base_folder)

    # extract the data file links from the download page
    source_link = etl_trip_data_config["source"]
    logger.info(f"Will download from {source_link}")
    page = _get_page_html(source_link).get("data", {})
    page_extractor = DataLinkHTMLExtractor(page)
    links = page_extractor.get_data_links()
    logger.info(f"Extracted links from {source_link}: {links}")

    # download the zip files
    dld = DataDownloader(
        links, data_type="zip", folder=etl_trip_data_config["local_absolute"]
    )
    dld.run()


if __name__ == "__main__":
    extract()
```
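
`get_page_html`, `DataLinkHTMLExtractor`, and `DataDownloader` are part of the project's own `utils.fetch` module and are not reproduced here. For a rough idea of what the extractor does, here is an illustrative sketch only, assuming `requests` and `BeautifulSoup`; the real implementation may differ:

```python
import requests
from bs4 import BeautifulSoup


def get_page_html(url):
    """Illustrative sketch: fetch a page and wrap the HTML as {"data": ...}."""
    response = requests.get(url)
    response.raise_for_status()
    return {"data": response.text}


class DataLinkHTMLExtractor:
    """Illustrative sketch: collect links to zip archives from page HTML."""

    def __init__(self, page_html):
        self.soup = BeautifulSoup(page_html, "html.parser")

    def get_data_links(self):
        # Keep only anchors that point at downloadable zip files
        return [
            a["href"]
            for a in self.soup.find_all("a", href=True)
            if a["href"].endswith(".zip")
        ]
```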
1 change: 1 addition & 0 deletions docs/tutorials/rideindego/index.md
This is a tutorial for rideindego data.

The code for this project is [on GitHub](https://github.com/emptymalei/haferml-tutorials/tree/main/tutorials/rideindego/models/marshall_hafer).

Everything in the artifacts folder can be synced to AWS S3 using the `haferml.sync.aws` module, or uploaded manually with the AWS CLI. The whole project can then be restored by re-downloading the artifacts.
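
For the manual route, the sketch below walks the artifacts folder and uploads each file with `boto3`; the bucket and prefix are the ones used in this tutorial's config, and the local folder name is an assumption:

```python
import os

import boto3


def sync_folder_to_s3(local_folder, bucket, prefix):
    """Upload every file under local_folder to s3://bucket/prefix, keeping relative paths."""
    s3 = boto3.client("s3")
    for root, _, files in os.walk(local_folder):
        for file_name in files:
            local_path = os.path.join(root, file_name)
            key = os.path.relpath(local_path, local_folder).replace(os.sep, "/")
            s3.upload_file(local_path, bucket, f"{prefix}/{key}")


sync_folder_to_s3("artifacts", "haferml-rideindego", "marshall")
```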
85 changes: 85 additions & 0 deletions docs/tutorials/rideindego/prediction.md
## Prediction

To use the model, we reload it from the artifacts saved in the training step. This is straightforward, as the `ModelSet` class in the following example shows.


```python
import os

import click
import joblib
import pandas as pd
from dotenv import load_dotenv
from haferml.blend.config import Config
from haferml.model.pipeline import ModelSetX
from haferml.sync.local import prepare_folders
from loguru import logger

load_dotenv()


class ModelSet(ModelSetX):
    """
    The core of the model including hyperparameters
    """

    def __init__(self, config, base_folder):
        super(ModelSet, self).__init__(config, base_folder)

    def reload(self):
        model_folder = self.artifacts["model"]["local_absolute"]

        logger.info("Reload models")
        self.model = joblib.load(
            os.path.join(
                self.base_folder,
                model_folder,
                self.artifacts["model"]["name"],
            )
        )

    def predict(self, data):
        return self.model.predict(data)


@click.command()
@click.option(
    "-c",
    "--config",
    type=str,
    default=os.getenv("CONFIG_FILE"),
    help="Path to config file",
)
def predict(config):
    base_folder = os.getenv("BASE_FOLDER")
    logger.debug(f"base folder is: {base_folder}")
    logger.debug(f"config: {config}")

    _CONFIG = Config(config, base_folder=base_folder)
    rf_config = _CONFIG[["model", "rf"]]

    # create folders
    prepare_folders(
        _CONFIG[["model", "rf", "artifacts", "model", "local"]],
        base_folder=base_folder,
    )

    # load a sample row from the test set saved during training
    dataset_folder = _CONFIG[["model", "rf", "artifacts", "dataset", "local_absolute"]]
    df = pd.read_parquet(os.path.join(dataset_folder, "model_X_test.parquet")).sample(1)

    # reload the model and predict
    logger.debug("Prepare modelset and dataset")
    M = ModelSet(config=rf_config, base_folder=base_folder)
    M.reload()
    logger.info(M.predict(df))


if __name__ == "__main__":
    predict()
```
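
Since the saved model is a plain joblib artifact, it can also be loaded without `ModelSet`. The sketch below composes the path from the `model.rf.artifacts` section of the config; the exact layout under `BASE_FOLDER` is an assumption:

```python
import os

import joblib
import pandas as pd

base_folder = os.getenv("BASE_FOLDER")

# "model/model" and "model.joblib" come from model.rf.artifacts in the config
model = joblib.load(os.path.join(base_folder, "model/model", "model.joblib"))

# Score one row from the held-out test set saved during training
dataset_folder = os.path.join(base_folder, "model/dataset")
df = pd.read_parquet(os.path.join(dataset_folder, "model_X_test.parquet")).sample(1)
print(model.predict(df))
```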
90 changes: 90 additions & 0 deletions docs/tutorials/rideindego/preprocessing.md
## Preprocessing

Preprocessing is implemented as ordered member functions of a class that inherits from `BasePreProcessor`: each step is registered with the `@attributes(order=...)` decorator, and the steps run in order, as sketched after the code below.


```python
import datetime
import os

import click
import pandas as pd
from dotenv import load_dotenv
from haferml.blend.config import Config
from haferml.preprocess.ingredients import attributes
from haferml.preprocess.pipeline import BasePreProcessor
from haferml.sync.local import prepare_folders
from loguru import logger

logger.info(f"Experiment started at: {datetime.datetime.now()}")
load_dotenv()


def load_data(data_path):
    if data_path.endswith(".parquet"):
        dataframe = pd.read_parquet(data_path)
    else:
        raise ValueError(f"Input path file format is not supported: {data_path}")

    return dataframe


class Preprocess(BasePreProcessor):
    """
    Preprocess the dataset.
    There is very little to preprocess in this example, but we keep this class
    for illustration purposes.
    """

    def __init__(self, config):
        super(Preprocess, self).__init__(config=config)
        self.feature_cols = self.config["features"]
        self.target_cols = self.config["targets"]

    @attributes(order=1)
    def _drop_unused_columns(self, dataframe):
        # keep only the feature and target columns
        self.dataframe = dataframe[self.feature_cols + self.target_cols]
        return self.dataframe


@click.command()
@click.option(
    "-c",
    "--config",
    type=str,
    default=os.getenv("CONFIG_FILE"),
    help="Path to config file",
)
def preprocess(config):
    base_folder = os.getenv("BASE_FOLDER")

    _CONFIG = Config(config, base_folder=base_folder)

    preprocessed_data_config = _CONFIG[["preprocessing", "dataset", "preprocessed"]]
    transformed_trip_data_config = _CONFIG[["etl", "transformed", "trip_data"]]

    # create folders
    prepare_folders(preprocessed_data_config["local"], base_folder=base_folder)
    prepare_folders(transformed_trip_data_config["local"], base_folder=base_folder)

    # load transformed data
    df = load_data(transformed_trip_data_config["name_absolute"])

    # preprocess
    pr = Preprocess(config=_CONFIG[["preprocessing"]])
    df = pr.run(df)

    # save
    df.to_parquet(preprocessed_data_config["name_absolute"], index=False)
    logger.info(f'Saved preprocessed data to {preprocessed_data_config["name_absolute"]}')

    return df


if __name__ == "__main__":
    preprocess()
```
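
To add more steps, define further decorated methods with higher `order` values. A sketch, assuming `run` chains each step's return value into the next; the hour filter itself is made up for illustration:

```python
class PreprocessWithMoreSteps(Preprocess):
    @attributes(order=2)
    def _keep_valid_hours(self, dataframe):
        # Runs after _drop_unused_columns (order=1); assumed chaining behavior
        self.dataframe = dataframe[dataframe["hour"].between(0, 23)]
        return self.dataframe
```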