Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Introduced Random Forest Regressor with Regularization for Improved Accuracy #173

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
196 changes: 196 additions & 0 deletions models/house_price/ImprovedModel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import warnings
import pickle
from .ModelEvaluation import ModelEvaluation
import os
import logging
import streamlit as st
import numpy as np
# NOTE(review): this suppresses every warning category for the whole process
# that imports this module, not only warnings raised by the code below.
warnings.filterwarnings("ignore")

# Define the directory for logs
# The path is relative, so it is resolved against the current working directory.
log_directory = 'models/house_price/logs'
os.makedirs(log_directory, exist_ok=True) # Create the directory if it doesn't exist

# Set up logging
# Records at INFO level and above are written to `log_file`, each prefixed
# with a timestamp and the level name.
log_file = os.path.join(log_directory, 'model_training.log')
logging.basicConfig(
    filename=log_file,
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# Load the housing dataset at import time (relative path, resolved against
# the current working directory).
df = pd.read_csv("models/house_price/data/housing.csv")
# Deep copy of the raw data; `df` itself is reassigned by the cleaning and
# encoding steps further down.
original_df = df.copy(deep=True)

# Target and Feature Identification
# `target` is the column to predict; every other column of `df` is a feature.
target = "price"
features = [col for col in df.columns if col != target]

# Separates numerical and categorical features based on unique values
# Heuristic: a feature with more than 16 distinct values is treated as
# numerical, anything with 16 or fewer as categorical. The column dtype is
# not consulted.
nu = df[features].nunique()
numerical_features = [col for col in features if nu[col] > 16]
categorical_features = [col for col in features if nu[col] <= 16]

# Removing outliers using IQR
def remove_outliers(df, numerical_features):
    """Drop rows lying outside the 1.5 * IQR fences of each numerical feature.

    The features are processed one after another, so the quartiles of a
    feature are computed on the rows that survived the previous features.

    Args:
        df: DataFrame to filter. It is not modified; a filtered frame is
            returned instead.
        numerical_features: Names of the columns to apply the IQR rule to.

    Returns:
        The filtered DataFrame with a fresh 0..n-1 index.
    """
    for column in numerical_features:
        lower_quartile = df[column].quantile(0.25)
        upper_quartile = df[column].quantile(0.75)
        spread = upper_quartile - lower_quartile
        lower_fence = lower_quartile - 1.5 * spread
        upper_fence = upper_quartile + 1.5 * spread
        # `between` is inclusive on both ends, i.e. fence values are kept.
        inside_fences = df[column].between(lower_fence, upper_fence)
        df = df[inside_fences]
    return df.reset_index(drop=True)


# Handling missing values
def handle_missing_values(df):
    """Summarise the missing values of every column.

    Despite its name this function does not alter `df`; it only reports.

    Args:
        df: DataFrame to inspect.

    Returns:
        A DataFrame indexed by column name with the columns
        "Total Null Values" (count of nulls) and "Percentage" (share of rows
        that are null, in percent), sorted by "Percentage" in descending order.
    """
    missing_counts = df.isnull().sum()
    row_count = df.shape[0]
    summary = pd.DataFrame(
        {
            "Total Null Values": missing_counts,
            "Percentage": (missing_counts / row_count) * 100,
        }
    )
    return summary.sort_values(by="Percentage", ascending=False)


# Removes outliers from numerical features
# `df` is rebound to the filtered frame; `original_df` still holds the raw data.
df = remove_outliers(df, numerical_features)

# Filters categorical features without missing values
# The null summary is computed on the outlier-filtered frame. A categorical
# feature is kept only if its null percentage is exactly 0.
null_value_summary = handle_missing_values(df)
valid_categorical_features = [
    col
    for col in categorical_features
    if col not in null_value_summary[null_value_summary["Percentage"] != 0].index
]

# Encoding categorical features
def encode_categorical_features(df, categorical_features):
    """Encode categorical columns as indicator (dummy) columns.

    * A feature with exactly 2 distinct values is replaced, in its original
      position and under its original name, by a single indicator column
      (the first level is dropped).
    * A feature with 3 to 16 distinct values is removed and its indicator
      columns, named ``<feature>_<level>`` with the first level dropped, are
      appended at the end of the frame.
    * Any other feature is left untouched.

    Args:
        df: Source DataFrame. It is never modified; encoding happens on a copy.
        categorical_features: Names of the columns to encode.

    Returns:
        A new DataFrame with the encoded columns.
    """
    # Work on a copy so the caller's DataFrame is not mutated by the
    # in-place column assignment of the binary branch.
    encoded = df.copy()
    for feature in categorical_features:
        distinct_count = encoded[feature].nunique()
        if distinct_count == 2:
            # Binary encoding for features with 2 unique values.
            indicator = pd.get_dummies(
                encoded[feature], drop_first=True, prefix=feature
            )
            # Two levels minus the dropped first one leaves exactly one
            # column; assign it as a Series rather than a one-column frame.
            encoded[feature] = indicator.iloc[:, 0]
        elif 2 < distinct_count <= 16:
            # Dummy encoding for features with more than 2 unique values.
            encoded = pd.concat(
                [
                    encoded.drop([feature], axis=1),
                    pd.get_dummies(encoded[feature], drop_first=True, prefix=feature),
                ],
                axis=1,
            )
    return encoded

# Encode only the categorical features that have no missing values.
df = encode_categorical_features(df, valid_categorical_features)

# Renames columns to avoid invalid characters
# Hyphens and spaces in column names are replaced with underscores.
df.columns = [col.replace("-", "_").replace(" ", "_") for col in df.columns]

# Splitting the data into training & testing sets
# 80/20 split with a fixed seed so the partition is reproducible.
X = df.drop([target], axis=1)
Y = df[target]
Train_X, Test_X, Train_Y, Test_Y = train_test_split(
    X, Y, train_size=0.8, test_size=0.2, random_state=100
)

# Feature Scaling (Standardization)
# The scaler is fitted on the training split only and then applied to the
# test split.
# NOTE(review): the scaled frames are rebuilt from arrays and therefore get a
# fresh 0..n-1 index, while Train_Y / Test_Y keep the index produced by the
# split — confirm nothing downstream aligns them by index label.
std = StandardScaler()
Train_X_std = pd.DataFrame(std.fit_transform(Train_X), columns=X.columns)
Test_X_std = pd.DataFrame(std.transform(Test_X), columns=X.columns)

#Random Forest Algorithm
# Trained at import time; depth and split-size limits constrain the trees.
rf_model = RandomForestRegressor(random_state=42, n_estimators=200, max_depth=8, min_samples_split=12)
rf_model.fit(Train_X_std, Train_Y)


# Predictions on both splits; only referenced by the commented-out RMSE code below.
pred_train = rf_model.predict(Train_X_std)
pred_test = rf_model.predict(Test_X_std)

# Calculate RMSE for train and test sets
# NOTE(review): disabled; `mean_squared_error` is not imported in this file,
# so these lines would need that import before being re-enabled.
# train_rmse = np.sqrt(mean_squared_error(Train_Y, pred_train))
# test_rmse = np.sqrt(mean_squared_error(Test_Y, pred_test))


def prepare_input_data(
    area,
    mainroad,
    guestroom,
    basement,
    hotwaterheating,
    airconditioning,
    prefarea,
    additional_bedrooms,
    bathrooms,
    stories,
    parking,
    furnishingstatus,
):
    """Build the single-row feature frame for one house.

    Args:
        area: Value stored as-is in the "area" column.
        mainroad, guestroom, basement, hotwaterheating, airconditioning,
        prefarea: Yes/no flags. The string "Yes" or the boolean ``True``
            enables the flag; every other value (e.g. "No", ``False``)
            disables it.
        additional_bedrooms: Compared against 2..6 to fill ``bedrooms_<n>``.
        bathrooms: Compared against 2..4 to fill ``bathrooms_<n>``.
        stories: Compared against 2..4 to fill ``stories_<n>``.
        parking: Compared against 1..3 to fill ``parking_<n>``.
        furnishingstatus: "semi_furnished" or "unfurnished" sets the matching
            indicator; any other value leaves both indicators False.

    Returns:
        A one-row DataFrame with 23 columns: "area", the six flags, then the
        bedrooms, bathrooms, stories, parking and furnishing indicators.
    """
    yes_no_answers = {
        "mainroad": mainroad,
        "guestroom": guestroom,
        "basement": basement,
        "hotwaterheating": hotwaterheating,
        "airconditioning": airconditioning,
        "prefarea": prefarea,
    }

    # "area" is wrapped in a list so pandas builds a one-row frame and
    # broadcasts the scalar indicators below onto that row.
    input_data = {"area": [area]}

    # Accept boolean True as well as "Yes": get_predicted defaults these flags
    # to booleans, and True == "Yes" would otherwise silently evaluate to False.
    for name, answer in yes_no_answers.items():
        input_data[name] = bool(answer is True or answer == "Yes")

    # Indicator columns; the lowest level of each feature has no column of
    # its own and is represented by all of its indicators being False.
    input_data.update(
        {
            "bedrooms_2": additional_bedrooms == 2,
            "bedrooms_3": additional_bedrooms == 3,
            "bedrooms_4": additional_bedrooms == 4,
            "bedrooms_5": additional_bedrooms == 5,
            "bedrooms_6": additional_bedrooms == 6,
            "bathrooms_2": bathrooms == 2,
            "bathrooms_3": bathrooms == 3,
            "bathrooms_4": bathrooms == 4,
            "stories_2": stories == 2,
            "stories_3": stories == 3,
            "stories_4": stories == 4,
            "parking_1": parking == 1,
            "parking_2": parking == 2,
            "parking_3": parking == 3,
            "furnishingstatus_semi_furnished": furnishingstatus == "semi_furnished",
            "furnishingstatus_unfurnished": furnishingstatus == "unfurnished",
        }
    )

    return pd.DataFrame(input_data)

# Note: This function is not being removed because of the warning in the predict.py file.


### Final Endpoint ###
def get_predicted(area=0, mainroad=False, guestroom=False, basement=False, hotwaterheating=False,
                  airconditioning=False, prefarea=False, bedrooms=0, bathrooms=2, stories=1, parking=1,
                  furnishingstatus="semi_furnished"):
    """Predict the price of one house with the module-level model.

    All arguments are forwarded unchanged to ``prepare_input_data`` (with
    ``bedrooms`` passed as its ``additional_bedrooms`` parameter), which
    decides how each value is turned into feature columns.

    Returns:
        The model's prediction for the single input row, rounded to two
        decimal places.
    """
    feature_row = prepare_input_data(
        area=area,
        mainroad=mainroad,
        guestroom=guestroom,
        basement=basement,
        hotwaterheating=hotwaterheating,
        airconditioning=airconditioning,
        prefarea=prefarea,
        additional_bedrooms=bedrooms,
        bathrooms=bathrooms,
        stories=stories,
        parking=parking,
        furnishingstatus=furnishingstatus,
    )

    # Apply the scaler fitted on the training split, keeping the column names.
    scaled_values = std.transform(feature_row)
    scaled_row = pd.DataFrame(scaled_values, columns=feature_row.columns)

    price = rf_model.predict(scaled_row)[0]
    return round(price, 2)

def save_model(path=None):
    """Pickle the trained random forest model to disk.

    Args:
        path: Destination file. When omitted, the model is written to
            ``saved_models/model_02.pkl`` inside this module's own directory
            (i.e. ``models/house_price/saved_models/model_02.pkl``, the file
            predict.py loads), independent of the current working directory.
            An existing file is overwritten.
    """
    # todo: Ask the user for the model name, and warn that the model will be overwritten

    if path is None:
        # The previous "./saved_models/..." default was resolved against the
        # working directory, which disagrees with every other path used by
        # this module and with the location predict.py reads from.
        module_dir = os.path.dirname(os.path.abspath(__file__))
        path = os.path.join(module_dir, "saved_models", "model_02.pkl")

    # Make sure the target directory exists before opening the file.
    os.makedirs(os.path.dirname(os.path.abspath(path)), exist_ok=True)
    with open(path, "wb") as file:
        pickle.dump(rf_model, file)


def save_scaler(path=None):
    """Pickle the fitted StandardScaler to disk.

    Args:
        path: Destination file. When omitted, the scaler is written to
            ``saved_models/scaler_02.pkl`` inside this module's own directory
            (i.e. ``models/house_price/saved_models/scaler_02.pkl``, the file
            predict.py loads), independent of the current working directory.
            An existing file is overwritten.
    """
    if path is None:
        # The previous "./saved_models/..." default was resolved against the
        # working directory, which disagrees with every other path used by
        # this module and with the location predict.py reads from.
        module_dir = os.path.dirname(os.path.abspath(__file__))
        path = os.path.join(module_dir, "saved_models", "scaler_02.pkl")

    # Make sure the target directory exists before opening the file.
    os.makedirs(os.path.dirname(os.path.abspath(path)), exist_ok=True)
    with open(path, "wb") as file:
        pickle.dump(std, file)


def get_evaluator():
    """Return a ModelEvaluation built from the trained model and its data.

    The evaluator receives the module-level random forest together with the
    standardized training/test features and their matching targets.
    """
    return ModelEvaluation(rf_model, Train_X_std, Train_Y, Test_X_std, Test_Y)

# When run as a script (rather than imported), persist the trained model and
# the fitted scaler to disk.
if __name__ == "__main__":
    save_model()
    save_scaler()
    # model_evaluation()
7 changes: 4 additions & 3 deletions models/house_price/predict.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import pickle
import pandas as pd
from models.house_price.model import get_evaluator
# from models.house_price.model import get_evaluator
from models.house_price.ImprovedModel import get_evaluator

"""
Predict.py file:
Expand Down Expand Up @@ -102,8 +103,8 @@ def get_prediction(
)

# Load the model and the scaler
model = load_model("models/house_price/saved_models/model_01.pkl")
scaler = load_model("models/house_price/saved_models/scaler_01.pkl")
model = load_model("models/house_price/saved_models/model_02.pkl")
scaler = load_model("models/house_price/saved_models/scaler_02.pkl")

# Scale the input data
input_scaled = scaler.transform(input_df)
Expand Down
Binary file added models/house_price/saved_models/model_02.pkl
Binary file not shown.
Binary file added models/house_price/saved_models/scaler_02.pkl
Binary file not shown.
2 changes: 1 addition & 1 deletion page_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def render_model_details(self, model_module,tab):
if model_details_function:
metrics, prediction_plot, error_plot, performance_plot = model_details_function().evaluate()

st.subheader(f"Model Accuracy: {metrics['Test_R2']:.2%}")
st.subheader(f"Model Accuracy: {metrics['Test_R2']:.2f}")

#mentioning the title of the scores
st.subheader(f"Scores: Training: {metrics['Train_R2']:.2f}, Testing: {metrics['Test_R2']:.2f}")
Expand Down
Loading