import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels.api as sm
import matplotlib.pyplot as plt
df = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
df.head()
Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | ... | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 60 | RL | 65.0 | 8450 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2008 | WD | Normal | 208500 |
1 | 2 | 20 | RL | 80.0 | 9600 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 5 | 2007 | WD | Normal | 181500 |
2 | 3 | 60 | RL | 68.0 | 11250 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 9 | 2008 | WD | Normal | 223500 |
3 | 4 | 70 | RL | 60.0 | 9550 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2006 | WD | Abnorml | 140000 |
4 | 5 | 60 | RL | 84.0 | 14260 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 12 | 2008 | WD | Normal | 250000 |
5 rows × 81 columns
df.shape
(1460, 81)
# Check null value
df_na = df.isna().sum()
df_na[df_na > 0]
LotFrontage 259 Alley 1369 MasVnrType 8 MasVnrArea 8 BsmtQual 37 BsmtCond 37 BsmtExposure 38 BsmtFinType1 37 BsmtFinType2 38 Electrical 1 FireplaceQu 690 GarageType 81 GarageYrBlt 81 GarageFinish 81 GarageQual 81 GarageCond 81 PoolQC 1453 Fence 1179 MiscFeature 1406 dtype: int64
# ExterQual
mapping = {'Ex': 5,'Gd': 4, 'TA': 3,'Fa': 2,'Po': 1}
df['ExterQual'] = df['ExterQual'].map(mapping)
df['ExterCond'] = df['ExterCond'].map(mapping)
df['HeatingQC'] = df['HeatingQC'].map(mapping)
df['KitchenQual'] = df['KitchenQual'].map(mapping)
df['BsmtQual'] = df['BsmtQual'].map(mapping)
df['BsmtCond'] = df['BsmtCond'].map(mapping)
df['FireplaceQu'] = df['FireplaceQu'].map(mapping)
df['PoolQC'] = df['PoolQC'].map(mapping)
mapping_2 = {'Gd': 3,'Av': 2, 'Mn': 1}
df['BsmtExposure'] = df['BsmtExposure'].map(mapping_2)
mapping_3 = {'GLQ': 6,'ALQ': 5, 'BLQ': 4,'Rec': 3,'LwQ': 2, 'Unf': 1}
df['BsmtFinType1'] = df['BsmtFinType1'].map(mapping_3)
df['BsmtFinType2'] = df['BsmtFinType2'].map(mapping_3)
mapping_4 = {'Ex': 5,'Gd': 4, 'TA': 3,'Fa': 2,'Po': 1}
df['GarageQual'] = df['GarageQual'].map(mapping_4)
df['GarageCond'] = df['GarageCond'].map(mapping_4)
mapping_5 = {'N': 0, 'Y': 1}
df['CentralAir'] = df['CentralAir'].map(mapping_5)
mapping_6 = {'Reg': 4, 'IR1': 3, 'IR2': 2, 'IR3': 1}
df['LotShape'] = df['LotShape'].map(mapping_6)
mapping_7 = {'Gtl': 3, 'Mod': 2, 'Sev': 1}
df['LandSlope'] = df['LandSlope'].map(mapping_7)
df.shape
(1460, 81)
sns.boxplot(x=df['LotFrontage'])
plt.title('LotFrontage (with outliers)')
plt.show()
df['LotFrontage'].fillna(df['LotFrontage'].mean(), inplace =True)
df.Alley.describe
<bound method NDFrame.describe of 0 NaN 1 NaN 2 NaN 3 NaN 4 NaN ... 1455 NaN 1456 NaN 1457 NaN 1458 NaN 1459 NaN Name: Alley, Length: 1460, dtype: object>
df['Alley'].fillna('NA', inplace =True)
df.MasVnrType
0 BrkFace 1 None 2 BrkFace 3 None 4 BrkFace ... 1455 None 1456 Stone 1457 None 1458 None 1459 None Name: MasVnrType, Length: 1460, dtype: object
df['MasVnrType'].fillna('None', inplace =True)
df['MasVnrArea'].fillna(0, inplace =True)
df['BsmtQual'].fillna(df['BsmtQual'].mean(), inplace =True)
df['BsmtCond'].fillna(df['BsmtCond'].mean(), inplace =True)
df['BsmtExposure'].fillna(df['BsmtExposure'].mean(), inplace =True)
df['BsmtFinType1'].fillna(df['BsmtFinType1'].mean(), inplace =True)
df['BsmtFinType2'].fillna(df['BsmtFinType2'].mean(), inplace =True)
sns.displot(df['Electrical'])
<seaborn.axisgrid.FacetGrid at 0x127f27890>
df['Electrical'].fillna('None', inplace =True)
df.FireplaceQu.head()
0 NaN 1 3.0 2 3.0 3 4.0 4 3.0 Name: FireplaceQu, dtype: float64
df['FireplaceQu'].fillna(df['FireplaceQu'].mean(), inplace =True)
df['GarageType'].fillna('None', inplace =True)
df['GarageYrBlt'].fillna(df['GarageYrBlt'].mean(), inplace =True)
df['GarageFinish'].fillna('None', inplace =True)
df['GarageQual'].fillna('None', inplace =True)
df['GarageCond'].fillna('None', inplace =True)
df['PoolQC'].fillna(df['PoolQC'].mean(), inplace =True)
df['Fence'].fillna('None', inplace =True)
df['MiscFeature'].fillna('None', inplace =True)
df_na = df.isna().sum()
df_na[df_na > 0]
Series([], dtype: int64)
df_clean = df
df_clean['GarageYrBlt'].dtype
df_clean['GarageYrBlt'] = 2023 - df_clean['GarageYrBlt']
df_clean['YearBuilt'] = 2023 - df_clean['YearBuilt']
df_clean['YearRemodAdd'] = 2023 - df_clean['YearRemodAdd']
df_clean['YrSold'] = 2023 - df_clean['YrSold']
df_clean['YearBuilt'].head()
0 20 1 47 2 22 3 108 4 23 Name: YearBuilt, dtype: int64
df_clean.rename(columns={'GarageYrBlt': 'GarageAge'}, inplace=True)
df_clean.rename(columns={'YearBuilt': 'YearBuiltAge'}, inplace=True)
df_clean.rename(columns={'YearRemodAdd': 'YearRemodAddAge'}, inplace=True)
df_clean.rename(columns={'YrSold': 'YrSoldAge'}, inplace=True)
#df_clean['Log_GarageAge'] = np.log(df_clean['GarageAge'])
#df_clean['Log_YearBuiltAge'] = np.log(df_clean['YearBuiltAge'])
#df_clean['Log_YearRemodAddAge'] = np.log(df_clean['YearRemodAddAge'])
#df_clean['Log_YrSoldAge'] = np.log(df_clean['YrSoldAge'])
#df_clean.drop(columns=['GarageAge', 'YearBuiltAge', 'YearRemodAddAge', 'YrSoldAge'])
df_clean['SalePrice'] = np.log(df_clean['SalePrice'])
df_clean.shape
(1460, 81)
numerical_df_clean = df_clean.select_dtypes(include='number')
numerical_df_clean.shape
(1460, 52)
numerical_df_clean = numerical_df_clean.drop(columns=['Id', 'SalePrice'], axis=1)
numerical_df_clean.shape
(1460, 50)
numerical_df_clean.head()
MSSubClass | LotFrontage | LotArea | LotShape | LandSlope | OverallQual | OverallCond | YearBuiltAge | YearRemodAddAge | MasVnrArea | ... | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | PoolQC | MiscVal | MoSold | YrSoldAge | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 60 | 65.0 | 8450 | 4 | 3 | 7 | 5 | 20 | 20 | 196.0 | ... | 0 | 61 | 0 | 0 | 0 | 0 | 3.714286 | 0 | 2 | 15 |
1 | 20 | 80.0 | 9600 | 4 | 3 | 6 | 8 | 47 | 47 | 0.0 | ... | 298 | 0 | 0 | 0 | 0 | 0 | 3.714286 | 0 | 5 | 16 |
2 | 60 | 68.0 | 11250 | 3 | 3 | 7 | 5 | 22 | 21 | 162.0 | ... | 0 | 42 | 0 | 0 | 0 | 0 | 3.714286 | 0 | 9 | 15 |
3 | 70 | 60.0 | 9550 | 3 | 3 | 7 | 5 | 108 | 53 | 0.0 | ... | 0 | 35 | 272 | 0 | 0 | 0 | 3.714286 | 0 | 2 | 17 |
4 | 60 | 84.0 | 14260 | 3 | 3 | 8 | 5 | 23 | 23 | 350.0 | ... | 192 | 84 | 0 | 0 | 0 | 0 | 3.714286 | 0 | 12 | 15 |
5 rows × 50 columns
# Scale the data
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
scaled_numerical_df_clean = scale.fit_transform(numerical_df_clean)
scaled_numerical_df_clean
array([[ 0.07337496, -0.22937175, -0.20714171, ..., -0.08768781, -1.5991111 , -0.13877749], [-0.87256276, 0.4519361 , -0.09188637, ..., -0.08768781, -0.48911005, 0.61443862], [ 0.07337496, -0.09311018, 0.07347998, ..., -0.08768781, 0.99089135, -0.13877749], ..., [ 0.30985939, -0.18395123, -0.14781027, ..., 4.95311151, -0.48911005, -1.64520971], [-0.87256276, -0.09311018, -0.08016039, ..., -0.08768781, -0.8591104 , -1.64520971], [-0.87256276, 0.22483348, -0.05811155, ..., -0.08768781, -0.1191097 , -0.13877749]])
from sklearn.decomposition import PCA
# Make an instance of the Model
pca = PCA(.95)
pca.fit(scaled_numerical_df_clean)
# Find out how many components PCA has after fitting the model
pca.n_components_
37
# Transform the data into the principal components
principal_components = pca.transform(scaled_numerical_df_clean)
principal_components.shape
(1460, 37)
principal_components
array([[ 2.18984101, -0.16763339, -1.47112319, ..., 0.10860845, 0.07967898, 0.18651098], [ 0.04584576, -1.54323387, 1.0932354 , ..., -0.52074135, 0.980782 , 0.05223869], [ 2.32696368, 0.04969061, -1.34140998, ..., 0.04947632, -0.38301589, -0.09240722], ..., [ 1.40977495, 2.26467125, 0.85626749, ..., 1.17748467, -0.66432792, -0.89253948], [-2.71136048, -2.80817005, 1.70334363, ..., 0.44461475, 0.1831076 , -0.55143658], [-0.94569008, -1.84532397, 1.62666513, ..., 1.09212882, 0.45154769, -0.20866037]])
# Create a DataFrame from the principal components
columns = [f'PC{i+1}' for i in range(principal_components.shape[1])]
principal_df = pd.DataFrame(principal_components, columns=columns)
principal_df.head()
PC1 | PC2 | PC3 | PC4 | PC5 | PC6 | PC7 | PC8 | PC9 | PC10 | ... | PC28 | PC29 | PC30 | PC31 | PC32 | PC33 | PC34 | PC35 | PC36 | PC37 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2.189841 | -0.167633 | -1.471123 | -2.262940 | 1.060092 | -0.260601 | -0.626813 | 0.068582 | -0.784960 | -1.201084 | ... | -0.129178 | -0.060662 | 0.127597 | 0.328337 | -0.118319 | 0.273451 | -0.573404 | 0.108608 | 0.079679 | 0.186511 |
1 | 0.045846 | -1.543234 | 1.093235 | -0.001000 | -0.789579 | -1.115314 | 1.085005 | -0.212486 | 2.294729 | 0.171581 | ... | 1.353092 | 0.475002 | -0.393422 | -0.711873 | -0.376811 | 0.245289 | 0.599826 | -0.520741 | 0.980782 | 0.052239 |
2 | 2.326964 | 0.049691 | -1.341410 | -1.890272 | 0.213914 | -0.145710 | 0.097192 | 0.821088 | -0.401888 | -1.050750 | ... | 0.178433 | 0.148980 | 0.628293 | 0.511766 | -0.199613 | 0.286282 | -0.168346 | 0.049476 | -0.383016 | -0.092407 |
3 | -0.131933 | 0.477460 | 0.602497 | -0.474069 | -0.403288 | -0.902429 | -0.835986 | -1.001447 | -0.538358 | 0.035704 | ... | -1.205494 | -0.579009 | -0.622643 | -0.595745 | -0.688999 | -0.031591 | -0.217376 | -0.113522 | -1.401679 | -0.329459 |
4 | 4.671188 | 1.059574 | 0.364506 | -1.838918 | 0.683339 | -0.072945 | 0.449894 | 0.733929 | 0.351705 | -0.504662 | ... | -0.255722 | 0.251739 | -0.116956 | 0.034451 | -0.711869 | 0.473764 | 0.147203 | 0.212041 | -0.772847 | -0.167668 |
5 rows × 37 columns
categorical_df_clean = df_clean.select_dtypes(exclude='number')
categorical_df_clean.shape
(1460, 29)
categorical_df_clean.head()
MSZoning | Street | Alley | LandContour | Utilities | LotConfig | Neighborhood | Condition1 | Condition2 | BldgType | ... | Functional | GarageType | GarageFinish | GarageQual | GarageCond | PavedDrive | Fence | MiscFeature | SaleType | SaleCondition | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | RL | Pave | NA | Lvl | AllPub | Inside | CollgCr | Norm | Norm | 1Fam | ... | Typ | Attchd | RFn | 3.0 | 3.0 | Y | None | None | WD | Normal |
1 | RL | Pave | NA | Lvl | AllPub | FR2 | Veenker | Feedr | Norm | 1Fam | ... | Typ | Attchd | RFn | 3.0 | 3.0 | Y | None | None | WD | Normal |
2 | RL | Pave | NA | Lvl | AllPub | Inside | CollgCr | Norm | Norm | 1Fam | ... | Typ | Attchd | RFn | 3.0 | 3.0 | Y | None | None | WD | Normal |
3 | RL | Pave | NA | Lvl | AllPub | Corner | Crawfor | Norm | Norm | 1Fam | ... | Typ | Detchd | Unf | 3.0 | 3.0 | Y | None | None | WD | Abnorml |
4 | RL | Pave | NA | Lvl | AllPub | FR2 | NoRidge | Norm | Norm | 1Fam | ... | Typ | Attchd | RFn | 3.0 | 3.0 | Y | None | None | WD | Normal |
5 rows × 29 columns
# List to conduct one hot-encode
list_to_encode = categorical_df_clean.columns.to_list()
list_to_encode
['MSZoning', 'Street', 'Alley', 'LandContour', 'Utilities', 'LotConfig', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation', 'Heating', 'Electrical', 'Functional', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']
# Turn categorical data into dummy
categorical_df = pd.get_dummies(categorical_df_clean, columns=list_to_encode)
categorical_df.shape
(1460, 201)
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(categorical_df, df_clean['SalePrice'], test_size=0.2, random_state=42)
x_train.shape
(1168, 201)
x_test.shape
(292, 201)
from sklearn.linear_model import Lasso, Ridge
lasso_model = Lasso(alpha=10)
lasso_model.fit(x_train, y_train)
# Get the selected feature names
selected_feature_names = categorical_df.columns[lasso_model.coef_ != 0]
# Print or use the selected feature names as needed
print("Selected Feature Names:", selected_feature_names)
Selected Feature Names: Index([], dtype='object')
selected_feature_names = selected_feature_names.to_list()
concat_df = pd.concat([principal_df, categorical_df[selected_feature_names]], axis=1)
concat_df.head()
PC1 | PC2 | PC3 | PC4 | PC5 | PC6 | PC7 | PC8 | PC9 | PC10 | ... | PC28 | PC29 | PC30 | PC31 | PC32 | PC33 | PC34 | PC35 | PC36 | PC37 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2.189841 | -0.167633 | -1.471123 | -2.262940 | 1.060092 | -0.260601 | -0.626813 | 0.068582 | -0.784960 | -1.201084 | ... | -0.129178 | -0.060662 | 0.127597 | 0.328337 | -0.118319 | 0.273451 | -0.573404 | 0.108608 | 0.079679 | 0.186511 |
1 | 0.045846 | -1.543234 | 1.093235 | -0.001000 | -0.789579 | -1.115314 | 1.085005 | -0.212486 | 2.294729 | 0.171581 | ... | 1.353092 | 0.475002 | -0.393422 | -0.711873 | -0.376811 | 0.245289 | 0.599826 | -0.520741 | 0.980782 | 0.052239 |
2 | 2.326964 | 0.049691 | -1.341410 | -1.890272 | 0.213914 | -0.145710 | 0.097192 | 0.821088 | -0.401888 | -1.050750 | ... | 0.178433 | 0.148980 | 0.628293 | 0.511766 | -0.199613 | 0.286282 | -0.168346 | 0.049476 | -0.383016 | -0.092407 |
3 | -0.131933 | 0.477460 | 0.602497 | -0.474069 | -0.403288 | -0.902429 | -0.835986 | -1.001447 | -0.538358 | 0.035704 | ... | -1.205494 | -0.579009 | -0.622643 | -0.595745 | -0.688999 | -0.031591 | -0.217376 | -0.113522 | -1.401679 | -0.329459 |
4 | 4.671188 | 1.059574 | 0.364506 | -1.838918 | 0.683339 | -0.072945 | 0.449894 | 0.733929 | 0.351705 | -0.504662 | ... | -0.255722 | 0.251739 | -0.116956 | 0.034451 | -0.711869 | 0.473764 | 0.147203 | 0.212041 | -0.772847 | -0.167668 |
5 rows × 37 columns
concat_df.shape
(1460, 37)
df_clean
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(concat_df, df_clean['SalePrice'], test_size=0.2, random_state=42)
# Train the model - Random Forest
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state=42)
rf.fit(x_train, y_train)
# Test the model
y_pred = rf.predict(x_test)
# Evaluate the model
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# Assuming y_pred is the predictions from your regression model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r_squared = r2_score(y_test, y_pred)
print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R-squared:", r_squared)
Mean Absolute Error: 0.10416006535399967 Mean Squared Error: 0.024199680182249703 Root Mean Squared Error (RMSE): 0.15556246392446252 R-squared: 0.8703219337513204
import xgboost as xgb
from numpy import loadtxt
from xgboost import XGBRegressor
# fit model no training data
model = XGBRegressor()
model.fit(x_train, y_train)
# Test the model
y_pred = model.predict(x_test)
# Evaluate the model
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# Assuming y_pred is the predictions from your regression model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r_squared = r2_score(y_test, y_pred)
print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R-squared:", r_squared)
Mean Absolute Error: 0.10884051148169183 Mean Squared Error: 0.02466610410093325 Root Mean Squared Error (RMSE): 0.15705446221274086 R-squared: 0.8678225225454078
# from sklearn.model_selection import GridSearchCV
# from xgboost import XGBRegressor
# # Define your XGBoost model
# xgb_model = XGBRegressor()
# # Define the hyperparameter grid
# param_grid = {
# 'learning_rate': [0.01, 0.1, 0.2, 0.3],
# 'n_estimators': [50, 100, 200, 300],
# 'max_depth': [3, 5, 7, 9],
# 'min_child_weight': [1, 3, 5, 7],
# 'subsample': [0.7, 0.8, 0.9],
# 'colsample_bytree': [0.7, 0.8, 0.9],
# 'gamma': [0, 0.1, 0.2, 0.3],
# 'scale_pos_weight': [1, 2, 3]
# }
# # Create GridSearchCV object
# grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
# # Fit the model to the data
# grid_search.fit(x_train, y_train)
# # Print the best parameters and corresponding RMSE
# print("Best Parameters: ", grid_search.best_params_)
# print("Best RMSE: ", (-grid_search.best_score_) ** 0.5)
# # Get the best model
# best_xgb_model = grid_search.best_estimator_
# fit model no training data
model = XGBRegressor(
colsample_bytree=0.8,
gamma=0,
learning_rate=0.1,
max_depth=3,
min_child_weight=7,
n_estimators=300,
scale_pos_weight=1,
subsample=0.9
)
model.fit(x_train, y_train)
# Test the model
y_pred = model.predict(x_test)
# Evaluate the model
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# Assuming y_pred is the predictions from your regression model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r_squared = r2_score(y_test, y_pred)
print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R-squared:", r_squared)
Mean Absolute Error: 0.10013308511851164 Mean Squared Error: 0.021651133900594074 Root Mean Squared Error (RMSE): 0.14714324279624286 R-squared: 0.8839787486786836
test.isna().sum()
Id 0 MSSubClass 0 MSZoning 4 LotFrontage 227 LotArea 0 ... MiscVal 0 MoSold 0 YrSold 0 SaleType 1 SaleCondition 0 Length: 80, dtype: int64
test.shape
(1459, 80)
# ExterQual
mapping = {'Ex': 5,'Gd': 4, 'TA': 3,'Fa': 2,'Po': 1}
test['ExterQual'] = test['ExterQual'].map(mapping)
test['ExterCond'] = test['ExterCond'].map(mapping)
test['HeatingQC'] = test['HeatingQC'].map(mapping)
test['KitchenQual'] = test['KitchenQual'].map(mapping)
test['BsmtQual'] = test['BsmtQual'].map(mapping)
test['BsmtCond'] = test['BsmtCond'].map(mapping)
test['FireplaceQu'] = test['FireplaceQu'].map(mapping)
test['PoolQC'] = test['PoolQC'].map(mapping)
mapping_2 = {'Gd': 3,'Av': 2, 'Mn': 1}
test['BsmtExposure'] = test['BsmtExposure'].map(mapping_2)
mapping_3 = {'GLQ': 6,'ALQ': 5, 'BLQ': 4,'Rec': 3,'LwQ': 2, 'Unf': 1}
test['BsmtFinType1'] = test['BsmtFinType1'].map(mapping_3)
test['BsmtFinType2'] = test['BsmtFinType2'].map(mapping_3)
mapping_4 = {'Ex': 5,'Gd': 4, 'TA': 3,'Fa': 2,'Po': 1}
test['GarageQual'] = test['GarageQual'].map(mapping_4)
test['GarageCond'] = test['GarageCond'].map(mapping_4)
mapping_5 = {'N': 0, 'Y': 1}
test['CentralAir'] = test['CentralAir'].map(mapping_5)
mapping_6 = {'Reg': 4, 'IR1': 3, 'IR2': 2, 'IR3': 1}
test['LotShape'] = test['LotShape'].map(mapping_6)
mapping_7 = {'Gtl': 3, 'Mod': 2, 'Sev': 1}
test['LandSlope'] = test['LandSlope'].map(mapping_7)
test.shape
(1459, 80)
test['LotFrontage'].fillna(test['LotFrontage'].mean(), inplace =True)
test['Alley'].fillna('NA', inplace =True)
test['MasVnrType'].fillna('None', inplace =True)
test['MasVnrArea'].fillna(0, inplace =True)
test['BsmtQual'].fillna(test['BsmtQual'].mean(), inplace =True)
test['BsmtCond'].fillna(test['BsmtCond'].mean(), inplace =True)
test['BsmtExposure'].fillna(test['BsmtExposure'].mean(), inplace =True)
test['BsmtFinType1'].fillna(test['BsmtFinType1'].mean(), inplace =True)
test['BsmtFinType2'].fillna(test['BsmtFinType2'].mean(), inplace =True)
test['Electrical'].fillna('None', inplace =True)
test['FireplaceQu'].fillna(test['FireplaceQu'].mean(), inplace =True)
test['GarageType'].fillna('None', inplace =True)
test['GarageYrBlt'].fillna(test['GarageYrBlt'].mean(), inplace =True)
test['GarageFinish'].fillna('None', inplace =True)
test['GarageQual'].fillna('None', inplace =True)
test['GarageCond'].fillna('None', inplace =True)
test['PoolQC'].fillna(test['PoolQC'].mean(), inplace =True)
test['Fence'].fillna('None', inplace =True)
test['MiscFeature'].fillna('None', inplace =True)
test['MSZoning'].fillna('None', inplace =True)
test['SaleType'].fillna('Oth', inplace =True)
test['Utilities'].fillna('None', inplace =True)
test['Exterior1st'].fillna('Other', inplace =True)
test['Exterior2nd'].fillna('Other', inplace =True)
test['BsmtFinSF1'].fillna(0, inplace =True)
test['BsmtFinSF2'].fillna(0, inplace =True)
test['BsmtUnfSF'].fillna(0, inplace =True)
test['TotalBsmtSF'].fillna(0, inplace =True)
test['BsmtFullBath'].fillna(0, inplace =True)
test['BsmtHalfBath'].fillna(0, inplace =True)
test['KitchenQual'].fillna(test['KitchenQual'].mean(), inplace =True)
test['Functional'].fillna('None', inplace =True)
test['GarageCars'].fillna(0, inplace =True)
test['GarageArea'].fillna(0, inplace =True)
test_na = test.isna().sum()
test_na[test_na > 0]
Series([], dtype: int64)
test_clean = test
test_clean.shape
(1459, 80)
test_clean['GarageYrBlt'].dtype
test_clean['GarageYrBlt'] = 2023 - test_clean['GarageYrBlt']
test_clean['YearBuilt'] = 2023 - test_clean['YearBuilt']
test_clean['YearRemodAdd'] = 2023 - test_clean['YearRemodAdd']
test_clean['YrSold'] = 2023 - test_clean['YrSold']
test_clean['YearBuilt'].head()
0 62 1 65 2 26 3 25 4 31 Name: YearBuilt, dtype: int64
test_clean.rename(columns={'GarageYrBlt': 'GarageAge'}, inplace=True)
test_clean.rename(columns={'YearBuilt': 'YearBuiltAge'}, inplace=True)
test_clean.rename(columns={'YearRemodAdd': 'YearRemodAddAge'}, inplace=True)
test_clean.rename(columns={'YrSold': 'YrSoldAge'}, inplace=True)
# test_clean['Log_GarageAge'] = np.log(test_clean['GarageAge'])
# test_clean['Log_YearBuiltAge'] = np.log(test_clean['YearBuiltAge'])
# test_clean['Log_YearRemodAddAge'] = np.log(test_clean['YearRemodAddAge'])
# test_clean['Log_YrSoldAge'] = np.log(test_clean['YrSoldAge'])
# test_clean.drop(columns=['GarageAge', 'YearBuiltAge', 'YearRemodAddAge', 'YrSoldAge'])
test_clean_na = test_clean.isna().sum()
test_clean_na[test_clean_na > 0]
Series([], dtype: int64)
#test_clean['Log_GarageAge'].fillna(test['Log_GarageAge'].mean(), inplace =True)
test_clean.shape
(1459, 80)
numerical_test_clean = test_clean.select_dtypes(include='number')
numerical_test_clean = numerical_test_clean.drop(columns=['Id'], axis=1)
numerical_test_clean.shape
(1459, 50)
numerical_test_clean.head()
MSSubClass | LotFrontage | LotArea | LotShape | LandSlope | OverallQual | OverallCond | YearBuiltAge | YearRemodAddAge | MasVnrArea | ... | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | PoolQC | MiscVal | MoSold | YrSoldAge | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 20 | 80.0 | 11622 | 4 | 3 | 5 | 6 | 62 | 62 | 0.0 | ... | 140 | 0 | 0 | 0 | 120 | 0 | 4.666667 | 0 | 6 | 13 |
1 | 20 | 81.0 | 14267 | 3 | 3 | 6 | 6 | 65 | 65 | 108.0 | ... | 393 | 36 | 0 | 0 | 0 | 0 | 4.666667 | 12500 | 6 | 13 |
2 | 60 | 74.0 | 13830 | 3 | 3 | 5 | 5 | 26 | 25 | 0.0 | ... | 212 | 34 | 0 | 0 | 0 | 0 | 4.666667 | 0 | 3 | 13 |
3 | 60 | 78.0 | 9978 | 3 | 3 | 6 | 6 | 25 | 25 | 20.0 | ... | 360 | 36 | 0 | 0 | 0 | 0 | 4.666667 | 0 | 6 | 13 |
4 | 120 | 43.0 | 5005 | 3 | 3 | 8 | 5 | 31 | 31 | 0.0 | ... | 0 | 82 | 0 | 0 | 144 | 0 | 4.666667 | 0 | 1 | 13 |
5 rows × 50 columns
scaled_numerical_test_clean = scale.fit_transform(numerical_test_clean)
test_principal_components = pca.transform(scaled_numerical_test_clean)
test_principal_components.shape
(1459, 37)
columns = [f'PC{i+1}' for i in range(test_principal_components.shape[1])]
test_principal_df = pd.DataFrame(test_principal_components, columns=columns)
test_principal_df.head()
PC1 | PC2 | PC3 | PC4 | PC5 | PC6 | PC7 | PC8 | PC9 | PC10 | ... | PC28 | PC29 | PC30 | PC31 | PC32 | PC33 | PC34 | PC35 | PC36 | PC37 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -2.839040 | -1.470161 | 0.664487 | 0.779536 | -0.663260 | 0.444999 | 0.169294 | 1.049017 | -0.625156 | -0.273826 | ... | -0.310042 | 0.407158 | -0.646351 | -0.558659 | 0.486072 | 0.333655 | -0.313075 | 0.456774 | 0.923647 | -0.150863 |
1 | -1.133623 | -0.612153 | 2.279086 | -0.536540 | -0.587211 | -1.377610 | 0.556120 | 2.859048 | 3.248475 | -2.617143 | ... | -0.902030 | -0.339549 | -0.888399 | 0.741832 | -0.399486 | 0.146729 | -0.379392 | 1.920819 | -0.020897 | -0.402737 |
2 | 0.759764 | -0.246478 | 0.006912 | -1.955643 | 0.838348 | 0.437214 | 1.329624 | 0.224668 | 0.030200 | -2.362899 | ... | 0.261019 | -0.570169 | -0.114005 | -0.428628 | 0.376088 | 0.006565 | 0.077427 | -0.774392 | 0.974675 | -1.026337 |
3 | 1.445369 | 0.251309 | -0.348032 | -1.589184 | -0.202136 | -0.480743 | 0.754694 | -0.509421 | -0.076047 | -1.568251 | ... | 0.013802 | -0.099050 | -0.044536 | -0.814137 | -0.280436 | -0.618989 | 0.063194 | -0.240648 | 0.348658 | -1.909528 |
4 | 1.272475 | -0.990319 | -2.566121 | 0.613926 | -0.023332 | 0.447076 | -0.555119 | 0.019079 | -1.038859 | 0.746275 | ... | -0.969416 | -0.122335 | -0.828462 | 0.188991 | -0.495817 | -0.240772 | 0.481708 | 1.247824 | 0.415142 | -0.639205 |
5 rows × 37 columns
categorical_test_clean = test_clean.select_dtypes(exclude='number')
categorical_test_clean.shape
(1459, 29)
categorical_test = pd.get_dummies(categorical_test_clean, columns=list_to_encode)
# Assuming 'categorical_test' and 'categorical_df' are your DataFrames
missing_columns = set(categorical_df.columns) - set(categorical_test.columns)
# Add missing columns to 'categorical_test' and fill with zeros
for column in missing_columns:
categorical_test[column] = 0
# Assuming 'categorical_test' and 'categorical_df' are your DataFrames
extra_columns = set(categorical_test.columns) - set(categorical_df.columns)
# Drop extra columns from 'categorical_test'
categorical_test = categorical_test.drop(columns=extra_columns, errors='ignore')
categorical_test.shape
(1459, 201)
concat_test = pd.concat([test_principal_df, categorical_test[selected_feature_names]], axis=1)
concat_test.shape
(1459, 37)
y_pred = rf.predict(concat_test)
y_pred = np.exp(y_pred)
y_pred = pd.DataFrame(y_pred)
test['SalePrice'] = y_pred
submission_4 = test[['Id','SalePrice']]
submission_4.head()
Id | SalePrice | |
---|---|---|
0 | 1461 | 123104.018362 |
1 | 1462 | 164577.118041 |
2 | 1463 | 182534.158556 |
3 | 1464 | 201009.454464 |
4 | 1465 | 182511.697001 |
submission_4.to_csv('submission_4.csv', index=False)