import math
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import seaborn as sns
pd.set_option('display.max_columns',None)
source: https://www.kaggle.com/c/house-prices-advanced-regression-techniques
DATA = '/home/strange/shob/pywork/data/house-price-advanced-regression-KAGGLE/'
nTrain = pd.read_csv(f"{DATA}train.csv")
nTest = pd.read_csv(f"{DATA}test.csv")
print(nTrain.head().to_markdown())
| | Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | LotConfig | LandSlope | Neighborhood | Condition1 | Condition2 | BldgType | HouseStyle | OverallQual | OverallCond | YearBuilt | YearRemodAdd | RoofStyle | RoofMatl | Exterior1st | Exterior2nd | MasVnrType | MasVnrArea | ExterQual | ExterCond | Foundation | BsmtQual | BsmtCond | BsmtExposure | BsmtFinType1 | BsmtFinSF1 | BsmtFinType2 | BsmtFinSF2 | BsmtUnfSF | TotalBsmtSF | Heating | HeatingQC | CentralAir | Electrical | 1stFlrSF | 2ndFlrSF | LowQualFinSF | GrLivArea | BsmtFullBath | BsmtHalfBath | FullBath | HalfBath | BedroomAbvGr | KitchenAbvGr | KitchenQual | TotRmsAbvGrd | Functional | Fireplaces | FireplaceQu | GarageType | GarageYrBlt | GarageFinish | GarageCars | GarageArea | GarageQual | GarageCond | PavedDrive | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice |
|---:|-----:|-------------:|-----------:|--------------:|----------:|---------:|--------:|-----------:|--------------:|------------:|------------:|------------:|---------------:|-------------:|-------------:|-----------:|-------------:|--------------:|--------------:|------------:|---------------:|------------:|-----------:|--------------:|--------------:|-------------:|-------------:|------------:|------------:|-------------:|-----------:|-----------:|---------------:|---------------:|-------------:|---------------:|-------------:|------------:|--------------:|----------:|------------:|-------------:|-------------:|-----------:|-----------:|---------------:|------------:|---------------:|---------------:|-----------:|-----------:|---------------:|---------------:|--------------:|---------------:|-------------:|-------------:|--------------:|-------------:|--------------:|---------------:|-------------:|-------------:|-------------:|-------------:|-------------:|-------------:|--------------:|----------------:|------------:|--------------:|-----------:|---------:|--------:|--------------:|----------:|---------:|---------:|-----------:|----------------:|------------:|
| 0 | 1 | 60 | 3 | 65 | 8450 | 1 | 0 | 3 | 3 | 0 | 4 | 0 | 5 | 2 | 2 | 0 | 5 | 7 | 5 | 2003 | 2003 | 1 | 1 | 12 | 13 | 1 | 196 | 2 | 4 | 2 | 2 | 3 | 3 | 2 | 706 | 5 | 0 | 150 | 856 | 1 | 0 | 1 | 4 | 856 | 854 | 0 | 1710 | 1 | 0 | 2 | 1 | 3 | 1 | 2 | 8 | 6 | 0 | 2 | 1 | 2003 | 1 | 2 | 548 | 4 | 4 | 2 | 0 | 61 | 0 | 0 | 0 | 0 | 2 | 2 | 2 | 0 | 2 | 2008 | 8 | 4 | 208500 |
| 1 | 2 | 20 | 3 | 80 | 9600 | 1 | 0 | 3 | 3 | 0 | 2 | 0 | 24 | 1 | 2 | 0 | 2 | 6 | 8 | 1976 | 1976 | 1 | 1 | 8 | 8 | 2 | 0 | 3 | 4 | 1 | 2 | 3 | 1 | 0 | 978 | 5 | 0 | 284 | 1262 | 1 | 0 | 1 | 4 | 1262 | 0 | 0 | 1262 | 0 | 1 | 2 | 0 | 3 | 1 | 3 | 6 | 6 | 1 | 4 | 1 | 1976 | 1 | 2 | 460 | 4 | 4 | 2 | 298 | 0 | 0 | 0 | 0 | 0 | 2 | 2 | 2 | 0 | 5 | 2007 | 8 | 4 | 181500 |
| 2 | 3 | 60 | 3 | 68 | 11250 | 1 | 0 | 0 | 3 | 0 | 4 | 0 | 5 | 2 | 2 | 0 | 5 | 7 | 5 | 2001 | 2002 | 1 | 1 | 12 | 13 | 1 | 162 | 2 | 4 | 2 | 2 | 3 | 2 | 2 | 486 | 5 | 0 | 434 | 920 | 1 | 0 | 1 | 4 | 920 | 866 | 0 | 1786 | 1 | 0 | 2 | 1 | 3 | 1 | 2 | 6 | 6 | 1 | 4 | 1 | 2001 | 1 | 2 | 608 | 4 | 4 | 2 | 0 | 42 | 0 | 0 | 0 | 0 | 2 | 2 | 2 | 0 | 9 | 2008 | 8 | 4 | 223500 |
| 3 | 4 | 70 | 3 | 60 | 9550 | 1 | 0 | 0 | 3 | 0 | 0 | 0 | 6 | 2 | 2 | 0 | 5 | 7 | 5 | 1915 | 1970 | 1 | 1 | 13 | 15 | 2 | 0 | 3 | 4 | 0 | 3 | 1 | 3 | 0 | 216 | 5 | 0 | 540 | 756 | 1 | 2 | 1 | 4 | 961 | 756 | 0 | 1717 | 1 | 0 | 1 | 0 | 3 | 1 | 2 | 7 | 6 | 1 | 2 | 5 | 1998 | 2 | 3 | 642 | 4 | 4 | 2 | 0 | 35 | 272 | 0 | 0 | 0 | 2 | 2 | 2 | 0 | 2 | 2006 | 8 | 0 | 140000 |
| 4 | 5 | 60 | 3 | 84 | 14260 | 1 | 0 | 0 | 3 | 0 | 2 | 0 | 15 | 2 | 2 | 0 | 5 | 8 | 5 | 2000 | 2000 | 1 | 1 | 12 | 13 | 1 | 350 | 2 | 4 | 2 | 2 | 3 | 0 | 2 | 655 | 5 | 0 | 490 | 1145 | 1 | 0 | 1 | 4 | 1145 | 1053 | 0 | 2198 | 1 | 0 | 2 | 1 | 4 | 1 | 2 | 9 | 6 | 1 | 4 | 1 | 2000 | 1 | 3 | 836 | 4 | 4 | 2 | 192 | 84 | 0 | 0 | 0 | 0 | 2 | 2 | 2 | 0 | 12 | 2008 | 8 | 4 | 250000 |
Impute null values. If nominal, replace with most common. If continuous, replace with mean.
def imputer(df):
for col in df.columns:
if df[col].dtype=='O': # categorical
imp_most = SimpleImputer(strategy='most_frequent')
temp = df[col].values.reshape(-1,1)
imp_most.fit(temp)
X = df[col].values.reshape(-1,1)
X = imp_most.transform(X)
df[col] = X
else:
imp_mean = SimpleImputer(strategy='mean')
temp = df[col].values.reshape(-1,1)
imp_mean.fit(temp)
X = df[col].values.reshape(-1,1)
X = imp_mean.transform(X)
df[col] = X
return df
nTrain = imputer(nTrain)
print(nTrain.head())
| | Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | LotConfig | LandSlope | Neighborhood | Condition1 | Condition2 | BldgType | HouseStyle | OverallQual | OverallCond | YearBuilt | YearRemodAdd | RoofStyle | RoofMatl | Exterior1st | Exterior2nd | MasVnrType | MasVnrArea | ExterQual | ExterCond | Foundation | BsmtQual | BsmtCond | BsmtExposure | BsmtFinType1 | BsmtFinSF1 | BsmtFinType2 | BsmtFinSF2 | BsmtUnfSF | TotalBsmtSF | Heating | HeatingQC | CentralAir | Electrical | 1stFlrSF | 2ndFlrSF | LowQualFinSF | GrLivArea | BsmtFullBath | BsmtHalfBath | FullBath | HalfBath | BedroomAbvGr | KitchenAbvGr | KitchenQual | TotRmsAbvGrd | Functional | Fireplaces | FireplaceQu | GarageType | GarageYrBlt | GarageFinish | GarageCars | GarageArea | GarageQual | GarageCond | PavedDrive | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice |
|---:|-----:|-------------:|-----------:|--------------:|----------:|---------:|--------:|-----------:|--------------:|------------:|------------:|------------:|---------------:|-------------:|-------------:|-----------:|-------------:|--------------:|--------------:|------------:|---------------:|------------:|-----------:|--------------:|--------------:|-------------:|-------------:|------------:|------------:|-------------:|-----------:|-----------:|---------------:|---------------:|-------------:|---------------:|-------------:|------------:|--------------:|----------:|------------:|-------------:|-------------:|-----------:|-----------:|---------------:|------------:|---------------:|---------------:|-----------:|-----------:|---------------:|---------------:|--------------:|---------------:|-------------:|-------------:|--------------:|-------------:|--------------:|---------------:|-------------:|-------------:|-------------:|-------------:|-------------:|-------------:|--------------:|----------------:|------------:|--------------:|-----------:|---------:|--------:|--------------:|----------:|---------:|---------:|-----------:|----------------:|------------:|
| 0 | 1 | 60 | 3 | 65 | 8450 | 1 | 0 | 3 | 3 | 0 | 4 | 0 | 5 | 2 | 2 | 0 | 5 | 7 | 5 | 2003 | 2003 | 1 | 1 | 12 | 13 | 1 | 196 | 2 | 4 | 2 | 2 | 3 | 3 | 2 | 706 | 5 | 0 | 150 | 856 | 1 | 0 | 1 | 4 | 856 | 854 | 0 | 1710 | 1 | 0 | 2 | 1 | 3 | 1 | 2 | 8 | 6 | 0 | 2 | 1 | 2003 | 1 | 2 | 548 | 4 | 4 | 2 | 0 | 61 | 0 | 0 | 0 | 0 | 2 | 2 | 2 | 0 | 2 | 2008 | 8 | 4 | 208500 |
| 1 | 2 | 20 | 3 | 80 | 9600 | 1 | 0 | 3 | 3 | 0 | 2 | 0 | 24 | 1 | 2 | 0 | 2 | 6 | 8 | 1976 | 1976 | 1 | 1 | 8 | 8 | 2 | 0 | 3 | 4 | 1 | 2 | 3 | 1 | 0 | 978 | 5 | 0 | 284 | 1262 | 1 | 0 | 1 | 4 | 1262 | 0 | 0 | 1262 | 0 | 1 | 2 | 0 | 3 | 1 | 3 | 6 | 6 | 1 | 4 | 1 | 1976 | 1 | 2 | 460 | 4 | 4 | 2 | 298 | 0 | 0 | 0 | 0 | 0 | 2 | 2 | 2 | 0 | 5 | 2007 | 8 | 4 | 181500 |
| 2 | 3 | 60 | 3 | 68 | 11250 | 1 | 0 | 0 | 3 | 0 | 4 | 0 | 5 | 2 | 2 | 0 | 5 | 7 | 5 | 2001 | 2002 | 1 | 1 | 12 | 13 | 1 | 162 | 2 | 4 | 2 | 2 | 3 | 2 | 2 | 486 | 5 | 0 | 434 | 920 | 1 | 0 | 1 | 4 | 920 | 866 | 0 | 1786 | 1 | 0 | 2 | 1 | 3 | 1 | 2 | 6 | 6 | 1 | 4 | 1 | 2001 | 1 | 2 | 608 | 4 | 4 | 2 | 0 | 42 | 0 | 0 | 0 | 0 | 2 | 2 | 2 | 0 | 9 | 2008 | 8 | 4 | 223500 |
| 3 | 4 | 70 | 3 | 60 | 9550 | 1 | 0 | 0 | 3 | 0 | 0 | 0 | 6 | 2 | 2 | 0 | 5 | 7 | 5 | 1915 | 1970 | 1 | 1 | 13 | 15 | 2 | 0 | 3 | 4 | 0 | 3 | 1 | 3 | 0 | 216 | 5 | 0 | 540 | 756 | 1 | 2 | 1 | 4 | 961 | 756 | 0 | 1717 | 1 | 0 | 1 | 0 | 3 | 1 | 2 | 7 | 6 | 1 | 2 | 5 | 1998 | 2 | 3 | 642 | 4 | 4 | 2 | 0 | 35 | 272 | 0 | 0 | 0 | 2 | 2 | 2 | 0 | 2 | 2006 | 8 | 0 | 140000 |
| 4 | 5 | 60 | 3 | 84 | 14260 | 1 | 0 | 0 | 3 | 0 | 2 | 0 | 15 | 2 | 2 | 0 | 5 | 8 | 5 | 2000 | 2000 | 1 | 1 | 12 | 13 | 1 | 350 | 2 | 4 | 2 | 2 | 3 | 0 | 2 | 655 | 5 | 0 | 490 | 1145 | 1 | 0 | 1 | 4 | 1145 | 1053 | 0 | 2198 | 1 | 0 | 2 | 1 | 4 | 1 | 2 | 9 | 6 | 1 | 4 | 1 | 2000 | 1 | 3 | 836 | 4 | 4 | 2 | 192 | 84 | 0 | 0 | 0 | 0 | 2 | 2 | 2 | 0 | 12 | 2008 | 8 | 4 | 250000 |
Label encoding using sklearn
le = LabelEncoder()
def labelEncode(df):
"""
params: dataframe
returns: dataframe. classes, label encoder classes in a dict with column names as keys
"""
classes = {}
for col in df.columns:
if df[col].dtype == 'O':
le.fit(df[col])
df[col] = le.transform(df[col])
classes[col] = le.classes_
return df,classes
nTrain,labelClasses = labelEncode(nTrain)
print(nTrain.head())
| | Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | LotConfig | LandSlope | Neighborhood | Condition1 | Condition2 | BldgType | HouseStyle | OverallQual | OverallCond | YearBuilt | YearRemodAdd | RoofStyle | RoofMatl | Exterior1st | Exterior2nd | MasVnrType | MasVnrArea | ExterQual | ExterCond | Foundation | BsmtQual | BsmtCond | BsmtExposure | BsmtFinType1 | BsmtFinSF1 | BsmtFinType2 | BsmtFinSF2 | BsmtUnfSF | TotalBsmtSF | Heating | HeatingQC | CentralAir | Electrical | 1stFlrSF | 2ndFlrSF | LowQualFinSF | GrLivArea | BsmtFullBath | BsmtHalfBath | FullBath | HalfBath | BedroomAbvGr | KitchenAbvGr | KitchenQual | TotRmsAbvGrd | Functional | Fireplaces | FireplaceQu | GarageType | GarageYrBlt | GarageFinish | GarageCars | GarageArea | GarageQual | GarageCond | PavedDrive | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice |
|---:|-----:|-------------:|-----------:|--------------:|----------:|---------:|--------:|-----------:|--------------:|------------:|------------:|------------:|---------------:|-------------:|-------------:|-----------:|-------------:|--------------:|--------------:|------------:|---------------:|------------:|-----------:|--------------:|--------------:|-------------:|-------------:|------------:|------------:|-------------:|-----------:|-----------:|---------------:|---------------:|-------------:|---------------:|-------------:|------------:|--------------:|----------:|------------:|-------------:|-------------:|-----------:|-----------:|---------------:|------------:|---------------:|---------------:|-----------:|-----------:|---------------:|---------------:|--------------:|---------------:|-------------:|-------------:|--------------:|-------------:|--------------:|---------------:|-------------:|-------------:|-------------:|-------------:|-------------:|-------------:|--------------:|----------------:|------------:|--------------:|-----------:|---------:|--------:|--------------:|----------:|---------:|---------:|-----------:|----------------:|------------:|
| 0 | 1 | 60 | 3 | 65 | 8450 | 1 | 0 | 3 | 3 | 0 | 4 | 0 | 5 | 2 | 2 | 0 | 5 | 7 | 5 | 2003 | 2003 | 1 | 1 | 12 | 13 | 1 | 196 | 2 | 4 | 2 | 2 | 3 | 3 | 2 | 706 | 5 | 0 | 150 | 856 | 1 | 0 | 1 | 4 | 856 | 854 | 0 | 1710 | 1 | 0 | 2 | 1 | 3 | 1 | 2 | 8 | 6 | 0 | 2 | 1 | 2003 | 1 | 2 | 548 | 4 | 4 | 2 | 0 | 61 | 0 | 0 | 0 | 0 | 2 | 2 | 2 | 0 | 2 | 2008 | 8 | 4 | 208500 |
| 1 | 2 | 20 | 3 | 80 | 9600 | 1 | 0 | 3 | 3 | 0 | 2 | 0 | 24 | 1 | 2 | 0 | 2 | 6 | 8 | 1976 | 1976 | 1 | 1 | 8 | 8 | 2 | 0 | 3 | 4 | 1 | 2 | 3 | 1 | 0 | 978 | 5 | 0 | 284 | 1262 | 1 | 0 | 1 | 4 | 1262 | 0 | 0 | 1262 | 0 | 1 | 2 | 0 | 3 | 1 | 3 | 6 | 6 | 1 | 4 | 1 | 1976 | 1 | 2 | 460 | 4 | 4 | 2 | 298 | 0 | 0 | 0 | 0 | 0 | 2 | 2 | 2 | 0 | 5 | 2007 | 8 | 4 | 181500 |
| 2 | 3 | 60 | 3 | 68 | 11250 | 1 | 0 | 0 | 3 | 0 | 4 | 0 | 5 | 2 | 2 | 0 | 5 | 7 | 5 | 2001 | 2002 | 1 | 1 | 12 | 13 | 1 | 162 | 2 | 4 | 2 | 2 | 3 | 2 | 2 | 486 | 5 | 0 | 434 | 920 | 1 | 0 | 1 | 4 | 920 | 866 | 0 | 1786 | 1 | 0 | 2 | 1 | 3 | 1 | 2 | 6 | 6 | 1 | 4 | 1 | 2001 | 1 | 2 | 608 | 4 | 4 | 2 | 0 | 42 | 0 | 0 | 0 | 0 | 2 | 2 | 2 | 0 | 9 | 2008 | 8 | 4 | 223500 |
| 3 | 4 | 70 | 3 | 60 | 9550 | 1 | 0 | 0 | 3 | 0 | 0 | 0 | 6 | 2 | 2 | 0 | 5 | 7 | 5 | 1915 | 1970 | 1 | 1 | 13 | 15 | 2 | 0 | 3 | 4 | 0 | 3 | 1 | 3 | 0 | 216 | 5 | 0 | 540 | 756 | 1 | 2 | 1 | 4 | 961 | 756 | 0 | 1717 | 1 | 0 | 1 | 0 | 3 | 1 | 2 | 7 | 6 | 1 | 2 | 5 | 1998 | 2 | 3 | 642 | 4 | 4 | 2 | 0 | 35 | 272 | 0 | 0 | 0 | 2 | 2 | 2 | 0 | 2 | 2006 | 8 | 0 | 140000 |
| 4 | 5 | 60 | 3 | 84 | 14260 | 1 | 0 | 0 | 3 | 0 | 2 | 0 | 15 | 2 | 2 | 0 | 5 | 8 | 5 | 2000 | 2000 | 1 | 1 | 12 | 13 | 1 | 350 | 2 | 4 | 2 | 2 | 3 | 0 | 2 | 655 | 5 | 0 | 490 | 1145 | 1 | 0 | 1 | 4 | 1145 | 1053 | 0 | 2198 | 1 | 0 | 2 | 1 | 4 | 1 | 2 | 9 | 6 | 1 | 4 | 1 | 2000 | 1 | 3 | 836 | 4 | 4 | 2 | 192 | 84 | 0 | 0 | 0 | 0 | 2 | 2 | 2 | 0 | 12 | 2008 | 8 | 4 | 250000 |
nTest = imputer(nTest)
nTest,_ = labelEncode(nTest)
def rounded(corr,thresh):
for i in range(corr.shape[0]):
for j in range(corr.shape[1]):
corr.iloc[i,j] = corr.iloc[i,j] if corr.iloc[i,j]>=thresh or corr.iloc[i,j]<=-thresh else np.round(corr.iloc[i,j])
return corr
corr = nTrain.corr()
# corr = rounded(corr,0.7)
fig, ax = plt.subplots(1,1,figsize=(20,15),dpi=100)
ax = sns.heatmap(corr,vmin = -1,vmax = 1,center=0)
# select features
cols = ['YearRemodAdd','SaleCondition','PoolArea','LotShape','OverallQual','ExterQual','BsmtQual', \
'TotalBsmtSF','HeatingQC','1stFlrSF','FullBath','GrLivArea','KitchenQual','GarageCars']
X = nTrain[cols]
y = nTrain['SalePrice']
testX = nTest[cols]
print(X.shape,y.shape)
(1460, 14) (1460,)
trainX,valX,trainy,valy = train_test_split(X,y,test_size=0.4)
print(trainX.shape,trainy.shape,'\n',valX.shape,valy.shape)
(876, 14) (876,)
(584, 14) (584,)
model = DecisionTreeRegressor(criterion='mae',min_samples_split=5,min_samples_leaf=3,splitter='random',random_state=21)
model.fit(trainX,trainy)
print(model.score(valX,valy))
print(model.score(trainX,trainy))
0.7333262120423661
0.8768585202914994
Log Root Mean Squared Error is used in the competitions.
def log_rmse(ytrue,ypred):
sum = 0.0
for i in range(len(ytrue)):
sum += pow((math.log2(ypred[i])-math.log2(ytrue.iloc[i])),2)
return math.sqrt(abs(sum/float(len(ytrue))))
yhat = model.predict(valX)
print(log_rmse(valy,yhat))
0.3138365972186103
Running the fitted model on Test Set to generate house prices and submit to Kaggle
yhat = model.predict(testX)
sub_file = pd.DataFrame({'Id':nTest.Id,'SalePrice':yhat})
sub_file.to_csv(f'{DATA}submission.csv',index=False)