Thomas Kelly

Ames Housing Project

Published 18 Sep 2018.

# imports that I'll use over the course of the project
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split, KFold
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import r2_score

%matplotlib inline
# read in data
houses = pd.read_csv('./data/train.csv')
holdout_set = pd.read_csv('./data/test.csv')

Basic EDA

# Check out the data types
houses.dtypes
Id                  int64
PID                 int64
MS SubClass         int64
MS Zoning          object
Lot Frontage      float64
Lot Area            int64
Street             object
Alley              object
Lot Shape          object
Land Contour       object
Utilities          object
Lot Config         object
Land Slope         object
Neighborhood       object
Condition 1        object
Condition 2        object
Bldg Type          object
House Style        object
Overall Qual        int64
Overall Cond        int64
Year Built          int64
Year Remod/Add      int64
Roof Style         object
Roof Matl          object
Exterior 1st       object
Exterior 2nd       object
Mas Vnr Type       object
Mas Vnr Area      float64
Exter Qual         object
Exter Cond         object
                   ...   
Half Bath           int64
Bedroom AbvGr       int64
Kitchen AbvGr       int64
Kitchen Qual       object
TotRms AbvGrd       int64
Functional         object
Fireplaces          int64
Fireplace Qu       object
Garage Type        object
Garage Yr Blt     float64
Garage Finish      object
Garage Cars       float64
Garage Area       float64
Garage Qual        object
Garage Cond        object
Paved Drive        object
Wood Deck SF        int64
Open Porch SF       int64
Enclosed Porch      int64
3Ssn Porch          int64
Screen Porch        int64
Pool Area           int64
Pool QC            object
Fence              object
Misc Feature       object
Misc Val            int64
Mo Sold             int64
Yr Sold             int64
Sale Type          object
SalePrice           int64
Length: 81, dtype: object
# Basic description to check any weirdness
houses.describe().T
count mean std min 25% 50% 75% max
Id 2051.0 1.474034e+03 8.439808e+02 1.0 753.5 1486.0 2.198000e+03 2930.0
PID 2051.0 7.135900e+08 1.886918e+08 526301100.0 528458140.0 535453200.0 9.071801e+08 924152030.0
MS SubClass 2051.0 5.700878e+01 4.282422e+01 20.0 20.0 50.0 7.000000e+01 190.0
Lot Frontage 1721.0 6.905520e+01 2.326065e+01 21.0 58.0 68.0 8.000000e+01 313.0
Lot Area 2051.0 1.006521e+04 6.742489e+03 1300.0 7500.0 9430.0 1.151350e+04 159000.0
Overall Qual 2051.0 6.112140e+00 1.426271e+00 1.0 5.0 6.0 7.000000e+00 10.0
Overall Cond 2051.0 5.562165e+00 1.104497e+00 1.0 5.0 5.0 6.000000e+00 9.0
Year Built 2051.0 1.971709e+03 3.017789e+01 1872.0 1953.5 1974.0 2.001000e+03 2010.0
Year Remod/Add 2051.0 1.984190e+03 2.103625e+01 1950.0 1964.5 1993.0 2.004000e+03 2010.0
Mas Vnr Area 2029.0 9.969591e+01 1.749631e+02 0.0 0.0 0.0 1.610000e+02 1600.0
BsmtFin SF 1 2050.0 4.423005e+02 4.612041e+02 0.0 0.0 368.0 7.337500e+02 5644.0
BsmtFin SF 2 2050.0 4.795902e+01 1.650009e+02 0.0 0.0 0.0 0.000000e+00 1474.0
Bsmt Unf SF 2050.0 5.677283e+02 4.449548e+02 0.0 220.0 474.5 8.110000e+02 2336.0
Total Bsmt SF 2050.0 1.057988e+03 4.494107e+02 0.0 793.0 994.5 1.318750e+03 6110.0
1st Flr SF 2051.0 1.164488e+03 3.964469e+02 334.0 879.5 1093.0 1.405000e+03 5095.0
2nd Flr SF 2051.0 3.293291e+02 4.256710e+02 0.0 0.0 0.0 6.925000e+02 1862.0
Low Qual Fin SF 2051.0 5.512921e+00 5.106887e+01 0.0 0.0 0.0 0.000000e+00 1064.0
Gr Liv Area 2051.0 1.499330e+03 5.004478e+02 334.0 1129.0 1444.0 1.728500e+03 5642.0
Bsmt Full Bath 2049.0 4.275256e-01 5.226732e-01 0.0 0.0 0.0 1.000000e+00 3.0
Bsmt Half Bath 2049.0 6.344558e-02 2.517052e-01 0.0 0.0 0.0 0.000000e+00 2.0
Full Bath 2051.0 1.577279e+00 5.492794e-01 0.0 1.0 2.0 2.000000e+00 4.0
Half Bath 2051.0 3.710385e-01 5.010427e-01 0.0 0.0 0.0 1.000000e+00 2.0
Bedroom AbvGr 2051.0 2.843491e+00 8.266183e-01 0.0 2.0 3.0 3.000000e+00 8.0
Kitchen AbvGr 2051.0 1.042906e+00 2.097900e-01 0.0 1.0 1.0 1.000000e+00 3.0
TotRms AbvGrd 2051.0 6.435885e+00 1.560225e+00 2.0 5.0 6.0 7.000000e+00 15.0
Fireplaces 2051.0 5.909313e-01 6.385163e-01 0.0 0.0 1.0 1.000000e+00 4.0
Garage Yr Blt 1937.0 1.978708e+03 2.544109e+01 1895.0 1961.0 1980.0 2.002000e+03 2207.0
Garage Cars 2050.0 1.776585e+00 7.645374e-01 0.0 1.0 2.0 2.000000e+00 5.0
Garage Area 2050.0 4.736717e+02 2.159346e+02 0.0 319.0 480.0 5.760000e+02 1418.0
Wood Deck SF 2051.0 9.383374e+01 1.285494e+02 0.0 0.0 0.0 1.680000e+02 1424.0
Open Porch SF 2051.0 4.755680e+01 6.674724e+01 0.0 0.0 27.0 7.000000e+01 547.0
Enclosed Porch 2051.0 2.257192e+01 5.984511e+01 0.0 0.0 0.0 0.000000e+00 432.0
3Ssn Porch 2051.0 2.591419e+00 2.522961e+01 0.0 0.0 0.0 0.000000e+00 508.0
Screen Porch 2051.0 1.651146e+01 5.737420e+01 0.0 0.0 0.0 0.000000e+00 490.0
Pool Area 2051.0 2.397855e+00 3.778257e+01 0.0 0.0 0.0 0.000000e+00 800.0
Misc Val 2051.0 5.157435e+01 5.733940e+02 0.0 0.0 0.0 0.000000e+00 17000.0
Mo Sold 2051.0 6.219893e+00 2.744736e+00 1.0 4.0 6.0 8.000000e+00 12.0
Yr Sold 2051.0 2.007776e+03 1.312014e+00 2006.0 2007.0 2008.0 2.009000e+03 2010.0
SalePrice 2051.0 1.814697e+05 7.925866e+04 12789.0 129825.0 162500.0 2.140000e+05 611657.0
# Understand the size of the data
houses.shape
(2051, 81)
# Null value check
df_nulls = pd.DataFrame(data=houses.isnull().sum(), columns=['Nulls'])
df_nulls.sort_values('Nulls', ascending=False)
# There seem to be quite a few null values; I should look at cleaning these up.
Nulls
Pool QC 2042
Misc Feature 1986
Alley 1911
Fence 1651
Fireplace Qu 1000
Lot Frontage 330
Garage Finish 114
Garage Qual 114
Garage Yr Blt 114
Garage Cond 114
Garage Type 113
Bsmt Exposure 58
BsmtFin Type 2 56
BsmtFin Type 1 55
Bsmt Cond 55
Bsmt Qual 55
Mas Vnr Area 22
Mas Vnr Type 22
Bsmt Half Bath 2
Bsmt Full Bath 2
Garage Area 1
Total Bsmt SF 1
Bsmt Unf SF 1
BsmtFin SF 2 1
BsmtFin SF 1 1
Garage Cars 1
Mo Sold 0
Sale Type 0
Full Bath 0
Half Bath 0
... ...
MS Zoning 0
Lot Area 0
Street 0
Lot Shape 0
Land Contour 0
Utilities 0
Lot Config 0
Land Slope 0
Neighborhood 0
Condition 1 0
Condition 2 0
Bldg Type 0
House Style 0
Overall Cond 0
2nd Flr SF 0
Year Built 0
Year Remod/Add 0
Roof Style 0
Roof Matl 0
Exterior 1st 0
Exterior 2nd 0
Exter Qual 0
Exter Cond 0
Foundation 0
PID 0
Heating QC 0
Central Air 0
Electrical 0
1st Flr SF 0
SalePrice 0

81 rows × 1 columns

# If and when I want to drop all NAs from the original DF:
# houses.dropna(axis=0, how='any')
# houses.select_dtypes(object)
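A possible next step (a minimal sketch using nothing beyond pandas): pairing null counts with percentages makes it easier to decide which columns to drop versus fill.

# Sketch: null counts alongside percentages, to prioritize cleanup
null_summary = pd.DataFrame({'Nulls': houses.isnull().sum(),
                             'Pct': (houses.isnull().mean() * 100).round(1)})
null_summary.sort_values('Nulls', ascending=False).head(10)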

Feature Engineering

Create Dummy Variables

# Setup dummy variables for later
houses_object_columns = pd.get_dummies(houses, columns=houses.select_dtypes(object).columns)
houses_object_columns.head()
Id PID MS SubClass Lot Frontage Lot Area Overall Qual Overall Cond Year Built Year Remod/Add Mas Vnr Area ... Misc Feature_TenC Sale Type_COD Sale Type_CWD Sale Type_Con Sale Type_ConLD Sale Type_ConLI Sale Type_ConLw Sale Type_New Sale Type_Oth Sale Type_WD
0 109 533352170 60 NaN 13517 6 8 1976 2005 289.0 ... 0 0 0 0 0 0 0 0 0 1
1 544 531379050 60 43.0 11492 7 5 1996 1997 132.0 ... 0 0 0 0 0 0 0 0 0 1
2 153 535304180 20 68.0 7922 5 7 1953 2007 0.0 ... 0 0 0 0 0 0 0 0 0 1
3 318 916386060 60 73.0 9802 5 5 2006 2007 0.0 ... 0 0 0 0 0 0 0 0 0 1
4 255 906425045 50 82.0 14235 6 8 1900 1993 0.0 ... 0 0 0 0 0 0 0 0 0 1

5 rows × 292 columns
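One refinement worth noting (a sketch, not what was used here): passing drop_first=True to get_dummies drops one reference level per categorical column, which avoids perfect collinearity among the dummies when fitting a linear model.

# Sketch: dummies with one reference level dropped per column
houses_dummies = pd.get_dummies(houses,
                                columns=houses.select_dtypes(object).columns,
                                drop_first=True)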

df_nulls_obj = pd.DataFrame(data=houses_object_columns.isnull().sum(), columns=['Nulls'])
df_nulls_obj.sort_values('Nulls', ascending=False)
Nulls
Lot Frontage 330
Garage Yr Blt 114
Mas Vnr Area 22
Bsmt Half Bath 2
Bsmt Full Bath 2
BsmtFin SF 1 1
Garage Area 1
Total Bsmt SF 1
Bsmt Unf SF 1
BsmtFin SF 2 1
Garage Cars 1
Bsmt Exposure_No 0
BsmtFin Type 1_ALQ 0
BsmtFin Type 1_BLQ 0
BsmtFin Type 1_GLQ 0
Bsmt Exposure_Gd 0
Bsmt Exposure_Mn 0
Id 0
Bsmt Exposure_Av 0
Bsmt Cond_TA 0
Bsmt Cond_Po 0
Bsmt Cond_Fa 0
Bsmt Cond_Ex 0
Bsmt Qual_TA 0
Bsmt Qual_Po 0
Bsmt Qual_Gd 0
Bsmt Qual_Fa 0
Bsmt Cond_Gd 0
BsmtFin Type 1_Unf 0
BsmtFin Type 1_LwQ 0
... ...
Neighborhood_Timber 0
Neighborhood_StoneBr 0
Neighborhood_Somerst 0
Neighborhood_SawyerW 0
Neighborhood_SWISU 0
Bldg Type_2fmCon 0
Neighborhood_OldTown 0
Neighborhood_NridgHt 0
Neighborhood_NoRidge 0
Neighborhood_NWAmes 0
Neighborhood_NPkVill 0
Neighborhood_NAmes 0
Condition 1_Feedr 0
Condition 1_Norm 0
Condition 1_PosA 0
Condition 1_PosN 0
Condition 1_RRAe 0
Condition 1_RRAn 0
Condition 1_RRNe 0
Condition 1_RRNn 0
Condition 2_Artery 0
Condition 2_Feedr 0
Condition 2_Norm 0
Condition 2_PosA 0
Condition 2_PosN 0
Condition 2_RRAe 0
Condition 2_RRAn 0
Condition 2_RRNn 0
Bldg Type_1Fam 0
Sale Type_WD 0

292 rows × 1 columns

# Clean up NAN
houses_object_columns.fillna(value=0, inplace=True)
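The blanket zero-fill is crude for a column like Lot Frontage, where zero isn't a plausible value; a median fill (a sketch of an alternative, not what was run here) might preserve more signal.

# Sketch: median imputation for Lot Frontage instead of zero-filling
lf_median = houses['Lot Frontage'].median()
houses_object_columns['Lot Frontage'] = houses['Lot Frontage'].fillna(lf_median)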

First run of features

# Most of the code after the features section was run multiple times to attempt to find the best model
# Original run of feature engineering
# Just randomly picking a few potential predictors
features = ['Lot Area', 'Overall Qual','Fireplaces','TotRms AbvGrd']
X = houses[features]
y = houses['SalePrice']

Second run of features

# I'm going to pull out only the numeric columns to see if there are variables with high correlation that I can throw in to improve my model.
# Make it into its own DF to run some EDA on it
houses_numonly = houses.select_dtypes(np.number)
# houses_numonly_dropna = houses_numonly.dropna(axis=0, how='any')
# houses_numonly_dropna.isnull().sum()
# Original run
# houses_numonly.columns
houses_numonly.shape
# For original run of determining correlations
features = houses_numonly.columns
X = houses[features]
y = houses_numonly['SalePrice']
# X.corr() is far too painful to look at here
# Run the heatmap, see if anything sticks out
plt.subplots(figsize=(40,30))
sns.heatmap(X.corr(), annot=True)
# Just eyeballing it, some important correlations I see (descending):
# I stopped at 'Garage Yr Blt', which had a correlation of .53
# Upon further review, 'Garage Yr Blt' had far too many missing data points, so was left off
features = ['Overall Qual',
'Gr Liv Area',
'Garage Cars',
'Garage Area',
'Total Bsmt SF',
'1st Flr SF',
'Year Built',
'Year Remod/Add',
'Full Bath']
# 'Garage Yr Blt']
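For reference, the same shortlist can be pulled programmatically instead of eyeballing the heatmap (a sketch; the 0.5 cutoff mirrors the eyeball threshold above).

# Sketch: rank numeric features by absolute correlation with SalePrice
corrs = houses_numonly.corr()['SalePrice'].abs().sort_values(ascending=False)
corrs[corrs > 0.5].index.drop('SalePrice').tolist()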

Third run of features

# Third time around: going to add a few variables - might cause overfitting, but let's see
# Was more lax about which variables I added (based on correlation > .3)
features = ['Overall Qual',
'Gr Liv Area',
'Garage Cars',
'Garage Area',
'Total Bsmt SF',
'1st Flr SF',
'Year Built',
'Year Remod/Add',
'Full Bath',
'TotRms AbvGrd',
'Mas Vnr Area',
'Fireplaces',
'BsmtFin SF 1',
'Wood Deck SF']

Code below was run on multiple sets of features

# Set your in/dependent variables
# Original usage
# X = houses[features]
# y = houses['SalePrice']
# Old run:
# X = houses_object_columns[features]
# y = houses_object_columns['SalePrice']
# Null check
X.isnull().sum()
# Fill in the nas:
# Despite the SettingWithCopyWarning, this still works:
X['Mas Vnr Area'].fillna(value=0, inplace=True)
X['Garage Cars'].fillna(value=0, inplace=True)
X['Garage Area'].fillna(value=0, inplace=True)
X['Total Bsmt SF'].fillna(value=0, inplace=True)
X['BsmtFin SF 1'].fillna(value=0, inplace=True)
# Check:
X.isnull().sum()
# X.shape[0]
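A warning-free alternative (a sketch using plain pandas): reassign instead of mutating the slice in place.

# Sketch: fill the same columns without triggering SettingWithCopyWarning
na_cols = ['Mas Vnr Area', 'Garage Cars', 'Garage Area',
           'Total Bsmt SF', 'BsmtFin SF 1']
X = X.fillna({col: 0 for col in na_cols})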

Graveyard for data cleanup

# 2nd feature run note: Going to rebuild my features, minus Grg yr blt as there seems to be more than a few nulls
# Some of the next few cells were only used for original run through of cleaning up features
# Take a look at null values in my features:
# X[X['Garage Cars'].isnull() == True]
# Looks like the singular null in 'Grg Cars' is one and the same as the one in 'Grg Area'
# # Dropping the rows with the null values
# X = X.drop(X.index[1712])
# # Sets have to match in row length:
# y = y.drop(y.index[1712])
# Check to make sure it worked
# X[X['Garage Cars'].isnull() == True]
# X[X['Total Bsmt SF'].isnull() == True]
# Dropping the rows with the null values
# X = X.drop(X.index[1327])
# Sets have to match in row length:
# y = y.drop(y.index[1327])
# Check to make sure it worked
# X[X['Total Bsmt SF'].isnull() == True]
# Messing around trying to figure out how to locate the null rows
# X.columns[X.isna().any()].tolist()
# X.loc[:,X.isna().any()]

Lasso

X = houses_object_columns.drop('SalePrice', axis=1)
y = houses_object_columns['SalePrice']
# train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)
X_train.fillna(value=0, inplace=True)
# X_test.fillna(value=0, inplace=True)
y_train.fillna(value=0, inplace=True)
# y_test.fillna(value=0, inplace=True)
L = Lasso(alpha = 2.5, max_iter=10000, random_state = 42)
L.fit(X_train, y_train)
Lasso(alpha=2.5, copy_X=True, fit_intercept=True, max_iter=10000,
   normalize=False, positive=False, precompute=False, random_state=42,
   selection='cyclic', tol=0.0001, warm_start=False)
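One thing worth trying (a sketch only, not what was run here): standardizing features before Lasso, since unscaled inputs are a common cause of the ConvergenceWarnings that show up below.

# Sketch: Lasso on standardized features via a Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
scaled_lasso = make_pipeline(StandardScaler(),
                             Lasso(alpha=2.5, max_iter=10000, random_state=42))
scaled_lasso.fit(X_train, y_train)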
# Original attempts:
# LinearRegression
# linreg = LinearRegression()
# Original fit
# linreg.fit(X, y)
# linreg.coef_
# Using KFold to help randomize the folds, and also see if different splits get me significantly different scores
kf = KFold(n_splits=5, random_state=42, shuffle=True)
scores = cross_val_score(L, X_train, y_train, cv=kf)
print(scores)
print(scores.mean())
/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
(same warning repeated for several folds)


[0.91818856 0.93117813 0.9224465  0.92833756 0.88953215]
0.9179365787394577


# Overfit
L.score(X_test, y_test)
0.6759518356006168
predictions = cross_val_predict(L, X, y, cv=kf)
plt.scatter(y, predictions)
accuracy = r2_score(y, predictions)
/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)

[Scatter plot: cross-validated predictions vs. actual SalePrice]

# Looks like my model is overfit
# I ran this a few times and didn't get nearly as bad a number. Looks like there was one fold my model was just completely ill-suited to.
scores = cross_val_score(L, X_test, y_test, cv=kf)
print(scores)
print(scores.mean())
/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)


[ 0.76061406 -1.83481449  0.78095611  0.77575566  0.77518242]
0.25153875067533676
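To chase down which fold misbehaves (a sketch; this refits the model per fold and was not part of the original run):

# Sketch: score each fold of the test set individually
for i, (tr, te) in enumerate(kf.split(X_test)):
    L.fit(X_test.iloc[tr], y_test.iloc[tr])
    print(f'fold {i}: {L.score(X_test.iloc[te], y_test.iloc[te]):.3f}')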

Code for submission to Kaggle

# Clean up and make changes along the same lines to holdout_set
holdout_set.fillna(value=0, inplace=True)
holdout_set.head(10)
X_holdout = holdout_set[features]
X_holdout.head(10)
X_holdout.isnull().sum()
# Replace the one NA in Mas Vnr Area
X_holdout.fillna(value=0, inplace=True)
# Original linreg (commented out, since linreg was never fit in this run):
# y_preds = linreg.predict(X_holdout)
# Lasso run:
y_preds = L.predict(X_holdout)
y_preds
my_ids = holdout_set['Id']
df = pd.DataFrame()
df['Id'] = my_ids
df.set_index('Id', inplace=True)
df['SalePrice'] = y_preds
%pwd
df.to_csv('./data/my_preds.csv')
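A quick sanity check on the submission file before uploading (a sketch):

# Sketch: confirm the submission has the expected shape and columns
check = pd.read_csv('./data/my_preds.csv')
assert list(check.columns) == ['Id', 'SalePrice']
assert check.shape[0] == holdout_set.shape[0]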

Graveyard - What is dead, may never die.

# Was messing around trying to find a loop that could get me all numeric columns:
# .select_dtypes solved this issue
list_house_intFloat = []
for column in houses.columns:
    if houses[column].dtype == int:
        list_house_intFloat.append(houses[column])
    elif houses[column].dtype == float:
        list_house_intFloat.append(houses[column])
# Counts for categorization of Street or Neighborhood
houses['Street'].value_counts()
# Drop na's - was using before I did it on the original set
# X.dropna(axis=0, how='any', inplace=True)
# houses_numonly_dropna = houses_numonly.dropna(axis=0, how='any')
# houses_numonly_dropna.isnull().sum()