# imports that I'll use over the course of the project
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split, KFold
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import r2_score

%matplotlib inline

# Load the training data and the hold-out set used later for the Kaggle submission.
houses, holdout_set = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/train.csv', './data/test.csv')
)

Basic EDA

# Check out the data types
houses.dtypes

Id                  int64
PID                 int64
MS SubClass         int64
MS Zoning          object
Lot Frontage      float64
Lot Area            int64
Street             object
Alley              object
Lot Shape          object
Land Contour       object
Utilities          object
Lot Config         object
Land Slope         object
Neighborhood       object
Condition 1        object
Condition 2        object
Bldg Type          object
House Style        object
Overall Qual        int64
Overall Cond        int64
Year Built          int64
Year Remod/Add      int64
Roof Style         object
Roof Matl          object
Exterior 1st       object
Exterior 2nd       object
Mas Vnr Type       object
Mas Vnr Area      float64
Exter Qual         object
Exter Cond         object
                   ...   
Half Bath           int64
Bedroom AbvGr       int64
Kitchen AbvGr       int64
Kitchen Qual       object
TotRms AbvGrd       int64
Functional         object
Fireplaces          int64
Fireplace Qu       object
Garage Type        object
Garage Yr Blt     float64
Garage Finish      object
Garage Cars       float64
Garage Area       float64
Garage Qual        object
Garage Cond        object
Paved Drive        object
Wood Deck SF        int64
Open Porch SF       int64
Enclosed Porch      int64
3Ssn Porch          int64
Screen Porch        int64
Pool Area           int64
Pool QC            object
Fence              object
Misc Feature       object
Misc Val            int64
Mo Sold             int64
Yr Sold             int64
Sale Type          object
SalePrice           int64
Length: 81, dtype: object

# Summary statistics, transposed so that each feature is a row.
# Used to spot implausible values: in the recorded output below, for example,
# 'Garage Yr Blt' has a max of 2207, which looks like a data-entry error.
houses.describe().T

	count	mean	std	min	25%	50%	75%	max
Id	2051.0	1.474034e+03	8.439808e+02	1.0	753.5	1486.0	2.198000e+03	2930.0
PID	2051.0	7.135900e+08	1.886918e+08	526301100.0	528458140.0	535453200.0	9.071801e+08	924152030.0
MS SubClass	2051.0	5.700878e+01	4.282422e+01	20.0	20.0	50.0	7.000000e+01	190.0
Lot Frontage	1721.0	6.905520e+01	2.326065e+01	21.0	58.0	68.0	8.000000e+01	313.0
Lot Area	2051.0	1.006521e+04	6.742489e+03	1300.0	7500.0	9430.0	1.151350e+04	159000.0
Overall Qual	2051.0	6.112140e+00	1.426271e+00	1.0	5.0	6.0	7.000000e+00	10.0
Overall Cond	2051.0	5.562165e+00	1.104497e+00	1.0	5.0	5.0	6.000000e+00	9.0
Year Built	2051.0	1.971709e+03	3.017789e+01	1872.0	1953.5	1974.0	2.001000e+03	2010.0
Year Remod/Add	2051.0	1.984190e+03	2.103625e+01	1950.0	1964.5	1993.0	2.004000e+03	2010.0
Mas Vnr Area	2029.0	9.969591e+01	1.749631e+02	0.0	0.0	0.0	1.610000e+02	1600.0
BsmtFin SF 1	2050.0	4.423005e+02	4.612041e+02	0.0	0.0	368.0	7.337500e+02	5644.0
BsmtFin SF 2	2050.0	4.795902e+01	1.650009e+02	0.0	0.0	0.0	0.000000e+00	1474.0
Bsmt Unf SF	2050.0	5.677283e+02	4.449548e+02	0.0	220.0	474.5	8.110000e+02	2336.0
Total Bsmt SF	2050.0	1.057988e+03	4.494107e+02	0.0	793.0	994.5	1.318750e+03	6110.0
1st Flr SF	2051.0	1.164488e+03	3.964469e+02	334.0	879.5	1093.0	1.405000e+03	5095.0
2nd Flr SF	2051.0	3.293291e+02	4.256710e+02	0.0	0.0	0.0	6.925000e+02	1862.0
Low Qual Fin SF	2051.0	5.512921e+00	5.106887e+01	0.0	0.0	0.0	0.000000e+00	1064.0
Gr Liv Area	2051.0	1.499330e+03	5.004478e+02	334.0	1129.0	1444.0	1.728500e+03	5642.0
Bsmt Full Bath	2049.0	4.275256e-01	5.226732e-01	0.0	0.0	0.0	1.000000e+00	3.0
Bsmt Half Bath	2049.0	6.344558e-02	2.517052e-01	0.0	0.0	0.0	0.000000e+00	2.0
Full Bath	2051.0	1.577279e+00	5.492794e-01	0.0	1.0	2.0	2.000000e+00	4.0
Half Bath	2051.0	3.710385e-01	5.010427e-01	0.0	0.0	0.0	1.000000e+00	2.0
Bedroom AbvGr	2051.0	2.843491e+00	8.266183e-01	0.0	2.0	3.0	3.000000e+00	8.0
Kitchen AbvGr	2051.0	1.042906e+00	2.097900e-01	0.0	1.0	1.0	1.000000e+00	3.0
TotRms AbvGrd	2051.0	6.435885e+00	1.560225e+00	2.0	5.0	6.0	7.000000e+00	15.0
Fireplaces	2051.0	5.909313e-01	6.385163e-01	0.0	0.0	1.0	1.000000e+00	4.0
Garage Yr Blt	1937.0	1.978708e+03	2.544109e+01	1895.0	1961.0	1980.0	2.002000e+03	2207.0
Garage Cars	2050.0	1.776585e+00	7.645374e-01	0.0	1.0	2.0	2.000000e+00	5.0
Garage Area	2050.0	4.736717e+02	2.159346e+02	0.0	319.0	480.0	5.760000e+02	1418.0
Wood Deck SF	2051.0	9.383374e+01	1.285494e+02	0.0	0.0	0.0	1.680000e+02	1424.0
Open Porch SF	2051.0	4.755680e+01	6.674724e+01	0.0	0.0	27.0	7.000000e+01	547.0
Enclosed Porch	2051.0	2.257192e+01	5.984511e+01	0.0	0.0	0.0	0.000000e+00	432.0
3Ssn Porch	2051.0	2.591419e+00	2.522961e+01	0.0	0.0	0.0	0.000000e+00	508.0
Screen Porch	2051.0	1.651146e+01	5.737420e+01	0.0	0.0	0.0	0.000000e+00	490.0
Pool Area	2051.0	2.397855e+00	3.778257e+01	0.0	0.0	0.0	0.000000e+00	800.0
Misc Val	2051.0	5.157435e+01	5.733940e+02	0.0	0.0	0.0	0.000000e+00	17000.0
Mo Sold	2051.0	6.219893e+00	2.744736e+00	1.0	4.0	6.0	8.000000e+00	12.0
Yr Sold	2051.0	2.007776e+03	1.312014e+00	2006.0	2007.0	2008.0	2.009000e+03	2010.0
SalePrice	2051.0	1.814697e+05	7.925866e+04	12789.0	129825.0	162500.0	2.140000e+05	611657.0

# Understand the size of the data
houses.shape

(2051, 81)

# Null value check
df_nulls = pd.DataFrame(data=houses.isnull().sum(), columns=['Nulls'])
df_nulls.sort_values('Nulls', ascending=False)
# There seems to be quite a few null values, should look at cleaning these up.

	Nulls
Pool QC	2042
Misc Feature	1986
Alley	1911
Fence	1651
Fireplace Qu	1000
Lot Frontage	330
Garage Finish	114
Garage Qual	114
Garage Yr Blt	114
Garage Cond	114
Garage Type	113
Bsmt Exposure	58
BsmtFin Type 2	56
BsmtFin Type 1	55
Bsmt Cond	55
Bsmt Qual	55
Mas Vnr Area	22
Mas Vnr Type	22
Bsmt Half Bath	2
Bsmt Full Bath	2
Garage Area	1
Total Bsmt SF	1
Bsmt Unf SF	1
BsmtFin SF 2	1
BsmtFin SF 1	1
Garage Cars	1
Mo Sold	0
Sale Type	0
Full Bath	0
Half Bath	0
...	...
MS Zoning	0
Lot Area	0
Street	0
Lot Shape	0
Land Contour	0
Utilities	0
Lot Config	0
Land Slope	0
Neighborhood	0
Condition 1	0
Condition 2	0
Bldg Type	0
House Style	0
Overall Cond	0
2nd Flr SF	0
Year Built	0
Year Remod/Add	0
Roof Style	0
Roof Matl	0
Exterior 1st	0
Exterior 2nd	0
Exter Qual	0
Exter Cond	0
Foundation	0
PID	0
Heating QC	0
Central Air	0
Electrical	0
1st Flr SF	0
SalePrice	0

81 rows × 1 columns

# If and when I want to drop every row containing an NA from the original DF:
# houses.dropna(axis=0, how='any')
# houses.select_dtypes(object)

Feature Engineering

Create Dummy Variables

# One-hot encode every object-dtype column for later modelling.
categorical_columns = houses.select_dtypes(include=object).columns
houses_object_columns = pd.get_dummies(houses, columns=categorical_columns)

houses_object_columns.head()

	Id	PID	MS SubClass	Lot Frontage	Lot Area	Overall Qual	Overall Cond	Year Built	Year Remod/Add	Mas Vnr Area	...	Sale Type_WD
0	109	533352170	60	NaN	13517	6	8	1976	2005	289.0	...	1
1	544	531379050	60	43.0	11492	7	5	1996	1997	132.0	...	1
2	153	535304180	20	68.0	7922	5	7	1953	2007	0.0	...	1
3	318	916386060	60	73.0	9802	5	5	2006	2007	0.0	...	1
4	255	906425045	50	82.0	14235	6	8	1900	1993	0.0	...	1

5 rows × 292 columns

# Re-run the null count on the dummy-encoded frame, largest counts first.
encoded_null_counts = houses_object_columns.isnull().sum()
df_nulls_obj = pd.DataFrame({'Nulls': encoded_null_counts})
df_nulls_obj.sort_values(by='Nulls', ascending=False)

	Nulls
Lot Frontage	330
Garage Yr Blt	114
Mas Vnr Area	22
Bsmt Half Bath	2
Bsmt Full Bath	2
BsmtFin SF 1	1
Garage Area	1
Total Bsmt SF	1
Bsmt Unf SF	1
BsmtFin SF 2	1
Garage Cars	1
Bsmt Exposure_No	0
BsmtFin Type 1_ALQ	0
BsmtFin Type 1_BLQ	0
BsmtFin Type 1_GLQ	0
Bsmt Exposure_Gd	0
Bsmt Exposure_Mn	0
Id	0
Bsmt Exposure_Av	0
Bsmt Cond_TA	0
Bsmt Cond_Po	0
Bsmt Cond_Fa	0
Bsmt Cond_Ex	0
Bsmt Qual_TA	0
Bsmt Qual_Po	0
Bsmt Qual_Gd	0
Bsmt Qual_Fa	0
Bsmt Cond_Gd	0
BsmtFin Type 1_Unf	0
BsmtFin Type 1_LwQ	0
...	...
Neighborhood_Timber	0
Neighborhood_StoneBr	0
Neighborhood_Somerst	0
Neighborhood_SawyerW	0
Neighborhood_SWISU	0
Bldg Type_2fmCon	0
Neighborhood_OldTown	0
Neighborhood_NridgHt	0
Neighborhood_NoRidge	0
Neighborhood_NWAmes	0
Neighborhood_NPkVill	0
Neighborhood_NAmes	0
Condition 1_Feedr	0
Condition 1_Norm	0
Condition 1_PosA	0
Condition 1_PosN	0
Condition 1_RRAe	0
Condition 1_RRAn	0
Condition 1_RRNe	0
Condition 1_RRNn	0
Condition 2_Artery	0
Condition 2_Feedr	0
Condition 2_Norm	0
Condition 2_PosA	0
Condition 2_PosN	0
Condition 2_RRAe	0
Condition 2_RRAn	0
Condition 2_RRNn	0
Bldg Type_1Fam	0
Sale Type_WD	0

292 rows × 1 columns

# Replace every remaining NaN in the encoded frame with 0.
houses_object_columns = houses_object_columns.fillna(0)

First run of features

# Most of the code after the features section was run multiple times to attempt to find the best model

# First pass at feature selection: a handful of plausible predictors picked by hand.
features = [
    'Lot Area',
    'Overall Qual',
    'Fireplaces',
    'TotRms AbvGrd',
]
X, y = houses[features], houses['SalePrice']

Second run of features

# Pull out only the numeric columns to look for variables that correlate
# strongly enough with the target to be worth adding to the model.
# Kept as its own DataFrame so EDA can be run on it.
houses_numonly = houses.select_dtypes(include=np.number)

# houses_numonly_dropna = houses_numonly.dropna(axis=0, how='any')
# houses_numonly_dropna.isnull().sum()

# Original run
# houses_numonly.columns
houses_numonly.shape

# For the original run of determining correlations: use every numeric column.
features = houses_numonly.columns
X, y = houses[features], houses_numonly['SalePrice']

# The raw X.corr() table is too large to read, so draw it as an annotated
# heatmap and look for anything that stands out.
fig, ax = plt.subplots(figsize=(40, 30))
sns.heatmap(X.corr(), annot=True, ax=ax)

# Strongest correlations picked by eye from the heatmap, in descending order.
# The cut-off was 'Garage Yr Blt' (correlation of .53); on further review that
# column had far too many missing data points, so it was left off.
features = [
    'Overall Qual',
    'Gr Liv Area',
    'Garage Cars',
    'Garage Area',
    'Total Bsmt SF',
    '1st Flr SF',
    'Year Built',
    'Year Remod/Add',
    'Full Bath',
    # 'Garage Yr Blt',
]

Third run of features

# Third pass: widen the feature set with a more lenient bar (correlation > .3).
# The extra variables might cause overfitting, but it is worth a look.
features = [
    'Overall Qual',
    'Gr Liv Area',
    'Garage Cars',
    'Garage Area',
    'Total Bsmt SF',
    '1st Flr SF',
    'Year Built',
    'Year Remod/Add',
    'Full Bath',
    'TotRms AbvGrd',
    'Mas Vnr Area',
    'Fireplaces',
    'BsmtFin SF 1',
    'Wood Deck SF',
]

Code below was run on multiple sets of features

# Set your in/dependent variables
# Original usage
# X = houses[features]
# y = houses['SalePrice']

# Old run:
# X = houses_object_columns[features]
# y = houses_object_columns['SalePrice']

# Null check
X.isnull().sum()

# Fill the NaNs in the selected feature columns with 0.
# The fill is done in one call on the frame and the result is reassigned:
# calling fillna(inplace=True) on a column selected from X is chained
# assignment, which raises SettingWithCopyWarning and does not modify X at all
# once pandas Copy-on-Write is enabled. Columns listed here that are not
# present in the current X are simply ignored by fillna.
fill_zero_cols = [
    'Mas Vnr Area',
    'Garage Cars',
    'Garage Area',
    'Total Bsmt SF',
    'BsmtFin SF 1',
]
X = X.fillna(value={col: 0 for col in fill_zero_cols})

# Check:
X.isnull().sum()

# X.shape[0]

Graveyard for data cleanup

# 2nd feature run note: Going to rebuild my features, minus Grg yr blt as there seems to be more than a few nulls
# Some of the next few cells were only used for original run through of cleaning up features
# Take a look at null values in my features:
# X[X['Garage Cars'].isnull() == True]
# Looks like the single null in 'Garage Cars' is in the same row as the one in 'Garage Area'

# # Dropping the rows with the null values
# X = X.drop(X.index[1712])
# # Sets have to match in row length:
# y = y.drop(y.index[1712])

# Check to make sure it worked
# X[X['Garage Cars'].isnull() == True]

# X[X['Total Bsmt SF'].isnull() == True]

# Dropping the rows with the null values
# X = X.drop(X.index[1327])
# Sets have to match in row length:
# y = y.drop(y.index[1327])

# Check to make sure it worked
# X[X['Total Bsmt SF'].isnull() == True]

# Messing around trying to figure out how to locate the null rows
# X.columns[X.isna().any()].tolist()
# X.loc[:,X.isna().any()]

Lasso

# Model on the full dummy-encoded frame: every column except the target is a predictor.
# NOTE(review): X still contains the 'Id' and 'PID' identifier columns — consider
# dropping them, since they are record identifiers rather than house attributes.
X = houses_object_columns.drop('SalePrice', axis=1)
y = houses_object_columns['SalePrice']

# Hold out a test split. The seed is fixed (42, the same value used for the
# Lasso and KFold seeds in this notebook) so scores are reproducible across re-runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Fill any remaining missing values with 0. Reassigning rather than filling in
# place avoids the copy warnings pandas can raise on the split outputs, and the
# test split gets the same treatment so scoring on it cannot hit NaNs.
X_train = X_train.fillna(value=0)
X_test = X_test.fillna(value=0)
y_train = y_train.fillna(value=0)

L = Lasso(alpha=2.5, max_iter=10000, random_state=42)

L.fit(X_train, y_train)

Lasso(alpha=2.5, copy_X=True, fit_intercept=True, max_iter=10000,
   normalize=False, positive=False, precompute=False, random_state=42,
   selection='cyclic', tol=0.0001, warm_start=False)

# Original attempts:
# LinearRegression
# linreg = LinearRegression()
# Original fit
# linreg.fit(X, y)
# linreg.coef_

# Shuffled 5-fold cross-validation with a fixed seed, to randomise the folds and
# to see whether different splits give significantly different scores.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

scores = cross_val_score(estimator=L, X=X_train, y=y_train, cv=kf)
print(scores, scores.mean(), sep='\n')

/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)


[0.91818856 0.93117813 0.9224465  0.92833756 0.88953215]
0.9179365787394577


/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)

# Overfit: the R^2 on the held-out split is well below the cross-validated score on the training split
L.score(X_test, y_test)

0.6759518356006168

# Out-of-fold predictions for the whole data set, plotted against the actual
# sale prices; their R^2 is stored in `accuracy`.
predictions = cross_val_predict(estimator=L, X=X, y=y, cv=kf)
plt.scatter(x=y, y=predictions)
accuracy = r2_score(y_true=y, y_pred=predictions)

/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)

IOWA

# Cross-validate on the held-out split as a second look at generalisation; the
# result suggests the model is overfit.
# Note that this refits the model on folds of the test split rather than scoring
# the model fitted above. Earlier runs were not nearly this bad: in the recorded
# run a single fold fit extremely poorly and dragged the mean down.
scores = cross_val_score(estimator=L, X=X_test, y=y_test, cv=kf)
print(scores, scores.mean(), sep='\n')

/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/coordinate_descent.py:491: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)


[ 0.76061406 -1.83481449  0.78095611  0.77575566  0.77518242]
0.25153875067533676

Code for submission to Kaggle

# Prepare the hold-out set the same way as the training data so that its columns
# match the ones the Lasso model was fitted on.
# One-hot encode BEFORE filling: filling first would turn missing categories
# into the value 0 and produce spurious '<column>_0' dummy columns.
holdout_dummies = pd.get_dummies(
    holdout_set, columns=holdout_set.select_dtypes(include=object).columns
)
holdout_set.fillna(value=0, inplace=True)

holdout_set.head(10)

# Align to the fitted columns: dummies never seen in training are dropped, and
# training dummies absent from the hold-out set are added as all-zero columns.
# (Selecting only the hand-picked `features` here would not match the model,
# which was fitted on the full dummy-encoded matrix.)
X_holdout = holdout_dummies.reindex(columns=X_train.columns, fill_value=0)

X_holdout.head(10)

X_holdout.isnull().sum()

# Replace the remaining NaNs in the numeric columns with 0, as was done for training.
X_holdout = X_holdout.fillna(value=0)

# Lasso run. The earlier LinearRegression prediction was removed because
# `linreg` is no longer defined in this notebook (its cell is commented out).
y_preds = L.predict(X_holdout)

y_preds

my_ids = holdout_set['Id']

df = pd.DataFrame()

df['Id'] = my_ids

df.set_index('Id', inplace=True)

df['SalePrice'] = y_preds

%pwd

df.to_csv('./data/my_preds.csv')

Graveyard - What is dead, may never die.

# Early attempt at collecting the numeric columns by hand; DataFrame.select_dtypes
# solved this, so the code is kept only for reference.
# Iterates over `houses` (the original referenced an undefined `dfhouses`) and
# keeps the integer-typed columns; the unfinished float branch was never enabled.
list_house_intFloat = [
    houses[column]
    for column in houses.columns
    if houses[column].dtype == int
]

# Counts for categorization of Street or Neighborhood
houses['Street'].value_counts()

# Drop na's - was using before I did it on the original set
# X.dropna(axis=0, how='any', inplace=True)

# houses_numonly_dropna = houses_numonly.dropna(axis=0, how='any')
# houses_numonly_dropna.isnull().sum()

Ames Housing Project

Basic EDA

Feature Engineering

Create Dummy Variables

First run of features

Second run of features

Third run of features

Code below was run on multiple sets of features

Graveyard for data cleanup

Lasso

Code for submission to Kaggle

Graveyard - What is dead, may never die.