Housing Prices Analysis

Housing Prices Analysis

Kaggle - Housing Prices Dataset - PCA / Decision Trees / Random Forests

House_Prices.knit

Import Libraries

library(ggplot2)
library(survival)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble  3.1.6     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.1.1     v forcats 0.5.1
## v purrr   0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(conflicted)
#library(plyr)
#conflict_prefer("rename", "plyr")
library(dplyr)
library(stringr)
library(data.table)
library(dummies)
## dummies-1.5.6 provided by Decision Patterns
library(tidyr)
library(corrplot)
## corrplot 0.92 loaded
library(e1071)
library(caret)
## Loading required package: lattice
library(Metrics)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
library(devtools)
## Loading required package: usethis
library(ggfortify)
library(rpart)
library(rpart.plot)
library(MLmetrics)
options(warn=-1)
# download data from this link https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data
train_data<- read.csv("data/train.csv", stringsAsFactors = F, header = T)
test_data <- read.csv("data/test.csv", stringsAsFactors = F, header = T)

Change the distribution of Sale price (from skewed to a normal distribution)

ggplot(train_data,aes(SalePrice))+geom_histogram(fill="steelblue",color="black")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(train_data,aes(SalePrice))+geom_histogram(fill="steelblue",color="black")+scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

train_data$SalePrice <- log(train_data$SalePrice+1)
test_data$SalePrice <- as.numeric(0)
combined <- rbind(train_data,test_data)

Dealing with missing values

missing_values <- train_data %>% summarise_all(funs(sum(is.na(.)/n())))
missing_values <- gather(missing_values,key = "feature",value = "missing_pct")
#missing_values 

# only  data with missing values, we ignore the rest
data_w_miss <- missing_values[missing_values$missing_pct > 0, ]
data_w_miss
##         feature  missing_pct
## 4   LotFrontage 0.1773972603
## 7         Alley 0.9376712329
## 26   MasVnrType 0.0054794521
## 27   MasVnrArea 0.0054794521
## 31     BsmtQual 0.0253424658
## 32     BsmtCond 0.0253424658
## 33 BsmtExposure 0.0260273973
## 34 BsmtFinType1 0.0253424658
## 36 BsmtFinType2 0.0260273973
## 43   Electrical 0.0006849315
## 58  FireplaceQu 0.4726027397
## 59   GarageType 0.0554794521
## 60  GarageYrBlt 0.0554794521
## 61 GarageFinish 0.0554794521
## 64   GarageQual 0.0554794521
## 65   GarageCond 0.0554794521
## 73       PoolQC 0.9952054795
## 74        Fence 0.8075342466
## 75  MiscFeature 0.9630136986
# plot to see which features have the most missing values
ggplot(data_w_miss,aes(x=feature,y=missing_pct))+geom_bar(stat="identity",fill="blue")+
  coord_flip()+theme_bw()

There are cases in which simply replacing the NAs with the mean/or deleting will not be appropriate in the context. We leave them as they will be considered as factors in the cases that are categorical variables.

combined$GarageYrBlt[combined$GarageYrBlt==2207] <- 2007 #typo

combined$LotFrontage[is.na(combined$LotFrontage)] <- 0
combined$MasVnrArea[is.na(combined$MasVnrArea)] <- 0


combined$BsmtFinSF1[is.na(combined$BsmtFinSF1)] <- 0
combined$BsmtFinSF2[is.na(combined$BsmtFinSF2)] <- 0
combined$BsmtUnfSF[is.na(combined$BsmtUnfSF)] <- 0
combined$TotalBsmtSF[is.na(combined$TotalBsmtSF)] <- 0
combined$BsmtFullBath[is.na(combined$BsmtFullBath)] <- 0
combined$BsmtHalfBath[is.na(combined$BsmtHalfBath)] <- 0

combined$GarageYrBlt[is.na(combined$GarageYrBlt)] <- 0
combined$GarageCars[is.na(combined$GarageCars)] <- 0
combined$GarageArea[is.na(combined$GarageArea)] <- 0
combined[is.na(combined)] <- "None"

Recoding ordered factors as pseudo-continuous numerical variables

combined$ExterQual<- recode(combined$ExterQual,"None"=0,"Po"=1,"Fa"=2,"TA"=3,"Gd"=4,"Ex"=5)
combined$ExterCond<- recode(combined$ExterCond,"None"=0,"Po"=1,"Fa"=2,"TA"=3,"Gd"=4,"Ex"=5)
combined$BsmtQual<- recode(combined$BsmtQual,"None"=0,"Po"=1,"Fa"=2,"TA"=3,"Gd"=4,"Ex"=5)
combined$BsmtCond<- recode(combined$BsmtCond,"None"=0,"Po"=1,"Fa"=2,"TA"=3,"Gd"=4,"Ex"=5)
combined$BsmtExposure<- recode(combined$BsmtExposure,"None"=0,"No"=1,"Mn"=2,"Av"=3,"Gd"=4)
combined$BsmtFinType1<- recode(combined$BsmtFinType1,"None"=0,"Unf"=1,"LwQ"=2,"Rec"=3,"BLQ"=4,"ALQ"=5,"GLQ"=6)
combined$BsmtFinType2<- recode(combined$BsmtFinType2,"None"=0,"Unf"=1,"LwQ"=2,"Rec"=3,"BLQ"=4,"ALQ"=5,"GLQ"=6)
combined$HeatingQC<- recode(combined$HeatingQC,"None"=0,"Po"=1,"Fa"=2,"TA"=3,"Gd"=4,"Ex"=5)
combined$KitchenQual<- recode(combined$KitchenQual,"None"=0,"Po"=1,"Fa"=2,"TA"=3,"Gd"=4,"Ex"=5)
combined$Functional<- recode(combined$Functional,"None"=0,"Sev"=1,"Maj2"=2,"Maj1"=3,"Mod"=4,"Min2"=5,"Min1"=6,"Typ"=7)
combined$FireplaceQu<- recode(combined$FireplaceQu,"None"=0,"Po"=1,"Fa"=2,"TA"=3,"Gd"=4,"Ex"=5)
combined$GarageFinish<- recode(combined$GarageFinish,"None"=0,"Unf"=1,"RFn"=2,"Fin"=3)
combined$GarageQual<- recode(combined$GarageQual,"None"=0,"Po"=1,"Fa"=2,"TA"=3,"Gd"=4,"Ex"=5)
combined$GarageCond<- recode(combined$GarageCond,"None"=0,"Po"=1,"Fa"=2,"TA"=3,"Gd"=4,"Ex"=5)
combined$PoolQC<- recode(combined$PoolQC,"None"=0,"Po"=1,"Fa"=2,"TA"=3,"Gd"=4,"Ex"=5)
combined$Fence<- recode(combined$Fence,"None"=0,"MnWw"=1,"GdWo"=2,"MnPrv"=3,"GdPrv"=4)
combined$TotalSF = combined$TotalBsmtSF + combined$X1stFlrSF + combined$X2ndFlrSF

Renaming columns

combined_dummy <-dummy.data.frame(combined,dummy.classes = "character")
combined_dummy <- rename(combined_dummy,"MSZoningC"="MSZoningC (all)")
combined_dummy <- rename(combined_dummy,"RoofMatlTarGrv"="RoofMatlTar&Grv")
combined_dummy <- rename(combined_dummy,"Exterior1stWdSdng"="Exterior1stWd Sdng")
combined_dummy <- rename(combined_dummy,"Exterior2ndBrkCmn"="Exterior2ndBrk Cmn")
combined_dummy <- rename(combined_dummy,"Exterior2ndWdSdng"="Exterior2ndWd Sdng")
combined_dummy <- rename(combined_dummy,"Exterior2ndWdShng"="Exterior2ndWd Shng")

Combining data and applying the BoxCox Transformation

combined_dummy <-dummy.data.frame(combined, dummy.classes = "character")


feature_classes <- sapply(names(combined_dummy), function(x) {
  class(combined_dummy[[x]])
})
numeric_feats <- names(feature_classes[feature_classes != "character"])
skewed_feats <- sapply(numeric_feats, function(x) {
  skewness(combined_dummy[[x]], na.rm = TRUE)
})
skewed_feats <- skewed_feats[abs(skewed_feats) > 0.75]
for (x in names(skewed_feats)) {
  bc = BoxCoxTrans(combined_dummy[[x]], lambda = 0.15)
  combined_dummy[[x]] = predict(bc, combined_dummy[[x]])
}

We split combined data back into test, train, validation sets.

train_dummy <- combined_dummy[1:1460,]
test_final <- combined_dummy[1461:2919,] # stays as is

set.seed(2)
in_train <- createDataPartition(train_dummy$SalePrice,p=0.7,list=F)
train_final <- train_dummy[in_train,]
validation <- train_dummy[-in_train,]

Removing outliers

We remove the datapoints beyond the 75th quantile, note that after applying the

outliers <- function(x) {

  Q1 <- quantile(x, probs=.25)
  Q3 <- quantile(x, probs=.75)
  iqr = Q3-Q1

 upper_limit = Q3 + (iqr*1.5)
 lower_limit = Q1 - (iqr*1.5)

 x > upper_limit | x < lower_limit
}

remove_outliers <- function(df, cols = names(df)) {
  for (col in cols) {
    df <- df[!outliers(df[[col]]),]
  }
  df
}


par (mfrow=c(2,2))

remove_outliers(train_final, c(colnames(train_final)))
##        Id MSSubClass MSZoningC (all) MSZoningFV MSZoningNone MSZoningRH
## 428   428   2.995732               0          0            0          0
## 968   968   2.995732               0          0            0          0
## 1071 1071   2.995732               0          0            0          0
##      MSZoningRL MSZoningRM LotFrontage  LotArea StreetGrvl StreetPave AlleyGrvl
## 428           1          0          77 9.058703          0          1         0
## 968           1          0           0 8.907883          0          1         0
## 1071          1          0          72 9.225426          0          1         0
##      AlleyNone AlleyPave LotShapeIR1 LotShapeIR2 LotShapeIR3 LotShapeReg
## 428          1         0           1           0           0           0
## 968          1         0           1           0           0           0
## 1071         1         0           0           0           0           1
##      LandContourBnk LandContourHLS LandContourLow LandContourLvl
## 428               0              0              0              1
## 968               0              0              0              1
## 1071              0              0              0              1
##      UtilitiesAllPub UtilitiesNone UtilitiesNoSeWa LotConfigCorner
## 428                1             0               0               0
## 968                1             0               0               0
## 1071               1             0               0               0
##      LotConfigCulDSac LotConfigFR2 LotConfigFR3 LotConfigInside LandSlopeGtl
## 428                 0            0            0               1            1
## 968                 0            0            0               1            1
## 1071                0            0            0               1            1
##      LandSlopeMod LandSlopeSev NeighborhoodBlmngtn NeighborhoodBlueste
## 428             0            0                   0                   0
## 968             0            0                   0                   0
## 1071            0            0                   0                   0
##      NeighborhoodBrDale NeighborhoodBrkSide NeighborhoodClearCr
## 428                   0                   0                   0
## 968                   0                   0                   0
## 1071                  0                   0                   0
##      NeighborhoodCollgCr NeighborhoodCrawfor NeighborhoodEdwards
## 428                    0                   0                   0
## 968                    0                   0                   0
## 1071                   0                   0                   0
##      NeighborhoodGilbert NeighborhoodIDOTRR NeighborhoodMeadowV
## 428                    0                  0                   0
## 968                    0                  0                   0
## 1071                   0                  0                   0
##      NeighborhoodMitchel NeighborhoodNAmes NeighborhoodNoRidge
## 428                    0                 1                   0
## 968                    0                 1                   0
## 1071                   0                 1                   0
##      NeighborhoodNPkVill NeighborhoodNridgHt NeighborhoodNWAmes
## 428                    0                   0                  0
## 968                    0                   0                  0
## 1071                   0                   0                  0
##      NeighborhoodOldTown NeighborhoodSawyer NeighborhoodSawyerW
## 428                    0                  0                   0
## 968                    0                  0                   0
## 1071                   0                  0                   0
##      NeighborhoodSomerst NeighborhoodStoneBr NeighborhoodSWISU
## 428                    0                   0                 0
## 968                    0                   0                 0
## 1071                   0                   0                 0
##      NeighborhoodTimber NeighborhoodVeenker Condition1Artery Condition1Feedr
## 428                   0                   0                0               0
## 968                   0                   0                0               0
## 1071                  0                   0                0               0
##      Condition1Norm Condition1PosA Condition1PosN Condition1RRAe Condition1RRAn
## 428               1              0              0              0              0
## 968               1              0              0              0              0
## 1071              1              0              0              0              0
##      Condition1RRNe Condition1RRNn Condition2Artery Condition2Feedr
## 428               0              0                0               0
## 968               0              0                0               0
## 1071              0              0                0               0
##      Condition2Norm Condition2PosA Condition2PosN Condition2RRAe Condition2RRAn
## 428               1              0              0              0              0
## 968               1              0              0              0              0
## 1071              1              0              0              0              0
##      Condition2RRNn BldgType1Fam BldgType2fmCon BldgTypeDuplex BldgTypeTwnhs
## 428               0            1              0              0             0
## 968               0            1              0              0             0
## 1071              0            1              0              0             0
##      BldgTypeTwnhsE HouseStyle1.5Fin HouseStyle1.5Unf HouseStyle1Story
## 428               0                0                0                1
## 968               0                0                0                1
## 1071              0                0                0                1
##      HouseStyle2.5Fin HouseStyle2.5Unf HouseStyle2Story HouseStyleSFoyer
## 428                 0                0                0                0
## 968                 0                0                0                0
## 1071                0                0                0                0
##      HouseStyleSLvl OverallQual OverallCond YearBuilt YearRemodAdd
## 428               0           4           6      1957         1957
## 968               0           5           7      1955         1955
## 1071              0           5           5      1956         1956
##      RoofStyleFlat RoofStyleGable RoofStyleGambrel RoofStyleHip
## 428              0              0                0            1
## 968              0              0                0            1
## 1071             0              0                0            1
##      RoofStyleMansard RoofStyleShed RoofMatlClyTile RoofMatlCompShg
## 428                 0             0               0               1
## 968                 0             0               0               1
## 1071                0             0               0               1
##      RoofMatlMembran RoofMatlMetal RoofMatlRoll RoofMatlTar&Grv RoofMatlWdShake
## 428                0             0            0               0               0
## 968                0             0            0               0               0
## 1071               0             0            0               0               0
##      RoofMatlWdShngl Exterior1stAsbShng Exterior1stAsphShn Exterior1stBrkComm
## 428                0                  0                  0                  0
## 968                0                  0                  0                  0
## 1071               0                  0                  0                  0
##      Exterior1stBrkFace Exterior1stCBlock Exterior1stCemntBd Exterior1stHdBoard
## 428                   0                 0                  0                  0
## 968                   0                 0                  0                  0
## 1071                  0                 0                  0                  0
##      Exterior1stImStucc Exterior1stMetalSd Exterior1stNone Exterior1stPlywood
## 428                   0                  1               0                  0
## 968                   0                  0               0                  0
## 1071                  0                  1               0                  0
##      Exterior1stStone Exterior1stStucco Exterior1stVinylSd Exterior1stWd Sdng
## 428                 0                 0                  0                  0
## 968                 0                 0                  0                  1
## 1071                0                 0                  0                  0
##      Exterior1stWdShing Exterior2ndAsbShng Exterior2ndAsphShn
## 428                   0                  0                  0
## 968                   0                  0                  0
## 1071                  0                  0                  0
##      Exterior2ndBrk Cmn Exterior2ndBrkFace Exterior2ndCBlock Exterior2ndCmentBd
## 428                   0                  0                 0                  0
## 968                   0                  0                 0                  0
## 1071                  0                  0                 0                  0
##      Exterior2ndHdBoard Exterior2ndImStucc Exterior2ndMetalSd Exterior2ndNone
## 428                   0                  0                  1               0
## 968                   0                  0                  0               0
## 1071                  0                  0                  1               0
##      Exterior2ndOther Exterior2ndPlywood Exterior2ndStone Exterior2ndStucco
## 428                 0                  0                0                 0
## 968                 0                  0                0                 0
## 1071                0                  0                0                 0
##      Exterior2ndVinylSd Exterior2ndWd Sdng Exterior2ndWd Shng MasVnrTypeBrkCmn
## 428                   0                  0                  0                0
## 968                   0                  1                  0                0
## 1071                  0                  0                  0                0
##      MasVnrTypeBrkFace MasVnrTypeNone MasVnrTypeStone MasVnrArea ExterQual
## 428                  0              1               0          0  1.098612
## 968                  1              0               0        151  1.098612
## 1071                 1              0               0        120  1.098612
##      ExterCond FoundationBrkTil FoundationCBlock FoundationPConc FoundationSlab
## 428   1.098612                0                1               0              0
## 968   1.098612                0                1               0              0
## 1071  1.098612                0                1               0              0
##      FoundationStone FoundationWood BsmtQual BsmtCond BsmtExposure BsmtFinType1
## 428                0              0        3        3            1            3
## 968                0              0        3        3            1            5
## 1071               0              0        3        3            1            4
##      BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF HeatingFloor
## 428         288            1          0       619         907            0
## 968         902            1          0       196        1098            0
## 1071        586            1          0       462        1048            0
##      HeatingGasA HeatingGasW HeatingGrav HeatingOthW HeatingWall HeatingQC
## 428            1           0           0           0           0         5
## 968            1           0           0           0           0         3
## 1071           1           0           0           0           0         3
##      CentralAirN CentralAirY ElectricalFuseA ElectricalFuseF ElectricalFuseP
## 428            0           1               0               0               0
## 968            0           1               0               0               0
## 1071           0           1               0               0               0
##      ElectricalMix ElectricalNone ElectricalSBrkr X1stFlrSF X2ndFlrSF
## 428              0              0               1  6.810142         0
## 968              0              0               1  7.001246         0
## 1071             0              0               1  6.954639         0
##      LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath
## 428             0  6.810142            0            0        1        0
## 968             0  7.001246            1            0        1        0
## 1071            0  6.954639            1            0        1        0
##      BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces
## 428             3            1           3     1.609438          7          0
## 968             3            1           3     1.791759          7          0
## 1071            3            1           3     1.791759          7          0
##      FireplaceQu GarageType2Types GarageTypeAttchd GarageTypeBasment
## 428            0                0                0                 0
## 968            0                0                1                 0
## 1071           0                0                1                 0
##      GarageTypeBuiltIn GarageTypeCarPort GarageTypeDetchd GarageTypeNone
## 428                  0                 0                1              0
## 968                  0                 0                0              0
## 1071                 0                 0                0              0
##      GarageYrBlt GarageFinish GarageCars GarageArea GarageQual GarageCond
## 428         1964            1          1        352          3          3
## 968         1955            1          1        260          3          3
## 1071        1956            1          1        286          3          3
##      PavedDriveN PavedDriveP PavedDriveY WoodDeckSF OpenPorchSF EnclosedPorch
## 428            0           0           1          0           0             0
## 968            0           0           1          0           0             0
## 1071           0           0           1          0          20             0
##      X3SsnPorch ScreenPorch PoolArea PoolQC Fence MiscFeatureGar2
## 428           0           0        0      0     0               0
## 968           0           0        0      0     0               0
## 1071          0         192        0      0     0               0
##      MiscFeatureNone MiscFeatureOthr MiscFeatureShed MiscFeatureTenC MiscVal
## 428                1               0               0               0       0
## 968                1               0               0               0       0
## 1071               1               0               0               0       0
##      MoSold YrSold SaleTypeCOD SaleTypeCon SaleTypeConLD SaleTypeConLI
## 428       7   2008           0           0             0             0
## 968       7   2008           0           0             0             0
## 1071      6   2007           0           0             0             0
##      SaleTypeConLw SaleTypeCWD SaleTypeNew SaleTypeNone SaleTypeOth SaleTypeWD
## 428              0           0           0            0           0          1
## 968              0           0           0            0           0          1
## 1071             0           0           0            0           0          1
##      SaleConditionAbnorml SaleConditionAdjLand SaleConditionAlloca
## 428                     0                    0                   0
## 968                     0                    0                   0
## 1071                    0                    0                   0
##      SaleConditionFamily SaleConditionNormal SaleConditionPartial SalePrice
## 428                    0                   1                    0  11.59919
## 968                    0                   1                    0  11.81304
## 1071                   0                   1                    0  11.81304
##       TotalSF
## 428  7.503290
## 968  7.694393
## 1071 7.647786
ggplot(train_final,aes(y=SalePrice,x=GrLivArea))+ggtitle("Data Without Outliers")+geom_point()

Principal Component Analysis

dim(train_final)
## [1] 1024  242
# removes columns with 0 variance
train_final = train_final[,which(apply(train_final, 2, var) != 0)]

pca <- prcomp(train_final, scale. = T)

#center and scale refers to mean and standard deviation of the variables
names(pca)
## [1] "sdev"     "rotation" "center"   "scale"    "x"
autoplot(pca, loadings = TRUE)

#standard deviation of each principal component
std_dev <- pca$sdev
#variance
variance <- std_dev^2
#divide the variance by sum of total variance -> to compute the proportion of variance explained by each component
variance_prop <- variance/sum(variance)
#first principal component explains 6.98% of the variance, second 3.2%, third 2.5% 
variance_prop[1:10]
##  [1] 0.07664723 0.03282208 0.02694386 0.02301143 0.01973973 0.01835723
##  [7] 0.01784603 0.01677611 0.01570886 0.01430152
#scree plot - the percentage of variance explained by each principal component
plot(variance_prop, xlab = "Principal Component", ylab = "Proportion of Variance Explained", type = "b", xlim=c(0, 100))

#cumulative variance plot
# ~ 60 components explains around 70% variance in the data set.
plot(cumsum(variance_prop), xlab = "Principal Component", ylab = "Cumulative Proportion of Variance Explained", type = "b", xlim=c(0, 60))

# removes columns with 0 variance
#test_final = test_final[,which(apply(test_final, 2, var) != 0)]

#add a column
test_final$SalePrice <- 1
#new training set with principal components
train_set_pca <- data.frame(SalePrice = train_final$SalePrice, pca$x)
train_set_pca = train_set_pca[,1:61]
head(train_set_pca[1:5])
##    SalePrice        PC1        PC2         PC3        PC4
## 2   12.10902  0.1551383 -2.9764653 -0.01247599  0.2175957
## 3   12.31717  4.0168783  1.1426040  0.82805455  0.9160877
## 9   11.77453 -4.5218842  3.5537311 -3.35060667  0.9019141
## 10  11.67845 -4.0704384  0.2301704 -1.15017189  1.8126455
## 11  11.77144 -2.3207574 -3.5080720  1.77489898 -0.3919360
## 12  12.75130  7.8913836  3.3030587 -2.67760983 -1.0625691

Decision Tree

# anova used for regression method here
model_tree <- rpart(SalePrice ~ .,data = train_set_pca, method = "anova", minsplit=10)

#transform test into PCA
pca <- prcomp(train_final, scale. = T)

test_set_pca <- predict(pca, newdata = test_final)
test_set_pca <- as.data.frame(test_set_pca)
# TEST SET first 60 PCAs
test_set_pca <- test_set_pca[,1:61]
#Plotting best size of tree -> on minimum error
plotcp(model_tree)
minimum.error <- which.min(model_tree$cptable[, "xerror"])
optimal.complexity <- model_tree$cptable[minimum.error, "CP"]
points(minimum.error, model_tree$cptable[minimum.error, "xerror"],
       col = "red", pch = 19)

rpart.plot(model_tree, type=1, extra=100, box.palette ="-RdYlGn", branch.lty = 2)

valid_set_pca <- predict(pca, newdata = validation)
valid_set_pca <- as.data.frame(valid_set_pca)
valid_set_pca <- valid_set_pca[,1:61]

sale_price_dtree <- predict(model_tree, newdata=valid_set_pca)
rmse(validation$SalePrice,sale_price_dtree)
## [1] 0.1993762

Random Forest

forest_model <- randomForest(SalePrice ~ ., data=train_set_pca, ntree = 300) 
varImpPlot(forest_model)

sale_price_forest <- predict(forest_model, newdata=valid_set_pca)
rmse(validation$SalePrice,sale_price_forest)
## [1] 0.1621256

Conclusion

On our validation Set, our random forest model produced a lower rmse value than the decision tree model. Both our models did a fairly well job of predicting the sales prices as our rmse for both models do not exceed 0.2, and we chose the random forest model to predict sales prices on the test set. Our final kaggle score when using PCA with random forest is 0.4984, which is better than the score of 0.60923 without PCA.

test_sp_forest <- predict(forest_model, newdata=test_set_pca)

# We undo the sp = log(sp+1) transformation we did for normality
test_sp_forest = exp(test_sp_forest)-1

Id<-test_data$Id
send_to_csv <- data.frame(Id, test_sp_forest)
names(send_to_csv)[names(send_to_csv) == 'test_sp_forest'] <- 'SalePrice'
write_csv(send_to_csv, "random_forest_submission.csv")

# Kaggle score is 0.60923 without
# kaggle score is 0.49841 when using first 60 pca