R-HW5 - Father Essays

HW5
Fengrui Liu
2020/10/1
##3. (e)
library(ISLR)
attach(Weekly)
library(MASS)
## Warning: package ‘MASS’ was built under R version 3.6.2
weekly_training <- Weekly[Year <= 2008,] weekly_test <- Weekly[Year > 2008,]
LAD_data = lda(Direction ~ Lag2, data= weekly_training)
Predicted_LDA <- predict(LAD_data, data.frame(weekly_test))$class table( weekly_test$Direction, Predicted_LDA ) ## Predicted_LDA ## Down Up ## Down 9 34 ## Up 5 56 mean(weekly_test$Direction == Predicted_LDA) ## [1] 0.625 (f) QDA_data <- qda(Direction ~ Lag2, data=weekly_training) Predicted_QDA <- predict(QDA_data, data.frame(weekly_test))$class table( weekly_test$Direction, Predicted_QDA ) ## Predicted_QDA ## Down Up ## Down 0 43 ## Up 0 61 mean(weekly_test$Direction == Predicted_QDA) ## [1] 0.5865385 (g) X.train <- matrix( Lag2[ Year <= 2008 ], sum(Year <= 2008), 1 ) X.test <- matrix( Lag2[ Year > 2008 ], sum(Year >= 2009), 1 )
Y.train <- Direction[ Year <= 2008 ] Y.test <- Direction[ Year > 2008 ]
knn.fit <- class::knn( X.train, X.test, Y.train, 1 ) table( Y.test, knn.fit ) ## knn.fit ## Y.test Down Up ## Down 21 22 1 ## Up 30 31 mean( Y.test == knn.fit ) ## [1] 0.5 (h) LDA: classification rate is 63%. QDA: classification rate is 53%. KNN: classification rate is 51%. LDA is better than QDA. LDA,QDA are better than KNN. 4 attach(Boston) names(Boston) ## [1] "crim" "zn" "indus" "chas" "nox" "rm" "age" ## [8] "dis" "rad" "tax" "ptratio" "black" "lstat" "medv" Crime <- rep(0, length(crim)) Crime[crim > median(crim)] <- 1 Boston<- data.frame(Boston,Crime) train <- 1:(dim(Boston)[1]/2) test <- (dim(Boston)[1]/2 + 1):dim(Boston)[1] Boston_train <- Boston[train, ] Boston_test <- Boston[test, ] crim_test <- Crime[test] ## Logistic regression. reg_model1 <- glm(Crime ~ zn + indus + chas + nox + rm + age + dis + rad + tax + ptratio + black + lstat + medv, family="binomial") summary(reg_model1) ## ## Call: ## glm(formula = Crime ~ zn + indus + chas + nox + rm + age + dis + ## rad + tax + ptratio + black + lstat + medv, family = "binomial") ## ## Deviance Residuals: ## Min 1Q Median 3Q Max ## -2.3946 -0.1585 -0.0004 0.0023 3.4239 ## ## Coefficients: ## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -34.103704 6.530014 -5.223 1.76e-07 ***
## zn -0.079918 0.033731 -2.369 0.01782 *
## indus -0.059389 0.043722 -1.358 0.17436
## chas 0.785327 0.728930 1.077 0.28132
## nox 48.523782 7.396497 6.560 5.37e-11 ***
## rm -0.425596 0.701104 -0.607 0.54383
## age 0.022172 0.012221 1.814 0.06963 .
## dis 0.691400 0.218308 3.167 0.00154 **
## rad 0.656465 0.152452 4.306 1.66e-05 ***
## tax -0.006412 0.002689 -2.385 0.01709 *
## ptratio 0.368716 0.122136 3.019 0.00254 **
## black -0.013524 0.006536 -2.069 0.03853 *
## lstat 0.043862 0.048981 0.895 0.37052
## medv 0.167130 0.066940 2.497 0.01254 *
## —
2
## Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ‘ 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 701.46 on 505 degrees of freedom
## Residual deviance: 211.93 on 492 degrees of freedom
## AIC: 239.93
##
## Number of Fisher Scoring iterations: 9
Because of the P-value, significant variables are: zn, nox, age, dis, rad, tax, ptratio, black, medv.
reg_model2<- glm(Crime ~ zn + nox + age + dis + rad + tax + ptratio + black + medv, data = Boston_train, family="binomial") ## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred summary(reg_model2) ## ## Call: ## glm(formula = Crime ~ zn + nox + age + dis + rad + tax + ptratio + ## black + medv, family = "binomial", data = Boston_train) ## ## Deviance Residuals: ## Min 1Q Median 3Q Max ## -2.1131 -0.1780 -0.0009 0.2415 3.0885 ## ## Coefficients: ## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -66.233571 11.236568 -5.894 3.76e-09 ***
## zn -0.428786 0.098572 -4.350 1.36e-05 ***
## nox 81.501997 14.107158 5.777 7.59e-09 ***
## age 0.020096 0.017923 1.121 0.2622
## dis 2.199728 0.463544 4.745 2.08e-06 ***
## rad 1.523621 0.312386 4.877 1.08e-06 ***
## tax -0.012168 0.005396 -2.255 0.0241 *
## ptratio 0.639813 0.164406 3.892 9.96e-05 ***
## black -0.009557 0.005961 -1.603 0.1089
## medv 0.094511 0.048743 1.939 0.0525 .
## —
## Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ‘ 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 329.37 on 252 degrees of freedom
## Residual deviance: 91.88 on 243 degrees of freedom
## AIC: 111.88
##
## Number of Fisher Scoring iterations: 9
p <- predict(reg_model2, type = "response", data.frame(Boston_test)) p_crime <- 1*(p > 0.5)
table(Boston_test$Crime, p_crime)
## p_crime
## 0 1
## 0 71 19
3
## 1 21 142
mean(Boston_test$Crime != p_crime)
## [1] 0.1581028
## LDA
lda.crime<- lda(Crime ~ zn + nox + age + dis + rad + tax + ptratio + black + medv, data=Boston_train) predict_lda <- predict(lda.crime, data.frame(Boston_test))$class table(Boston_test$Crime, predict_lda) ## predict_lda ## 0 1 ## 0 80 10 ## 1 22 141 mean(Boston_test$Crime != predict_lda) ## [1] 0.1264822 ## QDA qda.crime<- qda(Crime ~ zn + nox + age + dis + rad + tax + ptratio + black + medv, data=Boston_train) predict_qda <- predict(qda.crime, data.frame(Boston_test))$class table(Boston_test$Crime, predict_qda) ## predict_qda ## 0 1 ## 0 81 9 ## 1 155 8 81+9+8+155 ## [1] 253 155+9 ## [1] 164 164/253 ## [1] 0.6482213 ## KNN x_test <- Boston_test[ , c(2,5,7,8,9,10,11,12,14)] x_train <- Boston_train[ , c(2,5,7,8,9,10,11,12,14)] crime_test <- Crime[test] knn <- class::knn(x_test, x_train,crime_test,1) table(crim_test,knn) ## knn ## crim_test 0 1 ## 0 57 33 ## 1 125 38 57+33+125+38 ## [1] 253 125+33 ## [1] 158 4 158/253 ## [1] 0.6245059 According to the error rate, LDA will be best model in this case, because it havs the lowest error rate 12.65%. 5