-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathFinal02_ImputeData.R
66 lines (55 loc) · 2.14 KB
/
Final02_ImputeData.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
### packages
library(mice)
library(missForest)
### ------------ Imputation ------------ ###
## trial and error: imputation
HousePrice <- read.csv("HousePrice.csv")

### ------------ Testing Imputation Accuracy ------------ ###
# The experiment below is disabled (commented out) and kept for reference.
# # get the complete cases of house price, which are continuous
# CompleteHousePrice = HousePrice[complete.cases(HousePrice), ]
# mean(is.na(HousePrice)) # missing rate
#
# ## use CompleteHousePrice as input for analysis to save time
# write.csv(CompleteHousePrice, "ImputedHousePrice.csv", row.names = FALSE)
#
# ### imputation by multi-session
# # NOTE: the mice() calls below read TestImputeHousePrice, which is only
# # created by the prodNA() line that follows; enable it together with them.
# # TestImputeHousePrice = prodNA(CompleteHousePrice, noNA = .05) # package missForest
#
# RFimputeHousePrice = mice(TestImputeHousePrice, method = "rf", maxit = 2, m = 20)
# saveRDS(RFimputeHousePrice, "RFimpute.rds")
# complete(RFimputeHousePrice)
#
# CARTimputeHousePrice = mice(TestImputeHousePrice, method = "cart", maxit = 2, m = 20)
# saveRDS(CARTimputeHousePrice, "CARTimpute.rds")
# complete(CARTimputeHousePrice)

### ------------ Ignore this part ------------ ###
## take a look at the NA data
str(HousePrice)
NAPerCol <- colSums(is.na(HousePrice)) # number of missing values per column
NAPerCol[NAPerCol > 0] # only the columns that have missing values

## rows with NA among the columns matching "cafe.*1500"
# drop = FALSE keeps a data frame even when the pattern matches one column only;
# without it the subset collapses to a vector and the row indexing below fails.
cafe1500 <- HousePrice[, grepl("cafe.*1500", colnames(HousePrice)),
                       drop = FALSE]
cafe1500[!complete.cases(cafe1500), , drop = FALSE]

## rows with NA among the columns matching "cafe.*2000"
cafe2000 <- HousePrice[, grepl("cafe.*2000", colnames(HousePrice)),
                       drop = FALSE]
cafe2000[!complete.cases(cafe2000), , drop = FALSE]

# Conclusion: the NA values in these columns all sit in the same rows,
# so some values may still be missing after imputation.
# Run mice() on `data`, save the result to disk and report the elapsed time.
#
# Arguments:
#   data   - data set to impute; passed unchanged as the first argument of mice().
#   METHOD - imputation method forwarded to mice(method = ...), e.g. "cart" or
#            "rf". It is also the prefix of the output file name, so it must
#            not have more than one element.
#   MAXIT  - forwarded to mice(maxit = ...); default 2.
#   M      - forwarded to mice(m = ...); default 10.
#
# Side effects:
#   Writes the mice() result to "<METHOD>_impute.rds" in the working directory
#   and prints one "Time Pass: ..." line.
#
# Returns:
#   A list with `impute` (the object returned by mice()) and `timePass`
#   (the elapsed time as a difftime).
imputeFUN <- function(data, METHOD, MAXIT = 2, M = 10) {
  # Several method names would yield several file names, which saveRDS()
  # cannot write; fail here instead of after the expensive imputation.
  if (length(METHOD) > 1L) {
    stop("METHOD must be a single method name, got ", length(METHOD),
         call. = FALSE)
  }

  # imputation, timed
  start <- Sys.time()
  impute <- mice(data, method = METHOD, maxit = MAXIT, m = M)
  end <- Sys.time()
  timePass <- end - start

  # save imputed data
  saveName <- paste0(METHOD, "_impute.rds")
  saveRDS(impute, saveName)

  # cat() on a raw difftime prints only the number; format() keeps the unit
  # (secs / mins / hours), and the newline ends the line cleanly.
  cat("Time Pass: ", format(timePass), "\n", sep = "")

  # return all the output as a list
  list(impute = impute, timePass = timePass)
}
# Run the imputation once per method on the full data set; each result is the
# list returned by imputeFUN().
cart <- imputeFUN(data = HousePrice, METHOD = "cart")
randomforest <- imputeFUN(data = HousePrice, METHOD = "rf")
# pmm <- imputeFUN(data = HousePrice, METHOD = "pmm")