# Load the customer data set from GitHub.
# The URL must not carry trailing whitespace inside the quotes.
sim.dat <- read.csv(
  "https://raw.githubusercontent.com/happyrabbit/DataScientistR/master/Data/SegData.csv"
)
summary(sim.dat)
# set problematic values as missings
# which() ignores NA comparisons, so only rows that truly match are replaced
sim.dat$age[which(sim.dat$age > 100)] <- NA
sim.dat$store_exp[which(sim.dat$store_exp < 0)] <- NA
# see the results
summary(subset(sim.dat, select = c("age", "income")))
# Use the impute() function in the imputeMissings package.
# save the result as another object
demo_imp <- impute(sim.dat, method = "median/mode")
# check the first 5 columns; there are no missing values in the other columns
summary(demo_imp[, 1:5])
# Use the preProcess() function in the caret package with "medianImpute".
# The fitted object must be stored as `imp`, because predict() below reads it.
imp <- preProcess(sim.dat, method = "medianImpute")
demo_imp2 <- predict(imp, sim.dat)
summary(demo_imp2[, 1:5])
# Use the preProcess() function in the caret package with "knnImpute".
# The fitted object must be stored as `imp`, because predict() below reads it.
imp <- preProcess(sim.dat, method = "knnImpute", k = 5)
# need to use predict() to get KNN result
# NOTE(review): sim.dat has non-numeric variables, which predict() may reject
# here -- confirm against the installed caret version.
demo_imp <- predict(imp, sim.dat)
# sim.dat has non-numeric variables, so pass only the numeric columns to
# predict() for KNN imputation.
imp <- preProcess(sim.dat, method = "knnImpute", k = 5)
# find non-numeric columns (factor, or character when strings are not
# converted to factors on import)
idx <- which(!vapply(sim.dat, is.numeric, logical(1)))
# Select by positive index: sim.dat[, -idx] would return zero columns when
# idx is empty, because -integer(0) is still integer(0).
demo_imp <- predict(
  imp,
  sim.dat[, setdiff(seq_along(sim.dat), idx), drop = FALSE]
)
summary(demo_imp[, 1:3])
# Fit the "bagImpute" pre-processing model on the full data set
imp <- preProcess(x = sim.dat, method = "bagImpute")
# Apply the fitted model to obtain the imputed data
demo_imp <- predict(object = imp, newdata = sim.dat)
# Inspect the first five columns of the imputed result
summary(demo_imp[, seq_len(5)])
# Standardize income by hand: subtract the mean, then divide by the
# standard deviation. Missing values are excluded from both statistics.
income <- sim.dat$income
# calculate the mean of income
mux <- mean(income, na.rm = TRUE)
# calculate the standard deviation of income
sdx <- sd(income, na.rm = TRUE)
# centering
tr1 <- income - mux
# scaling
tr2 <- tr1 / sdx
# Center and scale with preProcess() instead of doing it by hand.
sdat <- subset(sim.dat, select = c("age", "income"))
# set the "method" option
trans <- preProcess(sdat, method = c("center", "scale"))
# use predict() function to get the final result
transformed <- predict(trans, sdat)
# NOTE(review): describe() is not defined in this section -- presumably
# psych::describe(); confirm which package is attached.
describe(sim.dat)
# select the two columns and save them as dat_bc
dat_bc <- subset(sim.dat, select = c("store_trans", "online_trans"))
# the outer parentheses print the fitted Box-Cox pre-processing object
(trans <- preProcess(dat_bc, method = c("BoxCox")))
# Use predict() to get the transformed result:
transformed <- predict(trans, dat_bc)
# Z-score: \[Z_{i}=\frac{Y_{i}-\bar{Y}}{s}\] where \(\bar{Y}\) and \(s\) are the mean and standard deviation of \(Y\).
# Modified Z-score: \[M_{i}=\frac{0.6745(Y_{i}-\tilde{Y})}{MAD}\]
# where \(\tilde{Y}\) is the median of \(Y\) and MAD, the median absolute deviation, is the median of the series \(|Y_{i}-\tilde{Y}|\).
# corrplot(): function from the corrplot package for plotting a correlation matrix.
# Use the class.ind() function from the nnet package.
# The result must be stored as `dumVar`, because head() below reads it.
dumVar <- class.ind(sim.dat$gender)
head(dumVar)
# Use the dummyVars() function from the caret package.
dumMod <- dummyVars(~ gender + house + income,
  data = sim.dat,
  # use "original variable name + level" as new name
  levelsOnly = FALSE
)
head(predict(dumMod, sim.dat))