apply(), lapply() and sapply() in base R
ddply() in plyr package
dplyr package
reshape2 package
tidyr package
dplyr package
Successor to the plyr package
Focused on data frames (hence the "d" in the name)
Uses Rcpp
dplyr package
tbl_df(): Convert the data to tibble
# Attach dplyr, then show sim.dat converted to a tibble.
# NOTE(review): tbl_df() is deprecated in dplyr >= 1.0.0 in favour of
# as_tibble() -- confirm which dplyr version this material targets.
library(dplyr)
tbl_df(sim.dat)
glimpse(): This is like a transposed version of tbl_df()
glimpse(sim.dat)
Rows with income more than 300000:
filter(sim.dat, income >300000) %>%
tbl_df()
%>%: “Pipe operator”
%>%
x %>% f(y) = f(x, y)
y %>% f(x, ., z) = f(x, y, z )
For example: "Hello World" %>% substring(7, 11) %>% grepl("Wo", .)
%>%
Look at the following code. Can you tell me what it does?
# Nested form: the calls are evaluated from the innermost one outwards.
ave_exp <- filter(
  summarise(
    group_by(
      filter(sim.dat, !is.na(income)),
      segment
    ),
    ave_online_exp = mean(online_exp),
    n = n()
  ),
  n > 200
)
Now look at the equivalent code using “%>%”:
# Piped form, read top to bottom:
#   1. drop rows whose income is missing
#   2. group by segment
#   3. per segment, compute the mean of online_exp and the row count
#   4. keep only segments with more than 200 rows
# The chain is written once: repeating it verbatim only recomputes the
# same value and reassigns avg_exp.
avg_exp <- sim.dat %>%
  filter(!is.na(income)) %>%
  group_by(segment) %>%
  summarise(
    ave_online_exp = mean(online_exp),
    n = n()
  ) %>%
  filter(n > 200)
distinct(): a generalization of unique() from vector to data frame
dplyr::distinct(sim.dat)
sample_frac(): randomly selects a specified fraction of the rows.
sample_n(): randomly selects a specified number of rows.
dplyr::sample_frac(sim.dat, 0.5, replace = TRUE)
dplyr::sample_n(sim.dat, 10, replace = TRUE)
slice() will select rows by position:
# It is equivalent to `sim.dat[10:15,]`
dplyr::slice(sim.dat, 10:15)
top_n() selects the top n entries by order:
dplyr::top_n(sim.dat,2,income)
# select by column name
dplyr::select(sim.dat, income, age, store_exp)
# select columns whose name contains a character string
dplyr::select(sim.dat, contains("_"))
# select columns whose name ends with a character string
# (starts_with() is the analogous helper for prefixes)
dplyr::select(sim.dat, ends_with("e"))
# select columns Q1, Q2, Q3, Q4 and Q5
# (dplyr:: prefix added so that this call, like the others, cannot be
# masked by another attached package's select())
dplyr::select(sim.dat, num_range("Q", 1:5))
# select columns whose names are in a group of names
dplyr::select(sim.dat, one_of(c("age", "income")))
# select columns between age and online_exp
dplyr::select(sim.dat, age:online_exp)
# select all columns except for age
dplyr::select(sim.dat, -age)
# mean of online_trans over all rows
dplyr::summarise(sim.dat, avg_online = mean(online_trans))
# apply anyNA() to each column
# several functions can be supplied as a named list, such as:
# list(anyNA = anyNA, is.factor = is.factor)
# summarise_all() replaces the deprecated summarise_each()/funs_() pair
# (dplyr >= 1.0 also offers summarise(across(everything(), anyNA)))
dplyr::summarise_all(sim.dat, anyNA)
group_by()
# per segment, apply anyNA() to every non-grouping column
# summarise_all() replaces the deprecated summarise_each()/funs_() pair
sim.dat %>%
  group_by(segment) %>%
  summarise_all(anyNA)
mutate(): compute and append one or more new columns:
dplyr::mutate(sim.dat, total_exp = store_exp + online_exp)
# min_rank() is equivalent to rank(ties.method = "min")
# mutate_all() applies a function to every column; it replaces the
# deprecated mutate_each()/funs() pair
dplyr::mutate_all(sim.dat, min_rank)
transmute(): delete the original columns and only keep the new ones
dplyr::transmute(sim.dat, total_exp = store_exp + online_exp)
# Two small data frames sharing the key column ID.
# The columns are passed to data.frame() directly: routing them through
# cbind() first builds a character matrix, which turns x1 and y1 into
# strings. The outer parentheses print each result.
(x <- data.frame(ID = c("A", "B", "C"), x1 = c(1, 2, 3)))
(y <- data.frame(ID = c("B", "C", "D"), y1 = c(TRUE, TRUE, FALSE)))
# left join: keep every row of x, adding y's columns where ID matches
left_join(x, y, by = "ID")
# inner join: keep only the rows whose ID is matched in both data sets
inner_join(x, y, by = "ID")
# full join: keep the rows whose ID appears in either data set
full_join(x, y, by = "ID")
# semi join: keep the rows of x that can be matched in y,
# without bringing in any values from y
semi_join(x, y, by = "ID")
# anti join: the opposite of semi_join() -- keep the rows of x that
# cannot be matched in y, without bringing in any values from y
anti_join(x, y, by = "ID")
“Tidy data” represents the information in a dataset as data frames where each row is an observation and each column contains the values of a variable.
Data often needs to be converted between the “wide” and the “long” format.
Two commonly used packages for this kind of manipulation: tidyr and reshape2
reshape2 package
A reboot of the reshape package
melt() to convert an object into a molten data frame, i.e. from wide to long
dcast() to cast a molten data frame into the shape you want, i.e. from long to wide
# Take a baby subset of our exemplary clothes consumers data to illustrate:
(sdat<-sim.dat[1:5,1:6])
reshape2 example
library(reshape2)
# Melt sdat: store_exp and online_exp are stacked into rows, labelled by
# Channel, with their values in Expense. The outer parentheses print it.
(mdat <- melt(
  sdat,
  measure.vars = c("store_exp", "online_exp"),
  variable.name = "Channel",
  value.name = "Expense"
))
# Here we use all observations from sim.dat
mdat <- melt(
  sim.dat[, 1:6],
  measure.vars = c("store_exp", "online_exp"),
  variable.name = "Channel",
  value.name = "Expense"
)
# Linear model of Expense on the remaining columns plus Channel
fit <- lm(Expense ~ gender + house + income + Channel + age, data = mdat)
summary(fit)
reshape2 example
dcast(mdat, house + gender ~ Channel, sum)
tidyr package
library(dplyr)
library(tidyr)
# practice functions we learnt before:
# take the first five rows and keep four columns
sdat <- sim.dat[1:5, ] %>%
  dplyr::select(age, gender, store_exp, store_trans)
sdat %>%
  tbl_df()
gather() is analogous to melt() in reshape2
library(tidyr)
# Stack store_exp and store_trans into a "variable" / "value" column pair
msdat <- tidyr::gather(sdat, "variable", "value", store_exp, store_trans)
msdat %>%
  tbl_df()
# the same call written as a pipeline
sdat %>%
  gather("variable", "value", store_exp, store_trans)
The equivalent using melt():
library(reshape2)
# Same reshaping with reshape2: store_exp and store_trans are the
# measured columns, stacked into "variable" / "value"
melt(
  sdat,
  measure.vars = c("store_exp", "store_trans"),
  variable.name = "variable",
  value.name = "value"
)
spread()
msdat %>% spread(variable,value)
separate() and unite()
# `sep =` sets the delimiter; it is left at its default here, so the
# values of `variable` are split at "_" into Source and Type
sepdat <- msdat %>%
  separate(variable, c("Source", "Type"))
sepdat %>%
  tbl_df()
# paste Source and Type back together into a single "variable" column
sepdat %>%
  unite("variable", Source, Type, sep = "_")