Abstract:
Basic manipulations of data frames in R
Create
a data frame
#Pattern 1:
>
df<-data.frame('a'=c(1,2,3), 'b'=c(4,5,6))
> df
a b
1 1 4
2 2 5
3 3 6
#Pattern 2:
> a<-c('aa','bb')
> b<-c('hon', 'con')
> c<-c(3,6)
> df<-data.frame(a, b,
c, stringsAsFactors=F)
> df
a b c
1 aa hon 3
2 bb con 6
Index
a data frame
> df
#data frame
a b c
1 aa hon 3
2 bb con 6
> df[1,]
# the row No.1
a b c
1 aa hon 3
> df[1:2,]
# the rows No.1-2
a b c
1 aa hon 3
2 bb con 6
> df[,2]
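#the column No.2 (printed here as a factor, which is what you get when the data frame is created without stringsAsFactors=F; with stringsAsFactors=F the column is a plain character vector and no "Levels:" line is shown)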
[1] hon con
Levels: con hon
> df[,2:3]
#the columns No.2-3
b c
1 hon 3
2 con 6
> df[2,3]
#the element intersected by the row No.2 and the column No.3.
[1] 6
> df$a
#index the column by the column name
[1] aa bb
Levels: aa bb
> df[,'a']
# the same as the above
[1] aa bb
Levels: aa bb
#index by names
> colnames(df)
#names of column
[1] "a" "b"
"c"
>
colnames(df)<-c('aa','bb','cc')
> colnames(df)
[1] "aa" "bb"
"cc"
> colnames(df)[1]<-'aaa'
#rename the column No.1 only
> colnames(df)
[1] "aaa" "bb"
"cc"
> rownames(df)
[1] "1" "2"
>
rownames(df)<-c('1a','2a')
> rownames(df)
[1] "1a" "2a"
> rownames(df)[2]<-'a2'
> rownames(df)
[1] "1a" "a2"
Transpose
a data frame
> df
aaa bb cc
1a aa hon 3
a2 bb con 6
>
>
> t(df)
#rows to columns and columns to rows
1a a2
aaa "aa" "bb"
bb "hon" "con"
cc "3" "6"
Statistics
of a data frame
> head(state.x77)
Population Income
Illiteracy Life Exp Murder HS Grad Frost Area
Alabama 3615 3624
2.1 69.05 15.1 41.3 20 50708
Alaska 365 6315
1.5 69.31 11.3 66.7 152 566432
Arizona 2212 4530
1.8 70.55 7.8 58.1 15 113417
Arkansas 2110 3378
1.9 70.66 10.1 39.9 65 51945
California 21198 5114
1.1 71.71 10.3 62.6 20 156361
Colorado 2541 4884
0.7 72.06 6.8 63.9 166 103766
> dim(state.x77)
#number of rows and columns
[1] 50 8
> nrow(state.x77)
#number of rows
[1] 50
> ncol(state.x77)
# number of columns
[1] 8
#means by columns
> apply(state.x77, 2, mean)
Population Income
Illiteracy Life Exp Murder HS Grad Frost Area
4246.4200 4435.8000
1.1700 70.8786 7.3780 53.1080 104.4600 70735.8800
> rowSums(state.x77)
Alabama Alaska
Arizona Arkansas California Colorado
58094.55 573412.81
120312.25 57620.56 182838.71 111500.46
Connecticut Delaware
Florida Georgia Hawaii Idaho
13581.68 7604.76
67328.26 67280.04 12399.60 87872.27
> rowMeans(state.x77)
Alabama Alaska
Arizona Arkansas California Colorado
7261.819 71676.601
15039.031 7202.570 22854.839 13937.558
Connecticut Delaware
Florida Georgia Hawaii Idaho
1697.710 950.595
8416.032 8410.005 1549.950 10984.034
#colSums(state.x77)
#colMeans(state.x77)
Filter
a data frame
The
function subset() can be used to filter a data frame. Note that
state.x77 is a matrix, so it must first be converted with data.frame().
The two working examples below, which filter state.x77 by Population and
Income, are equivalent.
> subset(state.x77,
Population>=4000 & Income >4000) # fails: state.x77 is a matrix, so subset() cannot find its columns by name; it must be a data frame
Error in
subset.matrix(state.x77, Population >= 4000 & Income >
4000) :
object 'Population' not
found
>
subset(data.frame(state.x77), Population>=4000 & Income >4000)
Population
Income Illiteracy Life.Exp Murder HS.Grad Frost Area
California 21198
5114 1.1 71.71 10.3 62.6 20 156361
Florida 8277
4815 1.3 70.66 10.7 52.6 11 54090
Georgia 4931
4091 2.0 68.54 13.9 40.6 60 58073
Illinois 11197
5107 0.9 70.14 10.3 52.6 127 55748
Indiana 5313
4458 0.7 70.88 7.1 52.9 122 36097
Maryland 4122
5299 0.9 70.22 8.5 52.3 101 9891
Massachusetts 5814
4755 1.1 71.83 3.3 58.5 103 7826
Michigan 9111
4751 0.9 70.63 11.1 52.8 125 56817
Missouri 4767
4254 0.8 70.69 9.3 48.8 108 68995
New Jersey 7333
5237 1.1 70.93 5.2 52.5 115 7521
New York 18076
4903 1.4 70.55 10.9 52.7 82 47831
Ohio 10735
4561 0.8 70.82 7.4 53.2 124 40975
Pennsylvania 11860
4449 1.0 70.43 6.1 50.2 126 44966
Texas 12237
4188 2.2 70.90 12.2 47.4 35 262134
Virginia 4981
4701 1.4 70.08 9.5 47.8 85 39780
Wisconsin 4589
4468 0.7 72.48 3.0 54.5 149 54464
There
is another, more explicit way that works directly on the matrix with
logical indexing, but it may be harder to read than the subset() version:
>
state.x77[(state.x77[,'Population']>=4000&state.x77[,'Income']>4000),]
Population
Income Illiteracy Life Exp Murder HS Grad Frost Area
California 21198
5114 1.1 71.71 10.3 62.6 20 156361
Florida 8277
4815 1.3 70.66 10.7 52.6 11 54090
Georgia 4931
4091 2.0 68.54 13.9 40.6 60 58073
Illinois 11197
5107 0.9 70.14 10.3 52.6 127 55748
Indiana 5313
4458 0.7 70.88 7.1 52.9 122 36097
Maryland 4122
5299 0.9 70.22 8.5 52.3 101 9891
Massachusetts 5814
4755 1.1 71.83 3.3 58.5 103 7826
Michigan 9111
4751 0.9 70.63 11.1 52.8 125 56817
Missouri 4767
4254 0.8 70.69 9.3 48.8 108 68995
New Jersey 7333
5237 1.1 70.93 5.2 52.5 115 7521
New York 18076
4903 1.4 70.55 10.9 52.7 82 47831
Ohio 10735
4561 0.8 70.82 7.4 53.2 124 40975
Pennsylvania 11860
4449 1.0 70.43 6.1 50.2 126 44966
Texas 12237
4188 2.2 70.90 12.2 47.4 35 262134
Virginia 4981
4701 1.4 70.08 9.5 47.8 85 39780
Wisconsin 4589
4468 0.7 72.48 3.0 54.5 149 54464
Removal
and expansion of a data frame
> df
a b c
1 aa hon 3
2 bb con 6
> df<-cbind(df,df)
# combine two data frames by columns
> df
a b c a b c
1 aa hon 3 aa hon 3
2 bb con 6 bb con 6
> df<-rbind(df,df)
#combine two data frames by rows
> df
a b c a b c
1 aa hon 3 aa hon 3
2 bb con 6 bb con 6
3 aa hon 3 aa hon 3
4 bb con 6 bb con 6
> df<-df[-1,]
# remove the row No.1
> df
a b c a b c
2 bb con 6 bb con 6
3 aa hon 3 aa hon 3
4 bb con 6 bb con 6
> df<-df[,-(3:4)]
#remove the columns No.3-4
> df
a b b.1 c
2 bb con con 6
3 aa hon hon 3
4 bb con con 6
Sorting
a data frame
This
example shows how to sort the rows of state.x77 by
Population in decreasing order.
>
order.state.x77<-state.x77[order(state.x77[,'Population'],decreasing=T),
]
> head(order.state.x77)
Population
Income Illiteracy Life Exp Murder HS Grad Frost Area
California 21198
5114 1.1 71.71 10.3 62.6 20 156361
New York 18076
4903 1.4 70.55 10.9 52.7 82 47831
Texas 12237
4188 2.2 70.90 12.2 47.4 35 262134
Pennsylvania 11860
4449 1.0 70.43 6.1 50.2 126 44966
Illinois 11197
5107 0.9 70.14 10.3 52.6 127 55748
Ohio 10735
4561 0.8 70.82 7.4 53.2 124 40975
The
second example shows how to sort the rows of state.x77 first by
Illiteracy in decreasing order and then, for ties, by Income in increasing order.
>
order.state.x77<-state.x77[order(state.x77[,'Illiteracy'],-state.x77[,'Income'],
decreasing=T), ]
> head(order.state.x77)
Population
Income Illiteracy Life Exp Murder HS Grad Frost Area
Louisiana 3806
3545 2.8 68.76 13.2 42.2 12 44930
Mississippi 2341
3098 2.4 68.09 12.5 41.0 50 47296
South Carolina 2816
3635 2.3 67.96 11.6 37.8 65 30225
New Mexico 1144
3601 2.2 70.32 9.7 55.2 120 121412
Texas 12237
4188 2.2 70.90 12.2 47.4 35 262134
Alabama 3615
3624 2.1 69.05 15.1 41.3 20 50708
Besides
indexing the columns directly inside order(), the same ordering can be obtained more readably by wrapping the call in with():
>
order.index<-with(data.frame(state.x77), order(Illiteracy,
-Income, decreasing=T))
>
head(state.x77[order.index,])
Population
Income Illiteracy Life Exp Murder HS Grad Frost Area
Louisiana 3806
3545 2.8 68.76 13.2 42.2 12 44930
Mississippi 2341
3098 2.4 68.09 12.5 41.0 50 47296
South Carolina 2816
3635 2.3 67.96 11.6 37.8 65 30225
New Mexico 1144
3601 2.2 70.32 9.7 55.2 120 121412
Texas 12237
4188 2.2 70.90 12.2 47.4 35 262134
Alabama 3615
3624 2.1 69.05 15.1 41.3 20 50708
Besides
order() and with(), the function arrange() from the R
package “plyr” can also be used. It requires a data frame, so state.x77 must be converted first, and it drops the row names.
> library(plyr)
>
order.state.x77<-arrange(state.x77, desc(Illiteracy), Income)
Error: is.data.frame(df) is
not TRUE
>
order.state.x77<-arrange(data.frame(state.x77), desc(Illiteracy),
Income)
> head(order.state.x77)
Population Income Illiteracy
Life.Exp Murder HS.Grad Frost Area
1 3806 3545 2.8
68.76 13.2 42.2 12 44930
2 2341 3098 2.4
68.09 12.5 41.0 50 47296
3 2816 3635 2.3
67.96 11.6 37.8 65 30225
4 1144 3601 2.2
70.32 9.7 55.2 120 121412
5 12237 4188 2.2
70.90 12.2 47.4 35 262134
6 3615 3624 2.1
69.05 15.1 41.3 20 50708
Writing
date: 2014.08.20, 2015.02.09
No comments:
Post a Comment