Tiezheng Yuan Ph.D.: R: Data Frame

Abstract: Basic manipulations of data frames in R

Create a data frame

#Pattern 1:

> df<-data.frame('a'=c(1,2,3), 'b'=c(4,5,6))

> df

a b

1 1 4

2 2 5

3 3 6

#Pattern 2:

> a<-c('aa','bb')

> b<-c('hon', 'con')

> c<-c(3,6)

> df<-data.frame(a, b, c, stringsAsFactors=F)

> df

a b c

1 aa hon 3

2 bb con 6

Index a data frame

> df #data frame

a b c

1 aa hon 3

2 bb con 6

> df[1,] # the row No.1

a b c

1 aa hon 3

> df[1:2,] # the rows No.1-2

a b c

1 aa hon 3

2 bb con 6

> df[,2] #the column No.2

[1] "hon" "con"

> df[,2:3] #the columns No.2-3

b c

1 hon 3

2 con 6

> df[2,3] #the element intersected by the row No.2 and the column No.3.

[1] 6

> df$a #index the column by the column name

[1] "aa" "bb"

> df[,'a'] # the same as the above

[1] "aa" "bb"

#get and set the column names and row names

> colnames(df) #names of column

[1] "a" "b" "c"

> colnames(df)<-c('aa','bb','cc')

> colnames(df)

[1] "aa" "bb" "cc"

> colnames(df)[1]<-'aaa' #rename the column No.1 only

> colnames(df)

[1] "aaa" "bb" "cc"

> rownames(df)

[1] "1" "2"

> rownames(df)<-c('1a','2a')

> rownames(df)

[1] "1a" "2a"

> rownames(df)[2]<-'a2'

> rownames(df)

[1] "1a" "a2"

Transpose a data frame

> df

aaa bb cc

1a aa hon 3

a2 bb con 6

> t(df) #rows to columns and columns to rows

1a a2

aaa "aa" "bb"

bb "hon" "con"

cc "3" "6"

Statistics of a data frame

> head(state.x77)

Population Income Illiteracy Life Exp Murder HS Grad Frost Area

Alabama 3615 3624 2.1 69.05 15.1 41.3 20 50708

Alaska 365 6315 1.5 69.31 11.3 66.7 152 566432

Arizona 2212 4530 1.8 70.55 7.8 58.1 15 113417

Arkansas 2110 3378 1.9 70.66 10.1 39.9 65 51945

California 21198 5114 1.1 71.71 10.3 62.6 20 156361

Colorado 2541 4884 0.7 72.06 6.8 63.9 166 103766

> dim(state.x77) #number of rows and columns

[1] 50 8

> nrow(state.x77) #number of rows

[1] 50

> ncol(state.x77) # number of columns

[1] 8

#means by columns

> apply(state.x77, 2, mean)

Population Income Illiteracy Life Exp Murder HS Grad Frost Area

4246.4200 4435.8000 1.1700 70.8786 7.3780 53.1080 104.4600 70735.8800

> rowSums(state.x77)

Alabama Alaska Arizona Arkansas California Colorado

58094.55 573412.81 120312.25 57620.56 182838.71 111500.46

Connecticut Delaware Florida Georgia Hawaii Idaho

13581.68 7604.76 67328.26 67280.04 12399.60 87872.27

> rowMeans(state.x77)

Alabama Alaska Arizona Arkansas California Colorado

7261.819 71676.601 15039.031 7202.570 22854.839 13937.558

Connecticut Delaware Florida Georgia Hawaii Idaho

1697.710 950.595 8416.032 8410.005 1549.950 10984.034

#colSums(state.x77)

#colMeans(state.x77)

Filter a data frame

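The function subset() can be used to filter a data frame. Note that state.x77 is a matrix, not a data frame, so the first call below fails with an error; converting it with data.frame() first, as in the second call, filters the rows by Population and Income as intended.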

> subset(state.x77, Population>=4000 & Income >4000) # state.x77 should be data frame

Error in subset.matrix(state.x77, Population >= 4000 & Income > 4000) :

object 'Population' not found

> subset(data.frame(state.x77), Population>=4000 & Income >4000)

Population Income Illiteracy Life.Exp Murder HS.Grad Frost Area

California 21198 5114 1.1 71.71 10.3 62.6 20 156361

Florida 8277 4815 1.3 70.66 10.7 52.6 11 54090

Georgia 4931 4091 2.0 68.54 13.9 40.6 60 58073

Illinois 11197 5107 0.9 70.14 10.3 52.6 127 55748

Indiana 5313 4458 0.7 70.88 7.1 52.9 122 36097

Maryland 4122 5299 0.9 70.22 8.5 52.3 101 9891

Massachusetts 5814 4755 1.1 71.83 3.3 58.5 103 7826

Michigan 9111 4751 0.9 70.63 11.1 52.8 125 56817

Missouri 4767 4254 0.8 70.69 9.3 48.8 108 68995

New Jersey 7333 5237 1.1 70.93 5.2 52.5 115 7521

New York 18076 4903 1.4 70.55 10.9 52.7 82 47831

Ohio 10735 4561 0.8 70.82 7.4 53.2 124 40975

Pennsylvania 11860 4449 1.0 70.43 6.1 50.2 126 44966

Texas 12237 4188 2.2 70.90 12.2 47.4 35 262134

Virginia 4981 4701 1.4 70.08 9.5 47.8 85 39780

Wisconsin 4589 4468 0.7 72.48 3.0 54.5 149 54464

There is another, more explicit way that works directly on the matrix by logical indexing, but it is harder to read than the subset() call:

> state.x77[(state.x77[,'Population']>=4000&state.x77[,'Income']>4000),]

Population Income Illiteracy Life Exp Murder HS Grad Frost Area

California 21198 5114 1.1 71.71 10.3 62.6 20 156361

Florida 8277 4815 1.3 70.66 10.7 52.6 11 54090

Georgia 4931 4091 2.0 68.54 13.9 40.6 60 58073

Illinois 11197 5107 0.9 70.14 10.3 52.6 127 55748

Indiana 5313 4458 0.7 70.88 7.1 52.9 122 36097

Maryland 4122 5299 0.9 70.22 8.5 52.3 101 9891

Massachusetts 5814 4755 1.1 71.83 3.3 58.5 103 7826

Michigan 9111 4751 0.9 70.63 11.1 52.8 125 56817

Missouri 4767 4254 0.8 70.69 9.3 48.8 108 68995

New Jersey 7333 5237 1.1 70.93 5.2 52.5 115 7521

New York 18076 4903 1.4 70.55 10.9 52.7 82 47831

Ohio 10735 4561 0.8 70.82 7.4 53.2 124 40975

Pennsylvania 11860 4449 1.0 70.43 6.1 50.2 126 44966

Texas 12237 4188 2.2 70.90 12.2 47.4 35 262134

Virginia 4981 4701 1.4 70.08 9.5 47.8 85 39780

Wisconsin 4589 4468 0.7 72.48 3.0 54.5 149 54464

Expanding a data frame and removing rows or columns

> df

a b c

1 aa hon 3

2 bb con 6

> df<-cbind(df,df) # combine two data frames by columns

> df

a b c a b c

1 aa hon 3 aa hon 3

2 bb con 6 bb con 6

> df<-rbind(df,df) #combine two data frames by rows

> df

a b c a b c

1 aa hon 3 aa hon 3

2 bb con 6 bb con 6

3 aa hon 3 aa hon 3

4 bb con 6 bb con 6

> df<-df[-1,] # remove the row No.1

> df

a b c a b c

2 bb con 6 bb con 6

3 aa hon 3 aa hon 3

4 bb con 6 bb con 6

> df<-df[,-(3:4)] #remove the columns No.3-4

> df

a b b.1 c

2 bb con con 6

3 aa hon hon 3

4 bb con con 6

Sort a data frame

This example shows how to sort the rows of state.x77 by Population in decreasing order.

> order.state.x77<-state.x77[order(state.x77[,'Population'],decreasing=T), ]

> head(order.state.x77)

Population Income Illiteracy Life Exp Murder HS Grad Frost Area

California 21198 5114 1.1 71.71 10.3 62.6 20 156361

New York 18076 4903 1.4 70.55 10.9 52.7 82 47831

Texas 12237 4188 2.2 70.90 12.2 47.4 35 262134

Pennsylvania 11860 4449 1.0 70.43 6.1 50.2 126 44966

Illinois 11197 5107 0.9 70.14 10.3 52.6 127 55748

Ohio 10735 4561 0.8 70.82 7.4 53.2 124 40975

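The second example shows how to sort the rows of state.x77 by Illiteracy in decreasing order first and then, for ties, by Income in increasing order. Because decreasing=T applies to every sort key, Income is negated so that it ends up in increasing order.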

> order.state.x77<-state.x77[order(state.x77[,'Illiteracy'],-state.x77[,'Income'], decreasing=T), ]

> head(order.state.x77)

Population Income Illiteracy Life Exp Murder HS Grad Frost Area

Louisiana 3806 3545 2.8 68.76 13.2 42.2 12 44930

Mississippi 2341 3098 2.4 68.09 12.5 41.0 50 47296

South Carolina 2816 3635 2.3 67.96 11.6 37.8 65 30225

New Mexico 1144 3601 2.2 70.32 9.7 55.2 120 121412

Texas 12237 4188 2.2 70.90 12.2 47.4 35 262134

Alabama 3615 3624 2.1 69.05 15.1 41.3 20 50708

Besides indexing the columns explicitly inside order(), the function with() can be used to refer to the columns by name, which gives the same result.

> order.index<-with(data.frame(state.x77), order(Illiteracy, -Income, decreasing=T))

> head(state.x77[order.index,])

Population Income Illiteracy Life Exp Murder HS Grad Frost Area

Louisiana 3806 3545 2.8 68.76 13.2 42.2 12 44930

Mississippi 2341 3098 2.4 68.09 12.5 41.0 50 47296

South Carolina 2816 3635 2.3 67.96 11.6 37.8 65 30225

New Mexico 1144 3601 2.2 70.32 9.7 55.2 120 121412

Texas 12237 4188 2.2 70.90 12.2 47.4 35 262134

Alabama 3615 3624 2.1 69.05 15.1 41.3 20 50708

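Besides order() and with(), the function arrange() from the R package “plyr” can also be used. It only accepts a data frame, so state.x77 has to be converted first, and the row names are not kept in the result.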

> library(plyr)

> order.state.x77<-arrange(state.x77, desc(Illiteracy), Income)

Error: is.data.frame(df) is not TRUE

> order.state.x77<-arrange(data.frame(state.x77), desc(Illiteracy), Income)

> head(order.state.x77)

Population Income Illiteracy Life.Exp Murder HS.Grad Frost Area

1 3806 3545 2.8 68.76 13.2 42.2 12 44930

2 2341 3098 2.4 68.09 12.5 41.0 50 47296

3 2816 3635 2.3 67.96 11.6 37.8 65 30225

4 1144 3601 2.2 70.32 9.7 55.2 120 121412

5 12237 4188 2.2 70.90 12.2 47.4 35 262134

6 3615 3624 2.1 69.05 15.1 41.3 20 50708

Writing date: 2014.08.20, 2015.02.09

Tiezheng Yuan Ph.D.

Monday, February 9, 2015

R: Data Frame

No comments:

Post a Comment