Data Frame Introduction

See the Work Along - Data frames for in-class exercises.

Navigate to the working directory.

setwd("~/Documents/Computing with Data/5_Data_frames/")

Import data on heights and weights.

ht_wt_df <- read.csv(file="../Data/01_heights_weights_genders.txt")

dim(ht_wt_df)

## [1] 10000     3

nrow(ht_wt_df)

## [1] 10000

Inspecting the data:

head(ht_wt_df)

##   Gender Height Weight
## 1   Male  73.85  241.9
## 2   Male  68.78  162.3
## 3   Male  74.11  212.7
## 4   Male  71.73  220.0
## 5   Male  69.88  206.3
## 6   Male  67.25  152.2

head prints the first 6 rows, but you can adjust the number.

head(ht_wt_df, n=10)

##    Gender Height Weight
## 1    Male  73.85  241.9
## 2    Male  68.78  162.3
## 3    Male  74.11  212.7
## 4    Male  71.73  220.0
## 5    Male  69.88  206.3
## 6    Male  67.25  152.2
## 7    Male  68.79  183.9
## 8    Male  68.35  168.0
## 9    Male  67.02  175.9
## 10   Male  63.46  156.4

tail(ht_wt_df)

##       Gender Height Weight
## 9995  Female  59.10  110.5
## 9996  Female  66.17  136.8
## 9997  Female  67.07  170.9
## 9998  Female  63.87  128.5
## 9999  Female  69.03  163.9
## 10000 Female  61.94  113.6

str gives similar info, but includes the type of the variables.

str(ht_wt_df)

## 'data.frame':    10000 obs. of  3 variables:
##  $ Gender: Factor w/ 2 levels "Female","Male": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Height: num  73.8 68.8 74.1 71.7 69.9 ...
##  $ Weight: num  242 162 213 220 206 ...

Data frames can contain columns of any mode. Each column should be an atomic vector, though.

Names and rownames

The “names” attribute refers to the column names. “Names” is an appropriate term because a data.frame is a special kind of list. The list components are the columns of the data frame, so the “names” attribute of the list is the “names” attribute of the data frame.

names(ht_wt_df)

## [1] "Gender" "Height" "Weight"

Change names as with a vector.

ht_2 <- ht_wt_df
names(ht_2)[3] <- "Wt"
head(ht_2)

##   Gender Height    Wt
## 1   Male  73.85 241.9
## 2   Male  68.78 162.3
## 3   Male  74.11 212.7
## 4   Male  71.73 220.0
## 5   Male  69.88 206.3
## 6   Male  67.25 152.2

rownames(ht_wt_df)[1:3]

## [1] "1" "2" "3"

rownames(ht_2) <- paste("S", 1:10000, sep="")
head(ht_2)

##    Gender Height    Wt
## S1   Male  73.85 241.9
## S2   Male  68.78 162.3
## S3   Male  74.11 212.7
## S4   Male  71.73 220.0
## S5   Male  69.88 206.3
## S6   Male  67.25 152.2

Identifying individual entries

head(ht_wt_df)

##   Gender Height Weight
## 1   Male  73.85  241.9
## 2   Male  68.78  162.3
## 3   Male  74.11  212.7
## 4   Male  71.73  220.0
## 5   Male  69.88  206.3
## 6   Male  67.25  152.2

ht_wt_df[3, 2]

## [1] 74.11

ht_wt_df["3", "Height"]

## [1] 74.11

ht_wt_df[3:5, 2]

## [1] 74.11 71.73 69.88

class(ht_wt_df[3:5, 2])

## [1] "numeric"

ht_wt_df[3:5, 2:3]

##   Height Weight
## 3  74.11  212.7
## 4  71.73  220.0
## 5  69.88  206.3

class(ht_wt_df[3:5, 2:3])

## [1] "data.frame"

ht_wt_df[5:10, ]

##    Gender Height Weight
## 5    Male  69.88  206.3
## 6    Male  67.25  152.2
## 7    Male  68.79  183.9
## 8    Male  68.35  168.0
## 9    Male  67.02  175.9
## 10   Male  63.46  156.4

Subsetting with a logical vector

log1 <- rep(FALSE, times = 10000)
log1[c(5,6, 8, 10)] <- TRUE

ht_wt_df[log1, ]

##    Gender Height Weight
## 5    Male  69.88  206.3
## 6    Male  67.25  152.2
## 8    Male  68.35  168.0
## 10   Male  63.46  156.4

ht_wt_df[log1, "Weight"]

## [1] 206.3 152.2 168.0 156.4

A comparison creates a logical vector.

female_logical <- ht_wt_df$Gender == "Female"
length(female_logical)

## [1] 10000

table(female_logical)

## female_logical
## FALSE  TRUE 
##  5000  5000

female_ht_wt <- ht_wt_df[female_logical, ]
head(female_ht_wt)

##      Gender Height Weight
## 5001 Female  58.91  102.1
## 5002 Female  65.23  141.3
## 5003 Female  63.37  131.0
## 5004 Female  64.48  128.2
## 5005 Female  61.79  129.8
## 5006 Female  65.97  156.8

Select all rows with a weight over 200 pounds.

large_df <- ht_wt_df[ ht_wt_df$Weight > 200 , ]
head(large_df)

##    Gender Height Weight
## 1    Male  73.85  241.9
## 3    Male  74.11  212.7
## 4    Male  71.73  220.0
## 5    Male  69.88  206.3
## 12   Male  71.64  213.7
## 24   Male  75.21  228.8

Notation change with subset

small_df2 <- subset(ht_wt_df, Weight < 110)
head(small_df2)

##      Gender Height Weight
## 5001 Female  58.91 102.09
## 5010 Female  63.68 104.15
## 5012 Female  61.80 106.23
## 5014 Female  58.90 101.68
## 5015 Female  58.44  98.19
## 5019 Female  61.74 107.87

small_tall_df <- subset(ht_wt_df, Weight < 110 & Height > 62)

Adding columns

states <- c(rep("IN", times = 4000), rep("MI", times=4000), rep("IL", times=2000))

st_ht_wt_df <- data.frame(ht_wt_df, State = states)
head(st_ht_wt_df)

##   Gender Height Weight State
## 1   Male  73.85  241.9    IN
## 2   Male  68.78  162.3    IN
## 3   Male  74.11  212.7    IN
## 4   Male  71.73  220.0    IN
## 5   Male  69.88  206.3    IN
## 6   Male  67.25  152.2    IN

ht_wt_df_tmp <- ht_wt_df
ht_wt_df_tmp$State <- states
head(ht_wt_df_tmp)

##   Gender Height Weight State
## 1   Male  73.85  241.9    IN
## 2   Male  68.78  162.3    IN
## 3   Male  74.11  212.7    IN
## 4   Male  71.73  220.0    IN
## 5   Male  69.88  206.3    IN
## 6   Male  67.25  152.2    IN

Transform

st_ht_wt_df2 <- transform(ht_wt_df, State = states)
head(st_ht_wt_df2)

##   Gender Height Weight State
## 1   Male  73.85  241.9    IN
## 2   Male  68.78  162.3    IN
## 3   Male  74.11  212.7    IN
## 4   Male  71.73  220.0    IN
## 5   Male  69.88  206.3    IN
## 6   Male  67.25  152.2    IN

Create columns for height and weight in metric units.

met_ht_wt_df <- transform(ht_wt_df_tmp, Ht_metric = 2.54 * Height, Wt_metric = Weight/2.2)
head(met_ht_wt_df)

##   Gender Height Weight State Ht_metric Wt_metric
## 1   Male  73.85  241.9    IN     187.6    109.95
## 2   Male  68.78  162.3    IN     174.7     73.78
## 3   Male  74.11  212.7    IN     188.2     96.70
## 4   Male  71.73  220.0    IN     182.2    100.02
## 5   Male  69.88  206.3    IN     177.5     93.80
## 6   Male  67.25  152.2    IN     170.8     69.19

Here, the formulas were applied to the Height and Weight values in each row to create new vectors of values. These vectors are added as new columns.