See the Work Along - Data frames for in-class exercises.
Navigate to the working directory.
# Set the working directory so the relative data path used by read.csv()
# below resolves. NOTE(review): this path is machine-specific; adjust it to
# wherever the course materials live on your system.
setwd("~/Documents/Computing with Data/5_Data_frames/")
Import data on heights and weights.
# Read the heights/weights file into a data frame.
# stringsAsFactors = TRUE keeps Gender a factor on R >= 4.0.0, where the
# default changed to FALSE; the str() output shown later relies on this.
# On older R versions this matches the previous default, so nothing changes.
ht_wt_df <- read.csv(
  file = "../Data/01_heights_weights_genders.txt",
  stringsAsFactors = TRUE
)
# Dimensions as c(number of rows, number of columns).
dim(ht_wt_df)
## [1] 10000 3
# Row count only.
nrow(ht_wt_df)
## [1] 10000
Inspecting the data:
# Show the first rows of the data frame (6 rows when no count is given).
head(ht_wt_df)
## Gender Height Weight
## 1 Male 73.85 241.9
## 2 Male 68.78 162.3
## 3 Male 74.11 212.7
## 4 Male 71.73 220.0
## 5 Male 69.88 206.3
## 6 Male 67.25 152.2
head prints the first 6 rows by default, but you can adjust the number with the n argument.
# Request the first 10 rows instead of the default 6.
head(ht_wt_df, n=10)
## Gender Height Weight
## 1 Male 73.85 241.9
## 2 Male 68.78 162.3
## 3 Male 74.11 212.7
## 4 Male 71.73 220.0
## 5 Male 69.88 206.3
## 6 Male 67.25 152.2
## 7 Male 68.79 183.9
## 8 Male 68.35 168.0
## 9 Male 67.02 175.9
## 10 Male 63.46 156.4
# tail() shows the last rows; the row labels are the original row numbers.
tail(ht_wt_df)
## Gender Height Weight
## 9995 Female 59.10 110.5
## 9996 Female 66.17 136.8
## 9997 Female 67.07 170.9
## 9998 Female 63.87 128.5
## 9999 Female 69.03 163.9
## 10000 Female 61.94 113.6
str gives a similar overview of the data frame, but also shows the type of each variable.
# Compact structure summary: number of observations and variables, then
# each column's type and its first few values.
str(ht_wt_df)
## 'data.frame': 10000 obs. of 3 variables:
## $ Gender: Factor w/ 2 levels "Female","Male": 2 2 2 2 2 2 2 2 2 2 ...
## $ Height: num 73.8 68.8 74.1 71.7 69.9 ...
## $ Weight: num 242 162 213 220 206 ...
Data frames can contain columns of any mode, and different columns may have different modes. Each column should be an atomic vector, though.
The “names” attribute refers to the column names. “Names” is an appropriate term because a data.frame is a special kind of list. The list components are the columns of the data frame, so the “names” attribute of the list is the “names” attribute of the data frame.
# Column names of the data frame.
names(ht_wt_df)
## [1] "Gender" "Height" "Weight"
Change names the same way you would for a named vector: index into names() and assign the new value.
# Work on a copy so the original ht_wt_df keeps its column names.
ht_2 <- ht_wt_df
# Replace only the third name ("Weight") with "Wt" by indexing into names().
names(ht_2)[3] <- "Wt"
head(ht_2)
## Gender Height Wt
## 1 Male 73.85 241.9
## 2 Male 68.78 162.3
## 3 Male 74.11 212.7
## 4 Male 71.73 220.0
## 5 Male 69.88 206.3
## 6 Male 67.25 152.2
# Default row names are the row numbers, stored as character strings.
rownames(ht_wt_df)[1:3]
## [1] "1" "2" "3"
# Give every row of the copy a label "S1", "S2", ...; sizing the sequence
# from nrow(ht_2) keeps the number of labels in step with the data instead
# of relying on a hard-coded 10000.
rownames(ht_2) <- paste0("S", seq_len(nrow(ht_2)))
head(ht_2)
## Gender Height Wt
## S1 Male 73.85 241.9
## S2 Male 68.78 162.3
## S3 Male 74.11 212.7
## S4 Male 71.73 220.0
## S5 Male 69.88 206.3
## S6 Male 67.25 152.2
# The original data frame is unaffected by the changes made to the copy.
head(ht_wt_df)
## Gender Height Weight
## 1 Male 73.85 241.9
## 2 Male 68.78 162.3
## 3 Male 74.11 212.7
## 4 Male 71.73 220.0
## 5 Male 69.88 206.3
## 6 Male 67.25 152.2
# Single cell by position: row 3, column 2.
ht_wt_df[3, 2]
## [1] 74.11
# The same cell addressed by row name and column name.
ht_wt_df["3", "Height"]
## [1] 74.11
# Several rows of a single column come back as a plain vector ...
ht_wt_df[3:5, 2]
## [1] 74.11 71.73 69.88
class(ht_wt_df[3:5, 2])
## [1] "numeric"
# ... while selecting several columns keeps a data frame.
ht_wt_df[3:5, 2:3]
## Height Weight
## 3 74.11 212.7
## 4 71.73 220.0
## 5 69.88 206.3
class(ht_wt_df[3:5, 2:3])
## [1] "data.frame"
# An empty column index selects all columns for the given rows.
ht_wt_df[5:10, ]
## Gender Height Weight
## 5 Male 69.88 206.3
## 6 Male 67.25 152.2
## 7 Male 68.79 183.9
## 8 Male 68.35 168.0
## 9 Male 67.02 175.9
## 10 Male 63.46 156.4
# Build a logical row selector with one entry per row, all FALSE to start.
# Sizing it from nrow(ht_wt_df) rather than a literal 10000 avoids silent
# recycling or NA rows if the data has a different number of rows.
log1 <- logical(nrow(ht_wt_df))
log1[c(5, 6, 8, 10)] <- TRUE
# Rows where the selector is TRUE are kept.
ht_wt_df[log1, ]
## Gender Height Weight
## 5 Male 69.88 206.3
## 6 Male 67.25 152.2
## 8 Male 68.35 168.0
## 10 Male 63.46 156.4
# Logical row selection combined with a single column name returns a vector.
ht_wt_df[log1, "Weight"]
## [1] 206.3 152.2 168.0 156.4
Comparisons create a logical vector, which can then be used to select rows.
# Element-wise comparison: TRUE for each row whose Gender is "Female".
female_logical <- ht_wt_df$Gender == "Female"
length(female_logical)
## [1] 10000
# Count how many entries are TRUE and how many are FALSE.
table(female_logical)
## female_logical
## FALSE TRUE
## 5000 5000
# Keep only the rows flagged TRUE; the original row labels are preserved.
female_ht_wt <- ht_wt_df[female_logical, ]
head(female_ht_wt)
## Gender Height Weight
## 5001 Female 58.91 102.1
## 5002 Female 65.23 141.3
## 5003 Female 63.37 131.0
## 5004 Female 64.48 128.2
## 5005 Female 61.79 129.8
## 5006 Female 65.97 156.8
Select all over 200 pounds.
# The comparison can be written directly inside the row index.
large_df <- ht_wt_df[ ht_wt_df$Weight > 200 , ]
head(large_df)
## Gender Height Weight
## 1 Male 73.85 241.9
## 3 Male 74.11 212.7
## 4 Male 71.73 220.0
## 5 Male 69.88 206.3
## 12 Male 71.64 213.7
## 24 Male 75.21 228.8
# subset() evaluates the condition inside the data frame, so columns can be
# named without the ht_wt_df$ prefix.
small_df2 <- subset(ht_wt_df, Weight < 110)
head(small_df2)
## Gender Height Weight
## 5001 Female 58.91 102.09
## 5010 Female 63.68 104.15
## 5012 Female 61.80 106.23
## 5014 Female 58.90 101.68
## 5015 Female 58.44 98.19
## 5019 Female 61.74 107.87
# Conditions can be combined with the element-wise & operator.
small_tall_df <- subset(ht_wt_df, Weight < 110 & Height > 62)
# State label for each row: 4000 "IN", then 4000 "MI", then 2000 "IL".
states <- rep(c("IN", "MI", "IL"), times = c(4000, 4000, 2000))
# Build a new data frame from the existing one plus a State column.
st_ht_wt_df <- data.frame(ht_wt_df, State = states)
head(st_ht_wt_df)
## Gender Height Weight State
## 1 Male 73.85 241.9 IN
## 2 Male 68.78 162.3 IN
## 3 Male 74.11 212.7 IN
## 4 Male 71.73 220.0 IN
## 5 Male 69.88 206.3 IN
## 6 Male 67.25 152.2 IN
# Alternative: assign the new column with $ on a copy of the data frame.
ht_wt_df_tmp <- ht_wt_df
ht_wt_df_tmp$State <- states
head(ht_wt_df_tmp)
## Gender Height Weight State
## 1 Male 73.85 241.9 IN
## 2 Male 68.78 162.3 IN
## 3 Male 74.11 212.7 IN
## 4 Male 71.73 220.0 IN
## 5 Male 69.88 206.3 IN
## 6 Male 67.25 152.2 IN
Transform
# transform() returns a new data frame with the State column added;
# ht_wt_df itself is not modified.
st_ht_wt_df2 <- transform(ht_wt_df, State = states)
head(st_ht_wt_df2)
## Gender Height Weight State
## 1 Male 73.85 241.9 IN
## 2 Male 68.78 162.3 IN
## 3 Male 74.11 212.7 IN
## 4 Male 71.73 220.0 IN
## 5 Male 69.88 206.3 IN
## 6 Male 67.25 152.2 IN
Create columns for height and weight in metric scales.
# Derive two new columns from existing ones in a single transform() call.
# NOTE(review): the factors 2.54 and 1/2.2 presumably convert inches to
# centimetres and pounds to kilograms (2.2 lb/kg is a rounded value) --
# confirm the units of the source data.
met_ht_wt_df <- transform(ht_wt_df_tmp, Ht_metric = 2.54 * Height, Wt_metric = Weight/2.2)
head(met_ht_wt_df)
## Gender Height Weight State Ht_metric Wt_metric
## 1 Male 73.85 241.9 IN 187.6 109.95
## 2 Male 68.78 162.3 IN 174.7 73.78
## 3 Male 74.11 212.7 IN 188.2 96.70
## 4 Male 71.73 220.0 IN 182.2 100.02
## 5 Male 69.88 206.3 IN 177.5 93.80
## 6 Male 67.25 152.2 IN 170.8 69.19
Here, each formula was applied to the Height and Weight values in every row to create a new vector of values. These vectors are added to the data frame as new columns.