You can use RStudio in the cloud!
Sign up with a Google or GitHub account at rstudio.cloud.
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1 ✔ purrr 0.2.4
## ✔ tibble 1.4.2 ✔ dplyr 0.7.4
## ✔ tidyr 0.7.2 ✔ stringr 1.2.0
## ✔ readr 1.1.1 ✔ forcats 0.2.0
## Warning: package 'tibble' was built under R version 3.4.3
## ── Conflicts ─────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## you may need to install tidyverse first:
# install.packages('tidyverse')
# remove every object from the current workspace so the demo starts clean
rm(list=ls())
# NOTE(review): machine-specific absolute path; change it to your own
# project folder (or drop this line) before running on another computer
setwd('/Users/jng2/Dropbox/Work/Library/CDS/R-RStudio/demo1hr')
R and its packages come with many built-in example datasets. Let’s load one of them: `mpg`, which ships with ggplot2.
# copy the mpg example dataset into a plain data.frame called datauto
datauto <- data.frame(mpg)
Get the dimensions of the data frame (number of rows and columns) and store them in new objects.
dim(datauto)
## [1] 234 11
# keep the row count and the column count as separate objects
n.obs <- nrow(datauto)
n.cols <- ncol(datauto)
Get a sense of the data.
# get a glimpse of the data: one line per column with its type and first values
str(datauto)
## 'data.frame': 234 obs. of 11 variables:
## $ manufacturer: chr "audi" "audi" "audi" "audi" ...
## $ model : chr "a4" "a4" "a4" "a4" ...
## $ displ : num 1.8 1.8 2 2 2.8 2.8 3.1 1.8 1.8 2 ...
## $ year : int 1999 1999 2008 2008 1999 1999 2008 1999 1999 2008 ...
## $ cyl : int 4 4 4 4 6 6 6 4 4 4 ...
## $ trans : chr "auto(l5)" "manual(m5)" "manual(m6)" "auto(av)" ...
## $ drv : chr "f" "f" "f" "f" ...
## $ cty : int 18 21 20 21 16 18 18 18 16 20 ...
## $ hwy : int 29 29 31 30 26 26 27 26 25 28 ...
## $ fl : chr "p" "p" "p" "p" ...
## $ class : chr "compact" "compact" "compact" "compact" ...
# return the first few rows (head() shows six by default)
head(datauto)
# get basic summary stats: quantiles and mean for numeric columns,
# length/class/mode for character columns
summary(datauto)
## manufacturer model displ year
## Length:234 Length:234 Min. :1.600 Min. :1999
## Class :character Class :character 1st Qu.:2.400 1st Qu.:1999
## Mode :character Mode :character Median :3.300 Median :2004
## Mean :3.472 Mean :2004
## 3rd Qu.:4.600 3rd Qu.:2008
## Max. :7.000 Max. :2008
## cyl trans drv cty
## Min. :4.000 Length:234 Length:234 Min. : 9.00
## 1st Qu.:4.000 Class :character Class :character 1st Qu.:14.00
## Median :6.000 Mode :character Mode :character Median :17.00
## Mean :5.889 Mean :16.86
## 3rd Qu.:8.000 3rd Qu.:19.00
## Max. :8.000 Max. :35.00
## hwy fl class
## Min. :12.00 Length:234 Length:234
## 1st Qu.:18.00 Class :character Class :character
## Median :24.00 Mode :character Mode :character
## Mean :23.44
## 3rd Qu.:27.00
## Max. :44.00
The variable `class` is categorical, so let’s convert it to a factor. `summary()` is more useful when applied to a factor: it reports the number of observations at each level.
# add a factor version of the class column, then tabulate its levels
datauto[["f.class"]] <- factor(datauto[["class"]])
summary(datauto[["f.class"]])
## 2seater compact midsize minivan pickup subcompact
## 5 47 41 11 33 35
## suv
## 62
Sort data by city mpg
# ascending order of city mpg
datauto %>% arrange(cty)
# descending order of city mpg
datauto %>% arrange(desc(cty))
Sort by manufacturer, model and year, and save the sorted result back to `datauto`.
# overwrite datauto with its sorted copy, then print it
datauto <- datauto %>% arrange(manufacturer, model, year)
datauto
Give me the names of all the manufacturers
unique(datauto[["manufacturer"]])
## [1] "audi" "chevrolet" "dodge" "ford" "honda"
## [6] "hyundai" "jeep" "land rover" "lincoln" "mercury"
## [11] "nissan" "pontiac" "subaru" "toyota" "volkswagen"
# equivalently:
# unique( datauto$manufacturer )
How many models per year did each manufacturer have?
# count the distinct models within each manufacturer-year group
datauto %>%
  group_by(manufacturer, year) %>%
  summarise(n.models = n_distinct(model))
Keep only the rows for manufacturers that had just one model in a given year.
# keep the rows of every manufacturer-year group that contains exactly
# one distinct model
datauto %>%
  group_by(manufacturer, year) %>%
  filter(n_distinct(model) == 1)
Compute mean cty, hwy and displ by manufacturer
# one row per manufacturer with its average city mpg, highway mpg
# and engine displacement
datauto.means <- datauto %>%
  group_by(manufacturer) %>%
  summarise(
    mean.cty = mean(cty),
    mean.hwy = mean(hwy),
    mean.displ = mean(displ)
  )
datauto.means
Use these means to produce a scatter plot of mean city mpg against mean highway mpg, labeled by manufacturer.
# ggrepel supplies geom_text_repel() for the manufacturer labels
library(ggrepel)
plot1 <- ggplot(datauto.means, aes(x = mean.hwy, y = mean.cty)) +
  geom_point() +
  geom_text_repel(aes(label = manufacturer))
plot1
Vary point color by mean displacement.
# add a point layer coloured by mean displacement and keep the updated plot
plot1 <- plot1 +
  geom_point(aes(colour = mean.displ))
plot1
Add line of best fit.
# overlay a linear-model fit; printed only, plot1 itself is not modified
plot1 +
  geom_smooth(method = "lm")
Find the correlation between each model’s average combined (cty and hwy) mpg in 1999 and in 2008.
To do this, we need to ‘spread’ (reshape) the data from long to wide format, so that each year gets its own column.
# preview: add mpg, the average of city and highway mileage (printed, not saved)
datauto %>%
  mutate(mpg = (cty + hwy) / 2)
# then average mpg within each manufacturer-model-year group and keep the result
tempdat <- datauto %>%
  mutate(mpg = (cty + hwy) / 2) %>%
  group_by(manufacturer, model, year) %>%
  summarise(mpg = mean(mpg))
tempdat
# preview: spread mpg into one column per year (1999 and 2008)
# NOTE: spread() is superseded by pivot_wider() in tidyr >= 1.0.0; it is kept
# here because this demo was run with tidyr 0.7.2
tempdat %>%
  spread(key = year, value = mpg)
# finally, give the two year columns syntactic names and keep the wide table
dat.mpg.wide <- tempdat %>%
  spread(key = year, value = mpg) %>%
  rename(mpg_1999 = `1999`, mpg_2008 = `2008`)
dat.mpg.wide
Regress mpg_2008 on mpg_1999
# fit a simple linear regression of the 2008 mpg on the 1999 mpg using the
# wide table, then print coefficients, residual summary and fit statistics
modl <- lm(formula = mpg_2008 ~ mpg_1999, data=dat.mpg.wide)
summary(modl)
##
## Call:
## lm(formula = mpg_2008 ~ mpg_1999, data = dat.mpg.wide)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.0981 -0.5320 -0.0038 0.7256 2.8691
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.77916 1.05927 1.68 0.102
## mpg_1999 0.93669 0.05267 17.79 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.523 on 36 degrees of freedom
## Multiple R-squared: 0.8978, Adjusted R-squared: 0.895
## F-statistic: 316.3 on 1 and 36 DF, p-value: < 2.2e-16
Want to learn more? Two great resources: