You can use RStudio in the cloud!

Sign up with a Google or GitHub account at rstudio.cloud.

library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1     ✔ purrr   0.2.4
## ✔ tibble  1.4.2     ✔ dplyr   0.7.4
## ✔ tidyr   0.7.2     ✔ stringr 1.2.0
## ✔ readr   1.1.1     ✔ forcats 0.2.0
## Warning: package 'tibble' was built under R version 3.4.3
## ── Conflicts ─────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## you may need to install tidyverse first:
# install.packages('tidyverse')
# NOTE: this script deliberately does not call rm(list = ls()) or setwd().
# A hard-coded, machine-specific working directory makes the script stop with
# an error on any other computer (including RStudio Cloud), and clearing the
# workspace silently deletes whatever the reader already had loaded. None of
# the steps in this walkthrough read or write files, so neither call is needed;
# open the script from an RStudio project if you want a fixed working directory.

R and its packages ship with many built-in example datasets. Let’s load one of them: mpg, which comes with ggplot2 (loaded as part of the tidyverse).

# mpg ships with ggplot2 as a tibble; keep a plain data.frame copy to work on
datauto <- as.data.frame(mpg)

Get the dimensions of the data frame (rows and columns) and store them in new objects.

# rows first, then columns
dim(datauto)
## [1] 234  11
# nrow() and ncol() return the first and second element of dim()
n.obs <- nrow(datauto)
n.cols <- ncol(datauto)

Get a sense of the data.

# compact overview: one line per column with its type and first few values
str(object = datauto)
## 'data.frame':    234 obs. of  11 variables:
##  $ manufacturer: chr  "audi" "audi" "audi" "audi" ...
##  $ model       : chr  "a4" "a4" "a4" "a4" ...
##  $ displ       : num  1.8 1.8 2 2 2.8 2.8 3.1 1.8 1.8 2 ...
##  $ year        : int  1999 1999 2008 2008 1999 1999 2008 1999 1999 2008 ...
##  $ cyl         : int  4 4 4 4 6 6 6 4 4 4 ...
##  $ trans       : chr  "auto(l5)" "manual(m5)" "manual(m6)" "auto(av)" ...
##  $ drv         : chr  "f" "f" "f" "f" ...
##  $ cty         : int  18 21 20 21 16 18 18 18 16 20 ...
##  $ hwy         : int  29 29 31 30 26 26 27 26 25 28 ...
##  $ fl          : chr  "p" "p" "p" "p" ...
##  $ class       : chr  "compact" "compact" "compact" "compact" ...
# show the first six rows (the default for head())
head(datauto, n = 6L)
# per-column summary statistics
summary(object = datauto)
##  manufacturer          model               displ            year     
##  Length:234         Length:234         Min.   :1.600   Min.   :1999  
##  Class :character   Class :character   1st Qu.:2.400   1st Qu.:1999  
##  Mode  :character   Mode  :character   Median :3.300   Median :2004  
##                                        Mean   :3.472   Mean   :2004  
##                                        3rd Qu.:4.600   3rd Qu.:2008  
##                                        Max.   :7.000   Max.   :2008  
##       cyl           trans               drv                 cty       
##  Min.   :4.000   Length:234         Length:234         Min.   : 9.00  
##  1st Qu.:4.000   Class :character   Class :character   1st Qu.:14.00  
##  Median :6.000   Mode  :character   Mode  :character   Median :17.00  
##  Mean   :5.889                                         Mean   :16.86  
##  3rd Qu.:8.000                                         3rd Qu.:19.00  
##  Max.   :8.000                                         Max.   :35.00  
##       hwy             fl               class          
##  Min.   :12.00   Length:234         Length:234        
##  1st Qu.:18.00   Class :character   Class :character  
##  Median :24.00   Mode  :character   Mode  :character  
##  Mean   :23.44                                        
##  3rd Qu.:27.00                                        
##  Max.   :44.00

The variable class is categorical, so let’s convert it to a factor. summary() is more useful when applied to a factor, because it reports the number of observations in each level.

# add a factor version of class as a new column, then count rows per level
datauto[["f.class"]] <- factor(datauto[["class"]])
summary(datauto[["f.class"]])
##    2seater    compact    midsize    minivan     pickup subcompact 
##          5         47         41         11         33         35 
##        suv 
##         62

Sort data by city mpg

# ascending order (the default for arrange())
datauto %>% arrange(cty)
# descending order: wrap the column in desc()
datauto %>% arrange(desc(cty))

Sort by manufacturer, model and year, and save the sorted result back to datauto.

# overwrite datauto with its sorted version, then print it
datauto <- datauto %>%
  arrange(manufacturer, model, year)
print(datauto)

Give me the names of all the manufacturers

unique(datauto[["manufacturer"]])
##  [1] "audi"       "chevrolet"  "dodge"      "ford"       "honda"     
##  [6] "hyundai"    "jeep"       "land rover" "lincoln"    "mercury"   
## [11] "nissan"     "pontiac"    "subaru"     "toyota"     "volkswagen"
# equivalently: 
# unique( datauto$manufacturer )

How many models per year did each manufacturer have?

# one row per manufacturer-year, counting the distinct models in that group
summarise(
  group_by(datauto, manufacturer, year),
  n.models = n_distinct(model)
)

Keep only the rows for manufacturers that had just one model in a given year.

# group-wise filter: a row is kept when its manufacturer-year group
# contains exactly one distinct model
filter(
  group_by(datauto, manufacturer, year),
  n_distinct(model) == 1
)

Compute mean cty, hwy and displ by manufacturer

# collapse to one row per manufacturer holding the three column means
datauto.means <- summarise(
  group_by(datauto, manufacturer),
  mean.cty = mean(cty),
  mean.hwy = mean(hwy),
  mean.displ = mean(displ)
)

print(datauto.means)

Use these means to produce a scatter plot.

# ggrepel provides geom_text_repel(), which keeps the labels from overlapping
library(ggrepel)
plot1 <- datauto.means %>%
  ggplot(mapping = aes(x = mean.hwy, y = mean.cty)) +
  geom_point() +
  geom_text_repel(mapping = aes(label = manufacturer))

print(plot1)

Vary point color by mean displacement.

# add a second point layer whose colour is mapped to mean displacement
plot1 <- plot1 +
  geom_point(mapping = aes(colour = mean.displ))

print(plot1)

Add line of best fit.

# draw (without storing) the plot with a linear-model fit layered on top
print(plot1 + geom_smooth(method = "lm"))

Find the relationship between each model’s average combined (cty and hwy) mpg in 1999 and in 2008.

To do this, need to ‘spread’ (reshape) the data.

# step 1 (preview only): combined mpg is the mean of city and highway mpg
mutate(datauto, mpg = (cty + hwy) / 2)
# step 2: average that combined mpg within each manufacturer, model and year
tempdat <- datauto %>%
  mutate(mpg = (cty + hwy) / 2) %>%
  group_by(manufacturer, model, year) %>%
  summarise(mpg = mean(mpg))

print(tempdat)
# step 3 (preview only): one column per year, filled with that year's mpg
spread(tempdat, key = year, value = mpg)
# step 4: same reshape, stored with syntactic names for the two year columns
dat.mpg.wide <- rename(
  spread(tempdat, key = year, value = mpg),
  mpg_1999 = `1999`,
  mpg_2008 = `2008`
)
print(dat.mpg.wide)

Regress mpg_2008 on mpg_1999

# simple linear regression: 2008 mpg as a function of 1999 mpg
modl <- lm(mpg_2008 ~ mpg_1999, data = dat.mpg.wide)

summary(modl)
## 
## Call:
## lm(formula = mpg_2008 ~ mpg_1999, data = dat.mpg.wide)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.0981 -0.5320 -0.0038  0.7256  2.8691 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.77916    1.05927    1.68    0.102    
## mpg_1999     0.93669    0.05267   17.79   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.523 on 36 degrees of freedom
## Multiple R-squared:  0.8978, Adjusted R-squared:  0.895 
## F-statistic: 316.3 on 1 and 36 DF,  p-value: < 2.2e-16

Want to learn more? Two great resources:

R for Data Science by Hadley Wickham

Programming with R by Software Carpentry