You can use RStudio in the cloud!

Sign up with a Google or GitHub account at rstudio.cloud.

library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1     ✔ purrr   0.2.4
## ✔ tibble  1.4.2     ✔ dplyr   0.7.4
## ✔ tidyr   0.7.2     ✔ stringr 1.2.0
## ✔ readr   1.1.1     ✔ forcats 0.2.0
## Warning: package 'tibble' was built under R version 3.4.3
## ── Conflicts ─────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## you may need to install tidyverse first:
# install.packages('tidyverse')
# Remove objects left over from earlier work so the demo starts from a clean
# workspace. NOTE(review): this deletes every object in the current
# environment; restarting the R session is the safer way to get a clean slate.
rm(list = ls())

# The path below is absolute and user-specific. Calling setwd() on a folder
# that does not exist stops the script with an error, so only change the
# working directory when the folder is actually present on this machine.
demo.dir <- '/Users/jng2/Dropbox/Work/Library/CDS/R-RStudio/demo1hr'
if (dir.exists(demo.dir)) {
  setwd(demo.dir)
}

R and its packages ship with many example datasets. Let’s load one of them: mpg, which comes with ggplot2 (loaded as part of the tidyverse).

# convert the mpg example data to a plain data frame for the rest of the demo
datauto <- as.data.frame(mpg)

Get the dimensions of the data frame and store them in new objects.

# dim() returns the number of rows followed by the number of columns
dim(datauto)
## [1] 234  11
# keep each dimension in its own object for later use
n.obs <- nrow(datauto)
n.cols <- ncol(datauto)

Get a sense of the data.

# compact structural overview: each column's type and its first few values
str(object = datauto)
## 'data.frame':    234 obs. of  11 variables:
##  $ manufacturer: chr  "audi" "audi" "audi" "audi" ...
##  $ model       : chr  "a4" "a4" "a4" "a4" ...
##  $ displ       : num  1.8 1.8 2 2 2.8 2.8 3.1 1.8 1.8 2 ...
##  $ year        : int  1999 1999 2008 2008 1999 1999 2008 1999 1999 2008 ...
##  $ cyl         : int  4 4 4 4 6 6 6 4 4 4 ...
##  $ trans       : chr  "auto(l5)" "manual(m5)" "manual(m6)" "auto(av)" ...
##  $ drv         : chr  "f" "f" "f" "f" ...
##  $ cty         : int  18 21 20 21 16 18 18 18 16 20 ...
##  $ hwy         : int  29 29 31 30 26 26 27 26 25 28 ...
##  $ fl          : chr  "p" "p" "p" "p" ...
##  $ class       : chr  "compact" "compact" "compact" "compact" ...
# preview the top of the table (head() shows six rows unless told otherwise)
head(datauto, n = 6L)
# per-column summary statistics
summary(object = datauto)
##  manufacturer          model               displ            year     
##  Length:234         Length:234         Min.   :1.600   Min.   :1999  
##  Class :character   Class :character   1st Qu.:2.400   1st Qu.:1999  
##  Mode  :character   Mode  :character   Median :3.300   Median :2004  
##                                        Mean   :3.472   Mean   :2004  
##                                        3rd Qu.:4.600   3rd Qu.:2008  
##                                        Max.   :7.000   Max.   :2008  
##       cyl           trans               drv                 cty       
##  Min.   :4.000   Length:234         Length:234         Min.   : 9.00  
##  1st Qu.:4.000   Class :character   Class :character   1st Qu.:14.00  
##  Median :6.000   Mode  :character   Mode  :character   Median :17.00  
##  Mean   :5.889                                         Mean   :16.86  
##  3rd Qu.:8.000                                         3rd Qu.:19.00  
##  Max.   :8.000                                         Max.   :35.00  
##       hwy             fl               class          
##  Min.   :12.00   Length:234         Length:234        
##  1st Qu.:18.00   Class :character   Class :character  
##  Median :24.00   Mode  :character   Mode  :character  
##  Mean   :23.44                                        
##  3rd Qu.:27.00                                        
##  Max.   :44.00

The variable class is categorical, so let’s convert it to a factor. summary() is more useful when applied to a factor: it reports the number of observations in each level.

# store a factor version of `class` in a new column, leaving the original intact
datauto[['f.class']] <- factor(datauto[['class']])
# for a factor, summary() counts the rows that fall in each level
summary(datauto[['f.class']])
##    2seater    compact    midsize    minivan     pickup subcompact 
##          5         47         41         11         33         35 
##        suv 
##         62

Sort data by city mpg

# lowest city mpg first (ascending is arrange()'s default)
datauto %>% arrange(cty)
# highest city mpg first
datauto %>% arrange(desc(cty))

Sort by manufacturer, model and year, and save the sorted result back to datauto.

# overwrite datauto with its sorted version so later steps see the new order
datauto <- datauto %>%
  arrange(manufacturer, model, year)
# show the re-ordered data
datauto

Give me the names of all the manufacturers

# pull the manufacturer column out as a vector and drop repeated values
unique(datauto[['manufacturer']])
##  [1] "audi"       "chevrolet"  "dodge"      "ford"       "honda"     
##  [6] "hyundai"    "jeep"       "land rover" "lincoln"    "mercury"   
## [11] "nissan"     "pontiac"    "subaru"     "toyota"     "volkswagen"
# equivalently: 
# unique( datauto$manufacturer )

How many models per year did each manufacturer have?

# one row per manufacturer/year pair, counting the distinct models in each
summarise(
  group_by(datauto, manufacturer, year),
  n.models = n_distinct(model)
)

Keep only the rows for manufacturers that had just one model in a given year.

# group rows by manufacturer and year, then keep a group's rows only when
# that group contains exactly one distinct model
dplyr::filter(
  group_by(datauto, manufacturer, year),
  n_distinct(model) == 1L
)

Compute mean cty, hwy and displ by manufacturer

# average city mpg, highway mpg and displacement, one row per manufacturer
datauto.means <- summarise(
  group_by(datauto, manufacturer),
  mean.cty = mean(cty),
  mean.hwy = mean(hwy),
  mean.displ = mean(displ)
)

datauto.means

Use these means to produce a scatter plot of mean city mpg against mean highway mpg, with each point labelled by manufacturer.

library(ggrepel)
# x = mean highway mpg, y = mean city mpg; geom_text_repel() adds the
# manufacturer name next to each point
plot1 <- ggplot(datauto.means, aes(mean.hwy, mean.cty)) +
  geom_point() +
  geom_text_repel(mapping = aes(label = manufacturer))

plot1

Vary point color by mean displacement.

# add a second point layer on top whose colour encodes mean displacement
plot1 <- plot1 +
  geom_point(mapping = aes(colour = mean.displ))

plot1