#####################
### Workshop Code ###
#####################

### R Install ###
# Not in the presentation, but might be handy for you.
# R: https://cloud.r-project.org/
# RStudio: https://www.rstudio.com/products/rstudio/download/#download

# Briefly, you can do everything you would ever need with R alone.
# RStudio is an IDE that makes working with R easier. You get a
# scripting environment, package manager, viewers, etc. While I
# might not have earned it, please just trust me that it will make
# learning R much easier. I learned it the old-school way and have
# the brain damage to show for it.

### Package Installation ###

# The easiest way to run a line of code is to put your cursor
# anywhere on the line and press Control + Enter. If we have a
# piped chunk (this will become clear later), then you can put
# your cursor anywhere on the chunk and run it. There is almost
# never a need to highlight everything to run it.

# Download and install the packages this workshop uses or mentions.
install.packages(c("tidyverse", "plotly", "DT", "data.table"))

# You will only ever need to run the previous line once (unless you update R).

### The Basics ###

# An integer vector holding 1, 2, 3, 4, 5 (seq_len(5) is the same as 1:5).
numList <- seq_len(5)

# Typing an object's name on its own prints it to the console.
numList

### The Functions ###

# Toy example of a user-defined function.
# Adds the square root of `x` to the sine of `x` and squares that sum.
# The arithmetic is element-wise, so `x` may be one number or a numeric
# vector; the result has the same length as `x`.
nonsenseFunction <- function(x) {
  rootPlusSine <- sqrt(x) + sin(x)
  # The value of the last expression is what the function returns.
  rootPlusSine^2
}

# Store the inputs in an object, then hand that object to the function.
test <- 1:10

nonsenseFunction(test)

# Naming the input first is optional -- the vector can be written
# directly inside the call:

nonsenseFunction(1:10)

# No intermediate object was created that time; the vector went straight
# into our function. The values a function returns can be saved, too:

nonSenseOutput <- nonsenseFunction(test)

# So, we have created a new object using our test object and our function!

### Our Data ###

library(dplyr); library(rvest); library(xml2); library(magrittr)

# Address of the Wikipedia page whose tables we want.
filmLink <- "https://en.wikipedia.org/wiki/List_of_highest-grossing_films"

# Fetch the page and parse the HTML tables found on it. Same work as the
# piped form read_html(filmLink) %>% html_table(fill = TRUE).
allTables <- html_table(read_html(filmLink), fill = TRUE)

# Keep only the first table on the page.
highestGrossing <- allTables[[1]]

# Peek at the first few rows.
head(highestGrossing)

# You may want to see all of your data -- use View(highestGrossing)
# This is an important note to make: R is case sensitive.
# Be honest -- you typed view() with a lowercase "v", right?
# If you didn't make that mistake, then pat yourself on the back.

### Action! (Or using select) ###

# Drop the two columns we will not use; a leading minus means "remove".
highestGrossing <- select(highestGrossing, -c(Rank, `Reference(s)`))

head(highestGrossing)

### String Cleaning ###

library(stringr)

highestGrossing <- highestGrossing %>%
  mutate(
    # Remove the first capital letter and whatever follows it, then
    # convert what is left to a number.
    Peak = as.numeric(str_replace(Peak, "[A-Z].*", "")),
    # Strip every dollar sign and comma so the amount converts to a number.
    gross = as.numeric(str_replace_all(`Worldwide gross`, "\\$|,", ""))
  ) %>%
  # The text version of the gross is no longer needed.
  select(-`Worldwide gross`)

### Filter ###

# Keep only the rows whose Peak is neither 1 nor the largest Peak in the
# data. Conditions separated by commas in filter() must all hold.
highestGrossing <- highestGrossing %>%
  filter(Peak != 1, Peak != max(Peak))

### GDP Deflator ###

# Scrape the first table on the GDP deflator page, keep the row(s) whose
# Date contains "2016", drop Date, and pull out the first remaining column.
# NOTE(review): the object name says 2017 but the filter keeps 2016 --
# confirm which year is intended.
# NOTE(review): if more than one Date matches "2016", this is a vector
# with several values rather than a single number -- verify against the
# live table.
gdpDeflator2017 <- read_html("http://www.multpl.com/gdp-deflator/table") %>%
  html_table(trim = TRUE, header = TRUE) %>%
  extract2(1) %>%
  filter(grepl("2016", Date)) %>%
  select(-Date) %>%
  extract2(1)

### Using Mutate ###

# Add a deflated gross column: gross divided by the deflator after the
# deflator has been divided by 100.
highestGrossing <- mutate(
  highestGrossing,
  grossReal2017 = gross / (gdpDeflator2017 / 100)
)

### Group_by ###

# Mark the data as grouped by Year. The rows themselves are unchanged;
# the grouping is stored on the object and is used by later verbs.
highestGrossing <- group_by(highestGrossing, Year)

### Summarize ###

# Collapse the data to a row count and an average deflated gross.
# This relies on highestGrossing already carrying its Year grouping, so
# the counts and means are computed within each Year.
highestGrossingSummary <- summarise(
  highestGrossing,
  n = n(),
  meanGross = mean(grossReal2017)
)

head(highestGrossingSummary)

### Coding How We Think ###

# The same steps as above, written as one chain that reads top to bottom:
# scrape, take the first table, drop unused columns, clean the text
# columns, filter, deflate, group by Year, and summarize.
#
# The result is a Year-level summary, so it is stored as
# highestGrossingSummary. Storing it as allTables would replace the list
# of scraped tables, and re-running the earlier extract2(1) step would
# then pull the first column of the summary instead of the first table.
highestGrossingSummary = read_html(filmLink) %>%
  html_table(fill = TRUE) %>%
  extract2(1) %>%
  select(-Rank, -`Reference(s)`) %>%
  # Remove the first capital letter and everything after it from Peak,
  # and strip "$" and "," from the gross, before converting to numbers.
  mutate(Peak = str_replace(Peak, "[A-Z].*", ""),
         Peak = as.numeric(Peak),
         gross = as.numeric(str_replace_all(`Worldwide gross`, "\\$|,", ""))) %>%
  select(-`Worldwide gross`) %>%
  filter(Peak != 1 & Peak != max(Peak)) %>%
  mutate(grossReal2017 = gross/(gdpDeflator2017/100)) %>%
  group_by(Year) %>%
  # One row per Year: the number of films and their mean deflated gross.
  summarize(n = n(),
            meanGross = mean(grossReal2017))


# Are you still here, after all this time?
# I am proud of you! Not only am I proud of
# you, but so is Bill "The Wizard" Venables.
# If you have made it here, without tears,
# speak now (yes, right now -- I am probably
# not saying anything too important) to
# get your pick of candy. Be fast, though,
# because it will only go to one person.

### An Interactive Look ###

library(DT)

# Show the summary as an interactive table.
highestGrossingSummary %>%
  datatable()

### Visual Exploration ###

library(ggplot2)

# Scatter plot: Year on the x-axis, mean gross on the y-axis, and point
# size mapped to n.
ggplot(
  data = highestGrossingSummary,
  mapping = aes(x = Year, y = meanGross, size = n)
) +
  geom_point() +
  theme_minimal()

# If this is your first exposure to ggplot2, then it
# might seem daunting -- just know that it isn't.
# You can do things with ggplot2 that you never thought
# possible. With regard to a typical plot, I went pretty
# sparse here, but do know that the sky is the limit and
# we can build layer upon layer.

### Something A Bit More Modern ###

# For each Year, paste the film titles into one string (one title per
# line), then attach the summary columns by matching on Year.
movieByYear <- highestGrossing %>%
  group_by(Year) %>%
  dplyr::summarise(movies = paste(Title, collapse = "\n")) %>%
  right_join(highestGrossingSummary, by = "Year")

library(plotly)

# Same scatter plot as before, now colored by meanGross and with the
# y-axis text and title hidden. The `text` mapping carries the film
# titles so they can be shown in the tooltip below.
p <- ggplot(
  data = movieByYear,
  mapping = aes(x = Year, y = meanGross, size = n, color = meanGross)
) +
  geom_point(mapping = aes(text = movies)) +
  theme_minimal() +
  theme(
    axis.text.y = element_blank(),
    axis.title.y = element_blank()
  )

# Convert the ggplot object to a plotly one; the tooltip lists the text,
# size, and color mappings.
ggplotly(p, tooltip = c("text", "size", "color"))

### Coalesce ###

# Two columns with values in complementary rows: x1 holds 1-5 followed by
# five NAs, and x2 holds five NAs followed by 6-10.
testDF <- data.frame(
  x1 = c(1:5, rep(NA, 5)),
  x2 = c(rep(NA, 5), 6:10)
)

testDF

# coalesce() takes, row by row, the first value that is not NA.
mutate(testDF, z = coalesce(x1, x2))

# The first person to yell their favorite
# dplyr function gets to pick some candy.
# If you start a slow-clap/chant, I will throw
# in an extra pick from the candy bag; I see no reason
# why I should be the only person looking foolish.

### Parting Thoughts ###

# I wish that we had the time to get into many other things, but
# we don't. Do know, however, that dplyr makes joining data frames
# simple (left_join, right_join, inner_join, full_join, anti_join)
# and it has a ton of other functions. Similarly, the tidyr package
# makes reshaping data a breeze (wide to long, long to wide, and some
# other handy features for splitting and joining columns). Finally, while
# not part of the tidyverse, the data.table package is great for larger
# data issues. The syntax is wacky, but it can do a lot of neat things.