---
title: "Reddit Post Experiment"
author: "Tim Weninger, Thomas J. Johnston, Maria Glenski"
date: "2/6/2015"
output: html_document
---

###Reddit Post Experiment dataset can be found here: 
<http://www3.nd.edu/~tweninge/data/reddit_post_manipulation_data.csv>

###The R-Markdown code can be found here:
<http://www3.nd.edu/~tweninge/data/reddit_report.Rmd>

These are the statistics generated from the reddit data.

# Basic dataset statistics:

```{r, echo=FALSE, message=FALSE}

setwd("/data/tweninge/reddit_pnas")
post_dat = read.csv("reddit_post_manipulation_data.csv", header = TRUE, sep="\t")
post_dat$score = post_dat$ups - post_dat$downs

library("ggplot2", lib.loc="~/R/x86_64-pc-linux-gnu-library/3.0")
library(grid)
library("moments", lib.loc="~/R/x86_64-pc-linux-gnu-library/3.0")
library("fitdistrplus", lib.loc="~/R/x86_64-pc-linux-gnu-library/3.0")
library(data.table)

```

Number of posts, N=

```{r, echo=FALSE}
n = nrow(post_dat)
n
```

Control:

```{r , echo=FALSE}
nothingHits = post_dat$action == 'Nothing' 
nothingData = post_dat[nothingHits,]

count_control = nrow(nothingData)
count_control
```

Positive Treatment:

```{r, echo=FALSE}
upvoteHits = post_dat$action == 'Upvote' 
upvoteData = post_dat[upvoteHits,]

count_upvote = nrow(upvoteData)
count_upvote
```

Negative Treatment:

```{r, echo=FALSE}
downvoteHits = post_dat$action == 'Downvote' 
downvoteData = post_dat[downvoteHits,]

count_downvote = nrow(downvoteData)
count_downvote
```

## Post score (ups-downs) summary statistics

All scores:

```{r, echo=FALSE} 
nothingScore = nothingData$score
upvoteScore = upvoteData$score
downvoteScore =  downvoteData$score

allScore = c(t(nothingScore), t(upvoteScore), t(downvoteScore))

summary(allScore)

```

Control Scores:

```{r, echo=FALSE}
summary(nothingScore)
```

Positive Treatment Scores (without adjustment):

```{r, echo=FALSE}
summary(upvoteScore)
```

Negative Treatment Scores (without adjustment):

```{r, echo=FALSE}
summary(downvoteScore)
```

Boxplot Comparison:

```{r, echo=FALSE}
boxplot(list(nothingScore, upvoteScore, downvoteScore), names = list("Control", "Positive", "Negative"))
```

Boxplot Comparison (log):

```{r, echo=FALSE}
lns = nothingScore
#eliminate nothing scores <= 0
lns = lns[lns>0]  
lus = upvoteScore
#eliminate upvote scores <= 0
lus = lus[lus>0]
lds = downvoteScore
#eliminate downvote scores <= 0
lds = lds[lds>0]

sk_nothing <- skewness(log(lns))
sk_upvote <- skewness(log(lus))
sk_downvote <- skewness(log(lds))

print("Skewness (log) by Treatment:")
paste("Control = ", sk_nothing)
paste("Positive = ", sk_upvote)
paste("Negative = ", sk_downvote)

kur_nothing <- kurtosis(log(lns))
kur_upvote <- kurtosis(log(lus))
kur_downvote <- kurtosis(log(lds))

print("Kurtosis (log) by Treatment:")
paste("Control = ", kur_nothing)
paste("Positive = ", kur_upvote)
paste("Negative = ", kur_downvote)

print("Boxplot Comparison (log) of Scores by Treatment")
boxplot(list( log(lns), log(lus), log(lds) ), names = list("Control", "Positive", "Negative"))

```

Line graph Comparison:

```{r, echo=FALSE}
plot( rev(sort(nothingScore)),
  type = "l", 
  ylim = c(min(nothingScore), max(nothingScore)),
  ylab = "value")

lines(rev(sort(upvoteScore)), col = "blue")
lines(rev(sort(downvoteScore)), col = "red")
```


Outlier Elimination:

```{r outlier elimination, echo=FALSE, message=FALSE, warning=FALSE}

require(gridExtra)

## Summarizes data.
## Gives count, mean, standard deviation, standard error of the mean, and confidence interval (default 95%).
##   data: a data frame.
##   measurevar: the name of a column that contains the variable to be summariezed
##   groupvars: a vector containing names of columns that contain grouping variables
##   na.rm: a boolean that indicates whether to ignore NA's
##   conf.interval: the percent range of the confidence interval (default is 95%)
summarySE <- function(data=NULL, measurevar, groupvars=NULL, na.rm=FALSE,
                      conf.interval=.95, .drop=TRUE) {
    require(plyr)

    # New version of length which can handle NA's: if na.rm==T, don't count them
    length2 <- function (x, na.rm=FALSE) {
        if (na.rm) sum(!is.na(x))
        else       length(x)
    }

    # This does the summary. For each group's data frame, return a vector with
    # N, mean, and sd
    datac <- ddply(data, groupvars, .drop=.drop,
      .fun = function(xx, col) {
        c(N    = length2(xx[[col]], na.rm=na.rm),
          mean = mean   (xx[[col]], na.rm=na.rm),
          sd   = sd     (xx[[col]], na.rm=na.rm)
        )
      },
      measurevar
    )

    # Rename the "mean" column    
    datac <- rename(datac, c("mean" = measurevar))

    datac$se <- datac$sd / sqrt(datac$N)  # Calculate standard error of the mean

    # Confidence interval multiplier for standard error
    # Calculate t-statistic for confidence interval: 
    # e.g., if conf.interval is .95, use .975 (above/below), and use df=N-1
    ciMult <- qt(conf.interval/2 + .5, datac$N-1)
    datac$ci <- datac$se * ciMult

    return(datac)
}


sk_post_dat <- skewness(post_dat$score) 
kur_post_dat <- kurtosis(post_dat$score) 
paste("Skewness for all scores = ", sk_post_dat)
paste("Kurtosis for all scores = ", kur_post_dat)


q1 = quantile(post_dat$score,  probs = c(0.0, 0.99), na.rm=TRUE)[1]
q3 = quantile(post_dat$score,  probs = c(0.0, 0.99), na.rm=TRUE)[2]

post_dat_elim = post_dat[post_dat$score>=q1,]
post_dat_elim = post_dat_elim[post_dat_elim$score<=q3,]

sk_post_dat_elim <- skewness(post_dat_elim$score) 
kur_post_dat_elim <- kurtosis(post_dat_elim$score) 
paste("Skewness for all scores (with outliers eliminated) = ", sk_post_dat_elim)
paste("Kurtosis for all scores (with outliers eliminated) = ", kur_post_dat_elim)

median_post_dat_nothing <- median(post_dat[post_dat$action=='Nothing',]$score)
median_post_dat_upvote <- median(post_dat[post_dat$action=='Upvote',]$score)
median_post_dat_downvote <- median(post_dat[post_dat$action=='Downvote',]$score)

dfc <- summarySE(post_dat, measurevar="score", groupvars=c("action"))

dfce <- summarySE(post_dat_elim, measurevar="score", groupvars=c("action"))


plot1 <- ggplot(dfc, aes(x=action, y=score, colour=action, group=action, fill=action, shape=action)) + 
    geom_errorbar(aes(ymin=score-ci, ymax=score+ci), colour="black", width=.1) +
  geom_errorbar(aes(ymin=score-se, ymax=score+se), colour="red", width=.1) +
    geom_point(size=3)+
xlab("Treatment") +
    ylab("Final Post Score") +
  scale_x_discrete(breaks=c("Downvote", "Nothing", "Upvote"), labels=c("Down Vote", "Control", "Up Vote")) +
    scale_color_manual(values=c("#000000", "#000000", "#000000"))+
    ggtitle("Voting Effects on Final Post Score") +
    scale_y_continuous(
                       breaks=0:20*2) +                       # Set tick every 4
    theme_classic() +
  scale_shape_manual(values=c(25,21, 24)) +
  scale_fill_manual(values=c("#FF0000", "#0000FF", "#00FF00"))+
  theme(
    legend.position = "none"
  )

plot2 <- ggplot(dfce, aes(x=action, y=score, colour=action, group=action, fill=action, shape=action)) + 
    geom_errorbar(aes(ymin=score-ci, ymax=score+ci), colour="black", width=.1) +
  geom_errorbar(aes(ymin=score-se, ymax=score+se), colour="red", width=.1) +
    geom_point(size=3)+
xlab("Treatment") +
    ylab("Final Post Score") +
    scale_color_manual(values=c("#000000", "#000000", "#000000"))+
    ggtitle(expression(atop("Voting Effects on Final Post Score", atop(italic("Top 1% Removed"), "")))) +    
    theme_classic() +
  scale_shape_manual(values=c(25,21, 24)) +
  scale_fill_manual(values=c("#FF0000", "#0000FF", "#00FF00"))+
  scale_x_discrete(breaks=c("Downvote", "Nothing", "Upvote"), labels=c("Down Vote", "Control", "Up Vote")) +
  theme(
    legend.position = "none"
  )

plot1

n = nrow(post_dat[post_dat$action=='Nothing',])
probs = vector()

#prob of reaching the front page >1000 votes
for ( i in seq(from=500,to=2000,by=20) ){
  probs[i] <-nrow(post_dat[post_dat$score>=i & post_dat$action=='Nothing',])/n
}
probs<-probs[!is.na(probs)]

n = nrow(post_dat[post_dat$action=='Upvote',])
probs_u = vector()

#prob of reaching the front page >1000 votes
for ( i in seq(from=500,to=2000,by=20) ){
  probs_u[i] <-nrow(post_dat[post_dat$score>=i & post_dat$action=='Upvote',])/n
}
probs_u<-probs_u[!is.na(probs_u)]


n = nrow(post_dat[post_dat$action=='Downvote',])
probs_d = vector()

#prob of reaching the front page >1000 votes
for ( i in seq(from=500,to=2000,by=20) ){
  probs_d[i] <-nrow(post_dat[post_dat$score>=i & post_dat$action=='Nothing',])/n
}
probs_d<-probs_d[!is.na(probs_d)]


df <- data.frame(probs, probs_u, probs_d)
colnames(df) <- c("Control", "Upvote", "Downvote")


n = nrow(post_dat[post_dat$action=='Nothing',])
probs = vector()

#prob of reaching the front page >1000 votes
for ( i in seq(from=1,to=3000/50) ){
  probs[i] = nrow(post_dat[post_dat$score>=((i)*50-100) & post_dat$action=='Nothing',])/n
}
probs=probs[!is.na(probs)]

n = nrow(post_dat[post_dat$action=='Upvote',])
probs_u = vector()

#prob of reaching the front page >1000 votes
for ( i in seq(from=1,to=3000/50) ){
  probs_u[i] = nrow(post_dat[post_dat$score>=((i)*50-100) & post_dat$action=='Upvote',])/n
}
probs_u<-probs_u[!is.na(probs_u)]


n = nrow(post_dat[post_dat$action=='Downvote',])
probs_d = vector()

#prob of reaching the front page >1000 votes
for ( i in seq(from=1,to=3000/50) ){
  probs_d[i] = nrow(post_dat[post_dat$score>=((i)*50-100) & post_dat$action=='Downvote',])/n
}
probs_d<-probs_d[!is.na(probs_d)]


df2 <- data.frame(probs, probs_u, probs_d)
colnames(df2) <- c("Control", "Upvote", "Downvote")
                  
main <- ggplot(df, aes(seq(from=500,to=2000,by=20))) +
    geom_line(aes(y=Upvote, colour="Upvote"), linetype="dotdash", size=1.2) + 
  geom_line(aes(y=Downvote, colour="Downvote"), linetype="dashed", size=1.2, alpha=1) +
  geom_line(aes(y=Control, colour="Control"), linetype="solid", size=1.2, alpha=0.8) +
  xlab("Final Post Score") +
  ggtitle("Final Score Probability by Vote Effect") +  
    ylab("Probability (log scale)") +
      theme_classic() +
  scale_colour_manual("", 
                      breaks = c("Upvote", "Downvote", "Control"),
                      values = c("#0000AA", "#990000", "#008800")) +
  scale_y_log10(breaks=c(.003, .005, .01)) +
  theme(legend.justification=c(1,1), legend.position=c(1,1))

sub <- ggplot(df2, aes(seq(from=-50,to=2900,by=50))) +
    geom_line(aes(y=Upvote), colour="#008800", linetype="dotdash", size=1.1) + 
  geom_line(aes(y=Downvote), colour="#990000", linetype="dashed", size=1.1, alpha=1) +
  geom_line(aes(y=Control), colour="#0000AA", linetype="solid", size=1.1, alpha=0.7) +
  xlab("Final Post Score") +
  ggtitle("") +
    ylab("Probability (log scale)") +
      theme(panel.grid.minor = element_blank(), 
    panel.grid.major = element_blank(),
    panel.background = element_blank(), plot.background = element_blank()) +
  scale_y_log10(breaks=c(.001, .01, .1, 1)) + 
  geom_rect(aes(xmin = 500, xmax = 2000, ymin = .001, ymax = .015), color="black", alpha=0.001)

print(main)
vp <- viewport(width = .7, height = .7, x=.07, y=.07, just=c("left","bottom"))
pushViewport(vp)
print(sub,vp=vp)


```


## Distribution Test
```{r, echo=FALSE}

qqnorm(nothingScore, main="Control Score QQ-Plot"); qqline(nothingScore)
qqnorm(upvoteScore, main="Positive Treatment Score QQ-Plot"); qqline(upvoteScore)
qqnorm(downvoteScore, main="Negative Treatment Score QQ-Plot"); qqline(downvoteScore)


```

Shapiro Test Control:
```{r, echo=FALSE}
shapiro.test(sample(nothingScore,5000))
```

Shapiro Test Positive Treatment:
```{r, echo=FALSE}
shapiro.test(sample(upvoteScore,5000))
```

Shapiro Test Negative Treatment:
```{r, echo=FALSE}
shapiro.test(sample(downvoteScore,5000))
```

KS Test Control:
```{r, echo=FALSE, warning=FALSE}
ks.test(nothingScore, "pnorm")
```

KS Test Positive Treatment:
```{r, echo=FALSE, warning=FALSE}
ks.test(upvoteScore, "pnorm")
```

KS Test Negative Treatment:
```{r, echo=FALSE, warning=FALSE}
ks.test(downvoteScore, "pnorm")
```

```{r, echo=FALSE}
hist((lns), 100, col="black", main="Control Score Histogram", xlab="Score")
hist((lus), 100, col="black", main="Positive Treatment Score Histogram", xlab="Score")
hist((lds), 100, col="black", main="Negative Treatment Histogram", xlab="Score")
```


Fit to distribution:
```{r, echo=FALSE}

#Fit each treatment to Normal Distribution
print("Normal:")
print("Control")
fit1 <- fitdist((lns), "norm")
paste("AIC: ", fit1$aic)
paste("BIC: ", fit1$bic)
paste("LL: ", fit1$loglik)

print("Positive")
fit1 <- fitdist((lus), "norm")
paste("AIC: ", fit1$aic)
paste("BIC: ", fit1$bic)
paste("LL: ", fit1$loglik)

print("Negative")
fit1 <- fitdist((lds), "norm")
paste("AIC: ", fit1$aic)
paste("BIC: ", fit1$bic)
paste("LL: ", fit1$loglik)

print("Exponential:")
print("Control")
fit1 <- fitdist((lns), "exp")
paste("AIC: ", fit1$aic)
paste("BIC: ", fit1$bic)
paste("LL: ", fit1$loglik)

print("Positive")
fit1 <- fitdist((lus), "exp")
paste("AIC: ", fit1$aic)
paste("BIC: ", fit1$bic)
paste("LL: ", fit1$loglik)

print("Negative")
fit1 <- fitdist((lds), "exp")
paste("AIC: ", fit1$aic)
paste("BIC: ", fit1$bic)
paste("LL: ", fit1$loglik)

print("Pareto:")
print("Control")
fit1 <- fitdist(log(lns), "exp")
paste("AIC: ", fit1$aic)
paste("BIC: ", fit1$bic)
paste("LL: ", fit1$loglik)

print("Positive")
fit1 <- fitdist(log(lus), "exp")
paste("AIC: ", fit1$aic)
paste("BIC: ", fit1$bic)
paste("LL: ", fit1$loglik)

print("Negative")
fit1 <- fitdist(log(lds), "exp")
paste("AIC: ", fit1$aic)
paste("BIC: ", fit1$bic)
paste("LL: ", fit1$loglik)

```

Fit to Pareto:

```{r, echo=FALSE}
pwrdist <- function(u,...) {
       # u is vector of event counts, e.g. how many 
       # crimes was a given perpetrator charged for by the police
       fx <- table(u)
       i <- as.numeric(names(fx))
       y <- rep(0,max(i))
       y[i] <- fx
       m0 <- glm(y~log(1:max(i)),family=quasipoisson())
       print(summary(m0))
       sub <- paste("s=",round(m0$coef[2],2),"lambda=",sum(u)/length(u))      
       plot(i,fx,log="xy",xlab="x",sub=sub,ylab="votes",...)
       lines(1:max(i),(fitted(m0)),type="l")
       return(m0)
   }

pwrdist(lns)
pwrdist(lds)
pwrdist(lus)
```

#Comparison

```{r, echo=FALSE, warning=FALSE}
ups_nothing_kstest = ks.test(upvoteScore, nothingScore)
paste("KS Test -- Positive Treatment vs Control p=", ups_nothing_kstest$p.value)

downs_nothing_kstest = ks.test(downvoteScore, nothingScore)
paste("KS Test -- Positive Treatment vs Control p=", ups_nothing_kstest$p.value)

sanity_kstest = ks.test(sample(nothingScore, 500), sample(nothingScore, 500))
paste("Sanity Check -- KS Test -- Control vs Control p=", sanity_kstest$p.value)
```

```{r, echo=FALSE, warning=FALSE}
ups_nothing_ranksum = wilcox.test(upvoteScore, nothingScore)
paste("Wilcox Test -- Positive Treatment vs Control p=", ups_nothing_ranksum$p.value)

ups_nothing_ranksum = wilcox.test(downvoteScore, nothingScore)
paste("Wilcox Test -- Negative Treatment vs Control p=", ups_nothing_ranksum$p.value)

sanity_wilcoxtest = wilcox.test(sample(nothingScore, 500), sample(nothingScore, 500))
paste("Sanity Check -- Wilcox Test -- Control vs Control p=", sanity_wilcoxtest$p.value)
```


```{r, echo=FALSE}
ups_nothing_t = t.test(log(lus), log(lns))
paste("T Test -- Log Positive Treatment vs Log Control p=", ups_nothing_t$p.value)

ups_nothing_t = t.test(log(lds), log(lns))
paste("T Test -- Log Negative Treatment vs Log Control p=", ups_nothing_t$p.value)

sanity_ttest = t.test(sample(log(lns), 500), sample(log(lns), 500))
paste("Sanity Check -- T Test -- Log Control vs Log Control p=", sanity_ttest$p.value)
```


```{r, echo=FALSE, warning=FALSE}

up_data_bytime = list()

for ( i in sort( unique( upvoteData$wait ) ) ) { #[[4]]
  upvoteHits = upvoteData$wait == i
  tmp = upvoteData[upvoteHits,]
  up_data_bytime[[toString(i)]] <-  tmp[,5]-tmp[,6] 
}

down_data_bytime = list()

for ( i in sort( unique( downvoteData$wait ) ) ) {
  downvoteHits = downvoteData$wait == i
  tmp = downvoteData[downvoteHits,]
  down_data_bytime[[toString(i)]] = tmp[,5]-tmp[,6] 
}

up_data_kstest_bytime = list()
for ( i in sort( unique( upvoteData$wait ) ) ) {
  up_data_kstest_bytime[[toString(i)]] = ks.test(up_data_bytime[[toString(i)]], nothingScore)
  print(paste("KS Test -- Positive Treatment vs Control at ", i, " sec - D = ", up_data_kstest_bytime[[toString(i)]]$statistic , "p =", up_data_kstest_bytime[[toString(i)]]$p.value))
}


down_data_kstest_bytime = list()
for ( i in sort( unique( downvoteData[[4]] ) ) ) {
  down_data_kstest_bytime[[toString(i)]] = ks.test(down_data_bytime[[toString(i)]], nothingScore)
print(paste("KS Test -- Nevative Treatment vs Control at ", i, " sec - D = ", down_data_kstest_bytime[[toString(i)]]$statistic , "p =", down_data_kstest_bytime[[toString(i)]]$p.value))
}

up_data_ranksum_bytime = list()
for ( i in sort( unique( upvoteData[[4]] ) ) ) {
  up_data_ranksum_bytime[[toString(i)]] = wilcox.test(up_data_bytime[[toString(i)]], nothingScore)
  print(paste("Wilcox Test -- Positive Treatment vs Control at ", i, " sec - p =", up_data_ranksum_bytime[[toString(i)]]$p.value))
}

down_data_ranksum_bytime = list()
for ( i in sort( unique( downvoteData[[4]] ) ) ) {
  down_data_ranksum_bytime[[toString(i)]] = wilcox.test(down_data_bytime[[toString(i)]], nothingScore);
  print(paste("Wilcox Test -- Negative Treatment vs Control at ", i, " sec - p =", down_data_ranksum_bytime[[toString(i)]]$p.value))
}


up_data_ttest_bytime = list()
for ( i in sort( unique( upvoteData[[4]] ) ) ) {
  udbt = up_data_bytime[[toString(i)]]
  udbt = udbt[udbt>0]
  up_data_ttest_bytime[[toString(i)]] = t.test(log(udbt), log(lns))
  print(paste("T Test -- Log Positive Treatment vs Log Control at ", i, " sec - p =", up_data_ttest_bytime[[toString(i)]]$p.value))
}

down_data_ttest_bytime = list()
for ( i in sort( unique( downvoteData[[4]] ) ) ) {  
  ddbt = down_data_bytime[[toString(i)]]
  ddbt = ddbt[ddbt>0]
  down_data_ttest_bytime[[toString(i)]] = t.test(log(ddbt), log(lns))
  print(paste("T Test -- Log Negative Treatment vs Log Control at ", i, " sec - p =", down_data_ttest_bytime[[toString(i)]]$p.value))
}

up_summary_data_bytime = list()

for ( i in sort( unique( upvoteData[[4]] ) ) ) {   
  up_summary_data_bytime[[toString(i)]] = summary( up_data_bytime[[toString(i)]] );
}

down_summary_data_bytime = list()

for ( i in sort( unique( downvoteData[[4]] ) ) ) {   
  down_summary_data_bytime[[toString(i)]] = summary( down_data_bytime[[toString(i)]] );
}


gp <- ggplot(post_dat, aes(x=factor(wait), y=ups-downs, color=action, group=interaction(action,wait) ))
gp +
     geom_boxplot() +scale_y_log10()


dfc <- summarySE(post_dat_elim, measurevar="score", groupvars=c("action", "wait"))


dfc$wait <- dfc$wait/60
dfc$wait <- as.factor(as.character(dfc$wait))
dfc$wait = factor(dfc$wait,levels(dfc$wait)[c(1,2,3,6,4,5,7)])

pd <- position_dodge(.3)

dfc.yinter_nothing = subset(dfc, action == 'Nothing', select=c(score))
dfc.yinter_up = subset(dfce, action == 'Upvote', select=c(score))
dfc.yinter_down = subset(dfce, action == 'Downvote', select=c(score))

plot1 <- ggplot(dfc, aes(x=wait, y=score, colour=action, group=action, fill=action, shape=action)) + 
    geom_errorbar(aes(ymin=score-ci, ymax=score+ci), colour="black", width=.2, position=pd) +
  geom_errorbar(aes(ymin=score-se, ymax=score+se, group=action), colour="red", width=.2, position=pd) +
    geom_point(size=3, position=pd)+
xlab("Time Elapsed before Treatment (minutes)") +
    ylab("Final Post Score") +
  
    scale_color_manual(values=c("#000000", "#000000", "#000000"))+
    ggtitle(expression(atop("Voting Effects on Final Post Score", atop(italic("Top 1% Removed"), "")))) +    
    
  geom_segment(aes(x = 1, y = dfc.yinter_nothing[[1]], xend = 7, yend = dfc.yinter_nothing[[1]]), size=.3, linetype=2, colour="#0000FF", alpha=.8) +
  geom_segment(aes(x = 1, y = dfc.yinter_up[[1]], xend = 7, yend = dfc.yinter_up[[1]]), size=.3, linetype=2, colour="#00FF00", alpha=.8) +
  geom_segment(aes(x = 1, y = dfc.yinter_down[[1]], xend = 7, yend = dfc.yinter_down[[1]]), size=.3, linetype=2, colour="#FF0000", alpha=.8) +
    theme_classic() +
  scale_shape_manual(values=c(25,21, 24)) +
  scale_fill_manual(values=c("#FF0000", "#0000FF", "#00FF00"))+
  theme(
    legend.position = "none"
  )

plot1


```

##Score deciles by wait time and action

```{r, echo=FALSE, warning=FALSE}

qdowns = list()
for ( i in sort( unique( downvoteData[[4]] ) ) ) { 
  qdowns[[toString(i)]] = quantile(down_data_bytime[[toString(i)]], seq(0, 1, by=.1))[2:10]
}
qdownsMat = do.call(rbind, qdowns) 
qups = list()
for ( i in sort( unique( upvoteData[[4]] ) ) ) { 
  qups[[toString(i)]] = quantile(up_data_bytime[[toString(i)]], seq(0, 1, by=.1))[2:10]
}
qupsMat = do.call(rbind, qups) 
qc = quantile(nothingData$ups-nothingData$downs, seq(0, 1, by=.1))[2:10]
qcontrol = list()
for ( i in sort( unique( upvoteData[[4]] ) ) ) { 
  qcontrol[[toString(i)]] = qc
}
qcontrolMat = do.call(rbind, qcontrol) 

par(mfrow=c(1,1)) # all plots on one page 
linetype <- c(2:10) 
plotchar <- c(24, 21, 25)
colors <- c("#008800", "#0000AA", "#990000")

  #scale_shape_manual(values=c(25,21, 24)) +
  #scale_fill_manual(values=c("#0000FF", "#00FF00"))+

waitLabel = c('0', '.5', '1', '5', '10', '30', '60')

for(i in ncol(qcontrolMat):1){
  
  #if(i!=5) { #we don't want the median, we've already plotted that.
    
    heading = paste(colnames(qcontrolMat)[i], " decile") 
    
    plot(range(1:7), y=range( min(qdownsMat[,i],qupsMat[,i],qcontrolMat[,i])-1, max(qupsMat[,i], qdownsMat[,i], qcontrolMat[,i])+1), type="n", xaxt='n', xlab="", ylab="", main=heading)
    
    axis(1, 1:7, labels=waitLabel)
    
    lines(y=qupsMat[,i], 1:7, type='b', col=colors[1], lty=linetype[1], lwd=1.5, pch=plotchar[1])
    lines(y=qcontrolMat[,i], 1:7, type='b', col=colors[2], lty=linetype[2], lwd=1.5, pch=plotchar[2])
    lines(y=qdownsMat[,i], 1:7, type='b', col=colors[3], lty=linetype[3], lwd=1.5, pch=plotchar[3])
 # }
}

```

##Total votes as function of action


```{r, echo=FALSE, warning=FALSE}


gp <- ggplot(post_dat, aes(x=factor(wait), y=ups+downs, color=action, group=interaction(action,wait) ))
gp +
     geom_boxplot() +scale_y_log10()

```


##Top 10 most frequently ocurring Subreddits Results


Summary Statistics:
```{r, echo=FALSE}


posts <- data.table(post_dat_elim[post_dat_elim$wait == 0,])

scores_by_subreddit = list()

top10 = posts[, (.N), by = subreddit][order(-V1)][seq(1,500)]

for( i in 1:nrow(top10)){
  data = post_dat_elim[post_dat_elim$subreddit == top10[i]$subreddit,]
  scores = data[[5]] - data[[6]]
  subreddits_scores_df = data.frame(subreddit = data$subreddit, score = scores, action = data$action, wait = data$wait)
  scores_by_subreddit[[i]] = subreddits_scores_df
}


top10summary = list()

for ( i in 1:nrow(top10) ){
  dataDf =  scores_by_subreddit[[i]]
  
  
  data = dataDf$score
  
  top10summary[[i]] <- summarySE(dataDf, measurevar="score", groupvars=c("action", "subreddit"))
  
}


y = vector()
z = vector()
s = vector()
iv = vector()
su = vector()

for ( i in 1:nrow(top10) ){
  x= top10summary[[i]]
  iv[i] = i;
  su[i] = x$N[1]+x$N[2]+x$N[3]
  y[i] = (x$score[3] - x$se[3]) - (x$score[2] + x$se[2])
  z[i] = (x$score[2] - x$se[2]) - (x$score[1] + x$se[1])
  s[i] = as.character(x$subreddit[2])
}

df = data.frame(iv, su, y, z, s) 


top = head(df,10)$s


#Create data frame for 10 most frequently occuring subreddits in dataset, removing POLITIC and friendsafari because posts in POLITIC are automatically submitted by a computer program, and because posts in friendsafari cannot be down-treated according to the subreddit rules. 
top10summaries = data.frame()
for (i in 1:10){
  data = top10summary[[i]]
  if ( (data$subreddit[1] %in% c("POLITIC", "friendsafari")) == FALSE ){
    top10summaries <- rbind(top10summaries,data)
  }
}

top10summaries

plot2 <- ggplot(top10summaries, aes(x=subreddit, y=score, colour=action, group=action, fill=action, shape=action)) + 
    geom_errorbar(aes(ymin=score-ci, ymax=score+ci), colour="black", width=.1, , position=position_dodge()) +
  geom_errorbar(aes(ymin=score-se, ymax=score+se), colour="red", width=.1) +
    geom_point(size=3)+
xlab("Subreddit") +
    ylab("Final Post Score") +
    scale_color_manual(values=c("#000000", "#000000", "#000000"))+
    ggtitle(expression(atop("Voting Effects on Final Post Score", atop(italic("Top 1% Removed"), "")))) +    
    theme_classic() +
  scale_shape_manual(values=c(25,21, 24)) +
  scale_fill_manual(values=c("#FF0000", "#0000FF", "#00FF00"))+
  #scale_x_discrete(breaks=c("Downvote", "Nothing", "Upvote"), labels=c("Down Vote", "Control", "Up Vote")) +
  theme(legend.position = "none", axis.text.x  = element_text(angle=45, hjust=1))

plot2