--- title: "Reddit Post Experiment" author: "Tim Weninger, Thomas J. Johnston, Maria Glenski" date: "2/6/2015" output: html_document --- ###Reddit Post Experiment dataset can be found here: ###The R-Markdown code can be found here: These are the statistics generated from the reddit data. # Basic dataset statistics: ```{r, echo=FALSE, message=FALSE} setwd("/data/tweninge/reddit_pnas") post_dat = read.csv("reddit_post_manipulation_data.csv", header = TRUE, sep="\t") post_dat$score = post_dat$ups - post_dat$downs library("ggplot2", lib.loc="~/R/x86_64-pc-linux-gnu-library/3.0") library(grid) library("moments", lib.loc="~/R/x86_64-pc-linux-gnu-library/3.0") library("fitdistrplus", lib.loc="~/R/x86_64-pc-linux-gnu-library/3.0") library(data.table) ``` Number of posts, N= ```{r, echo=FALSE} n = nrow(post_dat) n ``` Control: ```{r , echo=FALSE} nothingHits = post_dat$action == 'Nothing' nothingData = post_dat[nothingHits,] count_control = nrow(nothingData) count_control ``` Positive Treatment: ```{r, echo=FALSE} upvoteHits = post_dat$action == 'Upvote' upvoteData = post_dat[upvoteHits,] count_upvote = nrow(upvoteData) count_upvote ``` Negative Treatment: ```{r, echo=FALSE} downvoteHits = post_dat$action == 'Downvote' downvoteData = post_dat[downvoteHits,] count_downvote = nrow(downvoteData) count_downvote ``` ## Post score (ups-downs) summary statistics All scores: ```{r, echo=FALSE} nothingScore = nothingData$score upvoteScore = upvoteData$score downvoteScore = downvoteData$score allScore = c(t(nothingScore), t(upvoteScore), t(downvoteScore)) summary(allScore) ``` Control Scores: ```{r, echo=FALSE} summary(nothingScore) ``` Positive Treatment Scores (without adjustment): ```{r, echo=FALSE} summary(upvoteScore) ``` Negative Treatment Scores (without adjustment): ```{r, echo=FALSE} summary(downvoteScore) ``` Boxplot Comparison: ```{r, echo=FALSE} boxplot(list(nothingScore, upvoteScore, downvoteScore), names = list("Control", "Positive", "Negative")) ``` Boxplot Comparison (log): ```{r, echo=FALSE} lns = nothingScore #eliminate nothing scores <= 0 lns = lns[lns>0] lus = upvoteScore #eliminate upvote scores <= 0 lus = lus[lus>0] lds = downvoteScore #eliminate downvote scores <= 0 lds = lds[lds>0] sk_nothing <- skewness(log(lns)) sk_upvote <- skewness(log(lus)) sk_downvote <- skewness(log(lds)) print("Skewness (log) by Treatment:") paste("Control = ", sk_nothing) paste("Positive = ", sk_upvote) paste("Negative = ", sk_downvote) kur_nothing <- kurtosis(log(lns)) kur_upvote <- kurtosis(log(lus)) kur_downvote <- kurtosis(log(lds)) print("Kurtosis (log) by Treatment:") paste("Control = ", kur_nothing) paste("Positive = ", kur_upvote) paste("Negative = ", kur_downvote) print("Boxplot Comparison (log) of Scores by Treatment") boxplot(list( log(lns), log(lus), log(lds) ), names = list("Control", "Positive", "Negative")) ``` Line graph Comparison: ```{r, echo=FALSE} plot( rev(sort(nothingScore)), type = "l", ylim = c(min(nothingScore), max(nothingScore)), ylab = "value") lines(rev(sort(upvoteScore)), col = "blue") lines(rev(sort(downvoteScore)), col = "red") ``` Outlier Elimination: ```{r outlier elimination, echo=FALSE, message=FALSE, warning=FALSE} require(gridExtra) ## Summarizes data. ## Gives count, mean, standard deviation, standard error of the mean, and confidence interval (default 95%). ## data: a data frame. ## measurevar: the name of a column that contains the variable to be summariezed ## groupvars: a vector containing names of columns that contain grouping variables ## na.rm: a boolean that indicates whether to ignore NA's ## conf.interval: the percent range of the confidence interval (default is 95%) summarySE <- function(data=NULL, measurevar, groupvars=NULL, na.rm=FALSE, conf.interval=.95, .drop=TRUE) { require(plyr) # New version of length which can handle NA's: if na.rm==T, don't count them length2 <- function (x, na.rm=FALSE) { if (na.rm) sum(!is.na(x)) else length(x) } # This does the summary. For each group's data frame, return a vector with # N, mean, and sd datac <- ddply(data, groupvars, .drop=.drop, .fun = function(xx, col) { c(N = length2(xx[[col]], na.rm=na.rm), mean = mean (xx[[col]], na.rm=na.rm), sd = sd (xx[[col]], na.rm=na.rm) ) }, measurevar ) # Rename the "mean" column datac <- rename(datac, c("mean" = measurevar)) datac$se <- datac$sd / sqrt(datac$N) # Calculate standard error of the mean # Confidence interval multiplier for standard error # Calculate t-statistic for confidence interval: # e.g., if conf.interval is .95, use .975 (above/below), and use df=N-1 ciMult <- qt(conf.interval/2 + .5, datac$N-1) datac$ci <- datac$se * ciMult return(datac) } sk_post_dat <- skewness(post_dat$score) kur_post_dat <- kurtosis(post_dat$score) paste("Skewness for all scores = ", sk_post_dat) paste("Kurtosis for all scores = ", kur_post_dat) q1 = quantile(post_dat$score, probs = c(0.0, 0.99), na.rm=TRUE)[1] q3 = quantile(post_dat$score, probs = c(0.0, 0.99), na.rm=TRUE)[2] post_dat_elim = post_dat[post_dat$score>=q1,] post_dat_elim = post_dat_elim[post_dat_elim$score<=q3,] sk_post_dat_elim <- skewness(post_dat_elim$score) kur_post_dat_elim <- kurtosis(post_dat_elim$score) paste("Skewness for all scores (with outliers eliminated) = ", sk_post_dat_elim) paste("Kurtosis for all scores (with outliers eliminated) = ", kur_post_dat_elim) median_post_dat_nothing <- median(post_dat[post_dat$action=='Nothing',]$score) median_post_dat_upvote <- median(post_dat[post_dat$action=='Upvote',]$score) median_post_dat_downvote <- median(post_dat[post_dat$action=='Downvote',]$score) dfc <- summarySE(post_dat, measurevar="score", groupvars=c("action")) dfce <- summarySE(post_dat_elim, measurevar="score", groupvars=c("action")) plot1 <- ggplot(dfc, aes(x=action, y=score, colour=action, group=action, fill=action, shape=action)) + geom_errorbar(aes(ymin=score-ci, ymax=score+ci), colour="black", width=.1) + geom_errorbar(aes(ymin=score-se, ymax=score+se), colour="red", width=.1) + geom_point(size=3)+ xlab("Treatment") + ylab("Final Post Score") + scale_x_discrete(breaks=c("Downvote", "Nothing", "Upvote"), labels=c("Down Vote", "Control", "Up Vote")) + scale_color_manual(values=c("#000000", "#000000", "#000000"))+ ggtitle("Voting Effects on Final Post Score") + scale_y_continuous( breaks=0:20*2) + # Set tick every 4 theme_classic() + scale_shape_manual(values=c(25,21, 24)) + scale_fill_manual(values=c("#FF0000", "#0000FF", "#00FF00"))+ theme( legend.position = "none" ) plot2 <- ggplot(dfce, aes(x=action, y=score, colour=action, group=action, fill=action, shape=action)) + geom_errorbar(aes(ymin=score-ci, ymax=score+ci), colour="black", width=.1) + geom_errorbar(aes(ymin=score-se, ymax=score+se), colour="red", width=.1) + geom_point(size=3)+ xlab("Treatment") + ylab("Final Post Score") + scale_color_manual(values=c("#000000", "#000000", "#000000"))+ ggtitle(expression(atop("Voting Effects on Final Post Score", atop(italic("Top 1% Removed"), "")))) + theme_classic() + scale_shape_manual(values=c(25,21, 24)) + scale_fill_manual(values=c("#FF0000", "#0000FF", "#00FF00"))+ scale_x_discrete(breaks=c("Downvote", "Nothing", "Upvote"), labels=c("Down Vote", "Control", "Up Vote")) + theme( legend.position = "none" ) plot1 n = nrow(post_dat[post_dat$action=='Nothing',]) probs = vector() #prob of reaching the front page >1000 votes for ( i in seq(from=500,to=2000,by=20) ){ probs[i] <-nrow(post_dat[post_dat$score>=i & post_dat$action=='Nothing',])/n } probs<-probs[!is.na(probs)] n = nrow(post_dat[post_dat$action=='Upvote',]) probs_u = vector() #prob of reaching the front page >1000 votes for ( i in seq(from=500,to=2000,by=20) ){ probs_u[i] <-nrow(post_dat[post_dat$score>=i & post_dat$action=='Upvote',])/n } probs_u<-probs_u[!is.na(probs_u)] n = nrow(post_dat[post_dat$action=='Downvote',]) probs_d = vector() #prob of reaching the front page >1000 votes for ( i in seq(from=500,to=2000,by=20) ){ probs_d[i] <-nrow(post_dat[post_dat$score>=i & post_dat$action=='Nothing',])/n } probs_d<-probs_d[!is.na(probs_d)] df <- data.frame(probs, probs_u, probs_d) colnames(df) <- c("Control", "Upvote", "Downvote") n = nrow(post_dat[post_dat$action=='Nothing',]) probs = vector() #prob of reaching the front page >1000 votes for ( i in seq(from=1,to=3000/50) ){ probs[i] = nrow(post_dat[post_dat$score>=((i)*50-100) & post_dat$action=='Nothing',])/n } probs=probs[!is.na(probs)] n = nrow(post_dat[post_dat$action=='Upvote',]) probs_u = vector() #prob of reaching the front page >1000 votes for ( i in seq(from=1,to=3000/50) ){ probs_u[i] = nrow(post_dat[post_dat$score>=((i)*50-100) & post_dat$action=='Upvote',])/n } probs_u<-probs_u[!is.na(probs_u)] n = nrow(post_dat[post_dat$action=='Downvote',]) probs_d = vector() #prob of reaching the front page >1000 votes for ( i in seq(from=1,to=3000/50) ){ probs_d[i] = nrow(post_dat[post_dat$score>=((i)*50-100) & post_dat$action=='Downvote',])/n } probs_d<-probs_d[!is.na(probs_d)] df2 <- data.frame(probs, probs_u, probs_d) colnames(df2) <- c("Control", "Upvote", "Downvote") main <- ggplot(df, aes(seq(from=500,to=2000,by=20))) + geom_line(aes(y=Upvote, colour="Upvote"), linetype="dotdash", size=1.2) + geom_line(aes(y=Downvote, colour="Downvote"), linetype="dashed", size=1.2, alpha=1) + geom_line(aes(y=Control, colour="Control"), linetype="solid", size=1.2, alpha=0.8) + xlab("Final Post Score") + ggtitle("Final Score Probability by Vote Effect") + ylab("Probability (log scale)") + theme_classic() + scale_colour_manual("", breaks = c("Upvote", "Downvote", "Control"), values = c("#0000AA", "#990000", "#008800")) + scale_y_log10(breaks=c(.003, .005, .01)) + theme(legend.justification=c(1,1), legend.position=c(1,1)) sub <- ggplot(df2, aes(seq(from=-50,to=2900,by=50))) + geom_line(aes(y=Upvote), colour="#008800", linetype="dotdash", size=1.1) + geom_line(aes(y=Downvote), colour="#990000", linetype="dashed", size=1.1, alpha=1) + geom_line(aes(y=Control), colour="#0000AA", linetype="solid", size=1.1, alpha=0.7) + xlab("Final Post Score") + ggtitle("") + ylab("Probability (log scale)") + theme(panel.grid.minor = element_blank(), panel.grid.major = element_blank(), panel.background = element_blank(), plot.background = element_blank()) + scale_y_log10(breaks=c(.001, .01, .1, 1)) + geom_rect(aes(xmin = 500, xmax = 2000, ymin = .001, ymax = .015), color="black", alpha=0.001) print(main) vp <- viewport(width = .7, height = .7, x=.07, y=.07, just=c("left","bottom")) pushViewport(vp) print(sub,vp=vp) ``` ## Distribution Test ```{r, echo=FALSE} qqnorm(nothingScore, main="Control Score QQ-Plot"); qqline(nothingScore) qqnorm(upvoteScore, main="Positive Treatment Score QQ-Plot"); qqline(upvoteScore) qqnorm(downvoteScore, main="Negative Treatment Score QQ-Plot"); qqline(downvoteScore) ``` Shapiro Test Control: ```{r, echo=FALSE} shapiro.test(sample(nothingScore,5000)) ``` Shapiro Test Positive Treatment: ```{r, echo=FALSE} shapiro.test(sample(upvoteScore,5000)) ``` Shapiro Test Negative Treatment: ```{r, echo=FALSE} shapiro.test(sample(downvoteScore,5000)) ``` KS Test Control: ```{r, echo=FALSE, warning=FALSE} ks.test(nothingScore, "pnorm") ``` KS Test Positive Treatment: ```{r, echo=FALSE, warning=FALSE} ks.test(upvoteScore, "pnorm") ``` KS Test Negative Treatment: ```{r, echo=FALSE, warning=FALSE} ks.test(downvoteScore, "pnorm") ``` ```{r, echo=FALSE} hist((lns), 100, col="black", main="Control Score Histogram", xlab="Score") hist((lus), 100, col="black", main="Positive Treatment Score Histogram", xlab="Score") hist((lds), 100, col="black", main="Negative Treatment Histogram", xlab="Score") ``` Fit to distribution: ```{r, echo=FALSE} #Fit each treatment to Normal Distribution print("Normal:") print("Control") fit1 <- fitdist((lns), "norm") paste("AIC: ", fit1$aic) paste("BIC: ", fit1$bic) paste("LL: ", fit1$loglik) print("Positive") fit1 <- fitdist((lus), "norm") paste("AIC: ", fit1$aic) paste("BIC: ", fit1$bic) paste("LL: ", fit1$loglik) print("Negative") fit1 <- fitdist((lds), "norm") paste("AIC: ", fit1$aic) paste("BIC: ", fit1$bic) paste("LL: ", fit1$loglik) print("Exponential:") print("Control") fit1 <- fitdist((lns), "exp") paste("AIC: ", fit1$aic) paste("BIC: ", fit1$bic) paste("LL: ", fit1$loglik) print("Positive") fit1 <- fitdist((lus), "exp") paste("AIC: ", fit1$aic) paste("BIC: ", fit1$bic) paste("LL: ", fit1$loglik) print("Negative") fit1 <- fitdist((lds), "exp") paste("AIC: ", fit1$aic) paste("BIC: ", fit1$bic) paste("LL: ", fit1$loglik) print("Pareto:") print("Control") fit1 <- fitdist(log(lns), "exp") paste("AIC: ", fit1$aic) paste("BIC: ", fit1$bic) paste("LL: ", fit1$loglik) print("Positive") fit1 <- fitdist(log(lus), "exp") paste("AIC: ", fit1$aic) paste("BIC: ", fit1$bic) paste("LL: ", fit1$loglik) print("Negative") fit1 <- fitdist(log(lds), "exp") paste("AIC: ", fit1$aic) paste("BIC: ", fit1$bic) paste("LL: ", fit1$loglik) ``` Fit to Pareto: ```{r, echo=FALSE} pwrdist <- function(u,...) { # u is vector of event counts, e.g. how many # crimes was a given perpetrator charged for by the police fx <- table(u) i <- as.numeric(names(fx)) y <- rep(0,max(i)) y[i] <- fx m0 <- glm(y~log(1:max(i)),family=quasipoisson()) print(summary(m0)) sub <- paste("s=",round(m0$coef[2],2),"lambda=",sum(u)/length(u)) plot(i,fx,log="xy",xlab="x",sub=sub,ylab="votes",...) lines(1:max(i),(fitted(m0)),type="l") return(m0) } pwrdist(lns) pwrdist(lds) pwrdist(lus) ``` #Comparison ```{r, echo=FALSE, warning=FALSE} ups_nothing_kstest = ks.test(upvoteScore, nothingScore) paste("KS Test -- Positive Treatment vs Control p=", ups_nothing_kstest$p.value) downs_nothing_kstest = ks.test(downvoteScore, nothingScore) paste("KS Test -- Positive Treatment vs Control p=", ups_nothing_kstest$p.value) sanity_kstest = ks.test(sample(nothingScore, 500), sample(nothingScore, 500)) paste("Sanity Check -- KS Test -- Control vs Control p=", sanity_kstest$p.value) ``` ```{r, echo=FALSE, warning=FALSE} ups_nothing_ranksum = wilcox.test(upvoteScore, nothingScore) paste("Wilcox Test -- Positive Treatment vs Control p=", ups_nothing_ranksum$p.value) ups_nothing_ranksum = wilcox.test(downvoteScore, nothingScore) paste("Wilcox Test -- Negative Treatment vs Control p=", ups_nothing_ranksum$p.value) sanity_wilcoxtest = wilcox.test(sample(nothingScore, 500), sample(nothingScore, 500)) paste("Sanity Check -- Wilcox Test -- Control vs Control p=", sanity_wilcoxtest$p.value) ``` ```{r, echo=FALSE} ups_nothing_t = t.test(log(lus), log(lns)) paste("T Test -- Log Positive Treatment vs Log Control p=", ups_nothing_t$p.value) ups_nothing_t = t.test(log(lds), log(lns)) paste("T Test -- Log Negative Treatment vs Log Control p=", ups_nothing_t$p.value) sanity_ttest = t.test(sample(log(lns), 500), sample(log(lns), 500)) paste("Sanity Check -- T Test -- Log Control vs Log Control p=", sanity_ttest$p.value) ``` ```{r, echo=FALSE, warning=FALSE} up_data_bytime = list() for ( i in sort( unique( upvoteData$wait ) ) ) { #[[4]] upvoteHits = upvoteData$wait == i tmp = upvoteData[upvoteHits,] up_data_bytime[[toString(i)]] <- tmp[,5]-tmp[,6] } down_data_bytime = list() for ( i in sort( unique( downvoteData$wait ) ) ) { downvoteHits = downvoteData$wait == i tmp = downvoteData[downvoteHits,] down_data_bytime[[toString(i)]] = tmp[,5]-tmp[,6] } up_data_kstest_bytime = list() for ( i in sort( unique( upvoteData$wait ) ) ) { up_data_kstest_bytime[[toString(i)]] = ks.test(up_data_bytime[[toString(i)]], nothingScore) print(paste("KS Test -- Positive Treatment vs Control at ", i, " sec - D = ", up_data_kstest_bytime[[toString(i)]]$statistic , "p =", up_data_kstest_bytime[[toString(i)]]$p.value)) } down_data_kstest_bytime = list() for ( i in sort( unique( downvoteData[[4]] ) ) ) { down_data_kstest_bytime[[toString(i)]] = ks.test(down_data_bytime[[toString(i)]], nothingScore) print(paste("KS Test -- Nevative Treatment vs Control at ", i, " sec - D = ", down_data_kstest_bytime[[toString(i)]]$statistic , "p =", down_data_kstest_bytime[[toString(i)]]$p.value)) } up_data_ranksum_bytime = list() for ( i in sort( unique( upvoteData[[4]] ) ) ) { up_data_ranksum_bytime[[toString(i)]] = wilcox.test(up_data_bytime[[toString(i)]], nothingScore) print(paste("Wilcox Test -- Positive Treatment vs Control at ", i, " sec - p =", up_data_ranksum_bytime[[toString(i)]]$p.value)) } down_data_ranksum_bytime = list() for ( i in sort( unique( downvoteData[[4]] ) ) ) { down_data_ranksum_bytime[[toString(i)]] = wilcox.test(down_data_bytime[[toString(i)]], nothingScore); print(paste("Wilcox Test -- Negative Treatment vs Control at ", i, " sec - p =", down_data_ranksum_bytime[[toString(i)]]$p.value)) } up_data_ttest_bytime = list() for ( i in sort( unique( upvoteData[[4]] ) ) ) { udbt = up_data_bytime[[toString(i)]] udbt = udbt[udbt>0] up_data_ttest_bytime[[toString(i)]] = t.test(log(udbt), log(lns)) print(paste("T Test -- Log Positive Treatment vs Log Control at ", i, " sec - p =", up_data_ttest_bytime[[toString(i)]]$p.value)) } down_data_ttest_bytime = list() for ( i in sort( unique( downvoteData[[4]] ) ) ) { ddbt = down_data_bytime[[toString(i)]] ddbt = ddbt[ddbt>0] down_data_ttest_bytime[[toString(i)]] = t.test(log(ddbt), log(lns)) print(paste("T Test -- Log Negative Treatment vs Log Control at ", i, " sec - p =", down_data_ttest_bytime[[toString(i)]]$p.value)) } up_summary_data_bytime = list() for ( i in sort( unique( upvoteData[[4]] ) ) ) { up_summary_data_bytime[[toString(i)]] = summary( up_data_bytime[[toString(i)]] ); } down_summary_data_bytime = list() for ( i in sort( unique( downvoteData[[4]] ) ) ) { down_summary_data_bytime[[toString(i)]] = summary( down_data_bytime[[toString(i)]] ); } gp <- ggplot(post_dat, aes(x=factor(wait), y=ups-downs, color=action, group=interaction(action,wait) )) gp + geom_boxplot() +scale_y_log10() dfc <- summarySE(post_dat_elim, measurevar="score", groupvars=c("action", "wait")) dfc$wait <- dfc$wait/60 dfc$wait <- as.factor(as.character(dfc$wait)) dfc$wait = factor(dfc$wait,levels(dfc$wait)[c(1,2,3,6,4,5,7)]) pd <- position_dodge(.3) dfc.yinter_nothing = subset(dfc, action == 'Nothing', select=c(score)) dfc.yinter_up = subset(dfce, action == 'Upvote', select=c(score)) dfc.yinter_down = subset(dfce, action == 'Downvote', select=c(score)) plot1 <- ggplot(dfc, aes(x=wait, y=score, colour=action, group=action, fill=action, shape=action)) + geom_errorbar(aes(ymin=score-ci, ymax=score+ci), colour="black", width=.2, position=pd) + geom_errorbar(aes(ymin=score-se, ymax=score+se, group=action), colour="red", width=.2, position=pd) + geom_point(size=3, position=pd)+ xlab("Time Elapsed before Treatment (minutes)") + ylab("Final Post Score") + scale_color_manual(values=c("#000000", "#000000", "#000000"))+ ggtitle(expression(atop("Voting Effects on Final Post Score", atop(italic("Top 1% Removed"), "")))) + geom_segment(aes(x = 1, y = dfc.yinter_nothing[[1]], xend = 7, yend = dfc.yinter_nothing[[1]]), size=.3, linetype=2, colour="#0000FF", alpha=.8) + geom_segment(aes(x = 1, y = dfc.yinter_up[[1]], xend = 7, yend = dfc.yinter_up[[1]]), size=.3, linetype=2, colour="#00FF00", alpha=.8) + geom_segment(aes(x = 1, y = dfc.yinter_down[[1]], xend = 7, yend = dfc.yinter_down[[1]]), size=.3, linetype=2, colour="#FF0000", alpha=.8) + theme_classic() + scale_shape_manual(values=c(25,21, 24)) + scale_fill_manual(values=c("#FF0000", "#0000FF", "#00FF00"))+ theme( legend.position = "none" ) plot1 ``` ##Score deciles by wait time and action ```{r, echo=FALSE, warning=FALSE} qdowns = list() for ( i in sort( unique( downvoteData[[4]] ) ) ) { qdowns[[toString(i)]] = quantile(down_data_bytime[[toString(i)]], seq(0, 1, by=.1))[2:10] } qdownsMat = do.call(rbind, qdowns) qups = list() for ( i in sort( unique( upvoteData[[4]] ) ) ) { qups[[toString(i)]] = quantile(up_data_bytime[[toString(i)]], seq(0, 1, by=.1))[2:10] } qupsMat = do.call(rbind, qups) qc = quantile(nothingData$ups-nothingData$downs, seq(0, 1, by=.1))[2:10] qcontrol = list() for ( i in sort( unique( upvoteData[[4]] ) ) ) { qcontrol[[toString(i)]] = qc } qcontrolMat = do.call(rbind, qcontrol) par(mfrow=c(1,1)) # all plots on one page linetype <- c(2:10) plotchar <- c(24, 21, 25) colors <- c("#008800", "#0000AA", "#990000") #scale_shape_manual(values=c(25,21, 24)) + #scale_fill_manual(values=c("#0000FF", "#00FF00"))+ waitLabel = c('0', '.5', '1', '5', '10', '30', '60') for(i in ncol(qcontrolMat):1){ #if(i!=5) { #we don't want the median, we've already plotted that. heading = paste(colnames(qcontrolMat)[i], " decile") plot(range(1:7), y=range( min(qdownsMat[,i],qupsMat[,i],qcontrolMat[,i])-1, max(qupsMat[,i], qdownsMat[,i], qcontrolMat[,i])+1), type="n", xaxt='n', xlab="", ylab="", main=heading) axis(1, 1:7, labels=waitLabel) lines(y=qupsMat[,i], 1:7, type='b', col=colors[1], lty=linetype[1], lwd=1.5, pch=plotchar[1]) lines(y=qcontrolMat[,i], 1:7, type='b', col=colors[2], lty=linetype[2], lwd=1.5, pch=plotchar[2]) lines(y=qdownsMat[,i], 1:7, type='b', col=colors[3], lty=linetype[3], lwd=1.5, pch=plotchar[3]) # } } ``` ##Total votes as function of action ```{r, echo=FALSE, warning=FALSE} gp <- ggplot(post_dat, aes(x=factor(wait), y=ups+downs, color=action, group=interaction(action,wait) )) gp + geom_boxplot() +scale_y_log10() ``` ##Top 10 most frequently ocurring Subreddits Results Summary Statistics: ```{r, echo=FALSE} posts <- data.table(post_dat_elim[post_dat_elim$wait == 0,]) scores_by_subreddit = list() top10 = posts[, (.N), by = subreddit][order(-V1)][seq(1,500)] for( i in 1:nrow(top10)){ data = post_dat_elim[post_dat_elim$subreddit == top10[i]$subreddit,] scores = data[[5]] - data[[6]] subreddits_scores_df = data.frame(subreddit = data$subreddit, score = scores, action = data$action, wait = data$wait) scores_by_subreddit[[i]] = subreddits_scores_df } top10summary = list() for ( i in 1:nrow(top10) ){ dataDf = scores_by_subreddit[[i]] data = dataDf$score top10summary[[i]] <- summarySE(dataDf, measurevar="score", groupvars=c("action", "subreddit")) } y = vector() z = vector() s = vector() iv = vector() su = vector() for ( i in 1:nrow(top10) ){ x= top10summary[[i]] iv[i] = i; su[i] = x$N[1]+x$N[2]+x$N[3] y[i] = (x$score[3] - x$se[3]) - (x$score[2] + x$se[2]) z[i] = (x$score[2] - x$se[2]) - (x$score[1] + x$se[1]) s[i] = as.character(x$subreddit[2]) } df = data.frame(iv, su, y, z, s) top = head(df,10)$s #Create data frame for 10 most frequently occuring subreddits in dataset, removing POLITIC and friendsafari because posts in POLITIC are automatically submitted by a computer program, and because posts in friendsafari cannot be down-treated according to the subreddit rules. top10summaries = data.frame() for (i in 1:10){ data = top10summary[[i]] if ( (data$subreddit[1] %in% c("POLITIC", "friendsafari")) == FALSE ){ top10summaries <- rbind(top10summaries,data) } } top10summaries plot2 <- ggplot(top10summaries, aes(x=subreddit, y=score, colour=action, group=action, fill=action, shape=action)) + geom_errorbar(aes(ymin=score-ci, ymax=score+ci), colour="black", width=.1, , position=position_dodge()) + geom_errorbar(aes(ymin=score-se, ymax=score+se), colour="red", width=.1) + geom_point(size=3)+ xlab("Subreddit") + ylab("Final Post Score") + scale_color_manual(values=c("#000000", "#000000", "#000000"))+ ggtitle(expression(atop("Voting Effects on Final Post Score", atop(italic("Top 1% Removed"), "")))) + theme_classic() + scale_shape_manual(values=c(25,21, 24)) + scale_fill_manual(values=c("#FF0000", "#0000FF", "#00FF00"))+ #scale_x_discrete(breaks=c("Downvote", "Nothing", "Upvote"), labels=c("Down Vote", "Control", "Up Vote")) + theme(legend.position = "none", axis.text.x = element_text(angle=45, hjust=1)) plot2