# set a specific working directory; uncomment to refer to the data files
#setwd("~/Desktop/R_STUFF")

# load performance-improvement libraries & enable the just-in-time compiler
library(compiler)
enableJIT(1)

# some environment options (number of printed decimals)
options(digits = 7)

# check if the required packages are there - if not, the script will install them!
requiredPackages = c('rcompanion', 'gdata', 'compiler', 'car', 'lsr', 'sft', 'nlme',
                     'lme4', 'bibtex', 'psych', 'likert', 'ggplot2', 'tidyverse')
for (p in requiredPackages) {
  if (!require(p, character.only = TRUE)) install.packages(p)
  library(p, character.only = TRUE)
}

# load all the libraries
lapply(requiredPackages, require, character.only = TRUE)
# !!!! NOTE: run this cell twice for decluttering

# Importing the data
data <- read.xls("data/TimeAnalysis2_1.xlsx", verbose = FALSE, na.strings = c("NA"))

# quick check on the data
head(data)

# first we'll look at the data: the histogram
hist(data$Time)

# However, we chose to use a log representation (lot = log of time) of our data to
# account for the skewed distribution
lot <- log(data$Time)
hist(lot)

# we perform a Shapiro-Wilk test to establish whether the data is parametric or not:
shapiro.test(data$Time)

# We used the log of the timing data for all subsequent analyses
T_log <- log(data$Time)

# In this case the Wilcoxon test does not care about log or not, as it uses ranks
# Wilcoxon test on the log time of the data
wilcox.test(T_log, data$Condition, paired = TRUE)

# We also looked at the timing differences by document
#wilcox.test(T_log, data$PDF, paired = TRUE)

# As well as differences in time with respect to the order
wilcox.test(T_log, data$Order, paired = TRUE)

# We'll have a look at the boxplots now:
# For timing data (logarithmic scale) by Condition (ML or non-ML)
boxplot(T_log ~ data$Condition,
        xlab = 'Condition 0=Manual, 1=Semi-Automated',
        ylab = "Time in Seconds (Log)",
        main = 'Overall timing by condition')
#boxplot(data$Time ~ data$Condition, xlab = 'Condition', ylab = "Time in Seconds",
#        names = c("N", "M"))

# also by looking at the scatterplot
scatterplot(T_log ~ Condition, data = data)

# We also performed a t-test (paired, because of our within-participants design) to
# check robustness
t.test(T_log, data$Condition, paired = TRUE, alternative = "two.sided")
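# A minimal sketch (assumption: each ID appears in both conditions, possibly with more
# than one document per condition). Averaging the log times per ID and Condition first
# makes the pairing explicit before a paired Wilcoxon signed-rank test; the column name
# logT here is purely illustrative.
agg  <- aggregate(list(logT = log(data$Time)),
                  by = list(ID = data$ID, Condition = data$Condition), FUN = mean)
wide <- reshape(agg, idvar = "ID", timevar = "Condition", direction = "wide")
wilcox.test(wide$logT.0, wide$logT.1, paired = TRUE)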
# before we carry on, let's have a look whether there is a difference in timing in
# terms of the documents. First the box plot:
boxplot(T_log ~ data$PDF)

# Also looking at the order here, first the box plot:
boxplot(data$Time ~ data$Order,
        xlab = "Order of the Documents",
        ylab = "Time in Seconds",
        main = "Overall Time by Order")

# Now the same plot in time (log scale) and separated by condition
boxplot(T_log ~ data$Order * data$Condition,
        xlab = '0.X = Manual conditions, 1.X = Semi-Automated conditions',
        ylab = "Time in Seconds (Log)",
        main = 'Order by Time Separated by Condition')

# Again, for robustness, we also performed a t-test:
t.test(T_log, data$Order, paired = TRUE, alternative = "two.sided")

# creating two groups (ML (1) and non-ML (0))
Annotations <- subset(data, Condition == '1')
NoAnnotations <- subset(data, Condition == '0')

# checking the data
head(Annotations)

cat("Time Overall: ", sum(data$Time), '\n')
cat("Overall Mean Time: ", mean(data$Time), '\n')
cat("Overall SD Time: ", sd(data$Time), '\n')
cat('_____________________________________________', '\n')
cat("Time in Annotations: ", sum(Annotations$Time),
    "Time in NoAnnotations: ", sum(NoAnnotations$Time), '\n')
cat("Mean Annotations: ", mean(Annotations$Time),
    "Mean NoAnnotations: ", mean(NoAnnotations$Time), '\n')
cat("SD Annotations: ", sd(Annotations$Time),
    "SD NoAnnotations: ", sd(NoAnnotations$Time), '\n')
cat('_____________________________________________', '\n')

# Quick look at the data
plotNormalHistogram(data$Time)

# checking the data transformation:
T_tuk = transformTukey(data$Time, plotit = FALSE)

# plotting the transformed data
plotNormalHistogram(T_tuk)

# Now using the log of the data and plotting it:
plotNormalHistogram(T_log)

# first add the log time to the data frame
data$logT <- log(data$Time)

# create the primary model: log(Time) with Condition as fixed effect and ID as random effect:
ml.p = lmer(logT ~ Condition + (1 | ID), data = data)
summary(ml.p)

# looking at coefficients by ID
coef(ml.p)$ID

# the mean values
coef(summary(ml.p))[, "Estimate"]

# establishing the confidence intervals:
confint(ml.p)

# exp(-0.28) is roughly 0.76, i.e. the semi-automated condition takes about 76% of the
# manual time - roughly a 24-25% speed-up
exp(-.28)

# What is the effect of the PDF, Order, Condition and Person?
ml.1 = lmer(logT ~ Order + Condition + (1 | ID) + (1 | PDF), data = data, REML = FALSE)
ml.2 = lmer(logT ~ Condition + (1 | ID) + (1 | PDF), data = data, REML = FALSE)
summary(ml.1)
summary(ml.2)

# Likelihood-ratio analysis (ANOVA) to see if the Order is important
anova(ml.1, ml.2)
summary(ml.1)

ml.3 = lmer(logT ~ Order + Condition + (1 | ID), data = data, REML = FALSE)

# again a likelihood-ratio analysis (ANOVA), now looking at the difference between ml.1 and ml.3:
anova(ml.1, ml.3)
coef(ml.3)
summary(ml.3)

# including the condition also as a random effect in model ml.3
ml.3 = lmer(logT ~ Order + Condition + (Condition | ID), data = data, REML = FALSE)
summary(ml.3)

# coefficients by ID
coef(ml.3)$ID

# histogram of the by-ID Condition coefficients:
hist(coef(ml.3)$ID[, "Condition"])

ml.4 = lmer(logT ~ Order + Condition + (Condition | ID), data = data, REML = FALSE)

# again a likelihood-ratio test (ANOVA) to see the differences between models:
anova(ml.4, ml.3)
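# Going back to the primary model ml.p: a minimal sketch (assumption: Condition was read
# in as a numeric 0/1 column, so its fixed effect is named "Condition") of turning the
# estimate and its confidence interval into a percentage change in raw time.
b  <- fixef(ml.p)["Condition"]
ci <- confint(ml.p, parm = "Condition")
100 * (1 - exp(b))   # point estimate of the % reduction in time under the ML condition
100 * (1 - exp(ci))  # interval for that reduction (bounds swap order due to the sign flip)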
# Do we need the Order if we have the condition as a random effect?
ml.5 = lmer(logT ~ Condition + (Condition | ID), data = data, REML = FALSE)

# Differences in models:
anova(ml.4, ml.5)

# examining model ml.4
summary(ml.4)

# examining model ml.5
summary(ml.5)

# Some descriptives
# TOTAL first (Annotations & NonAnnotations together)
cat('____________TOTAL DATA:______________________', '\n')
cat("Total Annotations added:", sum(data$TotalAdded), '\n')
cat("Mean Total Annotations added:", mean(data$TotalAdded), '\n')
cat("SD Total Annotations added:", sd(data$TotalAdded), '\n')
cat('_____________________________________________', '\n')
cat("Total Annotations deleted:", sum(data$TotalDeleted), '\n')
cat("Mean Total Annotations deleted:", mean(data$TotalDeleted), '\n')
cat("SD Total Annotations deleted:", sd(data$TotalDeleted), '\n')
cat('\n')

cat('____________MACHINE LEARNING:________________', '\n')
# Now only for the machine-learning condition
cat("Total Annotations added:", sum(Annotations$TotalAdded), '\n')
cat("Mean Total Annotations added:", mean(Annotations$TotalAdded), '\n')
cat("SD Total Annotations added:", sd(Annotations$TotalAdded), '\n')
cat('_____________________________________________', '\n')
cat("Total Annotations deleted:", sum(Annotations$TotalDeleted), '\n')
cat("Mean Total Annotations deleted:", mean(Annotations$TotalDeleted), '\n')
cat("SD Total Annotations deleted:", sd(Annotations$TotalDeleted), '\n')
cat('\n')

cat('____________NON-MACHINE LEARNING: ___________', '\n')
# And for the non-machine-learning condition
cat("Total Annotations added:", sum(NoAnnotations$TotalAdded), '\n')
cat("Mean Total Annotations added:", mean(NoAnnotations$TotalAdded), '\n')
cat("SD Total Annotations added:", sd(NoAnnotations$TotalAdded), '\n')
cat('_____________________________________________', '\n')
cat("Total Annotations deleted:", sum(NoAnnotations$TotalDeleted), '\n')
cat("Mean Total Annotations deleted:", mean(NoAnnotations$TotalDeleted), '\n')
cat("SD Total Annotations deleted:", sd(NoAnnotations$TotalDeleted), '\n')

# using a subset of the above main data here:
data_s <- read.xls("data/subset_selfreported.xlsx", verbose = FALSE, na.strings = c("NA"))

# and filtering out duplicates
data_s2 <- unique(data_s)

# quick check on the data
head(data_s2)

# Calculating descriptive statistics of the self-reported characteristics:
NumberOfReviews0 <- sum(data_s2$NReviews == '0')
NumberOfReviews1_5 <- sum(data_s2$NReviews == '1to5')
NumberOfReviews5_0 <- sum(data_s2$NReviews == '5to10')
NumberOfReviews10plus <- sum(data_s2$NReviews == '10+')
NumberOfReviews0
NumberOfReviews1_5
NumberOfReviews5_0
NumberOfReviews10plus

# percentages out of the 41 participants
NumberOfReviews0_percentage <- NumberOfReviews0 / 41
NumberOfReviews1_5_percentage <- NumberOfReviews1_5 / 41
NumberOfReviews5_0_percentage <- NumberOfReviews5_0 / 41
NumberOfReviews10plus_percentage <- NumberOfReviews10plus / 41
NumberOfReviews0_percentage
NumberOfReviews1_5_percentage
NumberOfReviews5_0_percentage
NumberOfReviews10plus_percentage

TotalSumOfPeopleWithCochraneExperience <- sum(data_s2$CExperience == '1')
TotalSumOfPeopleWithoutCochraneExperience <- sum(data_s2$CExperience == '0')
TotalSumOfPeopleWithCochraneExperience
TotalSumOfPeopleWithoutCochraneExperience

Percentage_TotalSumOfPeopleWithCochraneExperience <- TotalSumOfPeopleWithCochraneExperience / 41
Percentage_TotalSumOfPeopleWithoutCochraneExperience <- TotalSumOfPeopleWithoutCochraneExperience / 41
Percentage_TotalSumOfPeopleWithCochraneExperience
Percentage_TotalSumOfPeopleWithoutCochraneExperience

# median and quartiles of the number of tasks performed
MedianOfTasksPerfomed <- median(data_s2$Tasks)
IRQofTasksPerformed <- quantile(data_s2$Tasks)
MedianOfTasksPerfomed
IRQofTasksPerformed
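# A minimal sketch of the same descriptives without hard-coding the sample size of 41:
# table() gives the counts per category and prop.table() the corresponding proportions
# (assumption: data_s2 holds exactly one row per participant after unique()).
n_reviews <- factor(data_s2$NReviews, levels = c('0', '1to5', '5to10', '10+'))
table(n_reviews)
prop.table(table(n_reviews))
prop.table(table(data_s2$CExperience))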
# Importing the data
data_agreement <- read.xls("data/agreement.xlsx", verbose = FALSE, na.strings = c("NA"))

# creating two groups (ML (1) and non-ML (0))
ML <- subset(data_agreement, Condition == '1')
NoML <- subset(data_agreement, Condition == '0')

# checking how many data points are in the group:
count(ML)

# calculating the changed annotations and their proportions (out of 82 per annotation
# type, 328 = 82 * 4 overall):
RSG <- sum(ML$Changed)
cat("RSG changed:", RSG, "RSG %:", RSG / 82, '\n')
AC <- sum(ML$Changed.1)
cat("AC changed:", AC, "AC %:", AC / 82, '\n')
BPP <- sum(ML$Changed.2)
cat("BPP changed:", BPP, "BPP %:", BPP / 82, '\n')
BOA <- sum(ML$Changed.3)
cat("BOA changed:", BOA, "BOA %:", BOA / 82, '\n')
Overall <- RSG + AC + BPP + BOA
cat("Overall changed:", Overall, "Overall %:", Overall / 328, '\n')

# Overall mean annotations
data <- read.xls("data/TimeAnalysis2_1.xlsx", verbose = FALSE, na.strings = c("NA"))
data2 <- subset(data, Condition == '1')
data3 <- subset(data, Condition == '0')
mean(data2$TotalSubmitted)
mean(data3$TotalSubmitted)

##############
# annotations
data <- read.xls("data/annotationschanged.xlsx", verbose = FALSE, na.strings = c("NA"))
data2 <- subset(data, Condition == '1')
data3 <- subset(data, Condition == '0')

RSG1 <- sum(data2$Changed1 == '0')
RSG2 <- sum(data2$Changed1 == '1')
RSG3 <- sum(data2$Changed1 == '2')
RSG1 / 82
RSG2 / 82
RSG3 / 82

AC1 <- sum(data2$Changed2 == '0')
AC2 <- sum(data2$Changed2 == '1')
AC3 <- sum(data2$Changed2 == '2')
AC1 / 82
AC2 / 82
AC3 / 82

BPP1 <- sum(data2$Changed3 == '0')
BPP2 <- sum(data2$Changed3 == '1')
BPP3 <- sum(data2$Changed3 == '2')
BPP1 / 82
BPP2 / 82
BPP3 / 82

BOA1 <- sum(data2$Changed4 == '0')
BOA2 <- sum(data2$Changed4 == '1')
BOA3 <- sum(data2$Changed4 == '2')
BOA1 / 82
BOA2 / 82
BOA3 / 82

TotalUnchanged <- RSG1 + AC1 + BPP1 + BOA1
TotalUnchanged / 328
TotalChangedML <- RSG2 + AC2 + BPP2 + BOA2
TotalChangedML / 328
TotalChangedNoML <- RSG3 + AC3 + BPP3 + BOA3
TotalChangedNoML / 328

# loading the UX questionnaire data:
dataq <- read.xls("data/UXData1.xlsx", verbose = FALSE, na.strings = c("NA"))

# subsetting
data_rel <- as.data.frame(dataq)
ss2 <- c(1, 3, 4, 25:44)
data_q2 <- subset(data_rel, select = ss2)

# checking the data
head(data_q2)

ss3 <- c(1, 3, 8:23)
clean_q2 <- subset(data_q2, select = ss3)
head(clean_q2)

# splitting by condition (A = with annotations, NOA = without annotations)
clean_q2_A <- subset(clean_q2, Condition == 'A')
clean_q2_NOA <- subset(clean_q2, Condition == 'NOA')
head(clean_q2_A)
str(clean_q2_A)  # need to change the item columns to factors!
# convert the 16 Likert items to ordered factors on the 1-5 scale
likert_items <- c("UseFrequently", "Complex", "EasyToUse", "NeedSupport", "WellIntegrated",
                  "Inconsistency", "UseQuickly", "Cumbersome", "Confident", "NeededLearn",
                  "TextHelpful", "DifficultToNav", "ImproveQuality", "Irrelevant",
                  "Confused", "ContinueUse")
clean_q2_A[likert_items] <- lapply(clean_q2_A[likert_items], factor,
                                   levels = c("1", "2", "3", "4", "5"), ordered = TRUE)

# check again
str(clean_q2_A)
summary(clean_q2_A)

# need to remove the 'out of level' columns
ss4 <- c(3:18)
final_data <- subset(clean_q2_A, select = ss4)
summary(final_data)

results <- likert(final_data)

# Legend: 5 = Strongly agree, 1 = Strongly disagree
plot(results, type = 'bar')

# Alternative heatmap graph
plot(results, type = "heat", low.color = "white", high.color = "blue",
     text.color = "black", text.size = 4, wrap = 50)

# checking the distributions
plot(results, type = "density", facet = TRUE, bw = 0.5)

# subset of columns used for the SUS calculation
r1 <- c(1, 3:12)
sus <- subset(clean_q2_A, select = r1)
head(sus)
sus

# pre-computed SUS scores
sus_ready <- read.csv("data/sus_calculation.csv")
head(sus_ready)

g <- c(2:11)
ggplot(data = sus_ready, aes(x = Participant, y = SUS.Score, group = 1)) +
  geom_point()
mean(sus_ready$SUS.Score)
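# A minimal sketch (assumption: the ten items in `sus`, columns 2-11, are the standard
# SUS questions in order, odd items positively worded and even items negatively worded,
# answered on a 1-5 scale) of how the scores in sus_calculation.csv could be reproduced
# with the usual SUS formula.
items <- data.matrix(sus[, 2:11])              # ordered factors become their 1-5 codes
odd_part  <- items[, c(1, 3, 5, 7, 9)] - 1     # positive items contribute (response - 1)
even_part <- 5 - items[, c(2, 4, 6, 8, 10)]    # negative items contribute (5 - response)
sus_scores <- rowSums(cbind(odd_part, even_part)) * 2.5   # scaled to the 0-100 SUS range
sus_scores
mean(sus_scores)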