# set a specific working directory; uncomment to refer to the data files
#setwd("~/Desktop/R_STUFF")

# load performance-improvement libraries & enable the just-in-time compiler
library(compiler)
enableJIT(1)

# some environment options (number of printed decimals)
options(digits = 7)

# check if the required packages are there - if not, the script will install them!
requiredPackages = c('rcompanion', 'gdata', 'compiler', 'car', 'lsr', 'sft', 'nlme',
                     'lme4', 'bibtex', 'psych', 'likert', 'ggplot2', 'tidyverse')
for (p in requiredPackages) {
  if (!require(p, character.only = TRUE)) install.packages(p)
  library(p, character.only = TRUE)
}

# load all the libraries
lapply(requiredPackages, require, character.only = TRUE)
# !!!! NOTE: run this cell twice for decluttering

# Importing the data
data <- read.xls("data/TimeAnalysis2_1.xlsx", verbose = FALSE, na.strings = c("NA"))

# quick check on the data
head(data)

# first we'll look at the data: the histogram
hist(data$Time)

# However, we chose to use a log representation (lot = log of time) of our data to
# account for the skewed distribution
lot <- log(data$Time)
hist(lot)

# we perform a Shapiro-Wilk test to establish whether the data is parametric or not:
shapiro.test(data$Time)

# We used the log of the timing data for all subsequent analyses
T_log <- log(data$Time)

# In this case the Wilcoxon test does not care about log or not, as it uses ranks
# Wilcoxon test on the log time of the data
wilcox.test(T_log, data$Condition, paired = TRUE)

# We also looked at the timing differences by document
#wilcox.test(T_log, data$PDF, paired = TRUE)

# As well as differences in time with respect to the order
wilcox.test(T_log, data$Order, paired = TRUE)

# We'll have a look at the boxplots now:
# For timing data (logarithmic scale) by Condition (ML or non-ML)
boxplot(T_log ~ data$Condition,
        xlab = 'Condition 0=Manual, 1=Semi-Automated',
        ylab = "Time in Seconds (Log)",
        main = 'Overall timing by condition')
#boxplot(data$Time ~ data$Condition, xlab = 'Condition', ylab = "Time in Seconds",
#        names = c("N", "M"))

# also by looking at the scatterplot
scatterplot(T_log ~ Condition, data = data)

# We also performed a t-test (paired, because of our within-participants design) to
# check robustness
t.test(T_log, data$Condition, paired = TRUE, alternative = "two.sided")
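# A minimal sketch (assumption: each ID appears in both conditions, possibly with more
# than one document per condition). Averaging the log times per ID and Condition first
# makes the pairing explicit before a paired Wilcoxon signed-rank test; the column name
# logT here is purely illustrative.
agg  <- aggregate(list(logT = log(data$Time)),
                  by = list(ID = data$ID, Condition = data$Condition), FUN = mean)
wide <- reshape(agg, idvar = "ID", timevar = "Condition", direction = "wide")
wilcox.test(wide$logT.0, wide$logT.1, paired = TRUE)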
# before we carry on, let's have a look whether there is a difference in timing in
# terms of the documents. First the box plot:
boxplot(T_log ~ data$PDF)

# Also looking at the order here, first the box plot:
boxplot(data$Time ~ data$Order,
        xlab = "Order of the Documents",
        ylab = "Time in Seconds",
        main = "Overall Time by Order")

# Now the same plot in time (log scale) and separated by condition
boxplot(T_log ~ data$Order * data$Condition,
        xlab = '0.X = Manual conditions, 1.X = Semi-Automated conditions',
        ylab = "Time in Seconds (Log)",
        main = 'Order by Time Separated by Condition')

# Again, for robustness, we also performed a t-test:
t.test(T_log, data$Order, paired = TRUE, alternative = "two.sided")

# creating two groups (ML (1) and non-ML (0))
Annotations <- subset(data, Condition == '1')
NoAnnotations <- subset(data, Condition == '0')

# checking the data
head(Annotations)

cat("Time Overall: ", sum(data$Time), '\n')
cat("Overall Mean Time: ", mean(data$Time), '\n')
cat("Overall SD Time: ", sd(data$Time), '\n')
cat('_____________________________________________', '\n')
cat("Time in Annotations: ", sum(Annotations$Time),
    "Time in NoAnnotations: ", sum(NoAnnotations$Time), '\n')
cat("Mean Annotations: ", mean(Annotations$Time),
    "Mean NoAnnotations: ", mean(NoAnnotations$Time), '\n')
cat("SD Annotations: ", sd(Annotations$Time),
    "SD NoAnnotations: ", sd(NoAnnotations$Time), '\n')
cat('_____________________________________________', '\n')

# Quick look at the data
plotNormalHistogram(data$Time)

# checking the data transformation:
T_tuk = transformTukey(data$Time, plotit = FALSE)

# plotting the transformed data
plotNormalHistogram(T_tuk)

# Now using the log of the data and plotting it:
plotNormalHistogram(T_log)

# first add the log time to the data frame
data$logT <- log(data$Time)

# create the primary model: log(Time) with Condition as fixed effect and ID as random effect:
ml.p = lmer(logT ~ Condition + (1 | ID), data = data)
summary(ml.p)

# looking at coefficients by ID
coef(ml.p)$ID

# the mean values
coef(summary(ml.p))[, "Estimate"]

# establishing the confidence intervals:
confint(ml.p)

# exp(-0.28) is roughly 0.76, i.e. the semi-automated condition takes about 76% of the
# manual time - roughly a 24-25% speed-up
exp(-.28)

# What is the effect of the PDF, Order, Condition and Person?
ml.1 = lmer(logT ~ Order + Condition + (1 | ID) + (1 | PDF), data = data, REML = FALSE)
ml.2 = lmer(logT ~ Condition + (1 | ID) + (1 | PDF), data = data, REML = FALSE)
summary(ml.1)
summary(ml.2)

# Likelihood-ratio analysis (ANOVA) to see if the Order is important
anova(ml.1, ml.2)
summary(ml.1)

ml.3 = lmer(logT ~ Order + Condition + (1 | ID), data = data, REML = FALSE)

# again a likelihood-ratio analysis (ANOVA), now looking at the difference between ml.1 and ml.3:
anova(ml.1, ml.3)
coef(ml.3)
summary(ml.3)

# including the condition also as a random effect in model ml.3
ml.3 = lmer(logT ~ Order + Condition + (Condition | ID), data = data, REML = FALSE)
summary(ml.3)

# coefficients by ID
coef(ml.3)$ID

# histogram of the by-ID Condition coefficients:
hist(coef(ml.3)$ID[, "Condition"])

ml.4 = lmer(logT ~ Order + Condition + (Condition | ID), data = data, REML = FALSE)

# again a likelihood-ratio test (ANOVA) to see the differences between models:
anova(ml.4, ml.3)
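# Going back to the primary model ml.p: a minimal sketch (assumption: Condition was read
# in as a numeric 0/1 column, so its fixed effect is named "Condition") of turning the
# estimate and its confidence interval into a percentage change in raw time.
b  <- fixef(ml.p)["Condition"]
ci <- confint(ml.p, parm = "Condition")
100 * (1 - exp(b))   # point estimate of the % reduction in time under the ML condition
100 * (1 - exp(ci))  # interval for that reduction (bounds swap order due to the sign flip)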
# Do we need the Order if we have the condition as a random effect?
ml.5 = lmer(logT ~ Condition + (Condition | ID), data = data, REML = FALSE)

# Differences in models:
anova(ml.4, ml.5)

# examining model ml.4
summary(ml.4)

# examining model ml.5
summary(ml.5)

# Some descriptives
# TOTAL first (Annotations & NonAnnotations together)
cat('____________TOTAL DATA:______________________', '\n')
cat("Total Annotations added:", sum(data$TotalAdded), '\n')
cat("Mean Total Annotations added:", mean(data$TotalAdded), '\n')
cat("SD Total Annotations added:", sd(data$TotalAdded), '\n')
cat('_____________________________________________', '\n')
cat("Total Annotations deleted:", sum(data$TotalDeleted), '\n')
cat("Mean Total Annotations deleted:", mean(data$TotalDeleted), '\n')
cat("SD Total Annotations deleted:", sd(data$TotalDeleted), '\n')
cat('\n')

cat('____________MACHINE LEARNING:________________', '\n')
# Now only for the machine-learning condition
cat("Total Annotations added:", sum(Annotations$TotalAdded), '\n')
cat("Mean Total Annotations added:", mean(Annotations$TotalAdded), '\n')
cat("SD Total Annotations added:", sd(Annotations$TotalAdded), '\n')
cat('_____________________________________________', '\n')
cat("Total Annotations deleted:", sum(Annotations$TotalDeleted), '\n')
cat("Mean Total Annotations deleted:", mean(Annotations$TotalDeleted), '\n')
cat("SD Total Annotations deleted:", sd(Annotations$TotalDeleted), '\n')
cat('\n')

cat('____________NON-MACHINE LEARNING: ___________', '\n')
# And for the non-machine-learning condition
cat("Total Annotations added:", sum(NoAnnotations$TotalAdded), '\n')
cat("Mean Total Annotations added:", mean(NoAnnotations$TotalAdded), '\n')
cat("SD Total Annotations added:", sd(NoAnnotations$TotalAdded), '\n')
cat('_____________________________________________', '\n')
cat("Total Annotations deleted:", sum(NoAnnotations$TotalDeleted), '\n')
cat("Mean Total Annotations deleted:", mean(NoAnnotations$TotalDeleted), '\n')
cat("SD Total Annotations deleted:", sd(NoAnnotations$TotalDeleted), '\n')

# using a subset of the above main data here:
data_s <- read.xls("data/subset_selfreported.xlsx", verbose = FALSE, na.strings = c("NA"))

# and filtering out duplicates
data_s2 <- unique(data_s)

# quick check on the data
head(data_s2)

# Calculating descriptive statistics of the self-reported characteristics:
NumberOfReviews0 <- sum(data_s2$NReviews == '0')
NumberOfReviews1_5 <- sum(data_s2$NReviews == '1to5')
NumberOfReviews5_0 <- sum(data_s2$NReviews == '5to10')
NumberOfReviews10plus <- sum(data_s2$NReviews == '10+')
NumberOfReviews0
NumberOfReviews1_5
NumberOfReviews5_0
NumberOfReviews10plus

# percentages out of the 41 participants
NumberOfReviews0_percentage <- NumberOfReviews0 / 41
NumberOfReviews1_5_percentage <- NumberOfReviews1_5 / 41
NumberOfReviews5_0_percentage <- NumberOfReviews5_0 / 41
NumberOfReviews10plus_percentage <- NumberOfReviews10plus / 41
NumberOfReviews0_percentage
NumberOfReviews1_5_percentage
NumberOfReviews5_0_percentage
NumberOfReviews10plus_percentage

TotalSumOfPeopleWithCochraneExperience <- sum(data_s2$CExperience == '1')
TotalSumOfPeopleWithoutCochraneExperience <- sum(data_s2$CExperience == '0')
TotalSumOfPeopleWithCochraneExperience
TotalSumOfPeopleWithoutCochraneExperience

Percentage_TotalSumOfPeopleWithCochraneExperience <- TotalSumOfPeopleWithCochraneExperience / 41
Percentage_TotalSumOfPeopleWithoutCochraneExperience <- TotalSumOfPeopleWithoutCochraneExperience / 41
Percentage_TotalSumOfPeopleWithCochraneExperience
Percentage_TotalSumOfPeopleWithoutCochraneExperience

# median and quartiles of the number of tasks performed
MedianOfTasksPerfomed <- median(data_s2$Tasks)
IRQofTasksPerformed <- quantile(data_s2$Tasks)
MedianOfTasksPerfomed
IRQofTasksPerformed
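# A minimal sketch of the same descriptives without hard-coding the sample size of 41:
# table() gives the counts per category and prop.table() the corresponding proportions
# (assumption: data_s2 holds exactly one row per participant after unique()).
n_reviews <- factor(data_s2$NReviews, levels = c('0', '1to5', '5to10', '10+'))
table(n_reviews)
prop.table(table(n_reviews))
prop.table(table(data_s2$CExperience))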
# Importing the data
data_agreement <- read.xls("data/agreement.xlsx", verbose = FALSE, na.strings = c("NA"))

# creating two groups (ML (1) and non-ML (0))
ML <- subset(data_agreement, Condition == '1')
NoML <- subset(data_agreement, Condition == '0')

# checking how many data points are in the group:
count(ML)

# calculating the changed annotations and their proportions (out of 82 per annotation
# type, 328 = 82 * 4 overall):
RSG <- sum(ML$Changed)
cat("RSG changed:", RSG, "RSG %:", RSG / 82, '\n')
AC <- sum(ML$Changed.1)
cat("AC changed:", AC, "AC %:", AC / 82, '\n')
BPP <- sum(ML$Changed.2)
cat("BPP changed:", BPP, "BPP %:", BPP / 82, '\n')
BOA <- sum(ML$Changed.3)
cat("BOA changed:", BOA, "BOA %:", BOA / 82, '\n')
Overall <- RSG + AC + BPP + BOA
cat("Overall changed:", Overall, "Overall %:", Overall / 328, '\n')

# Overall mean annotations
data <- read.xls("data/TimeAnalysis2_1.xlsx", verbose = FALSE, na.strings = c("NA"))
data2 <- subset(data, Condition == '1')
data3 <- subset(data, Condition == '0')
mean(data2$TotalSubmitted)
mean(data3$TotalSubmitted)

##############
# annotations
data <- read.xls("data/annotationschanged.xlsx", verbose = FALSE, na.strings = c("NA"))
data2 <- subset(data, Condition == '1')
data3 <- subset(data, Condition == '0')

RSG1 <- sum(data2$Changed1 == '0')
RSG2 <- sum(data2$Changed1 == '1')
RSG3 <- sum(data2$Changed1 == '2')
RSG1 / 82
RSG2 / 82
RSG3 / 82

AC1 <- sum(data2$Changed2 == '0')
AC2 <- sum(data2$Changed2 == '1')
AC3 <- sum(data2$Changed2 == '2')
AC1 / 82
AC2 / 82
AC3 / 82

BPP1 <- sum(data2$Changed3 == '0')
BPP2 <- sum(data2$Changed3 == '1')
BPP3 <- sum(data2$Changed3 == '2')
BPP1 / 82
BPP2 / 82
BPP3 / 82

BOA1 <- sum(data2$Changed4 == '0')
BOA2 <- sum(data2$Changed4 == '1')
BOA3 <- sum(data2$Changed4 == '2')
BOA1 / 82
BOA2 / 82
BOA3 / 82

TotalUnchanged <- RSG1 + AC1 + BPP1 + BOA1
TotalUnchanged / 328
TotalChangedML <- RSG2 + AC2 + BPP2 + BOA2
TotalChangedML / 328
TotalChangedNoML <- RSG3 + AC3 + BPP3 + BOA3
TotalChangedNoML / 328

# loading the UX questionnaire data:
dataq <- read.xls("data/UXData1.xlsx", verbose = FALSE, na.strings = c("NA"))

# subsetting
data_rel <- as.data.frame(dataq)
ss2 <- c(1, 3, 4, 25:44)
data_q2 <- subset(data_rel, select = ss2)

# checking the data
head(data_q2)

ss3 <- c(1, 3, 8:23)
clean_q2 <- subset(data_q2, select = ss3)
head(clean_q2)

# splitting by condition (A = with annotations, NOA = without annotations)
clean_q2_A <- subset(clean_q2, Condition == 'A')
clean_q2_NOA <- subset(clean_q2, Condition == 'NOA')
head(clean_q2_A)
str(clean_q2_A)  # need to change the item columns to factors!
# convert the 16 Likert items to ordered factors on the 1-5 scale
likert_items <- c("UseFrequently", "Complex", "EasyToUse", "NeedSupport", "WellIntegrated",
                  "Inconsistency", "UseQuickly", "Cumbersome", "Confident", "NeededLearn",
                  "TextHelpful", "DifficultToNav", "ImproveQuality", "Irrelevant",
                  "Confused", "ContinueUse")
clean_q2_A[likert_items] <- lapply(clean_q2_A[likert_items], factor,
                                   levels = c("1", "2", "3", "4", "5"), ordered = TRUE)

# check again
str(clean_q2_A)
summary(clean_q2_A)

# need to remove the 'out of level' columns
ss4 <- c(3:18)
final_data <- subset(clean_q2_A, select = ss4)
summary(final_data)

results <- likert(final_data)

# Legend: 5 = Strongly agree, 1 = Strongly disagree
plot(results, type = 'bar')

# Alternative heatmap graph
plot(results, type = "heat", low.color = "white", high.color = "blue",
     text.color = "black", text.size = 4, wrap = 50)

# checking the distributions
plot(results, type = "density", facet = TRUE, bw = 0.5)

# subset of columns used for the SUS calculation
r1 <- c(1, 3:12)
sus <- subset(clean_q2_A, select = r1)
head(sus)
sus

# pre-computed SUS scores
sus_ready <- read.csv("data/sus_calculation.csv")
head(sus_ready)

g <- c(2:11)
ggplot(data = sus_ready, aes(x = Participant, y = SUS.Score, group = 1)) +
  geom_point()
mean(sus_ready$SUS.Score)
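# A minimal sketch (assumption: the ten items in `sus`, columns 2-11, are the standard
# SUS questions in order, odd items positively worded and even items negatively worded,
# answered on a 1-5 scale) of how the scores in sus_calculation.csv could be reproduced
# with the usual SUS formula.
items <- data.matrix(sus[, 2:11])              # ordered factors become their 1-5 codes
odd_part  <- items[, c(1, 3, 5, 7, 9)] - 1     # positive items contribute (response - 1)
even_part <- 5 - items[, c(2, 4, 6, 8, 10)]    # negative items contribute (5 - response)
sus_scores <- rowSums(cbind(odd_part, even_part)) * 2.5   # scaled to the 0-100 SUS range
sus_scores
mean(sus_scores)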