This document will walk through the steps for parsing and cleaning flow cytometry spreadsheets. The result will be a single R dataframe containing all the data, with standardized column names and formatting.
Required Files:
SIG_WNV_Histology_Data_Cleaning.ipynb
): [Download here]WNV_Data_Dictionary.xlsx
): [Download here]** Note: this notebook can also be downloaded as an R script (only the code blocks seen below will be included): [Download R script here]
Required R Packages:
All code is available on GitHub: https://github.com/biodev/SIG
If you are not familiar with Jupyter Notebooks, I've created a short tutorial to get you up and running quickly. There is also plenty of documentation online:
Sometimes, a bit of manual cleaning of the spreadsheets is necessary for the parsing function to work correctly.
library(gdata)
gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED. gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED. Attaching package: ‘gdata’ The following object is masked from ‘package:stats’: nobs The following object is masked from ‘package:utils’: object.size
## Set data directories
cleaned_dir = '~/Documents/SIG/WNV/Cleaned_Data_Releases/15-Jan-2016'
hist_dir = '~/Documents/SIG/WNV/Histology/fixed_files'
## Load column names
hist_cn = read.xls(xls="./data/WNV_Data_Dictionary.xlsx", sheet="Histology Data", as.is=T)
hist_cn = hist_cn[,1]
## Load cleaned weight data
gale_weight = read.xls(file.path(cleaned_dir, 'Gale_Weight_13-Jan-2016_final.xlsx'), header=T,
as.is=T, na.strings=c(""," ", "NA", "na", "#DIV/0!"))
## Load previously cleaned histology data
hist_prev = read.xls(file.path(cleaned_dir, 'Gale_Histology_5-Jan-2016_final.xlsx'), header=T,
as.is=T, na.strings=c(""," ", "NA", "na", "#DIV/0!"))
## Get files to be processed
hist_files = list.files(hist_dir, pattern=".*\\.xlsx")
print(hist_files)
[1] "Copy of 15-H943_fixedMM.xlsx" [2] "Copy of Copy of Comparative Pathology Program Consult Request 15-H1246 imaging_fixedMM.xlsx" [3] "Copy of Copy of Comparative Pathology Program Consult Request 15-H1393_fixedMM.xlsx" [4] "Copy of Copy of Comparative Pathology Program Consult Request 15-H1508_fixedMM.xlsx" [5] "Copy of Copy of Gale CC mice CPP request for prefilled 16-H94_fixedMM.xlsx" [6] "Copy of histo 15-H792 results_fixedMM.xlsx" [7] "New scoring 08 19 14_fixedMM.xlsx" [8] "Results 15-H501+15-H502 06 10 15_fixedMM.xlsx" [9] "Results 15-H585 06 17 15_fixedMM.xlsx"
## Read all files and combine into a single dataframe
print(hist_files[1])
hist_data_v2 = read.xls(file.path(hist_dir, hist_files[1]), header=T,
as.is=T, na.strings=c(""," ", "NA", "na", "#DIV/0!"))
colnames(hist_data_v2) = hist_cn[c(9, c(13:35))]
for (file in hist_files[2:length(hist_files)]) {
print(file)
hist_data = read.xls(file.path(hist_dir, file), header=T,
as.is=T, na.strings=c(""," ", "NA", "na", "#DIV/0!"))
colnames(hist_data) = hist_cn[c(9, c(13:35))]
hist_data_v2 = rbind(hist_data_v2, hist_data)
}
[1] "Copy of 15-H943_fixedMM.xlsx" [1] "Copy of Copy of Comparative Pathology Program Consult Request 15-H1246 imaging_fixedMM.xlsx" [1] "Copy of Copy of Comparative Pathology Program Consult Request 15-H1393_fixedMM.xlsx" [1] "Copy of Copy of Comparative Pathology Program Consult Request 15-H1508_fixedMM.xlsx" [1] "Copy of Copy of Gale CC mice CPP request for prefilled 16-H94_fixedMM.xlsx" [1] "Copy of histo 15-H792 results_fixedMM.xlsx" [1] "New scoring 08 19 14_fixedMM.xlsx" [1] "Results 15-H501+15-H502 06 10 15_fixedMM.xlsx" [1] "Results 15-H585 06 17 15_fixedMM.xlsx"
## Check dimensions of dataframe
dim(hist_data_v2)
## Create missing columns
for (cn in setdiff(hist_cn, colnames(hist_data_v2))) {
print(cn)
hist_data_v2[,cn] = NA
}
## Order columns
hist_data_v2 = hist_data_v2[, hist_cn]
[1] "ID" [1] "Mating" [1] "RIX_ID" [1] "UW_Line" [1] "UWID" [1] "Virus" [1] "Tissue" [1] "Timepoint" [1] "Lab" [1] "Data_Altered" [1] "Notes" [1] "GI_Lesions"
## Remove any leading or trailing spaces from slide_label
hist_data_v2$slide_label = trim(hist_data_v2$slide_label)
## Update UW Line
hist_data_v2$UW_Line = sapply(hist_data_v2$slide_label, function(x){unlist(strsplit(x, " "))[1]})
hist_data_v2$UW_Line = as.numeric(hist_data_v2$UW_Line)
Warning message: In eval(expr, envir, enclos): NAs introduced by coercion
## Update UWID
hist_data_v2$UWID = sapply(hist_data_v2$slide_label, function(x){unlist(strsplit(x, " "))[2]})
hist_data_v2$UWID = gsub("m", "M", hist_data_v2$UWID)
## Update Lab
hist_data_v2$Lab = 'Gale'
## Update time points
hist_data_v2$Timepoint = sapply(hist_data_v2$UWID, function(x){unlist(strsplit(x, "\\."))[2]})
## Use time points to update virus
hist_data_v2$Virus = 'WNV'
hist_data_v2$Virus[grepl("M", hist_data_v2$Timepoint)] = 'Mock'
hist_data_v2$Virus[grepl("m", hist_data_v2$Timepoint)] = 'Mock'
## Update time points
hist_data_v2$Timepoint = gsub("M", "", hist_data_v2$Timepoint)
hist_data_v2$Timepoint = gsub("m", "", hist_data_v2$Timepoint)
hist_data_v2$Timepoint = as.numeric(hist_data_v2$Timepoint)
## Update Mating and RIX_ID
for (i in 1:dim(hist_data_v2)[1]) {
line = hist_data_v2$UW_Line[i]
if (!is.na(line)) {
mating = gale_weight$Mating[with(gale_weight, UW_Line==hist_data_v2$UW_Line[i])]
if (length(mating)>0) {
hist_data_v2$Mating[i] = mating[1]
rix_id = gale_weight$RIX_ID[with(gale_weight, UW_Line==hist_data_v2$UW_Line[i] & UWID==hist_data_v2$UWID[i])]
if (length(rix_id)>0) {
if (length(rix_id)>1) {
## If more than one ID matches the UWID, print the IDs
print(paste0("line=", mating[1], "; rix_id=", rix_id))
}
hist_data_v2$RIX_ID[i] = rix_id[1]
}
}
}
}
[1] "line=3260x1566; rix_id=2" "line=3260x1566; rix_id=15"
## Check Matings and IDs of the discrepancies
## Make manual corrections if necessary
hist_data_v2[hist_data_v2$Mating=='3260x1566' & !is.na(hist_data_v2$Mating), 1:12]
ID | Mating | RIX_ID | UW_Line | UWID | Virus | Tissue | Timepoint | slide_label | Lab | Data_Altered | Notes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
273 | NA | 3260x1566 | 2 | 100 | 2.12 | WNV | NA | 12 | 100 2.12 | Gale | NA | NA |
274 | NA | 3260x1566 | 8 | 100 | 1.12M | Mock | NA | 12 | 100 1.12M | Gale | NA | NA |
275 | NA | 3260x1566 | 9 | 100 | 2.12M | Mock | NA | 12 | 100 2.12M | Gale | NA | NA |
276 | NA | 3260x1566 | 16 | 100 | 3.12M | Mock | NA | 12 | 100 3.12M | Gale | NA | NA |
331 | NA | 3260x1566 | NA | 100 | 1.4 | WNV | NA | 4 | 100 1.4 | Gale | NA | NA |
332 | NA | 3260x1566 | NA | 100 | 2.4 | WNV | NA | 4 | 100 2.4 | Gale | NA | NA |
333 | NA | 3260x1566 | NA | 100 | 3.4 | WNV | NA | 4 | 100 3.4 | Gale | NA | NA |
364 | NA | 3260x1566 | NA | 100 | 1.7 | WNV | NA | 7 | 100 1.7 | Gale | NA | NA |
365 | NA | 3260x1566 | NA | 100 | 2.7 | WNV | NA | 7 | 100 2.7 | Gale | NA | NA |
366 | NA | 3260x1566 | NA | 100 | 3.7 | WNV | NA | 7 | 100 3.7 | Gale | NA | NA |
## Update ID
hist_data_v2$ID = paste(hist_data_v2$Mating, hist_data_v2$RIX_ID, sep="_")
hist_data_v2$ID[grepl("NA", hist_data_v2$ID)] = NA
## Update tissue
hist_data_v2$Tissue = 'Brain'
for (i in 1:length(hist_data_v2$slide_label)) {
label = hist_data_v2$slide_label[i]
if (grepl("Br", label)) {hist_data_v2$Tissue[i] = 'Brain'}
if (grepl("LI", label)) {hist_data_v2$Tissue[i] = 'Large Intestine'}
if (grepl("SI", label)) {hist_data_v2$Tissue[i] = 'Small Intestine'}
if (grepl("Stom", label)) {hist_data_v2$Tissue[i] = 'Stomach'}
if (grepl("[Cc]ec", label)) {hist_data_v2$Tissue[i] = 'Cecum'}
}
## Update GI_Lesions
hist_data_v2$GI_Lesions = NA
for (i in 1:length(hist_data_v2$Tissue)) {
tissue = hist_data_v2$Tissue[i]
if (tissue != 'Brain') {
hist_data_v2$GI_Lesions[i] = hist_data_v2$Cortex_PV_Inf[i]
hist_data_v2$Cortex_PV_Inf[i] = NA
}
}
hist_data_v2$Cortex_PV_Inf = as.numeric(hist_data_v2$Cortex_PV_Inf)
hist_data_v2$GI_Lesions[hist_data_v2$GI_Lesions == 'NSL'] = 0
table(hist_data_v2$GI_Lesions)
0 1 98 22
## Fix STI sample
hist_data_v2[grepl("STI", hist_data_v2$slide_label),]
ID | Mating | RIX_ID | UW_Line | UWID | Virus | Tissue | Timepoint | slide_label | Lab | Data_Altered | Notes | Cortex_PV_Inf | Cortex_P_Inf | Cortex_hm | Cortex_nn | Cortex_Subtotal | Hippocampus_PV_Inf | Hippocampus_P_Inf | Hippocampus_hm | Hippocampus_nn | Hippocampus_Subtotal | Thalamus_Midbrain_PV_Inf | Thalamus_Midbrain_P_Inf | Thalamus_Midbrain_hm | Thalamus_Midbrain_nn | Thalamus_Midbrain_Subtotal | Cerebellum_PV_Inf | Cerebellum_P_Inf | Cerebellum_hm | Cerebellum_nn | Cerebellum_Subtotal | Meningitis | Total | Comment | GI_Lesions | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
397 | NA | NA | NA | NA | Mock | WNV | Brain | NA | STI Mock K | Gale | NA | NA | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | KIDNEY- no signifincat lesions; mild artifacts MALE | NA |
398 | NA | NA | NA | NA | 3.6 | WNV | Brain | 6 | STI 3.6 K | Gale | NA | NA | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | NA | NA |
## Fix STI sample
hist_data_v2$UWID[397] = NA
hist_data_v2$Virus[397] = 'Mock'
hist_data_v2$Tissue[397] = 'Kidney'
hist_data_v2$Tissue[398] = 'Kidney'
## Check for duplicates
sum(duplicated(hist_data_v2$ID[!is.na(hist_data_v2$ID)]))
## Duplicated IDs
hist_data_v2$ID[duplicated(hist_data_v2$ID) & !is.na(hist_data_v2$ID)]
cortex_cols = c('Cortex_PV_Inf','Cortex_P_Inf','Cortex_hm','Cortex_nn')
hippocampus_cols = c('Hippocampus_PV_Inf','Hippocampus_P_Inf','Hippocampus_hm','Hippocampus_nn')
thalamus_cols = c('Thalamus_Midbrain_PV_Inf','Thalamus_Midbrain_P_Inf','Thalamus_Midbrain_hm','Thalamus_Midbrain_nn')
cerebellum_cols = c('Cerebellum_PV_Inf','Cerebellum_P_Inf','Cerebellum_hm','Cerebellum_nn')
subtotal_cols = c('Cortex_Subtotal','Hippocampus_Subtotal','Thalamus_Midbrain_Subtotal','Cerebellum_Subtotal','Meningitis')
## Fill in zeros if not NA
for (i in 1:dim(hist_data_v2)[1]) {
if (!is.na(hist_data_v2[i, 'Cortex_Subtotal']) & hist_data_v2[i, 'Cortex_Subtotal'] == 0) {
hist_data_v2[i,cortex_cols] = c(0,0,0,0)
}
if (!is.na(hist_data_v2[i, 'Hippocampus_Subtotal']) & hist_data_v2[i, 'Hippocampus_Subtotal'] == 0) {
hist_data_v2[i,hippocampus_cols] = c(0,0,0,0)
}
if (!is.na(hist_data_v2[i, 'Thalamus_Midbrain_Subtotal']) & hist_data_v2[i, 'Thalamus_Midbrain_Subtotal'] == 0) {
hist_data_v2[i,thalamus_cols] = c(0,0,0,0)
}
if (!is.na(hist_data_v2[i, 'Cerebellum_Subtotal']) & hist_data_v2[i, 'Cerebellum_Subtotal'] == 0) {
hist_data_v2[i,cerebellum_cols] = c(0,0,0,0)
}
if (!is.na(hist_data_v2[i, 'Total']) & hist_data_v2[i, 'Total'] == 0) {
hist_data_v2[i,subtotal_cols] = c(0,0,0,0,0)
}
}
hist_data_v2$Cortex_Subtotal = apply(hist_data_v2[,cortex_cols], 1, function(x){sum(x)})
hist_data_v2$Hippocampus_Subtotal = apply(hist_data_v2[,hippocampus_cols], 1, function(x){sum(x)})
hist_data_v2$Thalamus_Midbrain_Subtotal = apply(hist_data_v2[,thalamus_cols], 1, function(x){sum(x)})
hist_data_v2$Cerebellum_Subtotal = apply(hist_data_v2[,cerebellum_cols], 1, function(x){sum(x)})
hist_data_v2$Total = apply(hist_data_v2[,subtotal_cols], 1, function(x){sum(x)})
head(hist_data_v2, 10)
ID | Mating | RIX_ID | UW_Line | UWID | Virus | Tissue | Timepoint | slide_label | Lab | Data_Altered | Notes | Cortex_PV_Inf | Cortex_P_Inf | Cortex_hm | Cortex_nn | Cortex_Subtotal | Hippocampus_PV_Inf | Hippocampus_P_Inf | Hippocampus_hm | Hippocampus_nn | Hippocampus_Subtotal | Thalamus_Midbrain_PV_Inf | Thalamus_Midbrain_P_Inf | Thalamus_Midbrain_hm | Thalamus_Midbrain_nn | Thalamus_Midbrain_Subtotal | Cerebellum_PV_Inf | Cerebellum_P_Inf | Cerebellum_hm | Cerebellum_nn | Cerebellum_Subtotal | Meningitis | Total | Comment | GI_Lesions | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 3154x16012_37 | 3154x16012 | 37 | 83 | 1.7 | WNV | Brain | 7 | 083 1.7 | Gale | NA | NA | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | NA | NA |
2 | 3154x16012_38 | 3154x16012 | 38 | 83 | 2.7 | WNV | Brain | 7 | 083 2.7 | Gale | NA | NA | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | NA | NA |
3 | 3154x16012_45 | 3154x16012 | 45 | 83 | 3.7 | WNV | Brain | 7 | 083 3.7 | Gale | NA | NA | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | NA | NA |
4 | 16072x5346_61 | 16072x5346 | 61 | 81 | 1.12 | WNV | Brain | 12 | 081 1.12 | Gale | NA | NA | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | NA | NA |
5 | 16072x5346_62 | 16072x5346 | 62 | 81 | 2.12 | WNV | Brain | 12 | 081 2.12 | Gale | NA | NA | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | NA | NA |
6 | 16072x5346_68 | 16072x5346 | 68 | 81 | 3.12 | WNV | Brain | 12 | 081 3.12 | Gale | NA | NA | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | NA | NA |
7 | 16072x5346_63 | 16072x5346 | 63 | 81 | 1.12M | Mock | Brain | 12 | 081 1.12M | Gale | NA | NA | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | NA | NA |
8 | 16072x5346_70 | 16072x5346 | 70 | 81 | 2.12M | Mock | Brain | 12 | 081 2.12M | Gale | NA | NA | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | NA | NA |
9 | 16072x5346_71 | 16072x5346 | 71 | 81 | 3.12M | Mock | Brain | 12 | 081 3.12M | Gale | NA | NA | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | NA | NA |
10 | 5358x8046_71 | 5358x8046 | 71 | 73 | 1.12 | WNV | Brain | 12 | 073 1.12 | Gale | NA | NA | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | NA | NA |
## Check for mocks with scores > 0
hist_data_v2[!is.na(hist_data_v2$Total) & hist_data_v2$Total != 0 & hist_data_v2$Virus=='Mock',]
ID | Mating | RIX_ID | UW_Line | UWID | Virus | Tissue | Timepoint | slide_label | Lab | Data_Altered | Notes | Cortex_PV_Inf | Cortex_P_Inf | Cortex_hm | Cortex_nn | Cortex_Subtotal | Hippocampus_PV_Inf | Hippocampus_P_Inf | Hippocampus_hm | Hippocampus_nn | Hippocampus_Subtotal | Thalamus_Midbrain_PV_Inf | Thalamus_Midbrain_P_Inf | Thalamus_Midbrain_hm | Thalamus_Midbrain_nn | Thalamus_Midbrain_Subtotal | Cerebellum_PV_Inf | Cerebellum_P_Inf | Cerebellum_hm | Cerebellum_nn | Cerebellum_Subtotal | Meningitis | Total | Comment | GI_Lesions |
---|
## Get IDs duplicated in new data
dup_ids = intersect(hist_data_v2$ID, hist_prev$ID)[!is.na(intersect(hist_data_v2$ID, hist_prev$ID))]
dup_ids
hist_data_v2[hist_data_v2$ID %in% dup_ids,]
hist_prev[hist_prev$ID %in% dup_ids,]
dim(hist_prev)
hist_prev = hist_prev[!hist_prev$ID %in% dup_ids,]
dim(hist_prev)
for (cn in setdiff(colnames(hist_data_v2), colnames(hist_prev))) {
print(cn)
hist_prev[,cn] = NA
}
[1] "Tissue" [1] "GI_Lesions"
gale_hist = rbind(hist_prev[,colnames(hist_data_v2)], hist_data_v2[,colnames(hist_data_v2)])
dim(gale_hist)
write.table(gale_hist, file=file.path(hist_dir, 'Gale_Histology_21-Mar-2016_final.txt'),
col.names=T, row.names=F, quote=F, sep='\t', na="")