#!/usr/bin/env python
# coding: utf-8
# ### Perform partial least squares discriminant analysis
# Perform partial least squares discriminant analysis using data for named metabolites from the metabolomics workbench or uploaded data files.
#
# Note: This notebook contains IPython widgets. Consequently, you won't be able to use the Kernel/Restart & Run All command to automatically execute all cells in the notebook. You must use the Run command to execute each cell individually and advance to the next cell.
# Import Python modules...
# In[ ]:
from __future__ import print_function
import os
import sys
import time
import re
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cross_decomposition import PLSRegression
import ipywidgets as widgets
from IPython.display import display, HTML
from IPython import __version__ as ipyVersion
# Import MW modules from the current directory or default Python directory...
import MWUtil
# Enable the "inline" matplotlib backend through the IPython line magic.
get_ipython().run_line_magic('matplotlib', 'inline')
# Report the Python and IPython versions followed by the current time,
# so that a saved notebook records the environment it was run in.
print("Python: %s.%s.%s" % sys.version_info[:3])
print("IPython: %s" % ipyVersion)
print()
print(time.asctime())
# The URL PATH
#
# The MW REST URL consists of three main parts, separated by forward slashes, after the common prefix specifying the invariant base URL (https://www.metabolomicsworkbench.org/rest/):
#
# https://www.metabolomicsworkbench.org/rest/context/input_specification/output_specification
#
# Part 1: The context determines the type of data to be accessed from the Metabolomics Workbench, such as metadata or results related to the submitted studies, data from metabolites, genes/proteins and analytical chemistry databases as well as other services related to mass spectrometry and metabolite identification:
#
# context = study | compound | refmet | gene | protein | moverz | exactmass
#
# Part 2: The input specification consists of two required parameters describing the REST request:
#
# input_specification = input_item/input_value
#
# Part 3: The output specification consists of two parameters describing the output generated by the REST request:
#
# output_specification = output_item/(output_format)
#
# The first parameter is required in most cases. The second parameter is optional. The input and output specifications are context sensitive. The context determines the values allowed for the remaining parameters in the input and output specifications as detailed in the sections below.
#
# Setup MW REST base URL...
# In[ ]:
# Invariant prefix of every MW REST request; handed to MWUtil when study data is retrieved.
MWBaseURL = "https://www.metabolomicsworkbench.org/rest"
# **Retrieve or upload data for named metabolites...**
# In[ ]:
# Initialize data...
# StudiesResultsData: results returned by MWUtil for retrieved or uploaded data;
# reset to None and refilled by the retrieve/upload event handlers.
StudiesResultsData = None
# RetrievedMWData: set to True by the retrieve handler and to False by the upload
# handler; None until one of them has run.
RetrievedMWData = None
# In[ ]:
# Setup UIF info text...
TopInfoTextHTML = widgets.HTML(value = "Retrieve or upload data and process any missing values",
                               placeholder='', description='')

# Setup UIF to process any missing values...
# NOTE(review): "ReplaceColumnMedian" lacks the "By" used by the sibling option
# names. It is left untouched because the selected string is passed through to
# MWUtil - confirm which spelling MWUtil expects before renaming it.
MissingValuesMethods = ["NoAction", "DeleteRows", "DeleteColumns", "ReplaceByColumnMean", "ReplaceColumnMedian", "ReplaceByZero" , "LinearInterpolation"]
MissingValuesMethodsDropdown = widgets.Dropdown(options = MissingValuesMethods,
                                                value = "NoAction",
                                                description = " ")
ProcessMissingValueTopTextHTML = widgets.HTML(value = "Method for processing missing values:",
                                              placeholder='', description='')

# Setup UIF to retrieve...
StudyIDText = widgets.Text(value = "ST000001 ST000002", description = "Study ID (s)",
                           placeholder = "Type study ID", disabled = False,
                           layout = widgets.Layout(margin='0 10px 0 0'))
# The keyword is "button_style" (it was misspelled as "button_stype", which is
# not a Button trait). The empty string selects the default, unstyled button.
RetrieveDataBtn = widgets.Button(description = 'Retrieve Data', disabled = False, button_style = '',
                                 tooltip = "Retrieve data for study ID")

# Output area receiving everything printed or displayed by the retrieve handler.
RetrieveDataOutput = widgets.Output()
def RetrieveDataBtnEventHandler(Object):
    """Retrieve and list study data when the "Retrieve Data" button is clicked.

    Marks the session as using retrieved MW data, clears both output areas and,
    provided the study ID text box is not blank, asks MWUtil to retrieve the
    data (applying the selected missing values method) and to list it inside
    RetrieveDataOutput. Dataframes are only displayed when at most five
    top-level entries were returned.

    Arguments:
        Object: The clicked button widget passed by on_click (unused).

    Side effects:
        Rebinds the globals StudiesResultsData and RetrievedMWData.
    """
    global StudiesResultsData, RetrievedMWData

    RetrievedMWData = True
    StudiesResultsData = None

    StudyIDs = StudyIDText.value
    MissingValuesMethod = MissingValuesMethodsDropdown.value

    RetrieveDataOutput.clear_output()
    UploadDataOutput.clear_output()

    with RetrieveDataOutput:
        # A text box holding nothing but whitespace carries no study IDs, so
        # treat it like an empty one instead of sending it off for retrieval.
        if not StudyIDs.strip():
            print("\nNo study ID(s) specified...")
            return

        print("\nProcessing study ID(s): %s" % StudyIDs)
        StudiesResultsData = MWUtil.RetrieveStudiesAnalysisAndResultsData(StudyIDs, MWBaseURL, MissingValuesMethod)

        # Keep the cell output manageable: skip dataframe display for large result sets.
        DisplayData = len(StudiesResultsData.keys()) <= 5
        MWUtil.ListStudiesAnalysisAndResultsData(StudiesResultsData, DisplayDataFrame = DisplayData,
                                                 IPythonDisplayFuncRef = display, IPythonHTMLFuncRef = HTML)
# Run the retrieve handler whenever the "Retrieve Data" button is clicked.
RetrieveDataBtn.on_click(RetrieveDataBtnEventHandler)
# Setup UIF to upload data file(s)...
# Multiple files may be selected; the file chooser is restricted to .csv, .txt and .tsv.
FileUploadBtn = widgets.FileUpload(description = 'Upload File(s)', accept='.csv,.txt,.tsv', multiple = True,
disabled = False)
# Help text describing the expected column layout of uploaded files. The literal
# is continued with backslashes, so any leading whitespace on the continuation
# lines would become part of the string.
FileUploadTextHTML = widgets.HTML(value = "File format: Col 1: Sample names; \
Col 2: Class identifiers; Remaining cols: Named metabolites; \
Exts: .csv, .txt, or .tsv", placeholder='', description='')
# Output area receiving everything printed or displayed by the upload handler.
UploadDataOutput = widgets.Output()
def FileUploadBtnEventHandler(Change):
    """Process uploaded file(s) whenever the value of the upload widget changes.

    Marks the session as using uploaded data, clears both output areas, then
    asks MWUtil to process the uploaded content (applying the selected missing
    values method) and to list it inside UploadDataOutput. Dataframes are only
    displayed when at most five top-level entries were returned.

    Arguments:
        Change: Change notification passed by observe (unused).

    Side effects:
        Rebinds the globals StudiesResultsData and RetrievedMWData.
    """
    global StudiesResultsData, RetrievedMWData

    RetrievedMWData, StudiesResultsData = False, None

    SelectedMethod = MissingValuesMethodsDropdown.value
    UploadedFilesInfo = FileUploadBtn.value

    for StaleOutput in (RetrieveDataOutput, UploadDataOutput):
        StaleOutput.clear_output()

    with UploadDataOutput:
        StudiesResultsData = MWUtil.RetrieveUploadedData(UploadedFilesInfo, SelectedMethod)

        # Skip dataframe display for large result sets.
        ShowDataFrames = not (len(StudiesResultsData.keys()) > 5)
        MWUtil.ListStudiesAnalysisAndResultsData(StudiesResultsData, DisplayDataFrame = ShowDataFrames,
                                                 IPythonDisplayFuncRef = display, IPythonHTMLFuncRef = HTML)
# Run the upload handler whenever the set of uploaded files changes.
FileUploadBtn.observe(FileUploadBtnEventHandler, names = 'value')

# Setup UIF to retrieve or upload data file...
# The warning text is kept on a single line: a plain string literal cannot span
# a physical line break, and trailing whitespace is irrelevant to an HTML widget.
DataWarningTextHTML = widgets.HTML(value = "Warning: Don't re-run the current cell after specifying study ID(s) or selecting file(s) and retrieving the data. Click on the next cell to advance.",
                                   placeholder='', description='')
OrTextHTML = widgets.HTML(value = "Or", placeholder='', description='')

# Rows of the data UI, listed in the order in which they are displayed.
UIFDataBoxes = [
    widgets.HBox([TopInfoTextHTML]),
    widgets.HBox([ProcessMissingValueTopTextHTML, MissingValuesMethodsDropdown]),
    widgets.HBox([StudyIDText, RetrieveDataBtn],
                 layout = widgets.Layout(margin='10px 0 0 0')),
    widgets.HBox([OrTextHTML]),
    widgets.HBox([FileUploadBtn]),
    widgets.HBox([FileUploadTextHTML]),
    widgets.HBox([DataWarningTextHTML]),
]

for UIFDataBox in UIFDataBoxes:
    display(UIFDataBox)

# The output areas go last so that handler output appears below the controls.
display(RetrieveDataOutput)
display(UploadDataOutput)
# In[ ]:
# Check the data produced by the previous cell before going any further.
# NOTE(review): judging by its name this warns when no data was retrieved or
# uploaded - confirm the exact behavior in MWUtil.
MWUtil.CheckAndWarnEmptyStudiesData(StudiesResultsData, RetrievedMWData, StudyIDText.value)
# Setup UIF for selecting and plotting available data...
# In[ ]:
# Setup UIF data...
# The result is indexed below by "StudyIDs" and by "AnalysisIDs" (per study ID).
# NOTE(review): MinClassCount = 2 presumably keeps only analyses with at least
# two classes - confirm in MWUtil.
StudiesUIFData = MWUtil.SetupUIFDataForStudiesAnalysisAndResults(StudiesResultsData, MinClassCount = 2)
# In[ ]:
# Same kind of check for the UIF data derived above.
MWUtil.CheckAndWarnEmptyStudiesUIFData(StudiesUIFData, RetrievedMWData, StudyIDText.value)
# In[ ]:
# Setup a function to perform PLSDA and generate dataframe for PLSDA plot...
def GeneratePLSDAPlotData(InputDataFrame, NumComponents = 2, ClassColID = "Class", ClassNumColID = "ClassNum"):
    """Perform PLSDA and generate a dataframe of latent variable scores.

    The class number column is one-hot encoded and used as the response of a
    PLS regression against the standardized feature columns. Every column
    other than ClassColID and ClassNumColID is treated as a feature.

    Arguments:
        InputDataFrame: Pandas dataframe containing ClassNumColID, optionally
            ClassColID, and the feature columns. A copy is made; the input
            dataframe itself is not modified.
        NumComponents: Number of PLS components to fit.
        ClassColID: Name of a column to drop before the analysis, or None to
            drop nothing.
        ClassNumColID: Name of the column holding the class numbers.

    Returns:
        Pandas dataframe carrying the index of the input, with ClassNumColID
        as the first column followed by the score columns LV1, LV2, ...
    """
    WorkingDataFrame = InputDataFrame.copy()

    # Drop Class column...
    if ClassColID is not None:
        WorkingDataFrame = WorkingDataFrame.drop(ClassColID, axis = 1)

    # Keep the class numbers as a single column dataframe for the final concat...
    ClassesDataFrame = WorkingDataFrame.loc[:, [ClassNumColID]]

    # One-hot encode the class numbers to obtain the response matrix for PLSDA...
    ResponseMatrix = pd.get_dummies(ClassesDataFrame[ClassNumColID].tolist()).values

    # Standardize the features for metabolites; the model is consequently told not to scale again...
    RawFeatureValues = WorkingDataFrame.drop(ClassNumColID, axis = 1).values
    ScaledFeatureValues = StandardScaler().fit_transform(RawFeatureValues)

    # Perform analysis...
    PLSModel = PLSRegression(n_components = NumComponents, scale = False)
    PLSModel.fit(ScaledFeatureValues, ResponseMatrix)
    Scores = PLSModel.transform(ScaledFeatureValues).tolist()

    # Name one column per latent variable: LV1, LV2, ...
    ScoreColumnNames = ["LV%s" % Number for Number in range(1, len(Scores[0]) + 1)]
    ScoresDataFrame = pd.DataFrame(data = Scores, columns = ScoreColumnNames,
                                   index = ClassesDataFrame.index.values.tolist())

    # Setup final PLSDA dataframe including class numbers...
    return pd.concat([ClassesDataFrame, ScoresDataFrame], axis = 1)
# Setup a function to draw PLSDA plot...
def DrawPLSDAPlot(PLSDAPlotDataFrame, ClassNumColID ="ClassNum", LD1ColID = "LV1", LD2ColID = "LV2",
                  ColorPaletteName = "bright", PlotStyle = "darkgrid", FontScale = 1.3,
                  TitleFontWeight = "bold", LabelsFontWeight = "bold",
                  PlotWidth = 9, PlotHeight = 6):
    """Draw a PLSDA scores scatter plot colored by class number.

    Arguments:
        PLSDAPlotDataFrame: Dataframe containing the class number column and
            the two score columns to plot.
        ClassNumColID: Column used for the point colors and, when there are at
            most five classes, for the marker styles as well.
        LD1ColID: Column plotted on the x-axis; also used as the x-axis label.
        LD2ColID: Column plotted on the y-axis; also used as the y-axis label.
        ColorPaletteName: Seaborn color palette name: deep, muted, pastel,
            bright, dark, or colorblind.
        PlotStyle: Seaborn style name.
        FontScale: Seaborn font scale.
        TitleFontWeight: Font weight of the plot title.
        LabelsFontWeight: Font weight of the axis labels.
        PlotWidth: Figure width handed to 'figure.figsize'.
        PlotHeight: Figure height handed to 'figure.figsize'.
    """
    sns.set(rc = {'figure.figsize': (PlotWidth, PlotHeight)})
    sns.set(style = PlotStyle, font_scale = FontScale)

    # One palette color per distinct class number...
    ClassCount = len(PLSDAPlotDataFrame[ClassNumColID].unique().tolist())

    ScatterOptions = {
        "x": LD1ColID,
        "y": LD2ColID,
        "hue": ClassNumColID,
        "data": PLSDAPlotDataFrame,
        "palette": sns.color_palette(ColorPaletteName, ClassCount),
        "legend": "brief",
    }
    # Vary the marker style per class only for a small number of classes...
    if ClassCount <= 5:
        ScatterOptions["style"] = ClassNumColID

    Axis = sns.scatterplot(**ScatterOptions)

    # Set title and labels...
    Axis.set_title("PLS-DA Scores Plot", fontweight = TitleFontWeight)
    Axis.set_xlabel(LD1ColID, fontweight = LabelsFontWeight)
    Axis.set_ylabel(LD2ColID, fontweight = LabelsFontWeight)

    # Draw legend outside the plot...
    plt.legend(bbox_to_anchor = (1.05, 1), loc = 2, borderaxespad = 0.)

    plt.show()
# In[ ]:
# Setup UIF...
# Study and analysis selectors, initialized to the first study and its first analysis.
FirstStudyID = StudiesUIFData["StudyIDs"][0]
StudiesDropdown = widgets.Dropdown(options = StudiesUIFData["StudyIDs"], value = FirstStudyID,
description="Study:", disabled = False)
FirstAnalysisID = StudiesUIFData["AnalysisIDs"][FirstStudyID][0]
AnalysisDropdown = widgets.Dropdown(options = StudiesUIFData["AnalysisIDs"][FirstStudyID], value = FirstAnalysisID,
description = "Analysis:", disabled = False)
# Number of PLS components; kept as strings here and converted with int() in PlotData.
Components = ["2", "3", "4", "5"]
ComponentsDropdown = widgets.Dropdown(options = Components, value = "2", description = "Components:")
# Plot style names; lower-cased in PlotData before being handed to the plotting function.
PlotStyles = ["Darkgrid", "Whitegrid", "Dark", "White", "Ticks"]
PlotStylesDropdown = widgets.Dropdown(options = PlotStyles, value = "Darkgrid", description = "Plot style:")
# Fallback plot size used by PlotData when the "WxH" text cannot be used.
DefaultPlotWidth = 9
DefaultPlotHeight = 6
PlotSizeText = widgets.Text(value = "9x6", description = "Plot size:", placeholder = "Type WxH; Hit enter",
disabled = False, continuous_update=False)
# Color palette names; lower-cased in PlotData before being handed to the plotting function.
PlotColorPalettes = ["Deep", "Muted", "Pastel", "Bright", "Dark", "Colorblind"]
PlotColorPalettesDropdown = widgets.Dropdown(options = PlotColorPalettes, value = "Bright", description = "Color palette:")
# Arrange the controls in rows sharing a small bottom margin.
DataLayout = widgets.Layout(margin='0 0 4px 0')
StudiesDataHBox = widgets.HBox([StudiesDropdown, AnalysisDropdown], layout = DataLayout)
ComponentsDataHBox = widgets.HBox([ComponentsDropdown], layout = DataLayout)
PlotsDataHBox = widgets.HBox([PlotStylesDropdown, PlotSizeText], layout = DataLayout)
ColorPalattesDataHBox = widgets.HBox([PlotColorPalettesDropdown], layout = DataLayout)
# Output receives class information, download links and the data table;
# OutputPlot receives the plot itself.
Output = widgets.Output()
OutputPlot = widgets.Output()
# Flag checked by PlotData; cleared temporarily while the analysis dropdown is
# being repopulated so that the intermediate change does not trigger a redraw.
UpdatePlot = True
def DisablePlotUpdate():
    """Turn the module-level UpdatePlot flag off."""
    global UpdatePlot

    UpdatePlot = False
def EnablePlotUpdate():
    """Turn the module-level UpdatePlot flag on."""
    global UpdatePlot

    UpdatePlot = True
def GetUpdatePlotStatus():
    """Return the module-level UpdatePlot flag as a strict boolean.

    Returns:
        True when UpdatePlot is truthy; otherwise, False.
    """
    # Reading a module-level name needs no "global" declaration.
    return bool(UpdatePlot)
# Setup function to update dropdown options...
def UpdateAnalysisDropdown(StudyID):
    """Repopulate the analysis dropdown for a study and select its first analysis.

    Arguments:
        StudyID: Key into StudiesUIFData["AnalysisIDs"].
    """
    StudyAnalysisIDs = StudiesUIFData["AnalysisIDs"][StudyID]

    AnalysisDropdown.options = StudyAnalysisIDs
    AnalysisDropdown.value = StudyAnalysisIDs[0]
# Setup dropdown event handlers...
def StudiesDropdownEventHandler(Change):
    """Switch to the newly selected study and redraw the plot.

    Plot updates are suspended while the analysis dropdown is repopulated, so
    that the dropdown's own change notification does not draw a plot as well;
    the plot is then drawn once by this handler.

    Arguments:
        Change: Change notification; Change["new"] is the selected study ID.
    """
    StudyID = Change["new"]

    DisablePlotUpdate()
    try:
        UpdateAnalysisDropdown(StudyID)
    finally:
        # Always re-enable updates; otherwise a failure above would leave
        # plotting switched off for every subsequent widget change.
        EnablePlotUpdate()

    PlotData()
def AnalysisDropdownEventHandler(Change):
    """Redraw the plot after the selected analysis changes (Change is unused)."""
    PlotData()
def ComponentsDropdownEventHandler(Change):
    """Redraw the plot after the number of components changes (Change is unused)."""
    PlotData()
def PlotStylesDropdownEventHandler(Change):
    """Redraw the plot after the plot style changes (Change is unused)."""
    PlotData()
def PlotSizeTextEventHandler(Change):
    """Redraw the plot after the plot size text changes (Change is unused)."""
    PlotData()
def PlotColorPalettesDropdownEventHandler(Change):
    """Redraw the plot after the color palette changes (Change is unused)."""
    PlotData()
# Bind required event handlers...
# Each handler is registered for changes of the widget's 'value' trait only.
# The study handler also repopulates the analysis dropdown before redrawing.
StudiesDropdown.observe(StudiesDropdownEventHandler, names = 'value')
# The remaining handlers simply redraw the plot.
AnalysisDropdown.observe(AnalysisDropdownEventHandler, names = 'value')
ComponentsDropdown.observe(ComponentsDropdownEventHandler, names = 'value')
PlotStylesDropdown.observe(PlotStylesDropdownEventHandler, names = 'value')
PlotSizeText.observe(PlotSizeTextEventHandler, names = 'value')
PlotColorPalettesDropdown.observe(PlotColorPalettesDropdownEventHandler, names = 'value')
# Set up function to generate PLSDA plot...
def _ParsePlotSize(PlotSizeSpec, FallbackWidth, FallbackHeight):
    """Parse a "WxH" plot size specification.

    Spaces and letter case are ignored. The specification is valid when it
    consists of exactly two positive, finite numbers separated by "x".

    Arguments:
        PlotSizeSpec: Text to parse, e.g. "9x6".
        FallbackWidth: Width returned for an invalid specification.
        FallbackHeight: Height returned for an invalid specification.

    Returns:
        Tuple (Width, Height, IsValid). For an invalid specification the
        fallback values are returned together with IsValid set to False.
    """
    SizeWords = PlotSizeSpec.lower().replace(" ", "").split("x")
    if len(SizeWords) == 2:
        try:
            Width = float(SizeWords[0])
            Height = float(SizeWords[1])
        except ValueError:
            # Empty or non-numeric width/height, e.g. "9x" or "axb".
            pass
        else:
            # The chained comparisons also reject nan and inf.
            Infinity = float("inf")
            if 0 < Width < Infinity and 0 < Height < Infinity:
                return Width, Height, True

    return FallbackWidth, FallbackHeight, False

def PlotData():
    """Perform PLSDA for the current widget selections and refresh both output areas.

    Does nothing while plot updates are disabled. Otherwise both output areas
    are cleared, the dataframe for the selected study and analysis is analyzed
    and the scores plot is drawn in OutputPlot. Class information, download
    link(s) and the first rows of the PLSDA dataframe are listed in Output.
    The link for the input data is only offered for data retrieved from MW.

    An unusable plot size is reported in Output and replaced by the default
    plot size.
    """
    if not UpdatePlot:
        return

    Output.clear_output()
    OutputPlot.clear_output()

    StudyID = StudiesDropdown.value
    AnalysisID = AnalysisDropdown.value
    DataFrame = StudiesResultsData[StudyID][AnalysisID]["data_frame"]

    NumOfComponents = int(ComponentsDropdown.value)
    Style = PlotStylesDropdown.value.lower()
    Palette = PlotColorPalettesDropdown.value.lower()

    Width, Height, ValidPlotSize = _ParsePlotSize(PlotSizeText.value, DefaultPlotWidth, DefaultPlotHeight)
    if not ValidPlotSize:
        with Output:
            print("Invalid plot size; Using default plot size: %sx%s\n" % (Width, Height))

    # Retrieve data for a PLSDA plot and draw it...
    PLSDAPlotDataFrame = None
    with OutputPlot:
        PLSDAPlotDataFrame = GeneratePLSDAPlotData(DataFrame, NumComponents = NumOfComponents,
                                                   ClassColID = "Class", ClassNumColID = "ClassNum")
        DrawPLSDAPlot(PLSDAPlotDataFrame, ClassNumColID ="ClassNum", LD1ColID = "LV1", LD2ColID = "LV2",
                      ColorPaletteName = Palette, PlotStyle = Style,
                      PlotWidth = Width, PlotHeight = Height)

    # Under IPython, an Output widget context shows an exception raised inside
    # it and then suppresses it. Without PLSDA results there is nothing left to
    # list, and carrying on would only add a misleading follow-on error.
    if PLSDAPlotDataFrame is None:
        return

    with Output:
        MWUtil.ListClassInformation(StudiesResultsData, StudyID, AnalysisID, RetrievedMWData)

        # Setup a link to download data...
        if RetrievedMWData:
            FileName = "%s_%s_Data.csv" % (StudyID, AnalysisID)
            HTMLText = MWUtil.SetupCSVDownloadLink(DataFrame, Title = "Download data", CSVFilename = FileName)
            display(HTML(HTMLText))

        # Setup a link to download PLSDA data...
        if RetrievedMWData:
            FileName = "%s_%s_PLSDA_Data.csv" % (StudyID, AnalysisID)
        else:
            # The extension is stripped from the study ID in this case.
            # NOTE(review): presumably the study ID is the uploaded file name - confirm in MWUtil.
            FileRoot, FileExt = os.path.splitext(StudyID)
            FileName = "%s_PLSDA_Data.csv" % (FileRoot)
        HTMLText = MWUtil.SetupCSVDownloadLink(PLSDAPlotDataFrame, Title = "Download PLSDA data", CSVFilename = FileName)
        display(HTML(HTMLText))

        # List dataframe...
        display(HTML(PLSDAPlotDataFrame.to_html(max_rows = 10, max_cols = 10)))
# Show the control rows first, followed by the plot area and the listing area.
display(StudiesDataHBox)
display(ComponentsDataHBox)
display(PlotsDataHBox)
display(ColorPalattesDataHBox)
display(OutputPlot)
display(Output)
# Draw the initial plot for the default selections.
PlotData()