Perform random forest analysis using data for named metabolites retrieved from the Metabolomics Workbench or read from uploaded data files.
Import Python modules...
from __future__ import print_function
import os
import sys
import time
import re
import requests
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
import ipywidgets as widgets
from IPython.display import display, HTML
from IPython import __version__ as ipyVersion
# Import MW modules from the current directory or default Python directory...
import MWUtil
%matplotlib inline
# Report the interpreter and IPython versions, followed by the time of this run...
print("Python: %s.%s.%s" % tuple(sys.version_info[:3]))
print("IPython: %s" % ipyVersion)
print("\n%s" % time.asctime())
The URL Path
The MW REST URL consists of three main parts, separated by forward slashes, after the common prefix specifying the invariant base URL (https://www.metabolomicsworkbench.org/rest/):
https://www.metabolomicsworkbench.org/rest/context/input_specification/output_specification
Part 1: The context determines the type of data to be accessed from the Metabolomics Workbench, such as metadata or results related to the submitted studies, data from metabolites, genes/proteins and analytical chemistry databases as well as other services related to mass spectrometry and metabolite identification:
context = study | compound | refmet | gene | protein | moverz | exactmass
Part 2: The input specification consists of two required parameters describing the REST request:
input_specification = input_item/input_value
Part 3: The output specification consists of two parameters describing the output generated by the REST request:
output_specification = output_item/(output_format)
The first parameter is required in most cases. The second parameter is optional. The input and output specifications are context sensitive. The context determines the values allowed for the remaining parameters in the input and output specifications as detailed in the sections below.
Setup MW REST base URL...
# Invariant prefix of every MW REST request; handed to MWUtil when retrieving study data...
MWBaseURL = "https://www.metabolomicsworkbench.org/rest"
Retrieve or upload data for named metabolites...
# Initialize data...
# StudiesResultsData receives whatever MWUtil returns from the retrieve/upload handlers.
# RetrievedMWData records the data source used last: True after a retrieval by study ID,
# False after a file upload, None before either has happened.
StudiesResultsData = None
RetrievedMWData = None

# Setup UIF to retrieve...
StudyIDText = widgets.Text(value = "ST000001 ST000002", description = "Study ID (s)",
                           placeholder = "Type study ID", disabled = False,
                           layout = widgets.Layout(margin='0 10px 0 0'))

# button_style = '' selects the default (unstyled) button appearance...
RetrieveDataBtn = widgets.Button(description = 'Retrieve Data', disabled = False, button_style = '',
                                 tooltip = "Retrieve data for study ID")

# Output area that receives the progress messages and data listing of a retrieval...
RetrieveDataOutput = widgets.Output()
def RetrieveDataBtnEventHandler(Object):
    """Retrieve and list results data for the study ID(s) typed into the text box.

    Registered as the click handler of the 'Retrieve Data' button; the argument
    supplied by the button is not used. The handler marks MW retrieval as the
    current data source, discards previously loaded data, clears both output
    areas and writes its messages into the retrieve output area.
    """
    global StudiesResultsData, RetrievedMWData

    RetrievedMWData, StudiesResultsData = True, None

    StudyIDs = StudyIDText.value

    for OutputArea in (RetrieveDataOutput, UploadDataOutput):
        OutputArea.clear_output()

    with RetrieveDataOutput:
        if not StudyIDs:
            print("\nNo study ID(s) specified...")
            return

        print("\nProcessing study ID(s): %s" % StudyIDs)
        StudiesResultsData = MWUtil.RetrieveStudiesAnalysisAndResultsData(StudyIDs, MWBaseURL)

        # Data frames are displayed only when at most two top-level entries were retrieved...
        DisplayData = len(StudiesResultsData.keys()) <= 2
        MWUtil.ListStudiesAnalysisAndResultsData(StudiesResultsData, DisplayDataFrame = DisplayData,
                                                 IPythonDisplayFuncRef = display, IPythonHTMLFuncRef = HTML)

RetrieveDataBtn.on_click(RetrieveDataBtnEventHandler)
# Setup UIF to upload data file(s)...
# Several files may be selected at once; the accepted extensions are the ones
# named in the help text below.
FileUploadBtn = widgets.FileUpload(description = 'Upload File(s)', accept='.csv,.txt,.tsv', multiple = True,
                                   disabled = False)
# Help text describing the expected column layout of an uploaded file. The string
# is continued with backslashes, so the continuation lines are part of the literal...
FileUploadTextHTML = widgets.HTML(value = "<strong>File format:</strong> Col 1: Sample names; \
Col 2: Class identifiers; Remaining cols: Named metabolites; \
<strong>Exts: </strong>.csv, .txt, or .tsv", placeholder='', description='')
# Output area that receives the data listing produced after an upload...
UploadDataOutput = widgets.Output()
def FileUploadBtnEventHandler(Change):
    """Load and list the data contained in the uploaded file(s).

    Registered as an observer of the upload button's value; the change
    description passed in is not used. The handler marks file upload as the
    current data source, discards previously loaded data, clears both output
    areas and writes the listing into the upload output area.
    """
    global StudiesResultsData, RetrievedMWData

    RetrievedMWData, StudiesResultsData = False, None

    UploadedDataInfo = FileUploadBtn.value

    for OutputArea in (RetrieveDataOutput, UploadDataOutput):
        OutputArea.clear_output()

    with UploadDataOutput:
        StudiesResultsData = MWUtil.RetrieveUploadedData(UploadedDataInfo)

        # Data frames are displayed only when at most two top-level entries were loaded...
        DisplayData = len(StudiesResultsData.keys()) <= 2
        MWUtil.ListStudiesAnalysisAndResultsData(StudiesResultsData, DisplayDataFrame = DisplayData,
                                                 IPythonDisplayFuncRef = display, IPythonHTMLFuncRef = HTML)

FileUploadBtn.observe(FileUploadBtnEventHandler, names = 'value')
# Setup UIF to retrieve or upload data file...
DataWarningTextHTML = widgets.HTML(value = "<div class='alert alert-warning'><strong>Warning:</strong> Don't re-run the current cell after specifying study ID(s) or selecting file(s) and retrieving the data. Click on the next cell to advance.</div>", placeholder='', description='')
OrTextHTML = widgets.HTML(value = "<strong>Or</strong>", placeholder='', description='')

# One row per entry, top to bottom: study ID entry, separator, upload button,
# file format help and the re-run warning...
UIFDataBoxes = [
    widgets.HBox([StudyIDText, RetrieveDataBtn], layout = widgets.Layout(margin='10px 0 0 0')),
    widgets.HBox([OrTextHTML]),
    widgets.HBox([FileUploadBtn]),
    widgets.HBox([FileUploadTextHTML]),
    widgets.HBox([DataWarningTextHTML]),
]

for UIFDataBox in UIFDataBoxes:
    display(UIFDataBox)

# The two output areas follow the controls, retrieval messages first...
display(RetrieveDataOutput, UploadDataOutput)

MWUtil.CheckAndWarnEmptyStudiesData(StudiesResultsData, RetrievedMWData, StudyIDText.value)
Setup UIF for selecting and plotting available data...
# Setup UIF data...
# StudiesUIFData supplies the "StudyIDs", "AnalysisIDs" and "ClassIDs" lookups that
# populate the dropdowns created further down.
# NOTE(review): MinClassCount = 2 presumably excludes analyses with fewer than two classes,
# which the Class A / Class B selection depends on — confirm in MWUtil.
StudiesUIFData = MWUtil.SetupUIFDataForStudiesAnalysisAndResults(StudiesResultsData, MinClassCount = 2)

# NOTE(review): presumably reports the problem when no usable data is available — confirm in MWUtil.
MWUtil.CheckAndWarnEmptyStudiesUIFData(StudiesUIFData, RetrievedMWData, StudyIDText.value)
# Setup a function to generate data for random forest variable importance plot (VIP)...
def GeneratePlotData(DataFrame, FirstClassNum, SecondClassNum,
                     ClassNumColID = "ClassNum", ClassColID = "Class",
                     NumOfEstimators = 250, TrainSize = 0.75, RandomSeed = None):
    """Train a random forest on two classes and tabulate its feature importances.

    Arguments:
        DataFrame: Table containing the ClassNumColID column, optionally the
            ClassColID column, and the feature columns. It is not modified.
        FirstClassNum: Value of ClassNumColID selecting the first class.
        SecondClassNum: Value of ClassNumColID selecting the second class.
        ClassNumColID: Name of the column used as the classification target.
        ClassColID: Name of a column dropped before modelling; None to drop nothing.
        NumOfEstimators: Number of trees in the forest.
        TrainSize: Fraction of the selected rows used for training; the
            complementary fraction is used to score the model.
        RandomSeed: Random state given to both the train/test split and the classifier.

    Returns:
        A tuple (VIPDataFrame, ModelAccuracy). VIPDataFrame is indexed by
        feature name, holds the 'Variable Importance' and 'Standard Deviation'
        columns and is sorted by decreasing importance. ModelAccuracy is the
        accuracy score computed on the test rows.
    """

    # The class name column is not a feature...
    Data = DataFrame if ClassColID is None else DataFrame.drop(ClassColID, axis = 1)

    # Rows of the first class come first, followed by the rows of the second class...
    ClassData = pd.concat([Data[Data[ClassNumColID] == ClassNum]
                           for ClassNum in (FirstClassNum, SecondClassNum)])

    # Target is the class number; every remaining column is a feature...
    yData = ClassData[ClassNumColID]
    XData = ClassData.drop(ClassNumColID, axis = 1)

    XDataTrain, XDataTest, yDataTrain, yDataTest = train_test_split(
        XData, yData, test_size = 1 - TrainSize, train_size = TrainSize,
        random_state = RandomSeed, shuffle = True)

    # Fit the forest on the training rows and score it on the test rows...
    RFC = RandomForestClassifier(n_estimators = NumOfEstimators, random_state = RandomSeed)
    RFC.fit(XDataTrain, yDataTrain)
    ModelAccuracy = metrics.accuracy_score(yDataTest, RFC.predict(XDataTest))

    # Standard deviation of each feature's importance across the individual trees...
    PerTreeImportances = [Tree.feature_importances_ for Tree in RFC.estimators_]

    VIPDataFrame = pd.DataFrame({'Variable Importance' : RFC.feature_importances_,
                                 'Standard Deviation' : np.std(PerTreeImportances, axis = 0)},
                                index = list(XDataTrain.columns.values))

    return (VIPDataFrame.sort_values(by = ['Variable Importance'], ascending = False), ModelAccuracy)
# Setup a function to draw VIP plot...
def DrawPlot(VIPDataFrame, VIPColID ="Variable Importance", StdDevColID = "Standard Deviation",
             PlotStyle = "darkgrid", PlotOrientation = "Vertical", ColorsPalette = "bright",
             FontScale = 1.3, TitleFontWeight = "bold", LabelsFontWeight = "bold",
             PlotWidth = 9, PlotHeight = 6, DisplayStandardError = False, MaxFeaturesToPlot = 20):
    """Draw a bar chart of the leading rows of a variable importance table.

    Arguments:
        VIPDataFrame: Table indexed by feature name; only its first
            MaxFeaturesToPlot rows are drawn, in the order given.
        VIPColID: Column providing the bar lengths.
        StdDevColID: Column providing the error bar sizes.
        PlotStyle, FontScale: Passed on to seaborn as style and font scale.
        PlotOrientation: "Horizontal" (any case) draws horizontal bars; any
            other value draws vertical bars.
        ColorsPalette: Seaborn palette used for the bars.
        TitleFontWeight: Font weight of the plot title.
        LabelsFontWeight: Accepted but not used by this function.
        PlotWidth, PlotHeight: Figure size.
        DisplayStandardError: Draw error bars when True.
        MaxFeaturesToPlot: Maximum number of rows to draw.
    """

    sns.set(rc = {'figure.figsize':(PlotWidth, PlotHeight)})
    sns.set(style = PlotStyle, font_scale = FontScale)

    # Keep at most MaxFeaturesToPlot rows from the top of the table...
    TopFeatures = VIPDataFrame
    if len(TopFeatures.index) > MaxFeaturesToPlot:
        TopFeatures = TopFeatures.iloc[0:MaxFeaturesToPlot]

    ErrorBars = TopFeatures[StdDevColID] if DisplayStandardError else None
    IsHorizontal = re.match("^Horizontal$", PlotOrientation, re.I) is not None

    # Scores run along the x-axis for horizontal bars and along the y-axis otherwise...
    if IsHorizontal:
        Axis = sns.barplot(x = TopFeatures[VIPColID], y = TopFeatures.index, orient = "h",
                           palette = ColorsPalette, xerr = ErrorBars)
        XLabel, YLabel = 'Feature Importance Score', 'Features'
    else:
        Axis = sns.barplot(x = TopFeatures.index, y = TopFeatures[VIPColID], orient = "v",
                           palette = ColorsPalette, yerr = ErrorBars)
        XLabel, YLabel = 'Features', 'Feature Importance Score'

    # Set title and labels...
    Axis.set_title("Variable Importance Plot", fontweight = TitleFontWeight)
    Axis.set_xlabel(XLabel)
    Axis.set_ylabel(YLabel)

    if not IsHorizontal:
        # NOTE(review): tick labels are rotated only for vertical bars, where the
        # feature names sit on the x-axis — confirm against the notebook's indentation.
        Axis.set_xticklabels(Axis.get_xticklabels(), rotation = 90)

    plt.show()
# Setup UIF...

# The dropdowns start on the first study, its first analysis and the first two
# classes of that analysis...
FirstStudyID = StudiesUIFData["StudyIDs"][0]
StudiesDropdown = widgets.Dropdown(options = StudiesUIFData["StudyIDs"], value = FirstStudyID,
                                   description="Study:", disabled = False)

FirstAnalysisID = StudiesUIFData["AnalysisIDs"][FirstStudyID][0]
AnalysisDropdown = widgets.Dropdown(options = StudiesUIFData["AnalysisIDs"][FirstStudyID], value = FirstAnalysisID,
                                    description = "Analysis:", disabled = False)

FirstClassID = StudiesUIFData["ClassIDs"][FirstStudyID][FirstAnalysisID][0]
FirstClassDropdown = widgets.Dropdown(options = StudiesUIFData["ClassIDs"][FirstStudyID][FirstAnalysisID],
                                      value = FirstClassID, description = "Class A:", disabled = False)

SecondClassID = StudiesUIFData["ClassIDs"][FirstStudyID][FirstAnalysisID][1]
SecondClassDropdown = widgets.Dropdown(options = StudiesUIFData["ClassIDs"][FirstStudyID][FirstAnalysisID],
                                       value = SecondClassID, description = "Class B:", disabled = False)

# Class numbers corresponding to the initially selected class names...
FirstClassNum = StudiesResultsData[FirstStudyID][FirstAnalysisID]["class_names_to_nums"][FirstClassID]
SecondClassNum = StudiesResultsData[FirstStudyID][FirstAnalysisID]["class_names_to_nums"][SecondClassID]

# Random forest model parameters...
NumOfEstimatorsIntText = widgets.IntText(value = 250, description = "Estimators:",
                                         placeholder = "Type number > 0; Hit enter",
                                         disabled = False, continuous_update = False)

# Slider bounds are given as numbers, like the step, rather than as numeric strings...
TrainSizeSlider = widgets.FloatSlider(value = 0.75, min = 0.5, max = 0.9, step = 0.01,
                                      description = 'Train size:', disabled = False,
                                      continuous_update = False, orientation = 'horizontal',
                                      readout = True, readout_format = '.2f')

RandomSeedText = widgets.Text(value = "42", description = "Random seed:",
                              placeholder = "Type None or a number; Hit enter",
                              disabled = False, continuous_update = False)

# Plot appearance parameters...
OrientationStyles = ["Horizontal", "Vertical"]
OrientationStylesDropdown = widgets.Dropdown(options = OrientationStyles, value = "Vertical",
                                             description = "Plot orientation:")

DisplayStandardErrorCheckBox = widgets.Checkbox(value = False, description = "Plot std err", disabled = False )

MaxFeaturesToPlotIntText = widgets.IntText(value = 20, description = 'Plot max features:',
                                           placeholder = "Type number > 0; Hit enter",
                                           disabled = False, continuous_update = False)

PlotStyles = ["Darkgrid", "Whitegrid", "Dark", "White", "Ticks"]
PlotStylesDropdown = widgets.Dropdown(options = PlotStyles, value = "Darkgrid", description = "Plot style:")

PlotColorPalettes = ["Deep", "Muted", "Pastel", "Bright", "Dark", "Colorblind"]
PlotColorPalettesDropdown = widgets.Dropdown(options = PlotColorPalettes, value = "Bright", description = "Color palette:")

# Plot size used when the typed size cannot be used...
DefaultPlotWidth = 9
DefaultPlotHeight = 6
PlotSizeText = widgets.Text(value = "9x6", description = "Plot size:", placeholder = "Type WxH; Hit enter",
                            disabled = False, continuous_update = False)

# Lay out the controls, two per row...
UIFDataBoxes = []
DataLayout = widgets.Layout(margin='0 0 4px 0')

UIFDataBoxes.append(widgets.HBox([StudiesDropdown, AnalysisDropdown], layout = DataLayout))
UIFDataBoxes.append(widgets.HBox([FirstClassDropdown, SecondClassDropdown], layout = DataLayout))
UIFDataBoxes.append(widgets.HBox([NumOfEstimatorsIntText, TrainSizeSlider], layout = DataLayout))
UIFDataBoxes.append(widgets.HBox([RandomSeedText, MaxFeaturesToPlotIntText], layout = DataLayout))
UIFDataBoxes.append(widgets.HBox([OrientationStylesDropdown, DisplayStandardErrorCheckBox], layout = DataLayout))
UIFDataBoxes.append(widgets.HBox([PlotStylesDropdown, PlotColorPalettesDropdown], layout = DataLayout))
UIFDataBoxes.append(widgets.HBox([PlotSizeText], layout = DataLayout))

# Output receives text messages and download links; OutputPlot receives the plot...
Output = widgets.Output()
OutputPlot = widgets.Output()
# Flag checked by PlotData() before drawing. The dropdown event handlers clear it
# while they update several dropdowns and set it again once they are done...
UpdatePlot = True

def DisablePlotUpdate():
    """Turn plot regeneration off."""
    global UpdatePlot
    UpdatePlot = False

def EnablePlotUpdate():
    """Turn plot regeneration back on."""
    global UpdatePlot
    UpdatePlot = True

def GetUpdatePlotStatus():
    """Return True when plot regeneration is enabled and False otherwise."""
    return bool(UpdatePlot)
# Setup function to update dropdown options...
def UpdateAnalysisDropdown(StudyID):
    """Fill the analysis dropdown with the analyses of StudyID and select the first one."""
    AnalysisIDs = StudiesUIFData["AnalysisIDs"][StudyID]
    AnalysisDropdown.options = AnalysisIDs
    AnalysisDropdown.value = AnalysisIDs[0]
def UpdateFirstClassDropdown(StudyID, AnalysisID):
    """Fill the 'Class A' dropdown with the classes of the analysis and select the first one."""
    ClassIDs = StudiesUIFData["ClassIDs"][StudyID][AnalysisID]
    FirstClassDropdown.options = ClassIDs
    FirstClassDropdown.value = ClassIDs[0]
def UpdateSecondClassDropdown(StudyID, AnalysisID):
    """Fill the 'Class B' dropdown with the classes of the analysis and select the second one."""
    ClassIDs = StudiesUIFData["ClassIDs"][StudyID][AnalysisID]
    SecondClassDropdown.options = ClassIDs
    SecondClassDropdown.value = ClassIDs[1]
# Setup dropdown event handlers...
def StudiesDropdownEventHandler(Change):
    """Switch the analysis and class dropdowns to the newly selected study, then replot.

    Plot regeneration is turned off while the dependent dropdowns are updated
    and turned back on before the single PlotData() call at the end.
    """
    StudyID = Change["new"]

    DisablePlotUpdate()

    UpdateAnalysisDropdown(StudyID)

    # The class dropdowns follow the first analysis of the study...
    DefaultAnalysisID = StudiesUIFData["AnalysisIDs"][StudyID][0]
    for UpdateClassDropdown in (UpdateFirstClassDropdown, UpdateSecondClassDropdown):
        UpdateClassDropdown(StudyID, DefaultAnalysisID)

    EnablePlotUpdate()

    PlotData()
def AnalysisDropdownEventHandler(Change):
    """Switch the class dropdowns to the newly selected analysis, then replot.

    Plot regeneration is turned off and back on here only when it was enabled
    on entry; when it was already off, the flag is left untouched and the
    closing PlotData() call returns without drawing.
    """
    WasUpdateEnabled = GetUpdatePlotStatus()
    if WasUpdateEnabled:
        DisablePlotUpdate()

    StudyID = StudiesDropdown.value
    AnalysisID = Change["new"]
    UpdateFirstClassDropdown(StudyID, AnalysisID)
    UpdateSecondClassDropdown(StudyID, AnalysisID)

    if WasUpdateEnabled:
        EnablePlotUpdate()

    PlotData()
# Each handler below ignores the change description it receives and redraws the plot
# from the current widget values...
def FirstClassDropdownEventHandler(Change):
    """Redraw the plot after the 'Class A' selection changes."""
    PlotData()

def SecondClassDropdownEventHandler(Change):
    """Redraw the plot after the 'Class B' selection changes."""
    PlotData()

def NumOfEstimatorsIntTextEventHandler(Change):
    """Redraw the plot after the number of estimators changes."""
    PlotData()

def TrainSizeSliderEventHandler(Change):
    """Redraw the plot after the train size changes."""
    PlotData()

def RandomSeedTextEventHandler(Change):
    """Redraw the plot after the random seed changes."""
    PlotData()

def OrientationStylesDropdownEventHandler(Change):
    """Redraw the plot after the plot orientation changes."""
    PlotData()

def DisplayStandardErrorCheckBoxEventHandler(Change):
    """Redraw the plot after the 'Plot std err' box is toggled."""
    PlotData()

def MaxFeaturesToPlotIntTextEventHandler(Change):
    """Redraw the plot after the maximum number of plotted features changes."""
    PlotData()

def PlotStylesDropdownEventHandler(Change):
    """Redraw the plot after the plot style changes."""
    PlotData()

def PlotColorPalettesDropdownEventHandler(Change):
    """Redraw the plot after the color palette changes."""
    PlotData()

def PlotSizeTextEventHandler(Change):
    """Redraw the plot after the plot size changes."""
    PlotData()
# Bind required event handlers...
# Each handler is registered exactly once, on the 'value' trait of its widget.

# Data selection...
StudiesDropdown.observe(StudiesDropdownEventHandler, names = 'value')
AnalysisDropdown.observe(AnalysisDropdownEventHandler, names = 'value')
FirstClassDropdown.observe(FirstClassDropdownEventHandler, names = 'value')
SecondClassDropdown.observe(SecondClassDropdownEventHandler, names = 'value')

# Model parameters...
NumOfEstimatorsIntText.observe(NumOfEstimatorsIntTextEventHandler, names = 'value')
TrainSizeSlider.observe(TrainSizeSliderEventHandler, names = 'value')
RandomSeedText.observe(RandomSeedTextEventHandler, names = 'value')

# Plot appearance...
OrientationStylesDropdown.observe(OrientationStylesDropdownEventHandler, names = 'value')
DisplayStandardErrorCheckBox.observe(DisplayStandardErrorCheckBoxEventHandler, names = 'value')
MaxFeaturesToPlotIntText.observe(MaxFeaturesToPlotIntTextEventHandler, names = 'value')
PlotStylesDropdown.observe(PlotStylesDropdownEventHandler, names = 'value')
PlotColorPalettesDropdown.observe(PlotColorPalettesDropdownEventHandler, names = 'value')
PlotSizeText.observe(PlotSizeTextEventHandler, names = 'value')
# Set up function to generate VIP plot...
def ParseRandomSeed(SeedText):
    """Convert the random seed text into a value usable as a random state.

    Arguments:
        SeedText: Text typed by the user: "None" (any case) or an integer.
            Spaces are ignored.

    Returns:
        A tuple (Seed, IsValid). Seed is None or an int. IsValid is False, and
        Seed is None, when the text is neither "None" nor an integer between
        0 and 2**32 - 1.
    """
    SeedText = re.sub(" ", "", SeedText)
    if re.match("^None$", SeedText, re.I):
        return (None, True)

    try:
        Seed = int(SeedText)
    except ValueError:
        return (None, False)

    # Integer seeds outside this range are rejected by NumPy/scikit-learn...
    if not 0 <= Seed <= 2**32 - 1:
        return (None, False)

    return (Seed, True)

def ParsePlotSize(PlotSizeSpec, DefaultWidth, DefaultHeight):
    """Parse a 'WxH' plot size specification.

    Arguments:
        PlotSizeSpec: Text such as "9x6"; case and spaces are ignored.
        DefaultWidth: Width returned when the specification is not usable.
        DefaultHeight: Height returned when the specification is not usable.

    Returns:
        A tuple (Width, Height, IsValid). For a usable specification Width and
        Height are floats and IsValid is True; otherwise the defaults are
        returned unchanged and IsValid is False.
    """
    PlotSizeWords = re.sub(" ", "", PlotSizeSpec.lower()).split("x")
    if len(PlotSizeWords) == 2:
        try:
            Width, Height = float(PlotSizeWords[0]), float(PlotSizeWords[1])
        except ValueError:
            # Covers empty and non-numeric words...
            pass
        else:
            # Comparisons reject zero, negative, NaN and infinite sizes...
            Infinity = float("inf")
            if 0 < Width < Infinity and 0 < Height < Infinity:
                return (Width, Height, True)

    return (DefaultWidth, DefaultHeight, False)

def PlotData():
    """Regenerate the variable importance plot and report from the current widget values.

    Does nothing while plot updates are disabled. Otherwise both output areas
    are cleared, the widget values are validated, the plot is drawn in
    OutputPlot, and the class information and CSV download links are written
    to Output. Validation problems are reported in Output: identical classes,
    an unusable random seed, or non-positive estimator/feature counts stop the
    update, while an unusable plot size falls back to the default size.
    """
    if not UpdatePlot:
        return

    Output.clear_output()
    OutputPlot.clear_output()

    StudyID = StudiesDropdown.value
    AnalysisID = AnalysisDropdown.value
    FirstClassID = FirstClassDropdown.value
    SecondClassID = SecondClassDropdown.value

    AnalysisData = StudiesResultsData[StudyID][AnalysisID]
    DataFrame = AnalysisData["data_frame"]

    # First and second class IDs must be different...
    FirstClassNum = AnalysisData["class_names_to_nums"][FirstClassID]
    SecondClassNum = AnalysisData["class_names_to_nums"][SecondClassID]
    if FirstClassNum == SecondClassNum:
        with Output:
            print("Selected classes must be different\nClassNum: %s; Class Name: %s\n" % (FirstClassNum, FirstClassID))
            MWUtil.ListClassInformation(StudiesResultsData, StudyID, AnalysisID, RetrievedMWData)
        return

    Estimators = NumOfEstimatorsIntText.value
    TrainingSetSize = TrainSizeSlider.value
    OrientationStyle = OrientationStylesDropdown.value
    StandardError = DisplayStandardErrorCheckBox.value
    FeaturesToPlot = MaxFeaturesToPlotIntText.value

    Seed, IsSeedValid = ParseRandomSeed(RandomSeedText.value)

    with Output:
        if not IsSeedValid:
            print("Invalid value specified for Random seed. Valid values: None or an integer from 0 to 4294967295")
            return
        if Estimators <= 0:
            print("Invalid value specified for Estimators. Valid values: > 0")
            return
        if FeaturesToPlot <= 0:
            print("Invalid value specified for Plot features. Valid values: > 0")
            return

    # Dropdown labels are capitalized; seaborn expects the lowercase names...
    Style = PlotStylesDropdown.value.lower()
    Palette = PlotColorPalettesDropdown.value.lower()

    Width, Height, IsPlotSizeValid = ParsePlotSize(PlotSizeText.value, DefaultPlotWidth, DefaultPlotHeight)
    if not IsPlotSizeValid:
        with Output:
            print("Invalid plot size; Using default plot size: %sx%s\n" % (Width, Height))

    # Stays None when modelling fails: the output widget context may display an
    # exception raised inside it and then let execution continue...
    VIPDataFrame = None

    with OutputPlot:
        # Retrieve data for VIP plot...
        VIPDataFrame, ModelAccuracy = GeneratePlotData(DataFrame, FirstClassNum, SecondClassNum,
                                                       ClassNumColID = "ClassNum", ClassColID = "Class",
                                                       NumOfEstimators = Estimators,
                                                       TrainSize = TrainingSetSize,
                                                       RandomSeed = Seed)
        # Generate plot...
        DrawPlot(VIPDataFrame, PlotStyle = Style, PlotOrientation = OrientationStyle,
                 ColorsPalette = Palette, PlotWidth = Width, PlotHeight = Height,
                 DisplayStandardError = StandardError, MaxFeaturesToPlot = FeaturesToPlot)

    with Output:
        print("Selected classes:")
        print("Class A - ClassNum: %s; ClassName: %s" % (FirstClassNum, FirstClassID))
        print("Class B - ClassNum: %s; ClassName: %s" % (SecondClassNum, SecondClassID))
        print("\n")

        MWUtil.ListClassInformation(StudiesResultsData, StudyID, AnalysisID, RetrievedMWData)

        # The source data is offered for download only when it came from MW...
        if RetrievedMWData:
            FileName = "%s_%s_Data.csv" % (StudyID, AnalysisID)
            HTMLText = MWUtil.SetupCSVDownloadLink(DataFrame, Title = "Download data", CSVFilename = FileName)
            display(HTML(HTMLText))

        if VIPDataFrame is not None:
            if RetrievedMWData:
                FileName = "%s_%s_VIP_Data.csv" % (StudyID, AnalysisID)
            else:
                # For uploaded data the study ID is used as a file name; drop its extension...
                FileRoot, FileExt = os.path.splitext(StudyID)
                FileName = "%s_VIP_Data.csv" % (FileRoot)

            # Format every value with four decimals, one column at a time;
            # DataFrame.applymap is deprecated in recent pandas releases...
            VIPDataFrame = VIPDataFrame.apply(lambda Column: Column.map("{0:.4f}".format))
            HTMLText = MWUtil.SetupCSVDownloadLink(VIPDataFrame, Title = "Download variable importance data", CSVFilename = FileName)
            display(HTML(HTMLText))
# Show the rows of controls, then the plot area, then the text report area...
for UIFDataBox in UIFDataBoxes:
    display(UIFDataBox)
display(OutputPlot, Output)

# Draw the initial plot from the default widget values...
PlotData()