Retrieve named metabolites data from Metabolomics Workbench using REST API

Import Python modules...

In [ ]:
from __future__ import print_function

import os
import sys
import time
import re
from io import StringIO

import requests
import pandas as pd
import ipywidgets as widgets

from IPython.display import display, HTML
from IPython import __version__ as ipyVersion

# Import MW modules from the current directory or default Python directory...
import MWUtil

%matplotlib inline

# Report the interpreter and IPython versions this notebook is running under...
print("Python: %s.%s.%s" % sys.version_info[:3])
print("IPython: %s" % ipyVersion)

# Print a blank separator line followed by the current local time...
print()
print(time.asctime())

The URL PATH

The MW REST URL consists of three main parts, separated by forward slashes, after the common prefix specifying the invariant base URL (https://www.metabolomicsworkbench.org/rest/):

https://www.metabolomicsworkbench.org/rest/context/input_specification/output_specification

Part 1: The context determines the type of data to be accessed from the Metabolomics Workbench, such as metadata or results related to the submitted studies, data from metabolites, genes/proteins and analytical chemistry databases as well as other services related to mass spectrometry and metabolite identification:

context = study | compound | refmet | gene | protein | moverz | exactmass

Part 2: The input specification consists of two required parameters describing the REST request:

input_specification = input_item/input_value

Part 3: The output specification consists of two parameters describing the output generated by the REST request:

output_specification = output_item/(output_format)

The first parameter is required in most cases. The second parameter is optional. The input and output specifications are context sensitive. The context determines the values allowed for the remaining parameters in the input and output specifications as detailed in the sections below.

Set up the MW REST base URL...

In [ ]:
# Invariant prefix shared by all MW REST requests. It carries no trailing slash;
# the request paths appended to it start with "/"...
MWBaseURL = "https://www.metabolomicsworkbench.org/rest"

Retrieve and process results data for named metabolites...

Set up utility functions to retrieve and process analysis and results data...

In [ ]:
def _RequestMWData(MWDataURL, Timeout = 60):
    """Issue a GET request to MW REST; return the response or None on failure.

    A timeout is always supplied: without one, requests waits indefinitely on
    a stalled server. Network errors and non-200 status codes are reported
    and mapped to None instead of propagating to the caller.
    """

    print("Initiating request: %s" % MWDataURL)
    try:
        Response = requests.get(MWDataURL, timeout = Timeout)
    except requests.exceptions.RequestException as ErrMsg:
        print("Request failed: %s" % ErrMsg)
        return None

    if Response.status_code != 200:
        print("Request failed: status_code: %d" % Response.status_code)
        return None

    return Response


def RetrieveStudiesAnalysisAndResultsData(StudyID):
    """Retrieve analysis and results data for a study or studies.

    Arguments:
        StudyID (str): Study ID, or a study ID substring, placed into the
            REST URL as the study_id input value.

    Returns:
        dict: Analysis data keyed by study ID and then by analysis ID, as
            set up by ProcessAnalysisData. For each analysis whose datatable
            was retrieved, a "data_frame" key holds the datatable as a Pandas
            dataframe. An empty dictionary is returned when the initial
            analysis request fails or doesn't return valid JSON.
    """

    Response = _RequestMWData(MWBaseURL + "/study/study_id/" + StudyID + "/analysis/")
    if Response is None:
        return {}

    try:
        AnalysisData = Response.json()
    except ValueError:
        # JSON decoding errors raised by requests are ValueError subclasses...
        print("Request failed: response is not valid JSON")
        return {}

    print("Processing analysis data...")
    StudiesResultsData = ProcessAnalysisData(AnalysisData)

    # Use a distinct loop variable to avoid clobbering the StudyID argument...
    for CurrentStudyID in StudiesResultsData:
        for AnalysisID in StudiesResultsData[CurrentStudyID]:
            print("\nRetrieving datatable for analysis ID, %s, in study ID, %s..." % (AnalysisID, CurrentStudyID))

            Response = _RequestMWData(MWBaseURL + "/study/analysis_id/" + AnalysisID + "/datatable")
            if Response is None:
                continue

            if not Response.text.strip():
                # Nothing to parse; skip instead of failing on an empty table...
                print("No datatable text available; skipping...")
                continue

            print("Processing datatable text...")
            ResultsDataTable = ProcessDataTableText(Response.text, AddClassNum = True)

            print("Setting up Pandas dataframe...")
            RESULTSDATATABLE = StringIO(ResultsDataTable)
            StudiesResultsData[CurrentStudyID][AnalysisID]["data_frame"] = pd.read_csv(RESULTSDATATABLE, sep="\t", index_col = False)

    return StudiesResultsData

def ProcessAnalysisData(AnalysisData):
    """Reorganize analysis data retrieved in JSON format by study and analysis ID.

    Arguments:
        AnalysisData (dict): Either a single analysis data set (a dictionary
            containing a "study_id" key) or a dictionary of such data sets.

    Returns:
        dict: Remaining data for each data set, keyed by study ID and then
            by analysis ID; the study_id and analysis_id entries themselves
            are left out of the innermost dictionary.
    """

    # A lone data set is wrapped so both shapes go through the same loop...
    if "study_id" in AnalysisData:
        AnalysisData = {"1" : AnalysisData}

    ResultsByStudy = {}
    for DataSetKey in AnalysisData:
        DataSet = AnalysisData[DataSetKey]
        StudyKey = DataSet["study_id"]
        AnalysisKey = DataSet["analysis_id"]

        # Create the study entry on first sight; a repeated analysis ID
        # replaces the data tracked earlier for it...
        AnalysesForStudy = ResultsByStudy.setdefault(StudyKey, {})
        AnalysesForStudy[AnalysisKey] = {
            Name: DataSet[Name]
            for Name in DataSet
            if not re.match("^(study_id|analysis_id)$", Name, re.I)
        }

    return ResultsByStudy


def ProcessDataTableText(DataTableText, AddClassNum = True):
    """Process datatable retrieved in text format for a specific analysis ID.

    The text is treated as tab-delimited lines with a header line first. The
    first two columns are the sample ID and the class name. Data lines with
    two or fewer columns (including blank lines) are dropped.

    Arguments:
        DataTableText (str): Tab-delimited datatable text.
        AddClassNum (bool): Insert a "ClassNum" column after the class name
            column. Class numbers start at 1 and are assigned to class names
            in order of their first appearance.

    Returns:
        str: Processed tab-delimited text without a trailing newline. An
            empty string is returned when the header line has fewer than two
            columns, which covers empty input text.
    """

    TextLines = DataTableText.split("\n")

    # Process data labels...
    HeaderWords = TextLines[0].split("\t")
    if len(HeaderWords) < 2:
        # No sample ID and class name columns are available to work with...
        return ""

    DataLabels = HeaderWords[:2]
    if AddClassNum:
        DataLabels.append("ClassNum")
    DataLabels.extend(HeaderWords[2:])

    DataLines = ["\t".join(DataLabels)]

    # Process data...
    ClassNamesMap = {}
    for TextLine in TextLines[1:]:
        LineWords = TextLine.split("\t")

        if len(LineWords) <= 2:
            continue

        # Handle sample ID and class name...
        DataLine = LineWords[:2]

        if AddClassNum:
            ClassName = LineWords[1]
            if ClassName not in ClassNamesMap:
                # Next unused number, starting at 1...
                ClassNamesMap[ClassName] = len(ClassNamesMap) + 1
            DataLine.append("%s" % ClassNamesMap[ClassName])

        DataLine.extend(LineWords[2:])

        DataLines.append("\t".join(DataLine))

    return "\n".join(DataLines)

def ListStudiesAnalysisAndResultsData(StudiesResultsData, DisplayDataFrame = True):
    """Print analysis metadata, and optionally the datatable, for each study.

    Arguments:
        StudiesResultsData (dict): Data keyed by study ID and then by
            analysis ID; each innermost dictionary maps data type names to
            values, with any "data_frame" entry holding a Pandas dataframe.
        DisplayDataFrame (bool): Render dataframes as HTML tables limited to
            10 rows and 10 columns; otherwise print a placeholder line.
    """

    print("\nListing analysis metadata for studies along with datatable for named metabolites...")

    for StudyKey, Analyses in StudiesResultsData.items():
        print("")
        for AnalysisKey, Details in Analyses.items():
            print("\nstudy_id:%s\nanalysis_id:%s" % (StudyKey, AnalysisKey))
            for Name, Value in Details.items():
                # Plain metadata is printed as is...
                if not re.match("^(data_frame)$", Name, re.I):
                    print("%s: %s" % (Name, Value))
                    continue

                # Dataframe entry: either skip or render it...
                if not DisplayDataFrame:
                    print("data_frame: <Pandas DataFrame available; skipping display>")
                    continue

                print("data_frame:\n")
                display(HTML(Value.to_html(max_rows = 10, max_cols = 10)))

Retrieve and process results data for a study containing a single analysis...

In [ ]:
# Study ID for a study containing a single analysis...
StudyID = "ST000001"
print("\nProcessing study ID: %s" % StudyID)

# Retrieve analysis metadata and datatables, then list them with dataframes displayed...
StudiesResultsData = RetrieveStudiesAnalysisAndResultsData(StudyID)
ListStudiesAnalysisAndResultsData(StudiesResultsData)

Retrieve and process results data for a study containing multiple analyses...

In [ ]:
# Study ID for a study containing multiple analyses...
StudyID = "ST000009"
print("\nProcessing study ID: %s" % StudyID)

# Retrieve analysis metadata and datatables, then list them with dataframes displayed...
StudiesResultsData = RetrieveStudiesAnalysisAndResultsData(StudyID)
ListStudiesAnalysisAndResultsData(StudiesResultsData)

Retrieve and process results data for multiple studies containing multiple analyses...

In [ ]:
# Set up a study ID substring to match study IDs ST000010 to ST000019...
StudyID = "ST00001"
print("\nProcessing study ID substring to match studies from ST000010 to ST000019: %s" % StudyID)

# Retrieve analysis metadata and datatables for all matching studies...
StudiesResultsData = RetrieveStudiesAnalysisAndResultsData(StudyID)

# Turn off dataframe display...
ListStudiesAnalysisAndResultsData(StudiesResultsData, DisplayDataFrame = False)

Perform interactive retrieval of data for a specified study ID...

In [ ]:
# Text box for entering the study ID to retrieve...
StudyIDText = widgets.Text(value = "ST000001", description = "Study ID", placeholder = "Type study ID", disabled = False)

# Button to trigger retrieval along with a dedicated output area for its results...
RetrieveDataBtn = widgets.Button(description = 'Retrieve Data', disabled = False, button_style = '', tooltip = "Retrieve data for study ID")
OutputRetrieveDataBtn = widgets.Output()

def RetrieveAndListData(Object):
    """Retrieve and list data for the study ID currently in the text box.

    Registered as the click handler for the retrieve button. The Object
    argument is whatever the click event passes to the handler; it is not
    used here.
    """

    # Ignore surrounding whitespace so blank input is treated as missing
    # instead of being placed into the request URL...
    StudyID = StudyIDText.value.strip()

    # Discard results from any previous click before writing new ones...
    OutputRetrieveDataBtn.clear_output()
    with OutputRetrieveDataBtn:
        if StudyID:
            print("\nProcessing study ID: %s" % StudyID)
            StudiesResultsData = RetrieveStudiesAnalysisAndResultsData(StudyID)
            ListStudiesAnalysisAndResultsData(StudiesResultsData)
        else:
            print("\nNo study ID specified...")

RetrieveDataBtn.on_click(RetrieveAndListData)

display(StudyIDText, RetrieveDataBtn)
display(OutputRetrieveDataBtn)