import os
import numpy as np
import pandas as pd
import azureml
from azureml.core import Workspace, Run
print("Azure ML SDK Version: ", azureml.core.VERSION)
Azure ML SDK Version: 0.1.59
# load workspace configuration from the config.json file in the current folder.
ws = Workspace.from_config()
print(ws.name, ws.location, ws.resource_group, ws.location, sep = '\t')
Found the config file in: /home/nbuser/library/aml_config/config.json
Xiangzhe-WS	westeurope	Xiangzhe-ML	westeurope
# create an experiment
experiment_name = 'nyc-taxi-dsvm'
from azureml.core import Experiment
exp = Experiment(workspace = ws, name = experiment_name)
Every workspace comes with a default datastore backed by the Azure storage account associated with the workspace. We can use it to transfer data from the local machine to the cloud and access it from the compute target (here, our compute target is a DSVM).
# get the default datastore
ds = ws.get_default_datastore()
print(ds.datastore_type, ds.account_name, ds.container_name)
AzureFile xiangzhews1068013949 azureml-filestore-bc063c69-64a6-48ce-90f5-33cb3c8d43b2
# upload data file(s)
ds.upload_files(['./data_after_prep.pkl'], target_path = 'nyc-taxi', overwrite = True, show_progress = True)
#ds.upload(src_dir='.', target_path='nyc-taxi', overwrite=True, show_progress=True)
$AZUREML_DATAREFERENCE_b214114d38a24588a15b66bc27d0d5df
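The uploaded folder can later be pointed at from a run through a DataReference object. A minimal sketch (purely illustrative; these calls only describe how the data should be made available and do not transfer anything themselves):
# build a reference to the 'nyc-taxi' folder we just uploaded
data_ref = ds.path('nyc-taxi')
print(data_ref.as_download())  # copy the folder onto the compute target
print(data_ref.as_mount())     # or mount it, on targets that support mounting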
script_folder = './scripts_dsvm'
os.makedirs(script_folder, exist_ok=True)
To submit the job to the compute target, we first need to create a training script.
Note: the data path settings for a DSVM and for a Batch AI cluster are different, so be careful!
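One simple way to catch a wrong path early is to have the training script log what it actually receives at runtime. A minimal sketch of such a sanity check (illustrative only, not part of the final train.py):
# hypothetical check inside train.py, right after parsing the arguments:
# log the resolved data folder and its contents so a wrong path fails loudly
print('data folder:', args.data_folder)
print('contents:', os.listdir(args.data_folder))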
%%writefile $script_folder/train.py
import os
import argparse
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.externals import joblib
from azureml.core import Run
# get hold of the current run
run = Run.get_submitted_run()
# parse arguments
parser = argparse.ArgumentParser()
parser.add_argument('--data-folder', type=str, dest='data_folder', help='data folder mounting point')
args = parser.parse_args()
data_folder = args.data_folder
data_path = os.path.join(data_folder, 'data_after_prep.pkl')
run.log('Data path', data_path)
# load data
pd_dataframe = pd.read_pickle(data_path)
run.log('Data loading', 'finished')
# data processing
le = preprocessing.LabelEncoder()
le.fit(["N", "Y"])
pd_dataframe["store_and_fwd_flag"] = le.transform(pd_dataframe["store_and_fwd_flag"])
le.fit(["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"])
pd_dataframe["pickup_weekday"] = le.transform(pd_dataframe["pickup_weekday"])
pd_dataframe["dropoff_weekday"] = le.transform(pd_dataframe["dropoff_weekday"])
run.log('Data processing', 'finished')
# load dataset into numpy arrays
y = np.array(pd_dataframe["trip_duration"]).astype(float)
y = np.log(y)
X = np.array(pd_dataframe.drop(["trip_duration"],axis = 1))
# normalize data
scaler = preprocessing.StandardScaler().fit(X)
X = scaler.transform(X)
run.log('Normalization', 'finished')
# split data into train and validation datasets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.25, random_state = 20)
# train LR model
lm = LinearRegression()
lm.fit(X_train, y_train)
run.log('Model training', 'finished')
y_pred = lm.predict(X_val)
run.log('Prediction', 'finished')
# evaluation
mse = mean_squared_error(y_val, y_pred)
run.log('Evaluation', 'finished')
run.log('Mean Squared Error', float(mse))
os.makedirs('outputs', exist_ok=True)
# note!!! file saved in the outputs folder is automatically uploaded into experiment record
joblib.dump(value=lm, filename='outputs/nyc_taxi_model.pkl')
Writing ./scripts_dsvm/train.py
from azureml.core.compute import DsvmCompute
from azureml.core.compute_target import ComputeTargetException
compute_target_name = 'mydsvm'
try:
    dsvm_compute = DsvmCompute(workspace=ws, name=compute_target_name)
    print('found existing:', dsvm_compute.name)
except ComputeTargetException:
    print('creating new.')
    dsvm_config = DsvmCompute.provisioning_configuration(vm_size="Standard_D2_v2")
    dsvm_compute = DsvmCompute.create(ws, name=compute_target_name, provisioning_configuration=dsvm_config)
    dsvm_compute.wait_for_completion(show_output=True)
found existing: mydsvm
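To double-check which compute targets are attached to the workspace, they can be listed by name. A small sketch, assuming the compute_targets property available in recent SDK versions:
# list every compute target attached to the workspace with its state
for name, target in ws.compute_targets.items():
    print(name, target.type, target.provisioning_state)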
Firstly, create a DataReferenceConfiguration object to tell the system which folder on the datastore to download to the compute target.
from azureml.core.runconfig import DataReferenceConfiguration
dr = DataReferenceConfiguration(datastore_name=ds.name,
                                path_on_datastore='nyc-taxi',
                                mode='download',  # download files from the datastore to the compute target
                                overwrite=True)
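The mode controls how the data reaches the compute target: 'download' copies the files onto the target, while 'mount' exposes the datastore without copying on targets that support it (for the DSVM we use download). A hedged sketch of the mount variant:
# same reference, but mounted instead of downloaded; only useful on compute
# targets that support mounting (e.g. a Batch AI cluster)
dr_mount = DataReferenceConfiguration(datastore_name=ds.name,
                                      path_on_datastore='nyc-taxi',
                                      mode='mount',
                                      overwrite=True)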
Secondly, ask the system to build a conda environment based on the dependency specification, and submit the script to run there.
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies
# create a new RunConfig object
conda_run_config = RunConfiguration(framework="python")
# Set compute target to the Linux DSVM
conda_run_config.target = dsvm_compute.name
# set the data reference of the run configuration
conda_run_config.data_references = {ds.name: dr}
# specify CondaDependencies obj
conda_run_config.environment.python.conda_dependencies = CondaDependencies.create(conda_packages=['numpy','pandas','scikit-learn'])
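If the script also needs packages that are only available on pip, CondaDependencies.create accepts a pip_packages list alongside conda_packages. A minimal sketch (the package choice is illustrative):
# illustrative only: mix conda and pip packages in one dependency specification
cd = CondaDependencies.create(conda_packages=['numpy', 'pandas', 'scikit-learn'],
                              pip_packages=['joblib'])
conda_run_config.environment.python.conda_dependencies = cd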
Thirdly, run the script. Once the conda environment is built, it is reused in subsequent runs as long as the dependencies remain unchanged.
from azureml.core import Run
from azureml.core import ScriptRunConfig
src = ScriptRunConfig(source_directory=script_folder,
                      script='train.py',
                      run_config=conda_run_config,
                      arguments=['--data-folder', str(ds.as_mount())])
run = exp.submit(config=src)
Show the run details with the RunDetails widget.
from azureml.train.widgets import RunDetails
RunDetails(run).show()
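If the notebook widget is not available, the run can also be followed by blocking until it finishes:
# block until the run completes, streaming the driver log into the notebook
run.wait_for_completion(show_output=True)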
print(run.get_metrics())
{'Data path': 'workspacefilestore/nyc-taxi/data_after_prep.pkl', 'Data loading': 'finished', 'Data processing': 'finished', 'Normalization': 'finished', 'Model training': 'finished', 'Prediction': 'finished', 'Evaluation': 'finished', 'Mean Squared Error': 0.3878969301600042}
outputs is a special directory: all content in it is automatically uploaded to your workspace, so the model file will also be available in the workspace.
We can see files associated with that run with the following line.
print(run.get_file_names())
['azureml-logs/60_control_log.txt', 'azureml-logs/80_driver_log.txt', 'outputs/nyc_taxi_model.pkl', 'driver_log', 'azureml-logs/azureml.log']
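Any of these files can also be pulled down from the run record, for example the serialized model. A small sketch (the local file name is illustrative):
# download the trained model from the run's outputs to a local file
run.download_file(name='outputs/nyc_taxi_model.pkl',
                  output_file_path='./nyc_taxi_model_local.pkl')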
Register the model in the workspace so that we can later query, examine, and deploy this model.
# register model
model = run.register_model(model_name='nyc_taxi_model', model_path='outputs/nyc_taxi_model.pkl')
print(model.name, model.id, model.version, sep = '\t')
nyc_taxi_model nyc_taxi_model:2 2
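Once registered, the model can be retrieved from the workspace by name in a later session. A minimal sketch:
from azureml.core.model import Model
# fetch the latest registered version of the model and download its file locally
registered = Model(ws, name='nyc_taxi_model')
registered.download(target_dir='.')
Finally, delete the DSVM compute target when it is no longer needed.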
dsvm_compute.delete()