Configure a workspace to enable communication between your local computer and remote resources.
However, we only need to create the config file once — subsequent runs of this notebook can reuse it.
# Connect to the Azure ML workspace and cache its connection details locally.
from azureml.core import Workspace

subscription_id = '364eeb5b-f3c7-42b8-b15f-08afee51aa96'
resource_group = 'Xiangzhe-ML'
workspace_name = 'Xiangzhe-WS'

try:
    ws = Workspace(subscription_id = subscription_id,
                   resource_group = resource_group,
                   workspace_name = workspace_name)
    # Persist details to aml_config/config.json so later cells (and future
    # runs) can reconnect via Workspace.from_config().
    ws.write_config()
    print('Library configuration succeeded')
except Exception as e:
    # Was a bare `except:`, which also traps KeyboardInterrupt/SystemExit and
    # hides the real cause (auth failure, typo in ids, config write error).
    print('Workspace not found:', e)
Wrote the config file config.json to: C:\Users\t-xiamen\Desktop\Automated ML Models\aml_config\config.json Library configuration succeeded
import azureml.core
import pandas as pd
from azureml.core.workspace import Workspace
from azureml.train.automl.run import AutoMLRun
import time
import logging
from matplotlib import pyplot as plt
from matplotlib.pyplot import imshow
import random
import numpy as np
import os
Create a workspace object from the existing workspace. `Workspace.from_config()` reads the file `aml_config/config.json` and loads the details into an object named `ws`.
# load workspace configuration from the config.json file in the current folder.
ws = Workspace.from_config()
# Show which workspace we connected to. The original printed ws.location
# twice (visible in the cell output); print each field once.
print(ws.name, ws.location, ws.resource_group, sep = '\t')
Found the config file in: C:\Users\t-xiamen\Desktop\Automated ML Models\aml_config\config.json Xiangzhe-WS westeurope Xiangzhe-ML westeurope
# Name under which all AutoML runs in this notebook are grouped.
experiment_name = 'automl-regression-nyc-taxi'
from azureml.core import Experiment
# `exp` is used below to submit both the local and the remote AutoML runs.
exp = Experiment(workspace = ws, name = experiment_name)
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Load the preprocessed NYC-taxi dataframe produced by the data-prep step.
pd_dataframe = pd.read_pickle('data/data_after_prep.pkl')
print('Data loading finished.')

# Features: every column except the target.
X = pd_dataframe.drop(["trip_duration"], axis=1)
# Target: trip duration, log-transformed to tame its skewed distribution.
y = np.log(np.array(pd_dataframe["trip_duration"], dtype=float))
Data loading finished.
Function AutoMLConfig
description: https://docs.microsoft.com/en-gb/python/api/azureml-train-automl/azureml.train.automl.automlconfig.automlconfig?view=azure-ml-py
Here is a brief summary of the properties which are used in our model:
Property | Description |
---|---|
task | 'classification' or 'regression' depending on what kind of ML problem to solve. |
primary_metric | Metric that you want to optimize. |
max_time_sec | Time limit in seconds for each iteration |
iterations | Number of iterations. In each iteration, the model trains with the data with a specific pipeline |
n_cross_validations | Number of cross validation splits |
preprocess | True/False Enables experiment to perform preprocessing on the input. Preprocessing handles missing data, and performs some common feature extraction |
verbosity | Verbosity level for AutoML log file. |
X | The training features to use when fitting pipelines during AutoML experiment. |
y | Training labels to use when fitting pipelines during AutoML experiment. |
path | Full path to the AzureML project folder. |
# Folder that holds the AutoML project artifacts (config, logs, get_data.py).
project_folder = './projects/automl-regression-nyc-taxi'
# exist_ok avoids the check-then-create race of `if not exists: makedirs`.
os.makedirs(project_folder, exist_ok=True)
from azureml.train.automl import AutoMLConfig
# randomly choose 100 samples to train
# NOTE(review): unseeded RNG — the sample (and hence the run) is not
# reproducible; seed np.random if repeatability matters.
n = 100
sample_indices = np.random.permutation(X.shape[0])[0:n]
# local compute
# Regression task optimizing Spearman correlation: 5 pipelines, each
# limited to 600 s, scored with 5-fold cross validation; preprocess=True
# lets AutoML handle missing values and basic feature extraction.
automl_config_local = AutoMLConfig(task = 'regression',
primary_metric = 'spearman_correlation',
max_time_sec = 600,
iterations = 5,
n_cross_validations = 5,
preprocess = True,
verbosity = logging.INFO,
X = X.iloc[sample_indices],
y = y[sample_indices],
path = project_folder)
# Runs on the local machine; show_output=True streams iteration progress.
local_run = exp.submit(automl_config_local, show_output=True)
Parent Run ID: AutoML_d9633602-e93e-4825-8356-d172c0ecdf7e *********************************************************************************************** ITERATION: The iteration being evaluated. PIPELINE: A summary description of the pipeline being evaluated. DURATION: Time taken for the current iteration. METRIC: The result of computing score on the fitted pipeline. BEST: The best observed score thus far. *********************************************************************************************** ITERATION PIPELINE DURATION METRIC BEST 0 SparseNormalizer LightGBMRegressor 0:00:36.908574 0.700 0.700 1 SparseNormalizer SGDRegressor 0:00:17.820830 0.043 0.700 2 SparseNormalizer KNeighborsRegressor 0:00:17.480213 0.524 0.700 3 SparseNormalizer DecisionTreeRegressor0:00:15.354690 0.654 0.700 4 SparseNormalizer LightGBMRegressor 0:00:15.824144 0.711 0.711
# Provision (or reuse) a Data Science VM as the remote compute target.
from azureml.core.compute import DsvmCompute

dsvm_name = 'mydsvm'
try:
    # Reuse the DSVM if one with this name already exists in the workspace.
    dsvm_compute = DsvmCompute(ws, dsvm_name)
    print('Found an existing DSVM.')
except Exception:
    # Was a bare `except:`; narrowed so KeyboardInterrupt/SystemExit are not
    # trapped. Any lookup failure still falls through to provisioning,
    # preserving the original best-effort behaviour.
    print('Creating a new DSVM.')
    dsvm_config = DsvmCompute.provisioning_configuration(vm_size = "Standard_D2_v2")
    dsvm_compute = DsvmCompute.create(ws, name = dsvm_name, provisioning_configuration = dsvm_config)
    # Block until provisioning finishes so later cells can use the target.
    dsvm_compute.wait_for_completion(show_output = True)
Found an existing DSVM.
from azureml.core import Workspace, Datastore
#blob_datastore = Datastore(ws, blob_datastore_name)
# Use the workspace's default datastore (an Azure file share per the cell
# output) to move data between the local machine and the remote compute.
ds = ws.get_default_datastore()
print(ds.datastore_type, ds.account_name, ds.container_name)
AzureFile xiangzhews3534612219 azureml-filestore-6d04f583-d8b9-4df3-bec0-be867960a327
#ds.upload_files("./data/data_after_prep.pkl")
# Upload the whole ./data folder to the datastore under 'data/' so the
# remote compute target can download it at run time.
ds.upload(src_dir='./data', target_path='data', overwrite=True, show_progress=True)
$AZUREML_DATAREFERENCE_6ff5e855872d40c4bce28aa4a5c9863d
from azureml.core.runconfig import DataReferenceConfiguration
# Reference the 'data' folder uploaded above; mode='download' copies the
# files onto the compute target before the run starts.
dr = DataReferenceConfiguration(datastore_name=ds.name,
path_on_datastore='data',
mode='download', # download files from datastore to compute target
overwrite=True)
from azureml.core.runconfig import RunConfiguration
# create a new RunConfig object
conda_run_config = RunConfiguration(framework="python")
# Set compute target to the Linux DSVM
conda_run_config.target = dsvm_compute.name
# set the data reference of the run configuration
conda_run_config.data_references = {ds.name: dr}
# Same project folder as the local run; created idempotently.
project_folder = './projects/automl-regression-nyc-taxi'
# exist_ok avoids the check-then-create race of `if not exists: makedirs`.
os.makedirs(project_folder, exist_ok=True)
%%writefile $project_folder/get_data.py
import numpy as np
import pandas as pd
import os
from os.path import expanduser, join, dirname
def get_data():
# download data from cloud
pd_dataframe = pd.read_pickle(join(dirname(os.path.realpath(__file__)),
os.environ["AZUREML_DATAREFERENCE_workspacefilestore"],
"data_after_prep.pkl"))
y = np.array(pd_dataframe["trip_duration"]).astype(float)
y = np.log(y)
X = pd_dataframe.drop(["trip_duration"],axis = 1)
# randomly choose samples to train
n = 300
sample_indices = np.random.permutation(X.shape[0])[0:n]
return { "X" : X.iloc[sample_indices], "y" : y[sample_indices] }
Overwriting ./projects/automl-regression-nyc-taxi/get_data.py
Function AutoMLConfig
description: https://docs.microsoft.com/en-gb/python/api/azureml-train-automl/azureml.train.automl.automlconfig.automlconfig?view=azure-ml-py
Note: For Remote DSVM and Batch AI you cannot pass Numpy arrays directly to AutoMLConfig.
from azureml.train.automl import AutoMLConfig
# Same knobs as the local run (see the AutoMLConfig property table above),
# plus max_cores_per_iteration to use both cores of the Standard_D2_v2 DSVM.
automl_settings = {
"max_time_sec": 600,
"iterations": 5,
"n_cross_validations": 5,
"primary_metric": 'spearman_correlation',
"preprocess": True,
"max_cores_per_iteration": 2,
"verbosity": logging.INFO
}
# Remote targets cannot take in-memory X/y; data_script points at the
# get_data.py written above, which the remote run executes to load data.
automl_config_remote = AutoMLConfig(task = 'regression',
debug_log = 'automl_errors.log',
path = project_folder,
run_configuration = conda_run_config,
data_script = project_folder + "/get_data.py",
**automl_settings
)
# show_output=False submits asynchronously and returns immediately.
remote_run = exp.submit(automl_config_remote, show_output = False)
# NOTE(review): the run is cancelled right after submission — presumably to
# demonstrate cancel(); remove this line to let the remote training finish.
remote_run.cancel()
# Bare expression: in a notebook this renders the run-summary widget/table.
remote_run
Experiment | Id | Type | Status | Details Page | Docs Page |
---|---|---|---|---|---|
automl-regression-nyc-taxi | AutoML_be5eb023-f90c-4071-aefa-a0e23b98f9f9 | automl | Preparing | Link to Azure Portal | Link to Documentation |
We can use the cell below to fetch all the child runs and see individual metrics.
# Collect per-iteration metrics from all child runs into a DataFrame
# (iterations as columns, metric names as rows).
children = list(remote_run.get_children())
metricslist = {}
for run in children:
    properties = run.get_properties()
    # Keep only scalar (float) metrics; non-numeric values can't be tabulated.
    metrics = {k: v for k, v in run.get_metrics().items() if isinstance(v, float)}
    metricslist[int(properties['iteration'])] = metrics

import pandas as pd
# Pass the axis by keyword: the positional form sort_index(1) is deprecated
# and removed in pandas 2.0 (raises TypeError).
rundata = pd.DataFrame(metricslist).sort_index(axis=1)
rundata
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
explained_variance | 0.431827 | -0.000010 | 0.138325 | 0.409190 | 0.000171 |
mean_absolute_error | 0.419844 | 6.533152 | 0.534185 | 0.406930 | 0.558047 |
median_absolute_error | 0.341994 | 6.591523 | 0.438236 | 0.322102 | 0.438419 |
normalized_mean_absolute_error | 0.127710 | 1.987283 | 0.162491 | 0.123782 | 0.169749 |
normalized_median_absolute_error | 0.104029 | 2.005039 | 0.133305 | 0.097978 | 0.133360 |
normalized_root_mean_squared_error | 0.166126 | 1.998619 | 0.204503 | 0.166620 | 0.217820 |
normalized_root_mean_squared_log_error | 0.023284 | 0.610079 | 0.028147 | 0.023221 | 0.030001 |
r2_score | 0.372486 | -10.000000 | 0.053538 | 0.352351 | -0.061866 |
root_mean_squared_error | 0.546135 | 6.570417 | 0.672299 | 0.547758 | 0.716080 |
root_mean_squared_log_error | 0.076545 | 2.005621 | 0.092534 | 0.076337 | 0.098629 |
spearman_correlation | 0.700303 | 0.042560 | 0.524273 | 0.653972 | 0.711035 |
spearman_correlation_max | 0.700303 | 0.700303 | 0.700303 | 0.700303 | 0.711035 |
# Fetch the best child run (by the experiment's primary metric) together
# with its fitted scikit-learn pipeline.
best_run, fitted_model = remote_run.get_output()
print(best_run)
print(fitted_model)
Run(Experiment: automl-regression-nyc-taxi, Id: AutoML_d9633602-e93e-4825-8356-d172c0ecdf7e_4, Type: None, Status: Completed) Pipeline(memory=None, steps=[('datatransformer', DataTransformer(logger=None, task=None)), ('sparsenormalizer', <automl.client.core.common.model_wrappers.SparseNormalizer object at 0x00000174158C6400>), ('lightgbmregressor', <automl.client.core.common.model_wrappers.LightGBMRegressor object at 0x00000174158BDBA8>)])
For example, the cell below shows the run and the model that has the smallest `root_mean_squared_error` value.
# Select the best run by a specific metric instead of the primary metric.
lookup_metric = "root_mean_squared_error"
best_run, fitted_model = remote_run.get_output(metric = lookup_metric)
print(best_run)
print(fitted_model)
For example, the cell below shows the run and the model from the 3rd iteration.
# Retrieve the run/model from a specific iteration (0-based index 3, i.e.
# the fourth pipeline evaluated).
iteration = 3
third_run, third_model = remote_run.get_output(iteration = iteration)
print(third_run)
print(third_model)
# Spot-check the fitted model on one random row of a hold-out set that was
# prepared the same way as the training data (log-transformed target).
pd_dataframe = pd.read_pickle("./data/sub_data_after_prep.pkl")
y_test = np.array(pd_dataframe["trip_duration"]).astype(float)
y_test = np.log(y_test)
X_test = pd_dataframe.drop(["trip_duration"],axis = 1)

# np.random.randint's upper bound is already exclusive, so pass len(X_test):
# the original len(X_test)-1 could never select the last row.
random_index = np.random.randint(0, len(X_test))
# Double brackets keep a 1-row DataFrame (predict expects 2-D input).
y_pred = fitted_model.predict(X_test.iloc[[random_index]])

# Absolute residual in log space for the sampled trip.
y_residual_test = abs(y_test[random_index] - y_pred[0])
print("actual value:", y_test[random_index])
print("prediction:", y_pred[0])
print("residual:", y_residual_test)
actual value: 6.156978985585555 prediction: 6.546361761136513 residual: 0.389382775550958
We can use the cell below to register model in the workspace.
# Register the run's best model in the workspace model registry so it can be
# deployed as a web service.
description = 'Automated Machine Learning Model'
tags = None
remote_run.register_model(description=description, tags=tags)
remote_run.model_id # Use this id to deploy the model as a web service in Azure