%load_ext sparkmagic.magics
from dsx_core_utils import proxy_util,dsxhi_util
# Configure sparkmagic's Livy settings for this notebook; the recorded cell
# output shows a confirmation message when the call succeeds.
proxy_util.configure_proxy_livy()
success configuring sparkmagic livy.
# Show the Livy endpoints that can be targeted from this notebook; the
# recorded output is a list of gateway URLs ending in /livy/v1 or /livy2/v1.
dsxhi_util.list_livy_endpoints()
['https://becks1.fyre.ibm.com:8443/gateway/mjoudsx336-master-1/livy/v1', 'https://becks1.fyre.ibm.com:8443/gateway/mjoudsx336-master-1/livy2/v1', 'https://cdh513edge11.fyre.ibm.com:8443/gateway/mjoudsx336-master-1/livy/v1', 'https://cdh514edge1.fyre.ibm.com:8443/gateway/mjoudsx336-master-1/livy/v1', 'https://cdh515edge1.fyre.ibm.com:8443/gateway/mjoudsx336-master-1/livy/v1', 'https://cdh515edge1.fyre.ibm.com:8443/gateway/mjoudsx336-master-1/livy2/v1', 'https://centos74edge1.fyre.ibm.com:8443/gateway/mjoudsx336-master-1/livy/v1', 'https://centos74edge1.fyre.ibm.com:8443/gateway/mjoudsx336-master-1/livy2/v1', 'https://rated3.fyre.ibm.com:8443/gateway/mjoudsx336-master-1/livy2/v1', 'https://yccdh5.fyre.ibm.com:8443/gateway/mjoudsx336-master-1/livy/v1', 'https://yccdh5.fyre.ibm.com:8443/gateway/mjoudsx336-master-1/livy2/v1', 'https://ycedge1.fyre.ibm.com:8443/gateway/mjoudsx336-master-1/livy/v1', 'https://ycedge1.fyre.ibm.com:8443/gateway/mjoudsx336-master-1/livy2/v1', 'https://zinc1.fyre.ibm.com:8443/gateway/mjoudsx336-master-1/livy/v1', 'https://zinc1.fyre.ibm.com:8443/gateway/mjoudsx336-master-1/livy2/v1']
!cat /user-home/_global_/.remote-images/dsx-hi/dsx-scripted-ml-python2.json
{ "imageId": "968c2101554e0d1e0d4fdd3720aaa565a2910cb46f4d7ed61188b6ceeec22930", "scriptCommand": "anaconda2/bin/python2.7", "libPaths": ["usr/local/spark-2.0.2-bin-hadoop2.7/python","user-home/.scripts/common-helpers/batch/pmml","user-home/.scripts/common-helpers/saas","user-home/_global_/python-2.7"] }
Using values from `dsx-scripted-ml-python2.json`, we'll need to:
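- `scriptCommand`: the Python interpreter path inside the image archive, used to build `spark.yarn.appMasterEnv.PYSPARK_PYTHON` (see the example properties below)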
Example DSX_HI Properties for using dsx-scripted-ml-python2.tar.gz Virtual Environment:
{"proxyUser": "user1", "archives": ["/user/dsxhi/environments/26611bf7fe595f786139d6d2132de070fc813f6a0ef7a4e25857b79c8cd4b565/dsx-scripted-ml-python2.tar.gz"],"conf":{"spark.yarn.appMasterEnv.PYSPARK_PYTHON":"dsx-scripted-ml-python2.tar.gz/anaconda2/bin/python"}}
/user/dsxhi/environments/26611bf7fe595f786139d6d2132de070fc813f6a0ef7a4e25857b79c8cd4b565/dsx-scripted-ml-python2.tar.gz
/user/dsxhi/environments/pythonAddons/pythonAddons.tar.gz
%manage_spark
MagicsControllerWidget(children=(Tab(children=(ManageSessionWidget(children=(HTML(value=u'<br/>'), HTML(value=…
Added endpoint https://zinc1.fyre.ibm.com:8443/gateway/mjoudsx336-master-1/livy2/v1 Starting Spark application
SparkSession available as 'spark'.
%%spark
import pandas as pd
import numpy as np
# Load the SMS spam collection from HDFS as a header-less, comma-delimited CSV.
data = (
    spark.read
    .option("delimiter", ",")
    .option("header", "false")
    .csv("hdfs:///user/user1/SMSSpamCollection.csv")
)
# Collect the Spark DataFrame into pandas and keep only its first two columns.
dataset = data.toPandas().iloc[:, :2]
# The second column ('_c1', Spark's default name when there is no header row)
# is what the bag-of-words step consumes as the message text.
message = dataset['_c1']
%%spark
# Build the bag-of-words feature matrix and the label vector.
from sklearn.feature_extraction.text import CountVectorizer

# Cap the vocabulary at 1500 features.
cv = CountVectorizer(max_features=1500)
# Learn the vocabulary from the messages, then encode each message as a row
# of term counts; toarray() converts the sparse result to a dense ndarray.
cv.fit(message)
X = cv.transform(message).toarray()
# Labels are taken from the first column of the dataset.
y = dataset.iloc[:, 0].values
%%spark
# Train a logistic-regression classifier on the bag-of-words features (X) and
# labels (y) built in the previous cell, then report accuracy and the
# confusion matrix on a held-out test set.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
# sklearn.cross_validation has been deprecated since 0.18 and is scheduled for
# removal in 0.20; train_test_split lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split

# Splitting the dataset into the Training set and Test set (80% / 20%).
# random_state is fixed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=0)

# Fitting Logistic Regression to the training set
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)

# Predicting the Test Results
y_pred = classifier.predict(X_test)

# Printing the model accuracy
print('Accuracy: %.2f%%' % (accuracy_score(y_test, y_pred) * 100))

# Making the Confusion Matrix.
# Passing the classifier's own class list pins the row/column order, so
# cm[i][j] is the number of samples whose true label is labels[i] and whose
# predicted label is labels[j], even if a class is absent from the test set.
labels = classifier.classes_
cm = confusion_matrix(y_test, y_pred, labels=labels)

# Printing the confusion matrix in a table.
# cm.ravel() walks the matrix row by row (true label outermost), so the true
# labels are repeated per row while the predicted labels cycle within a row.
# Dedicated names are used here so the label vector `y` is not overwritten
# and the cell can be re-run safely.
cm_df = pd.DataFrame({
    'y_test': np.repeat(labels, len(labels)),
    'y_pred': np.tile(labels, len(labels)),
    'count': cm.ravel(),
})
# Fix the column order explicitly; dict ordering is not guaranteed here.
cm_df = cm_df[['y_test', 'y_pred', 'count']]
print(cm_df.to_string(index=False))
Accuracy: 93.00% y_test y_pred count 1 1 94 0 1 4 1 0 10 0 0 92 /hadoop/yarn/local/usercache/user1/appcache/application_1533478912530_0773/container_e32_1533478912530_0773_01_000001/dsx-scripted-ml-python2.tar.gz/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20. "This module will be removed in 0.20.", DeprecationWarning)