#!/usr/bin/env python
# coding: utf-8

# # 07 - Model Deployment
#
# by [Alejandro Correa Bahnsen](http://www.albahnsen.com/) & [Iván Torroledo](http://www.ivantorroledo.com/)
#
# version 1.2, Feb 2018
#
# ## Part of the class [Machine Learning for Risk Management](https://github.com/albahnsen/ML_RiskManagement)
#
# This notebook is licensed under a [Creative Commons Attribution-ShareAlike 3.0 Unported License](http://creativecommons.org/licenses/by-sa/3.0/deed.en_US).

# ## Agenda:
#
# 1. Creating and saving a model
# 2. Running the model in batch
# 3. Exposing the model as an API

# ## Part 1: Phishing Detection
#
# Phishing, by definition, is the act of defrauding an online user in order to
# obtain personal information by posing as a trustworthy institution or entity.
# Users usually have a hard time differentiating between legitimate and
# malicious sites because they are made to look exactly the same. Therefore,
# there is a need to create better tools to combat attackers.

import pandas as pd
import zipfile

# Load the labelled URL dataset straight out of the zipped CSV.
with zipfile.ZipFile('../datasets/model_deployment/phishing.csv.zip', 'r') as archive:
    csv_handle = archive.open('phishing.csv')
    data = pd.read_csv(csv_handle, index_col=False)

# Quick inspection of the data and the class balance.
data.head()
data.tail()
data.phishing.value_counts()

# ### Creating features

# Look at a fixed sample of phishing URLs to motivate the keyword features below.
data.url[data.phishing == 1].sample(50, random_state=1).tolist()

# Binary indicators: does the URL contain any of the following tokens?
# * https
# * login
# * .php
# * .html
# * @
# * sign
#
# NOTE(review): pandas str.contains treats the pattern as a regex by default,
# so the '.' in '.php' / '.html' matches ANY character — a literal match was
# presumably intended. Kept as-is so the features match the trained model.
keywords = ['https', 'login', '.php', '.html', '@', 'sign']
for token in keywords:
    data['keyword_' + token] = data.url.str.contains(token).astype(int)

# Further features, built in the next section:
# * Length of the url
# * Length of domain
# * is IP?
# * Number of .com

# Length of the full URL string.
# NOTE(review): the -2 offset looks like compensation for extra characters
# (e.g. surrounding quotes) in the raw file — TODO confirm against the CSV.
# The column name 'lenght' [sic] is kept: the serialized model expects it.
data['lenght'] = data.url.str.len() - 2

# The domain is the third '/'-separated token: 'http://domain/path' -> 'domain'.
domain = data.url.str.split('/', expand=True).iloc[:, 2]
data['lenght_domain'] = domain.str.len()

domain.head(12)

# A domain is a raw IP address when, after removing the literal dots, only
# digits remain. regex=False makes the dot literal on every pandas version:
# older pandas defaulted to regex=True, where '.' matched ANY character and
# this feature was silently always 0. The original "* 1" (a string no-op that
# just repeated each value once) is dropped.
data['isIP'] = domain.str.replace('.', '', regex=False).str.isnumeric().astype(int)

# How many times 'com' appears anywhere in the URL.
data['count_com'] = data.url.str.count('com')

data.sample(15, random_state=4)

# ### Create Model

# Features = everything except the raw URL and the label.
X = data.drop(['url', 'phishing'], axis=1)
y = data.phishing

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

clf = RandomForestClassifier(n_jobs=-1, n_estimators=100)

# 10-fold cross-validated accuracy, then fit on the full dataset.
cross_val_score(clf, X, y, cv=10)
clf.fit(X, y)

# ### Save model

# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23 — fall back to the standalone joblib package when it is gone.
try:
    from sklearn.externals import joblib
except ImportError:
    import joblib

joblib.dump(clf, '../datasets/model_deployment/07_phishing_clf.pkl', compress=3)

# ## Part 2: Model in batch
#
# See m07_model_deployment.py

from m07_model_deployment import predict_proba

predict_proba('http://www.vipturismolondres.com/com.br/?atendimento=Cliente&/LgSgkszm64/B8aNzHa8Aj.php')

# ## Part 3: API
#
# Flask is considered more Pythonic than Django because Flask web application
# code is in most cases more explicit. Flask is easy to get started with as a
# beginner because there is little boilerplate code for getting a simple app
# up and running.
# First we need to install some libraries
#
# ```
# pip install flask-restplus
# ```
#
# NOTE(review): flask-restplus is no longer maintained; flask-restx is its
# drop-in fork — consider migrating.

# Load Flask
from flask import Flask
from flask_restplus import Api, Resource, fields
import pandas as pd

# sklearn.externals.joblib was removed in scikit-learn 0.23 — fall back to the
# standalone joblib package when it is gone.
try:
    from sklearn.externals import joblib
except ImportError:
    import joblib

# Create api
app = Flask(__name__)

api = Api(
    app, version='1.0', title='Phishing Prediction API',
    description='Phishing Prediction API')

ns = api.namespace('predict', description='Phishing Classifier')

parser = api.parser()
parser.add_argument(
    'URL', type=str, required=True, help='URL to be analyzed', location='args')

resource_fields = api.model('Resource', {
    'result': fields.String,
})

# Load model and create function that predicts an URL
clf = joblib.load('../datasets/model_deployment/07_phishing_clf.pkl')


@ns.route('/')
class PhishingApi(Resource):
    """Scores a single URL with the phishing classifier via GET ?URL=..."""

    @api.doc(parser=parser)
    @api.marshal_with(resource_fields)
    def get(self):
        args = parser.parse_args()
        result = self.predict_proba(args)
        return result, 200

    def predict_proba(self, args):
        """Re-create the training features for one URL and return P(phishing)."""
        url = args['URL']
        url_ = pd.DataFrame([url], columns=['url'])

        # Create features — these MUST mirror the training pipeline exactly.
        keywords = ['https', 'login', '.php', '.html', '@', 'sign']
        for keyword in keywords:
            url_['keyword_' + keyword] = url_.url.str.contains(keyword).astype(int)

        # Column names 'lenght'/'lenght_domain' [sic] match the trained model
        # and must not be renamed.
        url_['lenght'] = url_.url.str.len() - 2
        domain = url_.url.str.split('/', expand=True).iloc[:, 2]
        url_['lenght_domain'] = domain.str.len()

        # BUG FIX: training derives isIP from the *domain*, but serving used
        # the full URL — which always contains non-digits such as 'http:' —
        # so isIP was always 0 at serve time. Compute it from the domain,
        # with a literal (regex=False) dot removal; the original "* 1"
        # string no-op is dropped.
        url_['isIP'] = domain.str.replace('.', '', regex=False).str.isnumeric().astype(int)

        url_['count_com'] = url_.url.str.count('com')

        # Make prediction: probability of the positive (phishing) class.
        p1 = clf.predict_proba(url_.drop('url', axis=1))[0, 1]
        print('url=', url, '| p1=', p1)

        return {
            "result": p1
        }


# Run API (use_reloader=False so it works from a notebook process).
app.run(debug=True, use_reloader=False, host='0.0.0.0', port=5000)

# Check using
#
# * http://localhost:5000/predict/?URL=http://consultoriojuridico.co/pp/www.paypal.com/