#!/usr/bin/env python
# coding: utf-8

# # 07 - Model Deployment
#
# by [Alejandro Correa Bahnsen](http://www.albahnsen.com/) & [Iván Torroledo](http://www.ivantorroledo.com/)
#
# version 1.2, Feb 2018
#
# ## Part of the class [Machine Learning for Risk Management](https://github.com/albahnsen/ML_RiskManagement)
#
# This notebook is licensed under a [Creative Commons Attribution-ShareAlike 3.0 Unported License](http://creativecommons.org/licenses/by-sa/3.0/deed.en_US).

# ## Agenda:
#
# 1. Creating and saving a model
# 2. Running the model in batch
# 3. Exposing the model as an API

# ## Part 1: Phishing Detection
#
# Phishing, by definition, is the act of defrauding an online user in order to
# obtain personal information by posing as a trustworthy institution or entity.
# Users usually have a hard time differentiating between legitimate and
# malicious sites because they are made to look exactly the same. Therefore,
# there is a need to create better tools to combat attackers.

import pandas as pd
import zipfile

# Load the labelled URL dataset straight out of the zipped CSV.
with zipfile.ZipFile('../datasets/model_deployment/phishing.csv.zip', 'r') as archive:
    csv_handle = archive.open('phishing.csv')
    data = pd.read_csv(csv_handle, index_col=False)

# Quick inspection of the data and the class balance.
data.head()
data.tail()
data.phishing.value_counts()

# ### Creating features

# Look at a fixed sample of phishing URLs to motivate the keyword features below.
data.url[data.phishing == 1].sample(50, random_state=1).tolist()

# Binary indicators: does the URL contain any of the following tokens?
# * https
# * login
# * .php
# * .html
# * @
# * sign
#
# NOTE(review): pandas str.contains treats the pattern as a regex by default,
# so the '.' in '.php' / '.html' matches ANY character — a literal match was
# presumably intended. Kept as-is so the features match the trained model.
keywords = ['https', 'login', '.php', '.html', '@', 'sign']
for token in keywords:
    data['keyword_' + token] = data.url.str.contains(token).astype(int)

# Further features, built in the next section:
# * Length of the url
# * Length of domain
# * is IP?
# * Number of .com

# Length of the full URL string.
# NOTE(review): the -2 offset looks like compensation for extra characters
# (e.g. surrounding quotes) in the raw file — TODO confirm against the CSV.
# The column name 'lenght' [sic] is kept: the serialized model expects it.
data['lenght'] = data.url.str.len() - 2

# The domain is the third '/'-separated token: 'http://domain/path' -> 'domain'.
domain = data.url.str.split('/', expand=True).iloc[:, 2]
data['lenght_domain'] = domain.str.len()

domain.head(12)

# A domain is a raw IP address when, after removing the literal dots, only
# digits remain. regex=False makes the dot literal on every pandas version:
# older pandas defaulted to regex=True, where '.' matched ANY character and
# this feature was silently always 0. The original "* 1" (a string no-op that
# just repeated each value once) is dropped.
data['isIP'] = domain.str.replace('.', '', regex=False).str.isnumeric().astype(int)

# How many times 'com' appears anywhere in the URL.
data['count_com'] = data.url.str.count('com')

data.sample(15, random_state=4)

# ### Create Model

# Features = everything except the raw URL and the label.
X = data.drop(['url', 'phishing'], axis=1)
y = data.phishing

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

clf = RandomForestClassifier(n_jobs=-1, n_estimators=100)

# 10-fold cross-validated accuracy, then fit on the full dataset.
cross_val_score(clf, X, y, cv=10)
clf.fit(X, y)

# ### Save model

# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23 — fall back to the standalone joblib package when it is gone.
try:
    from sklearn.externals import joblib
except ImportError:
    import joblib

joblib.dump(clf, '../datasets/model_deployment/07_phishing_clf.pkl', compress=3)

# ## Part 2: Model in batch
#
# See m07_model_deployment.py

from m07_model_deployment import predict_proba

predict_proba('http://www.vipturismolondres.com/com.br/?atendimento=Cliente&/LgSgkszm64/B8aNzHa8Aj.php')

# ## Part 3: API
#
# Flask is considered more Pythonic than Django because Flask web application
# code is in most cases more explicit. Flask is easy to get started with as a
# beginner because there is little boilerplate code for getting a simple app
# up and running.
# First we need to install some libraries
#
# ```
# pip install flask-restplus
# ```
#
# NOTE(review): flask-restplus is no longer maintained; flask-restx is its
# drop-in fork — consider migrating.

# Load Flask
from flask import Flask
from flask_restplus import Api, Resource, fields
import pandas as pd

# sklearn.externals.joblib was removed in scikit-learn 0.23 — fall back to the
# standalone joblib package when it is gone.
try:
    from sklearn.externals import joblib
except ImportError:
    import joblib

# Create api
app = Flask(__name__)

api = Api(
    app, version='1.0', title='Phishing Prediction API',
    description='Phishing Prediction API')

ns = api.namespace('predict', description='Phishing Classifier')

parser = api.parser()
parser.add_argument(
    'URL', type=str, required=True, help='URL to be analyzed', location='args')

resource_fields = api.model('Resource', {
    'result': fields.String,
})

# Load model and create function that predicts an URL
clf = joblib.load('../datasets/model_deployment/07_phishing_clf.pkl')


@ns.route('/')
class PhishingApi(Resource):
    """Scores a single URL with the phishing classifier via GET ?URL=..."""

    @api.doc(parser=parser)
    @api.marshal_with(resource_fields)
    def get(self):
        args = parser.parse_args()
        result = self.predict_proba(args)
        return result, 200

    def predict_proba(self, args):
        """Re-create the training features for one URL and return P(phishing)."""
        url = args['URL']
        url_ = pd.DataFrame([url], columns=['url'])

        # Create features — these MUST mirror the training pipeline exactly.
        keywords = ['https', 'login', '.php', '.html', '@', 'sign']
        for keyword in keywords:
            url_['keyword_' + keyword] = url_.url.str.contains(keyword).astype(int)

        # Column names 'lenght'/'lenght_domain' [sic] match the trained model
        # and must not be renamed.
        url_['lenght'] = url_.url.str.len() - 2
        domain = url_.url.str.split('/', expand=True).iloc[:, 2]
        url_['lenght_domain'] = domain.str.len()

        # BUG FIX: training derives isIP from the *domain*, but serving used
        # the full URL — which always contains non-digits such as 'http:' —
        # so isIP was always 0 at serve time. Compute it from the domain,
        # with a literal (regex=False) dot removal; the original "* 1"
        # string no-op is dropped.
        url_['isIP'] = domain.str.replace('.', '', regex=False).str.isnumeric().astype(int)

        url_['count_com'] = url_.url.str.count('com')

        # Make prediction: probability of the positive (phishing) class.
        p1 = clf.predict_proba(url_.drop('url', axis=1))[0, 1]
        print('url=', url, '| p1=', p1)

        return {
            "result": p1
        }


# Run API (use_reloader=False so it works from a notebook process).
app.run(debug=True, use_reloader=False, host='0.0.0.0', port=5000)

# Check using
#
# * http://localhost:5000/predict/?URL=http://consultoriojuridico.co/pp/www.paypal.com/