#!/usr/bin/env python
# coding: utf-8

# # Exercise 11
# 
# 
# ## Phishing Detection
# 
# Phishing, by definition, is the act of defrauding an online user in order to obtain personal information by posing as a trustworthy institution or entity. Users usually have a hard time differentiating between legitimate and malicious sites because they are made to look exactly the same. Therefore, there is a need to create better tools to combat attackers.

# In[2]:


import pandas as pd
import zipfile
# Read the phishing dataset straight out of the zip archive. Both the archive
# and the member stream are context-managed, so each handle is closed as soon
# as the CSV has been parsed.
with zipfile.ZipFile('../datasets/phishing.csv.zip', 'r') as z:
    with z.open('phishing.csv') as f:
        # index_col=False: do not use the first CSV column as the index.
        data = pd.read_csv(f, index_col=False)
data.head()


# In[3]:


# Class balance: number of rows for each value of the 'phishing' label.
data['phishing'].value_counts()


# In[4]:


# Inspect a reproducible random sample of 50 URLs labelled as phishing.
data.loc[data['phishing'] == 1, 'url'].sample(n=50, random_state=1).tolist()


# In[5]:


# Binary indicator features: 1 when the URL contains the token, else 0.
keywords = ['https', 'login', '.php', '.html', '@', 'sign']
for keyword in keywords:
    # regex=False matches the token literally. With the default regex
    # matching, the leading '.' in '.php' / '.html' would match any character.
    data['keyword_' + keyword] = data.url.str.contains(keyword, regex=False).astype(int)


# In[6]:


# Length-based features. The column names 'lenght' / 'lenght_domain' (sic)
# are kept exactly as they are.
data['lenght'] = data['url'].str.len() - 2
# Splitting on '/' puts the host at position 2, assuming URLs of the form
# 'scheme://host/...'.
domain = (
    data['url']
    .str.split('/', expand=True)
    .iloc[:, 2]
)
data['lenght_domain'] = domain.str.len()
domain.head(12)


# In[7]:


# isIP: 1 when the domain consists only of numeric characters once its dots
# are removed (e.g. '192.168.0.1'), else 0.
# regex=False makes the '.' a literal dot explicitly, instead of relying on
# the pandas version's handling of single-character patterns.
data['isIP'] = domain.str.replace('.', '', regex=False).str.isnumeric().astype(int)
# Number of occurrences of the substring 'com' anywhere in the URL.
data['count_com'] = data.url.str.count('com')
data.sample(15, random_state=4)


# In[8]:


# Feature matrix: every column except the raw URL text and the target label.
X = data.drop(columns=['url', 'phishing'])


# In[9]:


# Target vector: the 'phishing' label column.
y = data['phishing']


# # Exercise 11.1
# 
# Create 5 more features

# In[ ]:


# # Exercise 11.2
# 
# * Standardize the features
# * Create a Linear SVM
# 

# In[ ]:


# # Exercise 11.3
# 
# Test the two SVMs using the different kernels (‘poly’, ‘rbf’, ‘sigmoid’)
# 

# In[ ]:


# # Exercise 11.4
# 
# Using the best SVM, find the parameters that give the best performance
# 
# 'C': [0.1, 1, 10, 100, 1000], 'gamma': [0.01, 0.001, 0.0001]

# In[ ]:


# # Exercise 11.5
# 
# Compare the results with other methods

# In[ ]: