#!/usr/bin/env python
# coding: utf-8

# # Exercise 11
#
# ## Phishing Detection
#
# Phishing, by definition, is the act of defrauding an online user in order to
# obtain personal information by posing as a trustworthy institution or entity.
# Users usually have a hard time differentiating between legitimate and
# malicious sites because they are made to look exactly the same. Therefore,
# there is a need to create better tools to combat attackers.

# In[2]:

import zipfile

import pandas as pd

# Read the CSV directly out of the zip archive. Both the archive and the
# member stream are context-managed so neither handle is left open.
with zipfile.ZipFile('../datasets/phishing.csv.zip', 'r') as z:
    with z.open('phishing.csv') as f:
        data = pd.read_csv(f, index_col=False)
data.head()


# In[3]:

# Class balance of the target column.
data.phishing.value_counts()


# In[4]:

# Reproducible sample of URLs labelled as phishing, for eyeballing patterns.
data.url[data.phishing == 1].sample(50, random_state=1).tolist()


# In[5]:

# One binary feature per keyword: 1 if the URL contains it, else 0.
# regex=False makes this a literal substring test; otherwise the '.' in
# '.php' / '.html' would match any character.
keywords = ['https', 'login', '.php', '.html', '@', 'sign']
for keyword in keywords:
    data['keyword_' + keyword] = (
        data.url.str.contains(keyword, regex=False).astype(int)
    )


# In[6]:

# Column names keep their original spelling ('lenght') because later cells
# may refer to them.
# NOTE(review): the `- 2` offset is carried over unchanged; its intent is not
# evident from this file -- confirm before relying on the absolute value.
data['lenght'] = data.url.str.len() - 2

# Third '/'-separated piece of the URL; for 'scheme://host/path' that is the
# host part.
domain = data.url.str.split('/', expand=True).iloc[:, 2]
data['lenght_domain'] = domain.str.len()
domain.head(12)


# In[7]:

# A domain is flagged as an IP address when it is purely numeric once the
# dots are removed. regex=False strips literal dots regardless of the
# pandas version's default for `regex`.
data['isIP'] = (
    domain.str.replace('.', '', regex=False).str.isnumeric().astype(int)
)
# Number of occurrences of the substring 'com' anywhere in the URL.
data['count_com'] = data.url.str.count('com')
data.sample(15, random_state=4)


# In[8]:

# Feature matrix: every column except the raw URL and the label.
X = data.drop(['url', 'phishing'], axis=1)


# In[9]:

# Target vector.
y = data.phishing


# # Exercise 11.1
#
# Create 5 more features

# In[ ]:


# # Exercise 11.2
#
# * Standardize the features
# * Create a Linear SVM
#

# In[ ]:


# # Exercise 11.3
#
# Test the two SVMs using the different kernels ('poly', 'rbf', 'sigmoid')
#

# In[ ]:


# # Exercise 11.4
#
# Using the best SVM, find the parameters that give the best performance
#
# 'C': [0.1, 1, 10, 100, 1000], 'gamma': [0.01, 0.001, 0.0001]

# In[ ]:


# # Exercise 11.5
#
# Compare the results with other methods

# In[ ]: