## Demo: detect the encoding of an unknown byte string with chardet,
## then strip the non-ASCII characters.
## NOTE(review): Python 2 — a plain "..." literal is a byte string here.
import chardet # or use !file in POSIX
s = "hello world ǥood day. Let`Çhange the world!"
print chardet.detect(s)
s ## s as bytes
{'confidence': 0.7996636550693685, 'encoding': 'ISO-8859-2'}
'hello world \xc7\xa5ood day. Let`\xc3\x87hange the world!'
## decode the bytes using the suggested encoding
## encode as ascii with error = 'ignore'
## ('ignore' silently drops every character ASCII cannot represent)
clean_s = s.decode("ISO-8859-2").encode("ascii", "ignore")
clean_s
'hello world ood day. Let`hange the world!'
## Demo: greedy vs non-greedy regex quantifiers.
## .*  grabs as much as possible (spans both hrefs in one match);
## .*? stops at the first closing quote, yielding one URL per anchor.
import re
s = '<a href="http://www.google.com">google</a>, <a href="http://yahoo.com">Yahoo!</a>'
print re.findall(r'<a href="(.*)"', s)
print
print re.findall(r'<a href="(.*?)"', s)
['http://www.google.com">google</a>, <a href="http://yahoo.com'] ['http://www.google.com', 'http://yahoo.com']
## Notebook setup: inline plotting, common imports, quiet deprecation noise.
%matplotlib inline
import pylab as pl
import pandas as pd
import numpy as np
import tldextract
from sklearn.feature_extraction import text
from IPython.display import HTML, display
import warnings
warnings.filterwarnings(action = 'ignore', category = DeprecationWarning )
pl.rcParams["axes.grid"] = True
## alexa 100k domain list as the source of legit
## (the CSV has no header row, hence names=... and header=None)
alexa_df = pd.read_csv('data/alexa_100k.csv', names=['rank', 'uri'], header=None)
def extract_domain(uri):
    ## Pull the registered (second-level) domain out of a URI string.
    ## URIs with no recognized public suffix yield NaN, so those rows
    ## can be removed later with dropna().
    ## (imports kept local so the function is self-contained for map())
    import tldextract
    import numpy as np
    parts = tldextract.extract(uri.strip())
    return parts.domain if parts.suffix else np.nan
## Derive the bare domain for every Alexa URI and label the rows 'legit'.
## NOTE(review): Python 2 map() returns a list, so direct assignment works;
## under Python 3 this would need list(map(...)).
%time alexa_df['domain'] = map(extract_domain, alexa_df.uri)
alexa_df = alexa_df.dropna().drop_duplicates()
alexa_df['type'] = 'legit'
print alexa_df.shape
CPU times: user 1.18 s, sys: 19.4 ms, total: 1.2 s Wall time: 1.18 s (99836, 4)
alexa_df.head()
rank | uri | domain | type | |
---|---|---|---|---|
0 | 1 | facebook.com | facebook | legit |
1 | 2 | google.com | google | legit |
2 | 3 | youtube.com | youtube | legit |
3 | 4 | yahoo.com | yahoo | legit |
4 | 5 | baidu.com | baidu | legit |
5 rows × 4 columns
## Load the DGA (malware domain-generation-algorithm) list and keep only
## the leftmost label of each domain, lower-cased, as the feature column.
dga_df = pd.read_csv('data/dga_domains.txt', names = ['raw_domain'], header = None, encoding='utf-8')
dga_df['domain'] = map(lambda uri: uri.lower().split(".")[0].strip(), dga_df.raw_domain)
dga_df['type'] = 'dga'
dga_df = dga_df.dropna().drop_duplicates()
print dga_df.shape
(2669, 3)
dga_df.head()
raw_domain | domain | type | |
---|---|---|---|
0 | 04055051be412eea5a61b7da8438be3d.info | 04055051be412eea5a61b7da8438be3d | dga |
1 | 1cb8a5f36f.info | 1cb8a5f36f | dga |
2 | 30acd347397c34fc273e996b22951002.org | 30acd347397c34fc273e996b22951002 | dga |
3 | 336c986a284e2b3bc0f69f949cb437cb.info | 336c986a284e2b3bc0f69f949cb437cb | dga |
4 | 336c986a284e2b3bc0f69f949cb437cb.org | 336c986a284e2b3bc0f69f949cb437cb | dga |
5 rows × 3 columns
## all domains
## Stack legit + dga rows into one frame, then drop duplicates, missing
## values, and empty-string domains left over from extraction.
all_domains = pd.concat([alexa_df.loc[:, ["domain", "type"]],
dga_df.loc[:, ["domain", "type"]]], axis = 0, ignore_index = True)
all_domains = all_domains.drop_duplicates().dropna()
all_domains = all_domains[all_domains.domain != ""]
pd.concat([all_domains.head(n = 10), all_domains.tail(n = 10)], axis=0, )
domain | type | |
---|---|---|
0 | facebook | legit |
1 | google | legit |
2 | youtube | legit |
3 | yahoo | legit |
4 | baidu | legit |
5 | wikipedia | legit |
6 | amazon | legit |
7 | live | legit |
8 | legit | |
9 | taobao | legit |
102495 | xcfwwghb | dga |
102496 | xcgqdfyrkgihlrmfmfib | dga |
102497 | xclqwzcfcx | dga |
102498 | xcpfxzuf | dga |
102499 | xcvxhxze | dga |
102500 | xdbrbsbm | dga |
102501 | xdfjryydcfwvkvui | dga |
102502 | xdjlvcgw | dga |
102503 | xdrmjeu | dga |
102504 | xflrjyyjswoatsoq | dga |
20 rows × 2 columns
## spot useful features
## 1. length of words
## 2. entropy of words
## 3. bigrams? unigrams?
from collections import Counter
import math
def entropy(word):
    ## Shannon entropy (in bits) of the character distribution of `word`.
    ## DGA names tend to look random, so they score higher than real words.
    total = float(len(word))
    counts = Counter(word)
    return -sum((c / total) * math.log(c / total, 2) for c in counts.values())
## Add the two candidate features, then eyeball each one alone per class.
## (Python 2 map() returns a list, so direct column assignment works.)
all_domains['word_len'] = map(len, all_domains.domain)
all_domains['word_ent'] = map(entropy, all_domains.domain)
## NOT much distinguishing power
all_domains.boxplot(column="word_len", by = "type")
all_domains.boxplot(column="word_ent", by = "type")
<matplotlib.axes.AxesSubplot at 0x7ff502e70250>
## but how about joint features
## Scatter length vs. entropy per class: the two features together
## separate the populations better than either alone.
pl.figure(figsize=(10, 4))
dga = all_domains[all_domains.type=='dga']
alexa = all_domains[all_domains.type=='legit']
pl.scatter(alexa.word_len, alexa.word_ent, s = 10, c="#aaaaff", label = 'Alexa', alpha = .2)
pl.scatter(dga.word_len, dga.word_ent, s = 40, c = 'r', label = 'DGA', alpha = .3)
pl.legend(loc = 'best')
pl.xlabel("domain length")
pl.ylabel("domain entropy")
<matplotlib.text.Text at 0x7ff5006fa050>
## try these features with a model
## NOTE(review): sklearn.cross_validation was renamed model_selection in
## sklearn 0.18, and DataFrame.as_matrix was removed in pandas 1.0 —
## this cell only runs on the old versions this notebook targets.
from sklearn import ensemble
from sklearn import cross_validation
from sklearn import metrics
X = all_domains.as_matrix(["word_len", "word_ent"])
y = np.asarray(all_domains.type)
print X.shape, y.shape
## hold out 30% for the final test, 10-fold CV on the rest
train_X, test_X, train_y, test_y = cross_validation.train_test_split(X, y, test_size = 0.3)
print train_X.shape, train_y.shape
print test_X.shape, test_y.shape
model = ensemble.RandomForestClassifier(n_estimators=20)
cv = cross_validation.KFold(train_X.shape[0], n_folds=10)
## per-fold accuracy: refit on each fold's train split, score its test split
[np.mean(model.fit(train_X[train_index, :], train_y[train_index]).predict(train_X[test_index, :]) == train_y[test_index])
for train_index, test_index in cv]
(94365, 2) (94365,) (66055, 2) (66055,) (28310, 2) (28310,)
[0.97532546170148349, 0.97683923705722076, 0.97547683923705719, 0.97653648198607323, 0.97608234937935212, 0.97350492051476156, 0.9742619227857684, 0.97683573050719152, 0.97441332323996976, 0.97638152914458742]
## but ... because the imbalanced distribution in the target,
## a majority-class baseline already gives ~97% accuracy,
## so the model's accuracy alone is not impressive at all
print np.mean(test_y == model.fit(train_X, train_y).predict(test_X))
print np.mean(y == 'legit')
0.976616036736 0.971769194087