import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pylab
import sklearn.feature_extraction
sklearn.__version__
pd.__version__

# Set default pylab stuff
pylab.rcParams['figure.figsize'] = (14.0, 5.0)
pylab.rcParams['axes.grid'] = True

# Version 0.12.0 of Pandas has a DeprecationWarning (about Height) that we're ignoring
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# This is the Alexa 100k domain list; we're not using the full 1 Million just for speed reasons.
# Results for the Alexa 1M are given at the bottom of the notebook.
alexa_dataframe = pd.read_csv('data/alexa_100k.csv', names=['rank','uri'], header=None, encoding='utf-8')
alexa_dataframe.head()

# Okay, for this exercise we need the second-level domain (2LD) and nothing else
import tldextract

def domain_extract(uri):
    ext = tldextract.extract(uri)
    if not ext.suffix:
        return np.nan
    else:
        return ext.domain

alexa_dataframe['domain'] = [domain_extract(uri) for uri in alexa_dataframe['uri']]
del alexa_dataframe['rank']
del alexa_dataframe['uri']
alexa_dataframe.head()
alexa_dataframe.tail()

# It's possible we have NaNs from blank lines or whatever
alexa_dataframe = alexa_dataframe.dropna()
alexa_dataframe = alexa_dataframe.drop_duplicates()

# Set the class
alexa_dataframe['class'] = 'legit'

# Shuffle the data (important for training/testing)
alexa_dataframe = alexa_dataframe.reindex(np.random.permutation(alexa_dataframe.index))
alexa_total = alexa_dataframe.shape[0]
print 'Total Alexa domains %d' % alexa_total

# Hold out 10% (the slice index needs to be an integer)
hold_out_alexa = alexa_dataframe[int(alexa_total*.9):]
alexa_dataframe = alexa_dataframe[:int(alexa_total*.9)]
print 'Number of Alexa domains: %d' % alexa_dataframe.shape[0]
alexa_dataframe.head()

# Read in the DGA domains
dga_dataframe = pd.read_csv('data/dga_domains.txt', names=['raw_domain'], header=None, encoding='utf-8')

# We noticed that the blacklist values just differ by capitalization or .com/.org/.info
dga_dataframe['domain'] = dga_dataframe['raw_domain'].map(lambda x: x.split('.')[0].strip().lower())
del dga_dataframe['raw_domain']

# It's possible we have NaNs from blank lines or whatever
dga_dataframe = dga_dataframe.dropna()
dga_dataframe = dga_dataframe.drop_duplicates()
dga_total = dga_dataframe.shape[0]
print 'Total DGA domains %d' % dga_total

# Set the class
dga_dataframe['class'] = 'dga'

# Hold out 10%
hold_out_dga = dga_dataframe[int(dga_total*.9):]
dga_dataframe = dga_dataframe[:int(dga_total*.9)]
print 'Number of DGA domains: %d' % dga_dataframe.shape[0]
dga_dataframe.head()

# Concatenate the domains in a big pile!
all_domains = pd.concat([alexa_dataframe, dga_dataframe], ignore_index=True)

# Add a length field for the domain
all_domains['length'] = [len(x) for x in all_domains['domain']]

# Since we're trying to detect dynamically generated domains, and short domains
# (length <= 6) look random even when 'legit', we're going to punt on short
# domains (perhaps just whitelist/blacklist for short domains?)
all_domains = all_domains[all_domains['length'] > 6]

# Shannon entropy calculation, grabbed from Rosetta Code (rosettacode.org)
import math
from collections import Counter

def entropy(s):
    p, lns = Counter(s), float(len(s))
    return -sum(count/lns * math.log(count/lns, 2) for count in p.values())

# Add an entropy field for the domain
all_domains['entropy'] = [entropy(x) for x in all_domains['domain']]
all_domains.head()
all_domains.tail()
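# Quick sanity check of the entropy feature (a minimal sketch with two hand-picked
# examples): a name with few distinct, repeated characters scores low, while a
# random-looking string with a near-uniform character spread scores high.
print entropy('google')       # roughly 1.92 bits
print entropy('1cb8a5f36f')   # roughly 3.12 bits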
# Boxplots show you the distribution of the data (spread).
# http://en.wikipedia.org/wiki/Box_plot

# Plot the length and entropy of domains
all_domains.boxplot('length','class')
pylab.ylabel('Domain Length')
all_domains.boxplot('entropy','class')
pylab.ylabel('Domain Entropy')

# Split the classes up so we can set colors, size, labels
cond = all_domains['class'] == 'dga'
dga = all_domains[cond]
alexa = all_domains[~cond]
plt.scatter(alexa['length'], alexa['entropy'], s=140, c='#aaaaff', label='Alexa', alpha=.2)
plt.scatter(dga['length'], dga['entropy'], s=40, c='r', label='DGA', alpha=.3)
plt.legend()
pylab.xlabel('Domain Length')
pylab.ylabel('Domain Entropy')

# Below you can see that our DGA domains do tend to have higher entropy than Alexa on average.
# Let's look at the types of domains that have entropy higher than 4
high_entropy_domains = all_domains[all_domains['entropy'] > 4]
print 'Num Domains above 4 entropy: %.2f%% %d (out of %d)' % \
    (100.0*high_entropy_domains.shape[0]/all_domains.shape[0], high_entropy_domains.shape[0], all_domains.shape[0])
print "Num high entropy legit: %d" % high_entropy_domains[high_entropy_domains['class']=='legit'].shape[0]
print "Num high entropy DGA: %d" % high_entropy_domains[high_entropy_domains['class']=='dga'].shape[0]
high_entropy_domains[high_entropy_domains['class']=='legit'].head()

# Looking at the results below, we do see that there are more domains
# in the DGA group that are high entropy, but only a small percentage
# of the domains are in that high-entropy range...
high_entropy_domains[high_entropy_domains['class']=='dga'].head()

# In preparation for using scikit-learn we're just going to use
# some handles that help take us from pandas land to scikit land

# List of feature vectors (scikit-learn uses 'X' for the matrix of feature vectors)
X = all_domains.as_matrix(['length', 'entropy'])

# Labels (scikit-learn uses 'y' for classification labels)
y = np.array(all_domains['class'].tolist())  # Yes, this is weird, but it needs
                                             # to be an np.array of strings

# Random Forest is a popular ensemble machine learning classifier.
# http://scikit-learn.org/dev/modules/generated/sklearn.ensemble.RandomForestClassifier.html
import sklearn.ensemble
clf = sklearn.ensemble.RandomForestClassifier(n_estimators=20)  # Trees in the forest
# (feature_importances_ is available after fit; no extra flag needed)

# Now we can use scikit-learn's cross validation to assess predictive performance.
import sklearn.cross_validation
scores = sklearn.cross_validation.cross_val_score(clf, X, y, cv=5, n_jobs=4)
print scores

# Wow, 96% accurate! At this point we could claim success and we'd be gigantic morons...
# Recall that we have ~100k 'legit' domains and only ~3.5k DGA domains,
# so a classifier that marked everything as legit would be about
# 96% accurate....
# So we dive in a bit and look at the predictive performance more deeply.
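# To make the imbalance point concrete, a minimal sketch of the majority-class
# baseline: a degenerate classifier that always answers 'legit' already scores ~96%.
baseline = 100.0 * (y == 'legit').sum() / float(len(y))
print 'Always-predict-legit baseline: %.2f%%' % baseline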
# Train on an 80/20 split
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Now plot the results of the 80/20 split in a confusion matrix
from sklearn.metrics import confusion_matrix
labels = ['legit', 'dga']
cm = confusion_matrix(y_test, y_pred, labels)

def plot_cm(cm, labels):
    # Compute percentages
    percent = (cm*100.0)/np.array(np.matrix(cm.sum(axis=1)).T)  # Derp, I'm sure there's a better way

    print 'Confusion Matrix Stats'
    for i, label_i in enumerate(labels):
        for j, label_j in enumerate(labels):
            print "%s/%s: %.2f%% (%d/%d)" % (label_i, label_j, (percent[i][j]), cm[i][j], cm[i].sum())

    # Show confusion matrix
    # Thanks kermit666 from stackoverflow :)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.grid(b=False)
    cax = ax.matshow(percent, cmap='coolwarm')
    pylab.title('Confusion matrix of the classifier')
    fig.colorbar(cax)
    ax.set_xticklabels([''] + labels)
    ax.set_yticklabels([''] + labels)
    pylab.xlabel('Predicted')
    pylab.ylabel('True')
    pylab.show()

plot_cm(cm, labels)

# We can see below that our suspicions were correct and the classifier is
# marking almost everything as Alexa. We FAIL.. science is hard... let's go drinking....

# Well, our Mom told us we were still cool.. so with that encouragement we're
# going to compute NGrams for every Alexa domain and see if we can use the
# NGrams to help us better differentiate and mark DGA domains...

# Scikit-learn has a nice NGram generator that can generate either char NGrams or word NGrams (we're using char).
# Parameters:
#  - ngram_range=(3,5)  # Give me all ngrams of length 3, 4, and 5
#  - min_df=1e-4        # Minimum document frequency. At 1e-4 we're saying give us NGrams that
#                       # appear in at least 0.01% of the domains (so for ~100k domains, at least ~10 of them)
alexa_vc = sklearn.feature_extraction.text.CountVectorizer(analyzer='char', ngram_range=(3,5), min_df=1e-4, max_df=1.0)

# I'm SURE there's a better way to store all the counts, but not sure what it is...
# At least the min_df parameter has already done some thresholding
counts_matrix = alexa_vc.fit_transform(alexa_dataframe['domain'])
alexa_counts = np.log10(counts_matrix.sum(axis=0).getA1())
ngrams_list = alexa_vc.get_feature_names()

# For fun sort it and show it
import operator
_sorted_ngrams = sorted(zip(ngrams_list, alexa_counts), key=operator.itemgetter(1), reverse=True)
print 'Alexa NGrams: %d' % len(_sorted_ngrams)
for ngram, count in _sorted_ngrams[:10]:
    print ngram, count

# We're also going to throw in a bunch of dictionary words
word_dataframe = pd.read_csv('data/words.txt', names=['word'], header=None, dtype={'word': np.str}, encoding='utf-8')

# Cleanup words from the dictionary
word_dataframe = word_dataframe[word_dataframe['word'].map(lambda x: str(x).isalpha())]
word_dataframe = word_dataframe.applymap(lambda x: str(x).strip().lower())
word_dataframe = word_dataframe.dropna()
word_dataframe = word_dataframe.drop_duplicates()
word_dataframe.head(10)
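# Before doing the same for dictionary words, a quick peek at what the char NGram
# analyzer actually produces for one domain (a minimal sketch; build_analyzer
# hands back the same tokenizer the vectorizer uses internally).
analyze = alexa_vc.build_analyzer()
print analyze('google')  # ['goo', 'oog', 'ogl', 'gle', 'goog', 'oogl', 'ogle', 'googl', 'oogle']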
# Now compute NGrams on the dictionary words
# Same logic as above...
dict_vc = sklearn.feature_extraction.text.CountVectorizer(analyzer='char', ngram_range=(3,5), min_df=1e-5, max_df=1.0)
counts_matrix = dict_vc.fit_transform(word_dataframe['word'])
dict_counts = np.log10(counts_matrix.sum(axis=0).getA1())
ngrams_list = dict_vc.get_feature_names()

# For fun sort it and show it
import operator
_sorted_ngrams = sorted(zip(ngrams_list, dict_counts), key=operator.itemgetter(1), reverse=True)
print 'Word NGrams: %d' % len(_sorted_ngrams)
for ngram, count in _sorted_ngrams[:10]:
    print ngram, count

# We use the transform method of the CountVectorizer to form a vector of the
# ngrams contained in the domain; that vector is then multiplied by the counts
# vector (which is a column sum of the count matrix).
def ngram_count(domain):
    alexa_match = alexa_counts * alexa_vc.transform([domain]).T  # Woot: vector multiply and transpose, Woo Hoo!
    dict_match = dict_counts * dict_vc.transform([domain]).T
    print '%s Alexa match:%d Dict match: %d' % (domain, alexa_match, dict_match)

# Examples:
ngram_count('google')
ngram_count('facebook')
ngram_count('1cb8a5f36f')
ngram_count('pterodactylfarts')
ngram_count('ptes9dro-dwacty2lfa5rrts')
ngram_count('beyonce')
ngram_count('bey666on4ce')

# Compute NGram matches for all the domains and add to our dataframe
all_domains['alexa_grams'] = alexa_counts * alexa_vc.transform(all_domains['domain']).T
all_domains['word_grams'] = dict_counts * dict_vc.transform(all_domains['domain']).T
all_domains.head()
all_domains.tail()

# Use the vectorized operations of the dataframe to investigate differences
# between the alexa and word grams
all_domains['diff'] = all_domains['alexa_grams'] - all_domains['word_grams']

# The table below shows those domain names that are more 'dictionary' and less 'web'
all_domains.sort(['diff'], ascending=True).head(10)

# The table below shows those domain names that are more 'web' and less 'dictionary'
# Good ol' web....
all_domains.sort(['diff'], ascending=False).head(50)

# Let's plot some stuff!
# Here we want to see whether our new 'alexa_grams' feature can help us differentiate between Legit/DGA
cond = all_domains['class'] == 'dga'
dga = all_domains[cond]
legit = all_domains[~cond]
plt.scatter(legit['length'], legit['alexa_grams'], s=120, c='#aaaaff', label='Alexa', alpha=.1)
plt.scatter(dga['length'], dga['alexa_grams'], s=40, c='r', label='DGA', alpha=.3)
plt.legend()
pylab.xlabel('Domain Length')
pylab.ylabel('Alexa NGram Matches')

# Same question, but plotted against entropy instead of length
plt.scatter(legit['entropy'], legit['alexa_grams'], s=120, c='#aaaaff', label='Alexa', alpha=.2)
plt.scatter(dga['entropy'], dga['alexa_grams'], s=40, c='r', label='DGA', alpha=.3)
plt.legend()
pylab.xlabel('Domain Entropy')
pylab.ylabel('Alexa NGram Matches')
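# Backing up to the scoring trick used in ngram_count above: the matrix multiply
# is just a weighted sum of log10 Alexa ngram frequencies over the ngrams present
# in the domain. A minimal sanity-check sketch that does the sum by hand:
vec = alexa_vc.transform(['google']).toarray()[0]
manual_score = sum(alexa_counts[i] * vec[i] for i in np.nonzero(vec)[0])
print manual_score  # should equal alexa_counts * alexa_vc.transform(['google']).T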
# Let's plot some stuff!
# Here we want to see whether our new 'word_grams' feature can help us differentiate between Legit/DGA.
# Note: It doesn't look quite as good as the alexa_grams, but it might generalize better (less overfit).
cond = all_domains['class'] == 'dga'
dga = all_domains[cond]
legit = all_domains[~cond]
plt.scatter(legit['length'], legit['word_grams'], s=120, c='#aaaaff', label='Alexa', alpha=.2)
plt.scatter(dga['length'], dga['word_grams'], s=40, c='r', label='DGA', alpha=.3)
plt.legend()
pylab.xlabel('Domain Length')
pylab.ylabel('Dictionary NGram Matches')

# Let's look at which Legit domains are scoring low on the word gram count
all_domains[(all_domains['word_grams']==0)].head()

# Okay, these look kinda weird, so let's use some nice Pandas functionality
# to look at some statistics around our new features.
all_domains[all_domains['class']=='legit'].describe()

# Let's look at how many domains are low in both word_grams and alexa_grams
# (just plotting the max of either)
legit = all_domains[(all_domains['class']=='legit')]
max_grams = np.maximum(legit['alexa_grams'], legit['word_grams'])
ax = max_grams.hist(bins=80)
ax.figure.suptitle('Histogram of the Max NGram Score for Domains')
pylab.xlabel('Maximum NGram Score')
pylab.ylabel('Number of Domains')

# Let's look at which Legit domains are scoring low on both the alexa and word gram counts
weird_cond = (all_domains['class']=='legit') & (all_domains['word_grams']<3) & (all_domains['alexa_grams']<2)
weird = all_domains[weird_cond]
print weird.shape[0]
weird.head(30)

# Epiphany... Alexa really may not be the best 'exemplar' set...
# (probably a no-shit moment for everyone else :)
#
# Discussion: If you're using these as exemplars of NOT DGA, then you're probably
# making things very hard on your machine learning algorithm.
# Perhaps we should have two categories of Alexa domains, 'legit'
# and 'weird', based on some definition of weird.
# Looking at the entries above, we have approximately 80 domains
# that we're going to mark as 'weird'.
all_domains.loc[weird_cond, 'class'] = 'weird'
print all_domains['class'].value_counts()
all_domains[all_domains['class'] == 'weird'].head()

# Now we try our machine learning algorithm again with the new features
# (Alexa and Dictionary NGrams) and the exclusion of the bad exemplars.
X = all_domains.as_matrix(['length', 'entropy', 'alexa_grams', 'word_grams'])

# Labels (scikit-learn uses 'y' for classification labels)
y = np.array(all_domains['class'].tolist())

# Train on an 80/20 split
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Now plot the results of the 80/20 split in a confusion matrix
labels = ['legit', 'weird', 'dga']
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)

# Huh, well that seems to work 'ok', but you don't really want a classifier
# that outputs 3 classes; you'd like a classifier that flags domains as DGA or not.
# This was a path that seemed like a good idea until it wasn't....
# Perhaps we will just exclude the weird class from our ML training
not_weird = all_domains[all_domains['class'] != 'weird']
X = not_weird.as_matrix(['length', 'entropy', 'alexa_grams', 'word_grams'])

# Labels (scikit-learn uses 'y' for classification labels)
y = np.array(not_weird['class'].tolist())

# Train on an 80/20 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Now plot the results of the 80/20 split in a confusion matrix
labels = ['legit', 'dga']
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)
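# (Optional, a minimal sketch) Mirroring the 5-fold cross validation from earlier,
# but with the new 4-feature set, makes for a direct comparison against the
# length/entropy-only scores above.
scores = sklearn.cross_validation.cross_val_score(clf, X, y, cv=5, n_jobs=4)
print scores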
# Well, it's definitely better.. but haven't we just cheated by removing
# the weird domains? Well, perhaps, but on some level we're removing
# outliers that are bad exemplars. So to validate that the model is still
# doing the right thing, let's try our new model's predictions on our hold-out sets.

# First train on the whole thing before looking at prediction performance
clf.fit(X, y)

# Pull together our hold-out set
hold_out_domains = pd.concat([hold_out_alexa, hold_out_dga], ignore_index=True)

# Add a length field for the domain
hold_out_domains['length'] = [len(x) for x in hold_out_domains['domain']]
hold_out_domains = hold_out_domains[hold_out_domains['length'] > 6]

# Add an entropy field for the domain
hold_out_domains['entropy'] = [entropy(x) for x in hold_out_domains['domain']]

# Compute NGram matches for all the domains and add to our dataframe
hold_out_domains['alexa_grams'] = alexa_counts * alexa_vc.transform(hold_out_domains['domain']).T
hold_out_domains['word_grams'] = dict_counts * dict_vc.transform(hold_out_domains['domain']).T
hold_out_domains.head()

# List of feature vectors (scikit-learn uses 'X' for the matrix of feature vectors)
hold_X = hold_out_domains.as_matrix(['length', 'entropy', 'alexa_grams', 'word_grams'])

# Labels (scikit-learn uses 'y' for classification labels)
hold_y = np.array(hold_out_domains['class'].tolist())

# Now run through the predictive model
hold_y_pred = clf.predict(hold_X)

# Add the prediction array to the dataframe
hold_out_domains['pred'] = hold_y_pred

# Now plot the results
labels = ['legit', 'dga']
cm = confusion_matrix(hold_y, hold_y_pred, labels)
plot_cm(cm, labels)

# Okay, so on our 10% hold-out set of ~10k domains, about ~100 domains were misclassified.
# At this point we've made some good progress, so we're going to claim success :)
#  - Out of 10k domains, ~100 were mismarked
#  - false positives (Alexa marked as DGA) = ~0.6%
#  - about 80% of the DGA are getting marked
# Note: Alexa 1M results on the 10% hold out (100k domains) were in the same ballpark
#  - Out of 100k domains, 432 were mismarked
#  - false positives (Alexa marked as DGA) = 0.4%
#  - about 70% of the DGA are getting marked

# Now we're going to do some post-analysis on how the ML algorithm performed.
# Let's look at a couple of plots to see which domains were misclassified.

# Looking at Length vs. Alexa NGrams
fp_cond = (hold_out_domains['class'] == 'legit') & (hold_out_domains['pred'] == 'dga')
fp = hold_out_domains[fp_cond]
fn_cond = (hold_out_domains['class'] == 'dga') & (hold_out_domains['pred'] == 'legit')
fn = hold_out_domains[fn_cond]
okay = hold_out_domains[hold_out_domains['class'] == hold_out_domains['pred']]
plt.scatter(okay['length'], okay['alexa_grams'], s=100, c='#aaaaff', label='Okay', alpha=.1)
plt.scatter(fp['length'], fp['alexa_grams'], s=40, c='r', label='False Positive', alpha=.5)
plt.legend()
pylab.xlabel('Domain Length')
pylab.ylabel('Alexa NGram Matches')
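# To put precise numbers behind the hold-out summary above, sklearn's
# classification_report gives per-class precision/recall in one shot
# (a minimal sketch).
from sklearn.metrics import classification_report
print classification_report(hold_y, hold_y_pred)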
# Looking at Length vs. Dictionary NGrams
cond = (hold_out_domains['class'] != hold_out_domains['pred'])
misclassified = hold_out_domains[cond]
okay = hold_out_domains[~cond]
plt.scatter(okay['length'], okay['word_grams'], s=100, c='#aaaaff', label='Okay', alpha=.2)
plt.scatter(misclassified['length'], misclassified['word_grams'], s=40, c='r', label='Misclassified', alpha=.3)
plt.legend()
pylab.xlabel('Domain Length')
pylab.ylabel('Dictionary NGram Matches')
misclassified.head()
misclassified[misclassified['class'] == 'dga'].head()

# We can also look at which features the learning algorithm thought were the most important
importances = zip(['length', 'entropy', 'alexa_grams', 'word_grams'], clf.feature_importances_)
importances

# The list below shows our feature importance scores. There's a lot of feature selection,
# sensitivity-study, etc. work that you could do at this point if you wanted.

# Discussion of how to use the resulting models.
# Typically machine learning comes in two phases:
#  - Training of the model
#  - Evaluation of new observations against the model
# This notebook is about exploration of the data and training the model.
# After you have a model that you are satisfied with, just 'pickle' it
# at the end of your training script, and then in a separate
# evaluation script 'unpickle' it and evaluate/score new observations
# coming in (through a file, or ZeroMQ, or whatever...)
#
# In this case we'd have to pickle the RandomForest classifier
# and the two vectorizing transforms (alexa_grams and word_grams).
# See 'test_it' below for how to use them in evaluation mode.

# test_it shows how to do evaluation; it's also fun for manual testing below :)
def test_it(domain):
    _alexa_match = alexa_counts * alexa_vc.transform([domain]).T  # Woot: matrix multiply and transpose, Woo Hoo!
    _dict_match = dict_counts * dict_vc.transform([domain]).T
    _X = [len(domain), entropy(domain), _alexa_match, _dict_match]
    print '%s : %s' % (domain, clf.predict(_X)[0])

# Examples (feel free to change these and see the results!)
test_it('google')
test_it('google88')
test_it('facebook')
test_it('1cb8a5f36f')
test_it('pterodactylfarts')
test_it('ptes9dro-dwacty2lfa5rrts')
test_it('beyonce')
test_it('bey666on4ce')
test_it('supersexy')
test_it('yourmomissohotinthesummertime')
test_it('35-sdf-09jq43r')
test_it('clicksecurity')
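# Following up on the pickling discussion above, a minimal sketch of the save/load
# step (the filename 'dga_model.pkl' is just an example). We bundle the classifier
# with both vectorizers and their log-count vectors, since test_it needs all five
# pieces to score a new domain.
import pickle
model = {'clf': clf,
         'alexa_vc': alexa_vc, 'alexa_counts': alexa_counts,
         'dict_vc': dict_vc, 'dict_counts': dict_counts}
with open('dga_model.pkl', 'wb') as f:
    pickle.dump(model, f)

# ...and then in the separate evaluation script:
with open('dga_model.pkl', 'rb') as f:
    model = pickle.load(f)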