Approach:
import sklearn.feature_extraction
sklearn.__version__
'0.14.1'
import pandas as pd
pd.__version__
'0.13.1'
# Plotting defaults
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['font.size'] = 18.0
plt.rcParams['figure.figsize'] = 12.0, 5.0
# A plotting helper method
def plot_it(df,label_x,label_y):
    """Bar-plot the transposed frame on a styled axis and return the axis.

    df: DataFrame whose transpose is drawn as a log-x bar chart.
    label_x, label_y: axis label strings.
    Returns the matplotlib Axes so callers can tweak limits afterwards.
    """
    style = {'axisbg': '#EEEEE5'}  # beige background (pre-2.0 matplotlib kwarg)
    fig, ax = plt.subplots(subplot_kw=style)
    ax.grid(color='grey', linestyle='solid')
    df.T.plot(kind='bar', logx=True, rot=0, ax=ax, colormap='PiYG')
    ax.legend(loc=0, prop={'size':14})
    plt.xlabel(label_x)
    plt.ylabel(label_y)
    # Stretch the axes to fill the entire figure canvas.
    plt.subplots_adjust(left=0, right=1, bottom=0, top=1, wspace=0, hspace=0)
    return ax
# Read in a set of SQL statements from various sources
import os

# Each file under data/ holds one SQL statement per line; the filename stem
# ('legit.*' vs anything else) supplies the class label.
basedir = 'data'
filelist = os.listdir(basedir)
df_list = []
for file in filelist:  # NOTE(review): 'file' shadows the Python builtin
    # sep='|||' is a multi-char separator chosen so each whole line lands in
    # one 'raw_sql' column (pandas falls back to the python parser engine).
    df = pd.read_csv(os.path.join(basedir,file), sep='|||', names=['raw_sql'], header=None)
    # Files named 'legit.*' are benign samples; everything else is malicious.
    df['type'] = 'legit' if file.split('.')[0] == 'legit' else 'malicious'
    df_list.append(df)
# Combine all sources into one frame and drop unparseable (NaN) rows.
dataframe = pd.concat(df_list, ignore_index=True)
dataframe.dropna(inplace=True)
print dataframe['type'].value_counts()
dataframe.head()
malicious 12892 legit 1003 dtype: int64
raw_sql | type | |
---|---|---|
0 | '; exec master..xp_cmdshell 'ping 10.10.1.2'-- | malicious |
1 | create user name identified by 'pass123' | malicious |
2 | create user name identified by pass123 tempora... | malicious |
3 | exec sp_addlogin 'name' , 'password' | malicious |
4 | exec sp_addsrvrolemember 'name' , 'sysadmin' | malicious |
5 rows × 2 columns
!which python
/usr/bin/python
# Use the SQLParse module: sqlparse is a non-validating SQL parser module for Python
# https://github.com/andialbrecht/sqlparse
import sqlparse
def parse_it(raw_sql):
parsed = sqlparse.parse(unicode(raw_sql,'utf-8'))
return [token._get_repr_name() for parse in parsed for token in parse.tokens if token._get_repr_name() != 'Whitespace']
dataframe['parsed_sql'] = dataframe['raw_sql'].map(lambda x: parse_it(x))
dataframe.head()
raw_sql | type | parsed_sql | |
---|---|---|---|
0 | '; exec master..xp_cmdshell 'ping 10.10.1.2'-- | malicious | [Single, Identifier, Float, Float, Float, Erro... |
1 | create user name identified by 'pass123' | malicious | [DDL, Keyword, Identifier, Keyword, Single] |
2 | create user name identified by pass123 tempora... | malicious | [DDL, Keyword, Identifier, Keyword, Identifier... |
3 | exec sp_addlogin 'name' , 'password' | malicious | [Keyword, Identifier, IdentifierList] |
4 | exec sp_addsrvrolemember 'name' , 'sysadmin' | malicious | [Keyword, Identifier, IdentifierList] |
5 rows × 3 columns
# Looking at the SQL tokens is 'kinda' interesting but sequences of tokens and transitions
# between tokens seems more meaningful so we're also going to compute sequences by
# computing NGrams for every SQL statement...
def ngrams(lst, N):
    """Return every n-gram of lst for n = 1..N, as stringified tuples.

    e.g. ngrams(['a','b'], 2) -> ["('a',)", "('b',)", "('a', 'b')"]
    An N larger than len(lst) simply contributes no extra grams.

    Fixes vs. original: 'xrange' was Python-2-only ('range' is equivalent
    here), and the locals 'ngrams'/'tuple' shadowed the function itself and
    the builtin.
    """
    grams = []
    for size in range(1, N + 1):
        # zip over `size` staggered slices yields every run of that length.
        grams.extend(zip(*(lst[i:] for i in range(size))))
    # Stringify so the grams are printable/hashable feature names.
    return [str(gram) for gram in grams]
dataframe['sequences'] = dataframe['parsed_sql'].map(lambda x: ngrams(x, 3))
# Helper method
def token_expansion(series, types):
    """Flatten per-statement token lists into parallel (token, type) Series.

    series: iterable of token lists (one list per SQL statement).
    types: iterable of class labels aligned with `series`.
    Returns two pandas Series: every token, and the label of the statement
    it came from (label repeated once per token).

    Fix vs. original: empty input made zip(*[]) unpack raise ValueError;
    now returns a pair of empty Series instead.
    """
    pairs = [(token, token_type)
             for t_list, token_type in zip(series, types)
             for token in t_list]
    if not pairs:
        return pd.Series([], dtype=object), pd.Series([], dtype=object)
    _tokens, _types = zip(*pairs)
    return pd.Series(_tokens), pd.Series(_types)
dataframe['sequences']
0 [('Single',), ('Identifier',), ('Float',), ('F... 1 [('DDL',), ('Keyword',), ('Identifier',), ('Ke... 2 [('DDL',), ('Keyword',), ('Identifier',), ('Ke... 3 [('Keyword',), ('Identifier',), ('IdentifierLi... 4 [('Keyword',), ('Identifier',), ('IdentifierLi... 5 [('DML',), ('Keyword',), ('Identifier',), ('Ke... 6 [('Keyword',), ('Keyword',), ('Keyword',), ('I... 7 [('DML',), ('Keyword',), ('Function',), ('Keyw... 8 [('Integer',), ('Error',), ('Integer',), ('Int... 9 [('Integer',), ('Keyword',), ('Comparison',), ... 10 [('Integer',), ('Single',), ('Integer',), ('Si... 11 [('Integer',), ('Keyword',), ('Function',), ('... 12 [('Error',), ('Error',), ('Punctuation',), ('O... 13 [('Integer',), ('Error',), ('Error',), ('Integ... 14 [('Integer',), ('Single',), ('Integer',), ('In... ... 13881 [('DML',), ('Identifier',), ('Keyword',), ('Co... 13882 [('DML',), ('Identifier',), ('Keyword',), ('Co... 13883 [('DML',), ('Identifier',), ('Keyword',), ('Co... 13884 [('DML',), ('Identifier',), ('Keyword',), ('Co... 13885 [('DML',), ('Identifier',), ('Keyword',), ('Id... 13886 [('DML',), ('Identifier',), ('Keyword',), ('Co... 13887 [('Identifier',), ('Builtin',), ('Identifier',... 13888 [('Identifier',), ('Function',), ('Identifier'... 13889 [('Identifier',), ('Function',), ('Identifier'... 13890 [('Identifier',), ('Function',), ('Identifier'... 13891 [('Identifier',), ('Function',), ('Identifier'... 13892 [('Identifier',), ('Function',), ('Identifier'... 13893 [('Identifier',), ('Function',), ('Identifier'... 13894 [('Keyword',), ('Identifier',), ('Function',),... 13895 [('Keyword',), ('IdentifierList',), ('DML',), ... Name: sequences, Length: 13895, dtype: object
# The data hacking repository has a simple stats module we're going to use
import data_hacking.simple_stats as ss
# Spin up our g_test class
g_test = ss.GTest()
# Here we'd like to see how various sql tokens and transitions are related.
# Is there an association with particular token sets and malicious SQL statements.
tokens, types = token_expansion(dataframe['sequences'], dataframe['type'])
df_ct, df_cd, df_stats = g_test.highest_gtest_scores(tokens, types, matches=0, N=0)
df_stats.sort('malicious_g', ascending=0).head(10)
# The table below shows raw counts, conditional distributions, expected counts, and g-test score.
legit | malicious | legit_cd | malicious_cd | total_cd | legit_exp | legit_g | malicious_exp | malicious_g | |
---|---|---|---|---|---|---|---|---|---|
('Single',) | 7 | 10984 | 0.000637 | 0.999363 | 10991 | 1121.927951 | -71.076512 | 9869.072049 | 2351.319327 |
('Single', 'Identifier') | 0 | 8309 | 0.000000 | 1.000000 | 8309 | 848.157524 | 0.000000 | 7460.842476 | 1789.275422 |
('Punctuation',) | 152 | 7707 | 0.019341 | 0.980659 | 7859 | 802.222889 | -505.705813 | 7056.777111 | 1358.598582 |
('Identifier',) | 1284 | 17011 | 0.070183 | 0.929817 | 18295 | 1867.498123 | -962.022691 | 16427.501877 | 1187.480739 |
('Identifier', 'Single') | 2 | 4222 | 0.000473 | 0.999527 | 4224 | 431.173111 | -21.493450 | 3792.826889 | 905.174233 |
('Single', 'Identifier', 'Single') | 0 | 4170 | 0.000000 | 1.000000 | 4170 | 425.660955 | 0.000000 | 3744.339045 | 897.975510 |
('Identifier', 'Single', 'Identifier') | 0 | 4162 | 0.000000 | 1.000000 | 4162 | 424.844339 | 0.000000 | 3737.155661 | 896.252775 |
('Identifier', 'Identifier') | 4 | 3957 | 0.001010 | 0.998990 | 3961 | 404.326869 | -36.927434 | 3556.673131 | 844.111737 |
('Keyword', 'Keyword', 'DML') | 18 | 3248 | 0.005511 | 0.994489 | 3266 | 333.383376 | -105.081169 | 2932.616624 | 663.529712 |
('Keyword', 'DML', 'IdentifierList') | 28 | 3157 | 0.008791 | 0.991209 | 3185 | 325.115142 | -137.310594 | 2859.884858 | 624.081095 |
10 rows × 9 columns
# Now plot the head() and the tail() of the dataframe to see who's been naughty
# or nice (head = ngrams most associated with malicious SQL, tail = most legit).
sorted_df = df_stats.sort('malicious_g', ascending=0)  # NOTE(review): DataFrame.sort was removed in later pandas (sort_values)
naughty = sorted_df.head(7)
nice = sorted_df.tail(7).sort('malicious_g', ascending=0)
naughty_and_nice = pd.concat([naughty, nice])
ax = plot_it(naughty_and_nice[['malicious_g']],'SQL Command Types','G-Test Scores')
ax.set_xlim(.2, 1.4)
(0.2, 1.4)
# Documentation in sqlparse for the mapping can be found here:
# https://github.com/andialbrecht/sqlparse/blob/master/sqlparse/keywords.py
# or here
# https://github.com/andialbrecht/sqlparse/blob/master/sqlparse/lexer.py
# Here we look at example of the SQL sequence that G-Test has indicated are good
# indicators of SQL injections.
dataframe[dataframe['sequences'].map(lambda x: "('Single', 'Identifier')" in x)].head()
raw_sql | type | parsed_sql | sequences | |
---|---|---|---|---|
0 | '; exec master..xp_cmdshell 'ping 10.10.1.2'-- | malicious | [Single, Identifier, Float, Float, Float, Erro... | [('Single',), ('Identifier',), ('Float',), ('F... |
44 | anything' or 'x'='x | malicious | [Identifier, Single, Identifier, Single, Ident... | [('Identifier',), ('Single',), ('Identifier',)... |
49 | '; exec master..xp_cmdshell 'ping aaa.bbb.ccc.... | malicious | [Single, Identifier, Error, Single] | [('Single',), ('Identifier',), ('Error',), ('S... |
54 | '; if not(select system_user) <> 'sa' waitfor ... | malicious | [Single, Identifier, Single, Integer, Placehol... | [('Single',), ('Identifier',), ('Single',), ('... |
55 | '; if is_srvrolemember('sysadmin') > 0 waitfor... | malicious | [Single, Identifier, Single, Integer, Placehol... | [('Single',), ('Identifier',), ('Single',), ('... |
5 rows × 4 columns
dataframe[dataframe['sequences'].map(lambda x: "('Punctuation',)" in x)].head()
raw_sql | type | parsed_sql | sequences | |
---|---|---|---|---|
2 | create user name identified by pass123 tempora... | malicious | [DDL, Keyword, Identifier, Keyword, Identifier... | [('DDL',), ('Keyword',), ('Identifier',), ('Ke... |
6 | grant connect to name; grant resource to name; | malicious | [Keyword, Keyword, Keyword, Identifier, Punctu... | [('Keyword',), ('Keyword',), ('Keyword',), ('I... |
7 | insert into users(login, password, level) valu... | malicious | [DML, Keyword, Function, Keyword, Punctuation,... | [('DML',), ('Keyword',), ('Function',), ('Keyw... |
12 | \'; desc users; -- | malicious | [Error, Error, Punctuation, Order, Identifier,... | [('Error',), ('Error',), ('Punctuation',), ('O... |
21 | 1' and 1=(select count(*) from tablenames); -- | malicious | [Integer, Error, Keyword, Comparison, Punctuat... | [('Integer',), ('Error',), ('Keyword',), ('Com... |
5 rows × 4 columns
# Generating additional feature dimensions for the machine learning to expand its mind into...
# We're basically building up features to include into our 'feature vector' for ML
import math
from collections import Counter
def entropy(s):
    """Character-level Shannon entropy of the string s, in bits per character."""
    n = float(len(s))
    # Probability of each distinct character in s.
    probs = [count / n for count in Counter(s).values()]
    return -sum(p * math.log(p, 2) for p in probs)
# Feature columns: token count of the parsed statement and character-level
# Shannon entropy of the raw SQL text.
dataframe['length'] = dataframe['parsed_sql'].map(lambda x: len(x))
dataframe['entropy'] = dataframe['raw_sql'].map(lambda x: entropy(x))
dataframe.head()
raw_sql | type | parsed_sql | sequences | length | entropy | |
---|---|---|---|---|---|---|
0 | '; exec master..xp_cmdshell 'ping 10.10.1.2'-- | malicious | [Single, Identifier, Float, Float, Float, Erro... | [('Single',), ('Identifier',), ('Float',), ('F... | 7 | 4.368792 |
1 | create user name identified by 'pass123' | malicious | [DDL, Keyword, Identifier, Keyword, Single] | [('DDL',), ('Keyword',), ('Identifier',), ('Ke... | 5 | 4.037326 |
2 | create user name identified by pass123 tempora... | malicious | [DDL, Keyword, Identifier, Keyword, Identifier... | [('DDL',), ('Keyword',), ('Identifier',), ('Ke... | 11 | 4.028603 |
3 | exec sp_addlogin 'name' , 'password' | malicious | [Keyword, Identifier, IdentifierList] | [('Keyword',), ('Identifier',), ('IdentifierLi... | 3 | 4.030493 |
4 | exec sp_addsrvrolemember 'name' , 'sysadmin' | malicious | [Keyword, Identifier, IdentifierList] | [('Keyword',), ('Identifier',), ('IdentifierLi... | 3 | 4.010013 |
5 rows × 6 columns
# For each SQL statement aggregate the malicious and legit g-test scores as features
import numpy as np
def g_aggregate(sequence, name):
    """Average the g-test score column `name` over every ngram in `sequence`.

    Scores are looked up in the module-level df_stats table built earlier.
    Returns 0 for an empty sequence, or when any ngram is absent from
    df_stats (the KeyError path).

    Fix vs. original: `.ix` is a long-deprecated pandas indexer; `.loc`
    performs the same label-based lookup.
    """
    try:
        g_scores = [df_stats.loc[item, name] for item in sequence]
    except KeyError:
        # NOTE(review): one unseen ngram zeroes the whole statement's score;
        # consider skipping unknown ngrams instead of bailing out.
        return 0
    return sum(g_scores)/len(g_scores) if g_scores else 0 # Average
# Per-statement aggregate g-test scores (average over the statement's ngrams).
dataframe['malicious_g'] = dataframe['sequences'].map(lambda x: g_aggregate(x, 'malicious_g'))
dataframe['legit_g'] = dataframe['sequences'].map(lambda x: g_aggregate(x, 'legit_g'))
dataframe.head()
raw_sql | type | parsed_sql | sequences | length | entropy | malicious_g | legit_g | |
---|---|---|---|---|---|---|---|---|
0 | '; exec master..xp_cmdshell 'ping 10.10.1.2'-- | malicious | [Single, Identifier, Float, Float, Float, Erro... | [('Single',), ('Identifier',), ('Float',), ('F... | 7 | 4.368792 | 449.733570 | -63.831145 |
1 | create user name identified by 'pass123' | malicious | [DDL, Keyword, Identifier, Keyword, Single] | [('DDL',), ('Keyword',), ('Identifier',), ('Ke... | 5 | 4.037326 | -242.191260 | 1210.713063 |
2 | create user name identified by pass123 tempora... | malicious | [DDL, Keyword, Identifier, Keyword, Identifier... | [('DDL',), ('Keyword',), ('Identifier',), ('Ke... | 11 | 4.028603 | -392.742728 | 1489.732587 |
3 | exec sp_addlogin 'name' , 'password' | malicious | [Keyword, Identifier, IdentifierList] | [('Keyword',), ('Identifier',), ('IdentifierLi... | 3 | 4.030493 | -331.875793 | 1069.265013 |
4 | exec sp_addsrvrolemember 'name' , 'sysadmin' | malicious | [Keyword, Identifier, IdentifierList] | [('Keyword',), ('Identifier',), ('IdentifierLi... | 3 | 4.010013 | -331.875793 | 1069.265013 |
5 rows × 8 columns
# Boxplots show you the distribution of the data (spread).
# http://en.wikipedia.org/wiki/Box_plot
# Plot the length and entropy of SQL statements
# Fixme Brian: make these pretty
dataframe.boxplot('length','type')
plt.ylabel('SQL Statement Length')
dataframe.boxplot('entropy','type')
plt.ylabel('SQL Statement Entropy')
<matplotlib.text.Text at 0x6711210>
# Split the classes up so we can set colors, size, labels
fig, ax = plt.subplots(subplot_kw=dict(axisbg='#EEEEE5'))  # NOTE(review): 'axisbg' renamed 'facecolor' in matplotlib >= 2.0
ax.grid(color='grey', linestyle='solid')
cond = dataframe['type'] == 'malicious'
evil = dataframe[cond]
legit = dataframe[~cond]
# Legit points drawn large/blue underneath; injections small/red on top.
plt.scatter(legit['length'], legit['entropy'], s=140, c='#aaaaff', label='Legit', alpha=.7)
plt.scatter(evil['length'], evil['entropy'], s=40, c='r', label='Injections', alpha=.3)
plt.legend()
plt.xlabel('SQL Statement Length')
plt.ylabel('SQL Statement Entropy')
<matplotlib.text.Text at 0x74843d0>
# Split the classes up so we can set colors, size, labels
fig, ax = plt.subplots(subplot_kw=dict(axisbg='#EEEEE5'))  # NOTE(review): 'axisbg' renamed 'facecolor' in matplotlib >= 2.0
ax.grid(color='grey', linestyle='solid')
# Same legit/evil split as the previous cell, now plotted in g-score space.
plt.scatter(legit['malicious_g'], legit['legit_g'], s=140, c='#aaaaff', label='Legit', alpha=.7)
plt.scatter(evil['malicious_g'], evil['legit_g'], s=40, c='r', label='Injections', alpha=.3)
plt.legend()
plt.ylabel('Legit SQL G-Test Score')
plt.xlabel('Malicious SQL G-Test Score')
<matplotlib.text.Text at 0x7490450>
# In preparation for using scikit learn we're just going to use
# some handles that help take us from pandas land to scikit land
# List of feature vectors (scikit learn uses 'X' for the matrix of feature vectors)
X = dataframe.as_matrix(['length', 'entropy','legit_g','malicious_g'])  # NOTE(review): as_matrix removed in modern pandas (use .values)
# Labels (scikit learn uses 'y' for classification labels)
y = np.array(dataframe['type'].tolist()) # Yes, this is weird but it needs
# to be an np.array of strings
# Random Forest is a popular ensemble machine learning classifier.
# http://scikit-learn.org/dev/modules/generated/sklearn.ensemble.RandomForestClassifier.html
#
import sklearn.ensemble
clf = sklearn.ensemble.RandomForestClassifier(n_estimators=20) # Trees in the forest
# Now we can use scikit learn's cross validation to assess predictive performance.
# NOTE(review): sklearn.cross_validation is used below without an explicit
# import -- presumably pulled in transitively by sklearn.ensemble; confirm.
scores = sklearn.cross_validation.cross_val_score(clf, X, y, cv=10, n_jobs=4)
print scores
[ 0.99784173 0.99784173 1. 0.99784173 0.99856115 0.99784017 0.99640029 0.99856012 0.99784017 0.99784017]
# Wow 99% accurate! There is an issue though...
# Recall that we have ~13k 'malicious' SQL statements and
# we only have about 1k 'legit' SQL statements, so we dive
# in a bit and look at the predictive performance more deeply.
# Train on a 80/20 split
from sklearn.cross_validation import train_test_split  # NOTE(review): moved to sklearn.model_selection in modern sklearn
# Splitting dataframe.index alongside X/y lets us map predictions back to rows later.
X_train, X_test, y_train, y_test, index_train, index_test = train_test_split(X, y, dataframe.index, test_size=0.2)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# Now plot the results of the 80/20 split in a confusion matrix
from sklearn.metrics import confusion_matrix
labels = ['legit', 'malicious']
cm = confusion_matrix(y_test, y_pred, labels)  # rows = true class, cols = predicted
def plot_cm(cm, labels):
# Compute percentanges
percent = (cm*100.0)/np.array(np.matrix(cm.sum(axis=1)).T) # Derp, I'm sure there's a better way
print 'Confusion Matrix Stats'
for i, label_i in enumerate(labels):
for j, label_j in enumerate(labels):
print "%s/%s: %.2f%% (%d/%d)" % (label_i, label_j, (percent[i][j]), cm[i][j], cm[i].sum())
# Show confusion matrix
# Thanks kermit666 from stackoverflow :)
fig = plt.figure()
ax = fig.add_subplot(111)
ax.grid(b=False)
cax = ax.matshow(percent, cmap='coolwarm')
plt.title('Confusion matrix of the classifier')
fig.colorbar(cax)
ax.set_xticklabels([''] + labels)
ax.set_yticklabels([''] + labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()
plot_cm(cm, labels)
Confusion Matrix Stats legit/legit: 96.59% (170/176) legit/malicious: 3.41% (6/176) malicious/legit: 0.08% (2/2603) malicious/malicious: 99.92% (2601/2603)
# Compute the prediction probabilities and use them to minimize our false positives
# Note: This is simply a trade off, it means we'll miss a few of the malicious
# ones but typically false alarms are a death blow to any new 'fancy stuff' so
# we definitely want to minimize the false alarms.
y_probs = clf.predict_proba(X_test)[:,1]  # column 1: presumably P(malicious) since classes sort alphabetically -- verify clf.classes_
thres = .9 # This can be set to whatever you'd like
# Re-label using the raised threshold: only very confident rows stay malicious.
y_pred[y_probs<thres] = 'legit'
y_pred[y_probs>=thres] = 'malicious'
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)
Confusion Matrix Stats legit/legit: 98.30% (173/176) legit/malicious: 1.70% (3/176) malicious/legit: 0.38% (10/2603) malicious/malicious: 99.62% (2593/2603)
# We can also look at what features the learning algorithm thought were the most important
# (pair each feature name with the forest's impurity-based importance score).
importances = zip(['length', 'entropy', 'legit_g', 'malicious_g'], clf.feature_importances_)
importances
# From the list below we see our feature importance scores. There's a lot of feature selection,
# sensitivity study, etc stuff that you could do if you wanted at this point.
[('length', 0.034947270077838898), ('entropy', 0.081305411355911961), ('legit_g', 0.63638493679534702), ('malicious_g', 0.247362381770902)]
# Now we're going to just do some post analysis on how the ML algorithm performed.
# Lets look at the legit samples that were misclassified as malicious
test_set = dataframe.ix[index_test]  # NOTE(review): .ix is deprecated; use .loc/.iloc in modern pandas
test_set['pred'] = y_pred  # NOTE(review): assigning into a slice may raise SettingWithCopyWarning
misclassified = test_set[(test_set['type']=='legit') & (test_set['pred']=='malicious')]
misclassified.head()
raw_sql | type | parsed_sql | sequences | length | entropy | malicious_g | legit_g | pred | |
---|---|---|---|---|---|---|---|---|---|
13606 | create table Purchase (pid int primary key, pr... | legit | [DDL, Keyword, Function, Punctuation] | [('DDL',), ('Keyword',), ('Function',), ('Punc... | 4 | 4.400948 | -174.930469 | 513.570637 | malicious |
13605 | create table Product (pid int primary key, pna... | legit | [DDL, Keyword, Function, Punctuation] | [('DDL',), ('Keyword',), ('Function',), ('Punc... | 4 | 4.137866 | -174.930469 | 513.570637 | malicious |
13495 | SELECT dept, number, SUBSTR(title, 1, 12) AS s... | legit | [DML, Identifier, Punctuation, Builtin, Punctu... | [('DML',), ('Identifier',), ('Punctuation',), ... | 8 | 4.699688 | -20.133315 | 353.217738 | malicious |
3 rows × 9 columns
# Discussion for how to use the resulting models.
# Typically Machine Learning comes in two phases
# - Training of the Model
# - Evaluation of new observations against the Model
# This notebook is about exploration of the data and training the model.
# After you have a model that you are satisfied with, just 'pickle' it
# at the end of your training script and then in a separate
# evaluation script 'unpickle' it and evaluate/score new observations
# coming in (through a file, or ZeroMQ, or whatever...)
#
# In this case we'd have to pickle the RandomForest classifier.
# See 'test_it' below for how to use them in evaluation mode.
# test_it shows how to do evaluation, also fun for manual testing below :)
def test_it(sql):
parsed_sql = parse_it(sql)
ngram_list = ngrams(parsed_sql, 3)
malicious_g = g_aggregate(ngram_list, 'malicious_g')
legit_g = g_aggregate(ngram_list, 'legit_g')
_X = [len(parsed_sql), entropy(sql), legit_g, malicious_g]
print '%-40s: %s' % (sql, clf.predict(_X)[0])
# A few manual spot checks of the end-to-end evaluation path.
test_it('select * from employees')
test_it("'; exec master..xp_cmdshell")
test_it("'any 'x'='x'")
test_it('from dorseys mom xp_cmdshell biache')
test_it('select * from your_mom')
select * from employees : legit '; exec master..xp_cmdshell : malicious 'any 'x'='x' : malicious from dorseys mom xp_cmdshell biache : malicious select * from your_mom : legit
The combination of IPython, Pandas and Scikit Learn let us pull in some junky SQL data, clean it up, plot it, understand it and slap it with some machine learning!
Clearly a lot more formality could be used, plotting learning curves, adjusting for overfitting, feature selection, on and on... there are some really great machine learning resources that cover this deeper material. In particular we highly recommend the work and presentations of Olivier Grisel at INRIA Saclay. http://ogrisel.com/