import pandas as pd
print 'pandas version is', pd.__version__
import numpy as np
print 'numpy version is', np.__version__
import sklearn
print 'scikit-learn version is', sklearn.__version__
import matplotlib
print 'matplotlib version is', matplotlib.__version__
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['font.size'] = 18.0
plt.rcParams['figure.figsize'] = 16.0, 5.0

def plot_cm(cm, labels):
    # Compute percentages
    percent = (cm * 100.0) / np.array(np.matrix(cm.sum(axis=1)).T)
    print 'Confusion Matrix Stats'
    for i, label_i in enumerate(labels):
        for j, label_j in enumerate(labels):
            print "%s/%s: %.2f%% (%d/%d)" % (label_i, label_j, percent[i][j], cm[i][j], cm[i].sum())

    # Show confusion matrix
    # Thanks to kermit666 from stackoverflow
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.grid(b=False)
    cax = ax.matshow(percent, cmap='coolwarm', vmin=0, vmax=100)
    plt.title('')
    fig.colorbar(cax)
    ax.set_xticklabels([''] + labels)
    ax.set_yticklabels([''] + labels)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()

def extract_character_info(string):
    # Collect the lengths of consecutive runs of lowercase letters,
    # uppercase letters, and digits in the given string
    lowercase_runs = []
    uppercase_runs = []
    digit_runs = []

    lower = map(str.islower, str(string))
    upper = map(str.isupper, str(string))
    digits = map(str.isdigit, str(string))

    current_length = 0
    current = False
    for l in lower:
        if l:
            current_length += 1
            current = True
        else:
            if current:
                lowercase_runs.append(current_length)
            current_length = 0
            current = False
    if current:
        lowercase_runs.append(current_length)

    current_length = 0
    current = False
    for u in upper:
        if u:
            current_length += 1
            current = True
        else:
            if current:
                uppercase_runs.append(current_length)
            current_length = 0
            current = False
    if current:
        uppercase_runs.append(current_length)

    current_length = 0
    current = False
    for d in digits:
        if d:
            current_length += 1
            current = True
        else:
            if current:
                digit_runs.append(current_length)
            current_length = 0
            current = False
    if current:
        digit_runs.append(current_length)

    return lowercase_runs, uppercase_runs, digit_runs
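# A quick illustrative check of extract_character_info (a hypothetical example
# string, not from the dataset): 'aBB1234CCC' should yield lowercase runs [1],
# uppercase runs [2, 3], and digit runs [4].
print extract_character_info('aBB1234CCC')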
def extract_features(data):
    features = {}
    try:
        features['sha256'] = data['metadata']['sha256']
        features['size'] = data['metadata']['file_size']
        features['entropy'] = data['metadata']['entropy']

        if 'sourcefile' in data['characteristics']['java']:
            features['source file'] = data['characteristics']['java']['sourcefile']
        else:
            features['source file'] = 'No Source File'

        if 'access_permissions' in data['characteristics']['java']:
            features['ap_count'] = len(data['characteristics']['java']['access_permissions'])
            for ap in data['characteristics']['java']['access_permissions']:
                features[str.lower(str(ap).replace(" ", "_"))] = 1

        features['class name'] = data['characteristics']['java']['class_name']
        features['class_name_slash_count'] = features['class name'].count('/')
        features['class_name_length'] = len(features['class name'])

        cn_lowercase_runs, cn_uppercase_runs, cn_digit_runs = extract_character_info(features['class name'])
        cn_lowercase_run_longest = 0
        cn_lowercase_run_average = 0
        cn_uppercase_run_longest = 0
        cn_uppercase_run_average = 0
        cn_digit_run_longest = 0
        cn_digit_run_average = 0
        if cn_lowercase_runs:
            cn_lowercase_run_longest = np.max(cn_lowercase_runs)
            cn_lowercase_run_average = np.mean(cn_lowercase_runs)
        features['class_name_lowercase_run_longest'] = cn_lowercase_run_longest
        features['class_name_lowercase_run_avg'] = cn_lowercase_run_average
        if cn_uppercase_runs:
            cn_uppercase_run_longest = np.max(cn_uppercase_runs)
            cn_uppercase_run_average = np.mean(cn_uppercase_runs)
        features['class_name_uppercase_run_longest'] = cn_uppercase_run_longest
        features['class_name_uppercase_run_avg'] = cn_uppercase_run_average
        if cn_digit_runs:
            cn_digit_run_longest = np.max(cn_digit_runs)
            cn_digit_run_average = np.mean(cn_digit_runs)
        features['class_name_digit_run_longest'] = cn_digit_run_longest
        features['class_name_digit_run_avg'] = cn_digit_run_average

        features['major version'] = data['characteristics']['java']['major_version']
        features['minor version'] = data['characteristics']['java']['minor_version']

        if 'method_names' in data['characteristics']['java']:
            features['method names'] = data['characteristics']['java']['method_names']
        else:
            features['method names'] = []
        features['methods_count'] = len(features['method names'])

        lowercase_run_longest = 0
        lowercase_run_average = 0
        lowercase_runs = []
        uppercase_run_longest = 0
        uppercase_run_average = 0
        uppercase_runs = []
        digit_run_longest = 0
        digit_run_average = 0
        digit_runs = []
        for method in features['method names']:
            lc, uc, d = extract_character_info(method)
            lowercase_runs.extend(lc)
            uppercase_runs.extend(uc)
            digit_runs.extend(d)
        if lowercase_runs:
            lowercase_run_longest = np.max(lowercase_runs)
            lowercase_run_average = np.mean(lowercase_runs)
        features['method_name_lowercase_run_longest'] = lowercase_run_longest
        features['method_name_lowercase_run_avg'] = lowercase_run_average
        if uppercase_runs:
            uppercase_run_longest = np.max(uppercase_runs)
            uppercase_run_average = np.mean(uppercase_runs)
        features['method_name_uppercase_run_longest'] = uppercase_run_longest
        features['method_name_uppercase_run_avg'] = uppercase_run_average
        if digit_runs:
            digit_run_longest = np.max(digit_runs)
            digit_run_average = np.mean(digit_runs)
        features['method_name_digit_run_longest'] = digit_run_longest
        features['method_name_digit_run_avg'] = digit_run_average

        if 'interfaces' in data['characteristics']['java']:
            features['interfaces'] = data['characteristics']['java']['interfaces']
        else:
            features['interfaces'] = []
        features['interface_count'] = len(features['interfaces'])

        features['constant_pool_count'] = data['characteristics']['java']['const_pool_count']
    except KeyError as ke:
        print 'ERROR:', ke, data['metadata']['sha256']
    return features

def load_files(file_list):
    import json
    features_list = []
    for filename in file_list:
        with open(filename, 'rb') as f:
            features = extract_features(json.loads(f.read()))
            features_list.append(features)
    return features_list

# Good files
import glob
good_list = glob.glob('data/clean/*.results')
good_features = load_files(good_list)
print "Files:", len(good_list)

# Bad files
bad_list = glob.glob('data/malicious/*.results')
bad_features = load_files(bad_list)
print "Files:", len(bad_list)

df_good = pd.DataFrame.from_records(good_features)
df_good.fillna(0, inplace=True)
df_good['label'] = 'benign'
df_good.head()

df_bad = pd.DataFrame.from_records(bad_features)
df_bad.fillna(0, inplace=True)
df_bad['label'] = 'malicious'
df_bad.head()

df = pd.concat([df_bad, df_good], ignore_index=True)
df.fillna(0, inplace=True)
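# Optional quick summary before plotting (a small sketch, not part of the
# original analysis): per-label statistics for file size and entropy from the
# combined dataframe built above.
print df.groupby('label')['size'].describe()
print df.groupby('label')['entropy'].describe()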
df.boxplot(column='size', by='label')
plt.ylabel('File Size')
plt.xlabel('')
plt.title('')
plt.suptitle('')

df.boxplot(column='size', by='label')
plt.ylabel('File Size')
plt.xlabel('')
plt.title('')
plt.suptitle('')
plt.ylim(0, 15000)

df.boxplot('entropy', 'label')
plt.ylabel('Entropy')
plt.xlabel('')
plt.title('')
plt.suptitle('')

df.boxplot(column='constant_pool_count', by='label')
plt.ylabel('Constant Pool Count')
plt.xlabel('')
plt.title('')
plt.suptitle('')

df.boxplot(column='constant_pool_count', by='label')
plt.xlabel('')
plt.ylabel('Constant Pool Count')
plt.title('')
plt.suptitle('')
plt.ylim(0, 1000)

df.boxplot(column='methods_count', by='label')
plt.ylabel('Number of Methods')
plt.xlabel('')
plt.title('')
plt.suptitle('')

df.boxplot(column='interface_count', by='label')
plt.ylabel('Number of Interfaces')
plt.xlabel('')
plt.title('')
plt.suptitle('')

my_seed = 1022
my_tsize = .2

import sklearn.ensemble
import sklearn.cross_validation
clf_simple = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
simple_features = ['acc_abstract', 'acc_annotation', 'acc_enum', 'acc_final', 'acc_interface',
                   'acc_public', 'acc_super', 'acc_synthetic', 'ap_count', 'constant_pool_count',
                   'entropy', 'size', 'interface_count', 'major version', 'methods_count', 'minor version']
X = df.as_matrix(simple_features)
y = np.array(df['label'].tolist())
scores = sklearn.cross_validation.cross_val_score(clf_simple, X, y, cv=10)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))

import sklearn.ensemble
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split

# 80/20 Split for predictive test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf_simple.fit(X_train, y_train)
y_pred = clf_simple.predict(X_test)
labels = ['benign', 'malicious']
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)

importances = zip(simple_features, clf_simple.feature_importances_)
importances.sort(key=lambda k: k[1], reverse=True)
for idx, im in enumerate(importances):
    print (str(idx + 1) + ':').ljust(4), im[0].ljust(20), round(im[1], 5)

bad = []
good = []
for strings, label in zip(df['method names'], df['label']):
    for name in strings:
        d = {'method name': name}
        if label == 'malicious' and d not in bad:
            bad.append(d)
        elif label == 'benign' and d not in good:
            good.append(d)
df_method_names_bad = pd.DataFrame.from_records(bad)
df_method_names_good = pd.DataFrame.from_records(good)
df_method_names_bad.head(50)
df_method_names_good.head(50)

df.boxplot('method_name_lowercase_run_longest', 'label')
plt.ylabel('Max length of lower case letters')
plt.xlabel('')
plt.title('')
plt.suptitle('')

df.boxplot('method_name_lowercase_run_avg', 'label')
plt.ylabel('Avg length of lower case letters')
plt.xlabel('')
plt.title('')
plt.suptitle('')

df.boxplot('method_name_uppercase_run_longest', 'label')
plt.ylabel('Max length of upper case letters')
plt.xlabel('')
plt.title('')
plt.suptitle('')

df.boxplot('method_name_uppercase_run_avg', 'label')
plt.ylabel('Avg length of upper case letters')
plt.xlabel('')
plt.title('')
plt.suptitle('')

df.boxplot('method_name_digit_run_longest', 'label')
plt.ylabel('Max length of digits')
plt.xlabel('')
plt.title('')
plt.suptitle('')

df.boxplot('method_name_digit_run_avg', 'label')
plt.ylabel('Avg length of digits')
plt.xlabel('')
plt.title('')
plt.suptitle('')

import sklearn.ensemble
clf_methods = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
method_name_features = ['acc_abstract', 'acc_annotation', 'acc_enum', 'acc_final', 'acc_interface',
                        'acc_public', 'acc_super', 'acc_synthetic', 'ap_count', 'constant_pool_count',
                        'entropy', 'size', 'interface_count', 'major version', 'methods_count', 'minor version',
                        'method_name_digit_run_avg', 'method_name_digit_run_longest',
                        'method_name_lowercase_run_avg', 'method_name_lowercase_run_longest',
                        'method_name_uppercase_run_avg', 'method_name_uppercase_run_longest']
X = df.as_matrix(method_name_features)
y = np.array(df['label'].tolist())
scores = sklearn.cross_validation.cross_val_score(clf_methods, X, y, cv=10)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
import sklearn.ensemble
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split

# 80/20 Split for predictive test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf_methods.fit(X_train, y_train)
y_pred = clf_methods.predict(X_test)
labels = ['benign', 'malicious']
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)

for idx, gcn in enumerate(df_good['class name']):
    print gcn
    if idx == 19:
        break

for idx, gcn in enumerate(df_bad['class name']):
    print gcn
    if idx == 19:
        break

df.boxplot('class_name_length', 'label')
plt.ylabel('Class Name Length')
plt.xlabel('')
plt.title('')
plt.suptitle('')

df.boxplot('class_name_slash_count', 'label')
plt.ylabel('Class Name Slash Count')
plt.xlabel('')
plt.title('')
plt.suptitle('')

df.boxplot('class_name_lowercase_run_longest', 'label')
plt.ylabel('Max Run of Lower Case Letters')
plt.xlabel('')
plt.title('')
plt.suptitle('')

df.boxplot('class_name_lowercase_run_avg', 'label')
plt.ylabel('Avg Run of Lower Case Letters')
plt.xlabel('')
plt.title('')
plt.suptitle('')

import sklearn.ensemble
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split

clf_all = sklearn.ensemble.RandomForestClassifier(n_estimators=75)
all_features = ['acc_abstract', 'acc_annotation', 'acc_enum', 'acc_final', 'acc_interface', 'acc_public',
                'acc_super', 'acc_synthetic', 'ap_count', 'constant_pool_count', 'entropy', 'interface_count',
                'major version', 'methods_count', 'size', 'minor version',
                'method_name_digit_run_avg', 'method_name_digit_run_longest',
                'method_name_lowercase_run_avg', 'method_name_lowercase_run_longest',
                'method_name_uppercase_run_avg', 'method_name_uppercase_run_longest',
                'class_name_digit_run_avg', 'class_name_digit_run_longest', 'class_name_length',
                'class_name_lowercase_run_avg', 'class_name_lowercase_run_longest',
                'class_name_slash_count', 'class_name_uppercase_run_avg', 'class_name_uppercase_run_longest']
X = df.as_matrix(all_features)
y = np.array(df['label'].tolist())
labels = ['benign', 'malicious']
scores = sklearn.cross_validation.cross_val_score(clf_all, X, y, cv=10)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))

import sklearn.ensemble
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split

# 80/20 Split for predictive test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf_all.fit(X_train, y_train)
y_pred = clf_all.predict(X_test)
labels = ['benign', 'malicious']
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)

y_probs = clf_all.predict_proba(X_test)[:, 1]
thres = .80  # This can be set to whatever you'd like
y_pred[y_probs > thres] = 'malicious'
y_pred[y_probs <= thres] = 'benign'
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)

# We do the same, but set the threshold lower, to only 20%
y_probs = clf_all.predict_proba(X_test)[:, 1]
thres = .20  # This can be set to whatever you'd like
y_pred[y_probs > thres] = 'malicious'
y_pred[y_probs <= thres] = 'benign'
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)

scores = clf_all.predict_proba(X_test)[:, 1]
plt.hist(scores, bins=20)
plt.grid(True)
plt.show()

importances = zip(all_features, clf_all.feature_importances_)
importances.sort(key=lambda k: k[1], reverse=True)
total = 0
for idx, im in enumerate(importances):
    total += round(im[1], 5)
    print (str(idx + 1) + ':').ljust(4), im[0].ljust(35), round(im[1], 5), total
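# The two fixed thresholds above are just two points on a curve; a ROC curve
# shows the same false-positive/true-positive trade-off across all thresholds.
# This is a minimal optional sketch (not part of the original analysis) and
# assumes clf_all, X_test, and y_test from the 80/20 split above are in scope.
from sklearn.metrics import roc_curve, auc
probs = clf_all.predict_proba(X_test)[:, 1]  # P(malicious) for each test sample
fpr, tpr, thresholds = roc_curve(y_test, probs, pos_label='malicious')
plt.plot(fpr, tpr, label='AUC = %0.3f' % auc(fpr, tpr))
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.show()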
clf_er = sklearn.ensemble.ExtraTreesClassifier(n_estimators=50)
X_er = df.as_matrix(all_features)
y_er = np.array(df['label'].tolist())
labels = ['benign', 'malicious']
scores = sklearn.cross_validation.cross_val_score(clf_er, X_er, y_er, cv=10)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))

# 80/20 Split for predictive test
X_train, X_test, y_train, y_test = train_test_split(X_er, y_er, test_size=my_tsize, random_state=my_seed)
clf_er.fit(X_train, y_train)
y_pred = clf_er.predict(X_test)
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)

import sklearn.svm
import sklearn.preprocessing
clf_svc = sklearn.svm.SVC()
X_svc = df.as_matrix(all_features)
X_svc = sklearn.preprocessing.scale(X_svc)
y_svc = np.array(df['label'].tolist())
labels = ['benign', 'malicious']
scores = sklearn.cross_validation.cross_val_score(clf_svc, X_svc, y_svc, cv=10)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))

# 80/20 Split for predictive test
X_train, X_test, y_train, y_test = train_test_split(X_svc, y_svc, test_size=my_tsize, random_state=my_seed)
clf_svc.fit(X_train, y_train)
y_pred = clf_svc.predict(X_test)
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)

# Now we can use scikit-learn's cross validation to assess predictive performance.
X_all = df.as_matrix(all_features)
y_all = np.array(df['label'].tolist())
scores = sklearn.cross_validation.cross_val_score(clf_all, X_all, y_all, cv=20)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))

# Now we can use scikit-learn's cross validation to assess predictive performance.
scores = sklearn.cross_validation.cross_val_score(clf_er, X_er, y_er, cv=20)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))

clf_everything = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
all_features = ['acc_abstract', 'acc_annotation', 'acc_enum', 'acc_final', 'acc_interface', 'acc_public',
                'acc_super', 'acc_synthetic', 'ap_count', 'constant_pool_count', 'entropy', 'size',
                'interface_count', 'major version', 'methods_count', 'minor version',
                'method_name_digit_run_avg', 'method_name_digit_run_longest',
                'method_name_lowercase_run_avg', 'method_name_lowercase_run_longest',
                'method_name_uppercase_run_avg', 'method_name_uppercase_run_longest',
                'class_name_digit_run_avg', 'class_name_digit_run_longest', 'class_name_length',
                'class_name_lowercase_run_avg', 'class_name_lowercase_run_longest',
                'class_name_slash_count', 'class_name_uppercase_run_avg', 'class_name_uppercase_run_longest']
X_all = df.as_matrix(all_features)
y_all = np.array(df['label'].tolist())
clf_everything.fit(X_all, y_all)

java_big_pile_df = pd.read_hdf('data/java_clean_df.hd5', 'table')

clean = 0
gray = 0
bad = 0
for x in java_big_pile_df.as_matrix(all_features):
    try:
        score = clf_everything.predict_proba(x)[:, 1][0]
        if score < 0.5:
            clean += 1
        elif score < 0.8:
            gray += 1
        else:
            bad += 1
    except:
        print "Sad"
        print x
        break
print java_big_pile_df.shape
print clean
print gray
print bad

java_more_bad_df = pd.read_hdf('data/java_malicious_df.hd5', 'table')
java_big_pile_df.head()
java_big_pile_df['class_name_length'].describe()

java_random_df = java_big_pile_df.reindex(np.random.permutation(java_big_pile_df.index))
java_random_2k_df = java_random_df[0:2000]
java_random_the_rest_df = java_random_df[2000:]
java_random_2k_df['label'] = 'benign'
java_more_bad_df['label'] = 'malicious'
java_4k_df = pd.concat([java_more_bad_df, java_random_2k_df], ignore_index=True)
java_4k_df.fillna(0, inplace=True)
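# Optional sanity check (a small sketch, not in the original notebook): how
# many samples of each label ended up in the combined java_4k_df frame before
# retraining on it.
print java_4k_df['label'].value_counts()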
clf_4k = sklearn.ensemble.RandomForestClassifier(n_estimators=75)
all_features = ['acc_abstract', 'acc_annotation', 'acc_enum', 'acc_final', 'acc_interface', 'acc_public',
                'acc_super', 'acc_synthetic', 'ap_count',
                'class_name_digit_run_avg', 'class_name_digit_run_longest', 'class_name_length',
                'class_name_lowercase_run_avg', 'class_name_lowercase_run_longest',
                'class_name_slash_count', 'class_name_uppercase_run_avg', 'class_name_uppercase_run_longest',
                'constant_pool_count', 'entropy', 'interface_count', 'major version',
                'method_name_digit_run_avg', 'method_name_digit_run_longest',
                'method_name_lowercase_run_avg', 'method_name_lowercase_run_longest',
                'method_name_uppercase_run_avg', 'method_name_uppercase_run_longest',
                'methods_count', 'minor version', 'size']
X = java_4k_df.as_matrix(all_features)
y = np.array(java_4k_df['label'].tolist())
scores = sklearn.cross_validation.cross_val_score(clf_4k, X, y, cv=10)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))

import sklearn.ensemble
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split

# 80/20 Split for predictive test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf_4k.fit(X_train, y_train)
y_pred = clf_4k.predict(X_test)
labels = ['benign', 'malicious']
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)

# Feature Selection
# Which features best differentiated the two classes?
# Here we're going to grab the feature_importances from the classifier itself.
importances = zip(all_features, clf_4k.feature_importances_)
importances.sort(key=lambda k: k[1], reverse=True)
total = 0
for idx, im in enumerate(importances):
    total += round(im[1], 5)
    print (str(idx + 1) + ':').ljust(4), im[0].ljust(35), round(im[1], 5), total

clf_everything_4k = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
X_all = java_4k_df.as_matrix(all_features)
y_all = np.array(java_4k_df['label'].tolist())
clf_everything_4k.fit(X_all, y_all)

clean = 0
gray = 0
bad = 0
X_rest = java_random_the_rest_df.as_matrix(all_features)
for x in X_rest:
    score = clf_everything_4k.predict_proba(x)[:, 1][0]
    if score < 0.5:
        clean += 1
    elif score < 0.8:
        gray += 1
    else:
        bad += 1
print java_random_the_rest_df.shape[0]
print clean
print gray
print bad
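# The bucketing loop above scores one row at a time; the same counts can be
# produced with a single vectorized call. A minimal sketch, assuming
# clf_everything_4k and X_rest as defined above.
rest_scores = clf_everything_4k.predict_proba(X_rest)[:, 1]
print 'clean:', (rest_scores < 0.5).sum()
print 'gray: ', ((rest_scores >= 0.5) & (rest_scores < 0.8)).sum()
print 'bad:  ', (rest_scores >= 0.8).sum()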