# Let's start to interact with workbench. Please note there is NO specific client
# for workbench -- just use the ZeroRPC Python, Node.js, or CLI interfaces.
import zerorpc
c = zerorpc.Client()
c.connect("tcp://127.0.0.1:4242")

# I forgot what stuff I can do with workbench
print c.help()
print c.help_basic()

# STEP 1:
# Okay, get the list of commands from workbench
print c.help_commands()

# STEP 2:
# Let's get the information on a specific command, 'store_sample'
print c.help_command('store_sample')

# STEP 3:
# Now let's get information about the dynamically loaded workers (your site may
# have many more!). Next to each worker name is the list of dependencies that
# worker has declared.
print c.help_workers()

# STEP 4:
# Let's get the information about the meta worker
print c.help_worker('meta')

# STEP 5:
# Okay, when we load up a file, we get the md5 back
filename = '../data/pe/bad/0cb9aa6fb9c4aa3afad7a303e21ac0f3'
with open(filename, 'rb') as f:
    my_md5 = c.store_sample(f.read(), filename, 'exe')
print my_md5

# STEP 6:
# Run a worker on my sample
output = c.work_request('meta', my_md5)
output

# Let's see what view_pe does
print c.help_worker('view_pe')

# Okay, let's give it a try
c.work_request('view_pe', my_md5)

# Okay, that worker needed the output of pe_features and pe_indicators,
# so what happened? The worker has a dependency list and workbench
# recursively satisfies that dependency list. This is powerful because
# when we're interested in one particular analysis we just want to get
# the darn thing without having to worry about a bunch of details.
# (A toy sketch of this recursive resolution appears further down,
# right before the big 'view' run.)

# Well, let's do this for a bunch of files!
import os
file_list = [os.path.join('../data/pe/bad', child) for child in os.listdir('../data/pe/bad')]
working_set = []
for filename in file_list:
    with open(filename, 'rb') as f:
        md5 = c.store_sample(f.read(), filename, 'exe')
        results = c.work_request('pe_classifier', md5)
        working_set.append(md5)
        print 'Results: %s' % (results)

# We just ran the classifier on 50 files, and you'll note that we ONLY got back
# the information we asked for. On a large number of files (100k or greater),
# if you don't have a granular system, something this easy WILL NOT BE POSSIBLE!
# (dramatic enough?)

# So let's look at the features going into the classifier
# (btw, the classifier is currently a TOY EXAMPLE)
c.work_request('pe_features', md5)
c.work_request('pe_indicators', md5)

# Another example: I want to look at strings for different types of files
# (not just PE files), so we can load up a few PDFs (the PEs are already
# in the datastore)
file_list = [os.path.join('../data/pdf/bad', child) for child in os.listdir('../data/pdf/bad')]
for filename in file_list:
    with open(filename, 'rb') as f:
        md5 = c.store_sample(f.read(), filename, 'pdf')
        working_set.append(md5)

# Now we run the strings worker on them all
for md5 in working_set:
    result = c.work_request('strings', md5)
    # strings output is large, so just show the first 5
    print 'results: %s' % (result['strings']['string_list'][:5])

# Tag each file with a type based on the directory it lives in
def tag_type(path):
    types = ['bro', 'json', 'log', 'pcap', 'pdf', 'exe', 'swf', 'zip']
    for try_type in types:
        if try_type in os.path.dirname(path):
            return try_type

# This just grabs all the file paths recursively
file_list = []
for p, d, f_list in os.walk('../data'):
    file_list += [os.path.join(p, f) for f in f_list]
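# (Aside) A minimal sketch of the recursive dependency resolution described
# above. This is NOT workbench's actual implementation -- just an illustration
# of the idea that each worker declares its dependencies and a request for one
# worker transitively computes everything it needs first. The resolve() helper
# and the deps mapping are hypothetical.
def resolve(worker_name, dependencies, computed=None):
    """Depth-first: satisfy a worker's dependencies before the worker itself."""
    if computed is None:
        computed = []
    for dep in dependencies.get(worker_name, []):
        resolve(dep, dependencies, computed)
    if worker_name not in computed:
        computed.append(worker_name)
    return computed

# view_pe declared pe_features and pe_indicators as dependencies above
deps = {'view_pe': ['pe_features', 'pe_indicators']}
print resolve('view_pe', deps)  # ['pe_features', 'pe_indicators', 'view_pe']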
# We're going to load in all the files, which include PE files, PCAPs, PDFs,
# and ZIPs, and run 'view' on them.
# Note: This takes a while :)
import pprint
results = []
for filename in file_list:
    with open(filename, 'rb') as f:
        md5 = c.store_sample(f.read(), os.path.basename(filename), tag_type(filename))
        results.append(c.work_request('view', md5))
pprint.pprint(results[:5])

# Okay, so views can either aggregate results from multiple workers or they
# can subset to just what you want (webpage presentation, for instance)
results = c.batch_work_request('view_customer')
print results

# At this granularity it opens up a new world
import pandas as pd
df = pd.DataFrame(results)
df.head(10)

# Let's look at the file submission types broken down by customer
df['count'] = 1
df.groupby(['customer', 'type_tag']).sum()

# Plotting defaults
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['font.size'] = 12.0
plt.rcParams['figure.figsize'] = 18.0, 8.0

# Plot box plots based on customer (PDFs)
df[df['type_tag'] == 'pdf'].boxplot('length', 'customer')
plt.xlabel('Customer')
plt.ylabel('File Size')
plt.title('File Length (PDF) by Customer')
plt.suptitle('')

# Plot box plots based on customer (PEs)
df[df['type_tag'] == 'exe'].boxplot('length', 'customer')
plt.xlabel('Customer')
plt.ylabel('File Size')
plt.title('File Length (PE) by Customer')
plt.suptitle('')

# Okay, now let's do some plots on the file meta-data
results = c.batch_work_request('meta_deep')
df_meta = pd.DataFrame(results)
df_meta.head()

# Plot entropy box plots based on file type
df_meta.boxplot('entropy', 'type_tag')
plt.xlabel('Mime Type')
plt.ylabel('Entropy')

# Plot customer submissions based on file type
group_df = df[['customer', 'type_tag']].copy()
group_df['submissions'] = 1
group_df = group_df.groupby(['customer', 'type_tag']).sum().unstack()
group_df.head()

# Plot submissions as a stacked bar per customer, colored by file type
# (why the heck doesn't matplotlib have better categorical cmaps?)
my_colors = [(x/9.0, .8, 1.0 - x/9.0) for x in range(10)]
group_df['submissions'].plot(kind='bar', stacked=True, color=my_colors)
plt.xlabel('Customer')
plt.ylabel('Submissions')
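# (Aside) The groupby + unstack above could equivalently be written as a pivot
# table; a minimal sketch, not from the original notebook, assuming a pandas
# version with the index/columns keywords and reusing df and my_colors from above:
pivot = df.pivot_table(index='customer', columns='type_tag',
                       values='count', aggfunc='sum', fill_value=0)
pivot.plot(kind='bar', stacked=True, color=my_colors)
plt.xlabel('Customer')
plt.ylabel('Submissions')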