# Let's start to interact with workbench. Please note there is NO specific client
# for workbench -- just use the ZeroRPC Python, Node.js, or CLI interfaces.
import zerorpc
c = zerorpc.Client()
c.connect("tcp://127.0.0.1:4242")

# I forgot what stuff I can do with workbench
print c.help()
print c.help_basic()

# STEP 1:
# Okay, get the list of commands from workbench
print c.help_commands()

# STEP 2:
# Let's get the information on a specific command, 'store_sample'
print c.help_command('store_sample')

# STEP 3:
# Now let's get information about the dynamically loaded workers (your site may
# have many more!). Next to each worker name is the list of dependencies that
# worker has declared.
print c.help_workers()

# STEP 4:
# Let's get the information about the meta worker
print c.help_worker('meta')

# STEP 5:
# Okay, when we load up a file, we get the md5 back
filename = '../data/pe/bad/0cb9aa6fb9c4aa3afad7a303e21ac0f3'
with open(filename, 'rb') as f:
    my_md5 = c.store_sample(f.read(), filename, 'exe')
print my_md5

# STEP 6:
# Run a worker on my sample
output = c.work_request('meta', my_md5)
output

# Let's see what view_pe does
print c.help_worker('view_pe')

# Okay, let's give it a try
c.work_request('view_pe', my_md5)

# Okay, that worker needed the output of pe_features and pe_indicators,
# so what happened? The worker has a dependency list and workbench
# recursively satisfies that dependency list. This is powerful because
# when we're interested in one particular analysis we just want to get
# the darn thing without having to worry about a bunch of details.
# (A toy sketch of this recursive resolution appears further down,
# right before the big 'view' run.)

# Well, let's do this for a bunch of files!
import os
file_list = [os.path.join('../data/pe/bad', child) for child in os.listdir('../data/pe/bad')]
working_set = []
for filename in file_list:
    with open(filename, 'rb') as f:
        md5 = c.store_sample(f.read(), filename, 'exe')
        results = c.work_request('pe_classifier', md5)
        working_set.append(md5)
        print 'Results: %s' % (results)

# We just ran the classifier on 50 files, and you'll note that we ONLY got back
# the information we asked for. On a large number of files (100k or greater),
# if you don't have a granular system, something this easy WILL NOT BE POSSIBLE!
# (dramatic enough?)

# So let's look at the features going into the classifier
# (btw, the classifier is currently a TOY EXAMPLE)
c.work_request('pe_features', md5)
c.work_request('pe_indicators', md5)

# Another example: I want to look at strings for different types of files
# (not just PE files), so we can load up a few PDFs (the PEs are already
# in the datastore)
file_list = [os.path.join('../data/pdf/bad', child) for child in os.listdir('../data/pdf/bad')]
for filename in file_list:
    with open(filename, 'rb') as f:
        md5 = c.store_sample(f.read(), filename, 'pdf')
        working_set.append(md5)

# Now we run the strings worker on them all
for md5 in working_set:
    result = c.work_request('strings', md5)
    # strings output is large, so just show the first 5
    print 'results: %s' % (result['strings']['string_list'][:5])

# Tag each file with a type based on the directory it lives in
def tag_type(path):
    types = ['bro', 'json', 'log', 'pcap', 'pdf', 'exe', 'swf', 'zip']
    for try_type in types:
        if try_type in os.path.dirname(path):
            return try_type

# This just grabs all the file paths recursively
file_list = []
for p, d, f_list in os.walk('../data'):
    file_list += [os.path.join(p, f) for f in f_list]
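# (Aside) A minimal sketch of the recursive dependency resolution described
# above. This is NOT workbench's actual implementation -- just an illustration
# of the idea that each worker declares its dependencies and a request for one
# worker transitively computes everything it needs first. The resolve() helper
# and the deps mapping are hypothetical.
def resolve(worker_name, dependencies, computed=None):
    """Depth-first: satisfy a worker's dependencies before the worker itself."""
    if computed is None:
        computed = []
    for dep in dependencies.get(worker_name, []):
        resolve(dep, dependencies, computed)
    if worker_name not in computed:
        computed.append(worker_name)
    return computed

# view_pe declared pe_features and pe_indicators as dependencies above
deps = {'view_pe': ['pe_features', 'pe_indicators']}
print resolve('view_pe', deps)  # ['pe_features', 'pe_indicators', 'view_pe']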
# We're going to load in all the files, which include PE files, PCAPs, PDFs,
# and ZIPs, and run 'view' on them.
# Note: This takes a while :)
import pprint
results = []
for filename in file_list:
    with open(filename, 'rb') as f:
        md5 = c.store_sample(f.read(), os.path.basename(filename), tag_type(filename))
        results.append(c.work_request('view', md5))
pprint.pprint(results[:5])

# Okay, so views can either aggregate results from multiple workers or they
# can subset to just what you want (webpage presentation, for instance)
results = c.batch_work_request('view_customer')
print results

# At this granularity it opens up a new world
import pandas as pd
df = pd.DataFrame(results)
df.head(10)

# Let's look at the file submission types broken down by customer
df['count'] = 1
df.groupby(['customer', 'type_tag']).sum()

# Plotting defaults
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['font.size'] = 12.0
plt.rcParams['figure.figsize'] = 18.0, 8.0

# Plot box plots based on customer (PDFs)
df[df['type_tag'] == 'pdf'].boxplot('length', 'customer')
plt.xlabel('Customer')
plt.ylabel('File Size')
plt.title('File Length (PDF) by Customer')
plt.suptitle('')

# Plot box plots based on customer (PEs)
df[df['type_tag'] == 'exe'].boxplot('length', 'customer')
plt.xlabel('Customer')
plt.ylabel('File Size')
plt.title('File Length (PE) by Customer')
plt.suptitle('')

# Okay, now let's do some plots on the file meta-data
results = c.batch_work_request('meta_deep')
df_meta = pd.DataFrame(results)
df_meta.head()

# Plot entropy box plots based on file type
df_meta.boxplot('entropy', 'type_tag')
plt.xlabel('Mime Type')
plt.ylabel('Entropy')

# Plot customer submissions based on file type
group_df = df[['customer', 'type_tag']].copy()
group_df['submissions'] = 1
group_df = group_df.groupby(['customer', 'type_tag']).sum().unstack()
group_df.head()

# Plot submissions as a stacked bar per customer, colored by file type
# (why the heck doesn't matplotlib have better categorical cmaps?)
my_colors = [(x/9.0, .8, 1.0 - x/9.0) for x in range(10)]
group_df['submissions'].plot(kind='bar', stacked=True, color=my_colors)
plt.xlabel('Customer')
plt.ylabel('Submissions')
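# (Aside) The groupby + unstack above could equivalently be written as a pivot
# table; a minimal sketch, not from the original notebook, assuming a pandas
# version with the index/columns keywords and reusing df and my_colors from above:
pivot = df.pivot_table(index='customer', columns='type_tag',
                       values='count', aggfunc='sum', fill_value=0)
pivot.plot(kind='bar', stacked=True, color=my_colors)
plt.xlabel('Customer')
plt.ylabel('Submissions')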