Visualizing Binary Features

This notebook demonstrates how I visualize binary features and the results of clustering them.

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [2]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans

Functions

In [3]:
def is_std_perm(p):
    """Return True if permission name `p` starts with 'android.permission.' or 'com.android.'."""
    std_prefixes = ('android.permission.', 'com.android.')
    return p.startswith(std_prefixes)
In [4]:
def elide(s, l=20):
    """Shorten a filename so it fits as a compact axis label.

    The last four characters (assumed to be the extension, e.g. '.apk') and
    everything from the first underscore onwards (the '_<vercode>' suffix)
    are removed. If the remainder is longer than `l` characters it is elided
    in the middle as '<head>..<tail>', keeping `l // 2` characters per side.

    Parameters:
        s: filename string.
        l: maximum length that is returned without eliding.

    Returns:
        The shortened name.
    """
    s = s[:-4]  # remove extension
    # remove _<vercode>; partition leaves s intact when there is no underscore
    # (find() would return -1 and silently chop the last character)
    s = s.partition('_')[0]
    if len(s) <= l:
        return s
    h = l // 2  # integer division: slice bounds must be ints on Python 3
    # explicit start index so that h == 0 yields an empty tail (s[-0:] is all of s)
    return s[:h] + '..' + s[len(s) - h:]

Preparing the Data

In [5]:
# load the crawled APK table; missing values are replaced with 0
dataset = pd.read_csv('crawled-apks.csv').fillna(0)
In [6]:
# remove some samples: drop a random half of the rows
# a seeded local generator makes the same subset come out on every Restart & Run All
rng = np.random.RandomState(0)
n_removed = int(len(dataset) * 0.5)
dataset = dataset.drop(rng.choice(dataset.index, n_removed, replace=False))
# renumber rows 0..n-1 so the index can be used directly as plot positions
dataset = dataset.reset_index(drop=True)
In [7]:
# remove columns that are entirely zero
# (.loc replaces the removed .ix indexer; the boolean mask selects columns with any non-zero value)
dataset = dataset.loc[:, (dataset != 0).any(axis=0)]
In [8]:
# drop 'p_' columns whose permission name is non-standard; all other columns are kept
kept_columns = [
    col for col in dataset.columns
    if not col.startswith('p_') or is_std_perm(col[2:])
]
dataset = dataset[kept_columns]
In [9]:
# summary statistics of the filename lengths
filename_lengths = dataset['_file'].map(len)
filename_lengths.describe()
Out[9]:
count    1401.000000
mean       33.226981
std         8.747727
min        14.000000
25%        27.000000
50%        32.000000
75%        38.000000
max        72.000000
dtype: float64
In [10]:
# visualize sample data: 50 randomly picked rows, one image row per app
# NOTE: np.random.choice samples with replacement by default, so a row may appear more than once
d = dataset.loc[np.random.choice(dataset.index, 50)]  # .loc replaces the removed .ix indexer
d = d.reset_index(drop=True)

fig, ax = plt.subplots()
fig.set_size_inches(fig.get_size_inches() * (1.5, 1.6))
# label each image row with the shortened filename
plt.yticks(d.index, d['_file'].apply(elide), fontsize='small')
# every column except the filename is drawn as one pixel column
ax.imshow(d[[c for c in d.columns if c != '_file']],
          aspect='auto', cmap=plt.cm.gray_r, interpolation='none')
Out[10]:
<matplotlib.image.AxesImage at 0x15d58bd0>
In [11]:
# visualize data: the whole dataset, one image row per app
fig, ax = plt.subplots()
fig.set_size_inches(fig.get_size_inches() * (2.5, 15))

# label each image row with the shortened filename
ax.set_yticks(dataset.index)
ax.set_yticklabels(dataset['_file'].apply(elide), fontsize='small')

# every column except the filename is drawn as one pixel column
feature_columns = [c for c in dataset.columns if c != '_file']
ax.imshow(dataset[feature_columns],
          aspect='auto', cmap=plt.cm.gray_r, interpolation='none')
Out[11]:
<matplotlib.image.AxesImage at 0x92837d0>
In [12]:
# plot frequency of permissions & features used by apps
# numeric_only=True skips the string column '_file', which cannot be averaged
plt.plot(dataset.mean(numeric_only=True))
Out[12]:
[<matplotlib.lines.Line2D at 0x15d7ef10>]
In [13]:
# per-column frequency, most common first
# (Series.order was removed from pandas; sort_values is its replacement)
dataset.mean(numeric_only=True).sort_values(ascending=False)
Out[13]:
p_android.permission.INTERNET                  0.925054
p_android.permission.ACCESS_NETWORK_STATE      0.850821
p_android.permission.WRITE_EXTERNAL_STORAGE    0.640971
p_android.permission.READ_PHONE_STATE          0.442541
p_android.permission.WAKE_LOCK                 0.344754
p_android.permission.ACCESS_WIFI_STATE         0.326196
p_android.permission.ACCESS_FINE_LOCATION      0.304069
p_android.permission.ACCESS_COARSE_LOCATION    0.292648
p_android.permission.VIBRATE                   0.278373
p_android.permission.GET_ACCOUNTS              0.229836
p_android.permission.RECEIVE_BOOT_COMPLETED    0.171306
p_android.permission.CAMERA                    0.121342
p_com.android.vending.BILLING                  0.118487
p_android.permission.GET_TASKS                 0.114918
p_android.permission.READ_CONTACTS             0.108494
...
p_android.permission.SMARTCARD                    0.000714
p_android.permission.ACCESS_CORSE_LOCATION        0.000714
p_android.permission.NETWORK_STATE                0.000714
p_android.permission.NETWORK                      0.000714
p_android.permission.FLAG_ACTIVITY_NEW_TASK       0.000714
p_android.permission.ACCESS_CHECKIN_PROPERTIES    0.000714
p_android.permission.FLAG_SHOW_WHEN_LOCKED        0.000714
p_android.permission.ACCESS_ALL_DOWNLOADS         0.000714
f_lge.hardware.real3d.barrier.landscape           0.000714
p_android.permission.WRITE_SYNC_STATS             0.000714
f_android.permission.ACCESS_FINE_LOCATION         0.000714
f_android.permission.ACCESS_COARSE_LOCATION       0.000714
p_android.permission.SILENT                       0.000714
f_android.hardware.usb.host                       0.000714
p_android.permission.WRITE_SD_CARD                0.000714
Length: 214, dtype: float64

Clustering

In [14]:
# cluster on every column whose name does not start with '_' (this excludes '_file')
X = dataset[[c for c in dataset.columns if not c.startswith('_')]]
clustering = AgglomerativeClustering(n_clusters=10)
clustering.fit(X)

# attach the cluster label to a copy so `dataset` itself stays unchanged
datasetC = dataset.copy()
# a Series (not a one-column DataFrame) is the idiomatic value for a single column
datasetC['_label'] = pd.Series(clustering.labels_, index=dataset.index)
In [15]:
# order rows by cluster label so each cluster forms a contiguous block,
# then renumber the index so it matches the row position in the plot
# (DataFrame.sort was removed from pandas; sort_values is its replacement)
datasetC = datasetC.sort_values('_label').reset_index(drop=True)
In [16]:
fig, ax = plt.subplots()
fig.set_size_inches(fig.get_size_inches() * (2.5, 10))
# label each image row with the shortened filename
plt.yticks(datasetC.index, datasetC['_file'].apply(elide), fontsize='small')

# visualize clusters
# .items() works on both Python 2 and 3 (iteritems() is Python 2 only)
for label, rows in datasetC.groupby('_label').groups.items():
    r = sorted(rows)
    start, end = r[0], r[-1]  # first and last row index belonging to this cluster

    # separator line & text label
    ax.axhline(end + 0.5, lw=2, color='blue', alpha=0.4)
    # the label is centred vertically between the cluster's first and last row
    ax.text(.4 * len(datasetC.columns), start + .5 * (end - start), '%d' % label,
        fontsize=30, fontweight='bold', va='center', color='blue', alpha=0.3)

# draw only the columns whose name does not start with '_' (excludes '_file' and '_label')
ax.imshow(datasetC[[c for c in datasetC.columns if not c.startswith('_')]],
          aspect='auto', cmap=plt.cm.gray_r, interpolation='none')
Out[16]:
<matplotlib.image.AxesImage at 0x39a117d0>