In this notebook we will pull Bro data into Spark and then do some analysis and clustering. The first step is to convert your Bro log data into a Parquet file. For instructions on how to do this (just a few lines of Python code using the BAT package), please see this notebook:
Apache Parquet is a columnar storage format focused on performance. Parquet data is often used within the Hadoop ecosystem and we will specifically be using it for loading data into Spark.
# Third Party Imports
import pyspark
from pyspark.sql import SparkSession
import pyarrow
# Local imports
import bat
from bat.log_to_parquet import log_to_parquet
# Record the library versions this notebook was run with (handy when
# comparing results across environments)
for label, module in (('BAT', bat), ('PySpark', pyspark), ('PyArrow', pyarrow)):
    print('{:s}: {:s}'.format(label, module.__version__))
BAT: 0.2.9 PySpark: 2.2.0 PyArrow: 0.6.0
Here we're spinning up a local Spark server with 4 parallel executors. Although this might seem a bit silly since we're probably running this on a laptop, there are a couple of important observations:
# Start (or reuse) a local Spark session; "local[4]" runs Spark in-process
# with 4 worker threads
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName('my_awesome')
    .getOrCreate()
)

# Load the Parquet file (produced from the Bro DNS log) as a Spark DataFrame
spark_df = spark.read.parquet("dns.parquet")

# Gather basic facts about the DataFrame, then report them
num_rows = spark_df.count()
columns = spark_df.columns
print("Number of Rows: {:d}".format(num_rows))
print("Columns: {:s}".format(','.join(columns)))
Number of Rows: 427935 Columns: AA,RA,RD,TC,TTLs,Z,answers,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,qclass,qclass_name,qtype,qtype_name,query,rcode,rcode_name,rejected,trans_id,uid,ts
# Count rows per (query type, protocol) pair, most frequent first
(
    spark_df
    .groupby('qtype_name', 'proto')
    .count()
    .sort('count', ascending=False)
    .show()
)
+----------+-----+------+ |qtype_name|proto| count| +----------+-----+------+ | A| udp|212473| | NB| udp| 77199| | AAAA| udp| 54519| | PTR| udp| 52991| | TXT| udp| 12644| | SRV| udp| 12268| | -| udp| 3472| | *| udp| 882| | AXFR| tcp| 440| | SOA| udp| 346| | TXT| tcp| 226| | -| tcp| 176| | MX| udp| 169| | NS| udp| 43| | HINFO| udp| 30| | NAPTR| udp| 27| | PTR| tcp| 26| | A| tcp| 4| +----------+-----+------+
Spark has a powerful SQL engine as well as a Machine Learning library. Now that we've loaded our Bro data, we're going to use Spark SQL commands to investigate it, including clustering with MLlib.
# Derive string-length features from the DNS query and answers columns
from pyspark.sql.functions import col, length

# Replace spark_df with a DataFrame that carries the two new columns
spark_df = (
    spark_df
    .withColumn('query_length', length(col('query')))
    .withColumn('answer_length', length(col('answers')))
)
# Plotting defaults
# IPython magic (not plain Python): render matplotlib figures inline in the
# notebook output instead of opening a separate window
%matplotlib inline
import matplotlib.pyplot as plt
from bat.utils import plot_utils
# NOTE(review): presumably applies BAT's global matplotlib settings (figure
# size, fonts, ...) -- confirm against bat.utils.plot_utils
plot_utils.plot_defaults()
# Histogram of DNS query lengths. The binning runs inside Spark: with an
# integer argument, RDD.histogram builds that many evenly spaced buckets and
# returns (bucket boundaries, per-bucket counts).
query_lengths = spark_df.select('query_length').rdd.flatMap(lambda row: row)
bins, counts = query_lengths.histogram(50)

# The data is already binned, so feed matplotlib one sample per bucket (its
# left edge) weighted by that bucket's count; bins has one more entry than
# counts, hence the [:-1].
plt.hist(bins[:-1], weights=counts, bins=bins, log=True)
plt.xlabel('DNS Query Lengths')
plt.grid(True)
plt.ylabel('Counts')
<matplotlib.text.Text at 0x10b05e630>
# Histogram of DNS answer lengths. The binning runs inside Spark: with an
# integer argument, RDD.histogram builds that many evenly spaced buckets and
# returns (bucket boundaries, per-bucket counts).
answer_lengths = spark_df.select('answer_length').rdd.flatMap(lambda row: row)
bins, counts = answer_lengths.histogram(50)

# The data is already binned, so feed matplotlib one sample per bucket (its
# left edge) weighted by that bucket's count; bins has one more entry than
# counts, hence the [:-1].
plt.hist(bins[:-1], weights=counts, bins=bins, log=True)
plt.xlabel('DNS Answer Lengths')
plt.grid(True)
plt.ylabel('Counts')
<matplotlib.text.Text at 0x104a08860>
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

# Build a feature-engineering pipeline that turns each DNS row into a single
# numeric vector in a new 'features' column.

# Categorical columns: each is mapped to a numeric index (StringIndexer) and
# then expanded into a one-hot vector (OneHotEncoder).
categoricalColumns = ['qtype_name', 'proto']
stages = []
for categoricalCol in categoricalColumns:
    stringIndexer = StringIndexer(inputCol=categoricalCol,
                                  outputCol=categoricalCol + "Index")
    encoder = OneHotEncoder(inputCol=categoricalCol + "Index",
                            outputCol=categoricalCol + "classVec")
    stages += [stringIndexer, encoder]

# These columns are fed to the assembler as-is, after the one-hot vectors
numericCols = ['query_length', 'answer_length', 'Z', 'rejected']
assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

# Fit the pipeline on the data, then replace spark_df with the transformed
# DataFrame (original columns plus the intermediate and 'features' columns)
pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(spark_df)
spark_df = pipelineModel.transform(spark_df)
spark_df.select('features').show()
+--------------------+ | features| +--------------------+ |(18,[5,13,14,15,1...| |(18,[1,13,14,15,1...| |(18,[1,13,14,15,1...| |(18,[1,13,14,15,1...| |(18,[1,13,14,15,1...| |(18,[1,13,14,15,1...| |(18,[1,13,14,15,1...| |(18,[1,13,14,15,1...| |(18,[1,13,14,15,1...| |(18,[1,13,14,15,1...| |(18,[5,13,14,15],...| |(18,[5,13,14,15],...| |(18,[5,13,14,15],...| |(18,[3,13,14,15],...| |(18,[3,13,14,15],...| |(18,[5,13,14,15,1...| |(18,[1,13,14,15,1...| |(18,[1,13,14,15,1...| |(18,[1,13,14,15,1...| |(18,[2,13,14,15],...| +--------------------+ only showing top 20 rows
from pyspark.ml.clustering import KMeans

# Train a k-means model with 70 clusters. No features column is specified, so
# KMeans uses its default, 'features'.
kmeans = KMeans().setK(70)
model = kmeans.fit(spark_df)

# Evaluate clustering by computing Within Set Sum of Squared Errors.
wssse = model.computeCost(spark_df)
print("Within Set Sum of Squared Errors = " + str(wssse))

# Show the result: one center vector per cluster
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)
Within Set Sum of Squared Errors = 120733.85472213484 Cluster Centers: [ 9.50906344e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 4.74751834e-02 0.00000000e+00 0.00000000e+00 9.71083297e-04 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 6.47388865e-04 9.97410445e-01 1.14872680e+01 1.00000000e+00 0.00000000e+00 9.17134225e-03] [ 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00 2.72002232e+01 1.00360999e+00 0.00000000e+00 1.60808638e-03] [ 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 10. 687. 0. 0.] [ 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 9.63855422e-02 0.00000000e+00 9.03614458e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 9.13253012e+00 3.47000000e+02 0.00000000e+00 0.00000000e+00] [ 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 21.65517241 137.27586207 0. 0. 
] [ 7.35499488e-03 0.00000000e+00 7.54119728e-03 0.00000000e+00 1.86202402e-04 9.70021413e-01 0.00000000e+00 1.48961922e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00 5.69467461e+01 1.00000000e+00 3.52108742e-01 0.00000000e+00] [ 0.00000000e+00 0.00000000e+00 9.21781975e-01 7.82180250e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00 2.05908438e+01 1.00000000e+00 8.21186615e-04 4.10593307e-04] [ 9.97151713e-01 0.00000000e+00 0.00000000e+00 2.26187484e-03 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 5.02638854e-04 0.00000000e+00 8.37731423e-05 9.99664907e-01 1.50000000e+01 1.00000000e+00 6.70185139e-04 3.09960627e-03] [ 8.24082785e-01 0.00000000e+00 1.48871119e-01 2.42238946e-02 2.82220132e-03 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00 2.30000000e+01 1.00000000e+00 0.00000000e+00 2.35183443e-03] [ 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.97252747 1. 47.64285714 0. 0. ] [ 8.47867380e-02 0.00000000e+00 8.63408738e-03 8.28699706e-01 7.68433777e-02 1.03609049e-03 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00 3.97349335e+01 1.00000000e+00 0.00000000e+00 9.82559143e-02] [ 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 4.20721412 1. 0.99945181 0. ] [ 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 10. 412. 0. 0.] 
[ 8.98322492e-01 0.00000000e+00 8.40465594e-02 5.13522766e-03 3.59465936e-03 0.00000000e+00 0.00000000e+00 8.90106128e-03 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00 3.31299213e+01 1.00000000e+00 0.00000000e+00 0.00000000e+00] [ 9.70394737e-01 0.00000000e+00 0.00000000e+00 6.57894737e-03 2.30263158e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 9.80263158e-01 1.40986842e+01 1.19210526e+01 9.86842105e-03 0.00000000e+00] [ 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.95454545 1. 17.59848485 0. 0. ] [ 9.84349541e-01 0.00000000e+00 1.56075808e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 4.28779693e-05 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00 2.50398551e+01 1.00000000e+00 0.00000000e+00 8.57559386e-05] [ 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 8. 1. 0.99988369 0. ] [ 3.88098318e-03 0.00000000e+00 0.00000000e+00 8.58990944e-01 1.81112549e-02 1.29366106e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.06080207e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00 7.21966365e+01 1.07761966e+00 0.00000000e+00 3.88098318e-02] [ 9.95342475e-01 0.00000000e+00 0.00000000e+00 3.25701072e-05 0.00000000e+00 0.00000000e+00 0.00000000e+00 4.62495522e-03 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00 1.60000000e+01 1.00000000e+00 3.25701072e-05 1.46565482e-03] [ 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 14.65577023 1. 0.97712794 0. ] [ 0. 0. 0. 0.36548223 0.63451777 0. 0. 0. 0. 0. 0. 0. 0. 0.53807107 16.69035533 32.99492386 0. 0. ] [ 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 5.93881886 1. 0. 0. 
] [ 0.00000000e+00 0.00000000e+00 0.00000000e+00 7.59493671e-02 0.00000000e+00 0.00000000e+00 6.32911392e-02 0.00000000e+00 8.60759494e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.39240506e-01 1.03924051e+01 1.13240506e+02 0.00000000e+00 0.00000000e+00] [ 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 9.98357964e-01 0.00000000e+00 0.00000000e+00 1.06249396e-03 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 5.79542162e-04 1.00000000e+00 1.39970057e+01 1.00000000e+00 2.99430117e-03 3.86361441e-03] [ 9.20485175e-01 0.00000000e+00 3.09973046e-02 3.03234501e-03 0.00000000e+00 4.54851752e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00 2.98662399e+01 1.00000000e+00 0.00000000e+00 0.00000000e+00] [ 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 10. 519.33333333 0. 0. ] [ 9.93240447e-01 0.00000000e+00 0.00000000e+00 2.06925093e-03 0.00000000e+00 8.27700372e-04 0.00000000e+00 0.00000000e+00 0.00000000e+00 3.86260174e-03 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00 2.00000000e+01 1.00000000e+00 0.00000000e+00 3.44875155e-03] [ 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 10. 277. 0. 0.] [ 7.62580244e-01 0.00000000e+00 1.17829778e-01 1.15862497e-01 1.03541106e-04 0.00000000e+00 0.00000000e+00 6.21246635e-04 0.00000000e+00 3.00269207e-03 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00 2.40000000e+01 1.00000000e+00 0.00000000e+00 1.65665769e-03] [ 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 10. 440. 0. 0.] 
[ 0.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00 1.60000000e+01 1.00000000e+00 0.00000000e+00 6.07902736e-03] [ 8.65995032e-01 0.00000000e+00 1.08059619e-01 0.00000000e+00 2.89815070e-03 1.38007176e-04 0.00000000e+00 1.91829975e-02 3.31217223e-03 0.00000000e+00 0.00000000e+00 4.14021529e-04 0.00000000e+00 9.96687828e-01 9.67913331e+00 1.00000000e+00 0.00000000e+00 5.79630141e-03] [ 1.22048223e-01 0.00000000e+00 4.77255779e-02 6.94258016e-01 1.32239622e-01 0.00000000e+00 0.00000000e+00 3.72856078e-03 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00 4.25021129e+01 1.00000000e+00 0.00000000e+00 7.05940840e-02] [ 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 7. 1. 1. 0.] [ 1.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00 1.40000000e+01 1.00000000e+00 0.00000000e+00 5.74349549e-05] [ 2.35294118e-01 0.00000000e+00 1.61764706e-02 2.10294118e-01 8.08823529e-03 5.30147059e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00 3.71735294e+01 1.03088235e+00 0.00000000e+00 0.00000000e+00] [ 9.99643589e-01 0.00000000e+00 0.00000000e+00 2.67308206e-04 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00 1.90000000e+01 1.00000000e+00 0.00000000e+00 2.49487659e-03] [ 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 6. 1. 0.99921034 0. 
] [ 8.58277625e-01 1.41486433e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 2.35941801e-04 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00 1.30000000e+01 1.00000000e+00 1.40464019e-01 1.73023987e-03] [ 0.00000000e+00 0.00000000e+00 9.61997828e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 3.33876221e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 4.61454940e-03 1.00000000e+00 1.06636808e+01 1.00000000e+00 0.00000000e+00 7.32899023e-03] [ 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 9. 1. 1. 0.] [ 9.92063492e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 7.93650794e-03 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 9.92063492e-01 9.04761905e+00 9.13492063e+00 0.00000000e+00 0.00000000e+00] [ 9.97958721e-03 0.00000000e+00 6.35064641e-03 8.84781130e-01 9.88886369e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00 4.10000000e+01 1.00000000e+00 0.00000000e+00 9.70741665e-02] [ 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 30.99463087 1. 0. 0. ] [ 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 24. 1. 1. 0.] 
[ 0.00000000e+00 0.00000000e+00 2.58215962e-02 0.00000000e+00 0.00000000e+00 9.63615023e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.05633803e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00 2.79495305e+01 1.00000000e+00 0.00000000e+00 0.00000000e+00] [ 0.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00 1.88638181e+01 1.00000000e+00 0.00000000e+00 5.89349611e-03] [ 0.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00 1.40000000e+01 1.00000000e+00 2.51098556e-04 0.00000000e+00] [ 0.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00 1.50000000e+01 1.00000000e+00 1.78126113e-03 2.49376559e-03] [ 9.20588235e-01 0.00000000e+00 0.00000000e+00 5.83823529e-02 9.11764706e-03 3.38235294e-03 0.00000000e+00 4.70588235e-03 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 9.96323529e-01 2.13551471e+01 1.05882353e+00 5.42647059e-02 8.82352941e-04] [ 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 23. 122. 0. 0.] [ 0.00000000e+00 0.00000000e+00 6.25000000e-02 8.43750000e-01 0.00000000e+00 0.00000000e+00 9.37500000e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00 1.56875000e+01 6.38437500e+01 0.00000000e+00 0.00000000e+00] [ 9.99437254e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 5.62746201e-04 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00 3.10153348e+01 1.00000000e+00 7.03432752e-04 0.00000000e+00] [ 0. 0. 0. 0. 0. 0. 0. 0. 1. 
0. 0. 0. 0. 0. 10. 658. 0. 0.] [ 9.98983482e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 7.62388818e-04 0.00000000e+00 2.54129606e-04 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00 1.70000000e+01 1.00000000e+00 0.00000000e+00 0.00000000e+00] [ 9.20746742e-01 0.00000000e+00 0.00000000e+00 2.00774921e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.76118352e-03 0.00000000e+00 0.00000000e+00 5.74145826e-02 0.00000000e+00 0.00000000e+00 1.00000000e+00 1.80000000e+01 1.00000000e+00 0.00000000e+00 0.00000000e+00] [ 9.18107833e-01 0.00000000e+00 2.64496439e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 3.45879959e-02 0.00000000e+00 5.08646999e-04 0.00000000e+00 2.03458800e-02 0.00000000e+00 1.00000000e+00 7.84740590e+00 1.00000000e+00 0.00000000e+00 2.03458800e-03] [ 9.98236677e-01 0.00000000e+00 1.76332288e-03 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00 3.76057994e+00 1.00000000e+00 0.00000000e+00 1.95924765e-04] [ 7.80748663e-01 0.00000000e+00 2.18360071e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 8.91265597e-04 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00 2.70713012e+01 1.00000000e+00 5.94177065e-04 0.00000000e+00] [ 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 10. 1. 1. 0.] 
[ 0.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00 1.27907801e+01 1.00000000e+00 0.00000000e+00 5.80270793e-03] [ 0.00000000e+00 9.84625240e-01 0.00000000e+00 0.00000000e+00 1.53747598e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00 1.11755285e+01 1.00000000e+00 9.92099082e-01 1.70830664e-03] [ 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.72340426 1. 11.55319149 0. 0. ] [ 0.00000000e+00 0.00000000e+00 0.00000000e+00 8.57142857e-02 0.00000000e+00 0.00000000e+00 9.14285714e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00 2.80000000e+00 1.57914286e+02 0.00000000e+00 0.00000000e+00] [ 0.18624735 0.29167346 0.00195535 0. 0. 0. 0.52012384 0. 0. 0. 0. 0. 0. 0.97865407 1.47286948 1. 0.29151051 0.02281245] [ 0.00000000e+00 0.00000000e+00 6.01967058e-02 9.25702097e-01 0.00000000e+00 1.41011968e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00 2.57784098e+01 1.00000000e+00 0.00000000e+00 2.72544140e-03] [ 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.68168168 12.03903904 5.03903904 0.11711712 0.03903904] [ 0.00000000e+00 4.17310665e-02 4.88408037e-01 0.00000000e+00 0.00000000e+00 4.63678516e-03 0.00000000e+00 1.66924266e-01 0.00000000e+00 2.98299845e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00 5.37094281e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00] [ 0.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00 1.70000000e+01 1.00000000e+00 2.40673887e-03 0.00000000e+00]
# Assign every row to a cluster and keep only the human-readable input
# columns plus the cluster id; model.transform adds the 'prediction' column.
features = ['qtype_name', 'proto', 'query_length', 'answer_length', 'Z', 'rejected']
transformed = model.transform(spark_df).select(features + ['prediction'])

# Preview the first rows. (A previous bare transformed.collect() was dropped:
# it pulled every row back to the driver only to discard the result.)
transformed.show()
+----------+-----+------------+-------------+---+--------+----------+ |qtype_name|proto|query_length|answer_length| Z|rejected|prediction| +----------+-----+------------+-------------+---+--------+----------+ | SRV| udp| 57| 1| 1| false| 5| | NB| udp| 8| 1| 1| false| 17| | NB| udp| 8| 1| 1| false| 17| | NB| udp| 8| 1| 1| false| 17| | NB| udp| 4| 1| 1| false| 11| | NB| udp| 4| 1| 1| false| 11| | NB| udp| 4| 1| 1| false| 11| | NB| udp| 6| 1| 1| false| 38| | NB| udp| 6| 1| 1| false| 38| | NB| udp| 6| 1| 1| false| 38| | SRV| udp| 57| 1| 0| false| 5| | SRV| udp| 57| 1| 0| false| 5| | SRV| udp| 57| 1| 0| false| 5| | PTR| udp| 28| 1| 0| false| 1| | PTR| udp| 28| 1| 0| false| 1| | SRV| udp| 57| 1| 1| false| 5| | NB| udp| 15| 1| 1| false| 20| | NB| udp| 12| 1| 1| false| 62| | NB| udp| 15| 1| 1| false| 20| | AAAA| udp| 13| 1| 0| false| 61| +----------+-----+------------+-------------+---+--------+----------+ only showing top 20 rows
# Count how many rows share each distinct feature combination within a
# cluster, ordered by cluster id; display the first 50 groups
(
    transformed
    .groupby(features + ['prediction'])
    .count()
    .sort('prediction')
    .show(50)
)
+----------+-----+------------+-------------+---+--------+----------+-----+ |qtype_name|proto|query_length|answer_length| Z|rejected|prediction|count| +----------+-----+------------+-------------+---+--------+----------+-----+ | A| udp| 12| 1| 0| true| 0| 26| | A| udp| 11| 1| 0| false| 0| 4713| | TXT| tcp| 12| 1| 0| false| 0| 19| | TXT| udp| 12| 1| 0| true| 0| 15| | HINFO| udp| 12| 1| 0| false| 0| 6| | TXT| udp| 12| 1| 0| false| 0| 401| | *| udp| 12| 1| 0| false| 0| 9| | TXT| tcp| 12| 1| 0| true| 0| 5| | A| udp| 11| 1| 0| true| 0| 39| | A| udp| 12| 1| 0| false| 0| 4035| | PTR| udp| 27| 11| 0| false| 1| 11| | PTR| udp| 28| 1| 0| true| 1| 1| | PTR| udp| 27| 1| 0| true| 1| 48| | PTR| udp| 27| 1| 0| false| 1|24311| | PTR| udp| 28| 1| 0| false| 1| 6100| | AXFR| tcp| 10| 687| 0| false| 2| 74| | AXFR| tcp| 10| 347| 0| false| 3| 75| | -| tcp| 1| 347| 0| false| 3| 8| | PTR| udp| 20| 136| 0| false| 4| 3| | PTR| udp| 23| 137| 0| false| 4| 5| | PTR| udp| 23| 138| 0| false| 4| 11| | PTR| udp| 20| 137| 0| false| 4| 10| | *| udp| 54| 1| 0| false| 5| 144| | AAAA| udp| 55| 1| 0| false| 5| 12| | SRV| udp| 50| 1| 0| false| 5| 18| | A| udp| 51| 1| 0| false| 5| 10| | *| udp| 51| 1| 0| false| 5| 16| | A| udp| 55| 1| 0| false| 5| 11| | SRV| udp| 57| 1| 1| false| 5| 3782| | AAAA| udp| 51| 1| 0| false| 5| 10| | TXT| udp| 64| 1| 0| false| 5| 2| | SRV| udp| 57| 1| 0| false| 5| 6619| | A| udp| 59| 1| 0| false| 5| 58| | AAAA| udp| 59| 1| 0| false| 5| 59| | AAAA| udp| 21| 1| 0| false| 6| 1825| | PTR| udp| 21| 1| 0| false| 6| 381| | AAAA| udp| 22| 1| 0| false| 6| 333| | AAAA| udp| 21| 1| 0| true| 6| 2| | AAAA| udp| 20| 1| 0| false| 6| 2326| | AAAA| udp| 21| 1| 1| false| 6| 4| | MX| udp| 15| 1| 0| false| 7| 6| | A| udp| 15| 1| 0| true| 7| 37| | A| udp| 15| 1| 1| false| 7| 8| | PTR| udp| 15| 1| 0| false| 7| 27| | A| udp| 15| 1| 0| false| 7|11854| | HINFO| udp| 15| 1| 0| false| 7| 1| | A| tcp| 15| 1| 0| false| 7| 4| | TXT| udp| 23| 1| 0| true| 8| 2| | A| udp| 23| 1| 0| false| 8| 3496| | TXT| udp| 
23| 1| 0| false| 8| 10| +----------+-----+------------+-------------+---+--------+----------+-----+ only showing top 50 rows