#!/usr/bin/env python # coding: utf-8 # # Proof of concept. Data mining of imported logfile content with Anaconda analytics. # # - Environment: Anaconda 3 x86_64 for windows 10 x86_64 (Python 3.5) # - Python packages added: # - conda/pip install pandas_highcharts # - conda install seaborn # - Installed Windows programs required for exporting this Jupyter notebook to PDF: # - Pandoc # - TeX Live 2015 (LaTeX) # - Motivation: # - This proof of concept was developed by the author during a recruitment process in November 2015: a Python programming test to be completed in 3 hours. Only a first design with a basic prototype was delivered, but a couple of updates were handled over in the following days with the final outcome. # - The programming test was about importing a custom large log file where specific entries needed to be filtered out and imported for data mining purposes. # - The author of this notebook decided to deliver added value and develop a proof of concept based on Anaconda analytics instead of a traditional solution based on Python + vim + awk + sed. # - The 3 pages long detailed requirements are not included in order to preserve anonymity of the Company and privacy of their recruitment test. # - Ideas for improvement: # - mmap module as the most efficient way to read large log files # - syslog module to deal with Syslog file rotation # In[5]: # Standard import tempfile import os import sys import pyparsing from pyparsing import Word, alphas, Suppress, Combine, nums, string, Optional, Regex, ParseException from datetime import datetime import pandas as pd import numpy as np from numpy.random import randn # Stats from scipy import stats from pylab import * # Plotting import matplotlib.pyplot as plt import matplotlib as mpl import seaborn as sns get_ipython().run_line_magic('pylab', 'inline') get_ipython().run_line_magic('matplotlib', 'inline') # In[6]: logfilecontent = ''' Feb 25 12:40:12 victim /opt/zl/bin/zdaemon[29412]: [zl::esink::Internals::timeout][/m0/zdaemon3]: Interval(5000): \ numCycles(173) numTimeouts(89) numMessages(241) numTMessages(2) numDescriptors(48) numPipes(0) task-events(0) num-timers(439) Feb 25 12:40:17 victim /opt/zl/bin/zdaemon[29412]: [zl::esink::Internals::timeout][/m0/zdaemon3]: Interval(5000): \ numCycles(17) numTimeouts(82) numMessages(247) numTMessages(3) numDescriptors(54) numPipes(0) task-events(0) num-timers(454) other logs 1 Feb 25 12:40:22 victim /opt/zl/bin/zdaemon[29412]: [zl::esink::Internals::timeout][/m0/zdaemon3]: Interval(5000): \ numCycles(12) numTimeouts(2) numMessages(242) numTMessages(5) numDescriptors(67) numPipes(0) task-events(0) num-timers(499) Feb 25 12:40:27 victim /opt/zl/bin/zdaemon[29412]: [zl::esink::Internals::timeout][/m0/zdaemon3]: Interval(5000): \ numCycles(7) numTimeouts(32) numMessages(189) numTMessages(8) numDescriptors(60) numPipes(0) task-events(0) num-timers(413) Feb 25 12:40:32 victim /opt/zl/bin/zdaemon[29412]: [zl::esink::Internals::timeout][/m0/zdaemon3]: Interval(5000): \ numCycles(18) numTimeouts(21) numMessages(310) numTMessages(3) numDescriptors(78) numPipes(0) task-events(0) num-timers(505) other logs 2 Feb 25 12:40:37 victim /opt/zl/bin/zdaemon[29412]: [zl::esink::Internals::timeout][/m0/zdaemon3]: Interval(5000): \ numCycles(22) numTimeouts(17) numMessages(233) numTMessages(6) numDescriptors(65) numPipes(0) task-events(0) num-timers(569) ''' # In[7]: month = Word(string.ascii_letters, string.ascii_letters, exact=3) integer = Word(nums) serverDateTime = Combine(month + " " + integer + " " + integer + ":" + integer + ":" + integer) hostname = Word(alphas + nums + "_" + "-") daemon = Word(alphas + "/" + "-" + "_" ) + Optional(Suppress("[") + integer + Suppress("]")) + Suppress(":") zl = Suppress("[") + Word( alphas + "::" + alphas + "::" + alphas + "::" + alphas ) + Suppress("]") m0 = Suppress("[") + Word( alphas + nums + "_" + "-" + "/" ) + Suppress("]") + Suppress(":") interval = Suppress("Interval") + Suppress("(") + Word(nums) + Suppress("):") numCycles = Suppress("numCycles") + Suppress("(") + Word(nums) + Suppress(")") numTimeouts = Suppress("numTimeouts") + Suppress("(") + Word(nums) + Suppress(")") numMessages = Suppress("numMessages") + Suppress("(") + Word(nums) + Suppress(")") numTMessages = Suppress("numTMessages") + Suppress("(") + Word(nums) + Suppress(")") numDescriptors = Suppress("numDescriptors") + Suppress("(") + Word(nums) + Suppress(")") numPipes = Suppress("numPipes") + Suppress("(") + Word(nums) + Suppress(")") taskevents = Suppress("task-events") + Suppress("(") + Word(nums) + Suppress(")") numtimers = Suppress("num-timers") + Suppress("(") + Word(nums) + Suppress(")") bnf = (serverDateTime + hostname + daemon + zl + m0 + interval + numCycles + numTimeouts + numMessages + numTMessages + numDescriptors + numPipes + taskevents + numtimers) # In[8]: def make_tempfile(): fd, temp_file_name = tempfile.mkstemp() os.close(fd) f = open(temp_file_name,'wt') try: f.write(logfilecontent) finally: f.close() return temp_file_name def cleanup(filename): os.unlink(filename) filename = make_tempfile() L=list() with open(filename) as infile: for line in infile: try: results = bnf.parseString(line) except ParseException as pe: continue else: L.append(results) print(L) cleanup(filename) # In[9]: # Columns naming dframe = pd.DataFrame(L,columns=['serverDateTime','hostname','daemon','daemon-p','zl','m0','interval','numcycles', 'numtimeouts','nummessages','numtmessages','numdescriptors','numpipes','task-events', 'num-timers']) # In[10]: # dtype conversion to integer dframe[['interval','numcycles','numtimeouts','nummessages','numtmessages','numdescriptors','numpipes','task-events', 'num-timers']] = dframe[['interval','numcycles','numtimeouts','nummessages','numtmessages','numdescriptors', 'numpipes','task-events','num-timers']].astype(int) # In[11]: dframe # In[12]: dframe.columns # In[13]: dframe.index # In[14]: dframe.columns # In[15]: dframe['nummessages'] # In[16]: sample_string2 = dframe['num-timers'] # In[17]: sample_string2 # In[18]: del dframe['hostname'] del dframe['daemon'] del dframe['daemon-p'] del dframe['zl'] del dframe['m0'] del dframe['numpipes'] del dframe['task-events'] # In[19]: dframe # In[20]: dframe[['nummessages','numtmessages']] # In[21]: dframe2 = pd.DataFrame(dframe,columns=['numcycles','numtimeouts','nummessages','numtmessages']) # In[22]: dframe2 # In[23]: dframe2.describe() # In[24]: dframe2.index # ## Plotting copied dataframe with specific columns # In[25]: matplotlib.style.use('ggplot') rcParams['figure.figsize'] = 10, 5 # width, height in inches dframe2.plot() legend() # In[26]: dframe # In[27]: dframe.index # ## Problem: Monitor should obtain the following information # - The number of events per second for numCycles, numTimeouts, numMessages and numTMessages # - The current value of numDescriptors and num-timers # - Solution: DataFrame manipulation # - Issue: serverDateTime is not in datetime format, pd.to_datetime will solve it although year is not specified in origin and Python's default year is 1900. # In[28]: dframe['interval'] = dframe['interval']/1000 dframe['numcycles']=dframe['numcycles']/5 dframe['numtimeouts']=dframe['numtimeouts']/5 dframe['nummessages']=dframe['nummessages']/5 dframe['numtmessages']=dframe['numtmessages']/5 dframe['serverDateTime'] = pd.to_datetime(dframe['serverDateTime'],format='%b %d %H:%M:%S') dframe['serverDateTime'].dt.year dframe['serverDateTime'] # ## Python's default year is updated # - Year is not specified in logs' timestamp and default year in Python is 1900. # - Adding years to our current datetime is not valid as each year has a different amount of days. # - daysToAdd = 365 * 115 # - dframe['serverDateTime'] = dframe['serverDateTime'] + np.timedelta64(115, 'Y') # - dframe['serverDateTime'] = dframe['serverDateTime'] + np.timedelta64(daysToAdd, 'D') # - Solution: Timestamps have a replace method (just like datetimes): # - df.index.map(lambda t: t.replace(year=2015, month=2, day=1)) # In[29]: dframe['serverDateTime'] = dframe['serverDateTime'].map(lambda t: t.replace(year=2015)) dframe['serverDateTime'].dt.year dframe['serverDateTime'] # ## Problem: Every 10 seconds, the program should emit all the counters to a file or a round-robin database for further processing and logging. # - DatetimeIndex required for generating meaninful graphs and by dframe.resample # - We setup serverDateTime column as index in order to use dframe.resample below (DatetimeIndex) # In[30]: #dframe.set_index('serverDateTime', drop=False, verify_integrity=True) dframe.set_index('serverDateTime', drop=False, inplace=True) dframe # In[31]: dframe.index # ### Pandas has simple, powerful, and efficient functionality for performing resampling operations during frequency conversion (e.g., converting secondly data into 5-minutely data). This is extremely common in, but not limited to, financial applications. See the Time Series section. # In[32]: # DataFrame.resample will show NaN rows. We are copying dframe to a new one in order to avoid issues afterwards # (pd.notnull not working properly) dframe10s = dframe.copy(deep=True) #dframe.resample('10min', how='sum') #dframe10s.resample('10S', how='sum') # 10S = 10 seconds dframe10s.resample('10S', how='mean') # 10S = 10 seconds, mean = average #dframe10s[pd.notnull(dframe10s.resample('10S', how='mean'))] #dframe10s[np.isfinite(dframe10s.resample('10S', how='mean'))] # In[33]: dframe # In[34]: dframe.index # ### Groupby per hour: # In[35]: def func(group): return pd.Series({'numcycles': group.numcycles.sum(), 'numtimeouts': group.numtimeouts.sum(), 'nummessages': group.nummessages.sum(), 'numtmessages': group.numtmessages.sum(), 'serverDateTime': group.index[1], 'Period': '{0} - {1}'.format(group.index[0].date(), group.index[-1].date())}) dframe.groupby(lambda idx: idx.hour).apply(func) # In[36]: dframe # ### resample 10 min # In[37]: dframe10min = dframe.copy(deep=True) dframe10min.resample('10min', how='sum') # In[38]: dframe # In[39]: dframe.index # In[40]: dframe['numcycles'].value_counts() # ## Obtain some values related with the machine load. # - There's no load average for the last 1, 5 and 15 minutes in my windows environment. In any case this is trivial task. # - import psutil # - import os # - os.getloadavg() # - psutil.cpu_percent(percpu=True) # ## Configurable threshold on the aggregated counters # In[41]: dframe[(dframe.numtimeouts < 17) & (dframe.numtimeouts > 5)] # In[47]: dframe[(dframe.numtimeouts < 17) & (dframe.numtimeouts > 5)] [['numtimeouts']] # In[43]: dframe[dframe['numtimeouts']>6] # ### Threshold checking via a conditional statement # In[44]: if ~dframe[dframe['numtimeouts']>6].empty: print('DataFrame contains rows with numtimeouts bigger than threshold!') #dframe3 = dframe[dframe['numtimeouts']>6] #dframe3 is not None and isinstance(dframe3, pd.DataFrame) and not dframe3.empty # ### Display an index/row # In[45]: dframe.ix['2015-02-25 12:40:17'] # ## Graphs # ### Plotting with matplotlib # In[48]: rcParams['figure.figsize'] = 10, 5 # width, height in inches #df_s = dframe['numcycles'].resample('1S', how='sum') #df_s.plot() # In[49]: dframe.plot() legend() # In[50]: #dframe.resample("1D", how="sum") dframe['numcycles'].plot() legend() # In[51]: plt.figure();dframe.plot(); # In[52]: dframe.plot(x='serverDateTime',y='numdescriptors') # ## Plotting with Seaborn # ### Scatterplot # In[53]: sns.lmplot('numtimeouts','nummessages', data=dframe, fit_reg=False) # ### Density Plot. Kernel density estimation # In[54]: sns.kdeplot(dframe.nummessages) # In[55]: sns.kdeplot(dframe.nummessages, dframe.numtimeouts) # In[56]: sns.distplot(dframe.nummessages) # ### Histogram # In[57]: plt.hist(dframe.nummessages, alpha = .3) sns.rugplot(dframe.nummessages); # ### Violin Plot # In[58]: sns.violinplot([dframe.numtimeouts,dframe.nummessages]) # ### Heatmap # In[59]: sns.heatmap([dframe.numtimeouts,dframe.nummessages], annot=True, fmt="f") # ## Plotting with Pandas Highcharts # - conda/pip install pandas_highcharts # In[60]: from pandas.compat import StringIO from pandas.io.common import urlopen from IPython.display import display, display_pretty, Javascript, HTML from pandas_highcharts.core import serialize from pandas_highcharts.display import display_charts # In[66]: display_charts(dframe, title="logs") # In[67]: display_charts(dframe, kind="bar", title="rate") # In[68]: display_charts(dframe, kind="barh", title="rate") # In[69]: display_charts(dframe, title="rate", legend=None, kind="bar", figsize = (1000, 700)) # In[70]: display_charts(dframe, title="rate", kind="bar", render_to="chart5", zoom="xy") # #