#!/usr/bin/env python
# coding: utf-8

# # TEAM V - Final Project for Data Visualization (IS590DV Fall 2019)
# by Dhwani Parekh, Wenyi Shang, Akshat Sharma, Anirudh Sharma, Tre Tomaszewski
#
# ## [WISDM Smartphone and Smartwatch Activity and Biometrics Dataset Data Set](https://archive.ics.uci.edu/ml/datasets/WISDM+Smartphone+and+Smartwatch+Activity+and+Biometrics+Dataset+)
#
# From
# > Smartphone and Smartwatch-Based Biometrics Using Activities of Daily Living. IEEE Access, 7:133190-133202, Sept. 2019.
#
# and
#
# > [Jennifer R. Kwapisz, Gary M. Weiss and Samuel A. Moore (2010). Activity Recognition using Cell Phone Accelerometers, Proceedings of the Fourth International Workshop on Knowledge Discovery from Sensor Data (at KDD-10), Washington DC.](http://www.cis.fordham.edu/wisdm/includes/files/sensorKDD-2010.pdf)
#
#
#
# | Data File Group | Total Size | Total Files | Instances |
# |-:|-:|-:|-:|
# |`Phone/Accel`| 250MB | 51 | 4,804,404 |
# |`Phone/Gyro` | 205MB | 51 | 3,608,635 |
# |`Watch/Accel`| 196MB | 51 | 3,777,048 |
# |`Watch/Gyro` | 190MB | 51 | 3,440,344 |
# |All | 1.1GB | 204 | 15,630,426|

# In[1]:

get_ipython().run_line_magic('matplotlib', 'widget')
from pathlib import Path
from IPython.display import display
import ipywidgets
import ipywidgets as widgets
from ipywidgets import interact, interactive
import traitlets
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import bqplot

# Thanks to `ImportanceOfBeingErnest` from https://stackoverflow.com/questions/47404653/pandas-0-21-0-timestamp-compatibility-issue-with-matplotlib
pd.plotting.register_matplotlib_converters()


# In[2]:

# Subject ids are the part of each phone CSV file name before the first
# underscore.  ``Path.glob`` yields files in arbitrary order, so the ids are
# sorted to give the dropdowns a stable, predictable ordering.
subject_ids = sorted(
    p.stem.split('_')[0]
    for p in Path('../data/processed/wisdm/merged_sensors/phone/').glob('*.csv')
)

# Each line of the key file has the form ``<left> = <right>``.  The file is
# read and parsed once, then both lookup directions are built from it.
_activity_pairs = [
    line.strip().split(' = ')
    for line in Path('../references/wisdm_activity_key.txt').read_text().strip().split('\n')
]
# Left-hand side -> right-hand side (used as label -> value dropdown options).
activity_name_key = {left: right for left, right in _activity_pairs}
# Right-hand side -> left-hand side (used to translate `activity_code` values).
activity_letter_key = {right: left for left, right in _activity_pairs}


# In[3]:

def get_by_subject_id(subject_id='1600'):
    """Load the merged phone and watch sensor tables for one subject.

    Parameters
    ----------
    subject_id : str or int
        Subject identifier; it is converted to ``str`` before being
        substituted into the CSV file names.

    Returns
    -------
    dict
        ``{'phone': DataFrame, 'watch': DataFrame}``.  Each frame has its
        ``timestamp`` column dropped and gains an ``activity_name`` column
        (inserted at position 2) obtained by translating ``activity_code``
        through ``activity_letter_key``.
    """
    subject_id = str(subject_id)  # force into a string
    device_dict = {}
    for device in ('phone', 'watch'):
        csv_path = '../data/processed/wisdm/merged_sensors/{0}/{1}_{0}.csv'.format(device, subject_id)
        # The timestamp isn't really needed, and parsing it adds a lot of
        # loading time, so it is read as-is and dropped.
        frame = pd.read_csv(csv_path).drop(columns=['timestamp'])
        frame.insert(2, 'activity_name', frame['activity_code'].replace(activity_letter_key))
        device_dict[device] = frame
    return device_dict


# In[4]:

all_phone_df = pd.read_csv('../data/processed/wisdm/merged_sensors/phone_subject_activity_counts.csv')
all_watch_df = pd.read_csv('../data/processed/wisdm/merged_sensors/watch_subject_activity_counts.csv')


# ## Part I: General Data Analytics
#
# Below are charts briefly analyzing the raw data through several basic facets.

# In[5]:

plt.ion()
fig, ax = plt.subplots()
fig.canvas.layout.width = '800px'
fig.canvas.layout.height = '600px'
ax.bar(
    ['phone', 'watch'],
    [all_phone_df.num_rows.sum(), all_watch_df.num_rows.sum()],
    align='center', alpha=0.5, color='blue',
)
ax.set_ylabel('Number of records')
fig.suptitle('Total Records per Device')
fig.show()

# Comparing total records logged by the phone and the watch, the watch is logging approximately 14% more records than the phone.
# In[6]:

plt.ioff()

# Total rows per subject, separately for each device.
rps_phone_count = all_phone_df.groupby('subject_id')['num_rows'].sum()
rps_watch_count = all_watch_df.groupby('subject_id')['num_rows'].sum()

fig, axs = plt.subplots(2, 1)
for ax, device, counts, color in zip(
        axs,
        ('Phone', 'Watch'),
        (rps_phone_count, rps_watch_count),
        ('blue', 'orange')):
    # Label every panel from its OWN index so bars and tick labels cannot
    # drift apart if the two devices do not cover the same subjects.
    xlabels = [str(i) for i in counts.index]
    ax.set_title('Records for {} by Subject'.format(device))
    ax.grid(axis='x', alpha=0.1, zorder=-1)
    ax.bar(xlabels, counts.values, alpha=0.8, color=color, zorder=5)
    ax.xaxis.set_tick_params(labelrotation=40)
    ax.set_xticklabels(xlabels, va='top', ha='right', rotation_mode='anchor')
    ax.set_xlabel('Subject ID', fontsize=12)
    ax.set_ylabel('Number of Records', fontsize=12)

fig.canvas.layout.height = '1200px'
fig.subplots_adjust(hspace=0.4)
widgets.VBox([fig.canvas])

# We can observe that the variance in the number of records per subject is smaller for the watch than for the phone.
# Subject 1629 has the highest number of records on both the phone and the watch; it most probably is an outlier, but we aren't quite sure why.
# In[7]:

plt.ioff()

# Total rows per activity (codes translated through `activity_letter_key`),
# each device sorted by its own totals in descending order.
rpa_phone_count = (
    all_phone_df.replace({'activity_code': activity_letter_key})
    .groupby('activity_code')['num_rows'].sum()
    .sort_values(ascending=False)
)
rpa_watch_count = (
    all_watch_df.replace({'activity_code': activity_letter_key})
    .groupby('activity_code')['num_rows'].sum()
    .sort_values(ascending=False)
)

fig, axs = plt.subplots(2, 1)
for ax, device, counts, color in zip(
        axs,
        ('Phone', 'Watch'),
        (rpa_phone_count, rpa_watch_count),
        ('blue', 'orange')):
    # Each series is sorted independently, so the tick labels MUST come from
    # that series' own index.  Reusing the phone's labels for the watch bars
    # would attach watch totals to the wrong activity names.
    xlabels = [str(i) for i in counts.index]
    ax.set_title('Records for {} by Activity'.format(device))
    ax.grid(axis='y', alpha=0.2, zorder=-1)
    ax.bar(xlabels, counts.values, alpha=0.8, color=color, zorder=5)
    ax.xaxis.set_tick_params(labelrotation=40)
    ax.set_xticklabels(xlabels, va='top', ha='right', rotation_mode='anchor')
    ax.set_xlabel('Activity', fontsize=12)
    ax.set_ylabel('Number of Records', fontsize=12)
    ax.set_ylim(0, 2*10**5)

fig.canvas.layout.height = '1200px'
fig.subplots_adjust(hspace=0.4)
widgets.VBox([fig.canvas])

# Surprisingly, all the activities have the same sequence in watch or phone with respect to the number of records logged.
# NOTE(review): an earlier version of this chart labelled the watch bars with the phone's activity order, which would make the two sequences look identical regardless of the data; re-check this observation against the corrected chart.
# The number of records logged by the phone ranges from 150000 to 180000, while the number logged by the watch ranges from 180000 to 190000. This confirms that the watch is recording more data, as we saw in the first part.
# ## Part II: Aggregate Analytics
# Below is an interactive bqplot project to display the aggregated/derived data of the dataset. It allows users to select subject ID (1600-1650), phone vs watch, coordinates (x_accel, y_accel, z_accel, x_gyro, y_gyro, z_gyro), and aggregation type (in total, there are 5 aggregation types: sum, mean, max, min, count). Users can select anything they want from the four dropdowns, and the results will display as a barplot of the data for every activity correspondingly. With this interactive plot, users can learn about the aggregated results of the dataset.

# In[8]:

plt.ion()


@interact(
    Subject_ID=subject_ids,
    Device=['phone', 'watch'],
    Coordinate=['x_accel', 'y_accel', 'z_accel', 'x_gyro', 'y_gyro', 'z_gyro'],
    Aggregate=['sum', 'mean', 'max', 'min', 'count'],
)
def get_subject(Subject_ID, Device='phone', Coordinate='x_accel', Aggregate='sum'):
    """Display a bar chart of one sensor column aggregated per activity code.

    All four dropdowns belong to a single ``interact`` so that changing one
    of them keeps the other three selections (nesting one ``interact`` per
    dropdown rebuilds the inner dropdowns on every outer change).

    Parameters
    ----------
    Subject_ID : str
        Subject whose data is loaded through ``get_by_subject_id``.
    Device : str
        Key into the loaded dict: ``'phone'`` or ``'watch'``.
    Coordinate : str
        Name of the sensor column to aggregate.
    Aggregate : str
        Aggregation name handed to pandas ``agg`` (``'sum'``, ``'mean'``,
        ``'max'``, ``'min'`` or ``'count'``).
    """
    grouped = get_by_subject_id(Subject_ID)[Device].groupby('activity_code')[Coordinate]
    # `agg` dispatches on the aggregation name, replacing a five-way `if` chain.
    y = grouped.agg(Aggregate)

    bin_x_sc = bqplot.OrdinalScale()
    bin_x_ax = bqplot.Axis(scale=bin_x_sc, label='activity code')
    bin_y_sc = bqplot.LinearScale()
    bin_y_ax = bqplot.Axis(scale=bin_y_sc, orientation='vertical', label='value')
    bars = bqplot.Bars(x=y.index, y=y, scales={'x': bin_x_sc, 'y': bin_y_sc})
    fig = bqplot.Figure(marks=[bars], axes=[bin_x_ax, bin_y_ax])
    display(fig)


# By looking at each activity's mean values for the various acceleration axes, some observations regarding the overall motion of the activity can be made. For example, using subject 1600:
#
# 1. **Device**: Phone; **Axis**: _x\_accel_; **Aggregation method**: _mean_
#

# Activity teeth has the highest mean for x_accel, which is near 5, and activity dribbling has the lowest mean, which is nearly -4. Among the activities with a negative mean, all means are around -2 or above, except for dribbling. Also, activity typing has a mean near 0.

# # 2. **Device**: Watch; **Axis**: _x\_accel_; **Aggregation method**: _mean_
#

# Except for activities sitting and folding, the sign of the mean of every activity is reversed for the watch. Walking has the highest x_accel mean, while teeth has the lowest x_accel mean. Interestingly, activity teeth had the highest x_accel mean for the phone. Activities writing and typing have a mean near 0.

# # 3. **Device**: Phone; **Axis**: _y\_accel_; **Aggregation method**: _mean_
#

# The mean of y_accel is positive for all the activities. Catching and folding have the highest y_accel mean, while teeth has the lowest y_accel mean.

# # 4. **Device**: Watch; **Axis**: _y\_accel_; **Aggregation method**: _mean_
#

# The mean of y_accel is negative for all the activities. Writing has the lowest (highest in magnitude) y_accel mean; interestingly, writing had an almost zero x_accel mean on the watch. Drinking has the highest mean (lowest in magnitude).

# In[ ]:

plt.ion()


def _build_statespace_scatter():
    """Create the raw-acceleration 3-D scatter figure and its control widgets.

    Everything lives inside this function so the figure, the widgets and the
    redraw callback are private to this cell: a later cell that reuses names
    such as ``fig`` or ``update_scatter`` cannot hijack this cell's widgets.

    Returns
    -------
    ipywidgets.VBox
        The control panel (three dropdowns plus the step-size slider).
    """
    fig = plt.figure()
    fig.suptitle('Scatter Plot of the WISDM Data Statespace Sum')
    fig.canvas.layout.width = "1000px"
    fig.canvas.layout.height = "1000px"
    # One axes, created once and cleared on each redraw, instead of adding a
    # new subplot to the figure on every widget change.
    ax = fig.add_subplot(111, projection='3d')

    subject_dd = widgets.Dropdown(options=subject_ids, description='Subject ID:', value='1600')
    device_dd = widgets.Dropdown(options=['phone', 'watch'], description='Device Type:', value='phone')
    activity_dd = widgets.Dropdown(options=activity_name_key, description='Activity:', value='A')
    stepsize_sl = widgets.IntSlider(min=1, max=20, value=10)

    def update_scatter(subject_id=None, device=None, activity=None, stepsize=None):
        """Redraw the scatter; any argument left as ``None`` uses its widget's value."""
        # Falling back to the widgets (not to hard-coded defaults) keeps the
        # other three selections intact when only one control changes.
        Subject_ID = subject_dd.value if subject_id is None else subject_id
        Device = device_dd.value if device is None else device
        Activity = activity_dd.value if activity is None else activity
        StepSize = stepsize_sl.value if stepsize is None else stepsize

        x = get_by_subject_id(Subject_ID).get(Device).groupby('activity_code').get_group(Activity)[
            ['x_accel', 'y_accel', 'z_accel', 'x_gyro', 'y_gyro', 'z_gyro']]
        smooth_x = x[::StepSize]  # keep every StepSize-th sample

        ax.clear()  # drop the previous scatter, title and labels
        # Colour runs along the jet colormap from the first to the last sample.
        ax.scatter(smooth_x['x_accel'], smooth_x['y_accel'], smooth_x['z_accel'],
                   c=plt.cm.jet(np.linspace(0, 1, len(smooth_x))))
        ax.set_title('Subject #{} `{}` ({}) using {}'.format(
            Subject_ID, activity_letter_key.get(Activity).capitalize(), Activity, Device.capitalize()))
        ax.set_xlabel('X-Axis')
        ax.set_ylabel('Y-Axis')
        ax.set_zlabel('Z-Axis')
        fig.canvas.draw()
        fig.canvas.flush_events()

    def on_change(change):
        # Every control triggers the same full redraw from current widget state.
        update_scatter()

    for control in (subject_dd, device_dd, activity_dd, stepsize_sl):
        control.observe(on_change, names='value')

    update_scatter()
    return widgets.VBox([
        widgets.HBox([subject_dd, device_dd, activity_dd]),
        widgets.HBox([widgets.Label(value='Steps between Points:'), stepsize_sl]),
    ])


_build_statespace_scatter()

# The color gradient explains the start and the end of a person's activity for all the 18 activities.
# The dark blue color represents the start of the person's activity, and as it reaches the end it is represented by a dark red color.
#
# After seeing the 2d visualizations we moved forward to visualizing the 3d scatter plots, just to see the movement for different activities in 3-dimensional space.
#
# First, we plotted the normal 3d plots, but it was just random scatter and we were unable to differentiate between the start and end of any activity. We determined a different color scale would help perceive this difference.
#
# Inferences for subject 1600:
# * Walking: An outlier on the top left corner of the plot.
# * Jogging: An outlier on the top of the plot.
# * Standing: More outliers on the right of the plot
# * Stairs: Some outliers in the entire plot
#
# With the color scale applied to these plots, we observed the cumulative sum of the accelerations might produce a more informational pattern for the activities.
# As for the outliers, we can only guess. We do not know where they are walking, the surface, or conditions. While walking, the outlier could mean the subject jerked or jumped suddenly. This is an interesting question to be raised!
# In[ ]:

plt.ion()


def _build_cumsum_scatter():
    """Create the cumulative-sum 3-D scatter figure and its control widgets.

    Everything lives inside this function so the figure, the widgets and the
    redraw callback are private to this cell and cannot be captured by (or
    capture) same-named objects from another cell.

    Returns
    -------
    ipywidgets.VBox
        The control panel (three dropdowns plus the step-size slider).
    """
    fig = plt.figure()
    fig.suptitle('Scatter Plot of the WISDM Data Cumulative Sum over Time')
    fig.canvas.layout.width = "1000px"
    fig.canvas.layout.height = "1000px"
    # One axes, created once and cleared on each redraw, instead of adding a
    # new subplot to the figure on every widget change.
    ax = fig.add_subplot(111, projection='3d')

    subject_dd = widgets.Dropdown(options=subject_ids, description='Subject ID:', value='1600')
    device_dd = widgets.Dropdown(options=['phone', 'watch'], description='Device Type:', value='phone')
    activity_dd = widgets.Dropdown(options=activity_name_key, description='Activity:', value='A')
    stepsize_sl = widgets.IntSlider(min=1, max=20, value=10)

    def update_scatter(subject_id=None, device=None, activity=None, stepsize=None):
        """Redraw the scatter; any argument left as ``None`` uses its widget's value."""
        # Falling back to the widgets (not to hard-coded defaults) keeps the
        # other three selections intact when only one control changes.
        Subject_ID = subject_dd.value if subject_id is None else subject_id
        Device = device_dd.value if device is None else device
        Activity = activity_dd.value if activity is None else activity
        StepSize = stepsize_sl.value if stepsize is None else stepsize

        # Running (row-wise cumulative) sum of every sensor column.
        x = get_by_subject_id(Subject_ID).get(Device).groupby('activity_code').get_group(Activity)[
            ['x_accel', 'y_accel', 'z_accel', 'x_gyro', 'y_gyro', 'z_gyro']].cumsum(axis=0)
        smooth_x = x[::StepSize]  # keep every StepSize-th sample

        ax.clear()  # drop the previous scatter, title and labels
        # Colour runs along the jet colormap from the first to the last sample.
        ax.scatter(smooth_x['x_accel'], smooth_x['y_accel'], smooth_x['z_accel'],
                   c=plt.cm.jet(np.linspace(0, 1, len(smooth_x))))
        ax.set_title('Subject #{} `{}` ({}) using {}'.format(
            Subject_ID, activity_letter_key.get(Activity).capitalize(), Activity, Device.capitalize()))
        ax.set_xlabel('X-Axis')
        ax.set_ylabel('Y-Axis')
        ax.set_zlabel('Z-Axis')
        fig.canvas.draw()
        fig.canvas.flush_events()

    def on_change(change):
        # Every control triggers the same full redraw from current widget state.
        update_scatter()

    for control in (subject_dd, device_dd, activity_dd, stepsize_sl):
        control.observe(on_change, names='value')

    update_scatter()
    return widgets.VBox([
        widgets.HBox([subject_dd, device_dd, activity_dd]),
        widgets.HBox([widgets.Label(value='Steps between Points:'), stepsize_sl]),
    ])


_build_cumsum_scatter()

# We can find more patterns in the figures of cumulative sums. Compared to simple scatter plots, where the points are messily distributed, the points in cumulative sum plots show a clear path. In the figure, blue is the beginning point and red is the ending point. From this, we can find that in every activity the y values go up, while the x and z values vary (some go up, others go down) depending on the activity.

# ## Part III: Adjusted Data

# In[61]:

activities = get_by_subject_id(1600)['phone'].groupby('activity_name')
# Acceleration columns for four activities, each divided by 20.
dfs = {
    activity: activities.get_group(activity)[['x_accel', 'y_accel', 'z_accel']] / 20
    for activity in ['walking', 'jogging', 'stairs', 'standing']
}

fig, axs = plt.subplots(4, figsize=(10, 10))
for ax, (name, df) in zip(axs, dfs.items()):
    # Subtract each column's mean, then accumulate over the rows.
    adj_df = df.sub(df.mean(axis=0)).cumsum(axis=0)
    ax.plot(adj_df.z_accel, label='Z Axis (Forward)')
    ax.plot(adj_df.x_accel, label='X Axis (Side-to-Side)')
    ax.plot(adj_df.y_accel, label='Y Axis (Up and Down)')
    ax.set_ylabel('Meters/0.05 seconds squared')
    ax.set_title(name.capitalize())
# Only the bottom panel carries the x label.  Indexing the last axes avoids
# `Axes.is_last_row()`, which no longer exists in newer Matplotlib releases.
axs[-1].set_xlabel('Time (50ms per step)')

fig.canvas.layout.width = '1200px'
fig.canvas.layout.height = '1500px'
fig.suptitle('Acceleration with Adjusted Axes over Time for 4 Activities')
fig.subplots_adjust(top=0.91, left=0.05, right=0.95, hspace=0.5)
# One shared legend built from the first panel's labelled lines, so handles
# and labels are paired explicitly rather than matched up by position.
legend_handles, legend_labels = axs[0].get_legend_handles_labels()
fig.legend(legend_handles, legend_labels, loc='upper center', ncol=3, bbox_to_anchor=(0.5, 0.97))

# The problems discovered in the previous visualizations prompted further analysis. The second paper, [Activity Recognition using Cell Phone Accelerometers](http://www.cis.fordham.edu/wisdm/includes/files/sensorKDD-2010.pdf) (Kwapisz et al.), helped identify where adjustments to the data may assist in creating a more sensible plot.
#
# The first is the reason for the consistently larger magnitude of the Y-Acceleration, causing the significant growth of the cumulative sum.
# This phenomenon is explained on page 3:
# > Note that for most activities the y values have the largest accelerations. This is a consequence of Earth's gravitational pull, which causes the accelerometer to measure a value of 9.8 m/s² in the direction of the Earth's center. For all activities except sitting this direction corresponds to the y axis
#
# Additionally, the paper depicts a graphic of the directionality of the movement with respect to the person. Since the phone is in the subject's pocket, all upright activities (walking, jogging, standing, and stairs) have the Z-axis as the forward vector, and the X axis as the side to side.
#
# Where subtracting __gravitation__ (at $9.807 \frac{m}{s^2}$, then dividing by 20 to match the 20Hz recording rate) would seem to make the most sense, it doesn't work as well as hoped. Instead, the overall _mean_ is subtracted from each Y-Acceleration value prior to summation, using the device's frame-of-reference. Because Y is not __necessarily__ downward, this subtraction isn't a perfectly sound assumption. Applying instantaneous rotation using gyroscopic data was attempted, but did not help to correct for any deviations, since the original orientation of the device is not known (especially for the watch). In addition, in a real-time setting, this would be untenable.
#
# The resulting visualization shows the plotted path of the summed acceleration of each subject's device for each activity over time. In addition to the options to choose the subject, device, and activity, the slider will follow the path over a predefined time range. Sensible patterns for the activity in question emerge from this view-resolution, not easily apparent in previous visualizations. This indicates that manual course-correction may be possible with help from the visualization.
# In[ ]:

plt.ioff()
fig = plt.figure()
fig.canvas.layout.width = '1600px'
fig.canvas.layout.height = '1200px'


class AdjustedPlot():
    """Interactive 3-D path of adjusted, cumulatively summed accelerations.

    The plotted data is one subject/device/activity selection: the three
    acceleration columns divided by 20, with the mean removed from
    ``y_accel``, then cumulatively summed.  The device's z axis is drawn on
    the plot's y axis and the device's y axis on the plot's z axis.  A slider
    moves a window of ``view_span`` samples that determines the axis limits.

    Attributes (all set in ``__init__``):
        fig: the Matplotlib figure drawn on.
        view_span: number of samples covered by the view window.
        subject_id, device, activity_code: the current data selection.
        df: the adjusted cumulative data for the current selection.
        subject_dropdown, device_dropdown, activity_dropdown, view_slider:
            the control widgets, exposed so the caller can lay them out.
    """

    def __init__(self, fig_):
        self.fig = fig_
        self.fig.suptitle('Adjusted Cumulative Instantaneous Accelerations over Time')

        # Data Initialization
        self.view_span = 500
        self.subject_id = '1600'
        self.device = 'phone'
        self.activity_code = 'A'
        self.output = widgets.Output()
        self.update_data()

        # Initial Plot
        self.ax = self.fig.add_subplot(111, projection='3d')
        self.scatter = self.ax.scatter3D(self.df.x_accel, self.df.z_accel, self.df.y_accel,
                                         c=plt.cm.jet(np.linspace(0, 1, len(self.df))))
        self.lines = self.ax.plot(self.df.x_accel, self.df.z_accel, self.df.y_accel,
                                  color='black', alpha=0.5)
        self.ax.set_xlabel('X-Axis')
        self.ax.set_ylabel('Adjusted Y-Axis (originally Z)')
        self.ax.set_zlabel('Adjusted Z-Axis (originally Y)')
        self.update_view()

        # Widgets
        self.subject_dropdown = widgets.Dropdown(options=subject_ids, value=self.subject_id,
                                                 description='Subject ID:')
        self.device_dropdown = widgets.Dropdown(options=['phone', 'watch'], value=self.device,
                                                description='Device:')
        self.activity_dropdown = widgets.Dropdown(options=activity_name_key, value=self.activity_code,
                                                  description='Activity:')
        # The slider starts at 0 (an int) so it matches the window that
        # `update_view()` has just drawn.
        self.view_slider = widgets.IntSlider(min=0, max=self._max_view_start(), value=0,
                                             description='View Frame:')

        # Observers
        self.subject_dropdown.observe(self.on_change_subject(), names='value')
        self.device_dropdown.observe(self.on_change_device(), names='value')
        self.activity_dropdown.observe(self.on_change_activity(), names='value')
        self.view_slider.observe(self.on_change_view(), names='value')

    def _max_view_start(self):
        """Return the largest valid window start, never below the slider minimum of 0."""
        return max(0, self.df.shape[0] - self.view_span)

    def update_data(self, subject_id=None, device=None, activity_code=None):
        """Reload and re-derive ``self.df``; falsy arguments keep the current selection."""
        if subject_id or not self.subject_id:
            self.subject_id = subject_id or '1600'
        if device or not self.device:
            self.device = device or 'phone'
        if activity_code or not self.activity_code:
            self.activity_code = activity_code or 'A'

        self.subject_dfs = get_by_subject_id(self.subject_id).copy()
        device_df = self.subject_dfs[self.device]
        selected = device_df[device_df['activity_code'] == self.activity_code].reset_index(drop=True)
        self.df = selected[['x_accel', 'y_accel', 'z_accel']] / 20
        # Only y_accel has its mean removed before accumulation.
        self.df.y_accel = self.df.y_accel.sub(self.df.y_accel.mean())
        self.df = self.df.cumsum(axis=0)

    def update_canvas(self):
        """Recompute the data limits and redraw the figure canvas."""
        self.ax.relim()
        self.fig.canvas.draw()
        self.fig.canvas.flush_events()

    def update_plot(self):
        """Replace the line and scatter artists with the current ``self.df``."""
        # https://stackoverflow.com/questions/41602588/matplotlib-3d-scatter-animations
        self.lines[0].set_data(self.df.x_accel, self.df.z_accel)
        self.lines[0].set_3d_properties(self.df.y_accel)
        self.scatter.remove()
        self.scatter = self.ax.scatter3D(self.df.x_accel, self.df.z_accel, self.df.y_accel,
                                         c=plt.cm.jet(np.linspace(0, 1, len(self.df))))
        # Rewind the slider before shrinking its range so the value is never
        # above the new maximum.
        self.view_slider.value = 0
        self.view_slider.max = self._max_view_start()
        self.ax.relim()
        self.ax.autoscale_view()

    def update_view(self, start=0):
        """Set the axis limits to the ``view_span`` samples beginning at ``start``."""
        part = self.df.iloc[start: start + self.view_span]
        if part.empty:
            # No samples in the window: min()/max() would be NaN, which is
            # not a valid axis limit, so keep the current limits.
            return
        self.ax.set_xlim((part.x_accel.min() - 3, part.x_accel.max() + 3))
        self.ax.set_ylim((part.z_accel.min() - 3, part.z_accel.max() + 3))
        self.ax.set_zlim((part.y_accel.min() - 3, part.y_accel.max() + 3))

    def _make_data_callback(self, keyword):
        """Build an observer that reloads the data with ``keyword`` set to the new value."""
        def callback(change):
            self.update_data(**{keyword: change.new})
            self.update_plot()
            # Apply the view limits BEFORE drawing so the canvas shows them
            # immediately instead of on the next event.
            self.update_view()
            self.update_canvas()
        return callback

    def on_change_subject(self):
        """Return the observer for the subject dropdown."""
        return self._make_data_callback('subject_id')

    def on_change_device(self):
        """Return the observer for the device dropdown."""
        return self._make_data_callback('device')

    def on_change_activity(self):
        """Return the observer for the activity dropdown."""
        return self._make_data_callback('activity_code')

    def on_change_view(self):
        """Return the observer for the view slider."""
        def callback(change):
            self.update_view(change.new)
            self.update_canvas()
        return callback

    def _p(self, *args):
        """Debug helper: print ``args`` into the ``self.output`` widget."""
        with self.output:
            print(*args)


ajp = AdjustedPlot(fig)
plt.ion()
widgets.HBox([
    widgets.VBox([
        widgets.HBox([ajp.subject_dropdown, ajp.device_dropdown, ajp.activity_dropdown]),
        ajp.view_slider,
        fig.canvas,
    ])
])

# Thanks for Viewing!