%matplotlib inline
import pandas as pd
import matplotlib as mpl
# Global plot styling: Ubuntu font at size 14 and a 16x10 default figure.
mpl.rcParams.update({
    'font.family': 'Ubuntu',
    'font.size': 14,
    'figure.figsize': (16, 10),
})

# Load the tab-separated forum export. The file has no header row, so the two
# columns are named here: the author's username and the post timestamp, the
# latter parsed into datetimes.
data = pd.read_csv(
    'ubuntultusers.csv',
    sep='\t',
    header=None,
    names=['user', 'date'],
    parse_dates=['date'],
)
`ubuntultusers.csv` failas sugeneruotas naudojant tokią užklausą:
SELECT users.username, FROM_UNIXTIME(posts.post_time)
FROM phpbb_posts AS posts, phpbb_users AS users
WHERE posts.poster_id = users.user_id
INTO OUTFILE '/tmp/ubuntultusers.csv';
# Show the dataset dimensions as (rows, columns); each row is one forum post.
data.shape
(67573, 2)
Iš viso yra 67,573 postai, parašyti per visą ubuntu.lt forumo istoriją.
# Posts per user, most active first, as a one-column frame named 'posts'.
# The Series is renamed *before* to_frame(): value_counts() yields a Series
# named 'user' on pandas < 2.0 but 'count' on pandas >= 2.0, so renaming the
# 'user' column afterwards would silently leave it called 'count' on newer
# versions and break every later `posts.posts` access.
posts = data.user.value_counts().rename('posts').to_frame()
# Horizontal bar chart of the first 50 rows of `posts` (the most active users),
# re-sorted ascending by post count before plotting.
frame = posts.iloc[:50].sort_values(by='posts')
ax = frame.plot.barh(grid=True, figsize=(15, 20), colormap='Set3')
ax.set_xlabel('posts')

# Write each user's post count just to the right of the end of its bar.
for row, count in enumerate(frame['posts']):
    ax.annotate(str(count), (count + 10, row), va='center')
# Quarterly post counts for users with more than n_posts posts in total,
# drawn as one stacked bar per quarter.
n_posts = 600

# Users above the threshold, in the order they appear in `posts`; the same
# order is reused for the plotted columns (and therefore the legend).
top_users = posts.index[posts.posts > n_posts]

# Select rows by membership rather than merging the totals into every row and
# sorting them: the groupby below needs neither the extra column nor an order.
frame = data[data.user.isin(top_users)]
frame = frame.groupby([frame.date.dt.to_period('Q'), 'user']).size().unstack()
frame = frame[top_users.tolist()]

ax = frame.plot.bar(stacked=True, colormap='Set3')
ax.set_ylabel('posts')
<matplotlib.text.Text at 0x7f257a3145f8>
# Quarterly post counts for users with more than n_posts posts in total,
# drawn as one stacked bar per quarter.
n_posts = 900

# Users above the threshold, in the order they appear in `posts`; the same
# order is reused for the plotted columns (and therefore the legend).
top_users = posts.index[posts.posts > n_posts]

# Select rows by membership rather than merging the totals into every row and
# sorting them: the groupby below needs neither the extra column nor an order.
frame = data[data.user.isin(top_users)]
frame = frame.groupby([frame.date.dt.to_period('Q'), 'user']).size().unstack()
frame = frame[top_users.tolist()]

ax = frame.plot.bar(stacked=True, colormap='Set3')
ax.set_ylabel('posts')
<matplotlib.text.Text at 0x7f257a0ca780>
# Quarterly post counts for users with more than n_posts posts in total,
# drawn as one stacked bar per quarter. The legend is switched off for this
# lower threshold.
n_posts = 100

# Users above the threshold, in the order they appear in `posts`; the same
# order is reused for the plotted columns.
top_users = posts.index[posts.posts > n_posts]

# Select rows by membership rather than merging the totals into every row and
# sorting them: the groupby below needs neither the extra column nor an order.
frame = data[data.user.isin(top_users)]
frame = frame.groupby([frame.date.dt.to_period('Q'), 'user']).size().unstack()
frame = frame[top_users.tolist()]

ax = frame.plot.bar(stacked=True, legend=False, colormap='Set3')
ax.set_ylabel('posts')
<matplotlib.text.Text at 0x7fe3afe0a898>
# Total number of posts written in each quarter, shown as a bar chart.
quarter = data['date'].dt.to_period('Q')
frame = data['date'].groupby(quarter).count()
ax = frame.plot.bar(colormap='Set3')
ax.set_ylabel('posts')
<matplotlib.text.Text at 0x7f2560827278>
# Number of distinct users who posted at least once in each quarter.
# nunique() counts every user once per quarter no matter how many posts they
# wrote there, so no intermediate (quarter, user) regrouping is needed.
frame = data.groupby(data.date.dt.to_period('Q')).user.nunique()
frame.plot.bar(colormap='Set3')
<matplotlib.axes._subplots.AxesSubplot at 0x7fe3b0f07f28>
# New users per quarter: a user counts as "new" in the quarter of their
# earliest post.
# min() is used rather than first(): first() returns whichever row happens to
# come first in the CSV, and the export query has no ORDER BY, so file order
# is not guaranteed to be chronological.
first_post = data.groupby('user').date.min()

# One entry per user, so counting entries per quarter gives the user count.
frame = first_post.groupby(first_post.dt.to_period('Q')).count().rename('n_users')
frame.plot.bar(colormap='Set3', grid=True)
<matplotlib.axes._subplots.AxesSubplot at 0x7fe3bf663e48>
# Cumulative number of users over time: running total of users whose earliest
# post falls in each quarter.
# min() is used rather than first(): first() returns whichever row happens to
# come first in the CSV, and the export query has no ORDER BY, so file order
# is not guaranteed to be chronological.
first_post = data.groupby('user').date.min()

# One entry per user, so counting entries per quarter gives the new-user count,
# which is then accumulated across quarters.
new_users = first_post.groupby(first_post.dt.to_period('Q')).count()
frame = new_users.cumsum().rename('n_users')
frame.plot.bar(colormap='Set3', grid=True)
<matplotlib.axes._subplots.AxesSubplot at 0x7fe3c413df98>
# Quarterly post counts for users with more than n_posts posts in total,
# drawn as a separate bar chart per user.
n_posts = 250

# Users above the threshold, in the order they appear in `posts`; the same
# order is reused for the plotted columns (one subplot per column).
top_users = posts.index[posts.posts > n_posts]

# Select rows by membership rather than merging the totals into every row and
# sorting them: the groupby below needs neither the extra column nor an order.
frame = data[data.user.isin(top_users)]
frame = frame.groupby([frame.date.dt.to_period('Q'), 'user']).size().unstack()
frame = frame[top_users.tolist()]

# subplots=True returns one Axes per user; label each of them.
for ax in frame.plot.bar(subplots=True, colormap='Set3', figsize=(15, 150)):
    ax.set_ylabel('posts')