https://plot.ly/python/google_big_query/

Table of Contents

In [1]:

import pandas as pd

# to communicate with Google BigQuery
from pandas.io import gbq

In [2]:

import plotly.plotly as py
import plotly.graph_objs as go
from plotly.tools import FigureFactory as FF

In [14]:

top10_active_users_query = """
SELECT
  author AS User,
  count(author) as Stories
FROM
  [fh-bigquery:hackernews.stories]
GROUP BY
  User
ORDER BY
  Stories DESC
LIMIT
  10
"""

In [12]:

project_id = 'tak2016-144221'

To get below running, needed this library:

https://github.com/google/google-api-python-client

$ pip install google-api-python-client --user

In [15]:

top10_active_users_df = gbq.read_gbq(top10_active_users_query, project_id=project_id)

Requesting query... ok.
Query running...
Query done.
Processed: 18.1 Mb

Retrieving results...
Got 10 rows.

Total time taken 1.97 s.
Finished at 2016-09-22 17:24:16.

In [16]:

top10_active_users_df

Out[16]:

	User	Stories
0	cwan	7077
1	shawndumas	6602
2	evo_9	5659
3	nickb	4322
4	iProject	4266
5	bootload	4212
6	edw519	3844
7	ColinWright	3766
8	nreece	3724
9	tokenadult	3659

In [18]:

top_10_users_table = FF.create_table(top10_active_users_df)
py.iplot(top_10_users_table, filename='plotly/top-10-active-users')

Out[18]:

Top 10 Hacker News Submissions (by score)¶

In [19]:

top10_story_query = """
SELECT
  title,
  score,
  time_ts AS timestamp
FROM
  [fh-bigquery:hackernews.stories]
ORDER BY
  score DESC
LIMIT
  10
"""

In [20]:

top10_story_df = gbq.read_gbq(top10_story_query, project_id=project_id)

Requesting query... ok.
Query running...
Query done.
Processed: 113.1 Mb

Retrieving results...
Got 10 rows.

Total time taken 1.5 s.
Finished at 2016-09-22 17:26:09.

In [21]:

# Create a table figure from the DataFrame
top10_story_figure = FF.create_table(top10_story_df)

# Scatter trace for the bubble chart timeseries
story_timeseries_trace = go.Scatter(
    x=top10_story_df['timestamp'],
    y=top10_story_df['score'],
    xaxis='x2',
    yaxis='y2',
    mode='markers',
    text=top10_story_df['title'],
    marker=dict(
        color=[80 + i*5 for i in range(10)],
        size=top10_story_df['score']/50,
        showscale=False
    )
)

# Add the trace data to the figure
top10_story_figure['data'].extend(go.Data([story_timeseries_trace]))

# Subplot layout
top10_story_figure.layout.yaxis.update({'domain': [0, .45]})
top10_story_figure.layout.yaxis2.update({'domain': [.6, 1]})

# Y-axis of the graph should be anchored with X-axis
top10_story_figure.layout.yaxis2.update({'anchor': 'x2'})
top10_story_figure.layout.xaxis2.update({'anchor': 'y2'})

# Add the height and title attribute
top10_story_figure.layout.update({'height':900})
top10_story_figure.layout.update({'title': 'Highest Scoring Submissions on Hacker News'})

# Update the background color for plot and paper
top10_story_figure.layout.update({'paper_bgcolor': 'rgb(243, 243, 243)'})
top10_story_figure.layout.update({'plot_bgcolor': 'rgb(243, 243, 243)'})

# Add the margin to make subplot titles visible
top10_story_figure.layout.margin.update({'t':75, 'l':50})
top10_story_figure.layout.yaxis2.update({'title': 'Upvote Score'})
top10_story_figure.layout.xaxis2.update({'title': 'Post Time'})

In [22]:

py.image.save_as(top10_story_figure, filename='top10-posts.png')
py.iplot(top10_story_figure, filename='plotly/highest-scoring-submissions')

Out[22]:

From which Top-level domain (TLD) most of the stories come?¶

Here we have used the url-function TLD from BigQuery's query syntax.
We collect the domain for all URLs with their respective count, and group them by it.

In [23]:

tld_share_query = """
SELECT
  TLD(url) AS domain,
  count(score) AS stories
FROM
  [fh-bigquery:hackernews.stories]
GROUP BY
  domain
ORDER BY
  stories DESC
LIMIT 10
"""

In [24]:

tld_share_df = gbq.read_gbq(tld_share_query, project_id=project_id)

Requesting query... ok.
Query running...
Query done.
Processed: 134.6 Mb

Retrieving results...
Got 10 rows.

Total time taken 4.13 s.
Finished at 2016-09-22 17:27:28.

In [25]:

tld_share_df

Out[25]:

	domain	stories
0	.com	1324329
1	.org	120716
2	None	111744
3	.net	58754
4	.co.uk	43955
5	.io	23507
6	.edu	14727
7	.co	10692
8	.me	10565
9	.info	8121

In [26]:

labels = tld_share_df['domain']
values = tld_share_df['stories']

tld_share_trace = go.Pie(labels=labels, values=values)
data = [tld_share_trace]

layout = go.Layout(
    title='Submissions shared by Top-level domains'
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='plotly/toplevel-domains')

Out[26]:

Public response to the "Who Is Hiring?" posts¶

There is an account on Hacker News by the name whoishiring.
This account automatically submits a 'Who is Hiring?' post at 11 AM Eastern time on the first weekday of every month.

In [27]:

wih_query = """
SELECT
  id,
  title,
  score,
  time_ts
FROM
  [fh-bigquery:hackernews.stories]
WHERE
  author == 'whoishiring' AND
  LOWER(title) contains 'who is hiring?'
ORDER BY
  time
"""

In [28]:

wih_df = gbq.read_gbq(wih_query, project_id=project_id)

Requesting query... ok.
Query running...
Query done.
Processed: 146.1 Mb

Retrieving results...
Got 52 rows.

Total time taken 1.83 s.
Finished at 2016-09-22 17:28:45.

In [29]:

trace = go.Scatter(
    x=wih_df['time_ts'],
    y=wih_df['score'],
    mode='markers+lines',
    text=wih_df['title'],
    marker=dict(
        size=wih_df['score']/50
    )
)

layout = go.Layout(
    title='Public response to the "Who Is Hiring?" posts',
    xaxis=dict(
        title="Post Time"
    ),
    yaxis=dict(
        title="Upvote Score"
    )
)

data = [trace]

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='plotly/whoishiring-public-response')

Out[29]:

Submission Traffic Volume in a Week¶

In [30]:

week_traffic_query = """
SELECT
  DAYOFWEEK(time_ts) as Weekday,
  count(DAYOFWEEK(time_ts)) as story_counts
FROM
  [fh-bigquery:hackernews.stories]
GROUP BY
  Weekday
ORDER BY
  Weekday
"""

In [31]:

week_traffic_df = gbq.read_gbq(week_traffic_query, project_id=project_id)

Requesting query... ok.
Query running...
Query done.
Processed: 14.8 Mb

Retrieving results...
Got 8 rows.

Total time taken 2.45 s.
Finished at 2016-09-22 17:29:14.

In [33]:

week_traffic_df['Day'] = ['NULL', 'Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
week_traffic_df = week_traffic_df.drop(week_traffic_df.index[0])

In [34]:

trace = go.Scatter(
    x=week_traffic_df['Day'],
    y=week_traffic_df['story_counts'],
    mode='lines',
    text=week_traffic_df['Day']
)

layout = go.Layout(
    title='Submission Traffic Volume (Week Days)',
    xaxis=dict(
        title="Day of the Week"
    ),
    yaxis=dict(
        title="Total Submissions"
    )
)

data = [trace]

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='plotly/submission-traffic-volume')

Out[34]:

Programming Language Trend on HackerNews¶

We will compare the trends for the Python and PHP programming languages, using the Hacker News post titles.

In [35]:

python_query = """
SELECT
  YEAR(time_ts) as years,
  COUNT(YEAR(time_ts )) as trends
FROM
  [fh-bigquery:hackernews.stories]
WHERE
  LOWER(title) contains 'python'
GROUP BY
  years
ORDER BY
  years
"""

php_query = """
SELECT
  YEAR(time_ts) as years,
  COUNT(YEAR(time_ts )) as trends
FROM
  [fh-bigquery:hackernews.stories]
WHERE
  LOWER(title) contains 'php'
GROUP BY
  years
ORDER BY
  years
"""

In [36]:

python_df = gbq.read_gbq(python_query, project_id=project_id)

Requesting query... ok.
Query running...
Query done.
Processed: 99.0 Mb

Retrieving results...
Got 9 rows.

Total time taken 1.79 s.
Finished at 2016-09-22 17:29:58.

In [37]:

php_df = gbq.read_gbq(php_query, project_id=project_id)

Requesting query... ok.
Query running...
Query done.
Processed: 99.0 Mb

Retrieving results...
Got 9 rows.

Total time taken 2.9 s.
Finished at 2016-09-22 17:30:04.

In [38]:

trace1 = go.Scatter(
    x=python_df['years'],
    y=python_df['trends'],
    mode='lines',
    line=dict(color='rgba(115,115,115,1)', width=4),
    connectgaps=True,
)

trace2 = go.Scatter(
    x=[python_df['years'][0], python_df['years'][8]],
    y=[python_df['trends'][0], python_df['trends'][8]],
    mode='markers',
    marker=dict(color='rgba(115,115,115,1)', size=8)
)

trace3 = go.Scatter(
    x=php_df['years'],
    y=php_df['trends'],
    mode='lines',
    line=dict(color='rgba(189,189,189,1)', width=4),
    connectgaps=True,
)

trace4 = go.Scatter(
    x=[php_df['years'][0], php_df['years'][8]],
    y=[php_df['trends'][0], php_df['trends'][8]],
    mode='markers',
    marker=dict(color='rgba(189,189,189,1)', size=8)
)

traces = [trace1, trace2, trace3, trace4]

layout = go.Layout(
    xaxis=dict(
        showline=True,
        showgrid=False,
        showticklabels=True,
        linecolor='rgb(204, 204, 204)',
        linewidth=2,
        autotick=False,
        ticks='outside',
        tickcolor='rgb(204, 204, 204)',
        tickwidth=2,
        ticklen=5,
        tickfont=dict(
            family='Arial',
            size=12,
            color='rgb(82, 82, 82)',
        ),
    ),
    yaxis=dict(
        showgrid=False,
        zeroline=False,
        showline=False,
        showticklabels=False,
    ),
    autosize=False,
    margin=dict(
        autoexpand=False,
        l=100,
        r=20,
        t=110,
    ),
    showlegend=False,
)

annotations = []

annotations.append(
    dict(xref='paper', x=0.95, y=python_df['trends'][8],
    xanchor='left', yanchor='middle',
    text='Python',
    font=dict(
        family='Arial',
        size=14,
        color='rgba(49,130,189, 1)'
    ),
    showarrow=False)
)

annotations.append(
    dict(xref='paper', x=0.95, y=php_df['trends'][8],
    xanchor='left', yanchor='middle',
    text='PHP',
    font=dict(
        family='Arial',
        size=14,
        color='rgba(49,130,189, 1)'
    ),
    showarrow=False)
)

annotations.append(
    dict(xref='paper', yref='paper', x=0.5, y=-0.1,
    xanchor='center', yanchor='top',
    text='Source: Hacker News submissions with the title containing Python/PHP',
    font=dict(
        family='Arial',
        size=12,
        color='rgb(150,150,150)'
    ),
    showarrow=False)
)

layout['annotations'] = annotations

fig = go.Figure(data=traces, layout=layout)
py.iplot(fig, filename='plotly/programming-language-trends')

Out[38]: