In this analysis we examine a pre-reduced set of data from Hacker News, a popular technology blog where users submit stories which can be voted and commented upon. We are going to take a deeper look at the posts whose titles start with Ask HN
and Show HN
. We would like to understand these two points in that regard:
Ask HN
or Show HN
receive more comments on average?
import csv
# Read the whole Hacker News export into a list of rows (each row is a list
# of strings; the first row is the column header).
# The context manager guarantees the file is closed even if parsing raises,
# and newline="" lets the csv module handle line endings itself, as the csv
# documentation recommends.
# NOTE(review): the file is decoded with the platform default encoding —
# confirm whether an explicit encoding should be passed.
with open("hacker_news.csv", newline="") as f:
    hn = list(csv.reader(f))

# Notebook display: header plus the first four data rows.
hn[:5]
[['id', 'title', 'url', 'num_points', 'num_comments', 'author', 'created_at'], ['12224879', 'Interactive Dynamic Video', 'http://www.interactivedynamicvideo.com/', '386', '52', 'ne0phyte', '8/4/2016 11:52'], ['10975351', 'How to Use Open Source and Shut the Fuck Up at the Same Time', 'http://hueniverse.com/2016/01/26/how-to-use-open-source-and-shut-the-fuck-up-at-the-same-time/', '39', '10', 'josep2', '1/26/2016 19:30'], ['11964716', "Florida DJs May Face Felony for April Fools' Water Joke", 'http://www.thewire.com/entertainment/2013/04/florida-djs-april-fools-water-joke/63798/', '2', '1', 'vezycash', '6/23/2016 22:20'], ['11919867', 'Technology ventures: From Idea to Enterprise', 'https://www.amazon.com/Technology-Ventures-Enterprise-Thomas-Byers/dp/0073523429', '3', '1', 'hswarna', '6/17/2016 0:01']]
# Peel the column-name row off the front of the dataset so that `hn`
# contains data rows only.
header, hn = hn[0], hn[1:]

print(header)
print(hn[:5])
['id', 'title', 'url', 'num_points', 'num_comments', 'author', 'created_at'] [['12224879', 'Interactive Dynamic Video', 'http://www.interactivedynamicvideo.com/', '386', '52', 'ne0phyte', '8/4/2016 11:52'], ['10975351', 'How to Use Open Source and Shut the Fuck Up at the Same Time', 'http://hueniverse.com/2016/01/26/how-to-use-open-source-and-shut-the-fuck-up-at-the-same-time/', '39', '10', 'josep2', '1/26/2016 19:30'], ['11964716', "Florida DJs May Face Felony for April Fools' Water Joke", 'http://www.thewire.com/entertainment/2013/04/florida-djs-april-fools-water-joke/63798/', '2', '1', 'vezycash', '6/23/2016 22:20'], ['11919867', 'Technology ventures: From Idea to Enterprise', 'https://www.amazon.com/Technology-Ventures-Enterprise-Thomas-Byers/dp/0073523429', '3', '1', 'hswarna', '6/17/2016 0:01'], ['10301696', 'Note by Note: The Making of Steinway L1037 (2007)', 'http://www.nytimes.com/2007/11/07/movies/07stein.html?_r=0', '8', '2', 'walterbell', '9/30/2015 4:12']]
We will further divide the dataset into the following clusters:
# Sort every post into one of three buckets according to how its title
# (column 1) starts, ignoring case.
ask_posts, show_posts, other_posts = [], [], []

for post in hn:
    lowered_title = post[1].lower()
    if lowered_title.startswith("ask hn"):
        bucket = ask_posts
    elif lowered_title.startswith("show hn"):
        bucket = show_posts
    else:
        bucket = other_posts
    bucket.append(post)

print(f"There are {len(ask_posts)} posts starting with 'Ask HN'")
print(f"There are {len(show_posts)} posts starting with 'Show HN'")
print(f"There are {len(other_posts)} posts starting with something else.")
There are 1744 posts starting with 'Ask HN' There are 1162 posts starting with 'Show HN' There are 17194 posts starting with something else.
def counting_comments(data):
    """Return ``[total_comments, post_count]`` for an iterable of post rows.

    Each row must be indexable, with the comment count stored at index 4 in
    a form accepted by ``int()``.
    """
    # Convert in a single pass so that one-shot iterables work as well.
    per_post = [int(row[4]) for row in data]
    return [sum(per_post), len(per_post)]
# Average number of comments per Show HN post.
avg_show = counting_comments(show_posts)
show_total, show_count = avg_show
avg_show_comments = show_total / show_count
print(round(avg_show_comments, 2))

# Average number of comments per Ask HN post.
avg_ask = counting_comments(ask_posts)
ask_total, ask_count = avg_ask
avg_ask_comments = ask_total / ask_count
print(round(avg_ask_comments, 2))
10.32 14.04
Comparing the two post types, we can see that Ask HN
posts (about 14 comments) receive roughly 4 more comments than Show HN
posts (about 10 comments) on average.
import datetime as dt
# Reduce every Ask HN post to a [created_at, comment_count] pair:
# column 6 holds the creation timestamp string, column 4 the comment count.
result_list = [[post[6], int(post[4])] for post in ask_posts]

len(result_list)
1744
# Tally, per hour of day, how many Ask HN posts were created and how many
# comments those posts collected. Keys are zero-padded hour strings ("00"-"23").
counts_by_hour = {}
comments_by_hour = {}

for created_at, n_comments in result_list:
    parsed = dt.datetime.strptime(created_at, "%m/%d/%Y %H:%M")
    hour = parsed.strftime("%H")
    # .get(..., 0) covers the first time an hour is seen.
    counts_by_hour[hour] = counts_by_hour.get(hour, 0) + 1
    comments_by_hour[hour] = comments_by_hour.get(hour, 0) + n_comments

print(counts_by_hour)
print(f"length: {len(counts_by_hour)}")
print("")
print(comments_by_hour)
print(f"length: {len(comments_by_hour)}")
{'09': 45, '13': 85, '10': 59, '14': 107, '16': 108, '23': 68, '12': 73, '17': 100, '15': 116, '21': 109, '20': 80, '02': 58, '18': 109, '03': 54, '05': 46, '19': 110, '01': 60, '22': 71, '08': 48, '04': 47, '00': 55, '06': 44, '07': 34, '11': 58} length: 24 {'09': 251, '13': 1253, '10': 793, '14': 1416, '16': 1814, '23': 543, '12': 687, '17': 1146, '15': 4477, '21': 1745, '20': 1722, '02': 1381, '18': 1439, '03': 421, '05': 464, '19': 1188, '01': 683, '22': 479, '08': 492, '04': 337, '00': 447, '06': 397, '07': 267, '11': 641} length: 24
# Build [hour, average comments per post] pairs, one per hour present in
# counts_by_hour.
avg_by_hour = [
    [hour, comments_by_hour[hour] / post_count]
    for hour, post_count in counts_by_hour.items()
]
print(sorted(avg_by_hour))
[['00', 8.127272727272727], ['01', 11.383333333333333], ['02', 23.810344827586206], ['03', 7.796296296296297], ['04', 7.170212765957447], ['05', 10.08695652173913], ['06', 9.022727272727273], ['07', 7.852941176470588], ['08', 10.25], ['09', 5.5777777777777775], ['10', 13.440677966101696], ['11', 11.051724137931034], ['12', 9.41095890410959], ['13', 14.741176470588234], ['14', 13.233644859813085], ['15', 38.5948275862069], ['16', 16.796296296296298], ['17', 11.46], ['18', 13.20183486238532], ['19', 10.8], ['20', 21.525], ['21', 16.009174311926607], ['22', 6.746478873239437], ['23', 7.985294117647059]]
Let's make the result more readable and conclude our analysis.
# Put the (rounded) average first so that sorting orders by average, then
# list the five hours with the highest average comment count.
swap_avg_by_hour = [[round(average, 2), hour_key] for hour_key, average in avg_by_hour]
sorted_swap = sorted(swap_avg_by_hour, reverse=True)

print("Top 5 Hours for Ask Posts Comments")
line_template = "{}: {:.2f} average comments per post"
for average, hour_key in sorted_swap[:5]:
    # Turn the bare hour key (e.g. "15") into a clock label (e.g. "15:00").
    hour_label = dt.datetime.strptime(hour_key, "%H").strftime("%H:%M")
    print(line_template.format(hour_label, average))
Top 5 Hours for Ask Posts Comments 15:00: 38.59 average comments per post 02:00: 23.81 average comments per post 20:00: 21.52 average comments per post 16:00: 16.80 average comments per post 21:00: 16.01 average comments per post
The best times to write an Ask post
are the hours starting at 15:00 and 16:00 and the hours starting at 20:00 and 21:00. The outlier is the hour starting at 02:00 in the morning.