In this project, data from the website 'Hacker News' will be analysed. Hacker News is a site started by the startup incubator Y Combinator, where user-submitted stories (known as "posts") are voted and commented upon, similar to reddit. Hacker News is extremely popular in technology and startup circles, and posts that make it to the top of Hacker News' listings can get hundreds of thousands of visitors as a result.
### Import reader from the csv module ###
from csv import reader
### Open the Hacker News data set. Split into header and main data ###
# Use a context manager so the file handle is always closed, even if
# reading raises (the original left the file open for the whole session).
with open('hacker_news.csv', encoding="utf8") as opened_file:
    hn = list(reader(opened_file))

# The first row is the column header; the remaining rows are the data.
headers = hn[0]
hn = hn[1:]
print(headers)
print(hn[:5])
print(len(hn))
['id', 'title', 'url', 'num_points', 'num_comments', 'author', 'created_at'] [['12224879', 'Interactive Dynamic Video', 'http://www.interactivedynamicvideo.com/', '386', '52', 'ne0phyte', '8/4/2016 11:52'], ['10975351', 'How to Use Open Source and Shut the Fuck Up at the Same Time', 'http://hueniverse.com/2016/01/26/how-to-use-open-source-and-shut-the-fuck-up-at-the-same-time/', '39', '10', 'josep2', '1/26/2016 19:30'], ['11964716', "Florida DJs May Face Felony for April Fools' Water Joke", 'http://www.thewire.com/entertainment/2013/04/florida-djs-april-fools-water-joke/63798/', '2', '1', 'vezycash', '6/23/2016 22:20'], ['11919867', 'Technology ventures: From Idea to Enterprise', 'https://www.amazon.com/Technology-Ventures-Enterprise-Thomas-Byers/dp/0073523429', '3', '1', 'hswarna', '6/17/2016 0:01'], ['10301696', 'Note by Note: The Making of Steinway L1037 (2007)', 'http://www.nytimes.com/2007/11/07/movies/07stein.html?_r=0', '8', '2', 'walterbell', '9/30/2015 4:12']] 20100
The columns of the dataset are: id, title, url, num_points, num_comments, author, created_at.
First, split the dataset into three:
# Partition the posts into three lists by title prefix:
# "Ask HN" posts, "Show HN" posts, and everything else.
ask_posts = []
show_posts = []
other_posts = []

for post in hn:
    lowered_title = post[1].lower()
    if lowered_title.startswith('ask hn'):
        ask_posts.append(post)
    elif lowered_title.startswith('show hn'):
        show_posts.append(post)
    else:
        other_posts.append(post)

print(len(ask_posts))
print(len(show_posts))
print(len(other_posts))
1744 1162 17194
# Print the first five rows of each of the two new datasets.
for sample in (ask_posts[:5], show_posts[:5]):
    print(sample)
[['12296411', 'Ask HN: How to improve my personal website?', '', '2', '6', 'ahmedbaracat', '8/16/2016 9:55'], ['10610020', 'Ask HN: Am I the only one outraged by Twitter shutting down share counts?', '', '28', '29', 'tkfx', '11/22/2015 13:43'], ['11610310', 'Ask HN: Aby recent changes to CSS that broke mobile?', '', '1', '1', 'polskibus', '5/2/2016 10:14'], ['12210105', 'Ask HN: Looking for Employee #3 How do I do it?', '', '1', '3', 'sph130', '8/2/2016 14:20'], ['10394168', 'Ask HN: Someone offered to buy my browser extension from me. What now?', '', '28', '17', 'roykolak', '10/15/2015 16:38']] [['10627194', 'Show HN: Wio Link ESP8266 Based Web of Things Hardware Development Platform', 'https://iot.seeed.cc', '26', '22', 'kfihihc', '11/25/2015 14:03'], ['10646440', 'Show HN: Something pointless I made', 'http://dn.ht/picklecat/', '747', '102', 'dhotson', '11/29/2015 22:46'], ['11590768', 'Show HN: Shanhu.io, a programming playground powered by e8vm', 'https://shanhu.io', '1', '1', 'h8liu', '4/28/2016 18:05'], ['12178806', 'Show HN: Webscope Easy way for web developers to communicate with Clients', 'http://webscopeapp.com', '3', '3', 'fastbrick', '7/28/2016 7:11'], ['10872799', 'Show HN: GeoScreenshot Easily test Geo-IP based web pages', 'https://www.geoscreenshot.com/', '1', '9', 'kpsychwave', '1/9/2016 20:45']]
#calculate the average number of comments on ask posts and on show posts

def _total_comments(posts):
    """Return the total comment count across *posts* (column 4 holds num_comments)."""
    return sum(int(post[4]) for post in posts)

total_ask_comments = _total_comments(ask_posts)
avg_ask_comments = float(total_ask_comments / len(ask_posts))
print("The average number of comments on ask posts is", int(avg_ask_comments))

total_show_comments = _total_comments(show_posts)
avg_show_comments = float(total_show_comments / len(show_posts))
print("The average number of comments on show posts is", int(avg_show_comments))
The average number of comments on ask posts is 14 The average number of comments on show posts is 10
import datetime as dt
#create two dictionaries: one for the number of posts per hour, one for the total number of comments per hour
# Each result_list entry is [created_at string, comment count (int)].
result_list = []
for row in ask_posts:
    result_list.append([row[6], int(row[4])])

counts_by_hour = {}    # hour "HH" -> number of ask posts created in that hour
comments_by_hour = {}  # hour "HH" -> total comments on ask posts created in that hour
for created, no_comments in result_list:
    # created_at looks like "8/16/2016 9:55" (month/day/year hour:minute).
    hour = dt.datetime.strptime(created, "%m/%d/%Y %H:%M").strftime("%H")
    # dict.get with a default replaces the separate "key present?" branches.
    counts_by_hour[hour] = counts_by_hour.get(hour, 0) + 1
    comments_by_hour[hour] = comments_by_hour.get(hour, 0) + no_comments

print(counts_by_hour)
print(comments_by_hour)
{'09': 45, '13': 85, '10': 59, '14': 107, '16': 108, '23': 68, '12': 73, '17': 100, '15': 116, '21': 109, '20': 80, '02': 58, '18': 109, '03': 54, '05': 46, '19': 110, '01': 60, '22': 71, '08': 48, '04': 47, '00': 55, '06': 44, '07': 34, '11': 58} {'09': 251, '13': 1253, '10': 793, '14': 1416, '16': 1814, '23': 543, '12': 687, '17': 1146, '15': 4477, '21': 1745, '20': 1722, '02': 1381, '18': 1439, '03': 421, '05': 464, '19': 1188, '01': 683, '22': 479, '08': 492, '04': 337, '00': 447, '06': 397, '07': 267, '11': 641}
#calculate average number of comments per hour
# Build [hour, average] pairs; dicts preserve insertion order, so the
# result order matches the original loop's.
avg_by_hour = [
    [hour, comments_by_hour[hour] / counts_by_hour[hour]]
    for hour in counts_by_hour
]
print(avg_by_hour)
[['09', 5.5777777777777775], ['13', 14.741176470588234], ['10', 13.440677966101696], ['14', 13.233644859813085], ['16', 16.796296296296298], ['23', 7.985294117647059], ['12', 9.41095890410959], ['17', 11.46], ['15', 38.5948275862069], ['21', 16.009174311926607], ['20', 21.525], ['02', 23.810344827586206], ['18', 13.20183486238532], ['03', 7.796296296296297], ['05', 10.08695652173913], ['19', 10.8], ['01', 11.383333333333333], ['22', 6.746478873239437], ['08', 10.25], ['04', 7.170212765957447], ['00', 8.127272727272727], ['06', 9.022727272727273], ['07', 7.852941176470588], ['11', 11.051724137931034]]
#swap the order of the list to be able to use the sorted() function
# sorted() compares lists element-by-element, so putting the average first
# sorts by average comment count instead of by hour label.
swap_avg_by_hour = []
for hour, avg in avg_by_hour:
    swap_avg_by_hour.append([avg, hour])

sorted_swap = sorted(swap_avg_by_hour, reverse=True)

print("Top 5 Hours for Ask Posts Comments")
for row in sorted_swap[:5]:
    # row[1] is already an "HH" string (no str() needed); parse it only to
    # reformat as "HH:00" for display.
    hour = dt.datetime.strptime(row[1], "%H")
    format_hour = hour.strftime("%H:00")
    num_c = row[0]
    print("{format_hour}: {num_c:.2f} average comments per post.".format(format_hour = format_hour, num_c = num_c))
Top 5 Hours for Ask Posts Comments 15:00: 38.59 average comments per post. 02:00: 23.81 average comments per post. 20:00: 21.52 average comments per post. 16:00: 16.80 average comments per post. 21:00: 16.01 average comments per post.
From this analysis, it has been shown that Ask HN posts created between 15:00 and 16:00 receive, on average, the most comments.