First, we can explore the 'hacker_news.csv'
dataset by opening it and printing the first few rows.
from csv import reader  # importing reader

# Load the whole CSV into memory as a list of rows (each row is a list of
# strings). The `with` block guarantees the file handle is closed once the
# rows have been read, instead of leaving it open for the rest of the session.
# newline='' is how the csv module documentation says the file should be
# opened, so that newlines embedded in quoted fields are parsed correctly.
with open('hacker_news.csv', newline='') as opened_file:
    read_file = reader(opened_file)
    hn = list(read_file)  # list of lists: one inner list per CSV row
print(hn[:5])  # displaying the first 5 rows
[['id', 'title', 'url', 'num_points', 'num_comments', 'author', 'created_at'], ['12224879', 'Interactive Dynamic Video', 'http://www.interactivedynamicvideo.com/', '386', '52', 'ne0phyte', '8/4/2016 11:52'], ['10975351', 'How to Use Open Source and Shut the Fuck Up at the Same Time', 'http://hueniverse.com/2016/01/26/how-to-use-open-source-and-shut-the-fuck-up-at-the-same-time/', '39', '10', 'josep2', '1/26/2016 19:30'], ['11964716', "Florida DJs May Face Felony for April Fools' Water Joke", 'http://www.thewire.com/entertainment/2013/04/florida-djs-april-fools-water-joke/63798/', '2', '1', 'vezycash', '6/23/2016 22:20'], ['11919867', 'Technology ventures: From Idea to Enterprise', 'https://www.amazon.com/Technology-Ventures-Enterprise-Thomas-Byers/dp/0073523429', '3', '1', 'hswarna', '6/17/2016 0:01']]
We can see above that the column headers are present in the first row of the dataset. We therefore separate them from the data so that the dataset is easier to analyze.
# Split the header row off from the data: `headers` keeps the column names,
# and `hn` is rebound to the remaining (data-only) rows.
headers, hn = hn[0], hn[1:]
print(headers, '\n')
print(hn[:5])
['id', 'title', 'url', 'num_points', 'num_comments', 'author', 'created_at'] [['12224879', 'Interactive Dynamic Video', 'http://www.interactivedynamicvideo.com/', '386', '52', 'ne0phyte', '8/4/2016 11:52'], ['10975351', 'How to Use Open Source and Shut the Fuck Up at the Same Time', 'http://hueniverse.com/2016/01/26/how-to-use-open-source-and-shut-the-fuck-up-at-the-same-time/', '39', '10', 'josep2', '1/26/2016 19:30'], ['11964716', "Florida DJs May Face Felony for April Fools' Water Joke", 'http://www.thewire.com/entertainment/2013/04/florida-djs-april-fools-water-joke/63798/', '2', '1', 'vezycash', '6/23/2016 22:20'], ['11919867', 'Technology ventures: From Idea to Enterprise', 'https://www.amazon.com/Technology-Ventures-Enterprise-Thomas-Byers/dp/0073523429', '3', '1', 'hswarna', '6/17/2016 0:01'], ['10301696', 'Note by Note: The Making of Steinway L1037 (2007)', 'http://www.nytimes.com/2007/11/07/movies/07stein.html?_r=0', '8', '2', 'walterbell', '9/30/2015 4:12']]
Now we separate the Ask HN posts and the Show HN posts into two separate lists (all remaining posts go into a third list).
ask_posts = []
show_posts = []
other_posts = []

# Sort every post into one of three buckets based on how its title starts.
for row in hn:
    # Lower-case the title so the prefix match is case-insensitive.
    title = row[1].lower()
    # str.startswith already returns a bool, so it is tested directly
    # rather than compared against True.
    if title.startswith('ask hn'):
        ask_posts.append(row)
    elif title.startswith('show hn'):
        show_posts.append(row)
    else:
        other_posts.append(row)

# Display how many posts landed in each bucket.
ap = len(ask_posts)
sp = len(show_posts)
op = len(other_posts)
print('Ask posts :' + str(ap) + '\nShow posts:' + str(sp) + '\nOther posts:' + str(op))
Ask posts :1744 Show posts:1162 Other posts:17194
We find that there are more Ask HN posts (1,744) than Show HN posts (1,162).
After separating the different posts, we can now find out which type of post receives more comments on average.
# Ask HN: total comments, number of posts, and mean comments per post.
# Column 4 of each row holds the comment count as a string.
total_ask_comments = sum(int(post[4]) for post in ask_posts)
total_ask_posts = len(ask_posts)
avg_ask_comments = total_ask_comments / total_ask_posts
print('Average ask comments:' + str(avg_ask_comments))

# Show HN: the same three figures.
total_show_comments = sum(int(post[4]) for post in show_posts)
total_show_posts = len(show_posts)
avg_show_comments = total_show_comments / total_show_posts
print('Average show comments:' + str(avg_show_comments))
Average ask comments:14.038417431192661 Average show comments:10.31669535283993
It is evident that, on average, Ask HN posts receive more comments per post (approx. 14 comments per post) than Show HN posts (approx. 10 comments per post).
Since ask posts are more likely to receive comments, we'll focus our remaining analysis just on these posts.
Next, we'll determine whether ask posts created at a certain time are more likely to attract comments. To do this, we calculate the number of ask posts created in each hour of the day, along with the number of comments those posts received.
import datetime as dt
# Pair each ask post's creation timestamp (column 6) with its comment
# count (column 4, converted to int).
result_list = [[post[6], int(post[4])] for post in ask_posts]

# Two frequency tables keyed by the two-digit hour string:
#   counts_by_hour   -> number of ask posts created in that hour
#   comments_by_hour -> total comments received by those posts
counts_by_hour = {}
comments_by_hour = {}
for created_at, n_comments in result_list:
    hour = dt.datetime.strptime(created_at, "%m/%d/%Y %H:%M").strftime('%H')
    counts_by_hour[hour] = counts_by_hour.get(hour, 0) + 1
    comments_by_hour[hour] = comments_by_hour.get(hour, 0) + n_comments

print('Frequency table of posts per hour')
print(counts_by_hour)
print('\n')
print('Number of comments \'ask posts\' created at each hour received:')
print(comments_by_hour)
Frequency table of posts per hour {'09': 45, '13': 85, '10': 59, '14': 107, '16': 108, '23': 68, '12': 73, '17': 100, '15': 116, '21': 109, '20': 80, '02': 58, '18': 109, '03': 54, '05': 46, '19': 110, '01': 60, '22': 71, '08': 48, '04': 47, '00': 55, '06': 44, '07': 34, '11': 58} Number of comments 'ask posts' created at each hour received: {'09': 251, '13': 1253, '10': 793, '14': 1416, '16': 1814, '23': 543, '12': 687, '17': 1146, '15': 4477, '21': 1745, '20': 1722, '02': 1381, '18': 1439, '03': 421, '05': 464, '19': 1188, '01': 683, '22': 479, '08': 492, '04': 337, '00': 447, '06': 397, '07': 267, '11': 641}
Next, we'll use these two dictionaries to calculate the average number of comments for posts created during each hour of the day.
# Mean number of comments per ask post for each hour, stored as
# [hour, average] pairs with the average rounded to 4 decimal places.
avg_by_hour = [
    [hour, round(comments_by_hour[hour] / post_count, 4)]
    for hour, post_count in counts_by_hour.items()
]
for pair in avg_by_hour:
    print(pair)
['09', 5.5778] ['13', 14.7412] ['10', 13.4407] ['14', 13.2336] ['16', 16.7963] ['23', 7.9853] ['12', 9.411] ['17', 11.46] ['15', 38.5948] ['21', 16.0092] ['20', 21.525] ['02', 23.8103] ['18', 13.2018] ['03', 7.7963] ['05', 10.087] ['19', 10.8] ['01', 11.3833] ['22', 6.7465] ['08', 10.25] ['04', 7.1702] ['00', 8.1273] ['06', 9.0227] ['07', 7.8529] ['11', 11.0517]
Although we now have the results we need, this format makes it hard to identify the hours with the highest values. Let's finish by sorting the list of lists and printing the five highest values in a format that's easier to read.
# Put the average first in each pair so that a plain sort orders by average.
swap_avg_by_hour = [[average, hour] for hour, average in avg_by_hour]
# Highest averages first.
sorted_swap = sorted(swap_avg_by_hour, reverse=True)

print('\n')
print("Top 5 Hours for Ask Posts Comments")
# Report the five best hours as "HH:MM <average> average comments per post".
for average, hour in sorted_swap[:5]:
    hour_str = dt.datetime.strptime(hour, '%H').strftime('%H:%M')
    print('{0} {1:.2f} average comments per post'.format(hour_str, average))
Top 5 Hours for Ask Posts Comments 15:00 38.59 average comments per post 02:00 23.81 average comments per post 20:00 21.52 average comments per post 16:00 16.80 average comments per post 21:00 16.01 average comments per post
Based on the results, creating an Ask HN post
at 3 PM Eastern Time, KY, USA (GMT-5), which corresponds to 1:30 AM Indian Standard Time on the following day, is the best option for getting the maximum number of comments.