import csv
# Load the Hacker News data set into memory as a list of rows (lists of str).
# The file is opened inside a `with` block so the handle is always closed,
# and with newline='' as the csv module requires, so that newlines embedded
# in quoted fields are parsed correctly.
with open('hacker_news.csv', newline='') as opened_file:
    read_file = csv.reader(opened_file)
    hn = list(read_file)

# The first row holds the column names; everything after it is data.
header = hn[0]
hn = hn[1:]

# Checkpoint: show the first three data rows to confirm the load worked.
print(hn[0:3])
[['12224879', 'Interactive Dynamic Video', 'http://www.interactivedynamicvideo.com/', '386', '52', 'ne0phyte', '8/4/2016 11:52'], ['10975351', 'How to Use Open Source and Shut the Fuck Up at the Same Time', 'http://hueniverse.com/2016/01/26/how-to-use-open-source-and-shut-the-fuck-up-at-the-same-time/', '39', '10', 'josep2', '1/26/2016 19:30'], ['11964716', "Florida DJs May Face Felony for April Fools' Water Joke", 'http://www.thewire.com/entertainment/2013/04/florida-djs-april-fools-water-joke/63798/', '2', '1', 'vezycash', '6/23/2016 22:20']]
# Print the header row (the column names) of the data set.
print(header)
['id', 'title', 'url', 'num_points', 'num_comments', 'author', 'created_at']
# Split the posts into three buckets based on how the title starts
# (compared case-insensitively): "Ask HN", "Show HN", or anything else.
ask_posts, show_posts, other_posts = [], [], []

for post in hn:
    lowered_title = post[1].lower()
    if lowered_title.startswith('ask hn'):
        bucket = ask_posts
    elif lowered_title.startswith('show hn'):
        bucket = show_posts
    else:
        bucket = other_posts
    bucket.append(post)

# Report how many posts landed in each bucket.
print("There are overall {} ASK HN posts".format(len(ask_posts)))
print('\n')
print("There are overall {} SHOW HN posts".format(len(show_posts)))
print('\n')
print("There are overall {} OTHER posts".format(len(other_posts)))
There are overall 1744 ASK HN posts There are overall 1162 SHOW HN posts There are overall 17194 OTHER posts
# Sanity check: every row must have ended up in exactly one of the three lists.
total_rows = len(hn)
categorised_rows = len(ask_posts) + len(show_posts) + len(other_posts)

print('The length of HN dataset is: ', total_rows)
print('\n')
print("The length of HN dataset and the length of all other 3 created lists are equal: ", total_rows == categorised_rows)
The length of HN dataset is: 20100 The length of HN dataset and the length of all other 3 created lists are equal: True
# Average number of comments per "Ask HN" post.
# Column 4 of each row is num_comments, stored as a string in the CSV.
total_ask_comments = sum(int(post[4]) for post in ask_posts)
counter_length = len(ask_posts)
avg_ask_comments = total_ask_comments / counter_length
print("The average calculated number of 'ASK' comments is: ", avg_ask_comments)
The average calculated number of 'ASK' comments is: 14.038417431192661
# Average number of comments per "Show HN" post.
# Column 4 of each row is num_comments, stored as a string in the CSV.
total_show_comments = sum(int(post[4]) for post in show_posts)
counter = len(show_posts)
avg_show_comments = total_show_comments / counter
print("The average calculated number of 'SHOW' comments is: ", avg_show_comments)
The average calculated number of 'SHOW' comments is: 10.31669535283993
According to the calculations, ASK posts received more comments on average than SHOW posts did. In my opinion, the underlying reason is that people tend to react more to ASK posts, because their authors are asking for help, so readers are more inclined to reply in the comments section. Since ASK posts are more likely to receive comments, we'll focus our remaining analysis on just these posts.
import datetime as dt
# Pair each Ask HN post's creation timestamp (column 6) with its comment
# count (column 4, converted to int).
result_list = [[post[6], int(post[4])] for post in ask_posts]

# Per hour of day (two-digit string key such as '09'):
#   counts_by_hour   -> number of Ask HN posts created in that hour
#   comments_by_hour -> total comments those posts received
counts_by_hour = {}
comments_by_hour = {}
for created_at, n_comments in result_list:
    parsed = dt.datetime.strptime(created_at, "%m/%d/%Y %H:%M")
    hour = parsed.strftime("%H")
    counts_by_hour[hour] = counts_by_hour.get(hour, 0) + 1
    comments_by_hour[hour] = comments_by_hour.get(hour, 0) + n_comments

print(counts_by_hour)
print('\n')
print(comments_by_hour)
{'09': 45, '13': 85, '10': 59, '14': 107, '16': 108, '23': 68, '12': 73, '17': 100, '15': 116, '21': 109, '20': 80, '02': 58, '18': 109, '03': 54, '05': 46, '19': 110, '01': 60, '22': 71, '08': 48, '04': 47, '00': 55, '06': 44, '07': 34, '11': 58} {'09': 251, '13': 1253, '10': 793, '14': 1416, '16': 1814, '23': 543, '12': 687, '17': 1146, '15': 4477, '21': 1745, '20': 1722, '02': 1381, '18': 1439, '03': 421, '05': 464, '19': 1188, '01': 683, '22': 479, '08': 492, '04': 337, '00': 447, '06': 397, '07': 267, '11': 641}
# Build [hour, average comments per post] pairs, one per hour seen above.
avg_by_hour = [
    [hour, total_comments / counts_by_hour[hour]]
    for hour, total_comments in comments_by_hour.items()
]
avg_by_hour
[['09', 5.5777777777777775], ['13', 14.741176470588234], ['10', 13.440677966101696], ['14', 13.233644859813085], ['16', 16.796296296296298], ['23', 7.985294117647059], ['12', 9.41095890410959], ['17', 11.46], ['15', 38.5948275862069], ['21', 16.009174311926607], ['20', 21.525], ['02', 23.810344827586206], ['18', 13.20183486238532], ['03', 7.796296296296297], ['05', 10.08695652173913], ['19', 10.8], ['01', 11.383333333333333], ['22', 6.746478873239437], ['08', 10.25], ['04', 7.170212765957447], ['00', 8.127272727272727], ['06', 9.022727272727273], ['07', 7.852941176470588], ['11', 11.051724137931034]]
# Flip each [hour, average] pair to [average, hour] so that sorting the
# list orders the entries by their average.
swap_avg_by_hour = [[pair[1], pair[0]] for pair in avg_by_hour]
print(swap_avg_by_hour)
print('\n')

# Sort a copy in descending order: highest average first.
sorted_swap = list(swap_avg_by_hour)
sorted_swap.sort(reverse=True)
print(sorted_swap)
[[5.5777777777777775, '09'], [14.741176470588234, '13'], [13.440677966101696, '10'], [13.233644859813085, '14'], [16.796296296296298, '16'], [7.985294117647059, '23'], [9.41095890410959, '12'], [11.46, '17'], [38.5948275862069, '15'], [16.009174311926607, '21'], [21.525, '20'], [23.810344827586206, '02'], [13.20183486238532, '18'], [7.796296296296297, '03'], [10.08695652173913, '05'], [10.8, '19'], [11.383333333333333, '01'], [6.746478873239437, '22'], [10.25, '08'], [7.170212765957447, '04'], [8.127272727272727, '00'], [9.022727272727273, '06'], [7.852941176470588, '07'], [11.051724137931034, '11']] [[38.5948275862069, '15'], [23.810344827586206, '02'], [21.525, '20'], [16.796296296296298, '16'], [16.009174311926607, '21'], [14.741176470588234, '13'], [13.440677966101696, '10'], [13.233644859813085, '14'], [13.20183486238532, '18'], [11.46, '17'], [11.383333333333333, '01'], [11.051724137931034, '11'], [10.8, '19'], [10.25, '08'], [10.08695652173913, '05'], [9.41095890410959, '12'], [9.022727272727273, '06'], [8.127272727272727, '00'], [7.985294117647059, '23'], [7.852941176470588, '07'], [7.796296296296297, '03'], [7.170212765957447, '04'], [6.746478873239437, '22'], [5.5777777777777775, '09']]
# Report the five hours with the highest average comment count,
# rendering each hour key (e.g. '15') as a clock time ('15:00').
print("Top 5 Hours for 'Ask HN' Comments")
line_template = "{}: {:.2f} average comments per post"
for mean_comments, hour_key in sorted_swap[:5]:
    hour_label = dt.datetime.strptime(hour_key, "%H").strftime("%H:%M")
    print(line_template.format(hour_label, mean_comments))
Top 5 Hours for 'Ask HN' Comments 15:00: 38.59 average comments per post 02:00: 23.81 average comments per post 20:00: 21.52 average comments per post 16:00: 16.80 average comments per post 21:00: 16.01 average comments per post