About: We are using a trimmed-down data set from Hacker News to perform some analytics.
Goal: Determine whether Ask Hacker News or Show Hacker News posts receive more comments.
###########################################
# Put the data into appropriate collections
###########################################
import csv
# Every data row, plus the same rows bucketed by the prefix of the title
# (column 1): "ask hn", "show hn", or anything else (case-insensitive).
hn = []
ask_posts = []
show_posts = []
other_posts = []

# newline='' is what the csv module asks for so that quoted fields containing
# embedded newlines are parsed correctly; the explicit encoding keeps the
# result independent of the platform's default locale.
# NOTE(review): utf-8 is assumed for hacker_news.csv -- confirm against the file.
with open('hacker_news.csv', newline='', encoding='utf-8') as f:
    reader = csv.reader(f)
    # The first row is the header; an empty file leaves headers as ''.
    headers = next(reader, '')
    # Single pass over the data: store each row and classify it right away.
    for row in reader:
        hn.append(row)
        title = row[1].lower()
        if title.startswith('ask hn'):
            ask_posts.append(row)
        elif title.startswith('show hn'):
            show_posts.append(row)
        else:
            other_posts.append(row)
print(headers)
['id', 'title', 'url', 'num_points', 'num_comments', 'author', 'created_at']
##############################################
# Calculate the averages for each type of post
##############################################
# Total number of comments per post type (column 4 holds the comment count).
total_ask_comments = sum(int(row[4]) for row in ask_posts)
total_show_comments = sum(int(row[4]) for row in show_posts)

# An empty category would otherwise raise ZeroDivisionError; report 0.0 for it.
avg_ask_comments = total_ask_comments / len(ask_posts) if ask_posts else 0.0
avg_show_comments = total_show_comments / len(show_posts) if show_posts else 0.0

print(f"The average Ask HN post has {avg_ask_comments:.2f} comments")
print(f"The average Show HN post has {avg_show_comments:.2f} comments")
The average Ask HN post has 14.04 comments The average Show HN post has 10.32 comments
####################################################
# Calculate the number of posts and comments by hour
####################################################
import datetime as dt
from collections import defaultdict as dd
# Offset (in hours) added to the hour parsed from column 6 before display.
# NOTE(review): the original comment describes the shifted value as the
# "local time zone"; the zone of the source timestamps is not visible here,
# so confirm that -2 is the intended offset.
LOCAL_HOUR_OFFSET = -2

# One (hour, comment count) tuple per Ask HN post.
t = [
    (dt.datetime.strptime(row[6], "%m/%d/%Y %H:%M").hour, int(row[4]))
    for row in ask_posts
]

# Aggregate per hour: number of posts and total number of comments.
counts_by_hour = dd(int)
comments_by_hour = dd(int)
for hour, num_comments in t:
    counts_by_hour[hour] += 1
    comments_by_hour[hour] += num_comments

# [hour, average comments per post], in ascending hour order. Only hours that
# actually have posts are included: iterating over all 24 hours would divide
# by zero for any hour without a post.
avg_by_hour = [
    [hour, comments_by_hour[hour] / counts_by_hour[hour]]
    for hour in sorted(counts_by_hour)
]

# Put the average first so a plain descending sort ranks by it (ties fall
# back to the later hour), then keep the five best hours.
sorted_swap = sorted(
    ([avg, hour] for hour, avg in avg_by_hour),
    reverse=True,
)[:5]

# Wrap around midnight so the shifted hour always stays within 0-23 instead
# of going negative for posts created at hour 0 or 1.
output = [
    f"{(hour + LOCAL_HOUR_OFFSET) % 24}:00 : {avg:.2f}"
    for avg, hour in sorted_swap
]
print(output)
['13:00 : 38.59', '0:00 : 23.81', '18:00 : 21.52', '14:00 : 16.80', '19:00 : 16.01']