#!/usr/bin/env python
# coding: utf-8

# # Comparing Post Response Volumes on Hacker News

# ## Introduction
#
# This project looks at 20,000 randomly selected posts on Hacker News. Of those, posts that are in either the "Ask HN" or the "Show HN" category are analyzed to see which subset generates the most comments. We also look for any correlation between the number of comments and the time the post is published.
# The only selection criterion used was that all posts must have received comments.

# In[1]:


# accessing the data
import csv
from csv import reader

# Read the whole file inside a context manager so the handle is closed as
# soon as the rows are in memory; newline='' is how the csv module expects
# its input file to be opened.
with open('hacker_news.csv', newline='') as opened_file:
    read_file = reader(opened_file)
    data_file = list(read_file)

# The first row is the header; every remaining row is one post.
hn_header = data_file[0]
hn = data_file[1:]
print(hn_header)
print(hn[:5])


# In[2]:


# separating the posts into three mutually exclusive sub-groups
ask_posts = []
show_posts = []
other_posts = []

for row in hn:
    # Column 1 holds the title; lower-case it so the prefix match is
    # case-insensitive.
    title = row[1].lower()
    if title.startswith('ask hn'):
        ask_posts.append(row)
    # `elif`, not a second `if`: otherwise every "Ask HN" row also falls
    # through to the `else` branch and is counted in other_posts as well.
    elif title.startswith('show hn'):
        show_posts.append(row)
    else:
        other_posts.append(row)

print(len(ask_posts))
print(len(show_posts))
print(len(other_posts))


# In[3]:


# determining the average number of comments for the "Ask HN" category
# (column 4 holds the comment count as text, hence the int() conversion)
total_ask_comments = sum(int(row[4]) for row in ask_posts)
avg_ask_comments = total_ask_comments / len(ask_posts)
print(avg_ask_comments)

# determining the average number of comments for the "Show HN" category
total_show_comments = sum(int(row[4]) for row in show_posts)
average_show_comments = total_show_comments / len(show_posts)
print(average_show_comments)


# ## Comparison of Ask HN and Show HN
# * There is an average of 14.0 comments per post in the Ask HN group. In the Show HN group the average is about 10.3 comments per post.
# * This is a 35% increase in the volume of comments for Ask HN.
# * This may not be too surprising, since asking is a process that inherently elicits a response.
# In[4]:


import datetime as dt

# Pair every Ask HN post's creation timestamp (column 6) with its comment
# count (column 4). Only the hour of the timestamp is used further down.
# ask_posts is the list of "Ask HN" rows built in the earlier cell.
result_list = [[row[6], int(row[4])] for row in ask_posts]

counts_by_hour = {}    # hour of day -> number of posts created in that hour
comments_by_hour = {}  # hour of day -> total comments on those posts

for created_at, num_comments in result_list:
    post_hour = dt.datetime.strptime(created_at, "%m/%d/%Y %H:%M").hour
    # .get(..., 0) starts a new hour at zero, so the first and later
    # occurrences of an hour go through the same accumulation step.
    counts_by_hour[post_hour] = counts_by_hour.get(post_hour, 0) + 1
    comments_by_hour[post_hour] = comments_by_hour.get(post_hour, 0) + num_comments


# In[5]:


# [hour, average comments per post] for every hour that has at least one post
avg_by_hour = [
    [hour, comments_by_hour[hour] / counts_by_hour[hour]]
    for hour in counts_by_hour
]


# In[6]:


# Same pairs with the average first: [average, hour]
swap_avg_by_hour = [[avg, hour] for hour, avg in avg_by_hour]

# Highest average first; the sort is stable, so tied hours keep their
# existing relative order.
sorted_swap = list(swap_avg_by_hour)
sorted_swap.sort(key=lambda pair: pair[0], reverse=True)

print("Top 5 Hours for Ask Post Comments")
print("Adjusted to Pacific Time")
for comments, hour in sorted_swap[:5]:
    # NOTE(review): the three-hour shift presumes the source timestamps are
    # US Eastern time -- confirm against the dataset documentation.
    shifted = dt.datetime.strptime(str(hour), "%H") - dt.timedelta(hours=3)
    print("{}: {:.2f} average comments per post.".format(shifted.strftime("%H:%M"), comments))


# ## Conclusion
# * Noon and eleven o'clock at night are the times with the highest average number of comments per post in the Ask HN category.
# * It may be worth considering the five p.m. and six p.m. slots as well. They both have high average numbers and the wider time span may meet some specifications.