#!/usr/bin/env python
# coding: utf-8

# # Comparing Post Response Volumes on Hacker News

# ## Introduction
# 
# This project looks at 20,000 randomly selected posts on Hacker News. Of those, posts in either the "Ask HN" or the "Show HN" category are analyzed to see which of the two generates more comments. We also look for any relationship between the number of comments and the time at which a post is published.
# The only selection criterion was that every post must have received at least one comment.

# In[1]:


# accessing the data
import csv
from csv import reader
# Read the whole CSV inside a context manager so the file handle is closed as
# soon as the rows are in memory (previously the handle was never closed).
# newline='' is what the csv module asks for, so that line breaks embedded in
# quoted fields are parsed correctly.
with open('hacker_news.csv', newline='') as opened_file:
    read_file = reader(opened_file)
    data_file = list(read_file)

# The first row is treated as the header; every row after it is post data.
hn_header = data_file[0]
hn = data_file[1:]
print(hn_header)
print(hn[:5])


# In[2]:


# separating the posts into three mutually exclusive sub-groups
ask_posts = []
show_posts = []
other_posts = []

for row in hn:
    # Column 1 is read as the title; lower-case it so the prefix test is
    # case-insensitive.
    title = row[1].lower()
    if title.startswith('ask hn'):
        ask_posts.append(row)
    # This must be `elif`: with a plain `if`, the `else` below also fired for
    # every "Ask HN" row, so those rows were counted in other_posts as well.
    elif title.startswith('show hn'):
        show_posts.append(row)
    else:
        other_posts.append(row)

print(len(ask_posts))
print(len(show_posts))
print(len(other_posts))


# In[3]:


# Average number of comments per post in the "Ask HN" category.
# Column 4 of each row is parsed as that post's comment count.
total_ask_comments = sum(int(post[4]) for post in ask_posts)
avg_ask_comments = total_ask_comments / len(ask_posts)
print(avg_ask_comments)

# Same calculation for the "Show HN" category.
total_show_comments = sum(int(post[4]) for post in show_posts)
average_show_comments = total_show_comments / len(show_posts)
print(average_show_comments)


# ## Comparison of Ask HN and Show HN
# * There is an average of about 14.0 comments per post in the Ask HN group. In the Show HN group the average is about 10.3 comments per post.
# * That is roughly 36% more comments per post for Ask HN than for Show HN.
# * This may not be too surprising, since asking a question inherently elicits a response.

# In[4]:


import datetime as dt

# Pair each Ask HN post's creation timestamp (column 6) with its comment
# count (column 4); only the hour of the timestamp is used below.
result_list = [[post[6], int(post[4])] for post in ask_posts]

counts_by_hour = {}    # hour of day -> number of Ask HN posts created in that hour
comments_by_hour = {}  # hour of day -> total comments received by those posts

for stamp, n_comments in result_list:
    # Timestamps are parsed as "month/day/year hour:minute".
    hour_key = dt.datetime.strptime(stamp, "%m/%d/%Y %H:%M").hour
    # .get(..., 0) starts a new hour at zero and otherwise accumulates.
    counts_by_hour[hour_key] = counts_by_hour.get(hour_key, 0) + 1
    comments_by_hour[hour_key] = comments_by_hour.get(hour_key, 0) + n_comments


# In[5]:


# Average comments per Ask HN post for every hour that has at least one post,
# stored as [hour, average] pairs in the dictionaries' insertion order.
avg_by_hour = [
    [hr, comments_by_hour[hr] / n_posts]
    for hr, n_posts in counts_by_hour.items()
]


# In[6]:


# Flip each [hour, average] pair to [average, hour] so the average comes first.
swap_avg_by_hour = [[mean, hr] for hr, mean in avg_by_hour]

# Rank hours from the highest average to the lowest (stable for ties).
sorted_swap = list(swap_avg_by_hour)
sorted_swap.sort(key=lambda pair: pair[0], reverse=True)

print("Top 5 Hours for Ask Post Comments")
print("Adjusted to Pacific Time")
for mean, hr in sorted_swap[:5]:
    # Shift the hour back by three hours before display.
    # NOTE(review): presumably the source timestamps are US Eastern, which is
    # three hours ahead of Pacific -- confirm against the dataset description.
    shifted = dt.datetime.strptime(str(hr), "%H") - dt.timedelta(hours=3)
    print("{}: {:.2f} average comments per post.".format(shifted.strftime("%H:%M"), mean))
    

# ## Conclusion
# * Noon and eleven o'clock at night are the times with the highest average number of comments per post in the Ask HN category.
# * It may be worth considering the five p.m. and six p.m. slots as well. They both have high average numbers and the wider time span may meet some specifications.