#!/usr/bin/env python
# coding: utf-8
# # Guided Project - Exploring Hacker News Posts
# ![download.png](attachment:download.png)
#
# ### Company Background:
# Hacker News is a site where user-submitted stories (known as "posts") receive votes and comments. Top posts on Hacker News may receive hundreds of thousands of visitors.
#
# ### **Project Goal:**
#
# My goal as the Data Analyst is to answer two questions:
# * Does Ask HN or Show HN receive more comments on average?
# * Do posts created at certain times receive more comments on average?
#
# > **This project will use the following six steps of the data analysis process to answer the above two questions of the project goal:**
# > * 1.) Ask Question
# > * 2.) Get Data
# > * 3.) Explore Data
# > * 4.) Clean Data
# > * 5.) Analyze Data
# > * 6.) Conclusion
#
# ## Ask Question:
#
# My goal as the Data Analyst is to answer two questions:
# * Does Ask HN or Show HN receive more comments on average?
# * Do posts created at certain times receive more comments on average?
# ## Get Data
#
# The data set that will be used for analysis can be found [here](https://www.kaggle.com/hacker-news/hacker-news-posts).
# The data set includes submissions to the Hacker News site.
# In[12]:
# Reduced dataset from 300,000 rows to 20,000 rows by removing all submissions without a comment
# Load the Hacker News CSV into memory as a list of rows, where every row is
# a list of strings, then split off the header row.
from csv import reader

# The context manager closes the file handle as soon as all rows have been
# read (previously the handle was left open for the rest of the run).
# newline="" is what the csv module documentation asks for when a file object
# is handed to reader(), so quoted fields with embedded newlines parse cleanly.
# NOTE(review): assumes hacker_news.csv is UTF-8 encoded - confirm.
with open("hacker_news.csv", newline="", encoding="utf-8") as opened_file:
    read_file = reader(opened_file)
    hn = list(read_file)

headers = hn[0]
print(headers)
hn = hn[1:]  # drop the header row so that only data rows remain
print(hn[0])
# In[13]:
#import modules
import datetime as dt
# ## Explore Data
# ### Data Dictionary
# * id: the unique identifier from Hacker News for the post
# * title: the title of the post
# * url: the URL that the post links to, if the post has a URL
# * num_points: the number of points the post acquired, calculated as the total number of upvotes minus the total number of downvotes
# * num_comments: the number of comments on the post
# * author: the username of the person who submitted the post
# * created_at: the date and time of the post's submission
# ## Clean Data
# Find posts that begin with Ask HN or Show HN
#
# In[14]:
# Partition the rows of `hn` into three lists according to how the
# lower-cased title (column index 1) begins.
ask_posts, show_posts, other_posts = [], [], []

for post in hn:
    lowered_title = post[1].lower()
    # Pick the destination list first, then append once.
    if lowered_title.startswith("ask hn"):
        bucket = ask_posts
    elif lowered_title.startswith("show hn"):
        bucket = show_posts
    else:
        bucket = other_posts
    bucket.append(post)

# Report how many rows landed in each group.
print(
    "Number of Posts begin with 'Ask HN': ", len(ask_posts), "\n",
    "Number of posts begin with 'Show HN': ", len(show_posts), "\n",
    "Number of Other posts:", len(other_posts),
)
# Calculate the average number of comments per post for:
# * Ask HN
# * Show HN
#
# In[15]:
# Calc the average comments per ask_posts
# Average number of comments per Ask HN post.
# The comment count sits at column index 4 of each row and is converted with
# int() before summing.
total_ask_comments = sum(int(row[4]) for row in ask_posts)
avg_ask_comments = total_ask_comments / len(ask_posts)

print("Average number of comments for Ask HN:")
# A bare `avg_ask_comments` expression only shows a value in an interactive
# notebook cell; as a script it is a no-op, so the value is printed explicitly.
print(avg_ask_comments)
# In[16]:
# Find the average comments per show_posts
# Average number of comments per Show HN post.
# The comment count sits at column index 4 of each row and is converted with
# int() before summing.
total_show_comments = sum(int(post[4]) for post in show_posts)
avg_show_comments = total_show_comments / len(show_posts)

print("Average number of comments for Show HN:")
# A bare `avg_show_comments` expression only shows a value in an interactive
# notebook cell; as a script it is a no-op, so the value is printed explicitly.
print(avg_show_comments)
# > The Ask HN posts receive more comments. The Ask HN posts receive around 14 comments per post while the Show HN posts receive around 10 comments per post.
#
# Since Ask HN posts receive more comments, the analysis will focus on Ask HN posts only.
# ## Analyze Ask Posts Data
# Determine if Ask Posts created at certain times have more comments:
#
# * calculate the number of Ask Posts created each hour along with the number of comments received
#
# * calculate the average number of comments Ask Posts receive, by hour created
#
# * identify the top five hours with the highest comments per post
#
# * identify the best hours to create a post to have a higher chance of receiving comments
#
# > Calculate the number of Ask Posts created each hour along with the number of comments received
# In[17]:
# calculate the number of ask posts created each hour along with # comments received
# Pair every Ask HN post's creation timestamp (column 6) with its comment
# count (column 4); both are kept exactly as they appear in the row.
result_list = [[post[6], post[4]] for post in ask_posts]

# For each hour of the day, tally how many posts were created in that hour
# and how many comments those posts received in total.
counts_by_hour = {}
comments_by_hour = {}
for created_at, raw_comments in result_list:
    n_comments = int(raw_comments)
    # Timestamps are parsed as "month/day/year hour:minute".
    hour_of_day = dt.datetime.strptime(created_at, "%m/%d/%Y %H:%M").hour
    # .get(..., 0) covers the first time an hour is seen.
    counts_by_hour[hour_of_day] = counts_by_hour.get(hour_of_day, 0) + 1
    comments_by_hour[hour_of_day] = comments_by_hour.get(hour_of_day, 0) + n_comments

print("Posts per hour:", "\n", counts_by_hour, "\n", "\n", "Comments per hour:", "\n", comments_by_hour)
# > Calculate average number of comments Ask Posts receives by the hour created
# In[18]:
#calculate average number of comments ask posts receives by hour created
# Build [hour, average comments per post] pairs, one per hour present in
# comments_by_hour, by dividing the hour's comment total by its post count.
avg_by_hour = [
    [hour, comments_by_hour[hour] / counts_by_hour[hour]]
    for hour in comments_by_hour
]

print("Average number of comments per post by hour:")
# A bare `avg_by_hour` expression only shows a value in an interactive
# notebook cell; as a script it is a no-op, so the list is printed explicitly.
print(avg_by_hour)
# > Identify the top five hours with the highest comments per post:
#
# > * create a list of lists with the average as the first element and the hour as the second element
#
# > * sort the list and print the top 5 hours with the highest average number of comments per post
# In[19]:
#create a list with avg_by_hour swapped columns
# Flip each [hour, average] pair of avg_by_hour into [average, hour] so that
# sorting the resulting list orders it by average first.
swap_avg_by_hour = [[average, hour_of_day] for hour_of_day, average in avg_by_hour]
# Bare expression: shows the list when run as an interactive notebook cell and
# has no effect when the file is run as a script.
swap_avg_by_hour
# In[20]:
# sort new list of list swap_avg_by_hour
# Order the [average, hour] pairs from highest to lowest and report the
# first five.
sorted_swap = list(swap_avg_by_hour)
sorted_swap.sort(reverse=True)

print("Top 5 Hours for Ask Posts Comments")
for average, hour_of_day in sorted_swap[:5]:
    # Render the integer hour as zero-padded "HH:MM", e.g. 15 -> "15:00".
    hour_label = dt.time(hour=hour_of_day).strftime("%H:%M")
    print("{hour} {avg_comment:.2f} average comments per post".format(hour=hour_label, avg_comment=average))
# > Identify the best hours to create a post to have a higher chance of receiving comments
# # Conclusion
#
# This project answers two questions:
#
# **Does Ask HN or Show HN receive more comments on average?**
# Based on the above analysis, Ask HN receives around 14 comments per post while Show HN receives around 10.
#
# **Do posts created at certain times receive more comments on average?**
#
# The best hour to create a post is at 15:00 or 3:00 pm Eastern time. Based on TABLE 1 below, the 15:00 hour receives the highest average comments per post at 38.59.
#
# TABLE 1:
#
# **Top 5 Hours for Ask Posts Comments**
#
# 15:00 38.59 average comments per post
#
# 02:00 23.81 average comments per post
#
# 20:00 21.52 average comments per post
#
# 16:00 16.80 average comments per post
#
# 21:00 16.01 average comments per post