#!/usr/bin/env python
# coding: utf-8

# # Guided Project - Exploring Hacker News Posts
# ![download.png](attachment:download.png)
#
# ### Company Background:
# Hacker News is a site where user-submitted stories (or posts) receive votes and comments. Top posts on Hacker News may receive hundreds of thousands of visitors.
#
# ### **Project Goal:**
#
# My goal as the Data Analyst is to answer two questions:
# * Does Ask HN or Show HN receive more comments on average?
# * Do posts created at certain times receive more comments on average?
#
# > **This project will use the following six steps of the data analysis process to answer the above two questions of the project goal:**
# > * 1.) Ask Question
# > * 2.) Get Data
# > * 3.) Explore Data
# > * 4.) Clean Data
# > * 5.) Analyze Data
# > * 6.) Conclusion
#
# ## Ask Question:
#
# My goal as the Data Analyst is to answer two questions:
# * Does Ask HN or Show HN receive more comments on average?
# * Do posts created at certain times receive more comments on average?

# ## Get Data
#
# The data set that will be used for analysis can be found [here](https://www.kaggle.com/hacker-news/hacker-news-posts).
# The data set includes submissions to the Hacker News site.
# In[12]:

from csv import reader

# Reduced dataset from 300,000 rows to 20,000 rows by removing all
# submissions without a comment.
#
# The context manager closes the file as soon as every row has been read.
# newline="" follows the csv-module convention so that quoted fields
# containing line breaks are parsed intact.
# NOTE(review): assumes hacker_news.csv is UTF-8 encoded - confirm if the
# file was re-exported from another tool.
with open("hacker_news.csv", encoding="utf-8", newline="") as opened_file:
    read_file = reader(opened_file)
    hn = list(read_file)

headers = hn[0]
print(headers)
hn = hn[1:]  # remove header
print(hn[0])


# In[13]:

# import modules
import datetime as dt


# ## Explore Data

# #Data Dictionary
# * id: the unique identifier from Hacker News for the post
# * title: the title of the post
# * url: the URL that the posts links to, if the post has a URL
# * num_points: the number of points the post acquired, calculated as the total number of upvotes minus the total number of downvotes
# * num_comments: the number of comments on the post
# * author: the username of the person who submitted the post
# * created_at: the date and time of the post's submission

# ## Clean Data
# Find posts that begin with Ask HN or Show HN
#

# In[14]:

ask_posts = []
show_posts = []
other_posts = []

for row in hn:
    # Column 1 is the title; compare case-insensitively.
    title = row[1].lower()
    if title.startswith("ask hn"):
        ask_posts.append(row)
    elif title.startswith("show hn"):
        show_posts.append(row)
    else:
        other_posts.append(row)

print("Number of Posts begin with 'Ask HN': ", len(ask_posts), "\n","Number of posts begin with 'Show HN': ", len(show_posts),"\n", "Number of Other posts:", len(other_posts))


# Calculate the average number of comments per posts for:
# * Ask HN
# * Show HN
#

# In[15]:

# Calc the average comments per ask_posts (column 4 is num_comments)
total_ask_comments = sum(int(row[4]) for row in ask_posts)
avg_ask_comments = total_ask_comments / len(ask_posts)
print("Average number of comments for Ask HN:")
# A bare expression only displays in a notebook; print it so the script
# shows the value as well.
print(avg_ask_comments)


# In[16]:

# Find the average comments per show_posts
total_show_comments = sum(int(post[4]) for post in show_posts)
avg_show_comments = total_show_comments / len(show_posts)
print("Average number of comments for Show HN:")
print(avg_show_comments)


# > The Ask HN posts receive more comments.
# The Ask HN posts receive around 14 comments per post while the Show HN posts receive around 10 comments per post.
#
# Since Ask HN posts receive more comments, the analysis will focus on Ask HN posts only.

# ## Analyze Ask Posts Data
# Determine if Ask Posts created at certain times have more comments:
#
# * calculate the number of Ask Posts created each hour along with the number of comments received
#
# * calculate average number of comments Ask Posts receives by hour created
#
# * identify the top five hours with the highest comments per post
#
# * identify the best hours to create a post to have a higher chance of receiving comments
#
# > Calculate the number of Ask Posts created each hour along with the number of comments received

# In[17]:

# calculate the number of ask posts created each hour along with the
# number of comments received

# One [created_at, num_comments] pair per Ask HN post
# (column 6 is created_at, column 4 is num_comments).
result_list = [[post[6], post[4]] for post in ask_posts]

counts_by_hour = {}    # hour of day -> number of Ask HN posts created
comments_by_hour = {}  # hour of day -> total comments on those posts

for created_at, num_comments in result_list:
    hour = dt.datetime.strptime(created_at, "%m/%d/%Y %H:%M").hour
    # dict.get with a 0 default covers both the first and later sightings
    # of an hour, so no separate membership test is needed.
    counts_by_hour[hour] = counts_by_hour.get(hour, 0) + 1
    comments_by_hour[hour] = comments_by_hour.get(hour, 0) + int(num_comments)

print("Posts per hour:","\n",counts_by_hour,"\n", "\n","Comments per hour:","\n",comments_by_hour)


# > Calculate average number of comments Ask Posts receives by the hour created

# In[18]:

# calculate average number of comments ask posts receives by hour created
avg_by_hour = [
    [hour, comments_by_hour[hour] / counts_by_hour[hour]]
    for hour in comments_by_hour
]

print("Average number of comments per post by hour:")
# A bare expression only displays in a notebook; print it so the script
# shows the value as well.
print(avg_by_hour)


# > Identify the top five hours with the highest comments per post:
#
# > * create a list of list with average as first element and hour as 2nd
# element
#
# > * sort the list and print the top 5 hours with the most average comments per hour

# In[19]:

# create a list with avg_by_hour swapped columns: [average, hour]
swap_avg_by_hour = [[avg, hour] for hour, avg in avg_by_hour]
# A bare expression only displays in a notebook; print it so the script
# shows the value as well.
print(swap_avg_by_hour)


# In[20]:

# sort new list of list swap_avg_by_hour
# The [average, hour] pairs compare element by element, so this orders by
# average (highest first) and breaks ties by hour, also descending.
sorted_swap = sorted(swap_avg_by_hour, reverse=True)

print("Top 5 Hours for Ask Posts Comments")
for avg_comment, hour in sorted_swap[:5]:
    # The hour is an int (0-23); zero-pad it and append ":00" to get the
    # same "HH:MM" text a strptime/strftime round trip would give.
    print(
        "{hour:02d}:00 {avg_comment:.2f} average comments per post".format(
            hour=hour, avg_comment=avg_comment
        )
    )


# > Identify the best hours to create a post to have a higher chance of receiving comments

# # Conclusion
#
# This project answers two questions:
#
# **Does Ask HN or Show HN receive more comments on average?**
# Based on the above analysis, Ask HN receives around 14 comments per post while Show HN receives around 10.
#
# **Do posts created at certain times receive more comments on average?**
#
# The best hour to create a post is at 15:00 or 3:00 pm Eastern time. Based on TABLE 1 below, the 15:00 hour receives the highest average comments per post at 38.59.
#
# TABLE 1:
#
# **Top 5 Hours for Ask Posts Comments**
#
# 15:00 38.59 average comments per post
#
# 02:00 23.81 average comments per post
#
# 20:00 21.52 average comments per post
#
# 16:00 16.80 average comments per post
#
# 21:00 16.01 average comments per post