#!/usr/bin/env python
# coding: utf-8

# ### Project: analytic insights for Ask and Show posts on the Hacker News platform
#
# This project compares posts on the Hacker News (HN) platform, specifically
# Ask HN posts (users ask the HN community a question) and Show HN posts
# (users show the HN community a project, product, or generally something
# interesting).
#
# The objective is to discover:
# * The type of HN post (ask or show) that receives more comments
# * The time of day at which posts are created and how this relates to the
#   number of comments received
#
# The dataset used for this data analysis can be found
# [here](https://www.kaggle.com/hacker-news/hacker-news-posts)
#
# The original dataset has 300,000 rows. After removing posts which did not
# have any comments and then randomly sampling, it has been reduced to 20,000
# rows.

import csv
import datetime as dt

# This dataset includes the following columns:
#
# - `id`: the identifier of the post (column 0; not used in this analysis)
# - `title`: title of the post (self explanatory)
# - `url`: the url of the item being linked to
# - `num_points`: the number of upvotes the post received
# - `num_comments`: the number of comments the post received
# - `author`: the name of the account that made the post
# - `created_at`: the date and time the post was made

# Positions of the columns this analysis reads from each CSV row.
TITLE_COL = 1
POINTS_COL = 3
COMMENTS_COL = 4
CREATED_AT_COL = 6

# Layout of the `created_at` column, e.g. "8/16/2016 9:55".
DATE_FORMAT = "%m/%d/%Y %H:%M"


def load_posts(path):
    """Read the CSV file at *path* and return ``(header, rows)``.

    The first row is returned as the header and the remaining rows as a list
    of lists of strings. An empty file yields ``([], [])`` instead of raising
    ``IndexError``.
    """
    # The context manager closes the file handle; newline="" is what the csv
    # module asks for so quoted fields with embedded newlines parse correctly.
    with open(path, newline="", encoding="utf-8") as csv_file:
        rows = list(csv.reader(csv_file))
    if not rows:
        return [], []
    return rows[0], rows[1:]


def categorise_posts(posts):
    """Split *posts* into ``(ask, show, other)`` lists based on the title.

    A post belongs to 'ask' when its title starts with 'ask hn' and to 'show'
    when it starts with 'show hn' (both case-insensitive); everything else is
    'other'.
    """
    ask, show, other = [], [], []
    for post in posts:
        title = post[TITLE_COL].lower()
        if title.startswith('ask hn'):
            ask.append(post)
        elif title.startswith('show hn'):
            show.append(post)
        else:
            other.append(post)
    return ask, show, other


def total_and_average(posts, column):
    """Return ``(total, average)`` of the integer values in *column*.

    The average is ``0.0`` when *posts* is empty, so a category without any
    posts does not raise ``ZeroDivisionError``.
    """
    total = sum(int(post[column]) for post in posts)
    average = total / len(posts) if posts else 0.0
    return total, average


def average_by_hour(posts, column):
    """Return ``{hour: average}`` of *column* grouped by creation hour.

    Hours are two-digit strings ("00" to "23") taken from the `created_at`
    column; only hours that have at least one post appear in the result.
    """
    totals = {}
    counts = {}
    for post in posts:
        created = dt.datetime.strptime(post[CREATED_AT_COL], DATE_FORMAT)
        hour = created.strftime("%H")
        totals[hour] = totals.get(hour, 0) + int(post[column])
        counts[hour] = counts.get(hour, 0) + 1
    return {hour: totals[hour] / counts[hour] for hour in totals}


def top_hours(avg_by_hour, limit=5):
    """Return the *limit* best ``(average, hour)`` pairs, highest first.

    Pairs are ordered by average and then by hour, both descending, so ties
    on the average are broken by the later hour.
    """
    ranked = sorted(
        ((avg, hour) for hour, avg in avg_by_hour.items()), reverse=True
    )
    return ranked[:limit]


def format_top_hours(ranked):
    """Render ``(average, hour)`` pairs as lines such as "15:00: 38.59"."""
    return ["{}:00: {:.2f}".format(hour, avg) for avg, hour in ranked]


def points_report(kind, posts):
    """Return the two summary lines (total and average points) for *posts*.

    *kind* is the post type named in the labels, e.g. 'ask' or 'show'.
    """
    total, average = total_and_average(posts, POINTS_COL)
    return [
        'Points total for {} posts: {:,}'.format(kind, total),
        'Points avg for {} posts: {}'.format(kind, round(average, 2)),
    ]


def main(path='hacker_news.csv'):
    """Run the whole analysis on the CSV file at *path* and print the results."""
    hn_header, hn_data = load_posts(path)

    # Preview data
    print(hn_header)
    print('\n')
    print(hn_data[:2])

    # ### HN Posts: Breakdown by total number of user posts
    #
    # Posts have been categorised into three groups: 'ask hn', 'show hn' and
    # 'others', based on the post 'title'. Posts which have a title that
    # begins with 'ask hn' will be categorised as such, for instance,
    # 'Ask HN: How to improve my personal website?'. The same technique is
    # used for categorising 'show hn' posts. The 'others' category is used
    # for the rest of the user posts.
    post_ask, post_show, post_other = categorise_posts(hn_data)

    # Post count
    print(len(post_ask), 'posts for Ask HN')
    print(len(post_show), 'posts for Show HN')
    print(len(post_other), 'posts for Others')

    # Preview posts
    print('\n')
    print('Posts for Ask HN')
    print(post_ask[:2])
    print('\n')
    print('Posts for Show HN')
    print(post_show[:2])
    print('\n')
    print('Posts for others')
    print(post_other[:2])

    # The breakdown shows that the larger portion of the data set belongs to
    # posts in the 'others' category. The ask posts have a total count of
    # 1744 while for show posts, there are 1162.

    # ### HN Posts: Breakdown by comments
    #
    # The breakdown for hacker news posts based on the total count for
    # comments is displayed below. There are over twice as many user comments
    # for ask posts as there are comments on show posts from the hacker news
    # community. The data shows that there are more comments on average for
    # ask posts.
    #
    # | Post Type | Comments (Total) | Comments (Avg)
    # | ----------- | ----------- | -----------
    # | Ask HN | 24,483 | 14.04
    # | Show HN | 11,988 | 10.32

    # Total and comment avg for ask posts
    total_ask_comments, avg_ask_comments = total_and_average(
        post_ask, COMMENTS_COL
    )
    print('{:,}'.format(total_ask_comments))
    print(round(avg_ask_comments, 2))

    # Total and comment avg for show posts
    total_show_comments, avg_show_comments = total_and_average(
        post_show, COMMENTS_COL
    )
    print('{:,}'.format(total_show_comments))
    print(round(avg_show_comments, 2))

    # Progressing further with the analysis, we would like to discover if
    # posts created at a certain time are more likely to receive more
    # comments from the hacker news community. This part of the analysis will
    # only utilise ask posts because there is more data to work with.

    # ### Ask Posts: Breakdown of total comments received by time period
    #
    # Below we determine the number of ask posts created in each hour of the
    # day, along with the number of comments received by time period. The
    # data shows that ask posts created around 15:00 (3:00 pm) receive the
    # most comments on average.
    comments_avg_by_hour = average_by_hour(post_ask, COMMENTS_COL)

    # ### Ask Posts: Top 5 hours for most comments by time period
    #
    # Based on the dataset documentation, the timezone used is Eastern Time
    # in the US, so 15:00 is equivalent to 3:00 pm EST.
    # The **top 5** hours for most comments on Ask Posts are 15:00, 02:00,
    # 20:00, 16:00 and 21:00. The hour that receives the most comments on
    # average is 15:00 with thirty nine comments per post.
    print("The Top 5 Hours for 'Ask HN' Comments")
    for line in format_top_hours(top_hours(comments_avg_by_hour)):
        print(line)

    # ### HN Posts: Breakdown by points
    #
    # The purpose of this part of the analysis is to discover the breakdown
    # for hacker news posts based on points. Points represent the number of
    # upvotes the post has received over time from the hacker news community.
    # The data shows that there are more points on average for show posts.
    # This suggests that while ask posts will likely receive more comments
    # from the HN community, show posts tend to have more upvotes.
    #
    # | Post Type | Points (Total) | Points (Avg)
    # | ----------- | ----------- | -----------
    # | Ask HN | 26,268 | 15.06
    # | Show HN | 32,019 | 27.56

    # Total and point avg for ask posts
    for line in points_report('ask', post_ask):
        print(line)
    print('\n')

    # Total and point avg for show posts. The average line used to be
    # labelled "ask posts" by mistake; the label now matches the data.
    for line in points_report('show', post_show):
        print(line)

    # ### Show Posts: Breakdown of average points received by time period
    #
    # Progressing further with the point analysis, we would like to discover
    # if posts created at a certain time are more likely to receive more
    # upvotes. The same technique used for the analysis of post comments is
    # used here; let's determine the average number of points for show posts
    # by hour.
    points_avg_by_hour = average_by_hour(post_show, POINTS_COL)

    # The hour that receives the most points for show posts on average is
    # 23:00. The **top 5** hours for most upvotes from the hacker news
    # community for show posts are 23:00, 12:00 (noon), 22:00, 00:00 and
    # 18:00.
    print("Top 5 hours for points on 'Show HN' posts")
    for line in format_top_hours(top_hours(points_avg_by_hour)):
        print(line)


# ### Result summary
#
# This project analysed ask posts and show posts on the hacker news platform
# to determine which type of post and time period received the most comments
# and points on average.
#
# Based on the analysis, to optimise the possibility of receiving more
# comments, we'd recommend users to post on the hacker news platform using
# the 'ask hn' title and possibly create the post sometime between 15:00 and
# 16:00 US Eastern Time. In my timezone (WAT) this is between 20:00 and 21:00
# while daylight saving time is in effect in the US, and between 21:00 and
# 22:00 otherwise.
#
#
# | Post Type | Comments (Total) | Comments (Avg)
# | ----------- | ----------- | -----------
# | Ask HN | 24,483 | 14.04
# | Show HN | 11,988 | 10.32
#
# Furthermore, on comparing both types of posts based on the points / upvotes
# received, the data shows that there are more points on average for show
# posts than there are for ask posts. This suggests that while 'ask hn' posts
# are more likely to receive more comments from the hacker news community,
# 'show hn' posts tend to receive more points.
#
#
# | Post Type | Points (Total) | Points (Avg)
# | ----------- | ----------- | -----------
# | Ask HN | 26,268 | 15.06
# | Show HN | 32,019 | 27.56
#
#

if __name__ == "__main__":
    main()