#!/usr/bin/env python
# coding: utf-8

# # Hacker News Site - When and What to post to get the maximum comments

# ## Objective
#
# In this project, I will analyse a representative set of data from the Hacker News site to determine what kind of posts (between **Ask HN** and **Show HN**) drive more comments and whether the time at which a post was created has an influence on the number of comments received.
# The **data set** can be found on this [kaggle site](https://www.kaggle.com/hacker-news/hacker-news-posts)
#
# It contains approximately 20,000 rows - from the 300,000 rows originally extracted in 2016 - posts without comments were removed and the remaining submissions randomly sampled.
#
# **Below are the column descriptions:**
#
# + `id`: The unique identifier from Hacker News for the post
# + `title`: The title of the post
# + `url`: The URL that the posts links to, if the post has a URL
# + `num_points`: The number of points the post acquired, calculated as the total number of upvotes minus the total number of downvotes
# + `num_comments`: The number of comments that were made on the post
# + `author`: The username of the person who submitted the post
# + `created_at`: The date and time at which the post was submitted (using Eastern Time in the US)
#
#
# ## Importing and preparing the data

# In[1]:


# Importing the data
from csv import reader

# The `with` block closes the file as soon as every row has been read into
# memory. `newline=''` is what the csv module asks for so that line breaks
# inside quoted fields are handed to the parser untouched.
# NOTE(review): no encoding is passed, so the platform default is used -
# confirm the file's encoding and pass it explicitly if it is known.
with open('hacker_news.csv', newline='') as opened_file:
    read_file = reader(opened_file)
    hn = list(read_file)
print(hn[:5])


# In[2]:


# separating the header from the data for ease of use
header = hn[0]
hn = hn[1:]
print(header)
# Preview the first few data rows to check the header was split off correctly.
for row in hn[:6]:
    print(row)


# ## Preparing the data
# We are only concerned with post titles beginning with `Ask HN` or `Show HN`, so we'll create new lists of lists containing just the data for those titles.
# In[3]:


# empty lists to store data of interest
ask_posts = []
show_posts = []
other_posts = []

for row in hn:
    # Lower-case the title (column 1) so the prefix test is case-insensitive.
    title = row[1].lower()
    if title.startswith('ask hn'):
        ask_posts.append(row)
    elif title.startswith('show hn'):
        show_posts.append(row)
    else:
        other_posts.append(row)

print('There are {} in the ask hn list'.format(len(ask_posts)))
print('There are {} in the show hn list'.format(len(show_posts)))
print('There are {} in the other list'.format(len(other_posts)))


# ## Comparing both types of posts
#
# We want to determine which kind of posts - `Ask HN` or `Show HN` - gathers greater interest.
#
# To do so, we will look at the average number of comments each type of post receives.

# In[4]:


def _comment_counts(posts):
    """Return the comment count (column 4) of every row in *posts* as ints."""
    return [int(post[4]) for post in posts]


def _average(total, count):
    """Return ``total / count``, or 0.0 when *count* is 0.

    The guard keeps the script from raising ZeroDivisionError when a
    category contains no posts at all.
    """
    return total / count if count else 0.0


# Ask hn - average number of comments per posts
total_ask_comments = sum(_comment_counts(ask_posts))
avg_ask_comments = _average(total_ask_comments, len(ask_posts))
print('There are {:.2f} comments on average for Ask HN posts'.format(avg_ask_comments))

# Show hn - average number of comments per posts
total_show_comments = sum(_comment_counts(show_posts))
avg_show_comments = _average(total_show_comments, len(show_posts))
print('There are {:.2f} comments on average for Show HN posts'.format(avg_show_comments))


# In[5]:


# looking at maximum number of comments
# `default=0` mirrors a running maximum that starts at 0: an empty list
# reports 0 instead of raising ValueError.

# ask hn
max_n_comments_ask = max(_comment_counts(ask_posts), default=0)
print('There a maximum of {} comments for ASK HN posts'.format(max_n_comments_ask))

# show hn
max_n_comments_show = max(_comment_counts(show_posts), default=0)
print('There a maximum of {} comments for Show HN posts'.format(max_n_comments_show))


# ### Comparing both type of posts - conclusion
#
# From the average number of comments on posts, we can see that `Ask HN` posts are receiving on average almost 1.5 times more
# comments than `Show HN` posts - with each `Ask HN` post included in our sample receiving an average of 14.04 comments compared to the 10.32 average comments of the `Show HN` posts. In addition, just to confirm our conclusion, we can look at the maximum number of comments received by each type of post: the result is aligned, with the maximum number of comments an `Ask HN` post received being 3 times as high as the maximum number of comments received by a `Show HN` post.
#
# Based on the above, we will focus on the `Ask HN` posts for the remaining part of the analysis.

# ## Finding if there is a better time to make a Ask HN post
#
# In order to do so, we will:
# 1. Calculate the amount of ask posts created in each hour of the day, along with the number of comments received.
# 2. Calculate the average number of comments ask posts receive by hour created.

# In[6]:


# Importing the Datetime module given we will work with time
import datetime as dt


# ### Calculating the amount of ask posts created in each hour of the day, along with the number of comments received

# In[7]:


# One [created_at, comment count] pair per Ask HN post
# (column 6 holds the creation timestamp, column 4 the number of comments).
result_list = [[post[6], int(post[4])] for post in ask_posts]

counts_by_hour = {}    # hour of day -> number of posts created in that hour
comments_by_hour = {}  # hour of day -> total comments on those posts

for created_text, comment_total in result_list:
    # Timestamps look like "month/day/year hour:minute"; only the hour is kept.
    created_hour = dt.datetime.strptime(created_text, "%m/%d/%Y %H:%M").hour
    # `.get(..., 0)` starts a new hour at zero before adding to it.
    counts_by_hour[created_hour] = counts_by_hour.get(created_hour, 0) + 1
    comments_by_hour[created_hour] = comments_by_hour.get(created_hour, 0) + comment_total

# Making sure the results is realistic and that we likely have accurate results out
print(counts_by_hour)
print(comments_by_hour)

# making sure we have all the rows
print("There are {} lines in counts by hours".format(len(counts_by_hour)))
print("There are {} lines in comments by hours".format(len(comments_by_hour)))


# ### Calculating the hourly average of comments
#
# This will help us identify the best time to create a post to draw a
# maximum of comments
#

# In[8]:


# One [hour, average comments per post] pair for every hour that has posts.
avg_by_hour = [
    [hour, comments_by_hour[hour] / counts_by_hour[hour]]
    for hour in counts_by_hour
]

print("There are {} lines in average comments by hours".format(len(avg_by_hour)))
for hour_and_average in avg_by_hour:
    print(hour_and_average)


# ### Sorting the data
#
# The above data appears accurate, however, it is hard to quickly identify by hand the 5 best hours for creating a post.
#
# To facilitate the task, we will swap the order of the data (average number of comments first instead of hour) and sort the average in descending order.

# In[9]:


# Swapping the order
# Putting the average first lets the default list ordering sort by it.
swap_avg_by_hour = [[average, hour] for hour, average in avg_by_hour]

sorted_swap = sorted(swap_avg_by_hour, reverse=True)

for average_and_hour in sorted_swap:
    print(average_and_hour)


# ### Identifying the best 5 hours to post a Ask HN post

# In[10]:


print("Top 5 Hours for Ask HN Posts Comments")
for average, hour in sorted_swap[:5]:
    # Round-trip the integer hour through datetime to render it as "HH:MM".
    hour_label = dt.datetime.strptime(str(hour), '%H').strftime("%H:%M")
    print("{time}: {comments:.2f} average comments per post".format(time=hour_label, comments=average))


# ## What and When - Conclusions
#
# After our preliminary analysis we found that:
# 1. **Ask HN** posts attract more comments on average than Show HN posts
# 2. Posting in mid-afternoon **15:00 US Eastern Time** leads to more comments
#
# We could however give a secondary window - while the 15:00 - 16:00 window produces significantly more comments (up to 2.5 times the average for Ask HN post in the case of the 15:00 hour), a second window opens in early evening around 20:00 - 21:00 with about 1.5 times more comments than the average Ask HN post at 20:00. It must be noted that those located outside of the Eastern US will need to adjust their posting time to take into account the time difference.