#!/usr/bin/env python
# coding: utf-8

# # Exploring Hacker News posts
#
# - In this project, we analyse the data set of submissions to the popular technology site Hacker News.
# - Hacker News is a site started by the startup incubator Y Combinator, where user-submitted stories (known as "posts") are voted and commented upon, similar to reddit. Hacker News is extremely popular in technology and startup circles, and posts that make it to the top of Hacker News' listings can get hundreds of thousands of visitors as a result.
# - You can find the data set [here](https://www.kaggle.com/hacker-news/hacker-news-posts), but note that it has been reduced from almost 300,000 rows to approximately 20,000 rows by removing all submissions that did not receive any comments, and then randomly sampling from the remaining submissions.

# In[8]:

from csv import reader

# Open the data set inside a context manager so the file handle is closed
# as soon as the rows have been read (the handle was previously left open).
# newline="" is what the csv module documentation asks for when a file is
# handed to csv.reader, so quoted fields containing line breaks parse
# correctly.
# NOTE(review): encoding="utf-8" assumes the CSV is UTF-8 encoded -- confirm
# against the downloaded file; it removes the dependency on the platform's
# default locale encoding.
with open("hacker_news.csv", encoding="utf-8", newline="") as opened_file:
    read_file = reader(opened_file)
    hn = list(read_file)  # converting the data set into a list of lists.
print(hn[:5])

# Here's what the columns represent:
#
# |**Column Name**|**What it represents**|
# |:---:|---:|
# |id|The unique identifier from Hacker News for the post|
# |title|The title of the post|
# |url|The URL that the post links to, if the post has a URL|
# |num_points|The number of points the post acquired, calculated as the total number of upvotes minus the total number of downvotes|
# |num_comments|The number of comments that were made on the post|
# |author|The username of the person who submitted the post|
# |created_at|The date and time the post was made (the time zone is Eastern Time in the US)|

# In[9]:

headers = hn[0]
hn = hn[1:]  # we need to remove the header row to analyse our data.
print(headers)
print('\n')
# displaying the first five rows of the data set after we removed the header
print(hn[0],'\n',hn[1],'\n',hn[2],'\n',hn[3],'\n',hn[4])

# We're specifically interested in posts whose titles begin with either 'Ask HN' or 'Show HN'.
# Users submit Ask HN posts to ask the Hacker News community a specific question. Below are a couple of examples:
#
# - Ask HN: How to improve my personal website?
# - Ask HN: Am I the only one outraged by Twitter shutting down share counts?
#
# Likewise, users submit Show HN posts to show the Hacker News community a project, product, or just generally something interesting. Below are a couple of examples:
#
# - Show HN: Something pointless I made
# - Show HN: Shanhu.io, a programming playground powered by e8vm
#
# ## Extracting Ask HN and Show HN posts

# In[10]:

ask_posts = []
show_posts = []
other_posts = []

# Route every row into exactly one of the three lists, based on a
# case-insensitive check of how its title (index 1) begins.
for post in hn:
    lowered_title = post[1].lower()
    if lowered_title.startswith('ask hn'):
        bucket = ask_posts
    elif lowered_title.startswith('show hn'):
        bucket = show_posts
    else:
        bucket = other_posts
    bucket.append(post)

print('Number of ask hn posts:',len(ask_posts))
print('\n')
print('Below are two instances of Ask HN posts:')
print('\n')
print(ask_posts[:2])
print('\n')
print('Number of show hn posts:',len(show_posts))
print('\n')
print('Below are two instances of Show HN posts:')
print('\n')
print(show_posts[:2])
print('\n')
print('Number of other posts:',len(other_posts))

# ## Calculating the average number of comments for Ask HN and Show HN posts

# In[11]:

# Total and mean number of comments (index 4, parsed as int) on ask posts.
total_ask_comments = sum(int(post[4]) for post in ask_posts)
avg_ask_comments = total_ask_comments/len(ask_posts)
print('Average number of ask post comments:',avg_ask_comments)

# Same calculation for show posts.
total_show_comments = sum(int(post[4]) for post in show_posts)
avg_show_comments = total_show_comments/len(show_posts)
print('Average number of show post comments:',avg_show_comments)

# **We can see in the output of the code cell above that while the average number of
# comments on Ask HN posts is about 14, the average number of comments on show posts is around 10. This could be expected as questions will attract more comments than general posts.**
#
# Since ask posts are more likely to receive comments, we'll focus our remaining analysis just on these posts.
#
# Next, we'll determine if ask posts created at a certain time are more likely to attract comments.

# ## Finding the amount of Ask HN posts and comments by hour created

# In[12]:

import datetime as dt

# Pair each ask post's creation timestamp (index 6) with its comment
# count (index 4, parsed as int).
result_list = [[post[6], int(post[4])] for post in ask_posts]

# Tally, per two-digit hour string, how many ask posts were created in that
# hour and how many comments those posts received in total.
counts_by_hour = {}
comments_by_hour = {}
for created_at, n_comments in result_list:
    hour = dt.datetime.strptime(created_at, '%m/%d/%Y %H:%M').strftime('%H')
    counts_by_hour[hour] = counts_by_hour.get(hour, 0) + 1
    comments_by_hour[hour] = comments_by_hour.get(hour, 0) + n_comments

print(counts_by_hour)
print('\n')
print(comments_by_hour)

# ## Calculating the Average number of comments for Ask HN posts by hour

# In[13]:

# One [hour, average comments per post] pair for every hour seen above.
avg_by_hour = [
    [hour, total / counts_by_hour[hour]]
    for hour, total in comments_by_hour.items()
]
print(avg_by_hour)

# ## Top five hours for Ask HN posts comments

# In[14]:

# Put the average first so that sorting the pairs orders them by average.
swap_avg_by_hour = [[average, hour] for hour, average in avg_by_hour]
print(swap_avg_by_hour)

# Highest average first.
sorted_swap = list(swap_avg_by_hour)
sorted_swap.sort(reverse=True)

print('\n')
print('Top 5 Hours for Ask Posts Comments:')
print('\n')

line_template = '{hours} : {comments:.2f} average comments per post'
for avg, hours in sorted_swap[:5]:
    # Re-render the two-digit hour as HH:MM for display.
    hour_label = dt.datetime.strptime(hours, '%H').strftime('%H:%M')
    print(line_template.format(hours=hour_label, comments=avg))

# ## Conclusion: The best time to post questions is 15th hour (EST) of the day
#
# - **Ask HN posts receive more comments on average as compared to Show HN posts.
# This is to be expected as questions are more likely to receive comments than general posts.**
#
# - **Average comments of ask posts are about 14. If we see hour-wise data, the comments peak during 3 pm to 4 pm with an average of about 38 comments. This is a 1.6x increase over the 2nd highest average comments received during the night at 2 am to 3 am, which is around 23. The increase could be due to a combination of young and adult population being active on the web after lunch time. And the reason for the next highest comments being recorded after midnight could be the young population, which is mostly active on the internet during this time.**
#
# - **The trend is exactly opposite if we convert the time zone from EST to IST, where I live. The comments peak around 1:30 am. The next highest comments are recorded around 12:30 pm, which is past lunch time here. This reversal could be due to the greater population and a majority of it being occupied by the youth, which here too is mostly active at night.**