#!/usr/bin/env python
# coding: utf-8

# #### Examining Hacker News posts and replies:
#
# In this project we will look at the difference between posts that ask for
# reviews, advice and input and posts that are simply up for sharing. These
# are referred to as ASK HN and SHOW HN, respectively. We're hoping to
# illuminate whether certain posts garner more comments and attention and
# whether or not certain times affect said responses.

# In[3]:

# Open, read and prepare the file.
from csv import reader

# The context manager closes the file as soon as the rows are loaded, and
# newline="" is what the csv module asks for on files handed to csv.reader.
# NOTE(review): assumes the data file is UTF-8 encoded -- adjust `encoding`
# if your copy of hacker_news.csv differs.
with open("hacker_news.csv", newline="", encoding="utf-8") as opened_file:
    read_file = reader(opened_file)
    hn = list(read_file)
print(hn[:5])

# In[4]:

# Split the header row off from the data rows. This ensures that the column
# names are not included in the count data.
headers = hn[0]
hn = hn[1:]
print(hn[:5])

# In[5]:

# After creating the hn list of lists without a header, we now create three
# empty lists which will be populated with posts whose titles begin with
# "Ask HN", posts whose titles begin with "Show HN", and everything else.
ask_posts = []
show_posts = []
other_posts = []

# Loop through hn and sort each row by its title prefix (case-insensitive).
for row in hn:
    title = row[1]
    lowered_title = title.lower()  # lower-case once, test twice
    if lowered_title.startswith("ask hn"):
        ask_posts.append(row)
    elif lowered_title.startswith("show hn"):
        show_posts.append(row)
    else:
        other_posts.append(row)

print("The ask hn posts have", len(ask_posts), "entries")
print("The show hn posts have", len(show_posts), "entries")
print("The other posts have", len(other_posts), "entries")

# In[6]:

# Below we look at the first few rows of both ask_posts and show_posts to
# see if we've gathered the correct data.
print(ask_posts[:3])
print('\n')
print(show_posts[:3])

# Now that we've separated the ask posts from the show posts, we want to see
# the number of comments each group received. This sets the stage for
# beginning to understand which category gathers more comments, as suggested
# in the introduction.
# In[7]:

# Find the total number of comments for ask_posts. Index 4 of each row is
# read as that post's comment count.
total_ask_comments = sum(int(post[4]) for post in ask_posts)

# Find the average number of comments per ask post.
avg_ask_comments = total_ask_comments / len(ask_posts)
print("The average number of ask comments is", format(avg_ask_comments, ".2f"))

# In[8]:

# We'll now repeat the same process that we ran on ask_posts, but with
# show_posts instead.
total_show_comments = sum(int(post[4]) for post in show_posts)

# Find the average number of comments per show post.
avg_show_comments = total_show_comments / len(show_posts)
print("The average number of show comments is", format(avg_show_comments, ".2f"))

# Based on this short series of data queries and averages it appears that,
# on average, ask posts receive 14.04 comments per post as opposed to 10.32
# for show posts. This makes sense in some ways, as ask posts are just that:
# asks. They are looking for responses and feedback, while show posts are
# more about exhibition. The remainder of the project analysis will focus on
# ask_posts.
#
# Now that we know we're working on ask_posts, we're going to dig a little
# deeper and find out the number of ask posts created in each hour of the
# day, as well as the comments received during the same time periods. We
# will then calculate the averages of these numbers and, ostensibly, gain a
# better idea about which parts of the day are more active than others.
# In[9]:

import datetime as dt

# Iterate over ask_posts and pair each post's date string (index 6) with
# its comment count (index 4, converted to int).
result_list = [[post[6], int(post[4])] for post in ask_posts]

# Verify that result_list was built successfully.
print(result_list[:3])

# In[10]:

# We'll now create two empty dictionaries that will house post counts and
# comment totals, keyed by the hour of the day the post was made.
counts_by_hour = {}
comments_by_hour = {}

for date, comments_number in result_list:
    hour_made = dt.datetime.strptime(date, "%m/%d/%Y %H:%M").hour
    # Both dictionaries are updated together, so they always share keys.
    counts_by_hour[hour_made] = counts_by_hour.get(hour_made, 0) + 1
    comments_by_hour[hour_made] = (
        comments_by_hour.get(hour_made, 0) + comments_number
    )

# Check to see if counts and comments by hour were successfully created.
print("The counts by hour were:", counts_by_hour)
print("\n")
print("The comments by hour were:", comments_by_hour)

# Our goal now is to compute the average number of comments per post for
# each hour. The best way to do this is to create a list of lists from both
# dictionaries created above (counts_by_hour & comments_by_hour). This is
# fairly straightforward: for each hour, divide the number of comments made
# in that hour by the number of posts made in the same hour.

# In[11]:

# Build a list of [hour, average comments per post] pairs, one per hour.
avg_by_hour = [
    [hour, comments_by_hour[hour] / counts_by_hour[hour]]
    for hour in counts_by_hour
]

# Check for accuracy in averages.
print(avg_by_hour)

# Now that we have the information we need, it would be helpful to show it
# in a way that is easier to read, showing the highest averages by hour
# along with formatted decimal places.
# The finished result will highlight the top five average comments by hour
# and give some insight into what time of day both posters and commenters
# are most active within this part of Hacker News.

# In[12]:

# Create a list of [average, hour] pairs so that sorting orders by average.
swap_avg_by_hour = [[average, hour] for hour, average in avg_by_hour]
print(swap_avg_by_hour)

# In[26]:

# We'll use the sorted function here to put these values in descending order.
sorted_swap_avg = sorted(swap_avg_by_hour, reverse=True)

print("The Top 5 Hours for Ask Post Comments:")

# Loop through the first five sorted values and format them.
for average, hour in sorted_swap_avg[:5]:
    hour_form = dt.datetime.strptime(str(hour), "%H").strftime("%H:%M")
    print(hour_form, format(average, ".2f"), 'average comments per post')

# Now that we have the top five times and averages per time period, we're
# going to convert the times listed above (Eastern Standard Time) to this
# author's time zone (Pacific Standard Time). We will check the
# documentation from the Kaggle page
# (https://www.kaggle.com/hacker-news/hacker-news-posts) that tells us what
# time zone the data originated in. This exercise illustrates the need for
# time conversion as well as the advantageous times to post in the author's
# time zone.

# In[25]:

# Convert times to Pacific Standard Time - United States (PST).
print('Top 5 Hours for Ask Posts Comments (Time zone: PST, -3 from EST)')

pst_offset = dt.timedelta(hours=3)
for average, hour in sorted_swap_avg[:5]:
    # Shift the hour back three hours, then format it as HH:MM.
    eastern_time = dt.datetime.strptime(str(hour), "%H")
    adj_time = (eastern_time - pst_offset).strftime("%H:%M")
    print(adj_time, format(average, ".2f"), 'average comments per post')

# After changing the time from Eastern Standard to Pacific Standard it's
# easy to see the top five times to post and, hopefully, receive a large
# share of comments. While this is not guaranteed, it provides insight into
# trends that may help us understand characteristics of users on Hacker News
# at certain times.