#!/usr/bin/env python
# coding: utf-8

# # Hacker News Posts: which ones have most interactions and which are the best hours to post.
#
# The objective of this project is to gain insights about Hacker News post
# activity. This information can be used to decide which kind of post to
# publish, and at which hour, to increase the chance of obtaining the most
# points or comments.
#
# We use string methods to select the titles we need, and datetime methods
# to extract hour information.
#
# The most important insight is that the results depend on the type of post
# being analyzed: news posts get more points early, ask posts get more
# interactions in the afternoon, and show posts late at night. So we need to
# be careful about what we want to post in order to choose the best hour to
# publish.
#
# ## Opening the dataset
#
# First, we need to open the dataset. It was obtained from this
# [link](https://www.kaggle.com/hacker-news/hacker-news-posts). There are no
# comments or special observations on the dataset's discussion page.
#
# The columns are described as follows:
#
# * id: the unique identifier from HN for the post
# * title: the title of the post
# * url: the URL that the post links to, if the post has one
# * num_points: the number of points the post acquired, calculated as the
#   total number of upvotes minus the total number of downvotes
# * num_comments: the number of comments that were made on the post
# * author: the username of the person who submitted the post
# * created_at: the date and time at which the post was submitted

# In[1]:

import csv

# Read the whole CSV into a list of rows. A context manager closes the
# file handle deterministically (the original `open(...)` was never closed).
with open('hacker_news.csv') as open_file:
    hn = list(csv.reader(open_file))

hn[:5]

# ## Remove the headers
#
# After opening the dataset we remove the header row.

# In[2]:

headers = hn[0]
hn = hn[1:]
print(headers)
print(hn[:5])

# ## Create list of "ask posts" and "show posts"
#
# In this part we are going to use the string methods `startswith` and `lower()`.
# # + `startswith` compares the first characters of a string with a given
#   prefix.
# + `lower` converts the string to lower case. Because `startswith` is a
#   case-sensitive method, we normalize the case to avoid missing data.
#
# On Hacker News there are two kinds of posts generated by the users:
# Ask Hacker News (Ask HN) and Show Hacker News (Show HN). There are other
# kinds, but in this exercise we focus on these two.

# In[3]:

# Empty lists used to sort the rows into the three categories.
ask_posts = []
show_posts = []
other_posts = []

for row in hn:
    # Lower-case the title once per row (it was previously lowered twice).
    title = row[1].lower()
    if title.startswith("ask hn"):
        ask_posts.append(row)
    elif title.startswith("show hn"):
        show_posts.append(row)
    else:
        other_posts.append(row)

# Print the lengths to verify the quantities.
print(len(ask_posts))
print(len(show_posts))
print(len(other_posts))

# In[4]:

# Print the first rows of each list to see the results.
print(ask_posts[:2])
print(show_posts[:2])
print(other_posts[:2])

# ## Determine which kind of post has the most comments
#
# Once we have three lists with different kinds of posts, we identify which
# kind has the most comments on average. Because we are not comparing the
# other posts, that list is not used here.

# In[5]:

# Sum the comments (column index 4) per category, then average over the
# number of posts in that category.
total_ask_comments = sum(int(row[4]) for row in ask_posts)
avg_ask_comments = total_ask_comments / len(ask_posts)

total_show_comments = sum(int(row[4]) for row in show_posts)
avg_show_comments = total_show_comments / len(show_posts)

print(avg_ask_comments)
print(avg_show_comments)

# On average, ask posts have 14 comments per post, while show posts have 10
# comments per post. So we analyze the ask posts first.
# ## At which time the ask posts have more comments
#
# Now that we know which kind of post has the most comments on average, we
# want to know:
#
# 1. At which hours the ask posts are created.
# 2. The average number of comments ask posts receive per hour.
#
# For these tasks we need `datetime`. This class lets us convert our raw
# date strings into standardized datetime objects that Python can use and
# manipulate for further analysis.

# In[6]:

import datetime as dt  # methods to work with time data

# Store the creation date and the number of comments of each ask post.
result_list = []
for row in ask_posts:
    result_list.append([row[6], int(row[4])])

print(result_list[:5])

# Two dictionaries: one counts the posts created in each hour, the other
# sums the comments received by posts created in that hour.
counts_by_hour = {}
comments_by_hour = {}
date_format = "%m/%d/%Y %H:%M"  # loop-invariant, so hoisted out of the loop

for created_at, comments in result_list:
    # strptime parses the raw string into a datetime; strftime("%H")
    # extracts the hour of creation as a "00".."23" string.
    hour = dt.datetime.strptime(created_at, date_format).strftime("%H")
    if hour not in counts_by_hour:
        counts_by_hour[hour] = 1
        comments_by_hour[hour] = comments
    else:
        counts_by_hour[hour] += 1
        comments_by_hour[hour] += comments

comments_by_hour

# ## Calculating the average number of comments per post during each hour of the day
#
# In this part we calculate the average number of comments per hour and
# assign the values to a new list.

# In[7]:

avg_by_hour = [[hour, comments_by_hour[hour] / counts_by_hour[hour]]
               for hour in comments_by_hour]
avg_by_hour

# ## Sorting the results
#
# Now that we have the average number of comments, we sort the results so
# we can see more clearly which hours receive the most comments.
# To sort, we swap each [hour, avg] pair so the average comes first and the
# built-in `sorted` function can order by it.

# In[8]:

swap_avg_by_hour = [[avg, hour] for hour, avg in avg_by_hour]
print(swap_avg_by_hour)

# In[9]:

# `reverse=True` sorts from largest to smallest average comments.
sorted_swap = sorted(swap_avg_by_hour, reverse=True)
print(sorted_swap)

# ## The top 5 hours to Ask Posts Comments
#
# Now, with the sorted data, we have the info we needed to know which hours
# are the best to post, so we print the first five results with their
# average number of comments.

# In[10]:

print("Top 5 Hours for Ask Posts Comments")
hour_format = "%H"  # we only need the hour info; hoisted out of the loop
for avg, hour_str in sorted_swap[:5]:
    hr = dt.datetime.strptime(hour_str, hour_format).strftime("%H:%M")
    # `format` injects the values into a template string we can print.
    string = "{h} ---> {avg:.2f} average comments per post".format(h=hr, avg=avg)
    print(string)

# ## Converting to my timezone
#
# This is a failed experiment, but an instructive one. In this part I first
# assign a timezone to the data, and then convert this datetime to my
# timezone, using the `pytz` package.
#
# The timezones Python uses can be listed with a pytz function, or seen
# [here](https://stackoverflow.com/questions/13866926/is-there-a-list-of-pytz-timezones).
# According to the documentation the dataset's datetimes are 'US/Eastern',
# so I need to convert them to 'America/Lima'.
# In[11]:

from pytz import timezone  # attaches a time zone to naive datetimes
import pytz

# NOTE(review): parsing only "%H" produces a datetime in the year 1900, and
# pytz resolves 'US/Eastern' in 1900 to Local Mean Time (UTC-05:08) — that
# is why the original experiment came out 48 minutes off (14:48 instead of
# 14:00). Pinning a modern year with .replace(year=2016) makes pytz pick
# the proper standard offset.

eastern = timezone('US/Eastern')  # loop-invariant: build the tz object once
hour_format = "%H"

# An empty list to fill with the timezone-aware hours.
sorted_swap_east = []

for row in sorted_swap[:5]:
    # Parse the hour, then anchor it to a modern date so the timezone
    # lookup uses the standard offset, not 1900's Local Mean Time.
    hour = dt.datetime.strptime(row[1], hour_format).replace(year=2016)
    # .localize attaches the timezone to the naive datetime.
    hour_tz = eastern.localize(hour)
    hr = hour_tz.strftime("%H:%M")
    sorted_swap_east.append([row[0], hour_tz])
    string = "{h} ---> {avg:.2f} average comments per post".format(h=hr, avg=row[0])
    print(string)

print ('\n')

# In[12]:

tz = timezone('America/Lima')  # my local timezone

for row in sorted_swap_east:
    # .astimezone converts the aware datetime into my timezone.
    hour_ame = row[1].astimezone(tz)
    hr = hour_ame.strftime("%H:%M")
    string = "{h} ---> {avg:.2f} average comments per post".format(h=hr, avg=row[0])
    print(string)

# With the default 1900 date the result was not good: I expected `14:00`
# but obtained `14:48`. The cause was pytz falling back to Local Mean Time
# for dates before timezone standardization; anchoring the datetime to a
# modern year fixes the offset.

# ## Which kind of posts receive more points
#
# With the comments part finished, we explore the points. It is exactly the
# same procedure, but in this case we use the index for points, so we only
# need to extract the data and see what we obtain.
# In[13]:

# Sum the points (column index 3) per category, then average.
total_ask_points = sum(int(row[3]) for row in ask_posts)
avg_ask_points = total_ask_points / len(ask_posts)

total_show_points = sum(int(row[3]) for row in show_posts)
avg_show_points = total_show_points / len(show_posts)

print(avg_ask_points)
print(avg_show_points)

# So in this case we can see that the show posts have more points than the
# ask posts, which is totally expected because ask posts are designed to
# generate comments, and show posts are not.

# ## Looking the average points
#
# The best exercise in this case is to follow the same steps as the
# comments analysis and apply the code to obtain the average points per
# hour for every kind of post. The three cells below were copy-pasted
# variants of the same logic, so the logic now lives in one helper.


def avg_points_per_hour(posts, value_index=3, date_index=6):
    """Return [[avg, hour], ...] sorted descending by average value.

    posts       -- list of dataset rows
    value_index -- column holding the numeric value to average (points = 3)
    date_index  -- column holding the 'MM/DD/YYYY HH:MM' creation date
    """
    date_format = "%m/%d/%Y %H:%M"
    counts_by_hour = {}
    values_by_hour = {}
    for row in posts:
        # Parse the creation date and extract the hour ("00".."23").
        hour = dt.datetime.strptime(row[date_index], date_format).strftime("%H")
        value = int(row[value_index])
        if hour not in counts_by_hour:
            counts_by_hour[hour] = 1
            values_by_hour[hour] = value
        else:
            counts_by_hour[hour] += 1
            values_by_hour[hour] += value
    # Swap to [avg, hour] so sorted() orders by the average, largest first.
    swapped = [[values_by_hour[h] / counts_by_hour[h], h] for h in values_by_hour]
    return sorted(swapped, reverse=True)


# In[14]:

sortedpoint_swap = avg_points_per_hour(ask_posts)
sortedpoint_swap[:5]

# In[17]:

sortedpoint_swap = avg_points_per_hour(show_posts)
sortedpoint_swap[:5]

# In[18]:

# BUG FIX: the original other-posts cell (1) appended the stale `n_points`
# value left over from the previous cell for every row instead of the
# row's own points, (2) accumulated the stale `points` variable instead of
# the current row's value, and (3) divided by `countspoint_by_hour` (the
# show-posts counts) instead of the other-posts counts. The shared helper
# computes the averages correctly.
sortedother_swap = avg_points_per_hour(other_posts)
sortedother_swap[:5]

# So, we can summarize the findings:
#
# + The ask posts obtained more points in the afternoon, coinciding with
#   some of the hours seen in the comments analysis.
# # + The show posts obtained more points late at night, because the people
#   who like to explore new content use their free hours to do so.
#
# + The other posts obtained more points early in the morning, which
#   coincides with the habit of reading the news before going to work.
# ## Conclusions
#
# In this project we wanted to find which kinds of posts get more comments
# and at which hours.
#
# We found that it depends on the type of post, and clearly each type of
# post has different audience hours.
#
# We need to know what we want to publish: if it is an ask post, the
# afternoon is a good time to publish; show posts do better late at night,
# and other posts, such as news, are better in the morning.