#!/usr/bin/env python
# coding: utf-8

# ## Guided Project: Exploring Hacker News Posts
#
# #### In this project, we'll work with a dataset of submissions to a popular technology site called Hacker News. Hacker News is a community-based forum website, similar to Reddit, that is extremely popular in the technology sector. In this project we're specifically interested in posts with titles that begin with either Ask HN or Show HN. Users submit Ask HN posts to ask the Hacker News community a specific question. Users also submit Show HN posts to show the Hacker News community a project, product, or just something interesting.
#
# #### We'll compare these two types of posts to determine the following:
#
# * **Do Ask HN or Show HN receive more comments on average?**
# * **Do posts created at a certain time receive more comments on average?**
#
# ##### Let's start by importing the libraries I will need and reading the dataset into a list of lists.

# In[81]:


from csv import reader

# Load every row into memory. The `with` block closes the file as soon as the
# rows have been read, and newline='' is what the csv module asks for so that
# quoted fields containing line breaks are parsed correctly.
with open('hacker_news.csv', newline='') as opened_file:
    read_file = reader(opened_file)
    hn = list(read_file)

hn_header = hn[0]
print(hn[0:5])
print()
print(hn_header)


# ### Removing Headers from a list of lists:

# In[82]:


hn = hn[1:]
print(hn_header)
print()
print(hn[0:5])


# ### Extracting Ask HN and Show HN posts:

# In[136]:


ask_posts = []
show_posts = []
other_posts = []

for row in hn:
    # Column 1 is used as the title; lower-case it once so the prefix checks
    # are case-insensitive.
    title = row[1].lower()
    if title.startswith('ask hn'):
        ask_posts.append(row)
    elif title.startswith('show hn'):
        show_posts.append(row)
    else:
        other_posts.append(row)

# printed a sample of each group
print(ask_posts[0:3])
print()
print()
print(show_posts[0:3])
print()
print()
print(other_posts[0:3])


# In[84]:


# printing length for each type of post
print('Ask Posts Total:', len(ask_posts))
print()
print('Show Posts Total:', len(show_posts))
print()
print('Other Posts Total:', len(other_posts))


# ### Calculating the Average Number of Comments for Ask HN and Show HN Posts

# In[85]:


# Column 4 is used as the comment count. The list holds exactly one entry per
# post (no placeholder element), so len() below is the true number of posts.
total_ask_comments = [int(row[4]) for row in ask_posts]
print(total_ask_comments[0:20])


# In[86]:


# Guard the division so an input without any Ask HN posts yields 0.0 instead
# of raising ZeroDivisionError.
avg_ask_comments = (
    sum(total_ask_comments) / len(total_ask_comments) if total_ask_comments else 0.0
)
print(avg_ask_comments)
print(round(avg_ask_comments))


# In[87]:


total_show_comments = [int(row[4]) for row in show_posts]
print(total_show_comments[0:20])


# In[88]:


avg_show_comments = (
    sum(total_show_comments) / len(total_show_comments) if total_show_comments else 0.0
)
print(avg_show_comments)
print(round(avg_show_comments))


# In[89]:


print('Average Ask Comments:', round(avg_ask_comments))
print('Average Show Comments:', round(avg_show_comments))


# > **The average number of comments on an Ask post is 14. The average number of comments on a Show post is 10. It appears more people respond to Ask posts than to Show posts.**

# ### Finding the Number of Ask Posts and Comments by Hour Created
#
# **Since ask posts are more likely to receive comments, I'll focus the remaining analysis just on these posts.
# I'll determine if ask posts created at a certain time are more likely to attract comments.**

# In[90]:


print(hn_header)
print(ask_posts[0:5])


# In[91]:


import datetime as dt

# Pair each ask post's creation timestamp (column 6) with its comment count
# (column 4, converted to int).
result_list = [[post[6], int(post[4])] for post in ask_posts]
print(result_list[0:20])


# > **Above I have separated out the number of comments per creation date and printed a sample to see how the cleaned data looks.**
#
# **Below I total the posts and comments for each hour of the 24-hour day.**

# In[92]:


# NOTE(review): `display` is never imported or defined in this file; presumably
# it is the builtin that IPython/Jupyter injects — confirm before running this
# as a plain script.
counts_by_hour = {}
comments_by_hour = {}

for created_at, n_comments in result_list:
    # Timestamps look like "8/16/2016 9:55"; keep only the zero-padded hour.
    hour_key = dt.datetime.strptime(created_at, "%m/%d/%Y %H:%M").strftime("%H")
    counts_by_hour[hour_key] = counts_by_hour.get(hour_key, 0) + 1
    comments_by_hour[hour_key] = comments_by_hour.get(hour_key, 0) + n_comments

display(counts_by_hour)
print()
display(comments_by_hour)


# ### Calculating the Average Number of Comments for Ask HN Posts by Hour

# In[93]:


# One [hour, average comments per post] pair per hour, in first-seen order.
avg_by_hour = [
    [hour_key, comments_by_hour[hour_key] / n_posts]
    for hour_key, n_posts in counts_by_hour.items()
]
display(avg_by_hour)


# ### Sorting and Printing Values from a List of Lists

# In[138]:


# Put the average first so that sorting the pairs ranks hours by average.
swap_avg_by_hour = [[average, hour_key] for hour_key, average in avg_by_hour]

sorted_swap = list(swap_avg_by_hour)
sorted_swap.sort(reverse=True)
display(sorted_swap)


# #### Printing the "Top 5 hours for Ask Posts Comments":

# In[95]:


print('Top 5 Hours for Ask Posts Comments:', sorted_swap[0:5])


# In[139]:


# converted to a 12 hour format.
# Print the five best hours, turning the 24-hour key ("15") into a 12-hour
# label ("03:00 PM"). Each entry of sorted_swap is [average, hour].
for five_comments, hour_key in sorted_swap[0:5]:
    sorted_five = dt.datetime.strptime(hour_key, "%H").strftime("%I:%M %p")
    print("{hour}: {five_comments:.2f} average comments per post".format(
        hour=sorted_five, five_comments=five_comments))


# ### The most comments for Ask posts seem to occur during the afternoon, around 3-4 pm. The data set is already in the Eastern time zone, which is my local time zone. In conclusion, the best time to post in order to receive comments is around 3-4pm, with 8-9pm as a second-place alternative.

# ## Continuation of additional tasks:
#
# #### 1) Determine if show or ask posts receive more points on average.
# #### 2) Determine if posts created at a certain time are more likely to receive more points.
# #### 3) Compare your results to the average number of comments and points other posts receive.
#
# ### We will start with the first additional question of determining if show or ask posts receive more points on average.

# In[141]:


print(hn_header)
print()
print(ask_posts[0:4])


# In[98]:


# NOTE(review): `display` is never imported or defined in this file; presumably
# it is the builtin that IPython/Jupyter injects — confirm.

# Column 3 is used as the points value. The list holds exactly one entry per
# post (no placeholder element), so the mean divides by the real post count;
# the conditional keeps an empty group from raising ZeroDivisionError.
total_ask_points = [int(row[3]) for row in ask_posts]
avg_ask_points = (
    sum(total_ask_points) / len(total_ask_points) if total_ask_points else 0.0
)
display(round(avg_ask_points))


# In[143]:


print(hn_header)
print()
print(show_posts[0:4])


# In[100]:


total_show_points = [int(row[3]) for row in show_posts]
avg_show_points = (
    sum(total_show_points) / len(total_show_points) if total_show_points else 0.0
)
display(round(avg_show_points))


# In[101]:


print("Average Ask Points:", (round(avg_ask_points)))
print("Average Show Points:", (round(avg_show_points)))


# #### According to the above information, `Show Posts` obtain more points than `Ask Posts`.
# >#### Average Ask Points: 15
# >#### Average Show Points: 28

# ### In this part we will determine if a certain time receives more points.
# I will break it down separately for Show points and Ask points.
#
# #### First we will start with Ask posts:

# In[102]:


# Pair each ask post's creation timestamp (column 6) with its points
# (column 3, converted to int).
time_ask_posts = [[row[6], int(row[3])] for row in ask_posts]
print(time_ask_posts[0:15])


# > **Above I have separated out the number of points per creation date and printed a sample to see how the cleaned data looks.**
#
# **Below I total the posts and points for each hour of the 24-hour day.**

# In[103]:


import datetime as dt

# NOTE(review): `display` is never imported or defined in this file; presumably
# it is the builtin that IPython/Jupyter injects — confirm.

ask_date_time = {}      # hour -> number of ask posts created in that hour
ask_point_count = {}    # hour -> total points of those posts

for created_at, points in time_ask_posts:
    hour_points = dt.datetime.strptime(created_at, "%m/%d/%Y %H:%M").strftime("%H")
    ask_date_time[hour_points] = ask_date_time.get(hour_points, 0) + 1
    ask_point_count[hour_points] = ask_point_count.get(hour_points, 0) + points

display(ask_date_time)
display(ask_point_count)


# #### Avg of Ask points per hour:

# In[104]:


# From here on `avg_ask_points` is a list of [hour, average points per post]
# pairs.
avg_ask_points = [
    [hour, ask_point_count[hour] / ask_date_time[hour]] for hour in ask_date_time
]
display(avg_ask_points)


# #### Swapping the time and average column. Also rounding the average amount of points.

# In[146]:


# Rank on the exact averages and round only afterwards. Rounding first would
# make hours whose averages round to the same integer compare equal, leaving
# their order (and the top-5 cut) to be decided by the hour label instead of
# by the data.
ranked_ask_hours = sorted(
    avg_ask_points, key=lambda pair: (pair[1], pair[0]), reverse=True
)
avg_ask_points_swap = [[round(average), hour] for hour, average in ranked_ask_hours]
display(avg_ask_points_swap)


# #### Top 5 hours for the most amount of points.

# In[107]:


for five_points, hour in avg_ask_points_swap[0:5]:
    # "15" -> "03:00 PM"
    points_five = dt.datetime.strptime(hour, "%H").strftime("%I:%M %p")
    print("{hour}: {five_points:.2f} average points per post".format(
        hour=points_five, five_points=five_points))


# ___
# #### Here we can see that the highest average number of points received is `30 points at 3pm` for the `Ask posts`.
# The most points received for `Ask posts` seem to be around `1-5pm eastern standard time`. All hours in this range are in the top 5 except 2pm.
# ___
#
# ### I have already determined that `Show Posts` receive more points than `Ask Posts`. I will now clean the `Show posts` data to see in which 5 hours of the day `Show Posts` receive the most points.
#
# #### Since the code is basically an exact copy of the code run for `Ask posts` above, I will copy and paste it and run it all in fewer cells.
# ___

# In[108]:


# Pair each show post's creation timestamp (column 6) with its points
# (column 3, converted to int).
time_show_posts = [[row[6], int(row[3])] for row in show_posts]
print(time_show_posts[0:15])


# In[109]:


import datetime as dt

# NOTE(review): `display` is never imported or defined in this file; presumably
# it is the builtin that IPython/Jupyter injects — confirm.

show_date_time = {}      # hour -> number of show posts created in that hour
show_point_count = {}    # hour -> total points of those posts

for created_at, points in time_show_posts:
    hour_show = dt.datetime.strptime(created_at, "%m/%d/%Y %H:%M").strftime("%H")
    show_date_time[hour_show] = show_date_time.get(hour_show, 0) + 1
    show_point_count[hour_show] = show_point_count.get(hour_show, 0) + points

# From here on `avg_show_points` is a list of [hour, average points per post]
# pairs, ordered by hour.
avg_show_points = sorted(
    [hour, show_point_count[hour] / show_date_time[hour]] for hour in show_date_time
)

# Rank on the exact averages and round only afterwards. Rounding first would
# make hours whose averages round to the same integer compare equal, leaving
# their order (and the top-5 cut) to be decided by the hour label instead of
# by the data.
ranked_show_hours = sorted(
    avg_show_points, key=lambda pair: (pair[1], pair[0]), reverse=True
)
avg_show_points_swap = [[round(average), hour] for hour, average in ranked_show_hours]
display(avg_show_points_swap)

for show_five_points, hour in avg_show_points_swap[0:5]:
    # "22" -> "10:00 PM"
    show_points_five = dt.datetime.strptime(hour, "%H").strftime("%I:%M %p")
    print("{hour}: {show_five_points:.2f} average points per post".format(
        hour=show_points_five, show_five_points=show_five_points))


# ___
#
# #### As we can see with the top 5 `Show posts` the most points are obtained between the
# hours of 10pm to midnight eastern standard time, with 12 noon being number 2. It seems a lot of people also check Hacker News on their lunch break.
#
# #### What I have also determined is that `Show Posts` receive the most points at night, while `Ask posts` receive the most comments and points during the afternoon.
# ___

# ### I will now clean the data for `Other posts`. I will determine the average amount of comments and points received for other posts.
#
# ### I will also see if `Other posts` has any noticeable pattern that can be observed.

# In[150]:


# Pair each other post's creation timestamp (column 6) with its comment count
# (column 4, converted to int).
other_posts_comments = [[row[6], int(row[4])] for row in other_posts]


# In[148]:


import datetime as dt

# NOTE(review): `display` is never imported or defined in this file; presumably
# it is the builtin that IPython/Jupyter injects — confirm.

other_posts_date = {}    # hour -> number of other posts created in that hour
other_posts_com = {}     # hour -> total comments on those posts

for created_at, comments in other_posts_comments:
    hour_comments_other = dt.datetime.strptime(
        created_at, "%m/%d/%Y %H:%M").strftime("%H")
    other_posts_date[hour_comments_other] = (
        other_posts_date.get(hour_comments_other, 0) + 1)
    other_posts_com[hour_comments_other] = (
        other_posts_com.get(hour_comments_other, 0) + comments)

avg_other_posts = [
    [hour, other_posts_com[hour] / other_posts_date[hour]]
    for hour in other_posts_date
]


# In[112]:


# Put the average first so that sorting the pairs ranks hours by average.
swap_other_posts = [[average, hour] for hour, average in avg_other_posts]

sorted_other_posts = sorted(swap_other_posts, reverse=True)
display(sorted_other_posts)

# NOTE(review): the label below says "per hour", but the value is the average
# number of comments per post for posts created in that hour; the text is left
# untouched because it is program output.
for swap_other_five, hour in sorted_other_posts[0:5]:
    swap_other_time = dt.datetime.strptime(hour, "%H").strftime("%I:%M %p")
    display("{hour}: {swap_other_five: .2f} average comments per hour for other posts".format(
        hour=swap_other_time, swap_other_five=swap_other_five))


# In[113]:


# One entry per post (no placeholder element), so the mean divides by the real
# post count; the conditional keeps an empty group from raising
# ZeroDivisionError.
total_other_comments = [int(row[4]) for row in other_posts]
avg_other_comments = (
    sum(total_other_comments) / len(total_other_comments)
    if total_other_comments else 0.0
)

print('Average Other Comments:', round(avg_other_comments))
print('Average Ask Comments:', round(avg_ask_comments))
print('Average Show Comments:', round(avg_show_comments))


# #### We started by cleaning the comments by hour for other posts. The end result shows that posts created at 2pm eastern standard time receive the most comments on average. The most popular time range seems to be between 11am and 3pm.
#
# #### In conclusion, comparing the overall average number of comments per post: Other posts have the most activity with comments, Ask posts come in 2nd place and Show posts come last.
#
# `Average Other Comments: 27
# Average Ask Comments: 14
# Average Show Comments: 10`
#
# ___
#
# ### Now I will look at the amount of points received per hour for Other posts and its average amount of points compared to Ask and Show posts.

# In[114]:


# Pair each other post's creation timestamp (column 6) with its points
# (column 3, converted to int).
other_posts_points = [[row[6], int(row[3])] for row in other_posts]
print(other_posts_points[0:15])


# In[115]:


other_post_date = {}    # hour -> number of other posts created in that hour
other_points = {}       # hour -> total points of those posts

for created_at, points in other_posts_points:
    hour_points_other = dt.datetime.strptime(
        created_at, "%m/%d/%Y %H:%M").strftime("%H")
    other_post_date[hour_points_other] = (
        other_post_date.get(hour_points_other, 0) + 1)
    other_points[hour_points_other] = (
        other_points.get(hour_points_other, 0) + points)

avg_other_points = [
    [hour, other_points[hour] / other_post_date[hour]] for hour in other_post_date
]

swap_other_points = [[average, hour] for hour, average in avg_other_points]

sorted_other_points = sorted(swap_other_points, reverse=True)
display(sorted_other_points)
# NOTE(review): `display` is never imported or defined in this file; presumably
# it is the builtin that IPython/Jupyter injects — confirm.
# NOTE(review): the label below says "per hour", but the value is the average
# number of points per post for posts created in that hour; the text is left
# untouched because it is program output.
for swap_other_points_five, hour in sorted_other_points[0:5]:
    # "13" -> "01:00 PM"
    swap_other_date_times = dt.datetime.strptime(hour, "%H").strftime("%I:%M %p")
    display("{hour}: {swap_other_points_five: .2f} average points per hour for other posts".format(
        hour=swap_other_date_times, swap_other_points_five=swap_other_points_five))


# ___
# #### The highest average is 62.5 points per post, for posts created during the 1pm hour. It seems that other posts receive the most points during the range of 1-3 pm EST. This falls in line with the peak range for comments on other posts, which is 11am-3pm EST.
# ___

# In[133]:


# Column 3 is used as the points value. One entry per post (no placeholder
# element), so the mean divides by the real post count; the conditional keeps
# an empty group from raising ZeroDivisionError.
total_other_posts_points = [int(row[3]) for row in other_posts]

# From here on `avg_other_points` is the overall average (a single number).
avg_other_points = (
    sum(total_other_posts_points) / len(total_other_posts_points)
    if total_other_posts_points else 0.0
)

print("Average Other Points:", (round(avg_other_points)))


# ___
# #### The average number of points per post is 55 for other posts. Other posts receive more comments and points than Ask or Show posts. Another observation is that the peak times for points on other posts and on ask posts seem to overlap in a similar time frame. Below I will break it down for comparison.
# ___

# ### In conclusion Other Posts are the most popular overall:
#
# **Points:**
#
# * **Average Other Posts Points: 55**
# * **Average Show Posts Points: 28**
# * **Average Ask Posts Points: 15**
#
# **Comments:**
#
# * **Average Other Posts Comments: 27**
# * **Average Ask Posts Comments: 14**
# * **Average Show Posts Comments: 10**
#
# **Peak times for points:**
#
# * **Average Other Posts Points: 1-3 pm EST**
# * **Average Ask Posts Points: 1-5 pm EST**
# * **Average Show Posts Points: 10pm to 12am EST**
#
# **Peak time for comments:**
#
# * **Average Other Posts Comments: 11am to 3pm**
# * **Average Ask Posts Comments: 3-4 pm**
# * **Average Show Posts Comments: unknown (not computed in this analysis)**