This project analyzes Hacker News posts whose titles begin with "Ask HN" or "Show HN". It counts the posts of each kind, compares the average number of comments they receive, and determines at which time of day posts receive the most comments.
import csv
from csv import reader
# Load every row of the dataset into memory as a list of lists.
# The `with` block guarantees the file handle is closed, even if parsing fails.
# newline="" lets the csv module do its own line-ending handling, so quoted
# fields containing embedded newlines are parsed correctly.
with open("hacker_news.csv", newline="") as news_file:
    hn = list(reader(news_file))
print(hn[:5])
[['id', 'title', 'url', 'num_points', 'num_comments', 'author', 'created_at'], ['12224879', 'Interactive Dynamic Video', 'http://www.interactivedynamicvideo.com/', '386', '52', 'ne0phyte', '8/4/2016 11:52'], ['10975351', 'How to Use Open Source and Shut the Fuck Up at the Same Time', 'http://hueniverse.com/2016/01/26/how-to-use-open-source-and-shut-the-fuck-up-at-the-same-time/', '39', '10', 'josep2', '1/26/2016 19:30'], ['11964716', "Florida DJs May Face Felony for April Fools' Water Joke", 'http://www.thewire.com/entertainment/2013/04/florida-djs-april-fools-water-joke/63798/', '2', '1', 'vezycash', '6/23/2016 22:20'], ['11919867', 'Technology ventures: From Idea to Enterprise', 'https://www.amazon.com/Technology-Ventures-Enterprise-Thomas-Byers/dp/0073523429', '3', '1', 'hswarna', '6/17/2016 0:01']]
# Keep the column names aside and leave only data rows in `hn`.
headers = hn[0]
hn = hn[1:]

# Sort every post into one of three buckets based on how its title starts
# (case-insensitive).
ask_posts, show_posts, other_posts = [], [], []
for row in hn:
    title = row[1]
    lowered = title.lower()
    if lowered.startswith('ask hn'):
        bucket = ask_posts
    elif lowered.startswith('show hn'):
        bucket = show_posts
    else:
        bucket = other_posts
    bucket.append(row)

# Title of the last row processed, followed by the size of each bucket.
print(title)
for group in (ask_posts, show_posts, other_posts):
    print(len(group))
RoboBrowser: Your friendly neighborhood web scraper 1744 1162 17194
# Bare expression: shows the first five rows of ask_posts when evaluated
# interactively (e.g. as the last line of a notebook cell); it prints nothing
# when the file is run as a plain script.
ask_posts[:5]
[['12296411', 'Ask HN: How to improve my personal website?', '', '2', '6', 'ahmedbaracat', '8/16/2016 9:55'], ['10610020', 'Ask HN: Am I the only one outraged by Twitter shutting down share counts?', '', '28', '29', 'tkfx', '11/22/2015 13:43'], ['11610310', 'Ask HN: Aby recent changes to CSS that broke mobile?', '', '1', '1', 'polskibus', '5/2/2016 10:14'], ['12210105', 'Ask HN: Looking for Employee #3 How do I do it?', '', '1', '3', 'sph130', '8/2/2016 14:20'], ['10394168', 'Ask HN: Someone offered to buy my browser extension from me. What now?', '', '28', '17', 'roykolak', '10/15/2015 16:38']]
def avg_comments(data, comments_index=4):
    """Return the mean comment count across the rows in ``data``.

    Args:
        data: Sequence of rows; each row holds the comment count (anything
            ``int()`` accepts) at position ``comments_index``.
        comments_index: Position of the comment count within a row.
            Defaults to 4, the ``num_comments`` column of the dataset.

    Returns:
        The average as a float, or 0.0 when ``data`` is empty.

    Raises:
        ValueError: If a comment count cannot be converted with ``int()``.
    """
    if not data:
        # No rows to average: avoid dividing by zero.
        return 0.0
    total_comments = sum(int(row[comments_index]) for row in data)
    return total_comments / len(data)
# Average number of comments per post, for each of the two categories.
avg_ask_comments = avg_comments(ask_posts)
avg_show_comments = avg_comments(show_posts)
print(avg_ask_comments, avg_show_comments, sep="\n")
14.038417431192661 10.31669535283993
Based on these findings, we can conclude that, on average, "ask_posts" receive more comments than "show_posts".
import datetime as dt
# Reduce each Ask HN row to a [created_at, num_comments] pair, with the
# comment count converted to an integer.
result_list = [[post[6], int(post[4])] for post in ask_posts]
# Tally, per hour of the day (zero-padded "00".."23"), how many posts were
# created and how many comments those posts received in total.
counts_by_hour = {}
comments_by_hour = {}
for created_at, n_comments in result_list:
    parsed = dt.datetime.strptime(created_at, "%m/%d/%Y %H:%M")
    hour = parsed.strftime("%H")
    counts_by_hour[hour] = counts_by_hour.get(hour, 0) + 1
    comments_by_hour[hour] = comments_by_hour.get(hour, 0) + n_comments
print(counts_by_hour)
print(comments_by_hour)
{'12': 73, '17': 100, '11': 58, '20': 80, '03': 54, '21': 109, '10': 59, '15': 116, '08': 48, '01': 60, '02': 58, '14': 107, '18': 109, '09': 45, '23': 68, '04': 47, '19': 110, '00': 55, '07': 34, '16': 108, '05': 46, '06': 44, '22': 71, '13': 85} {'12': 687, '17': 1146, '11': 641, '20': 1722, '03': 421, '21': 1745, '10': 793, '15': 4477, '08': 492, '01': 683, '02': 1381, '14': 1416, '18': 1439, '09': 251, '23': 543, '04': 337, '19': 1188, '00': 447, '07': 267, '16': 1814, '05': 464, '06': 397, '22': 479, '13': 1253}
# Average comments per post for each hour, as [hour, average] pairs.
# Iterate one dictionary explicitly: `a and b` does not walk both mappings,
# it simply evaluates to one of them. Both dictionaries are filled together
# in the tallying loop, so every hour key exists in counts_by_hour too.
avg_by_hour = [
    [hour, total_comments / counts_by_hour[hour]]
    for hour, total_comments in comments_by_hour.items()
]
print(avg_by_hour)
[['12', 9.41095890410959], ['17', 11.46], ['11', 11.051724137931034], ['20', 21.525], ['03', 7.796296296296297], ['21', 16.009174311926607], ['10', 13.440677966101696], ['15', 38.5948275862069], ['08', 10.25], ['01', 11.383333333333333], ['02', 23.810344827586206], ['14', 13.233644859813085], ['18', 13.20183486238532], ['09', 5.5777777777777775], ['23', 7.985294117647059], ['04', 7.170212765957447], ['19', 10.8], ['00', 8.127272727272727], ['07', 7.852941176470588], ['16', 16.796296296296298], ['05', 10.08695652173913], ['06', 9.022727272727273], ['22', 6.746478873239437], ['13', 14.741176470588234]]
# Flip each [hour, average] pair to [average, hour] so that sorting the
# pairs orders them by average first.
swap_avg_by_hour = [[average, hour_key] for hour_key, average in avg_by_hour]

# Highest averages first.
sorted_swap = sorted(swap_avg_by_hour, reverse=True)
sorted_swap[:5]
[[38.5948275862069, '15'], [23.810344827586206, '02'], [21.525, '20'], [16.796296296296298, '16'], [16.009174311926607, '21']]
These are the five hours of the day with the highest average number of comments per Ask HN post.
# Report the five best hours, one line each: the hour rendered as "HH:00:"
# followed by the average rounded to two decimals.
for average, hour_key in sorted_swap[:5]:
    hour_label = dt.datetime.strptime(hour_key, "%H").strftime("%H:%M:")
    rounded = "{:.2f}".format(average)
    print("{hour} {avg1} average comments per post".format(hour=hour_label, avg1=rounded))
15:00: 38.59 average comments per post 02:00: 23.81 average comments per post 20:00: 21.52 average comments per post 16:00: 16.80 average comments per post 21:00: 16.01 average comments per post
After analyzing the data, we can see that there are more posts whose titles start with "Ask HN" than posts whose titles start with "Show HN".