Notebook

Analysing a data set of submissions to the popular tech site Hacker News¶

In [1]:

import csv

# Read the whole CSV into memory. The `with` block guarantees the file handle
# is closed even if parsing raises, and newline='' is what the csv module
# documentation asks for so that newlines inside quoted fields parse correctly.
with open('hacker_news.csv', newline='') as opened_file:
    read_file = csv.reader(opened_file)
    hn = list(read_file)

# The first row holds the column names; everything after it is data.
header = hn[0]
hn = hn[1:]

In [2]:

# Checkpoint: show the first three data rows to confirm the file loaded.
print(hn[:3])

[['12224879', 'Interactive Dynamic Video', 'http://www.interactivedynamicvideo.com/', '386', '52', 'ne0phyte', '8/4/2016 11:52'], ['10975351', 'How to Use Open Source and Shut the Fuck Up at the Same Time', 'http://hueniverse.com/2016/01/26/how-to-use-open-source-and-shut-the-fuck-up-at-the-same-time/', '39', '10', 'josep2', '1/26/2016 19:30'], ['11964716', "Florida DJs May Face Felony for April Fools' Water Joke", 'http://www.thewire.com/entertainment/2013/04/florida-djs-april-fools-water-joke/63798/', '2', '1', 'vezycash', '6/23/2016 22:20']]

In [3]:

# Show the column names; later cells index rows by position
# (row[1], row[4], row[6]), so this is the key for reading those indexes.
print(header)

['id', 'title', 'url', 'num_points', 'num_comments', 'author', 'created_at']

In [4]:

# Three separate, initially empty lists; each one will collect the rows
# (themselves lists) belonging to one category of post.
ask_posts, show_posts, other_posts = [], [], []

In [5]:

# Partition every submission into exactly one of three buckets according to
# how its title starts. The buckets are (re)created here so that re-running
# this cell does not append every row a second time and inflate the counts.
ask_posts = []
show_posts = []
other_posts = []

for row in hn:
    title = row[1]

    # Compare in lower case so 'Ask HN', 'ASK HN' and 'ask hn' all match.
    result_lower = title.lower()

    if result_lower.startswith('ask hn'):
        ask_posts.append(row)
    elif result_lower.startswith('show hn'):
        show_posts.append(row)
    else:
        other_posts.append(row)
        
        

In [6]:

# Report how many rows ended up in each bucket, with blank lines between
# the three messages.
ask_total = len(ask_posts)
show_total = len(show_posts)
other_total = len(other_posts)

print("There are overall {} ASK HN posts".format(ask_total))
print('\n')
print("There are overall {} SHOW HN posts".format(show_total))
print('\n')
print("There are overall {} OTHER posts".format(other_total))

There are overall 1744 ASK HN posts


There are overall 1162 SHOW HN posts


There are overall 17194 OTHER posts

In [8]:

# Cross-check the split: the sizes of the three buckets should add up to the
# number of rows in the full data set.
total_rows = len(hn)
bucket_rows = len(ask_posts) + len(show_posts) + len(other_posts)

print('The length of HN dataset is: ', total_rows)
print('\n')
print("The length of HN dataset and the length of all other 3 created lists are equal: ", total_rows == bucket_rows)

The length of HN dataset is:  20100


The length of HN dataset and the length of all other 3 created lists are equal:  True

In [11]:

# Average number of comments per 'Ask HN' post.
# Index 4 is the 'num_comments' column; csv.reader yields strings, so each
# value is converted to int before summing.
total_ask_comments = sum(int(row[4]) for row in ask_posts)
counter_length = len(ask_posts)

# Guard against an empty bucket so the cell reports 0.0 instead of failing
# with ZeroDivisionError.
avg_ask_comments = total_ask_comments / counter_length if counter_length else 0.0

print("The average calculated number of 'ASK' comments is: ", avg_ask_comments)

The average calculated number of 'ASK' comments is:  14.038417431192661

In [12]:

# Average number of comments per 'Show HN' post.
# Index 4 is the 'num_comments' column; csv.reader yields strings, so each
# value is converted to int before summing.
total_show_comments = sum(int(row[4]) for row in show_posts)
counter = len(show_posts)

# Guard against an empty bucket so the cell reports 0.0 instead of failing
# with ZeroDivisionError.
avg_show_comments = total_show_comments / counter if counter else 0.0

print("The average calculated number of 'SHOW' comments is: ", avg_show_comments)

The average calculated number of 'SHOW' comments is:  10.31669535283993

According to the calculations, ASK posts received more comments on average than SHOW posts did (about 14.04 versus 10.32 comments per post). In my opinion, the underlying reason is that ASK posts explicitly request help or opinions, so people are more inclined to respond to them in the comments section. Since ASK posts are more likely to receive comments, we'll focus our remaining analysis just on these posts.

Finding the Amount of Ask Posts and Comments by Hour Created¶

In [14]:

import datetime as dt

# Pair each Ask HN post's creation timestamp (index 6) with its comment
# count (index 4, converted to int).
result_list = [[post[6], int(post[4])] for post in ask_posts]

counts_by_hour = {}    # hour string -> number of posts created in that hour
comments_by_hour = {}  # hour string -> total comments on those posts

for created_at, n_comments in result_list:
    # Parse the timestamp and keep only the zero-padded hour, e.g. '09'.
    hour = dt.datetime.strptime(created_at, "%m/%d/%Y %H:%M").strftime("%H")

    # .get(hour, 0) starts a new hour at zero, so first and later
    # occurrences share one code path.
    counts_by_hour[hour] = counts_by_hour.get(hour, 0) + 1
    comments_by_hour[hour] = comments_by_hour.get(hour, 0) + n_comments

print(counts_by_hour)
print('\n')
print(comments_by_hour)

{'09': 45, '13': 85, '10': 59, '14': 107, '16': 108, '23': 68, '12': 73, '17': 100, '15': 116, '21': 109, '20': 80, '02': 58, '18': 109, '03': 54, '05': 46, '19': 110, '01': 60, '22': 71, '08': 48, '04': 47, '00': 55, '06': 44, '07': 34, '11': 58}


{'09': 251, '13': 1253, '10': 793, '14': 1416, '16': 1814, '23': 543, '12': 687, '17': 1146, '15': 4477, '21': 1745, '20': 1722, '02': 1381, '18': 1439, '03': 421, '05': 464, '19': 1188, '01': 683, '22': 479, '08': 492, '04': 337, '00': 447, '06': 397, '07': 267, '11': 641}

In [15]:

# Average comments per post for each hour, as [hour, average] pairs in the
# order the hours appear in comments_by_hour.
avg_by_hour = [
    [hour, total / counts_by_hour[hour]]
    for hour, total in comments_by_hour.items()
]

avg_by_hour

Out[15]:

[['09', 5.5777777777777775],
 ['13', 14.741176470588234],
 ['10', 13.440677966101696],
 ['14', 13.233644859813085],
 ['16', 16.796296296296298],
 ['23', 7.985294117647059],
 ['12', 9.41095890410959],
 ['17', 11.46],
 ['15', 38.5948275862069],
 ['21', 16.009174311926607],
 ['20', 21.525],
 ['02', 23.810344827586206],
 ['18', 13.20183486238532],
 ['03', 7.796296296296297],
 ['05', 10.08695652173913],
 ['19', 10.8],
 ['01', 11.383333333333333],
 ['22', 6.746478873239437],
 ['08', 10.25],
 ['04', 7.170212765957447],
 ['00', 8.127272727272727],
 ['06', 9.022727272727273],
 ['07', 7.852941176470588],
 ['11', 11.051724137931034]]

In [17]:

# Flip each [hour, average] pair to [average, hour] so that sorting the
# pairs compares the average first.
swap_avg_by_hour = [[average, hour] for hour, average in avg_by_hour]

print(swap_avg_by_hour)
print('\n')

# Sort a copy in descending order, leaving swap_avg_by_hour untouched.
sorted_swap = list(swap_avg_by_hour)
sorted_swap.sort(reverse=True)

print(sorted_swap)

[[5.5777777777777775, '09'], [14.741176470588234, '13'], [13.440677966101696, '10'], [13.233644859813085, '14'], [16.796296296296298, '16'], [7.985294117647059, '23'], [9.41095890410959, '12'], [11.46, '17'], [38.5948275862069, '15'], [16.009174311926607, '21'], [21.525, '20'], [23.810344827586206, '02'], [13.20183486238532, '18'], [7.796296296296297, '03'], [10.08695652173913, '05'], [10.8, '19'], [11.383333333333333, '01'], [6.746478873239437, '22'], [10.25, '08'], [7.170212765957447, '04'], [8.127272727272727, '00'], [9.022727272727273, '06'], [7.852941176470588, '07'], [11.051724137931034, '11']]


[[38.5948275862069, '15'], [23.810344827586206, '02'], [21.525, '20'], [16.796296296296298, '16'], [16.009174311926607, '21'], [14.741176470588234, '13'], [13.440677966101696, '10'], [13.233644859813085, '14'], [13.20183486238532, '18'], [11.46, '17'], [11.383333333333333, '01'], [11.051724137931034, '11'], [10.8, '19'], [10.25, '08'], [10.08695652173913, '05'], [9.41095890410959, '12'], [9.022727272727273, '06'], [8.127272727272727, '00'], [7.985294117647059, '23'], [7.852941176470588, '07'], [7.796296296296297, '03'], [7.170212765957447, '04'], [6.746478873239437, '22'], [5.5777777777777775, '09']]

In [18]:

print("Top 5 Hours for 'Ask HN' Comments")

# sorted_swap holds [average, hour] pairs; only the first five are shown.
for average, hour in sorted_swap[:5]:
    # Re-parse the two-digit hour so it can be rendered as HH:MM.
    hour_label = dt.datetime.strptime(hour, "%H").strftime("%H:%M")
    print("{}: {:.2f} average comments per post".format(hour_label, average))

Top 5 Hours for 'Ask HN' Comments
15:00: 38.59 average comments per post
02:00: 23.81 average comments per post
20:00: 21.52 average comments per post
16:00: 16.80 average comments per post
21:00: 16.01 average comments per post

In [ ]: