Notebook

Exploring Hacker News Posts¶

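This project explores a sample of Hacker News posts stored in `hacker_news.csv`. Each row describes one post through the columns `id`, `title`, `url`, `num_points`, `num_comments`, `author` and `created_at`. We compare how many comments Ask HN and Show HN posts receive on average, and then look at which hour of creation attracts the most comments per Ask HN post.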

Opening the file and printing the header row followed by the first 5 data rows:

In [1]:

from csv import reader
import datetime as dt

# Read the whole CSV into memory as a list of rows (each row is a list of
# strings). The `with` block closes the file handle as soon as the rows have
# been materialised instead of leaving it open for the life of the kernel.
# `newline=''` is what the csv module asks for so that line breaks embedded
# in quoted fields are handed to the reader untouched.
with open('hacker_news.csv', newline='') as file:
    file_reader = reader(file)
    hn = list(file_reader)

# Preview: the header row plus the first five posts, each followed by blank
# lines to keep the long rows visually apart.
for row in hn[0:6]:
    print(row)
    print('\n')

['id', 'title', 'url', 'num_points', 'num_comments', 'author', 'created_at']


['12224879', 'Interactive Dynamic Video', 'http://www.interactivedynamicvideo.com/', '386', '52', 'ne0phyte', '8/4/2016 11:52']


['10975351', 'How to Use Open Source and Shut the Fuck Up at the Same Time', 'http://hueniverse.com/2016/01/26/how-to-use-open-source-and-shut-the-fuck-up-at-the-same-time/', '39', '10', 'josep2', '1/26/2016 19:30']


['11964716', "Florida DJs May Face Felony for April Fools' Water Joke", 'http://www.thewire.com/entertainment/2013/04/florida-djs-april-fools-water-joke/63798/', '2', '1', 'vezycash', '6/23/2016 22:20']


['11919867', 'Technology ventures: From Idea to Enterprise', 'https://www.amazon.com/Technology-Ventures-Enterprise-Thomas-Byers/dp/0073523429', '3', '1', 'hswarna', '6/17/2016 0:01']


['10301696', 'Note by Note: The Making of Steinway L1037 (2007)', 'http://www.nytimes.com/2007/11/07/movies/07stein.html?_r=0', '8', '2', 'walterbell', '9/30/2015 4:12']

Extracting the header row and separating it from the rest of the dataset:

In [2]:

# Peel the column names off the front of the dataset; after this line `hn`
# holds data rows only. Note that `hn` is rebound here, so running this cell
# a second time would strip one more row.
headers, hn = hn[0], hn[1:]

# Column names first, then a six-row sample of the remaining data.
print(headers, hn[0:6], sep='\n')

['id', 'title', 'url', 'num_points', 'num_comments', 'author', 'created_at']
[['12224879', 'Interactive Dynamic Video', 'http://www.interactivedynamicvideo.com/', '386', '52', 'ne0phyte', '8/4/2016 11:52'], ['10975351', 'How to Use Open Source and Shut the Fuck Up at the Same Time', 'http://hueniverse.com/2016/01/26/how-to-use-open-source-and-shut-the-fuck-up-at-the-same-time/', '39', '10', 'josep2', '1/26/2016 19:30'], ['11964716', "Florida DJs May Face Felony for April Fools' Water Joke", 'http://www.thewire.com/entertainment/2013/04/florida-djs-april-fools-water-joke/63798/', '2', '1', 'vezycash', '6/23/2016 22:20'], ['11919867', 'Technology ventures: From Idea to Enterprise', 'https://www.amazon.com/Technology-Ventures-Enterprise-Thomas-Byers/dp/0073523429', '3', '1', 'hswarna', '6/17/2016 0:01'], ['10301696', 'Note by Note: The Making of Steinway L1037 (2007)', 'http://www.nytimes.com/2007/11/07/movies/07stein.html?_r=0', '8', '2', 'walterbell', '9/30/2015 4:12'], ['10482257', 'Title II kills investment? Comcast and other ISPs are now spending more', 'http://arstechnica.com/business/2015/10/comcast-and-other-isps-boost-network-investment-despite-net-neutrality/', '53', '22', 'Deinos', '10/31/2015 9:48']]

In [3]:

# Three buckets: titles that begin with "ask hn", titles that begin with
# "show hn" (both matched case-insensitively), and everything else.
ask_posts, show_posts, other_posts = [], [], []

for post in hn:
    # Column 1 is the title; lower-case it once and reuse it for both checks.
    lowered_title = post[1].lower()

    if lowered_title.startswith('ask hn'):
        bucket = ask_posts
    elif lowered_title.startswith('show hn'):
        bucket = show_posts
    else:
        bucket = other_posts
    bucket.append(post)

print('The number of ask posts is ', len(ask_posts), sep='')
print('The number of show posts is ', len(show_posts), sep='')
print('The number of other posts is ', len(other_posts), sep='')

The number of ask posts is 1744
The number of show posts is 1162
The number of other posts is 17194

In [4]:

# Sample five rows from each category; the three-newline separator leaves the
# same two blank lines between the samples as a standalone print('\n') would.
print(ask_posts[:5], show_posts[:5], sep='\n\n\n')

[['12296411', 'Ask HN: How to improve my personal website?', '', '2', '6', 'ahmedbaracat', '8/16/2016 9:55'], ['10610020', 'Ask HN: Am I the only one outraged by Twitter shutting down share counts?', '', '28', '29', 'tkfx', '11/22/2015 13:43'], ['11610310', 'Ask HN: Aby recent changes to CSS that broke mobile?', '', '1', '1', 'polskibus', '5/2/2016 10:14'], ['12210105', 'Ask HN: Looking for Employee #3 How do I do it?', '', '1', '3', 'sph130', '8/2/2016 14:20'], ['10394168', 'Ask HN: Someone offered to buy my browser extension from me. What now?', '', '28', '17', 'roykolak', '10/15/2015 16:38']]


[['10627194', 'Show HN: Wio Link  ESP8266 Based Web of Things Hardware Development Platform', 'https://iot.seeed.cc', '26', '22', 'kfihihc', '11/25/2015 14:03'], ['10646440', 'Show HN: Something pointless I made', 'http://dn.ht/picklecat/', '747', '102', 'dhotson', '11/29/2015 22:46'], ['11590768', 'Show HN: Shanhu.io, a programming playground powered by e8vm', 'https://shanhu.io', '1', '1', 'h8liu', '4/28/2016 18:05'], ['12178806', 'Show HN: Webscope  Easy way for web developers to communicate with Clients', 'http://webscopeapp.com', '3', '3', 'fastbrick', '7/28/2016 7:11'], ['10872799', 'Show HN: GeoScreenshot  Easily test Geo-IP based web pages', 'https://www.geoscreenshot.com/', '1', '9', 'kpsychwave', '1/9/2016 20:45']]

In [5]:

def _comment_stats(posts):
    """Return ``(total, average)`` of the comment counts for ``posts``.

    Each post is a row whose element at index 4 is the comment count stored
    as a string (the ``num_comments`` column). An empty list yields
    ``(0, 0.0)`` instead of raising ``ZeroDivisionError``.
    """
    total = sum(int(post[4]) for post in posts)
    average = total / len(posts) if posts else 0.0
    return total, average


# Same computation for both categories, so it lives in one helper above.
# The format spec is "{0:.2f}" rather than "{0: .2f}": the space flag reserves
# a sign column and printed a stray double space before positive values.
total_ask_comments, avg_ask_comments = _comment_stats(ask_posts)
print("The average amount of comments in Ask HN posts is {0:.2f}".format(avg_ask_comments))

total_show_comments, avg_show_comments = _comment_stats(show_posts)
print("The average amount of comments in Show HN posts is {0:.2f}".format(avg_show_comments))

The average amount of comments in Ask HN posts is  14.04
The average amount of comments in Show HN posts is  10.32

On average, Ask HN posts receive more comments than Show HN posts. For this reason, we will focus the rest of our analysis on Ask HN posts.

In [6]:

# Reminder of the column layout together with a 20-row sample of the data.
print(headers, hn[0:20], sep='\n')

['id', 'title', 'url', 'num_points', 'num_comments', 'author', 'created_at']
[['12224879', 'Interactive Dynamic Video', 'http://www.interactivedynamicvideo.com/', '386', '52', 'ne0phyte', '8/4/2016 11:52'], ['10975351', 'How to Use Open Source and Shut the Fuck Up at the Same Time', 'http://hueniverse.com/2016/01/26/how-to-use-open-source-and-shut-the-fuck-up-at-the-same-time/', '39', '10', 'josep2', '1/26/2016 19:30'], ['11964716', "Florida DJs May Face Felony for April Fools' Water Joke", 'http://www.thewire.com/entertainment/2013/04/florida-djs-april-fools-water-joke/63798/', '2', '1', 'vezycash', '6/23/2016 22:20'], ['11919867', 'Technology ventures: From Idea to Enterprise', 'https://www.amazon.com/Technology-Ventures-Enterprise-Thomas-Byers/dp/0073523429', '3', '1', 'hswarna', '6/17/2016 0:01'], ['10301696', 'Note by Note: The Making of Steinway L1037 (2007)', 'http://www.nytimes.com/2007/11/07/movies/07stein.html?_r=0', '8', '2', 'walterbell', '9/30/2015 4:12'], ['10482257', 'Title II kills investment? Comcast and other ISPs are now spending more', 'http://arstechnica.com/business/2015/10/comcast-and-other-isps-boost-network-investment-despite-net-neutrality/', '53', '22', 'Deinos', '10/31/2015 9:48'], ['10557283', 'Nuts and Bolts Business Advice', '', '3', '4', 'shomberj', '11/13/2015 0:45'], ['12296411', 'Ask HN: How to improve my personal website?', '', '2', '6', 'ahmedbaracat', '8/16/2016 9:55'], ['11337617', 'Shims, Jigs and Other Woodworking Concepts to Conquer Technical Debt', 'http://firstround.com/review/shims-jigs-and-other-woodworking-concepts-to-conquer-technical-debt/', '34', '7', 'zt', '3/22/2016 16:18'], ['10379326', 'That self-appendectomy', 'http://www.southpolestation.com/trivia/igy1/appendix.html', '91', '10', 'jimsojim', '10/13/2015 9:30'], ['11370829', 'Crate raises $4M seed round for its next-gen SQL database', 'http://techcrunch.com/2016/03/15/crate-raises-4m-seed-round-for-its-next-gen-sql-database/', '3', '1', 'hitekker', '3/27/2016 18:08'], ['11665197', 'Advertising Cannot Maintain the Internet. 
Heres the Secret Sauce Solution', 'http://evonomics.com/advertising-cannot-maintain-internet-heres-solution/', '2', '1', 'dredmorbius', '5/10/2016 4:46'], ['11981466', 'Coding Is Over', 'https://medium.com/@loorinm/coding-is-over-6d653abe8da8', '18', '14', 'prostoalex', '6/26/2016 16:36'], ['10627194', 'Show HN: Wio Link  ESP8266 Based Web of Things Hardware Development Platform', 'https://iot.seeed.cc', '26', '22', 'kfihihc', '11/25/2015 14:03'], ['11587596', 'Custom Deleters for C++ Smart Pointers', 'http://www.bfilipek.com/2016/04/custom-deleters-for-c-smart-pointers.html', '59', '18', 'ingve', '4/28/2016 10:01'], ['12335860', 'How often to update third party libraries?', '', '7', '5', 'rabid_oxen', '8/22/2016 12:37'], ['11403750', 'Review my AI based marketing bot', 'http://beta.crowdfireapp.com/?beta=agnipath', '1', '2', 'abhishekmaddy', '4/1/2016 9:45'], ['10610020', 'Ask HN: Am I the only one outraged by Twitter shutting down share counts?', '', '28', '29', 'tkfx', '11/22/2015 13:43'], ['10837634', "Ten years later, did Boston's Big Dig deliver?", 'https://www.bostonglobe.com/magazine/2015/12/29/years-later-did-big-dig-deliver/tSb8PIMS4QJUETsMpA7SpI/story.html', '109', '116', 'jseliger', '1/4/2016 18:58'], ['12121216', 'Valid.ly  Never send another OOPS message', 'https://www.valid.ly', '1', '1', 'validly', '7/19/2016 12:05']]

In [25]:

# Two tallies keyed by the hour of day (0-23) parsed from `created_at`:
#   hours_ask_posts    -> how many Ask HN posts were created in that hour
#   hours_ask_comments -> how many comments those posts received in total
hours_ask_posts = {}
hours_ask_comments = {}

for post in ask_posts:
    # Column 6 looks like '8/16/2016 9:55'; only the hour component is needed.
    posted_hour = dt.datetime.strptime(post[6], '%m/%d/%Y %H:%M').hour

    # .get(..., 0) covers the first time an hour is seen, so both dicts can be
    # updated without a separate "new key" branch.
    hours_ask_posts[posted_hour] = hours_ask_posts.get(posted_hour, 0) + 1
    hours_ask_comments[posted_hour] = hours_ask_comments.get(posted_hour, 0) + int(post[4])

print(hours_ask_posts, hours_ask_comments, sep='\n\n\n')

{0: 55, 1: 60, 2: 58, 3: 54, 4: 47, 5: 46, 6: 44, 7: 34, 8: 48, 9: 45, 10: 59, 11: 58, 12: 73, 13: 85, 14: 107, 15: 116, 16: 108, 17: 100, 18: 109, 19: 110, 20: 80, 21: 109, 22: 71, 23: 68}


{0: 447, 1: 683, 2: 1381, 3: 421, 4: 337, 5: 464, 6: 397, 7: 267, 8: 492, 9: 251, 10: 793, 11: 641, 12: 687, 13: 1253, 14: 1416, 15: 4477, 16: 1814, 17: 1146, 18: 1439, 19: 1188, 20: 1722, 21: 1745, 22: 479, 23: 543}

In [43]:

# One [hour, average comments per post] pair for every hour that has posts,
# with the average rounded to two decimals.
avg_by_hour = [
    [hour_of_day, round(hours_ask_comments[hour_of_day] / post_count, 2)]
    for hour_of_day, post_count in hours_ask_posts.items()
]

print(avg_by_hour)

[[0, 8.13], [1, 11.38], [2, 23.81], [3, 7.8], [4, 7.17], [5, 10.09], [6, 9.02], [7, 7.85], [8, 10.25], [9, 5.58], [10, 13.44], [11, 11.05], [12, 9.41], [13, 14.74], [14, 13.23], [15, 38.59], [16, 16.8], [17, 11.46], [18, 13.2], [19, 10.8], [20, 21.52], [21, 16.01], [22, 6.75], [23, 7.99]]

In [51]:

# Flip each [hour, average] pair into [average, hour] so that sorting the
# list orders it by the average first.
swap_avg_by_hour = [[average, hour_of_day] for hour_of_day, average in avg_by_hour]

print(swap_avg_by_hour)

[[8.13, 0], [11.38, 1], [23.81, 2], [7.8, 3], [7.17, 4], [10.09, 5], [9.02, 6], [7.85, 7], [10.25, 8], [5.58, 9], [13.44, 10], [11.05, 11], [9.41, 12], [14.74, 13], [13.23, 14], [38.59, 15], [16.8, 16], [11.46, 17], [13.2, 18], [10.8, 19], [21.52, 20], [16.01, 21], [6.75, 22], [7.99, 23]]

In [52]:

# Order the [average, hour] pairs from the highest average down to the lowest.
swap_avg_by_hour.sort(reverse=True)
swap_avg_by_hour

Out[52]:

[[38.59, 15],
 [23.81, 2],
 [21.52, 20],
 [16.8, 16],
 [16.01, 21],
 [14.74, 13],
 [13.44, 10],
 [13.23, 14],
 [13.2, 18],
 [11.46, 17],
 [11.38, 1],
 [11.05, 11],
 [10.8, 19],
 [10.25, 8],
 [10.09, 5],
 [9.41, 12],
 [9.02, 6],
 [8.13, 0],
 [7.99, 23],
 [7.85, 7],
 [7.8, 3],
 [7.17, 4],
 [6.75, 22],
 [5.58, 9]]

In [55]:

# Report the first six [average, hour] entries of the list. The average is
# always shown with two decimals so the column reads consistently (16.80
# rather than 16.8 next to 16.01).
for average, hour_of_day in swap_avg_by_hour[:6]:
    print("{}:00: {:.2f} comments per post".format(hour_of_day, average))

15:00: 38.59 comments per post
2:00: 23.81 comments per post
20:00: 21.52 comments per post
16:00: 16.8 comments per post
21:00: 16.01 comments per post
13:00: 14.74 comments per post

In [ ]: