# Helper class bundling ANSI escape sequences so printed text can be
# rendered in colour, bold, or underline on a terminal.
class color:
    """ANSI escape codes for styling terminal output.

    Concatenate a code before the text and terminate with ``END`` to
    reset formatting, e.g. ``color.BOLD + "hi" + color.END``.
    """
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    END = '\033[0m'  # resets all attributes back to the terminal default
In this project, we'll work with a dataset of submissions to popular technology site Hacker News.
Hacker News is a site started by the startup incubator Y Combinator, where user-submitted stories (known as "posts") receive votes and comments, similar to reddit. Hacker News is extremely popular in technology and startup circles, and posts that make it to the top of the Hacker News listings can get hundreds of thousands of visitors as a result.
You can find the data set here, but note that we have reduced from almost 300,000 rows to approximately 20,000 rows by removing all submissions that didn't receive any comments and then randomly sampling from the remaining submissions. Below are descriptions of the columns:
We're specifically interested in posts with titles that begin with either Ask HN or Show HN. Users submit Ask HN posts to ask the Hacker News community a specific question. Likewise, users submit Show HN posts to show the community a project, product, or just something interesting.
We'll compare these two types of posts to determine the following:
Let's start by importing the libraries we need and reading the dataset into a list of lists.
# Read the Hacker News dataset into a list of lists.
import csv

# Use a context manager so the file handle is always closed (the original
# left `opened_file` open for the lifetime of the program). `newline=""`
# is the csv-module-documented way to open CSV files.
with open("hacker_news.csv", newline="") as opened_file:
    hn = list(csv.reader(opened_file))

# Show the first couple of rows to get an idea of the structure of the data.
for row in hn[:4]:
    print(row)
    print('\n')
['id', 'title', 'url', 'num_points', 'num_comments', 'author', 'created_at'] ['12224879', 'Interactive Dynamic Video', 'http://www.interactivedynamicvideo.com/', '386', '52', 'ne0phyte', '8/4/2016 11:52'] ['10975351', 'How to Use Open Source and Shut the Fuck Up at the Same Time', 'http://hueniverse.com/2016/01/26/how-to-use-open-source-and-shut-the-fuck-up-at-the-same-time/', '39', '10', 'josep2', '1/26/2016 19:30'] ['11964716', "Florida DJs May Face Felony for April Fools' Water Joke", 'http://www.thewire.com/entertainment/2013/04/florida-djs-april-fools-water-joke/63798/', '2', '1', 'vezycash', '6/23/2016 22:20']
# Separate the header row from the data rows so analysis loops only
# ever see actual records.
headers, hn = hn[0], hn[1:]

# Sanity check: the header and the first remaining data row.
print(headers)
print("\n")
print(hn[0])
['id', 'title', 'url', 'num_points', 'num_comments', 'author', 'created_at'] ['12224879', 'Interactive Dynamic Video', 'http://www.interactivedynamicvideo.com/', '386', '52', 'ne0phyte', '8/4/2016 11:52']
Now that all the data has been imported and the header has been split from the actual data, it's time for the next step in this project.
First the data will be checked for missing values. After the checks have been performed the data will be filtered.
# Define a function to search for missing data in the given dataset.
def check_null_data(dataset_header, dataset):
    """Count missing (empty-string) values per column and print a summary.

    Args:
        dataset_header: list of column names.
        dataset: list of rows (lists of strings), each as wide as the header.

    Prints one line per column with the number of empty cells found.
    """
    total_missing = []
    for column, name in enumerate(dataset_header):
        # One pass per column; sum() over a generator replaces the
        # original's manual counter plus redundant `null_value` flag.
        null_count = sum(1 for row in dataset if row[column] == '')
        total_missing.append([name, null_count])
    # Report the number of missing values identified in every column.
    template = ("Column " + color.BOLD + "{} " + color.END + "has "
                + color.BLUE + color.BOLD + "{} " + color.END
                + "missing values")
    for col, missing in total_missing:
        print(template.format(col, missing))


# Run the check on the full dataset.
check_null_data(headers, hn)
Column id has 0 missing values Column title has 0 missing values Column url has 2440 missing values Column num_points has 0 missing values Column num_comments has 0 missing values Column author has 0 missing values Column created_at has 0 missing values
As can be seen above, there are a lot of missing values in the column 'url'. All other columns show no missing values.
Since the information about URL is not deemed necessary we can continue with filtering the data.
As mentioned before we are especially interested in 'ask posts' and 'show posts'. The next step will be to split the dataset into three lists.
# Partition the dataset into ask posts, show posts, and everything else.
ask_posts = []
show_posts = []
other_posts = []

# Fill the lists according to the type of post, keyed off the title prefix.
for row in hn:
    # Lower-case the title once per row instead of once per comparison.
    title = row[1].lower()
    if title.startswith("ask hn"):
        ask_posts.append(row)
    elif title.startswith("show hn"):
        show_posts.append(row)
    else:
        other_posts.append(row)

# Show how many posts there are of each type.
template = "There are" + color.BOLD + " {:,} " + color.END + "of {} in the dataset "
print(template.format(len(ask_posts), "'ask posts'"))
print(template.format(len(show_posts), "'show posts'"))
print(template.format(len(other_posts), "'other posts'"))
There are 1,744 of 'ask posts' in the dataset There are 1,162 of 'show posts' in the dataset There are 17,194 of 'other posts' in the dataset
The data has been split and we can continue with the analysis.
The first question that we want to answer is:
Do Ask HN or Show HN receive more comments on average?
We will find out in the next step.
# Helper for retrieving the average number of comments per post.
def calc_avg(a_list, index):
    """Return the average of the integer values at ``index`` across rows.

    Args:
        a_list: list of rows (lists); each row holds string-encoded numbers.
        index: column position of the value to average.

    Returns:
        The average rounded to 2 decimals, or 0.0 for an empty list
        (the original raised ZeroDivisionError in that case).
    """
    if not a_list:
        return 0.0
    # sum() over a generator replaces the manual accumulator loop.
    total_comments = sum(int(row[index]) for row in a_list)
    return round(total_comments / len(a_list), 2)
# Compute both averages first (num_comments is at index 4), then print.
avg_ask_comments = calc_avg(ask_posts, 4)
avg_show_comments = calc_avg(show_posts, 4)

template = "Average {} comments per post: " + color.BOLD + "{}" + color.END
print(template.format("ask", avg_ask_comments))
print(template.format("show", avg_show_comments))
Average ask comments per post: 14.04 Average show comments per post: 10.32
After calculating the average amount of comments per post for both ask posts as well as show post it can be concluded that ask posts receive more comments on average.
The second question that needs to be answered is:
Do posts created at a certain time receive more comments on average?
import datetime as dt

# Pair each ask post's creation time with its comment count:
# each entry is [created_at (index 6), num_comments (index 4)].
# A list comprehension replaces the original append loop.
result_list = [[row[6], int(row[4])] for row in ask_posts]
# Tally the number of posts and the number of comments per hour of creation.
counts_by_hour = {}
comments_by_hour = {}

for created_at, nr_of_comments in result_list:
    # Parse "month/day/year hour:minute" and keep only the hour ("00".."23").
    hour = dt.datetime.strptime(created_at, "%m/%d/%Y %H:%M").strftime("%H")
    # dict.get with a default replaces the explicit membership branch.
    counts_by_hour[hour] = counts_by_hour.get(hour, 0) + 1
    comments_by_hour[hour] = comments_by_hour.get(hour, 0) + nr_of_comments

# Sort both dictionaries by hour (the shared key) for readable output.
sorted_counts_by_hour = dict(sorted(counts_by_hour.items()))
sorted_comments_by_hour = dict(sorted(comments_by_hour.items()))
avg_by_hour = []
# Both dictionaries were built from the same rows, so they share the same
# hour keys; a single pass with a direct lookup replaces the original
# O(n^2) nested-loop key matching.
for hour, count in sorted_counts_by_hour.items():
    avg_by_hour.append([hour, sorted_comments_by_hour[hour] / count])
# Reorder each pair to [avg, hour] so that sorting ranks by average value.
swap_avg_by_hour = [[avg, hour] for hour, avg in avg_by_hour]
sorted_swap = sorted(swap_avg_by_hour, reverse=True)

print('\n')
print(color.BOLD + "Top 5 Hours for Ask Posts Comments" + color.END)
# Fixed wording: the original message ended in "posts" where it meant the
# average number of *comments* a post receives.
template = ("At " + color.BOLD + "{}" + color.END
            + " a post receives on average " + color.BOLD + "{:.2f}"
            + color.END + " comments")

# Retrieve the top five hours by average comments and print them.
for avg, hour in sorted_swap[:5]:
    hour = dt.datetime.strptime(hour, "%H").strftime("%H:%M")
    print(template.format(hour, avg))
Top 5 Hours for Ask Posts Comments At 15:00 a post receives on average 38.59 posts At 02:00 a post receives on average 23.81 posts At 20:00 a post receives on average 21.52 posts At 16:00 a post receives on average 16.80 posts At 21:00 a post receives on average 16.01 posts
The results show that at certain hours posts tend to receive more comments.
The end of the afternoon and beginning of the evening are times that a post is more likely to receive a lot of comments.