created_at: A timestamp of the story's creation time.
created_at_i: A unix epoch timestamp.
url: The URL of the story link.
objectID: The ID of the story.
author: The story's author (username on HN).
points: The number of upvotes the story had.
title: The headline of the post.
num_comments: The number of comments the post has.
from datetime import datetime
import json
import io
import string
import csv
from pipeline import build_csv
from pipeline import Pipeline
from stop_words import stop_words
# Shared Pipeline instance; each function below is decorated with its task() method.
pipeline=Pipeline()
@pipeline.task()
def file_to_json():
    """Load the Hacker News stories from ``hn_stories_2014.json``.

    Returns:
        The value stored under the top-level ``"stories"`` key of the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
        KeyError: If the decoded document has no ``"stories"`` key.
    """
    # JSON text is UTF-8 by specification; being explicit avoids depending on
    # the platform's default locale encoding (which breaks non-ASCII titles
    # on systems whose default is not UTF-8).
    with open('hn_stories_2014.json', 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data['stories']
#### Next, we need to keep only the most popular stories of the year.
@pipeline.task(depends_on=file_to_json)
def filter_stories(stories):
    """Lazily select the popular stories that are not "Ask HN" posts.

    A story is kept when it has more than 50 points, more than 1 comment,
    and a title that does not start with "Ask HN" (case-insensitive).

    Args:
        stories: Iterable of story dicts providing the ``points``,
            ``num_comments`` and ``title`` keys.

    Returns:
        A generator over the stories that pass the filter.
    """
    def is_popular(story):
        """Return True when *story* meets the popularity criteria."""
        # Compare case-insensitively: titles are written "Ask HN: ...", so an
        # exact match against the upper-case "ASK HN" never excluded anything.
        return (story["points"] > 50 and story["num_comments"] > 1
                and not story["title"].lower().startswith("ask hn"))

    return (story for story in stories if is_popular(story))
#### Now we convert the story dictionaries to CSV.
#### The purpose of translating the dictionaries to a CSV is to
#### have a consistent data format when running the later
#### summarizations. By keeping the data format consistent,
#### each pipeline task remains adaptable to future task
#### requirements.
@pipeline.task(depends_on=filter_stories)
def json_to_csv(stories):
    """Turn story dicts into CSV form via ``build_csv``.

    Each story contributes one row of (objectID, created_at, url, points,
    title), with ``created_at`` parsed into a ``datetime``.

    Args:
        stories: Iterable of story dicts.

    Returns:
        Whatever ``build_csv`` returns for the rows, the header and a fresh
        ``io.StringIO`` (presumably the populated file object; confirm
        against ``pipeline.build_csv``).
    """
    columns = ['objectID', 'created_at', 'url', 'points', 'title']
    timestamp_format = "%Y-%m-%dT%H:%M:%SZ"
    rows = [
        (
            item['objectID'],
            datetime.strptime(item['created_at'], timestamp_format),
            item['url'],
            item['points'],
            item['title'],
        )
        for item in stories
    ]
    return build_csv(rows, header=columns, file=io.StringIO())
#### Once we have extracted the titles of each popular post,
#### we can run the word-frequency task that follows.
@pipeline.task(depends_on=json_to_csv)
def extract_titles(csv_file):
    """Return a generator over the ``title`` column of a CSV file.

    The header row is consumed immediately to locate the column; the data
    rows are read lazily as the returned generator is iterated.

    Args:
        csv_file: Iterable of CSV lines whose first row is a header
            containing ``'title'``.

    Returns:
        A generator yielding the title field of every data row.
    """
    rows = csv.reader(csv_file)
    # The first row is the header; use it to find where the titles live.
    title_col = next(rows).index('title')

    def iter_titles():
        for row in rows:
            yield row[title_col]

    return iter_titles()
@pipeline.task(depends_on=extract_titles)
def clean_titles(titles):
    """Yield each title lower-cased with ``string.punctuation`` removed.

    Args:
        titles: Iterable of title strings.

    Yields:
        The normalized form of each title, in input order.
    """
    # Translation table that deletes every character in string.punctuation.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    for raw in titles:
        yield raw.lower().translate(strip_punctuation)
@pipeline.task(depends_on=clean_titles)
def build_keyword_dictionary(titles):
    """Count how often each non-stop word occurs across all titles.

    Args:
        titles: Iterable of cleaned title strings.

    Returns:
        A dict mapping each word that is not in ``stop_words`` to the number
        of times it appears.
    """
    word_freq = {}
    for title in titles:
        # split() with no argument collapses runs of whitespace, so removed
        # punctuation (e.g. " - ") no longer produces empty-string "words".
        for word in title.split():
            if word in stop_words:
                continue
            # Start from 0 so the first occurrence counts as 1; the previous
            # code set a new word to 1 and then incremented it again, which
            # made every count one too high.
            word_freq[word] = word_freq.get(word, 0) + 1
    return word_freq
@pipeline.task(depends_on=build_keyword_dictionary)
def top_words(keyword_dictionary):
    """Return the 100 most frequent (word, count) pairs.

    Args:
        keyword_dictionary: Mapping of word to occurrence count.

    Returns:
        A list of at most 100 ``(word, count)`` tuples ordered by count,
        highest first; ties keep the mapping's iteration order.
    """
    ranked = list(keyword_dictionary.items())
    # In-place stable sort on the count, largest first.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:100]
# Run the pipeline, then print the entry of its result keyed by the top_words task.
ran = pipeline.run()
print(ran[top_words])
[('new', 186), ('google', 168), ('', 165), ('ask', 127), ('bitcoin', 103), ('open', 96), ('programming', 93), ('web', 90), ('data', 87), ('video', 80), ('python', 76), ('code', 75), ('released', 72), ('facebook', 72), ('using', 71), ('source', 69), ('2014', 66), ('2013', 66), ('free', 66), ('javascript', 66), ('game', 65), ('internet', 63), ('c', 61), ('microsoft', 60), ('work', 60), ('linux', 59), ('app', 58), ('pdf', 56), ('software', 55), ('language', 55), ('use', 54), ('startup', 53), ('make', 52), ('apple', 51), ('time', 50), ('security', 49), ('yc', 49), ('nsa', 46), ('github', 46), ('windows', 45), ('like', 45), ('way', 43), ('project', 43), ('world', 42), ('developer', 41), ('computer', 41), ('heartbleed', 41), ('users', 41), ('1', 41), ('dont', 39), ('design', 38), ('git', 38), ('ios', 38), ('ceo', 37), ('os', 37), ('big', 37), ('vs', 37), ('twitter', 37), ('online', 37), ('life', 37), ('day', 36), ('apps', 35), ('android', 35), ('years', 35), ('best', 35), ('simple', 34), ('court', 34), ('mt', 34), ('firefox', 33), ('says', 33), ('guide', 33), ('site', 33), ('browser', 33), ('learning', 33), ('api', 33), ('gox', 33), ('problem', 32), ('server', 32), ('mozilla', 32), ('fast', 32), ('engine', 32), ('does', 31), ('better', 31), ('introducing', 31), ('text', 31), ('amazon', 31), ('year', 31), ('support', 30), ('tech', 30), ('stop', 30), ('million', 30), ('money', 30), ('people', 30), ('built', 30), ('learn', 29), ('developers', 29), ('did', 29), ('development', 29), ('3', 29), ('help', 29)]