from mrjob.job import MRJob


# A simple mrjob job that counts the characters, words, and lines in its input.
class MRWordFrequencyCount(MRJob):

    def mapper(self, _, line):
        yield "chars", len(line)
        yield "words", len(line.split())
        yield "lines", 1

    def reducer(self, key, values):
        yield key, sum(values)

if __name__ == '__main__':
    MRWordFrequencyCount.run()


# Call the reducer method on an MRWordFrequencyCount instance with an example
# key and iterable of values.  (Calling it on the class itself would fail,
# because no 'self' argument would be supplied.)
job = MRWordFrequencyCount(args=[])
my_key, my_values = "words", [3, 5, 2]      # example key and values
print list(job.reducer(my_key, my_values))  # reducer() returns a generator


import re

WORD_RE = re.compile(r"[\w']+")


# A two-step job that finds the most frequently used word in the input.
class MRMostUsedWord(MRJob):

    def mapper_get_words(self, _, line):
        # yield each word in the line
        for word in WORD_RE.findall(line):
            yield (word.lower(), 1)

    def combiner_count_words(self, word, counts):
        # optimization: sum the words we've seen so far
        yield (word, sum(counts))

    def reducer_count_words(self, word, counts):
        # send all (num_occurrences, word) pairs to the same reducer.
        # num_occurrences is so we can easily use Python's max() function.
        yield None, (sum(counts), word)

    # discard the key; it is just None
    def reducer_find_max_word(self, _, word_count_pairs):
        # each item of word_count_pairs is (count, word),
        # so yielding one results in key=counts, value=word
        yield max(word_count_pairs)

    def steps(self):
        return [
            self.mr(mapper=self.mapper_get_words,
                    combiner=self.combiner_count_words,
                    reducer=self.reducer_count_words),
            self.mr(reducer=self.reducer_find_max_word)
        ]

if __name__ == '__main__':
    MRMostUsedWord.run()


# This function converts a list (or any other iterable) into a generator.
def example_generator(items):
    for item in items:
        yield item

# Create a generator.
my_generator = example_generator([0, 1, 2, 3, 4])

# Iterating over the generator works great the first time...
print "generator iteration 1"
print "---------------------"
for value in my_generator:
    print value

# ...but not the second time: the generator is already exhausted.
print "\n"
print "generator iteration 2"
print "---------------------"
for value in my_generator:
    print value


# Quick check that numpy and matplotlib are installed and importable.
import numpy as np
print np.__name__

import matplotlib.pyplot as plt
print plt.__name__


# Run the job locally:
python MRMostUsedWord.py some_file.txt > most_used_word.out

# Run the job on Amazon Elastic MapReduce:
python MRMostUsedWord.py -r emr some_file.txt > most_used_word.out


# Load the word list, one word per line.
word_list = [word.strip() for word in open("word_list.txt").readlines()]
print "{0} words in list".format(len(word_list))
print "First ten words: {0}".format(", ".join(word_list[0:10]))


# Peek at the friends file: print the first line and the number of friends it
# lists (the first two fields are the person's name and team affiliation).
friends = open("baseball_friends.csv").readlines()
print friends[0].strip()
print len(friends[0].split(",")) - 2


import pandas as pd
import json

# Read results.  Each mrjob output line is two tab-separated JSON fields:
# the key (a name) and the value (a [team, redsox_count, cardinals_count] list).
result_file = "baseball_friends.out"
result = [[json.loads(field) for field in line.strip().split('\t')]
          for line in open(result_file)]

# Break out columns.
names = [x[0] for x in result]
teams = [x[1][0] for x in result]
redsox_count = [x[1][1] for x in result]
cardinals_count = [x[1][2] for x in result]

# Combine in a data frame.
result = pd.DataFrame(index=names,
                      data={'teams': teams,
                            'redsox_count': redsox_count,
                            'cardinals_count': cardinals_count})


%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import rcParams
rcParams['figure.figsize'] = (10, 6)
rcParams['font.size'] = 14

# Average number of Red Sox and Cardinals friends, by fan affiliation.
print result.groupby('teams').mean()

# Histogram the affiliations of the friends of Red Sox fans.
plt.hist(result.redsox_count[result.teams == "Red Sox"],
         label="Red Sox friends of Red Sox fans")
plt.hist(result.cardinals_count[result.teams == "Red Sox"],
         label="Cardinals friends of Red Sox fans")
plt.xlabel('number of friends')
plt.ylabel('count')
plt.legend(loc=0)
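

# A quick way to sanity-check the jobs above without Hadoop or EMR is to drive
# them from Python with mrjob's inline runner.  The block below is a minimal
# sketch, assuming an mrjob release of the same vintage as the code above
# (one that still provides runner.stream_output() and parse_output_line();
# newer releases replace these with cat_output() and parse_output()).  The
# input text is made up purely for this check.
from StringIO import StringIO

test_input = StringIO("one fish two fish\nred fish blue fish\n")

# '-r inline' runs the job in-process; '--no-conf' skips any mrjob.conf file.
mr_job = MRWordFrequencyCount(args=['-r', 'inline', '--no-conf'])
mr_job.sandbox(stdin=test_input)

with mr_job.make_runner() as runner:
    runner.run()
    for line in runner.stream_output():
        key, value = mr_job.parse_output_line(line)
        print key, value

# This should print one line per key ("chars", "words", "lines") with its
# total over the two made-up input lines.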