from mrjob.job import MRJob


# A simple mrjob job that counts the characters, words, and lines in its input.
class MRWordFrequencyCount(MRJob):

    def mapper(self, _, line):
        yield "chars", len(line)
        yield "words", len(line.split())
        yield "lines", 1

    def reducer(self, key, values):
        yield key, sum(values)

if __name__ == '__main__':
    MRWordFrequencyCount.run()


# Call the reducer method on an MRWordFrequencyCount instance with an example
# key and iterable of values.  (Calling it on the class itself would fail,
# because no 'self' argument would be supplied.)
job = MRWordFrequencyCount(args=[])
my_key, my_values = "words", [3, 5, 2]      # example key and values
print list(job.reducer(my_key, my_values))  # reducer() returns a generator


import re

WORD_RE = re.compile(r"[\w']+")


# A two-step job that finds the most frequently used word in the input.
class MRMostUsedWord(MRJob):

    def mapper_get_words(self, _, line):
        # yield each word in the line
        for word in WORD_RE.findall(line):
            yield (word.lower(), 1)

    def combiner_count_words(self, word, counts):
        # optimization: sum the words we've seen so far
        yield (word, sum(counts))

    def reducer_count_words(self, word, counts):
        # send all (num_occurrences, word) pairs to the same reducer.
        # num_occurrences is so we can easily use Python's max() function.
        yield None, (sum(counts), word)

    # discard the key; it is just None
    def reducer_find_max_word(self, _, word_count_pairs):
        # each item of word_count_pairs is (count, word),
        # so yielding one results in key=counts, value=word
        yield max(word_count_pairs)

    def steps(self):
        return [
            self.mr(mapper=self.mapper_get_words,
                    combiner=self.combiner_count_words,
                    reducer=self.reducer_count_words),
            self.mr(reducer=self.reducer_find_max_word)
        ]

if __name__ == '__main__':
    MRMostUsedWord.run()


# This function converts a list (or any other iterable) into a generator.
def example_generator(items):
    for item in items:
        yield item

# Create a generator.
my_generator = example_generator([0, 1, 2, 3, 4])

# Iterating over the generator works great the first time...
print "generator iteration 1"
print "---------------------"
for value in my_generator:
    print value

# ...but not the second time: the generator is already exhausted.
print "\n"
print "generator iteration 2"
print "---------------------"
for value in my_generator:
    print value


# Quick check that numpy and matplotlib are installed and importable.
import numpy as np
print np.__name__

import matplotlib.pyplot as plt
print plt.__name__


# Run the job locally:
python MRMostUsedWord.py some_file.txt > most_used_word.out

# Run the job on Amazon Elastic MapReduce:
python MRMostUsedWord.py -r emr some_file.txt > most_used_word.out


# Load the word list, one word per line.
word_list = [word.strip() for word in open("word_list.txt").readlines()]
print "{0} words in list".format(len(word_list))
print "First ten words: {0}".format(", ".join(word_list[0:10]))


# Peek at the friends file: print the first line and the number of friends it
# lists (the first two fields are the person's name and team affiliation).
friends = open("baseball_friends.csv").readlines()
print friends[0].strip()
print len(friends[0].split(",")) - 2


import pandas as pd
import json

# Read results.  Each mrjob output line is two tab-separated JSON fields:
# the key (a name) and the value (a [team, redsox_count, cardinals_count] list).
result_file = "baseball_friends.out"
result = [[json.loads(field) for field in line.strip().split('\t')]
          for line in open(result_file)]

# Break out columns.
names = [x[0] for x in result]
teams = [x[1][0] for x in result]
redsox_count = [x[1][1] for x in result]
cardinals_count = [x[1][2] for x in result]

# Combine in a data frame.
result = pd.DataFrame(index=names,
                      data={'teams': teams,
                            'redsox_count': redsox_count,
                            'cardinals_count': cardinals_count})


%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import rcParams
rcParams['figure.figsize'] = (10, 6)
rcParams['font.size'] = 14

# Average number of Red Sox and Cardinals friends, by fan affiliation.
print result.groupby('teams').mean()

# Histogram the affiliations of the friends of Red Sox fans.
plt.hist(result.redsox_count[result.teams == "Red Sox"],
         label="Red Sox friends of Red Sox fans")
plt.hist(result.cardinals_count[result.teams == "Red Sox"],
         label="Cardinals friends of Red Sox fans")
plt.xlabel('number of friends')
plt.ylabel('count')
plt.legend(loc=0)
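

# A quick way to sanity-check the jobs above without Hadoop or EMR is to drive
# them from Python with mrjob's inline runner.  The block below is a minimal
# sketch, assuming an mrjob release of the same vintage as the code above
# (one that still provides runner.stream_output() and parse_output_line();
# newer releases replace these with cat_output() and parse_output()).  The
# input text is made up purely for this check.
from StringIO import StringIO

test_input = StringIO("one fish two fish\nred fish blue fish\n")

# '-r inline' runs the job in-process; '--no-conf' skips any mrjob.conf file.
mr_job = MRWordFrequencyCount(args=['-r', 'inline', '--no-conf'])
mr_job.sandbox(stdin=test_input)

with mr_job.make_runner() as runner:
    runner.run()
    for line in runner.stream_output():
        key, value = mr_job.parse_output_line(line)
        print key, value

# This should print one line per key ("chars", "words", "lines") with its
# total over the two made-up input lines.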