Catch 22 - Character Appearances

In [1]:
import re
import itertools
import string
import csv

# Read the raw text and split it into lines
with open('/bigdrive/Documents/MSAN622_Data_Visualization/msan622/project-prototype/Catch-22.txt') as FILE:
    data = FILE.read().split("\n")

# Find chapters, skipping intro and appendix
chapters = {}
key = False
for line in data:
    # Find chapter markers and make new dictionary entry
    if re.match(r'^[0-9]+ [A-Za-z0-9-&\'. ]+$', line) is not None:
        key = int(line[0:2])
        chapters[key] = []
    # If we're inside a chapter, append its lowercased, punctuation-stripped words to the dictionary
    elif key:
        chapters[key].append(line.lower().translate(string.maketrans("",""), string.punctuation).split())
    # Stop at the end of the book
    if line == 'APPENDIX':
        break
# Flatten each chapter's per-line word lists into a single list of words
for chapter in chapters:
    chapters[chapter] = list(itertools.chain(*chapters[chapter]))
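
A quick sanity check on the parse (a sketch; the expected count assumes a standard edition of the text, which has 42 numbered chapters):

In [ ]:
print len(chapters)     # expect 42 chapters
print chapters[1][:10]  # first ten cleaned words of chapter 1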
In [2]:
# Now look for occurrences of the main characters in the book
char_names = {'yossarian':"Yossarian",
              'chaplain':"Chaplain Tappman",
              'milo':"Milo Minderbinder",
              'cathcart':"Colonel Cathcart",
              'korn':"Colonel Korn",
              'nately':"Nately",
              'orr':"Orr",
              'major':"Major Major Major Major",
              'dunbar':"Dunbar",
              'daneeka':"Doc Daneeka",
              'joe':"Hungry Joe",
              'clevinger':"Clevinger",
              'aarfy':"Aarfy",
              'dreedle':"General Dreedle",
              'danby':"Major Danby",
              'mcwatt':"McWatt",
              'scheisskopf':"General Scheisskopf",
              'peckem':"General Peckem",
              'dobbs':"Dobbs",
              'whitcomb':"Corporal Whitcomb",
              'black':"Captain Black",
              'halfoat':"Chief White Halfoat",
              'duckett':"Nurse Duckett",
              'coverley':"Major — de Coverley",
              'wintergreen':"ex-P.F.C. Wintergreen",
              'appleby':"Appleby",
              'havermeyer':"Havermeyer",
              'snowden':"Snowden"}
# Loop through characters and chapters, indexing an appearance by its percentile position
# within a chapter, e.g. word 12 of a 1,200-word chapter 2 encodes as 2 + 12/1200 = 2.01
characters = {character: [] for character in char_names}
for character in characters:
    for chapter in chapters:
        length = len(chapters[chapter])
        # Special handling for Major Major (full name 'Major Major Major Major'): find
        # every index where the 'major major' bigram starts, then collapse runs of
        # consecutive matches into one appearance (0 is a sentinel closing the last run)
        if character == 'major':
            b = ['major','major']
            location = [i for i in range(len(chapters[chapter])) if chapters[chapter][i:i+len(b)] == b]
            location.append(0)
            location = [location[i] for i in range(len(location) - 1) if location[i] != location[i+1] - 1]
            location = [(chapter + (float(x)/length)) for x in location]
        # Special handling for Captain Black: match the 'captain black' bigram
        elif character == 'black':
            b = ['captain','black']
            location = [(chapter + (float(i)/length)) for i in range(len(chapters[chapter])) if
                        chapters[chapter][i:i+len(b)] == b]
        else:
            location = [(chapter + (float(i)/length)) for i, x in enumerate(chapters[chapter]) if 
                        x == character]
        characters[character].append(location)
    # Flatten, remove duplicates (only relevant if binning locations) and sort
    characters[character] = sorted(set(itertools.chain(*characters[character])))

# Print a summary of appearance counts; the char_names dictionary above was limited
# by hand to characters appearing 50+ times
for char in sorted(characters):
    print char, len(characters[char])
aarfy 130
appleby 55
black 70
cathcart 310
chaplain 446
clevinger 131
coverley 59
danby 127
daneeka 150
dobbs 82
dreedle 128
duckett 61
dunbar 169
halfoat 69
havermeyer 52
joe 141
korn 214
major 183
mcwatt 116
milo 393
nately 205
orr 185
peckem 102
scheisskopf 115
snowden 52
whitcomb 78
wintergreen 56
yossarian 1347
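
The 50-appearance cut mentioned above was applied by hand when assembling char_names; a one-line sketch to enforce it programmatically instead:

In [ ]:
# Optional: drop characters with fewer than 50 recorded appearances (the
# hand-picked list above already satisfies this cut)
characters = {c: v for c, v in characters.items() if len(v) >= 50}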
In [10]:
# Now load it into a melted CSV file with characters and their appearance times
with open('catch22.csv', 'wb') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',',quotechar='"', quoting=csv.QUOTE_MINIMAL)
    headers = ['Character', 'Chapter']
    csvwriter.writerow(headers)
    for character in characters:
        for location in characters[character]:
            this_row = [char_names[character], location]
            csvwriter.writerow(this_row)
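
Reading the file back confirms the melted layout, one row per character appearance:

In [ ]:
# Sanity check: print the header plus the first two data rows
with open('catch22.csv', 'rb') as csvfile:
    for row in list(csv.reader(csvfile))[:3]:
        print row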

Catch 22 - Locations

In [3]:
# Now look for occurrences of the main locations visited in the book
locations = {'pianosa':'Pianosa, Italy',
             'rome':'Rome, Italy',
             'smyrna':'Smyrna, Turkey',
             'corsica':'Corsica, France',
             'parma':'Parma, Italy',
             'salerno':'Salerno, Italy',
             'marrakech':'Marrakech, Morocco',
             'malta':'Valletta, Malta',
             'cairo':'Cairo, Egypt', 
             'sicily':'Sicily, Italy', 
             'istanbul':'Istanbul, Turkey', 
             'etna':'Mt Etna, Italy',
             'vesuvius':'Mt Vesuvius, Italy',
             'palermo':'Palermo, Italy', 
             'catania':'Catania, Italy', 
             'oran':'Oran, Algeria',
             'beirut':'Beirut, Lebanon',
             'bengasi':'Bengasi, Libya',
             'sardinia':'Sardinia, Italy',
             'barcelona':'Barcelona, Spain',
             'leghorn':'Livorno, Italy',
             'marseilles':'Marseilles, France',
             'spezia':'Spezia, Italy',
             'majorca':'Majorca, Spain',
             'elba':'Elba, Italy',
             'ferrara':'Ferrara, Italy',
             'bologna':'Bologna, Italy',
             'arezzo':'Arezzo, Italy',
             'avignon':'Avignon, France'}
# Use OpenStreetMap's Nominatim service to geocode the cities
from geopy.geocoders import Nominatim
geolocator = Nominatim(timeout=10)
loc_geo = {}
for locale in sorted(locations):
    address, (latitude, longitude) = geolocator.geocode(locations[locale])
    loc_geo[locale] = (latitude, longitude)

# Loop through locations and chapters, indexing a mention by its percentile position
# within a chapter, e.g. word 12 of a 1,200-word chapter 2 encodes as 2 + 12/1200 = 2.01
loc_times = {locale: [] for locale in locations}
for locale in locations:
    for chapter in chapters:
        length = len(chapters[chapter])
        location = [(chapter + (float(i)/length)) for i, x in enumerate(chapters[chapter]) if 
                    x == locale]
        loc_times[locale].append(location)
    # Flatten, remove duplicates (only relevant if binning locations) and sort
    loc_times[locale] = sorted(set(itertools.chain(*loc_times[locale])))
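
The public Nominatim endpoint asks for at most about one request per second; if the geocoding loop above fails intermittently, pausing between calls (a variant sketch, not part of the original run) usually fixes it:

In [ ]:
import time
# Re-run the geocoding loop with a pause to respect Nominatim's usage policy
loc_geo = {}
for locale in sorted(locations):
    address, (latitude, longitude) = geolocator.geocode(locations[locale])
    loc_geo[locale] = (latitude, longitude)
    time.sleep(1)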
In [ ]:
# Now load it into a melted CSV file with locations, their mention times, and the geo-coded location
with open('catch22geo.csv', 'wb') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',',quotechar='"', quoting=csv.QUOTE_MINIMAL)
    headers = ['Location', 'Time', 'Lat', 'Lon']
    csvwriter.writerow(headers)
    for locale in sorted(locations):
        for t in loc_times[locale]:
            this_line = [locale, t, loc_geo[locale][0], loc_geo[locale][1]]
            csvwriter.writerow(this_line)

Catch 22 - Most Used Words Around Yossarian

In [5]:
import nltk
from nltk.tag.simplify import simplify_wsj_tag
# Now look for the words surrounding our main character
yo_words = {'words': [], 'locs': []}
for chapter in chapters:
    length = len(chapters[chapter])
    location = [i for i, x in enumerate(chapters[chapter]) if x == 'yossarian']
    # Expand each hit to a window of 25 words either side; this just gets indexes
    locations = [range(max(0,(i-25)),min(len(chapters[chapter]),(i+26))) for i in location]
    # Remove duplicates from overlapping windows and sort to restore reading order,
    # since pos_tag below is sensitive to word order
    locations = sorted(set(itertools.chain(*locations)))
    # Grab the words and store to dictionary
    words = [chapters[chapter][i] for i in locations]
    locations = [(chapter + (float(x)/length)) for x in locations]
    yo_words['words'].append(words)
    yo_words['locs'].append(locations)
    
# Clean up broken lists
yo_words['words'] = list(itertools.chain(*yo_words['words']))
yo_words['locs'] = list(itertools.chain(*yo_words['locs']))
yo_words['words'] = nltk.pos_tag(yo_words['words'])
yo_words['words'] = [(word, simplify_wsj_tag(tag)) for word, tag in yo_words['words']]
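
simplify_wsj_tag exists only in NLTK 2.x; on NLTK 3+ (an assumption about the installed version) the closest equivalent is the universal tagset mapping, used in place of the last line above:

In [ ]:
# NLTK 3.x alternative to the removed simplify_wsj_tag: map Penn Treebank
# tags to the universal tagset (e.g. 'NN' -> 'NOUN')
from nltk.tag import map_tag
yo_words['words'] = [(word, map_tag('en-ptb', 'universal', tag))
                     for word, tag in yo_words['words']]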
In [6]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
stop.extend(('said','thats','im','dont','got','get','say','youre'))

# Now load it into a melted CSV file with word, POS type and their mention times
with open('catch22pos.csv', 'wb') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',',quotechar='"', quoting=csv.QUOTE_MINIMAL)
    headers = ['Word', 'Time', 'POS']
    csvwriter.writerow(headers)
    for i in range(len(yo_words['locs'])):
        if yo_words['words'][i][0] not in stop and yo_words['words'][i][0] not in char_names:
            this_line = [yo_words['words'][i][0], yo_words['locs'][i], yo_words['words'][i][1]]
            csvwriter.writerow(this_line)
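
A quick frequency peek (not part of the original pipeline) shows which words dominate around Yossarian once stopwords and character names are filtered out:

In [ ]:
from collections import Counter
counts = Counter(w for w, tag in yo_words['words']
                 if w not in stop and w not in char_names)
print counts.most_common(10)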