Chiaroscuro Detector

This program counts the number of words in a text that pertain to lightness or darkness, as determined by the Princeton Wordnet hyponym tree for those concepts. See the appendices at the bottom of this page for a list of words used.

In [410]:
# Display plots in this window.  
%matplotlib inline

import nltk #using the Python Natural Language Toolkit
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import wordnet

#Logging
import logging
logger = logging.getLogger()
logger.setLevel(logging.WARN)

def readText(filename): 
    """ 
    read custom text file (text of novel from Project Gutenberg) 
    and return NLTK Text object
    """
    logging.debug('Opening text file...')
    textfile=open(filename,'rU').read() 
    #tokens = nltk.word_tokenize(file)
    #text = nltk.Text(tokens)
    logging.debug('Text file read successfully.')
    
    logging.debug('Tokenizing text.')
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(textfile)
    logging.debug('Tokenizing completed.')
    return tokens

def cleanText(tokens): 
    """
    clean the text by removing puncuation, making all tokens lowercase, and lemmatizing. 
    """ 
    logging.debug('Cleaning text...')
    # normalize words by making them all lowercase
    tokensLower = [word.lower() for word in tokens] 
    from nltk.corpus import stopwords
    englishStopwords = stopwords.words('english')
    tokensNostops = [word for word in tokensLower if word not in englishStopwords]
    return tokensNostops

def flattenList(biglist): 
    return [item for sublist in biglist for item in sublist]

def replaceUnderscores(wordList): 
    newList = [] 
    for word in wordList: 
        newWord = word.replace('_', ' ')
        newList.append(newWord)
    return newList

def getDarkAndLightWords(): 
    """ 
    Uses wordnet to generate words pertaining to lightness and darkness, 
    specifically, hyponyms of "dark," "darkness," "light," and "lightness." 
    """
    darks = wordnet.synsets('dark') + wordnet.synsets('darkness')
    lights = wordnet.synsets('light') + wordnet.synsets('lightness')
    darksSyns = [syn.hyponyms() for syn in darks]
    lightsSyns = [syn.hyponyms() for syn in lights]
    
    # Flatten the lists, so that it returns a one-dimensional array. 
    flatDarks = flattenList(darksSyns)
    flatLights = flattenList(lightsSyns)
    
    # Get all word forms for all of the words in our synsets. 
    darksLemmas = [[str(lemma.name()) for lemma in flatDark.lemmas()] for flatDark in flatDarks] 
    lightsLemmas = [[str(lemma.name()) for lemma in flatLight.lemmas()] for flatLight in flatLights]
    darksList = flattenList(darksLemmas)
    lightsList = flattenList(lightsLemmas)
    
    # Turn underscores into spaces. 
    finalDarkWords = replaceUnderscores(darksList)
    finalLightWords = replaceUnderscores(lightsList)
    
    # Let's take all bigrams and break them up, since a search for "star" will match on 
    # "shooting star," anyway, and "beam of light" will be matched by searching for the words 
    # "beam" and "light." We'll filter out non-unique words, too, since breaking up "beam of light" 
    # will give us an extra "beam" and an extra "light." 
    curatedDarkWords = list(set(flattenList([word.split(' ') for word in finalDarkWords])))
    curatedLightWords = list(set(flattenList([word.split(' ') for word in finalLightWords])))
    
    wordsToRemove = ['fairy', 'fatuus', 'priming', 'room', 'pocket', 'buoyancy', 'euphoria', 'fuse', 'signal', 'sconce', 'friars', 'of', 'theater', 'ignuus']
    wordsToAdd = ['brightness', 'bright', 'light', 'sun', 'sunshine', 'sunlight', 'sunlit', 'sunstruck', 'ablaze']
    for word in wordsToRemove: 
        if word in curatedLightWords: 
            curatedLightWords.remove(word)
    for word in wordsToAdd: 
        if word not in curatedLightWords: 
            curatedLightWords.append(word)

    darkWordsToRemove = ['wedding', 'weeknight']
    darkWordsToAdd = ['dim', 'fog', 'dark', 'shadow', 'shade', 'fog', 'dingy', 'dismal', 'gloomy', 'gloom', 'black']

    for word in darkWordsToRemove: 
        if word in curatedDarkWords: 
            curatedDarkWords.remove(word)
    for word in darkWordsToAdd: 
        if word not in curatedDarkWords: 
            curatedDarkWords.append(word)
    
    return curatedDarkWords, curatedLightWords
In [413]:
def countChiaroscuro(filename): 
    tokens = readText(filename)
    logging.debug('Creating text object...')
    text = nltk.Text(tokens)
    newtokens = cleanText(tokens)
    
    # get lists of dark and light words 
    finalDarkWords, finalLightWords = getDarkAndLightWords()
    
    # initialize variables
    wordCountBright=0
    wordCountDark=0

    #count words cumulatively
    for word in finalLightWords: 
        wordCountBright=wordCountBright+text.count('word')
    for word in finalDarkWords:
        wordCountDark=wordCountDark+text.count('word')

    totalWords=len(text)
    print("\nTotals for text " + filename + ":")
    print("Total words in text: "+str(totalWords))
    print("Bright words: "+str(wordCountBright))
    print("Dark words: "+str(wordCountDark))
    wordCountBright=float(wordCountBright) #python 2 needs these to be floating point in order to divide them properly
    wordCountDark=float(wordCountDark) 
    totalWords=float(totalWords) 
    proportionBright=(wordCountBright/totalWords)
    proportionDark=(wordCountDark/totalWords)
    combinedProportion=(proportionBright+proportionDark)*100
    print("Proportion of bright words: " + str(proportionBright)) 
    print("Proportion of dark words: " + str(proportionDark))
    print("Combined proportion, as percentage (x100): " + str(combinedProportion))
    return proportionBright, proportionDark
    
def plotChiaroscuro(textsToAnalyze, textLabels, setLabel): 
    import numpy as np
    import matplotlib.pyplot as plt
    plt.figure(num=None, figsize=(10, 6), dpi=80, facecolor='w', edgecolor='k')

    texts = [countChiaroscuro(text) for text in textsToAnalyze] 
    brights = [x[0] for x in texts]
    darks = [x[1] for x in texts]
    maxSum = max([sum(x) for x in texts])
    
    N = len(texts)  # number of texts

    ind = np.arange(N)    # the x locations for the groups
    width = 0.5       # the width of the bars: can also be len(x) sequence
    opacity = 0.4

    p1 = plt.bar(ind, brights, width, color='r', alpha=opacity)
    p2 = plt.bar(ind, darks, width, color='b', alpha=opacity, bottom=brights)

    plt.ylabel('Scores')
    plt.title('Proportions of Light and Dark Words in ' + setLabel)
    plt.xticks(ind+width/2., textLabels)
    plt.yticks(np.arange(0,maxSum,maxSum/10))
    plt.legend( (p1[0], p2[0]), ('Bright Words', 'Dark Words') )

    plt.show()
In [405]:
setLabel = "Dickens Novels" 
textsToAnalyze = ['pickwick.txt', 'ot-text.txt', 'nn-text.txt', 'bh-text.txt', 'ht-text.txt']
textLabels = ('Pickwick', 'Oliver', 'Nickelby', 'Bleak', 'Hard', 'Cities')
plotChiaroscuro(textsToAnalyze, textLabels, setLabel)
Totals for text pickwick.txt:
Total words in text: 313359
Bright words: 11110
Dark words: 2310
Proportion of bright words: 0.035454542553429134
Proportion of dark words: 0.007371736570514969
Combined proportion, as percentage (x100): 4.28262791239441

Totals for text ot-text.txt:
Total words in text: 162312
Bright words: 8686
Dark words: 1806
Proportion of bright words: 0.05351421952782296
Proportion of dark words: 0.011126718911725566
Combined proportion, as percentage (x100): 6.464093843954853

Totals for text nn-text.txt:
Total words in text: 331041
Bright words: 20301
Dark words: 4221
Proportion of bright words: 0.06132473016937479
Proportion of dark words: 0.012750686470860105
Combined proportion, as percentage (x100): 7.40754166402349

Totals for text bh-text.txt:
Total words in text: 362309
Bright words: 12928
Dark words: 2688
Proportion of bright words: 0.035682249129886365
Proportion of dark words: 0.007419081502253601
Combined proportion, as percentage (x100): 4.310133063213996

Totals for text ht-text.txt:
Total words in text: 109113
Bright words: 6767
Dark words: 1407
Proportion of bright words: 0.06201827463272021
Proportion of dark words: 0.012894888785021033
Combined proportion, as percentage (x100): 7.491316341774123
In [406]:
setLabel = "Misc Victorian Novels" 
textsToAnalyze = ['bh-text.txt', 'mm-text.txt', 'ud-text.txt', 'ts-text.txt', 'pp-text.txt']
textLabels = ('Bleak House', 'Middlemarch', 'Mysteries of Udolpho', 'Turn of the Screw', 'Pride and Prejudice')
plotChiaroscuro(textsToAnalyze, textLabels, setLabel)
Totals for text bh-text.txt:
Total words in text: 362309
Bright words: 12928
Dark words: 2688
Proportion of bright words: 0.035682249129886365
Proportion of dark words: 0.007419081502253601
Combined proportion, as percentage (x100): 4.310133063213996

Totals for text mm-text.txt:
Total words in text: 326961
Bright words: 8888
Dark words: 1848
Proportion of bright words: 0.02718367022366582
Proportion of dark words: 0.005652050244524576
Combined proportion, as percentage (x100): 3.2835720468190397

Totals for text ud-text.txt:
Total words in text: 297288
Bright words: 3131
Dark words: 651
Proportion of bright words: 0.010531874814994215
Proportion of dark words: 0.0021897957536126584
Combined proportion, as percentage (x100): 1.2721670568606873

Totals for text ts-text.txt:
Total words in text: 46624
Bright words: 1616
Dark words: 336
Proportion of bright words: 0.034660260809883325
Proportion of dark words: 0.007206588881262869
Combined proportion, as percentage (x100): 4.186684969114619

Totals for text pp-text.txt:
Total words in text: 126070
Bright words: 4949
Dark words: 1029
Proportion of bright words: 0.039255968906163244
Proportion of dark words: 0.008162132148806218
Combined proportion, as percentage (x100): 4.741810105496946
In [407]:
setLabel = "Chapters of A Portrait of the Artist as a Young Man"  
textsToAnalyze = ['portrait/portrait-ch1.txt', 'portrait/portrait-ch2.txt', 'portrait/portrait-ch3.txt', 'portrait/portrait-ch4.txt', 'portrait/portrait-ch5.txt']
textLabels = ('Chapter 1', 'Chapter 2', 'Chapter 3', 'Chapter 4', 'Chapter 5')
plotChiaroscuro(textsToAnalyze, textLabels, setLabel)
Totals for text portrait/portrait-ch1.txt:
Total words in text: 17919
Bright words: 606
Dark words: 126
Proportion of bright words: 0.03381885149840951
Proportion of dark words: 0.007031642390758413
Combined proportion, as percentage (x100): 4.085049388916792

Totals for text portrait/portrait-ch2.txt:
Total words in text: 14905
Bright words: 808
Dark words: 168
Proportion of bright words: 0.054209996645421
Proportion of dark words: 0.011271385441127138
Combined proportion, as percentage (x100): 6.548138208654813

Totals for text portrait/portrait-ch3.txt:
Total words in text: 16658
Bright words: 808
Dark words: 168
Proportion of bright words: 0.04850522271581222
Proportion of dark words: 0.010085244327050066
Combined proportion, as percentage (x100): 5.859046704286229

Totals for text portrait/portrait-ch4.txt:
Total words in text: 9651
Bright words: 404
Dark words: 84
Proportion of bright words: 0.04186094705211895
Proportion of dark words: 0.008703761268262356
Combined proportion, as percentage (x100): 5.056470832038131

Totals for text portrait/portrait-ch5.txt:
Total words in text: 26739
Bright words: 2525
Dark words: 525
Proportion of bright words: 0.09443135494969894
Proportion of dark words: 0.019634242118254236
Combined proportion, as percentage (x100): 11.406559706795317

Appendix: Words Counted in this Analysis

Dark Synsets

In [271]:
for sense in darks:
    print(sense.name() + ': ' + sense.definition())   
dark.n.01: absence of light or illumination
iniquity.n.01: absence of moral or spiritual values
darkness.n.02: an unilluminated area
night.n.01: the time after sunset and before sunrise while it is dark outside
dark.n.05: an unenlightened state
dark.a.01: devoid of or deficient in light or brightness; shadowed or black
dark.a.02: (used of color) having a dark hue
dark.s.03: brunet (used of hair or skin or eyes)
black.s.05: stemming from evil characteristics or forces; wicked or dishonorable
dark.s.05: secret
dark.s.06: showing a brooding ill humor
benighted.s.02: lacking enlightenment or knowledge or culture
dark.s.08: marked by difficulty of style or expression
blue.s.08: causing dejection
colored.s.02: having skin rich in melanin pigments
dark.s.11: not giving performances; closed
dark.n.01: absence of light or illumination
darkness.n.02: an unilluminated area
iniquity.n.01: absence of moral or spiritual values
dark.n.05: an unenlightened state
darkness.n.05: having a dark or somber color
darkness.n.06: a swarthy complexion

Light Sensets

In [273]:
for sense in lights: 
    print(sense.definition())
(physics) electromagnetic radiation that can produce a visual sensation
any device serving as a source of illumination
a particular perspective or aspect of a situation
the quality of being luminous; emitting or reflecting light
an illuminated area
a condition of spiritual awareness; divine illumination
the visual effect of illumination on objects or scenes as created in pictures
a person regarded very fondly
having abundant light or illumination
mental understanding as an enlightening experience
merriment expressed by a brightness or gleam or animation of countenance
public awareness
a divine presence believed by Quakers to enlighten and guide the soul
a visual warning signal
a device for lighting or igniting fuel or charges or fires
make lighter or brighter
begin to smoke
to come to rest, settle
cause to start burning; subject to fire or great heat
fall to somebody by assignment or lot
alight from (a horse)
of comparatively little physical weight or density
(used of color) having a relatively small amount of coloring agent
of the military or industry; using (or being) relatively small or light arms or equipment
not great in degree or quantity or number
psychologically light; especially free from sadness or troubles
characterized by or emitting light
(used of vowels or syllables) pronounced with little or no stress
easily assimilated in the alimentary canal; not rich or heavily seasoned
(used of soil) loose and large-grained in consistency
(of sound or color) free from anything that dulls or dims
moving easily and quickly; nimble
demanding little effort; not burdensome
of little intensity or power or force
(physics, chemistry) not having atomic weight greater than average
weak and likely to lose consciousness
very thin and insubstantial
marked by temperance in indulgence
less than the correct or legal or full amount often deliberately so
having little importance
intended primarily as entertainment; not serious or profound
silly or trivial
designed for ease of movement or to carry little weight
having relatively few calories
(of sleep) easily disturbed
casual and unrestrained in sexual behavior
with few burdens
a feeling of joy and pride
the property of being comparatively small in weight
the gracefulness of a person or animal that is quick and nimble
having a light color
the visual effect of illumination on objects or scenes as created in pictures
the trait of being lighthearted and frivolous

Final Words Counted

In [408]:
curatedDarkWords
Out[408]:
['darkness',
 'night',
 'dimout',
 'blackness',
 'semidarkness',
 'blackout',
 'brownout',
 'lightlessness',
 'black',
 'pitch',
 'total',
 'foulness',
 'dim',
 'fog',
 'dark',
 'shadow',
 'shade',
 'dingy',
 'dismal',
 'gloomy',
 'gloom']
In [409]:
curatedLightWords
Out[409]:
['lighting',
 'irradiation',
 'aureole',
 'moonlight',
 'incandescence',
 'blinker',
 'running',
 'flood',
 'headlight',
 'sunlight',
 'gegenschein',
 'gloriole',
 'luminescence',
 'stoplight',
 'flame',
 'lamp',
 'photoflood',
 'lantern',
 'illumination',
 'brightness',
 'highlighting',
 'aura',
 'primer',
 'starlight',
 'Moon',
 'fuze',
 'counterglow',
 'glow',
 'gaslight',
 'headlamp',
 'sunshine',
 'corona',
 'half-light',
 'kindle',
 'strip',
 'fuzee',
 'torch',
 'cigarette',
 'candle',
 'fire',
 'spotlight',
 'up',
 'daylight',
 'friction',
 'fluorescence',
 'flasher',
 'houselights',
 'beam',
 'sidelight',
 'reignite',
 "friar's",
 'sun',
 'meteor',
 'twilight',
 'torchlight',
 'shaft',
 'searchlight',
 'conflagrate',
 'ray',
 'radiance',
 'traffic',
 'lighter',
 'jacklight',
 'light',
 'shooting',
 'glory',
 'anchor',
 'streamer',
 'nimbus',
 'candlelight',
 'sunniness',
 'lamplight',
 'halo',
 'flare',
 'highlight',
 "will-o'-the-wisp",
 'airiness',
 'navigation',
 'inflame',
 'ignis',
 'enkindle',
 'euphory',
 'glowing',
 'moonshine',
 'illuminance',
 'firelight',
 'floodlight',
 'night-light',
 'star',
 'lucifer',
 'match',
 'scintillation',
 'cigar',
 "jack-o'-lantern",
 'fusee',
 'panel',
 'riding',
 'bright',
 'sunlit',
 'sunstruck',
 'ablaze']