In [4]:
#coding=utf8
from __future__ import unicode_literals

import itertools
import re
In [5]:
import nltk
nltk.download('cmudict')
from nltk.corpus import cmudict
[nltk_data] Downloading package cmudict to /Users/tim/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!
In [3]:
# Full CMU pronouncing dictionary: word -> list of pronunciations,
# each pronunciation being a list of ARPABET phoneme strings.
d = cmudict.dict()
In [81]:
def lookup(w):
    """Return the first (primary) pronunciation of `w` from cmudict.

    Raises KeyError if the word is not in the dictionary.
    """
    pronunciations = d[w]
    return pronunciations[0]

This is the CMU pronouncing dictionary. For each word, it has a list of phonemes:

In [102]:
# Two sample words to experiment with before wiring up the full pipeline.
words = [lookup('hat'), lookup('failure')]
words
Out[102]:
[[u'HH', u'AE1', u'T'], [u'F', u'EY1', u'L', u'Y', u'ER0']]

First, let's get rid of the numbers — they mark stress (1 = primary, 2 = secondary, 0 = unstressed). We don't need 'em!

In [103]:
def strip_phonemes(phs):
    """Drop the trailing stress digits (0/1/2) from each phoneme symbol."""
    stripped = []
    for ph in phs:
        stripped.append(ph.rstrip('0123456789'))
    return stripped
In [104]:
# Remove the stress digits from every phoneme of every word.
words = map(strip_phonemes, words)
words
Out[104]:
[[u'HH', u'AE', u'T'], [u'F', u'EY', u'L', u'Y', u'ER']]

Some of these are diphthongs. Benjamin reckons we'd better split those up, so here goes. I looked at the docs for cmudict:

In [140]:
print("\n\n".join(cmudict.readme().split("\n\n")[3:-6]))
File Format: Each line consists of an uppercased word,
a counter (for alternative pronunciations), and a transcription.
Vowels are marked for stress (1=primary, 2=secondary, 0=no stress).
E.g.: NATURAL 1 N AE1 CH ER0 AH0 L

The dictionary contains 127069 entries.  Of these, 119400 words are assigned
a unique pronunciation, 6830 words have two pronunciations, and 839 words have
three or more pronunciations.  Many of these are fast-speech variants.

Phonemes: There are 39 phonemes, as shown below:
    
    Phoneme Example Translation    Phoneme Example Translation
    ------- ------- -----------    ------- ------- -----------
    AA      odd     AA D           AE      at      AE T
    AH      hut     HH AH T        AO      ought   AO T
    AW      cow     K AW           AY      hide    HH AY D
    B       be      B IY           CH      cheese  CH IY Z
    D       dee     D IY           DH      thee    DH IY
    EH      Ed      EH D           ER      hurt    HH ER T
    EY      ate     EY T           F       fee     F IY
    G       green   G R IY N       HH      he      HH IY
    IH      it      IH T           IY      eat     IY T
    JH      gee     JH IY          K       key     K IY
    L       lee     L IY           M       me      M IY
    N       knee    N IY           NG      ping    P IH NG
    OW      oat     OW T           OY      toy     T OY
    P       pee     P IY           R       read    R IY D
    S       sea     S IY           SH      she     SH IY
    T       tea     T IY           TH      theta   TH EY T AH
    UH      hood    HH UH D        UW      two     T UW
    V       vee     V IY           W       we      W IY
    Y       yield   Y IY L D       Z       zee     Z IY
    ZH      seizure S IY ZH ER

(For NLTK, entries have been sorted so that, e.g. FIRE 1 and FIRE 2
are contiguous, and not separated by FIRE'S 1.)

Based on this, Benjamin came up with the following table:

In [141]:
# Two-phoneme approximations for diphthongs (AW, OW, EY, AY, OY) and
# affricates (JH, CH), so every remaining symbol is a single "pure"
# sound.  Phonemes not listed here pass through dipthify unchanged.
dipthongs_etc = {
    'AW': ['AE', 'UW'],
    'OW': ['AO', 'UW'],
    'EY': ['EH', 'IY'],
    'AY': ['AA', 'IY'],
    'OY': ['AO', 'IY'],
    'JH': ['D', 'ZH'],
    'CH': ['T', 'SH'],
}
In [142]:
def dipthify(words):
    """Expand each diphthong/affricate into its two-phoneme approximation.

    Phonemes with no entry in dipthongs_etc are kept as-is.
    """
    expanded = []
    for ph in words:
        expanded.extend(dipthongs_etc.get(ph, [ph]))
    return expanded
In [143]:
words = map(dipthify, words)
words
Out[143]:
[[u'HH', u'AE', u'T'], [u'F', u'EH', u'IY', u'L', u'Y', u'ER']]

We could use reversed() to get the result, but it's hard to read…

In [144]:
list(reversed([list(reversed(w)) for w in words]))
Out[144]:
[[u'ER', u'Y', u'L', u'IY', u'EH', u'F'], [u'T', u'AE', u'HH']]

IPA output

Benjamin can read IPA, so let's try that!

In [112]:
# ARPABET -> IPA, covering all 39 cmudict phonemes (stress digits
# already stripped).  The diphthong/affricate entries (AW, EY, JH, ...)
# are kept even though dipthify normally splits those up first.
ipa = {
    'AA': 'ɑː',
    'AH': 'ʌ',
    'AW': 'ou',
    'B':  'b',
    'D':  'd',
    'EH': 'e',
    'EY': 'eɪ',
    'G':  'g',
    'IH': 'ɪ',
    'JH': 'dʒ',
    'L':  'l',
    'N':  'n',
    'OW': 'əʊ',
    'P':  'p',
    'S':  's',
    'T':  't',
    'UH': 'ʊ',
    'V':  'v',
    'Y':  'j',
    'ZH': 'ʒ',
    'AE': 'æ',
    'AO': 'ɔː',
    'AY': 'ʌɪ',
    'CH': 'tʃ',
    'DH': 'ð',
    'ER': 'əː',
    'F':  'f',
    'HH': 'h',
    'IY': 'iː',
    'K':  'k',
    'M':  'm',
    'NG': 'ŋ',
    'OY': 'ɔɪ',
    'R':  'r',
    'SH': 'ʃ',
    'TH': 'θ',
    'UW': 'uː',
    'W':  'w',
    'Z':  'z',
}
In [121]:
def ipaify(word):
    """Render a sequence of stress-stripped ARPABET phonemes as one IPA string.

    An unknown phoneme now falls through unchanged; previously
    ipa.get returned None for it and ''.join raised TypeError.
    """
    return ''.join(ipa.get(ph, ph) for ph in word)

# Reverse the phonemes within each word, render as IPA, then reverse the
# word order too, so the whole utterance reads backwards.
result_ipa = ' '.join(reversed(map(ipaify, map(reversed, words))))
print(result_ipa)
əːjliːef tæh

I can't read that, but it seemed to sound ok when Benjamin tried it!

English output

It'd be nice if we could make a version which uses English words where possible…

So let's reverse the cmudict to get a dictionary mapping phonemes to words.

In [126]:
# Invert the dictionary: map a tuple of stress-stripped phonemes back to
# one word with that pronunciation.
# NOTE: the original used the module-level word_pat, which is only
# defined in a *later* cell — a NameError on Restart & Run All — so
# compile the pattern here.  [A-Za-z] rather than the classic [A-z]
# mistake, which also matches the six punctuation characters between
# 'Z' and 'a' in ASCII.
letters_pat = re.compile(u'([A-Za-z]+)')

backwards = {}
for word, pronunciations in d.items():
    match = letters_pat.search(word)
    if match:  # trim keys like "fire(2)" down to their first letter run
        word = match.group(1)
    for phonemes in pronunciations:
        key = tuple(strip_phonemes(phonemes))
        # Last writer wins when several words share a pronunciation.
        backwards[key] = word
In [167]:
# Sanity check: the phonemes of 'ra', reversed, should decode to 'are'.
backwards[tuple(reversed(strip_phonemes(lookup('ra'))))]
Out[167]:
u'are'
In [ ]:
We need single phonemes to fall back on:
In [122]:
# ARPABET -> rough English spelling, used as the last-resort rendering
# for a single phoneme that no real word covers.
english = {
    'AA': 'o', # 'ah' not 'aw'
    'AH': 'uh',
    'AW': 'ow',
    'B':  'b',
    'D':  'd',
    'EH': 'eh',
    'EY': 'ay',
    'G':  'g',
    'IH': 'ih',
    'JH': 'jh',
    'L':  'l',
    'N':  'n',
    'OW': 'oah',
    'P':  'p',
    'S':  's',
    'T':  't',
    'UH': 'ooh',
    'V':  'v',
    'Y':  'y',
    'ZH': 'zz',
    'AE': 'aa',
    'AO': 'aww',
    'AY': 'eye',
    'CH': 'ch',
    'DH': 'th',
    'ER': 'er',
    'F':  'f',
    'HH': 'h',
    'IY': 'ee', # was 'e'; 'ee' reads better
    'K':  'k',
    'M':  'm',
    'NG': 'ng',
    'OY': 'oy',
    'R':  'r',
    'SH': 'sh',
    'TH': 'th',
    'UW': 'ooo', # was 'ou'; see bugreport at the end
    'W':  'w',
    'Z':  'z',
}
In [168]:
# Register every single phoneme under a 1-tuple key: engify looks up
# TUPLES of phonemes, so the original string keys ('AA', ...) could
# never match and these fallback entries were dead.  setdefault keeps
# any real word that already owns the key (e.g. ('ER',) -> u'are'), so
# existing output is unchanged.
for phoneme, word in english.items():
    backwards.setdefault((phoneme,), word)

Then we can do a dumb greedy search.

In [169]:
def engify(word):
    """Greedily translate a phoneme sequence into hyphen-joined chunks.

    Repeatedly takes the longest prefix of the remaining phonemes that
    matches a word in `backwards`; when nothing matches, the first
    phoneme is spelled out via the `english` fallback table.
    """
    pieces = []
    remaining = list(word)
    while remaining:
        matched = False
        for size in range(len(remaining), 0, -1):
            candidate = tuple(remaining[:size])
            if candidate in backwards:
                pieces.append(backwards[candidate])
                remaining = remaining[size:]
                matched = True
                break
        if not matched:
            pieces.append(english[remaining.pop(0)])
    return '-'.join(pieces)

# Same backwards rendering as the IPA version, but with English chunks.
result_english = ' '.join(reversed(map(engify, map(reversed, words))))
result_english
Out[169]:
u'are-y-leigh-f t-aa-h'

Put it all together

In [177]:
# [A-Za-z]+ rather than [A-z]+: the latter also matches the six
# punctuation characters between 'Z' and 'a' in ASCII.
word_pat = re.compile(u'([A-Za-z]+)')

text = unicode("I have milk, eggs and juice if any of that suits")
# split() on a capturing pattern keeps the separators; filter keeps only
# the pieces that are actually words.
raw_words = filter(word_pat.match, word_pat.split(text))

words = []  # really lists of phonemes, one list per word
for w in raw_words:
    try:
        # Take the first (primary) pronunciation of each word.
        words.append(d[w.lower()][0])
    except KeyError:
        # NOTE(review): UnknownWord is defined in a later cell; run that
        # cell first or this raises NameError instead.
        raise UnknownWord(w)

words = map(strip_phonemes, words)
words = map(dipthify, words)

# Reverse the phonemes inside each word, then the word order, and render.
result_english = ' '.join(reversed(map(engify, map(reversed, words))))
result_ipa = ' '.join(reversed(map(ipaify, map(reversed, words))))

print(result_english)
print(result_ipa)
stu-s t-aa-th v-uhh e-n-eh f-ih sioux-zz-d d-n-uhh z-g-eh clim v-aa-h e-awe
stuːs tæð vʌ iːne fɪ suːʒd dnʌ zge klɪm væh iːɑː
bugreport: ɑː is pronounced “ah”, not “aw”
and “oo” would be better than “ou” for uː
and for that matter,
In [ ]:
class UnknownWord(Exception):
    """Raised when a word has no entry in the CMU pronouncing dictionary."""
    pass