#!/usr/bin/env python
# coding: utf-8

# # Metropolis to Decode a Cipher
#
# We obtain the coded message:

# In[256]:

code = 'mprboydewyemo egfywhoptl gyed egybuyld yoel wyptgpxel gyrfyx tloemyretzyhbmpxp wyetyexx m oelpbtyldelydewyreuum gy xbtbkpwlwyetgyloeg owyldelyjpg tptqyqehydewyemeok gyldbw yjdbyjelxdyplyewyeywpqtemybuywlo wwyptyld yuptetxpemywfwl kybld owydes yhptt gyplybtyeyw op wybuyl xdtpxemyuexlbowywnxdyewyopwptqywdboll okyg rlywem wyrfyld ynwyqbs otk tlyetgyt jyxbohboel ylevyhbmpxp wybld oykeoz lwyldelyxetyr ylehh gyuboygbmmeowyptxmngptqyldobnqdyld ywjehwykeoz lyetgympinpgplfympt wykeptlept gyrfyqmbremyx tloemyretzwyeo tlyf lywdbjptqyeyrpqygbmmeoywin c '
print(code)

# **Goal:** Decode this message.
#
# **Assumption:** The message was produced by a simple substitution cipher that replaces each character with some other character. The possible characters are a-z and the space ' '.

# ## Coding and Decoding with Dictionaries
#
# We will use Python dictionaries to represent coders and decoders. There are several other data structures one could use instead.
#
# Python dictionaries are key/value pairs.

# In[257]:

d = {'key1': 'value1', 'key2': 'value2'}

# In[258]:

print(d)

# In[259]:

d['key1']

# In[260]:

import string
import numpy as np
from numpy import random

# We collect all of the ASCII lowercase letters plus the ' ' symbol.

# In[261]:

letters = list(string.ascii_lowercase)
letters = letters + [' ']
print(letters)

# In[262]:

letters_mix = letters.copy()
random.shuffle(letters_mix)
print(letters_mix)

# In[263]:

cipher = dict(list(zip(letters, letters_mix)))
print(cipher['a'])  ## what 'a' becomes under the cipher
print(cipher['b'])  ## what 'b' becomes under the cipher

# In[264]:

sample = "the yellow cat chased the brown dog"
sample_coded = "".join(list(map(lambda x: cipher[x], sample)))
print(sample_coded)
print(sample)

# If we know the cipher, we can trivially invert it by reversing the roles of keys and values in the dictionary.

# In[265]:

decipher = dict(list(zip(letters_mix, letters)))

# In[266]:

"".join(list(map(lambda x: decipher[x], sample_coded)))

# Our goal is to find the decipher for the original code and use it to reconstruct the original message.

# ## Compute Probability of Data Given Cipher
#
# We download "War and Peace" and count the number of times we see $\gamma\beta$ where $\gamma$ and $\beta$ are any of the characters a-z and ' '. This requires a bit of data cleaning.

# In[267]:

## download all of "War and Peace"
from urllib import request

# In[268]:

data = request.urlopen("http://www.gutenberg.org/files/2600/2600-0.txt")

# In[269]:

raw = data.read().decode('utf8')

# In[270]:

raw[:5000]

# In[271]:

raw[10000:10500]

# In[272]:

raw = raw.lower()  ## make everything lower case
raw = raw.replace("\r", "")  ## remove carriage returns
raw = raw.replace("\n", " ")  ## replace newlines with spaces
cleaned = "".join(list(filter(lambda x: x in letters, raw)))  ## remove all non-letter characters

# In[273]:

cleaned[:1000]

# In[274]:

cleaned[10000:11000]

# Split `cleaned` into overlapping character pairs; there are `len(cleaned) - 1` of these.

# In[275]:

n = len(cleaned)

# In[276]:

fromix = list(range(0, n - 1))

# In[277]:

toix = list(range(2, n + 1))

# In[278]:

fromtoix = list(zip(fromix, toix))

# In[279]:

fromtoix[:10]

# In[280]:

cleaned[fromtoix[0][0]:fromtoix[0][1]]

# In[281]:

pairs = list(map(lambda x: cleaned[x[0]:x[1]], fromtoix))

# In[282]:

pairs
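# As a quick sanity check (not part of the original notebook), the same sliding-window pairing can be written as a list comprehension; running it on a short string makes the indexing easy to verify.

# In[ ]:

## minimal sketch: extract overlapping character bigrams with a list comprehension,
## equivalent to the index-range approach used above
toy = "hello world"
toy_pairs = [toy[i:i + 2] for i in range(len(toy) - 1)]
print(toy_pairs)       ## ['he', 'el', 'll', 'lo', 'o ', ' w', 'wo', 'or', 'rl', 'ld']
print(len(toy_pairs))  ## len(toy) - 1 == 10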
# Count the number of times each pair occurs. We use the `Counter` class from `collections`.

# In[283]:

from collections import Counter
z = ['blue', 'red', 'blue', 'yellow', 'blue', 'red']
counts = dict(Counter(z))
counts

# In[284]:

counts.keys()

# In[285]:

counts.values()

# In[286]:

counts = dict(Counter(pairs))

# In[287]:

counts

# There are `len(counts.keys())` distinct pairs in "War and Peace".

# In[288]:

len(counts.keys())

# The total number of possible pairs is:

# In[289]:

27*27

# So some pairs are missing because they never appear in the book. We add these to `counts`. This is important to avoid `KeyError`s (and zero probabilities, whose log is undefined) in the Metropolis algorithm. By adding artificial counts of 1 to these pairs, we make the algorithm a bit more robust.

# In[290]:

## construct all possible pairs (NOTE: bad code)
allpairs = []
for i in letters:
    for j in letters:
        allpairs = allpairs + [i + j]

# In[291]:

absent = list(set(allpairs) - set(counts.keys()))

# In[292]:

absent = dict(list(zip(absent, len(absent)*[1])))

# In[293]:

counts = {**counts, **absent}  ## merge the two dictionaries together

# In[294]:

counts

# Transform the counts into conditional probabilities:

# In[295]:

num_added = dict(Counter(list(map(lambda x: x[0], list(absent.keys())))))
num_added

# In[296]:

counts_single = dict(Counter(cleaned))
for i in num_added:
    counts_single[i] = counts_single[i] + num_added[i]
counts_single

# In[297]:

for i in counts:
    counts[i] = counts[i] / counts_single[i[0]]

# In[298]:

np.sum(np.array(list(counts.values())))

# In[299]:

pairprob = counts

# Normalize the counts of single letters.

# In[300]:

n = np.sum(np.array(list(counts_single.values())))
n

# In[301]:

for i in counts_single:
    counts_single[i] = counts_single[i] / n

# In[302]:

singleprob = counts_single
singleprob

# ## Calculating the Probability of Strings that are Not Encoded

# In[408]:

test = 'hello'
n = len(test)
toix = list(range(2, n + 1))
fromix = list(range(0, n - 1))
fromtoix = list(zip(fromix, toix))

# In[409]:

pairs = list(map(lambda x: test[x[0]:x[1]], fromtoix))
pairs

# In[410]:

X = dict(Counter(pairs))
X

# In[411]:

p = np.array(list(map(lambda y: pairprob[y], X.keys())))
p

# In[412]:

ns = np.array(list(X.values()))
ns

# In[413]:

out = np.sum(ns*np.log(p)) + np.log(singleprob[test[0]])
print(out)
print(np.exp(out))

# In[414]:

## probability if all character strings of length len(test) were equally likely
1.0 / np.power(27.0, len(test))

# ## The Posterior
#
# The posterior is proportional to the likelihood times the prior:
# $$ \pi(\theta|x) \propto f(x|\theta)\pi(\theta) $$
# Here $\theta$ represents the inverse cipher, the function which maps the code back to the true letters. This is a discrete parameter that can take $27!$ possible values (one for each permutation of the 27 characters). We put a uniform prior on all possible parameter values, so the posterior is proportional to just the likelihood $f(x|\theta)$.
#
# In Python, we can represent the $\theta$ parameter with a dictionary whose keys are the coded characters and whose values are the letters those codes correspond to. One could also use a pandas data frame to accomplish this.

# In[415]:

## random initial starting value for theta
theta = letters.copy()
random.shuffle(theta)
theta = dict(list(zip(letters, theta)))

# In[416]:

theta

# In[417]:

decoded = list(map(lambda x: theta[x], code))

# In[418]:

decoded = "".join(decoded)

# In[419]:

## the decoded message does not appear to be correct, because theta was a random guess
decoded
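# As a quick check (this helper is not part of the original notebook, just a sketch assuming `pairprob` and `singleprob` as constructed above), we can score any un-encoded string under the bigram model. The randomly decoded message will typically score far below a genuine English sentence of the same length, and this gap is exactly the signal the Metropolis algorithm will exploit.

# In[ ]:

def string_logprob(s):
    ## log probability of the string s under the first-order (bigram) model:
    ## log P(s[0]) + sum over consecutive character pairs of log P(s[i+1] | s[i])
    logp = np.log(singleprob[s[0]])
    for a, b in zip(s, s[1:]):
        logp = logp + np.log(pairprob[a + b])
    return logp

print(string_logprob(sample))                 ## a real English sentence
print(string_logprob(decoded[:len(sample)]))  ## a same-length chunk of the random decode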
# Compute the log likelihood (= the log posterior, because the prior is uniform) in pieces.

# In[424]:

n = len(code)
toix = list(range(2, n + 1))
fromix = list(range(0, n - 1))
fromtoix = list(zip(fromix, toix))

# In[425]:

pairs = list(map(lambda x: decoded[x[0]:x[1]], fromtoix))

# In[426]:

X = dict(Counter(pairs))

# In[427]:

X

# In[428]:

p = np.array(list(map(lambda y: pairprob[y], X.keys())))
ns = np.array(list(X.values()))
out = np.sum(ns*np.log(p)) + np.log(singleprob[decoded[0]])

# In[429]:

out

# We put the above code into functions which we can call from the Metropolis algorithm.

# In[433]:

def decode(theta):
    decoded = list(map(lambda x: theta[x], code))
    decoded = "".join(decoded)
    return decoded

# In[434]:

def logpost(theta):
    decoded = decode(theta)
    pairs = list(map(lambda x: decoded[x[0]:x[1]], fromtoix))
    X = dict(Counter(pairs))
    p = np.array(list(map(lambda y: pairprob[y], X.keys())))
    ns = np.array(list(X.values()))
    return np.sum(ns*np.log(p)) + np.log(singleprob[decoded[0]])

# In[436]:

## check that the function output matches the scratch work above
logpost(theta)

# ## Code Up Metropolis
#
# The proposal distribution swaps a randomly chosen pair of entries in the decipher theta.

# In[437]:

def thetaproposal(theta):
    thetanew = theta.copy()
    ixs = random.choice(list(theta.keys()), size=2, replace=False)
    thetanew[ixs[0]] = theta[ixs[1]]
    thetanew[ixs[1]] = theta[ixs[0]]
    return thetanew

# In[440]:

## random initial starting value for theta
theta = letters.copy()
random.shuffle(theta)
theta = dict(list(zip(letters, theta)))

# In[441]:

## Metropolis
niter = 10000
for ii in np.arange(niter):
    thetanew = thetaproposal(theta)
    if (ii % 100) == 0:
        print("==== Estimate at iteration " + str(ii) + ": ====")
        print(decode(theta))
    if np.exp(logpost(thetanew) - logpost(theta)) > np.random.uniform():
        theta = thetanew

# In[191]:

decode(theta)

# There are 27 factorial different possible encodings. In a Markov chain with 10,000 iterations you check at most

# In[444]:

import math
10000.0 / math.factorial(27)

# fraction of all encodings.

# ### Exercises for Extra Practice
#
# **Exercise:** Try other proposal distributions, such as randomly permuting 3 letters. Does this improve or worsen convergence? (See the sketch at the end of this notebook.)
#
# **Exercise:** Rerun this demo with a different coded message. How does your ability to recover the truth change with message length?
#
# **Exercise:** We computed bigram counts from "War and Peace." More generally one could compute [n-grams](https://en.wikipedia.org/wiki/N-gram). Try finding all 3-grams and calculating the conditional probability of any letter given the previous two letters. So P(hello) = $P(h)P(e|h)P(l|he)P(l|el)P(o|ll)$.
#
# **Exercise:** Write a "random word generator." Here the input to the function is a character sequence length $n$ and the output is a random text sequence drawn from a 1-, 2-, or 3-gram distribution. Random sequences from 3-gram distributions should produce character strings containing many real words.
#
# **Exercise:** We are really more interested in the mode of the posterior than in the full posterior distribution. Since the prior is flat, finding the mode of the posterior is equivalent to maximum likelihood estimation. Simulated tempering (Section 26.6 of Lange) is one method for finding the MLE / posterior mode for multimodal models. It shares many similarities with Metropolis. Implement simulated annealing for this problem. Does your algorithm converge more often to the correct solution than Metropolis?

# Read more about this problem (and a lot of MCMC theory) in [The MCMC Revolution](https://pdfs.semanticscholar.org/4e62/58ebd07b548f77f8cd33caf72d811c2bd54c.pdf).
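# A possible starting point for the first exercise above (this is a sketch, not part of the original notebook): a proposal that permutes three randomly chosen entries of the decipher by rotating their values, analogous to `thetaproposal`.

# In[ ]:

def thetaproposal3(theta):
    ## rotate the values at three randomly chosen keys:
    ## the value at the 1st key moves to the 2nd, the 2nd's to the 3rd, the 3rd's to the 1st
    thetanew = theta.copy()
    ixs = random.choice(list(theta.keys()), size=3, replace=False)
    thetanew[ixs[0]] = theta[ixs[2]]
    thetanew[ixs[1]] = theta[ixs[0]]
    thetanew[ixs[2]] = theta[ixs[1]]
    return thetanew

## drop-in replacement inside the Metropolis loop, e.g.
## thetanew = thetaproposal3(theta)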