In [1]:

#Make sure to address all inter-dependencies and packages installations
%matplotlib inline

In [2]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

In [3]:

dfCSI = pd.read_csv("C:/Users/Nicola/Desktop/BRACED Final Evaluation/Monitoring Tools and Data/Processing Data/CSI_reviewed1.csv",encoding='latin-1')
# Dataset Uploaded from Coping Strategy Index dataset 

In [4]:

#What is CSI: Check this online 
#https://www.wfp.org/content/coping-strategies-index-field-methods-manual-2nd-edition)

In [5]:

dfCSI.rename(columns={'CSI score assets':'CSIAssets'},inplace=True)
Base=dfCSI.loc[dfCSI['Study'] == 'Baseline']
Base = Base.apply(pd.to_numeric, errors='coerce')
End=dfCSI.loc[dfCSI['Study'] == 'Endline']
End = End.apply(pd.to_numeric, errors='coerce')
len(Base),len(End)

Out[5]:

(550, 746)

In [6]:

1-(Base.CSI_food.mean()/End.CSI_food.mean())

Out[6]:

0.6947973161402035

In [7]:

CSIAssets=dfCSI.CSIAssets.apply(pd.to_numeric, errors='coerce')
CSIfood=dfCSI.CSI_food.apply(pd.to_numeric, errors='coerce')

In [8]:

1-(Base.CSIAssets.mean()/End.CSIAssets.mean())

Out[8]:

0.2969372693726937

In [9]:

# Defining new variables
Year=dfCSI.Year.apply(pd.to_numeric, errors='coerce')
Commune=dfCSI.commune

In [10]:

# Define font specifics for graphical visualisation
import matplotlib.font_manager as font_manager
title_font = {'fontname':'Futura', 'size':'18', 'color':'black', 'weight':'normal',
              'verticalalignment':'bottom'}
axis_font = {'fontname':'Futura', 'size':'15'}
# Set the font properties
font_path = 'C:\Windows\Fonts\Futura.ttf'
font_prop = font_manager.FontProperties(fname=font_path, size=14)

In [11]:

#Boxplot seaborn library parameters
DsgImpF=sns.boxplot(CSIfood,Commune, Year, palette="BuGn_r")
#Disaggregated CSI value for Food per Commune
plt.xlim([0,200])
plt.title('Coping Strategy Index for food security',**title_font)
plt.ylabel('Commune',**axis_font)
plt.xlabel('CSI Food- Higher value equals to greater insecurity',**axis_font)
plt.tight_layout()
#Graph Into Figure
figure = DsgImpF.get_figure()
figure.set_size_inches(8, 7)
# Save in png format and adjust resolution
#figure.savefig('CSIFood.png', dpi=1000)

#Interpretation- Significant drop in CSI for food, less food insecurity 

In [12]:

Assets=sns.boxplot(CSIAssets,Commune, Year, palette="Blues")
plt.title('Coping Strategy Index for asset security',**title_font)
plt.xlabel('CSI Assets- Higher value equals to greater asset insecurity',**axis_font)
figure = Assets.get_figure()
figure.set_size_inches(8, 7)
#Drop in CSI for asset as well, less asset insecurity 

In [13]:

#import packages to visualise words processing
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from stop_words import get_stop_words
from nltk.stem.snowball import SnowballStemmer
from gensim import corpora, models
import gensim

C:\Users\Nicola\Anaconda3\envs\r\lib\site-packages\gensim\utils.py:860: UserWarning: detected Windows; aliasing chunkize to chunkize_serial
  warnings.warn("detected Windows; aliasing chunkize to chunkize_serial")

In [14]:

dfCSIQual = pd.read_csv("C:/Users/Nicola/Desktop/BRACED Final Evaluation/Final Evaluation Process and CSI/CSI Evaluation_2018.csv",encoding='latin-1')

In [15]:

# one-review-per-line of the new file with all qualitative evidence
def parse(dfCSIQual):
    g = gzip.open(dfCSI, 'rb')
    for l in g:
        yield eval(l)
def getDF(dfCSIQual):
    i = 0
    df = {}
    for d in parse(dfCSIQual):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

In [16]:

#Select and shape columns with text to be processed
analysis=dfCSIQual[['Q_3','Q_5','Q_7','Q_9','Q_11','Q_13','Q_15','Q_17','Q_19','Q_21','Q_23','Q_25','Q_27','Q_29']].dropna()
', '.join(analysis)
analysis.shape
analysis=pd.melt(analysis)
print(analysis.head(2))

  variable                                              value
0      Q_3     2 membres comité et 12bénéficiaires warrantage
1      Q_3  Warrantage, BC, jardins maraîchers et PFLN, VS...

In [17]:

print(analysis.ndim)
print(analysis.size)

2
588

In [18]:

#remove punctuation from melted text
import string
exclude = set(string.punctuation)
def remove_punctuation(x):
    try:
        x = ''.join(ch for ch in x if ch not in exclude)
    except:
        pass
    return x
analysis1 = analysis.apply(remove_punctuation)

In [19]:

# Define the clean list as set of word to be tokenized
words=pd.DataFrame(analysis1)
raw=words.as_matrix().flatten()

In [20]:

#tokenize words
import nltk
def tokenize(word):
    return word
tokenized = [tokenize(word) for sentence in raw for word in sentence.split()]

In [21]:

# clean the bag of words form stop words, set the language in French for this dataset
from stop_words import get_stop_words
fr_stop = get_stop_words('fr')
stopped_tokens = [i for i in tokenized if not i in fr_stop]
nltk_stpwd = stopwords.words('french')
stop_words_stpwd = get_stop_words('fr')
merged_stopwords = list(set(nltk_stpwd + fr_stop))
sb_stemmer = SnowballStemmer('french')

In [22]:

#Shape the text in a way to create a cleaner dictionary
num_reviews = analysis.shape[0]
doc_set = [analysis1.value for i in range(num_reviews)]
texts = []
for doc in doc_set:
    stopped_tokens = [token for token in tokenized if not token in merged_stopwords]
    stemmed_tokens = [sb_stemmer.stem(token) for token in stopped_tokens]
    # add tokens to list
    texts.append(stopped_tokens)
print (texts[1:1])

[]

In [23]:

#Creat dictionary
texts_dict = corpora.Dictionary(texts)
texts_dict.save('auto_review.dict') # lets save to disk for later use
# Examine each token’s unique id
print(texts_dict)

Dictionary(1011 unique tokens: ['(', '(15000fcfa),', '(CFW)Cash', '(CFW)agriculteurs', '(CFW,']...)

In [24]:

id2word = gensim.corpora.Dictionary()
corpus = [texts_dict.doc2bow(text) for text in texts]
len(corpus)
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
import gensim

In [25]:

from gensim.corpora import Dictionary
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
lsimodel = LsiModel(corpus=corpus, num_topics=10, id2word=texts_dict)

In [26]:

lsimodel = LsiModel(corpus=corpus, num_topics=5, id2word=texts_dict)

In [27]:

lsimodel.show_topics(num_topics=5)
lsitopics = lsimodel.show_topics(formatted=True)

In [28]:

ldamodel = LdaModel(corpus=corpus, num_topics=5, id2word=texts_dict)

In [29]:

import pyLDAvis.gensim

In [30]:

pyLDAvis.enable_notebook()

In [31]:

pyLDAvis.gensim.prepare(ldamodel, corpus, texts_dict)

C:\Users\Nicola\Anaconda3\envs\r\lib\site-packages\pyLDAvis\_prepare.py:387: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]

Out[31]:

In [ ]: