#Make sure to address all inter-dependencies and packages installations
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
dfCSI = pd.read_csv("C:/Users/Nicola/Desktop/BRACED Final Evaluation/Monitoring Tools and Data/Processing Data/CSI_reviewed1.csv",encoding='latin-1')
# Dataset Uploaded from Coping Strategy Index dataset
#What is CSI: Check this online
#https://www.wfp.org/content/coping-strategies-index-field-methods-manual-2nd-edition)
dfCSI.rename(columns={'CSI score assets':'CSIAssets'},inplace=True)
Base=dfCSI.loc[dfCSI['Study'] == 'Baseline']
Base = Base.apply(pd.to_numeric, errors='coerce')
End=dfCSI.loc[dfCSI['Study'] == 'Endline']
End = End.apply(pd.to_numeric, errors='coerce')
len(Base),len(End)
(550, 746)
1-(Base.CSI_food.mean()/End.CSI_food.mean())
0.6947973161402035
CSIAssets=dfCSI.CSIAssets.apply(pd.to_numeric, errors='coerce')
CSIfood=dfCSI.CSI_food.apply(pd.to_numeric, errors='coerce')
1-(Base.CSIAssets.mean()/End.CSIAssets.mean())
0.2969372693726937
# Defining new variables
Year=dfCSI.Year.apply(pd.to_numeric, errors='coerce')
Commune=dfCSI.commune
# Define font specifics for graphical visualisation
import matplotlib.font_manager as font_manager
title_font = {'fontname':'Futura', 'size':'18', 'color':'black', 'weight':'normal',
'verticalalignment':'bottom'}
axis_font = {'fontname':'Futura', 'size':'15'}
# Set the font properties
font_path = 'C:\Windows\Fonts\Futura.ttf'
font_prop = font_manager.FontProperties(fname=font_path, size=14)
#Boxplot seaborn library parameters
DsgImpF=sns.boxplot(CSIfood,Commune, Year, palette="BuGn_r")
#Disaggregated CSI value for Food per Commune
plt.xlim([0,200])
plt.title('Coping Strategy Index for food security',**title_font)
plt.ylabel('Commune',**axis_font)
plt.xlabel('CSI Food- Higher value equals to greater insecurity',**axis_font)
plt.tight_layout()
#Graph Into Figure
figure = DsgImpF.get_figure()
figure.set_size_inches(8, 7)
# Save in png format and adjust resolution
#figure.savefig('CSIFood.png', dpi=1000)
#Interpretation- Significant drop in CSI for food, less food insecurity
Assets=sns.boxplot(CSIAssets,Commune, Year, palette="Blues")
plt.title('Coping Strategy Index for asset security',**title_font)
plt.xlabel('CSI Assets- Higher value equals to greater asset insecurity',**axis_font)
figure = Assets.get_figure()
figure.set_size_inches(8, 7)
#Drop in CSI for asset as well, less asset insecurity
#import packages to visualise words processing
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from stop_words import get_stop_words
from nltk.stem.snowball import SnowballStemmer
from gensim import corpora, models
import gensim
C:\Users\Nicola\Anaconda3\envs\r\lib\site-packages\gensim\utils.py:860: UserWarning: detected Windows; aliasing chunkize to chunkize_serial warnings.warn("detected Windows; aliasing chunkize to chunkize_serial")
dfCSIQual = pd.read_csv("C:/Users/Nicola/Desktop/BRACED Final Evaluation/Final Evaluation Process and CSI/CSI Evaluation_2018.csv",encoding='latin-1')
# one-review-per-line of the new file with all qualitative evidence
def parse(dfCSIQual):
g = gzip.open(dfCSI, 'rb')
for l in g:
yield eval(l)
def getDF(dfCSIQual):
i = 0
df = {}
for d in parse(dfCSIQual):
df[i] = d
i += 1
return pd.DataFrame.from_dict(df, orient='index')
#Select and shape columns with text to be processed
analysis=dfCSIQual[['Q_3','Q_5','Q_7','Q_9','Q_11','Q_13','Q_15','Q_17','Q_19','Q_21','Q_23','Q_25','Q_27','Q_29']].dropna()
', '.join(analysis)
analysis.shape
analysis=pd.melt(analysis)
print(analysis.head(2))
variable value 0 Q_3 2 membres comité et 12bénéficiaires warrantage 1 Q_3 Warrantage, BC, jardins maraîchers et PFLN, VS...
print(analysis.ndim)
print(analysis.size)
2 588
#remove punctuation from melted text
import string
exclude = set(string.punctuation)
def remove_punctuation(x):
try:
x = ''.join(ch for ch in x if ch not in exclude)
except:
pass
return x
analysis1 = analysis.apply(remove_punctuation)
# Define the clean list as set of word to be tokenized
words=pd.DataFrame(analysis1)
raw=words.as_matrix().flatten()
#tokenize words
import nltk
def tokenize(word):
return word
tokenized = [tokenize(word) for sentence in raw for word in sentence.split()]
# clean the bag of words form stop words, set the language in French for this dataset
from stop_words import get_stop_words
fr_stop = get_stop_words('fr')
stopped_tokens = [i for i in tokenized if not i in fr_stop]
nltk_stpwd = stopwords.words('french')
stop_words_stpwd = get_stop_words('fr')
merged_stopwords = list(set(nltk_stpwd + fr_stop))
sb_stemmer = SnowballStemmer('french')
#Shape the text in a way to create a cleaner dictionary
num_reviews = analysis.shape[0]
doc_set = [analysis1.value for i in range(num_reviews)]
texts = []
for doc in doc_set:
stopped_tokens = [token for token in tokenized if not token in merged_stopwords]
stemmed_tokens = [sb_stemmer.stem(token) for token in stopped_tokens]
# add tokens to list
texts.append(stopped_tokens)
print (texts[1:1])
[]
#Creat dictionary
texts_dict = corpora.Dictionary(texts)
texts_dict.save('auto_review.dict') # lets save to disk for later use
# Examine each token’s unique id
print(texts_dict)
Dictionary(1011 unique tokens: ['(', '(15000fcfa),', '(CFW)Cash', '(CFW)agriculteurs', '(CFW,']...)
id2word = gensim.corpora.Dictionary()
corpus = [texts_dict.doc2bow(text) for text in texts]
len(corpus)
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
import gensim
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
lsimodel = LsiModel(corpus=corpus, num_topics=10, id2word=texts_dict)
lsimodel = LsiModel(corpus=corpus, num_topics=5, id2word=texts_dict)
lsimodel.show_topics(num_topics=5)
lsitopics = lsimodel.show_topics(formatted=True)
ldamodel = LdaModel(corpus=corpus, num_topics=5, id2word=texts_dict)
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(ldamodel, corpus, texts_dict)
C:\Users\Nicola\Anaconda3\envs\r\lib\site-packages\pyLDAvis\_prepare.py:387: DeprecationWarning: .ix is deprecated. Please use .loc for label based indexing or .iloc for positional indexing See the documentation here: http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated topic_term_dists = topic_term_dists.ix[topic_order]