# Download the Nigeria NMIS health-facility dataset and take a first look at it.
import urllib.request
urllib.request.urlretrieve(
    'https://energydata.info/dataset/f85d1796-e7f2-4630-be84-79420174e3bd/resource/6e640a13-cab4-457b-b9e6-0336051bac27/download/healthmopupandbaselinenmisfacility.csv',
    'healthmopupandbaselinenmisfacility.csv')

import pandas as pd

data = pd.read_csv('healthmopupandbaselinenmisfacility.csv')

# Summary statistics for the numeric columns.
data.describe()
# data.describe?   # IPython help magic -- only valid inside a notebook/IPython session.

print(data['num_doctors_fulltime'])
#print(data['num_nurses_fulltime'])

# this ensures the plot appears in the web browser (notebook-only magic)
# %matplotlib inline
import matplotlib.pyplot as plt  # this imports the plotting library in python

# Scatter plot of full-time doctors against full-time nurses (red crosses).
_ = plt.plot(data['num_doctors_fulltime'], data['num_nurses_fulltime'], 'rx')
# plt.plot?   # IPython help magic -- only valid inside a notebook/IPython session.

# Facilities reporting more than 100 full-time nurses (candidate outliers),
# then the same rows sorted with the largest counts first.
data[data['num_nurses_fulltime'] > 100]
data[data['num_nurses_fulltime'] > 100].sort_values(by='num_nurses_fulltime', ascending=False)

data['num_nurses_fulltime'].hist(bins=20)  # histogram the data with 20 bins.
plt.title('Histogram of Number of Nurses')
data['num_nurses_fulltime'].hist(bins=100)  # histogram the data with 100 bins.
plt.title('Histogram of Number of Nurses')
# Put the current histogram on a logarithmic y axis so the long tail is visible.
ax = plt.gca()
ax.set_yscale('log')

# Log-log scatter of nurses against doctors on a fresh figure.
fig, ax = plt.subplots(figsize=(10, 7))
ax.plot(data['num_doctors_fulltime'], data['num_nurses_fulltime'], 'rx')
ax.set_xscale('log')  # use a logarithmic x scale
ax.set_yscale('log')  # use a logarithmic Y scale
# give the plot some titles and labels
plt.title('Number of Nurses against Number of Doctors')
plt.ylabel('number of nurses')
plt.xlabel('number of doctors')

# Empirical marginal probability P(nurses > 2).
large = (data.num_nurses_fulltime > 2).sum()  # number of positive outcomes (in sum True counts as 1, False counts as 0)
total_facilities = data.num_nurses_fulltime.count()
prob_large = float(large) / float(total_facilities)
print("Probability of number of nurses being greater than 2 is:", prob_large)

# Empirical conditional probability P(nurses > 2 | doctors > 1):
# count facilities satisfying both conditions, divide by those satisfying the condition.
large = ((data.num_nurses_fulltime > 2) & (data.num_doctors_fulltime > 1)).sum()
total_large_doctors = (data.num_doctors_fulltime > 1).sum()
prob_both_large = large / total_large_doctors
print("Probability of number of nurses being greater than 2 given number of doctors is greater than 1 is:", prob_both_large)

# Write code for your answer to Question 2 in this box
# provide the answers so that the code runs correctly otherwise you will lose marks!
# Question 2: empirical joint probability P(nurses > 2, doctors == num_doctors).
num_doctors = 1
large = (data.num_nurses_fulltime[data.num_doctors_fulltime == num_doctors] > 2).sum()
total_facilities = data.num_nurses_fulltime.count()  # this is the total number of facilities
prob_large = float(large) / float(total_facilities)
print("Probability of nurses being greater than 2 and number of doctors being", num_doctors, "is:", prob_large)

# Verify the product rule empirically: P(y, x) = P(y|x) P(x).
num_doctors = 1
num_nurses = 2
p_x = float((data.num_doctors_fulltime == num_doctors).sum()) / float(data.num_nurses_fulltime.count())
p_y_given_x = float((data.num_nurses_fulltime[data.num_doctors_fulltime == num_doctors] > num_nurses).sum()) / float((data.num_doctors_fulltime == num_doctors).sum())
p_y_and_x = float((data.num_nurses_fulltime[data.num_doctors_fulltime == num_doctors] > num_nurses).sum()) / float(data.num_nurses_fulltime.count())
print("P(x) is", p_x)
print("P(y|x) is", p_y_given_x)
print("P(y,x) is", p_y_and_x)

# Write code for your answer to Question 3 in this box
# provide the answers so that the code runs correctly otherwise you will lose marks!
data.columns

# Bernoulli distribution table (was a markdown table in the notebook):
#   y:      0        1
#   P(y):   (1-pi)   pi
def bernoulli(y_i, pi):
    """Bernoulli likelihood of outcome y_i: P(y=1) = pi, P(y=0) = 1 - pi."""
    if y_i == 1:
        return pi
    else:
        return 1 - pi

import pods
pods.notebook.display_google_book(id='CF4UAAAAQAAJ', page='PA87')

from ipywidgets import IntSlider
pods.notebook.display_plots('bayes-billiard{counter:0>3}.svg',
                            directory='../slides/diagrams/ml',
                            counter=IntSlider(0, 0, 9, 1))

# Use this box for any code you need for the exercise
data.head()

import pandas as pd
import numpy as np

# Keep only facilities whose label column is present, then drop any
# remaining rows that still have missing values.
data = data[~pd.isnull(data['maternal_health_delivery_services'])]
data = data.dropna()  # Remove entries with missing values

# Input features for the naive Bayes classifier.
X = data[['emergency_transport',
          'num_chews_fulltime',
          'phcn_electricity',
          'child_health_measles_immun_calc',
          'num_nurses_fulltime',
          'num_doctors_fulltime',
          'improved_water_supply',
          'improved_sanitation',
          'antenatal_care_yn',
          'family_planning_yn',
          'malaria_treatment_artemisinin',
          'latitude',
          'longitude']].copy()
# set label to be whether there's a maternal health delivery service
y = data['maternal_health_delivery_services'] == True

# Create series of health center types with the relevant index
s = data['facility_type_display'].apply(pd.Series, 1).stack()
s.index = s.index.droplevel(-1)  # to line up with df's index
# Extract from the series the unique list of types.
types = s.unique()

# For each type extract the indices where it is present and add a
# 0/1 indicator column to X.
type_names = []
for htype in types:
    index = s[s == htype].index.tolist()
    type_col = htype.replace(' ', '_').replace('/', '-').lower()
    type_names.append(type_col)
    X.loc[:, type_col] = 0.0
    X.loc[index, type_col] = 1.0
X.describe()

# assume data is binary or real.
# this list encodes whether it is binary or real (1 for binary, 0 for real)
binary_columns = ['emergency_transport',
                  'phcn_electricity',
                  'child_health_measles_immun_calc',
                  'improved_water_supply',
                  'improved_sanitation',
                  'antenatal_care_yn',
                  'family_planning_yn',
                  'malaria_treatment_artemisinin'] + type_names
real_columns = ['num_chews_fulltime',
                'num_nurses_fulltime',
                'num_doctors_fulltime',
                'latitude',
                'longitude']

# Parameter tables: one column per feature; rows hold the class-conditional
# parameters (suffix _0 = no maternity service, _1 = maternity service).
Bernoulli = pd.DataFrame(data=np.zeros((2, len(binary_columns))),
                         columns=binary_columns,
                         index=['theta_0', 'theta_1'])
Gaussian = pd.DataFrame(data=np.zeros((4, len(real_columns))),
                        columns=real_columns,
                        index=['mu_0', 'sigma2_0', 'mu_1', 'sigma2_1'])

# Random train/test split.
num_train = 20000
indices = np.random.permutation(X.shape[0])
train_indices = indices[:num_train]
test_indices = indices[num_train:]
X_train = X.iloc[train_indices]
y_train = y.iloc[train_indices] == True
X_test = X.iloc[test_indices]
y_test = y.iloc[test_indices] == True

# Maximum-likelihood fit of the class-conditional parameters.
# Assign with .loc rather than chained indexing (df[col][row] = v), which
# silently fails to write under pandas copy-on-write semantics.
for column in X_train:
    if column in Gaussian:
        Gaussian.loc['mu_0', column] = X_train[column][~y_train].mean()
        Gaussian.loc['mu_1', column] = X_train[column][y_train].mean()
        Gaussian.loc['sigma2_0', column] = X_train[column][~y_train].var(ddof=0)
        Gaussian.loc['sigma2_1', column] = X_train[column][y_train].var(ddof=0)
    if column in Bernoulli:
        Bernoulli.loc['theta_0', column] = X_train[column][~y_train].sum() / (~y_train).sum()
        Bernoulli.loc['theta_1', column] = X_train[column][y_train].sum() / (y_train).sum()
Bernoulli
Gaussian

# Prior probability of the positive class, estimated from the training labels.
prior = float(y_train.sum()) / len(y_train)


def log_gaussian(x, mu, sigma2):
    """Log density of a Gaussian with mean mu and variance sigma2, evaluated at x."""
    return -0.5 * np.log(2 * np.pi * sigma2) - ((x - mu)**2) / (2 * sigma2)


def log_bernoulli(x, theta):
    """Log probability of binary x under a Bernoulli with parameter theta."""
    return x * np.log(theta) + (1 - x) * np.log(1 - theta)


pods.notebook.display_google_book(id='1YQPAAAAQAAJ', page='PA16')

# Re-fit the Bernoulli parameters with Laplace smoothing so no estimate is
# exactly 0 or 1 (which would give -inf inside log_bernoulli).
for column in X_train:
    if column in Bernoulli:
        Bernoulli.loc['theta_0', column] = (X_train[column][~y_train].sum() + 1) / ((~y_train).sum() + 2)
        Bernoulli.loc['theta_1', column] = (X_train[column][y_train].sum() + 1) / ((y_train).sum() + 2)


def predict(X_test, Gaussian, Bernoulli, prior):
    """Return the posterior P(maternity service | features) for each row of X_test.

    Accumulates class-conditional log likelihoods feature by feature under the
    naive (conditional independence) assumption, then applies Bayes' rule.
    """
    log_positive = pd.Series(data=np.zeros(X_test.shape[0]), index=X_test.index)
    log_negative = pd.Series(data=np.zeros(X_test.shape[0]), index=X_test.index)
    for column in X_test.columns:
        if column in Gaussian:
            log_positive += log_gaussian(X_test[column], Gaussian[column]['mu_1'], Gaussian[column]['sigma2_1'])
            log_negative += log_gaussian(X_test[column], Gaussian[column]['mu_0'], Gaussian[column]['sigma2_0'])
        elif column in Bernoulli:
            log_positive += log_bernoulli(X_test[column], Bernoulli[column]['theta_1'])
            log_negative += log_bernoulli(X_test[column], Bernoulli[column]['theta_0'])
    # Bayes' rule, vectorised over all test rows at once (replaces the
    # original per-row Python loop; numerically identical).
    pos = np.exp(log_positive.values + np.log(prior))
    neg = np.exp(log_negative.values + np.log(1 - prior))
    return pos / (pos + neg)


p_y = predict(X_test, Gaussian, Bernoulli, prior)
correct = y_test.eq(p_y > 0.5)
total_correct = sum(correct)
# Multiply by 100 so the value printed actually matches the "%" label.
print("Total correct", total_correct, " out of ", len(y_test),
      "which is", 100.0 * float(total_correct) / len(y_test), "%")
# Confusion matrix for the naive Bayes predictions at a 0.5 threshold.
confusion_matrix = pd.DataFrame(data=np.zeros((2, 2)),
                                columns=['predicted no maternity', 'predicted maternity'],
                                index=['actual no maternity', 'actual maternity'])
# Assign with .loc rather than chained indexing (df[col][row] = v), which
# silently fails to write under pandas copy-on-write semantics.
confusion_matrix.loc['actual maternity', 'predicted maternity'] = (y_test & (p_y > 0.5)).sum()
confusion_matrix.loc['actual no maternity', 'predicted maternity'] = (~y_test & (p_y > 0.5)).sum()
confusion_matrix.loc['actual maternity', 'predicted no maternity'] = (y_test & ~(p_y > 0.5)).sum()
confusion_matrix.loc['actual no maternity', 'predicted no maternity'] = (~y_test & ~(p_y > 0.5)).sum()
confusion_matrix

# Use this box for any code you need for the exercise
# Use this box for any code you need for the exercise
# Use this box for any code you need for the exercise