import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#import matplotlib.style as style
#style.use('fivethirtyeight')
import seaborn as sb
# Load the Kaggle medical-appointment no-show dataset (May 2016 snapshot).
# Path is relative to the notebook's directory.
data = pd.read_csv("../data/KaggleV2-May-2016.csv")
data.head()
PatientId | AppointmentID | Gender | ScheduledDay | AppointmentDay | Age | Neighbourhood | Scholarship | Hipertension | Diabetes | Alcoholism | Handcap | SMS_received | No-show | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2.987250e+13 | 5642903 | F | 2016-04-29T18:38:08Z | 2016-04-29T00:00:00Z | 62 | JARDIM DA PENHA | 0 | 1 | 0 | 0 | 0 | 0 | No |
1 | 5.589978e+14 | 5642503 | M | 2016-04-29T16:08:27Z | 2016-04-29T00:00:00Z | 56 | JARDIM DA PENHA | 0 | 0 | 0 | 0 | 0 | 0 | No |
2 | 4.262962e+12 | 5642549 | F | 2016-04-29T16:19:04Z | 2016-04-29T00:00:00Z | 62 | MATA DA PRAIA | 0 | 0 | 0 | 0 | 0 | 0 | No |
3 | 8.679512e+11 | 5642828 | F | 2016-04-29T17:29:31Z | 2016-04-29T00:00:00Z | 8 | PONTAL DE CAMBURI | 0 | 0 | 0 | 0 | 0 | 0 | No |
4 | 8.841186e+12 | 5642494 | F | 2016-04-29T16:07:23Z | 2016-04-29T00:00:00Z | 56 | JARDIM DA PENHA | 0 | 1 | 1 | 0 | 0 | 0 | No |
# Normalize the raw column names: fix the dataset's misspellings and drop
# the hyphen so every column is a valid Python attribute name.
# (The original also mapped 'Alcoholism' to itself, which was a no-op.)
column_fixes = {
    'PatientId': 'PatientID',
    'Hipertension': 'Hypertension',
    'Handcap': 'Handicap',
    'No-show': 'NoShow',
}
data.rename(columns=column_fixes, inplace=True)
data.head()
PatientID | AppointmentID | Gender | ScheduledDay | AppointmentDay | Age | Neighbourhood | Scholarship | Hypertension | Diabetes | Alcoholism | Handicap | SMS_received | NoShow | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2.987250e+13 | 5642903 | F | 2016-04-29T18:38:08Z | 2016-04-29T00:00:00Z | 62 | JARDIM DA PENHA | 0 | 1 | 0 | 0 | 0 | 0 | No |
1 | 5.589978e+14 | 5642503 | M | 2016-04-29T16:08:27Z | 2016-04-29T00:00:00Z | 56 | JARDIM DA PENHA | 0 | 0 | 0 | 0 | 0 | 0 | No |
2 | 4.262962e+12 | 5642549 | F | 2016-04-29T16:19:04Z | 2016-04-29T00:00:00Z | 62 | MATA DA PRAIA | 0 | 0 | 0 | 0 | 0 | 0 | No |
3 | 8.679512e+11 | 5642828 | F | 2016-04-29T17:29:31Z | 2016-04-29T00:00:00Z | 8 | PONTAL DE CAMBURI | 0 | 0 | 0 | 0 | 0 | 0 | No |
4 | 8.841186e+12 | 5642494 | F | 2016-04-29T16:07:23Z | 2016-04-29T00:00:00Z | 56 | JARDIM DA PENHA | 0 | 1 | 1 | 0 | 0 | 0 | No |
Now trying to understand the data set: its distributions and unique values. I'm also attempting to find odd or incorrect data points — I want to understand and check the integrity of the dataset.
# Appointments per patient: patients repeat, so PatientID is NOT unique.
data.PatientID.value_counts()
8.221459e+14 88 9.963767e+10 84 2.688613e+13 70 3.353478e+13 65 2.584244e+11 62 7.579746e+13 62 8.713749e+14 62 6.264199e+12 62 6.684488e+13 57 8.722785e+11 55 8.923969e+13 54 8.435224e+09 51 8.534397e+14 50 1.447997e+13 46 6.543360e+13 46 8.189452e+13 42 9.452745e+12 42 1.882323e+14 40 9.496197e+12 38 2.271580e+12 38 1.336493e+13 37 1.484143e+12 35 8.883500e+13 34 9.861628e+14 34 7.124589e+14 33 4.167557e+14 30 6.128878e+12 30 8.121397e+13 29 8.634164e+12 24 3.699499e+13 23 .. 6.375629e+12 1 9.369127e+12 1 5.375556e+14 1 1.662184e+11 1 7.234615e+13 1 9.649990e+12 1 6.912783e+10 1 1.954265e+13 1 2.736377e+10 1 5.532694e+11 1 7.149583e+12 1 8.676752e+13 1 7.838359e+13 1 5.962625e+11 1 4.919862e+13 1 3.477350e+14 1 1.626595e+13 1 7.794917e+12 1 1.161950e+13 1 5.615364e+14 1 4.355592e+11 1 1.321328e+12 1 1.751987e+13 1 4.262579e+13 1 3.115681e+13 1 1.222828e+13 1 6.821231e+11 1 7.163981e+14 1 9.798964e+14 1 2.724571e+11 1 Name: PatientID, Length: 62299, dtype: int64
Making sure there aren't duplicate appointment IDs.
# Every AppointmentID occurs exactly once, so it is a valid row key.
data.AppointmentID.value_counts()
5769215 1 5731652 1 5707080 1 5702986 1 5715276 1 5717325 1 5711182 1 5758289 1 5762391 1 5741913 1 5483871 1 5660001 1 5653858 1 5666148 1 5668197 1 5641576 1 5639531 1 5649772 1 5645678 1 5647727 1 5692785 1 5686642 1 5694838 1 5696887 1 5674360 1 5733701 1 5651786 1 5672315 1 5719362 1 5672187 1 .. 5744033 1 5748131 1 5739943 1 5672324 1 5682563 1 5680512 1 5782866 1 5496110 1 5713200 1 5711153 1 5717298 1 5709110 1 5707063 1 5729592 1 5463358 1 5565768 1 5776721 1 5789023 1 5590396 1 5606756 1 5608807 1 5635434 1 5621101 1 5686470 1 5582192 1 5586290 1 5584243 1 5598584 1 5602682 1 5771266 1 Name: AppointmentID, Length: 110527, dtype: int64
# Only two gender codes are present: 'F' and 'M'.
data.Gender.unique()
array(['F', 'M'], dtype=object)
# Class balance of Gender: roughly 2:1 female to male.
data.Gender.value_counts()
F 71840 M 38687 Name: Gender, dtype: int64
trying to understand my timeline
# All appointments fall in a ~6-week window (late April to early June 2016).
data.AppointmentDay.unique()
array(['2016-04-29T00:00:00Z', '2016-05-03T00:00:00Z', '2016-05-10T00:00:00Z', '2016-05-17T00:00:00Z', '2016-05-24T00:00:00Z', '2016-05-31T00:00:00Z', '2016-05-02T00:00:00Z', '2016-05-30T00:00:00Z', '2016-05-16T00:00:00Z', '2016-05-04T00:00:00Z', '2016-05-19T00:00:00Z', '2016-05-12T00:00:00Z', '2016-05-06T00:00:00Z', '2016-05-20T00:00:00Z', '2016-05-05T00:00:00Z', '2016-05-13T00:00:00Z', '2016-05-09T00:00:00Z', '2016-05-25T00:00:00Z', '2016-05-11T00:00:00Z', '2016-05-18T00:00:00Z', '2016-05-14T00:00:00Z', '2016-06-02T00:00:00Z', '2016-06-03T00:00:00Z', '2016-06-06T00:00:00Z', '2016-06-07T00:00:00Z', '2016-06-01T00:00:00Z', '2016-06-08T00:00:00Z'], dtype=object)
# Age distribution check: reveals one impossible -1 value (fixed below).
data.Age.value_counts()
0 3539 1 2273 52 1746 49 1652 53 1651 56 1635 38 1629 59 1624 2 1618 50 1613 57 1603 36 1580 51 1567 19 1545 39 1536 37 1533 54 1530 34 1526 33 1524 30 1521 6 1521 3 1513 17 1509 32 1505 5 1489 44 1487 18 1487 58 1469 46 1460 45 1453 ... 74 602 76 571 75 544 78 541 77 527 80 511 81 434 82 392 79 390 84 311 83 280 85 275 86 260 87 184 89 173 88 126 90 109 92 86 91 66 93 53 94 33 95 24 96 17 97 11 98 6 115 5 100 4 102 2 99 1 -1 1 Name: Age, Length: 104, dtype: int64
Need to get rid of the negative value — it is impossible for someone's age to be -1.
# Clamp the single invalid negative age to 1. Use .loc with a boolean mask:
# the original chained form data['Age'][mask] = 1 triggers
# SettingWithCopyWarning and may silently write to a temporary copy
# instead of the original frame.
data.loc[data['Age'] < 0, 'Age'] = 1
/Applications/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy """Entry point for launching an IPython kernel.
# Re-check: the -1 row was folded into age 1 (count 2273 -> 2274).
data.Age.value_counts()
0 3539 1 2274 52 1746 49 1652 53 1651 56 1635 38 1629 59 1624 2 1618 50 1613 57 1603 36 1580 51 1567 19 1545 39 1536 37 1533 54 1530 34 1526 33 1524 30 1521 6 1521 3 1513 17 1509 32 1505 5 1489 44 1487 18 1487 58 1469 46 1460 45 1453 ... 72 615 74 602 76 571 75 544 78 541 77 527 80 511 81 434 82 392 79 390 84 311 83 280 85 275 86 260 87 184 89 173 88 126 90 109 92 86 91 66 93 53 94 33 95 24 96 17 97 11 98 6 115 5 100 4 102 2 99 1 Name: Age, Length: 103, dtype: int64
# Scholarship (Bolsa Família welfare flag) is binary, ~10% positive.
data.Scholarship.value_counts()
0 99666 1 10861 Name: Scholarship, dtype: int64
# Hypertension flag is binary, ~20% positive.
data.Hypertension.value_counts()
0 88726 1 21801 Name: Hypertension, dtype: int64
# Diabetes flag is binary, ~7% positive.
data.Diabetes.value_counts()
0 102584 1 7943 Name: Diabetes, dtype: int64
# Handicap is ordinal 0-4 (count of handicaps), not binary like the others.
data.Handicap.value_counts()
0 108286 1 2042 2 183 3 13 4 3 Name: Handicap, dtype: int64
# SMS reminder flag is binary, ~32% received one.
data.SMS_received.value_counts()
0 75045 1 35482 Name: SMS_received, dtype: int64
# Target variable: about 20% of appointments are no-shows (imbalanced).
data.NoShow.value_counts()
No 88208 Yes 22319 Name: NoShow, dtype: int64
Making sure there aren't any values missing in the dataset.
# No missing values in any column, so no imputation is needed.
data.isnull().sum()
PatientID 0 AppointmentID 0 Gender 0 ScheduledDay 0 AppointmentDay 0 Age 0 Neighbourhood 0 Scholarship 0 Hypertension 0 Diabetes 0 Alcoholism 0 Handicap 0 SMS_received 0 NoShow 0 dtype: int64
# Summary of unique values per column; the commented-out lines reference
# columns from a different version of this dataset that no longer exist.
print('Age:',sorted(data.Age.unique()))
print('Gender:',data.Gender.unique())
#print('DayOfTheWeek:',data.DayOfTheWeek.unique())
#print('Status:',data.Status.unique())
print('Diabetes:',data.Diabetes.unique())
print('Alchoholism:',data.Alcoholism.unique())
print('Hypertension:',data.Hypertension.unique())
print('Handicap:',data.Handicap.unique())
#print('Smokes:',data.Smokes.unique())
print('Scholarship:',data.Scholarship.unique())
#print('Tuberculosis:',data.Tuberculosis.unique())
print('SMS_received:',data.SMS_received.unique())
#print('AwaitingTime:',sorted(data.AwaitingTime.unique()))
#print('HourOfTheDay:', sorted(data.HourOfTheDay.unique()))
Age: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 102, 115] Gender: ['F' 'M'] Diabetes: [0 1] Alchoholism: [0 1] Hypertension: [1 0] Handicap: [0 1 2 3 4] Scholarship: [0 1] SMS_received: [0 1]
# Sanity-check the frame before dropping columns.
data.head()
PatientID | AppointmentID | Gender | ScheduledDay | AppointmentDay | Age | Neighbourhood | Scholarship | Hypertension | Diabetes | Alcoholism | Handicap | SMS_received | NoShow | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2.987250e+13 | 5642903 | F | 2016-04-29T18:38:08Z | 2016-04-29T00:00:00Z | 62 | JARDIM DA PENHA | 0 | 1 | 0 | 0 | 0 | 0 | No |
1 | 5.589978e+14 | 5642503 | M | 2016-04-29T16:08:27Z | 2016-04-29T00:00:00Z | 56 | JARDIM DA PENHA | 0 | 0 | 0 | 0 | 0 | 0 | No |
2 | 4.262962e+12 | 5642549 | F | 2016-04-29T16:19:04Z | 2016-04-29T00:00:00Z | 62 | MATA DA PRAIA | 0 | 0 | 0 | 0 | 0 | 0 | No |
3 | 8.679512e+11 | 5642828 | F | 2016-04-29T17:29:31Z | 2016-04-29T00:00:00Z | 8 | PONTAL DE CAMBURI | 0 | 0 | 0 | 0 | 0 | 0 | No |
4 | 8.841186e+12 | 5642494 | F | 2016-04-29T16:07:23Z | 2016-04-29T00:00:00Z | 56 | JARDIM DA PENHA | 0 | 1 | 1 | 0 | 0 | 0 | No |
# NOTE(review): drop() without inplace=True or reassignment returns a new
# frame that is immediately discarded -- this line is a no-op. PatientID is
# still explicitly kept in the column selection further below, so the
# original intent is unclear; confirm whether the column should go.
data.drop(data.columns[[0]], axis=1)
data.head()
PatientID | AppointmentID | Gender | ScheduledDay | AppointmentDay | Age | Neighbourhood | Scholarship | Hypertension | Diabetes | Alcoholism | Handicap | SMS_received | NoShow | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2.987250e+13 | 5642903 | F | 2016-04-29T18:38:08Z | 2016-04-29T00:00:00Z | 62 | JARDIM DA PENHA | 0 | 1 | 0 | 0 | 0 | 0 | No |
1 | 5.589978e+14 | 5642503 | M | 2016-04-29T16:08:27Z | 2016-04-29T00:00:00Z | 56 | JARDIM DA PENHA | 0 | 0 | 0 | 0 | 0 | 0 | No |
2 | 4.262962e+12 | 5642549 | F | 2016-04-29T16:19:04Z | 2016-04-29T00:00:00Z | 62 | MATA DA PRAIA | 0 | 0 | 0 | 0 | 0 | 0 | No |
3 | 8.679512e+11 | 5642828 | F | 2016-04-29T17:29:31Z | 2016-04-29T00:00:00Z | 8 | PONTAL DE CAMBURI | 0 | 0 | 0 | 0 | 0 | 0 | No |
4 | 8.841186e+12 | 5642494 | F | 2016-04-29T16:07:23Z | 2016-04-29T00:00:00Z | 56 | JARDIM DA PENHA | 0 | 1 | 1 | 0 | 0 | 0 | No |
Creating two binary columns for the gender
# One-hot encode Gender into two complementary indicator columns
# (Male + Female always sum to 1 for every row).
data['Male'] = data['Gender'].replace({'F': 0, 'M': 1})
data['Female'] = data['Gender'].replace({'F': 1, 'M': 0})
data.head()
PatientID | AppointmentID | Gender | ScheduledDay | AppointmentDay | Age | Neighbourhood | Scholarship | Hypertension | Diabetes | Alcoholism | Handicap | SMS_received | NoShow | Male | Female | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2.987250e+13 | 5642903 | F | 2016-04-29T18:38:08Z | 2016-04-29T00:00:00Z | 62 | JARDIM DA PENHA | 0 | 1 | 0 | 0 | 0 | 0 | No | 0 | 1 |
1 | 5.589978e+14 | 5642503 | M | 2016-04-29T16:08:27Z | 2016-04-29T00:00:00Z | 56 | JARDIM DA PENHA | 0 | 0 | 0 | 0 | 0 | 0 | No | 1 | 0 |
2 | 4.262962e+12 | 5642549 | F | 2016-04-29T16:19:04Z | 2016-04-29T00:00:00Z | 62 | MATA DA PRAIA | 0 | 0 | 0 | 0 | 0 | 0 | No | 0 | 1 |
3 | 8.679512e+11 | 5642828 | F | 2016-04-29T17:29:31Z | 2016-04-29T00:00:00Z | 8 | PONTAL DE CAMBURI | 0 | 0 | 0 | 0 | 0 | 0 | No | 0 | 1 |
4 | 8.841186e+12 | 5642494 | F | 2016-04-29T16:07:23Z | 2016-04-29T00:00:00Z | 56 | JARDIM DA PENHA | 0 | 1 | 1 | 0 | 0 | 0 | No | 0 | 1 |
changing NoShow column to be binary
# Encode the target: 1 = patient missed the appointment, 0 = showed up.
data['NoShow'] = data['NoShow'].replace({'Yes': 1, 'No': 0})
data.head()
PatientID | AppointmentID | Gender | ScheduledDay | AppointmentDay | Age | Neighbourhood | Scholarship | Hypertension | Diabetes | Alcoholism | Handicap | SMS_received | NoShow | Male | Female | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2.987250e+13 | 5642903 | F | 2016-04-29T18:38:08Z | 2016-04-29T00:00:00Z | 62 | JARDIM DA PENHA | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
1 | 5.589978e+14 | 5642503 | M | 2016-04-29T16:08:27Z | 2016-04-29T00:00:00Z | 56 | JARDIM DA PENHA | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
2 | 4.262962e+12 | 5642549 | F | 2016-04-29T16:19:04Z | 2016-04-29T00:00:00Z | 62 | MATA DA PRAIA | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
3 | 8.679512e+11 | 5642828 | F | 2016-04-29T17:29:31Z | 2016-04-29T00:00:00Z | 8 | PONTAL DE CAMBURI | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
4 | 8.841186e+12 | 5642494 | F | 2016-04-29T16:07:23Z | 2016-04-29T00:00:00Z | 56 | JARDIM DA PENHA | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
Dropping unneeded columns.
# Keep only the columns of interest: drops Gender (now encoded as
# Male/Female), AppointmentID (unique row key), and Neighbourhood is kept
# for now but removed in the final feature selection below.
data = data[['PatientID','ScheduledDay','AppointmentDay', 'Age','Neighbourhood', 'Scholarship','Hypertension','Diabetes','Alcoholism','Handicap','SMS_received','NoShow','Male','Female']]
data.head()
PatientID | ScheduledDay | AppointmentDay | Age | Neighbourhood | Scholarship | Hypertension | Diabetes | Alcoholism | Handicap | SMS_received | NoShow | Male | Female | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2.987250e+13 | 2016-04-29T18:38:08Z | 2016-04-29T00:00:00Z | 62 | JARDIM DA PENHA | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
1 | 5.589978e+14 | 2016-04-29T16:08:27Z | 2016-04-29T00:00:00Z | 56 | JARDIM DA PENHA | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
2 | 4.262962e+12 | 2016-04-29T16:19:04Z | 2016-04-29T00:00:00Z | 62 | MATA DA PRAIA | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
3 | 8.679512e+11 | 2016-04-29T17:29:31Z | 2016-04-29T00:00:00Z | 8 | PONTAL DE CAMBURI | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
4 | 8.841186e+12 | 2016-04-29T16:07:23Z | 2016-04-29T00:00:00Z | 56 | JARDIM DA PENHA | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
cleaning up dates
# Parse the ISO-8601 timestamp strings into proper pandas datetimes so we
# can do date arithmetic on them.
data['ScheduledDay'] = pd.to_datetime(data['ScheduledDay'])
data['AppointmentDay'] = pd.to_datetime(data['AppointmentDay'])
data.head()
PatientID | ScheduledDay | AppointmentDay | Age | Neighbourhood | Scholarship | Hypertension | Diabetes | Alcoholism | Handicap | SMS_received | NoShow | Male | Female | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2.987250e+13 | 2016-04-29 18:38:08 | 2016-04-29 | 62 | JARDIM DA PENHA | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
1 | 5.589978e+14 | 2016-04-29 16:08:27 | 2016-04-29 | 56 | JARDIM DA PENHA | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
2 | 4.262962e+12 | 2016-04-29 16:19:04 | 2016-04-29 | 62 | MATA DA PRAIA | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
3 | 8.679512e+11 | 2016-04-29 17:29:31 | 2016-04-29 | 8 | PONTAL DE CAMBURI | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
4 | 8.841186e+12 | 2016-04-29 16:07:23 | 2016-04-29 | 56 | JARDIM DA PENHA | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
#original features that I was using, got rid of.
# NOTE(review): this cell is intentionally disabled by wrapping the code in
# a string literal; it used to split each date into year/month/day columns.
"""
data['ScheduledYear'], data['ScheduledMonth'], data['ScheduleDay'] = data['ScheduledDay'].dt.year, data['ScheduledDay'].dt.month, data['ScheduledDay'].dt.day
data['AppointmentYear'], data['AppointmentMonth'], data['AppointmentDayy'] = data['AppointmentDay'].dt.year, data['AppointmentDay'].dt.month, data['AppointmentDay'].dt.day
data.head
"""
"\ndata['ScheduledYear'], data['ScheduledMonth'], data['ScheduleDay'] = data['ScheduledDay'].dt.year, data['ScheduledDay'].dt.month, data['ScheduledDay'].dt.day\ndata['AppointmentYear'], data['AppointmentMonth'], data['AppointmentDayy'] = data['AppointmentDay'].dt.year, data['AppointmentDay'].dt.month, data['AppointmentDay'].dt.day\ndata.head\n"
Probably have to get rid of the Neighbourhood column, given we don't have the specific hospital for all these no-shows. If we did, we could have used distance from the hospital as a feature.
Creating a feature that calculates the wait time of a particular patient from when they schedule the appointment to when they actually have the appointment. I believe this will be a really great feature to have. One would assume that the longer the wait time the more likely people are to no show for their appointments
# Truncate ScheduledDay to midnight so the subtraction below yields whole
# days between booking and appointment (AppointmentDay is already
# midnight-only in the raw data).
data.ScheduledDay = pd.DatetimeIndex(data.ScheduledDay).normalize()
data['WaitingTime'] = data['AppointmentDay'] - data['ScheduledDay']
data.head()
PatientID | ScheduledDay | AppointmentDay | Age | Neighbourhood | Scholarship | Hypertension | Diabetes | Alcoholism | Handicap | SMS_received | NoShow | Male | Female | WaitingTime | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2.987250e+13 | 2016-04-29 | 2016-04-29 | 62 | JARDIM DA PENHA | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 days |
1 | 5.589978e+14 | 2016-04-29 | 2016-04-29 | 56 | JARDIM DA PENHA | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 days |
2 | 4.262962e+12 | 2016-04-29 | 2016-04-29 | 62 | MATA DA PRAIA | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 days |
3 | 8.679512e+11 | 2016-04-29 | 2016-04-29 | 8 | PONTAL DE CAMBURI | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 days |
4 | 8.841186e+12 | 2016-04-29 | 2016-04-29 | 56 | JARDIM DA PENHA | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 days |
# Convert the Timedelta column to an integer day count. The vectorized
# .dt.days accessor replaces the original row-wise
# apply(lambda x: x.days), producing identical values much faster.
data['WaitingTime'] = data['WaitingTime'].dt.days
data.head(20)
PatientID | ScheduledDay | AppointmentDay | Age | Neighbourhood | Scholarship | Hypertension | Diabetes | Alcoholism | Handicap | SMS_received | NoShow | Male | Female | WaitingTime | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2.987250e+13 | 2016-04-29 | 2016-04-29 | 62 | JARDIM DA PENHA | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
1 | 5.589978e+14 | 2016-04-29 | 2016-04-29 | 56 | JARDIM DA PENHA | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
2 | 4.262962e+12 | 2016-04-29 | 2016-04-29 | 62 | MATA DA PRAIA | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
3 | 8.679512e+11 | 2016-04-29 | 2016-04-29 | 8 | PONTAL DE CAMBURI | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
4 | 8.841186e+12 | 2016-04-29 | 2016-04-29 | 56 | JARDIM DA PENHA | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
5 | 9.598513e+13 | 2016-04-27 | 2016-04-29 | 76 | REPÚBLICA | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 |
6 | 7.336882e+14 | 2016-04-27 | 2016-04-29 | 23 | GOIABEIRAS | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 2 |
7 | 3.449833e+12 | 2016-04-27 | 2016-04-29 | 39 | GOIABEIRAS | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 2 |
8 | 5.639473e+13 | 2016-04-29 | 2016-04-29 | 21 | ANDORINHAS | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
9 | 7.812456e+13 | 2016-04-27 | 2016-04-29 | 19 | CONQUISTA | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 |
10 | 7.345362e+14 | 2016-04-27 | 2016-04-29 | 30 | NOVA PALESTINA | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 |
11 | 7.542951e+12 | 2016-04-26 | 2016-04-29 | 29 | NOVA PALESTINA | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 3 |
12 | 5.666548e+14 | 2016-04-28 | 2016-04-29 | 22 | NOVA PALESTINA | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
13 | 9.113946e+14 | 2016-04-28 | 2016-04-29 | 28 | NOVA PALESTINA | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 |
14 | 9.988472e+13 | 2016-04-28 | 2016-04-29 | 54 | NOVA PALESTINA | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 |
15 | 9.994839e+10 | 2016-04-26 | 2016-04-29 | 15 | NOVA PALESTINA | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 3 |
16 | 8.457439e+13 | 2016-04-28 | 2016-04-29 | 50 | NOVA PALESTINA | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 |
17 | 1.479497e+13 | 2016-04-28 | 2016-04-29 | 40 | CONQUISTA | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 |
18 | 1.713538e+13 | 2016-04-26 | 2016-04-29 | 30 | NOVA PALESTINA | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 3 |
19 | 7.223289e+12 | 2016-04-29 | 2016-04-29 | 46 | DA PENHA | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
# Average lead time between booking and appointment (~10 days).
data['WaitingTime'].mean()
10.183701719941734
Need to make one last clean dataset picking which features I will actually use in the model.
# Final modeling frame: target first, then candidate features.
# PatientID and Neighbourhood are dropped here.
data = data[['NoShow','ScheduledDay','AppointmentDay','Age','Scholarship','Hypertension','Diabetes','Alcoholism','Handicap','SMS_received','Male','Female','WaitingTime']]
data.head()
NoShow | ScheduledDay | AppointmentDay | Age | Scholarship | Hypertension | Diabetes | Alcoholism | Handicap | SMS_received | Male | Female | WaitingTime | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 2016-04-29 | 2016-04-29 | 62 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
1 | 0 | 2016-04-29 | 2016-04-29 | 56 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
2 | 0 | 2016-04-29 | 2016-04-29 | 62 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
3 | 0 | 2016-04-29 | 2016-04-29 | 8 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
4 | 0 | 2016-04-29 | 2016-04-29 | 56 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
Here is probably the most interesting and complex part to this analysis.
I will run a bunch of different models to see what their various testing/training accuracies are, in order to pick the best one and then try to optimize that particular model.
# One more look at the frame before splitting off the feature matrix.
data.head()
NoShow | ScheduledDay | AppointmentDay | Age | Scholarship | Hypertension | Diabetes | Alcoholism | Handicap | SMS_received | Male | Female | WaitingTime | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 2016-04-29 | 2016-04-29 | 62 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
1 | 0 | 2016-04-29 | 2016-04-29 | 56 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
2 | 0 | 2016-04-29 | 2016-04-29 | 62 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
3 | 0 | 2016-04-29 | 2016-04-29 | 8 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
4 | 0 | 2016-04-29 | 2016-04-29 | 56 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
#the features we are going to train our model on
# The 3:19 column slice is clamped to the frame's 13 columns, so this
# selects Age through WaitingTime (everything after NoShow and the two
# date columns).
showfeatures = data.iloc[:,3:19]
showfeatures.head()
Age | Scholarship | Hypertension | Diabetes | Alcoholism | Handicap | SMS_received | Male | Female | WaitingTime | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 62 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
1 | 56 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
2 | 62 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
3 | 8 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
4 | 56 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed
# in 0.20; model_selection exposes the same train_test_split and
# cross_val_score with identical signatures.
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
# TRAIN/TEST PARTITION
# create the feature vectors and class labels as plain numpy arrays
features = np.array(showfeatures)
labels = np.array(data['NoShow'])
# Peek at the assembled feature matrix.
features
array([[62, 0, 1, ..., 0, 1, 0], [56, 0, 0, ..., 1, 0, 0], [62, 0, 0, ..., 0, 1, 0], ..., [21, 0, 0, ..., 0, 1, 41], [38, 0, 0, ..., 0, 1, 41], [54, 0, 0, ..., 0, 1, 41]])
# Class balance reminder: ~80% show / ~20% no-show, so ~0.80 accuracy is
# the trivial always-predict-show baseline.
data['NoShow'].value_counts()
0 88208 1 22319 Name: NoShow, dtype: int64
#split the data into training and testing sets (80% training, 20% testing)
training_features, testing_features, training_labels, testing_labels = train_test_split(features,labels, test_size = 0.2,random_state = 42 )
# Baseline model 1: logistic regression with default hyperparameters.
from sklearn import linear_model
lg = linear_model.LogisticRegression()
lg.fit(training_features,training_labels)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1, penalty='l2', random_state=None, solver='liblinear', tol=0.0001, verbose=0, warm_start=False)
# hold out cross validation
#print(accuracy_score(testing_labels, predictions))
# 10-fold CV accuracy on the training split, then accuracy on the
# held-out test split.
score = cross_val_score(lg, training_features,training_labels, cv = 10, scoring= 'accuracy')
print(score)
predictions = lg.predict(testing_features)
print(predictions)
score = accuracy_score(testing_labels, predictions)
print(score)
[ 0.7955445 0.79463983 0.79359873 0.79246777 0.79552138 0.79518209 0.79552138 0.79371183 0.79416422 0.79199186] [0 0 0 ..., 0 0 0] 0.795304442233
"""from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(alpha=0.01, hidden_layer_sizes = (100,100))
mlp.fit(training_features,training_labels)"""
'from sklearn.neural_network import MLPClassifier\nmlp = MLPClassifier(alpha=0.01, hidden_layer_sizes = (100,100))\nmlp.fit(training_features,training_labels)'
"""# hold out cross validation
#print(accuracy_score(testing_labels, predictions))
score = cross_val_score(mlp, training_features,training_labels, cv = 10, scoring= 'accuracy')
print(score)
predictions = mlp.predict(testing_features)
print(predictions)
score = accuracy_score(testing_labels, predictions)
print(score)"""
"# hold out cross validation\n\n#print(accuracy_score(testing_labels, predictions))\n\nscore = cross_val_score(mlp, training_features,training_labels, cv = 10, scoring= 'accuracy')\nprint(score)\n\npredictions = mlp.predict(testing_features)\nprint(predictions)\n\nscore = accuracy_score(testing_labels, predictions)\nprint(score)"
# Baseline model 2: shallow random forest (max_depth=2 keeps each tree a
# weak learner; 100 trees).
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(max_depth=2,n_estimators=100)
rf.fit(training_features,training_labels)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', max_depth=2, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1, oob_score=False, random_state=None, verbose=0, warm_start=False)
# hold out cross validation
#print(accuracy_score(testing_labels, predictions))
# Same evaluation protocol as the logistic model: 10-fold CV then
# held-out test accuracy.
score = cross_val_score(rf, training_features,training_labels, cv = 10, scoring= 'accuracy')
print(score)
predictions = rf.predict(testing_features)
print(predictions)
score = accuracy_score(testing_labels, predictions)
print(score)
[ 0.79769309 0.79769309 0.79778331 0.79778331 0.79778331 0.79778331 0.79778331 0.79778331 0.79778331 0.79776043] [0 0 0 ..., 0 0 0] 0.79928526192
from sklearn.neighbors import KNeighborsClassifier
#TRAIN/TEST ALGORITHM
# Sweep odd k values from 1 to 49 and record the mean 10-fold CV accuracy
# for each; odd k avoids tie votes in binary classification.
kList = range(1,50)
cv_scores = []
for k in kList:
    if k % 2 == 0:
        continue
    print(k)
    candidate = KNeighborsClassifier(n_neighbors=k)
    candidate.fit(training_features, training_labels)
    # test-set predictions are computed but not scored here; the CV mean
    # below is what drives the k selection
    predictions = candidate.predict(testing_features)
    fold_scores = cross_val_score(candidate, training_features, training_labels, cv=10, scoring='accuracy')
    cv_scores.append(fold_scores.mean())
print("done finding")
1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 33 35 37 39 41 43 45 47 49 done finding
# cv_scores[i] corresponds to k = 2*i + 1 (the odd values 1,3,...,49 swept
# above), so map the winning index back to the actual k. The original used
# the raw index as n_neighbors, which selects a k that was never evaluated
# (its own output shows n_neighbors=24, an even value) and would crash with
# n_neighbors=0 if index 0 won.
optimalk = 2 * cv_scores.index(max(cv_scores)) + 1
knn = KNeighborsClassifier(n_neighbors=optimalk)
knn.fit(training_features,training_labels)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=24, p=2, weights='uniform')
# Score the tuned k-NN model on the held-out test set.
predictions = knn.predict(testing_features)
#data['predictions'] = knn.predict(testing_features)
score = accuracy_score(testing_labels, predictions)
print(score)
0.795711571519
# Reminder of the feature column order before hand-building an input row.
data.head()
NoShow | ScheduledDay | AppointmentDay | Age | Scholarship | Hypertension | Diabetes | Alcoholism | Handicap | SMS_received | Male | Female | WaitingTime | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 2016-04-29 | 2016-04-29 | 62 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
1 | 0 | 2016-04-29 | 2016-04-29 | 56 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
2 | 0 | 2016-04-29 | 2016-04-29 | 62 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
3 | 0 | 2016-04-29 | 2016-04-29 | 8 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
4 | 0 | 2016-04-29 | 2016-04-29 | 56 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
# Interactively collect a single patient's attributes and build a feature
# vector in the training column order:
# [Age, Scholarship, Hypertension, Diabetes, Alcoholism, Handicap,
#  SMS_received, Male, Female, WaitingTime].
#
# The original decoding used conditions of the form
#   if answer == "Yes" or "yes":
# which are always truthy ("yes" is a non-empty string), so every flag was
# forced to 1.0 regardless of input; it also compared the float variable
# `hypertension` instead of `inputhypertension`, and assigned a misspelled
# `daibetes` variable. All fixed below.
def _yes_no_to_flag(response):
    # Map a Yes/No style answer to 1.0/0.0; unrecognized input counts as No.
    return 1.0 if response.strip().lower() == "yes" else 0.0

age = int(input("Enter the patient's age: "+ "\n" ))
scholarship = _yes_no_to_flag(input("Is the patient on Scholarship(Yes or No): "+ "\n" ))
hypertension = _yes_no_to_flag(input("Does the patient have Hypertension?(Yes or No): "+ "\n" ))
diabetes = _yes_no_to_flag(input("Is the patient Diabetic?(Yes or No): "+ "\n" ))
alcoholism = _yes_no_to_flag(input("Is the patient an Alcoholic?(Yes or No): "+ "\n" ))
handicap = int(input("How many Handicaps does the patient have?: "+ "\n" ))
sms = _yes_no_to_flag(input("Will the patient recieve a text message?(Yes or No): "+ "\n" ))
inputgender = input("What is the patient gender?(Male or Female): "+ "\n")
waitingtime = int(input("How many days away is the appointment?: "+ "\n"))
male = 0.0
female = 0.0
gender = inputgender.strip().lower()
if gender in ("male", "m"):
    male = 1.0
elif gender in ("female", "f"):
    female = 1.0
else:
    print("incorrect gender input")
answer = [age,scholarship,hypertension,diabetes,alcoholism,handicap,sms,male,female,waitingtime]
#the commented out chunk was for testing to not have to type everything in everytime I wanted to test
"""
print(" ")
print("Fixed answer: ")
fixedanswer = [5, 1.0, 1.0 ,1.0, 1.0, 4, 0.0, 1.0, 1.0,10]
print(fixedanswer)
print(rf.predict([fixedanswer]))
print(lg.predict([fixedanswer]))
print(knn.predict([fixedanswer]))
"""
# Run all three trained models on the hand-entered feature vector.
print(" ")
print("user answer: ")
print(answer)
print(rf.predict([answer]))
print(lg.predict([answer]))
print(knn.predict([answer]))
Enter the patient's age: 5 Is the patient on Scholarship(Yes or No): Yes Does the patient have Hypertension?(Yes or No): Yes Is the patient Diabetic?(Yes or No): Yes Is the patient an Alcoholic?(Yes or No): Yes How many Handicaps does the patient have?: 5 Will the patient recieve a text message?(Yes or No): Yes What is the patient gender?(Male or Female): Female How many days away is the appointment?: 10 user answer: [5, 1.0, 1.0, 1.0, 1.0, 5, 1.0, 1.0, 0.0, 10] [0] [0] [0]
print("Here we will use an ensemble of our three different models to vote whether this patient will show up or not.")
print("Let's see how our individual models voted.")
noshowcounter = 0.0
showcounter = 0.0
randomresult = (rf.predict([answer]))
logresult = (lg.predict([answer]))
knnresult = (knn.predict([answer]))
print("")
print("random forrest:")
print(randomresult[0])
if randomresult[0] == 1:
noshowcounter+=1
print("no show")
else:
showcounter+=1
print("show")
print("")
print("logistic regression:")
print(logresult[0])
if logresult[0] == 1:
noshowcounter+=1
print("no show")
else:
showcounter+=1
print("show")
print("")
print("knn:")
print(knnresult[0])
if knnresult[0] == 1:
noshowcounter+=1
print("no show")
else:
showcounter+=1
print("show")
print("")
print("The final vote is %f no shows to %d shows" %(noshowcounter,showcounter))
Here we will use an ensemble of our three different models to vote whether this patient will show up or not. Let's see how our individual models voted. random forrest: 0 show logistic regression: 0 show knn: 0 show The final vote is 0.000000 no shows to 3 shows
What you can see here is that there need to be more discrepancies or more features, because an ensemble doesn't really help here — all the models vote the same way.
# Majority decision: at least 2 of the 3 models voting no-show flags the
# patient for intervention.
if noshowcounter >= 2:
    print("Our Esemble Model predicts the patient is not going to show up for their appointment.")
    print("Be advised, intervention may be neccesary or suggested in order for the patient to show up ")
else:
    print("Our Model Predicts that this patient will show up to their appointment, intervention is not needed")
Our Model Predicts that this patient will show up to their appointment, intervention is not needed
# Build majority-vote ensemble predictions over the whole test set.
rfpredictions = rf.predict(testing_features)
lgpredictions = lg.predict(testing_features)
knnpredictions = knn.predict(testing_features)
print(rfpredictions)
print(lgpredictions)
print(knnpredictions)
print("")
# BUG FIX: the original reused a single running `ensemblescore` variable as
# both the vote counter and the appended prediction, so the previous row's
# 0/1 prediction seeded the next row's vote count (a row after a predicted
# no-show needed only one vote instead of two). Reset the tally per row.
ensemblepredictions = []
for i in range(len(rfpredictions)):
    votes = 0
    if rfpredictions[i] == 1:
        votes += 1
    if lgpredictions[i] == 1:
        votes += 1
    if knnpredictions[i] == 1:
        votes += 1
    # no-show only when at least two of the three models agree
    ensemblepredictions.append(1 if votes >= 2 else 0)
print("")
print("done with for loop")
[0 0 0 ..., 0 0 0] [0 0 0 ..., 0 0 0] [0 0 0 ..., 0 0 0] done with for loop
print("")
#uncomment if you want to see my ensemble predicting basically all zeros.
#print(ensemblepredictions)
ensemblescore = accuracy_score(testing_labels, ensemblepredictions)
print(ensemblescore)
0.799149552158
# Confusion matrix for the ensemble on the test set:
# rows = true labels, columns = predicted labels (0 = show, 1 = no-show).
from sklearn.metrics import confusion_matrix
confusion_matrix(testing_labels, ensemblepredictions)
array([[17658, 11], [ 4429, 8]])
I was a bit confused about what was going on here and why all of the predictions were zeros, but what we learned is that just because you have these features doesn't necessarily mean you can convert them into a yes-or-no no-show prediction. Rather, the individual classifiers and the ensemble model learned that if you just predict that people will always show up, you get an 80% accuracy.
That being said, you can still predict the probability that individuals may or may not show up using predict_proba on the logistic regression model.
# ---- Patient intake: collect the features for a single patient ----
# Feature order must match the training matrix:
# [age, scholarship, hypertension, diabetes, alcoholism, handicap,
#  sms, male, female, waitingtime]
age = 0.0
scholarship = 0.0
hypertension = 0.0
diabetes = 0.0
alcoholism = 0.0
handicap = 0.0
sms = 0.0
male = 0.0
female = 0.0
waitingtime = 0

inputage = int(input("Enter the patient's age: "+ "\n" ))
inputscholarship = input("Is the patient on Scholarship(Yes or No): "+ "\n" )
inputhypertension = input("Does the patient have Hypertension?(Yes or No): "+ "\n" )
inputdiabetes = input("Is the patient Diabetic?(Yes or No): "+ "\n" )
inputalcoholism = input("Is the patient an Alcoholic?(Yes or No): "+ "\n" )
inputhandicap = int(input("How many Handicaps does the patient have?: "+ "\n" ))
inputsms = input("Will the patient recieve a text message?(Yes or No): "+ "\n" )
inputgender = input("What is the patient gender?(Male or Female): "+ "\n")
inputwaitingtime = int(input("How many days away is the appointment?: "+ "\n"))

age = inputage
handicap = inputhandicap
waitingtime = inputwaitingtime

# BUG FIX: the original checks were written as `x == "Yes" or "yes"`, which
# is always truthy (a non-empty string literal), so every flag was forced
# to 1.0 regardless of the answer.  Compare the normalized answer instead.
scholarship = 1.0 if inputscholarship.strip().lower() == "yes" else 0.0
# BUG FIX: the original compared `hypertension` (the 0.0 default) instead
# of the user's answer `inputhypertension`.
hypertension = 1.0 if inputhypertension.strip().lower() == "yes" else 0.0
# BUG FIX: the original "No" branch assigned to a misspelled `daibetes`.
diabetes = 1.0 if inputdiabetes.strip().lower() == "yes" else 0.0
alcoholism = 1.0 if inputalcoholism.strip().lower() == "yes" else 0.0
sms = 1.0 if inputsms.strip().lower() == "yes" else 0.0

# BUG FIX: `inputgender == "Male" or "M"` always took the male branch for
# the same always-truthy reason; accept "Male"/"M"/"Female"/"F" case-insensitively.
gender = inputgender.strip().lower()
if gender in ("male", "m"):
    male = 1.0
elif gender in ("female", "f"):
    female = 1.0
else:
    print("incorrect gender input")

answer = [age, scholarship, hypertension, diabetes, alcoholism, handicap, sms, male, female, waitingtime]
print(" ")
print(" ")
print("user answer: ")
print(answer)
Enter the patient's age: 5 Is the patient on Scholarship(Yes or No): Yes Does the patient have Hypertension?(Yes or No): Yes Is the patient Diabetic?(Yes or No): Yes Is the patient an Alcoholic?(Yes or No): Yes How many Handicaps does the patient have?: 5 Will the patient recieve a text message?(Yes or No): Yes What is the patient gender?(Male or Female): Female How many days away is the appointment?: 10 user answer: [5, 1.0, 1.0, 1.0, 1.0, 5, 1.0, 1.0, 0.0, 10]
# Class probabilities from the logistic-regression model for this patient.
# Row 0 is the single patient; column 0 = P(show), column 1 = P(no-show).
probabilityresult = lg.predict_proba([answer])
patient_probs = probabilityresult[0]
showupproba = patient_probs[0]
noshowproba = patient_probs[1]
print("There is a %f percent chance patient #1 does show up and a %f percent chance this person doesn't show up" % (showupproba,noshowproba))
There is a 0.540198 percent chance patient #1 does show up and a 0.459802 percent chance this person doesn't show up
# ---- Patient intake (second patient): collect the features again ----
# Feature order must match the training matrix:
# [age, scholarship, hypertension, diabetes, alcoholism, handicap,
#  sms, male, female, waitingtime]
age = 0.0
scholarship = 0.0
hypertension = 0.0
diabetes = 0.0
alcoholism = 0.0
handicap = 0.0
sms = 0.0
male = 0.0
female = 0.0
waitingtime = 0

inputage = int(input("Enter the patient's age: "+ "\n" ))
inputscholarship = input("Is the patient on Scholarship(Yes or No): "+ "\n" )
inputhypertension = input("Does the patient have Hypertension?(Yes or No): "+ "\n" )
inputdiabetes = input("Is the patient Diabetic?(Yes or No): "+ "\n" )
inputalcoholism = input("Is the patient an Alcoholic?(Yes or No): "+ "\n" )
inputhandicap = int(input("How many Handicaps does the patient have?: "+ "\n" ))
inputsms = input("Will the patient recieve a text message?(Yes or No): "+ "\n" )
inputgender = input("What is the patient gender?(Male or Female): "+ "\n")
inputwaitingtime = int(input("How many days away is the appointment?: "+ "\n"))

age = inputage
handicap = inputhandicap
waitingtime = inputwaitingtime

# BUG FIX: the original checks were written as `x == "Yes" or "yes"`, which
# is always truthy (a non-empty string literal), so every flag was forced
# to 1.0 regardless of the answer.  Compare the normalized answer instead.
scholarship = 1.0 if inputscholarship.strip().lower() == "yes" else 0.0
# BUG FIX: the original compared `hypertension` (the 0.0 default) instead
# of the user's answer `inputhypertension`.
hypertension = 1.0 if inputhypertension.strip().lower() == "yes" else 0.0
# BUG FIX: the original "No" branch assigned to a misspelled `daibetes`.
diabetes = 1.0 if inputdiabetes.strip().lower() == "yes" else 0.0
alcoholism = 1.0 if inputalcoholism.strip().lower() == "yes" else 0.0
sms = 1.0 if inputsms.strip().lower() == "yes" else 0.0

# BUG FIX: `inputgender == "Male" or "M"` always took the male branch for
# the same always-truthy reason; accept "Male"/"M"/"Female"/"F" case-insensitively.
gender = inputgender.strip().lower()
if gender in ("male", "m"):
    male = 1.0
elif gender in ("female", "f"):
    female = 1.0
else:
    print("incorrect gender input")

answer = [age, scholarship, hypertension, diabetes, alcoholism, handicap, sms, male, female, waitingtime]
print(" ")
print(" ")
print("user answer: ")
print(answer)
Enter the patient's age: 100 Is the patient on Scholarship(Yes or No): Yes Does the patient have Hypertension?(Yes or No): Yes Is the patient Diabetic?(Yes or No): Yes Is the patient an Alcoholic?(Yes or No): Yes How many Handicaps does the patient have?: 5 Will the patient recieve a text message?(Yes or No): Yes What is the patient gender?(Male or Female): Female How many days away is the appointment?: 10 user answer: [100, 1.0, 1.0, 1.0, 1.0, 5, 1.0, 1.0, 0.0, 10]
# Class probabilities from the logistic-regression model for this patient.
# Row 0 is the single patient; column 0 = P(show), column 1 = P(no-show).
probabilityresult = lg.predict_proba([answer])
patient_probs = probabilityresult[0]
showupproba = patient_probs[0]
noshowproba = patient_probs[1]
print("There is a %f percent chance patient #1 does show up and a %f percent chance this person doesn't show up" % (showupproba,noshowproba))
There is a 0.707785 percent chance patient #1 does show up and a 0.292215 percent chance this person doesn't show up
Age seems to be a huge factor, which makes sense because children aren't able to get to appointments themselves. Also, if a 5-year-old child is flagged as an alcoholic, that would likely indicate alcoholic and unreliable parents, which would explain why they would be far less likely to show up than a 100-year-old with the same features.
While a 16% difference in the likelihood of showing up may not be the most intuitive or insightful thing in the world, we can "empirically" say these people are more or less likely to show up than one another.
Side note: I tried different training/testing partitions (10/90 and 70/30), and that didn't really have any effect on my accuracy levels.