# Imports are kept in their original order: the star import from nltk.book is
# followed by later imports that would win over any name it happens to export.
import pandas as pd
import numpy as np
import statsmodels.api as sm  # used for statistical modeling
import matplotlib.pyplot as plt

# To have plots show up automatically in the notebook.  The "%matplotlib
# inline" magic is not valid Python outside IPython, so it is issued through
# the IPython API and silently skipped when the code runs as a plain script.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

# this gives us the frequency method (FreqDist)
from nltk.book import *
import statistics
from statistics import median
from collections import Counter
from IPython.core.display import Image

# ---------------------------------------------------------------------------
# Load and tidy the referral data
# ---------------------------------------------------------------------------

# reading in file
diref_data = pd.read_csv('disciplinaryreferrals.csv')

# renaming columns
diref_data.rename(columns={
    'c03I02001_id': 'CaseNumber',
    'c03I02002_timestamp': 'TimeStamp',
    'c03I02003_student_id': 'StudentID',
    'c03I02004_grade': 'Grade',
    'c03I02005_date_and_time_of_misbehavior': 'TimeofIncident',
    'c03I02006_location_of_misbehavior': 'Location',
    'c03I02007_documenting_staff_id': 'DocumentingStaffID',
    'c03I02008_documenting_staff': 'DocumentingStaff',
    'c03I02009_classroom_or_administrative_managed': 'ClassroomOrAdministrative',
    'c03I0200a_type_of_misbehavior': 'Type',
    'c03I0200b_narrative_description_of_misbehavior': 'Description',
    'c03I0200c_consequence': 'Consequence',
    'c03I0200d_reporting_staff_id': 'ReportingStaffID',
    'c03I0200e_reporting_staff': 'ReportingStaff',
    'c03I0200f_d12_planning_completed': 'D12Planning',
    'c03I0200g_narrative_of_consequence': 'NarrativeOfConsequence',
}, inplace=True)

# Deleting unnecessary information (case number: doesn't give new info,
# teacher full names: have IDs, D12Planning: all say ABCD,
# narrative of consequence: missing for most,
# Classroom or Administrative: mostly Administrative).
# Add 'CaseNumber' to the tuple below if you do not want to work with it.
for unused_column in (
    'DocumentingStaff',
    'ReportingStaff',
    'ClassroomOrAdministrative',
    'D12Planning',
    'NarrativeOfConsequence',
):
    del diref_data[unused_column]

# ---------------------------------------------------------------------------
# Which offenses are most common?
# ---------------------------------------------------------------------------

most_common_offense = 'null'  # to store the most common offense
offense_amount = -1  # the amount of times that offense was committed

# Number of referrals per unique offense type (first-seen order is kept).
# Some students have multiple types of disruption in one referral, and those
# combinations are counted as their own category.
offense_cases = dict(Counter(diref_data['Type']))

if offense_cases:
    # max() returns the first maximal key, matching a strict ">" scan.
    # Fix: the name of the offense is stored here, not its count.
    most_common_offense = max(offense_cases, key=offense_cases.get)
    offense_amount = offense_cases[most_common_offense]

print("Top Offense has been commited: ",offense_amount, "times")
print('Top 5 offenses a kid is most likely going to commit:\n',
      sorted(offense_cases, key=offense_cases.__getitem__, reverse = True)[0:5])

# plot a representative bar graph (bars are in first-seen order, unlabeled);
# the dict view is materialised into a list before being handed to matplotlib
plt.bar(range(len(offense_cases)), list(offense_cases.values()), align='center')
plt.xlabel('Offense')
plt.ylabel('Offense Cases')
plt.title("General Distribution of Types of Offenses")
plt.show()

# ---------------------------------------------------------------------------
# Which location provokes the most referrals?
# ---------------------------------------------------------------------------

most_common_location = 'null'  # the location with the most referrals
location_amount = -1  # the amount of referrals given in that location

# Number of referrals handed out per location.
location_cases = dict(Counter(diref_data['Location']))

if location_cases:
    most_common_location = max(location_cases, key=location_cases.get)
    location_amount = location_cases[most_common_location]

print("Location:", most_common_location,"has provoked highest amount of referrals: ",location_amount)
print('Top 5 places a kid is most likely going to get a referral:\n',
      sorted(location_cases, key=location_cases.__getitem__, reverse = True)[0:5])

# plot a representative bar graph
plt.bar(range(len(location_cases)), list(location_cases.values()), align='center')
plt.xlabel('Location')
plt.ylabel('Referral Cases')
plt.title("Location Correlation?")
plt.show()

# ---------------------------------------------------------------------------
# How are referrals distributed over students?
# ---------------------------------------------------------------------------

highest_offense = 0  # highest amount of referrals seen so far
highest_offender = -1  # ID holding that highest amount

# Number of referrals received per student.
all_student_cases = dict(Counter(diref_data['StudentID']))

# plot a representative bar graph of the sorted counts.  The x axis is only a
# rank, not a student ID (some IDs are missing); later graphs, where the
# StudentIDs matter, plot against the IDs themselves.
plt.bar(range(len(all_student_cases)), sorted(all_student_cases.values()), align='center')
plt.xlabel('Student')
plt.ylabel('Referral Cases')
plt.title("Distribution of Referred Students")
plt.show()

# ---------------------------------------------------------------------------
# How many referrals has each teacher given?
# ---------------------------------------------------------------------------

highest_offense = 0  # highest amount of referrals a teacher has given
highest_offender = -1  # the teacher with the most referrals

# Number of referrals handed out per reporting teacher.
teacher_cases = dict(Counter(diref_data['ReportingStaffID']))

# let's see which teacher gives out the most referrals
if teacher_cases:
    highest_offender = max(teacher_cases, key=teacher_cases.get)
    highest_offense = teacher_cases[highest_offender]

# Let's find the "median" case to use later on for analysis.
# Fix: median(teacher_cases) took the median of the teacher IDs themselves.
# Rank the teachers by referral count and take the middle one instead (the
# upper middle when the number of teachers is even).
teachers_by_referrals = sorted(teacher_cases, key=teacher_cases.get)
median_teacher = (
    teachers_by_referrals[len(teachers_by_referrals) // 2]
    if teachers_by_referrals else None
)
print("Teacher", median_teacher, "has given out the median amount of referrals.")
print("Teacher", highest_offender,"has given the highest amount of referrals: ",highest_offense)

# plot a representative bar graph.  The bar positions do not match the
# teacher IDs because some IDs are missing.
plt.bar(range(len(teacher_cases)), list(teacher_cases.values()), align='center')
plt.xlabel('Teacher')
plt.ylabel('Referral Cases')
plt.title("How many referrals has each teacher given?")
plt.show()

# ---------------------------------------------------------------------------
# Are students and teachers strongly correlated?
# (could personality conflicts be playing a role?)
# ---------------------------------------------------------------------------
# The module-level "global student_cases,teacher_cases" statement was dropped:
# it has no effect at module scope.

temp_teacher = [0,0]
temp_student = [0,0]

y = diref_data['StudentID']  # response
X = diref_data['ReportingStaffID']  # predictor
X = sm.add_constant(X)  # Add a constant term to the predictor

# The actual fitting happens here
est = sm.OLS(y, X)  # fit least squares model
est = est.fit()
# Printed explicitly so the p-value and R^2 discussed below are visible even
# when this is not the last expression of a notebook cell.
print(est.summary())

# Let's plot the regression line on top of the data.  x_ holds one row of
# (const, ReportingStaffID) for the column minima and one for the maxima.
x_ = np.array([X.min(), X.max()])
y_ = est.predict(x_)
diref_data.plot(x='ReportingStaffID', y='StudentID', kind='scatter')
plt.plot(x_[:, 1], y_, 'r-')
plt.title("Student - Teacher Correlation")

# Looking at the non-significant p-value and the really poor R^2 value, it is
# clear that based on whichever teacher is reporting the student we cannot
# guess what student is being reported (which analytically speaking is kind
# of sad since that would have been a really cool causation to work with)!

#Now we know from the previous data that Teacher 127 (aka Ms.Tyler) gives out the most referrals...
# So let's see now which students she has given referrals to.
focus_teacher = 127  # ReportingStaffID of Ms.Tyler

highest_offense = 0  # reset: highest amount of referrals a student has received
highest_offender = -1  # reset: student with the most referrals from Tyler

# One (student, reporting teacher) pair per referral row.
referral_pairs = list(zip(diref_data['StudentID'], diref_data['ReportingStaffID']))

# How many times each student has received a referral from Tyler.
student_cases = dict(Counter(
    student for student, teacher in referral_pairs
    if teacher == focus_teacher
))

# Highest student ID among Tyler's students (0 when there are none); used to
# give both dictionaries the same, gap-free range of IDs.
highest_student_ID = max([0, *student_cases])

# How many times each of Tyler's students has received a referral NOT from
# Tyler.  Must be computed before the zero-filling below, so that only
# students Tyler actually referred are considered.
referral_cases = dict(Counter(
    student for student, teacher in referral_pairs
    if student in student_cases and teacher != focus_teacher
))

# Filling in missing student IDs with 0 to keep things uniform.  Both
# dictionaries now cover the same inclusive range 0..highest_student_ID.
for student_id in range(highest_student_ID + 1):
    student_cases.setdefault(student_id, 0)
    referral_cases.setdefault(student_id, 0)

# Percentage of each student's referrals that came from Tyler.
# Fix: the other-teacher count is looked up by the SAME student ID.  Zipping
# the two dictionaries paired up unrelated students, because their insertion
# orders differ.
percentage_keeper = {
    student: (from_teacher / (from_teacher + referral_cases.get(student, 0))) * 100
    for student, from_teacher in student_cases.items()
    if from_teacher != 0  # only students Tyler actually referred
}

# How many students share each percentage.
frequency = Counter(percentage_keeper.values())

fdist = FreqDist(frequency)
plt.title("Which proportion of referrals recieved by a student were given by Ms.Tyler?")
fdist.plot()
print(frequency.most_common(4))

# From the data above, it doesn't seem that there's a strong correlation
# between teacher bias and the amount of referrals given... Either way, let's
# visually represent the information above to get a strong, in-depth view.

# Both charts are drawn against the actual student IDs, in ascending order,
# so a bar in one chart lines up with the same student in the other.
student_ids = sorted(student_cases)

# plot a representative bar graph
# Showing how many referrals she's given out to different students
plt.bar(student_ids, [student_cases[s] for s in student_ids], align='center')
plt.xlabel('Student')
plt.ylabel('Referral Cases from Tyler')
plt.title("Tyler referral cases")
plt.show()

# plot a representative bar graph
# Showing how many referrals those same students received from other teachers
plt.bar(student_ids, [referral_cases.get(s, 0) for s in student_ids], align='center')
plt.xlabel('Student')
plt.ylabel('Referral Cases NOT from Tyler')
plt.title("Non-Tyler referral cases")
plt.show()

# Again, looks like there isn't such a strong correlation for Tyler... but
# let's run the same thing for the second largest referral giver: Teacher ID
# 112 (the top 2 cells of code are copied and manipulated for this teacher).

#Lets analyze Ms.Naechia's referral tendencies...
focus_teacher = 112  # ReportingStaffID of Ms.Naechia

highest_offense = 0  # reset: highest amount of referrals a student has received
highest_offender = -1  # reset: student with the most referrals from Naechia

# One (student, reporting teacher) pair per referral row.
referral_pairs = list(zip(diref_data['StudentID'], diref_data['ReportingStaffID']))

# How many times each student has received a referral from Naechia.
student_cases = dict(Counter(
    student for student, teacher in referral_pairs
    if teacher == focus_teacher
))

# Highest student ID among Naechia's students (0 when there are none); used
# to give both dictionaries the same, gap-free range of IDs.
highest_student_ID = max([0, *student_cases])

# How many times each of Naechia's students has received a referral NOT from
# Naechia.  Must be computed before the zero-filling below, so that only
# students Naechia actually referred are considered.
referral_cases = dict(Counter(
    student for student, teacher in referral_pairs
    if student in student_cases and teacher != focus_teacher
))

# Filling in missing student IDs with 0 to keep things uniform.  Both
# dictionaries now cover the same inclusive range 0..highest_student_ID.
for student_id in range(highest_student_ID + 1):
    student_cases.setdefault(student_id, 0)
    referral_cases.setdefault(student_id, 0)

# Percentage of each student's referrals that came from Naechia.
# Fix: the other-teacher count is looked up by the SAME student ID.  Zipping
# the two dictionaries paired up unrelated students, because their insertion
# orders differ.
percentage_keeper = {
    student: (from_teacher / (from_teacher + referral_cases.get(student, 0))) * 100
    for student, from_teacher in student_cases.items()
    if from_teacher != 0  # only students Naechia actually referred
}

# How many students share each percentage.
frequency = Counter(percentage_keeper.values())

fdist = FreqDist(frequency)
plt.title("Which proportion of referrals recieved by a student were given by Ms.Naechia?")
fdist.plot()
print(frequency.most_common(4))

# Both charts are drawn against the actual student IDs, in ascending order,
# so a bar in one chart lines up with the same student in the other.
student_ids = sorted(student_cases)

# plot a representative bar graph
# Showing how many referrals she's given out to different students
plt.bar(student_ids, [student_cases[s] for s in student_ids], align='center')
plt.xlabel('Student')
plt.ylabel('Referral Cases from Naechia')
plt.title("Naechia referral cases")
plt.show()

# plot a representative bar graph
# Showing how many referrals those same students received from other teachers
plt.bar(student_ids, [referral_cases.get(s, 0) for s in student_ids], align='center')
plt.xlabel('Student')
plt.ylabel('Referral Cases NOT from Naechia')
plt.title("Non-Naechia referral cases")
plt.show()

# Naechia's data actually looks really similar to Tyler's... which prompts us
# to take a teacher who has given out the median amount of referrals to see
# if their data still looks similar...
# Let's analyze the median case: Teacher 92, Ms.Crystal
focus_teacher = 92  # ReportingStaffID of Ms.Crystal

highest_offense = 0  # reset: highest amount of referrals a student has received
highest_offender = -1  # reset: student with the most referrals from Ms.Crystal

# One (student, reporting teacher) pair per referral row.
referral_pairs = list(zip(diref_data['StudentID'], diref_data['ReportingStaffID']))

# How many times each student has received a referral from Ms.Crystal.
student_cases = dict(Counter(
    student for student, teacher in referral_pairs
    if teacher == focus_teacher
))

# Highest student ID among Ms.Crystal's students (0 when there are none);
# used to give both dictionaries the same, gap-free range of IDs.
highest_student_ID = max([0, *student_cases])

# How many times each of Ms.Crystal's students has received a referral NOT
# from Ms.Crystal.  Must be computed before the zero-filling below, so that
# only students Ms.Crystal actually referred are considered.
referral_cases = dict(Counter(
    student for student, teacher in referral_pairs
    if student in student_cases and teacher != focus_teacher
))

# Filling in missing student IDs with 0 to keep things uniform.  Both
# dictionaries now cover the same inclusive range 0..highest_student_ID.
for student_id in range(highest_student_ID + 1):
    student_cases.setdefault(student_id, 0)
    referral_cases.setdefault(student_id, 0)

# Percentage of each student's referrals that came from Ms.Crystal.
# Fix: the other-teacher count is looked up by the SAME student ID.  Zipping
# the two dictionaries paired up unrelated students, because their insertion
# orders differ.
percentage_keeper = {
    student: (from_teacher / (from_teacher + referral_cases.get(student, 0))) * 100
    for student, from_teacher in student_cases.items()
    if from_teacher != 0  # only students Ms.Crystal actually referred
}

# How many students share each percentage.
frequency = Counter(percentage_keeper.values())

fdist = FreqDist(frequency)
plt.title("Which proportion of referrals recieved by a student were given by Ms.Crystal?")
fdist.plot()
print(frequency.most_common(4))

# Both charts are drawn against the actual student IDs, in ascending order,
# so a bar in one chart lines up with the same student in the other.
student_ids = sorted(student_cases)

# plot a representative bar graph
# Showing how many referrals she's given out to different students
plt.bar(student_ids, [student_cases[s] for s in student_ids], align='center')
plt.xlabel('Student')
plt.ylabel('Referral Cases from Ms.Crystal')
plt.title("Ms.Crystal referral cases")
plt.show()

# plot a representative bar graph
# Showing how many referrals those same students received from other teachers
plt.bar(student_ids, [referral_cases.get(s, 0) for s in student_ids], align='center')
plt.xlabel('Student')
plt.ylabel('Referral Cases NOT from Ms.Crystal')
plt.title("Non-Ms.Crystal referral cases")
plt.show()

# We now will evaluate some proportions to see if there is any difference in
# the amount of referrals given.
# NOTE(review): the counts below are hard-coded, presumably transcribed from
# the frequency printouts of an earlier run -- re-derive them whenever the
# data or the percentage computation changes.

#Ms.Crystal:
total_referrals = 6
total_sole_students = 1  # amount of 100% frequencies
print("Ms.Crystal:",total_sole_students/total_referrals)

#Ms.Naechia:
total_referrals = 47
total_sole_students = 5  # amount of 100% frequencies
print("Ms.Naechia:",total_sole_students/total_referrals)

#Ms.Tyler:
total_referrals = 70
total_sole_students = 13  # amount of 100% frequencies
print("Ms.Tyler:",total_sole_students/total_referrals)

# The module-level "global teacher_cases" statement was dropped: it has no
# effect at module scope.
proportion_counter = {}  # dictionary to keep all proportions for all teachers

#filling in missing Teacher IDs with 0 to keep things uniform
for teacher_id in range(175):
    teacher_cases.setdefault(teacher_id, 0)

# For every teacher, count the students for whom that teacher is the ONLY one
# who ever gave them a referral (a "100% frequency" student).
#
# This is done in one pass over the data instead of rescanning every row for
# each teacher:
#   * referrals_per_student[s]    -- all referrals student s received
#   * referrals_per_pair[(s, t)]  -- referrals student s received from teacher t
# Teacher t is the sole referrer of s exactly when the two numbers are equal.
# Comparing counts for the same student also avoids pairing up unrelated
# students, and no longer depends on a highest_student_ID left over from an
# earlier cell.
referral_pairs = list(zip(diref_data['StudentID'], diref_data['ReportingStaffID']))
referrals_per_student = Counter(student for student, _ in referral_pairs)
referrals_per_pair = Counter(referral_pairs)

sole_students = Counter(
    teacher
    for (student, teacher), given in referrals_per_pair.items()
    if given == referrals_per_student[student]
)

# Assign the amount of 100% frequencies to each teacher who gave at least one
# referral.  Every teacher present in teacher_cases is covered -- not just
# IDs below 175 -- so the lookup in the next step cannot miss a key.
for this_teacher, given_total in teacher_cases.items():
    if given_total == 0:  # means this is a non-existent teacher ID
        continue
    proportion_counter[this_teacher] = sole_students[this_teacher]

# Now find all the proportions:
# (amount of 100% frequencies) / (total referrals given by that teacher).
# A teacher without sole students gets 0.0.
final_proportions = {
    teacher: proportion_counter[teacher] / given_total
    for teacher, given_total in teacher_cases.items()
    if given_total != 0
}

# let's visually represent this data
# plot a representative bar graph (sorted, so the x axis is only a rank)
plt.bar(range(len(final_proportions)), sorted(final_proportions.values()), align='center')
plt.xlabel('Teacher')
plt.ylabel('Proportion of sole referrals')
plt.title("Proportion of Students Who Only Get Referrals From One Teacher")
plt.show()

# statistics.variance() needs at least two data points.
proportion_values = list(final_proportions.values())
if len(proportion_values) >= 2:
    print("The variance for the proportions is:",statistics.variance(proportion_values))
else:
    print("The variance for the proportions is:", "undefined (fewer than two teachers)")

# Ignoring the outliers, there is clearly little variance and it thus
# illustrates that there isn't much, if any, teacher-student bias present in
# giving referrals.
#
# The above now illuminates this question: why then are some teachers giving
# out more referrals than others, as there doesn't seem to be a strong
# teacher bias going on? If it had to do with certain teachers teaching "bad
# grades" (groups of students who were more bad on average) then Tyler and
# Crystal should be working with the grades which receive the most
# referrals...
# Let's see if this is the case:

highest_referral_number = 0  # top number of referrals per grade
highest_offending_grade = -1  # the grade with the highest amount of referrals

# How many referrals each grade has received (first-seen order is kept).
grade_cases = dict(Counter(diref_data['Grade']))
print(grade_cases)

# let's see which grade receives the most referrals
if grade_cases:
    # max() returns the first maximal key, matching a strict ">" scan.
    highest_offending_grade = max(grade_cases, key=grade_cases.get)
    highest_referral_number = grade_cases[highest_offending_grade]

print("Grade", highest_offending_grade,"has recieved the highest amount of referrals: ",
      highest_referral_number)

# plot a representative bar graph.  Each bar is labelled with its grade so
# the chart can be read even though the bars sit at positions 0..n-1.
grade_positions = range(len(grade_cases))
plt.bar(grade_positions, list(grade_cases.values()), align='center')
plt.xticks(grade_positions, [str(grade) for grade in grade_cases])
plt.xlabel('Grade')
plt.ylabel('Referral Cases')
plt.title("Whats the Worst Disciplined Grade?")
plt.show()

# Let's create dictionaries storing which grades Tyler (127) and Naechia (112)
# work with: number of referrals each of them gave, per grade.
grade_teacher_pairs = list(zip(diref_data['Grade'], diref_data['ReportingStaffID']))

Tyler_cases = dict(Counter(
    grade for grade, teacher in grade_teacher_pairs if teacher == 127
))
Naechia_cases = dict(Counter(
    grade for grade, teacher in grade_teacher_pairs if teacher == 112
))

print("Tyler:",Tyler_cases)
print("Naechia:",Naechia_cases)

# It looks like though Tyler isn't teaching the majority of the
# "referral-prone" grades, Naechia is.
# To illustrate the most common actions that award students referrals, we
# decided that a word cloud would be the best illustration. We converted
# diref_data['Description'] into a text file and then used a word cloud
# algorithm to generate this image.

Image(filename='Action_wordcloud.png')

# As evidenced above, it seems that the most common actions have to do with
# "talking", "playing", "throwing", being "disrespectful", etc. This
# highlights that the actions are often "petty" and therefore ones which are
# easily changed and fixed (such as by a class).
#
# After all this analysis it seems that though we weren't able to prove that
# teacher bias has to do with excessive referral giving, we were able to show
# that in fact it doesn't. Though it's often hard to believe that some
# teachers aren't just plain mean, it does seem that the data shows otherwise.
#
# With all of this said, there are definitely limitations that need to be
# considered. First, we have only used Monarch's data and they could perhaps
# have an exceptional team of teachers who are truly unbiased (being shown in
# the similar proportions above). That being said however, there were
# definitely teachers which stood out in the referral giving and that in
# itself shows that referral giving doesn't seem to be correlated with how
# exceptional/nice/mean a teacher is (to a healthy extent of course) and
# therefore doesn't necessarily need to be correlated with whether we are
# sampling an exceptional team or not. Second, within our methods we had
# always tested a student receiving a referral from either one specific
# teacher or all others. It would be nice to see an analysis as to how many
# other teachers contributed to the rest of the referral giving... Could it
# have been that when comparing two teachers (instead of one) vs all others
# the proportion suddenly jumps?
#
# Either way, we hope that within all of this we illustrate that the teacher
# may deserve more credit than she gets.