Используя данные о школьниках, выявить степень их алкогольной зависимости. В данных, взятых с UCI 'Students', содержится информация о 30 признаках для каждого школьника, включая социальные и гендерные, а также указана материальная обеспеченность и количество свободного времени. Выбрать на свой взгляд наиболее весомые признаки и предсказать степень употребления алкоголя по выходным или будним по шкале от 0 до 5.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import plotly.plotly as py
from plotly.offline import init_notebook_mode,iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
from wordcloud import WordCloud
from subprocess import check_output
import os
student_mat_dataframe = pd.read_csv("/home/vitaly/Downloads/student-mat.csv") #read to csv file
print(student_mat_dataframe.columns)
Index(['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2', 'G3'], dtype='object')
trace1 = go.Box(
y = student_mat_dataframe.Dalc,
name = "Daily alcohol consumption",
marker = dict(color = "red")
)
trace2 = go.Box(
y = student_mat_dataframe.Walc,
name = "Weekly alcohol consumption",
marker = dict(color = "blue")
)
concatTrace = [trace1,trace2]
iplot(concatTrace)
X:Student Age
Y: Daily alcohol consumption
Z:Weekly alcohol consumption*
trace = go.Scatter3d(
x = student_mat_dataframe.age,
y = student_mat_dataframe.Dalc,
z = student_mat_dataframe.Walc,
mode = 'markers',
marker = dict(
size = 10,
color = student_mat_dataframe.age,
colorscale = 'Rainbow'
)
)
dataTrace = [trace]
layoutTrace = go.Layout(
margin = dict(
l = 0,
r = 0,
b = 0,
t = 0 )
)
figTrace = go.Figure(data=dataTrace, layout=layoutTrace)
iplot(figTrace)
#There are two school at the list
school_list = list(student_mat_dataframe.school.unique())
age_mean = []
for i in school_list:
x = student_mat_dataframe[student_mat_dataframe.school == i]
age_sum = sum(x.age)/len(x)
age_mean.append(age_sum)
#sorting
data = pd.DataFrame({'school_list':school_list,'age_mean':age_mean})
new_index = (data['age_mean'].sort_values(ascending=False)).index.values
sorted_data = data.reindex(new_index)
#visualization
plt.figure(figsize=(10,10))
sns.barplot(x=sorted_data['school_list'],y=sorted_data['age_mean'])
plt.xticks(rotation=90)
plt.xlabel("School")
plt.ylabel("Age Mean")
plt.show()
Below is the average timeout graph for schools shown
school_list = list(student_mat_dataframe.school.unique())
free_time = []
for i in school_list:
x = student_mat_dataframe[student_mat_dataframe.school == i]
freeTime = sum(x.freetime)/len(x)
free_time.append(freeTime)
#sorting
data = pd.DataFrame({'school_list':school_list,'free_time':free_time})
new_index = (data['free_time'].sort_values(ascending=False)).index.values
sorted_data2 = data.reindex(new_index)
#visualization
plt.figure(figsize=(10,10))
sns.barplot(x=sorted_data2['school_list'],y=sorted_data2['free_time'])
plt.xticks(rotation=90)
plt.xlabel("School")
plt.ylabel("Free Time")
plt.show()
The sum of the places where children between 15 and 22 live with family members or live alone
ageList = list(student_mat_dataframe.age.unique())
parentStatusT = []
parentStatusA = []
for i in ageList:
x = student_mat_dataframe[student_mat_dataframe.age == i]
parentStatusT.append(sum(x.Pstatus == "T"))
parentStatusA.append(sum(x.Pstatus == "A"))
#sorting
sort_data2 = pd.DataFrame({'student_age':ageList,'student_status':parentStatusA})
new_index4 = (sort_data2['student_status'].sort_values(ascending=False)).index
sorted_data4 = sort_data2.reindex(new_index4)
f,ax = plt.subplots(figsize=(15,10))
sns.barplot(x=ageList,y=parentStatusT,color="red",alpha=0.6,label="Living together")
sns.barplot(x=ageList,y=parentStatusA,color="blue",alpha=0.8,label="Living apart")
ax.legend(loc="upper right",frameon=True)
ax.set(xlabel="Living Together Sum and Living Part Sum ",ylabel="Age",title="Where do children between 15 and 22 live")
plt.show()
print(sorted_data.head())
print(sorted_data2.head())
school_list age_mean 1 MS 18.021739 0 GP 16.521490 school_list free_time 1 MS 3.326087 0 GP 3.223496
#Standardization
sorted_data['age_mean'] = sorted_data['age_mean']/max(sorted_data['age_mean'])
sorted_data2['free_time'] = sorted_data2['free_time']/max(sorted_data2['free_time'])
#Data Concat
data = pd.concat([sorted_data,sorted_data2['free_time']],axis=1)
data.sort_values('age_mean',inplace=True)
#Visualization
f,ax1 = plt.subplots(figsize=(20,10))
sns.pointplot(x='school_list',y='age_mean',data=data,color='lime',alpha=0.6)
sns.pointplot(x='school_list',y='free_time',data=data,color='red',alpha=0.8)
plt.xlabel("School",fontsize=15,color='blue')
plt.ylabel("Values",fontsize=15,color='blue')
plt.title("Age average vs. Time spent outside")
plt.grid()
#Notice That: Red: Free Time, Lime: Age Mean
Showing the professions listed below
fatherJobs_Unique = list(student_mat_dataframe.Fjob.unique())
print(fatherJobs_Unique)
motherJobs_Unique = list(student_mat_dataframe.Mjob.unique())
print(motherJobs_Unique)
['teacher', 'other', 'services', 'health', 'at_home'] ['at_home', 'health', 'other', 'services', 'teacher']
fatherJobs_xAxis = list(student_mat_dataframe.Fjob.unique())
fatherJobs_list = list(student_mat_dataframe.Fjob)
fJobsCounter = Counter(fatherJobs_list)
most_common_jobs = fJobsCounter.most_common(5)
x,y = zip(*most_common_jobs)
x,y = list(x),list(y)
plt.figure(figsize=(15,10))
sns.barplot(x=x,y=y,palette = sns.cubehelix_palette(len(x)))
plt.xlabel("Father Jobs")
plt.ylabel("Frequency")
plt.title("The profession of all fathers in two schools")
plt.show()
############################################################
motherJobs_xAxis = list(student_mat_dataframe.Mjob.unique())
motherJobs_list = list(student_mat_dataframe.Mjob)
mJobsCounter = Counter(motherJobs_list)
most_common_jobs = mJobsCounter.most_common(5)
x,y = zip(*most_common_jobs)
x,y = list(x),list(y)
plt.figure(figsize=(15,10))
sns.barplot(x=x,y=y,palette = sns.cubehelix_palette(len(x)))
plt.xlabel("Mother Jobs")
plt.ylabel("Frequency")
plt.title("The profession of all mothers in two schools")
plt.show()
Students who consume daily alcohol according to the professions of their parents
fatherJobsList = list(student_mat_dataframe.Fjob.unique())
FstudentDalc = []
for each in fatherJobsList:
x = student_mat_dataframe[student_mat_dataframe.Fjob == each]
FstudentDalc.append(sum(x.Dalc))
#sorting
sort_data = pd.DataFrame({'father_jobs':fatherJobsList,'student_dalc':FstudentDalc})
new_index3 = (sort_data['student_dalc'].sort_values(ascending=False)).index
sorted_data3 = sort_data.reindex(new_index3)
#Visualitizon
plt.figure(figsize=(15,15))
sns.barplot(x=FstudentDalc,y=fatherJobsList)
plt.xticks(rotation=90)
plt.xlabel("Weekly Alcohol Consumption")
plt.ylabel("Father Jobs")
plt.show()
It shows whether there is internet available for two different schools depending on the age of the students.
plt.figure(figsize=(15,15))
sns.boxplot(x="school",y="age",hue="internet",data=student_mat_dataframe,palette="PRGn")
plt.show()
Show students who want to go to university
plt.figure(figsize=(15,15))
sns.swarmplot(x="school",y="age",hue="higher",data=student_mat_dataframe)
plt.show()
Weekly and daily alcohol consumption of students aged 15 years is given below
Intermediate values were not marked because alcohol consumption was an int number
In general, more alcohol is consumed over the weekend
Median values are not marked because alcohol consumption is counted as int, so the graph looks mixed
student_mat_dataframe2 = student_mat_dataframe.loc[:,["school","age","Dalc","Walc"]]
trace1 = go.Scatter(
x = student_mat_dataframe2.age,
y = student_mat_dataframe2.Walc,
mode = "lines",
name="Weekly Alcohol Consumption",
marker = dict(color = 'green'),
text = student_mat_dataframe2.school
)
trace2 = go.Scatter(
x = student_mat_dataframe2.age,
y = student_mat_dataframe2.Dalc,
mode = "lines+markers",
name="Daily Alcohol Consumption",
marker = dict(color = 'blue'),
text = student_mat_dataframe2.school
)
newData = [trace1,trace2]
layout = dict(title="Weekly and daily alcohol consumption of students aged 15 - Line Plot",
xaxis=dict(title="Age",ticklen=5,zeroline=False))
fig = dict(data = newData,layout=layout)
iplot(fig)