import numpy as np
import pandas as pd
C:\Users\ivypa\Anaconda3\lib\site-packages\pandas\computation\__init__.py:19: UserWarning: The installed version of numexpr 2.4.4 is not supported in pandas and will be not be used UserWarning)
# Load the raw survey responses and preview the first rows.
data = pd.read_csv('responses.csv')
data.head()
Timestamp | Фамилия Имя Отчество | E-mail адрес | Роль на курсе | Пол | Группа | Дата рождения | GPA (Средний балл) | Родной город | Имели опыт анализа данных до курса? | ... | Сколько параметров имеет нормальное распределение? | Какой статистический критерий примените для проверки различия между двумя связанными выборками, в случае, если нельзя сделать предположение о виде распределения выборки? | Сколько листьев имеет полное двоичное дерево высоты 3? | Характеризуйте тремя предложениями, почему решили пойти на курс? | Какими социальными сетями пользуетесь? | Ссылка на личный вебсайт | Ссылка на профиль в LinkedIn | Ссылка на страницу в Facebook | Согласие на обработку данных | Факультет | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1/11/2016 21:16:50 | Шестаков Андрей Владимирович | shestakoffandrey@gmail.com | Семинарист | мужской | ИАД-3 | 5/17/1991 | 7.00 | г. Бор, Нижегородская область | да | ... | 2.0 | Критерий знаковых рангов Уилкоксона | 8 | For. The. Science! | Facebook, VK.com (ВКонтакте), LinkedIn, Github... | NaN | NaN | NaN | Я заполнил форму полностью и даю согласие на о... | Компьютерных Наук |
1 | 1/11/2016 22:13:51 | Кашницкий Юрий Савельевич | yury.kashnitsky@gmail.com | Семинарист | мужской | аспирант 3 года | 11/1/1990 | 4.53 | Москва | да | ... | 2.0 | Критерий знаковых рангов Уилкоксона | 8 | Хочу получить опыт преподавания не только Pyth... | Facebook, VK.com (ВКонтакте), LinkedIn, Github | http://www.hse.ru/staff/ykashnitsky | https://www.linkedin.com/profile/view?id=19224... | https://www.facebook.com/festline | Я заполнил форму полностью и даю согласие на о... | Факультет Компьютерных Наук |
2 | 1/12/2016 9:28:50 | Захарова Елена Сергеевна | 1583253@gmail.com | Студент | женский | 20 | 11/19/1996 | NaN | Москва | нет | ... | NaN | Не знаю | Не знаю, что такое двоичное дерево | Полезно. Для. Основной специальности. | VK.com (ВКонтакте), Instagram | NaN | NaN | NaN | Я заполнил форму полностью и даю согласие на о... | Гуманитарных наук/Лингвистика |
3 | 1/12/2016 9:30:32 | Михайлин Анатолий Владимирович | mehanat1996@gmail.com | Студент | мужской | 13 | 5/26/1996 | 7.20 | Москва | да | ... | 2.0 | Не знаю | Не знаю, что такое двоичное дерево | Занимаюсь разработкой web-приложений (на языке... | VK.com (ВКонтакте) | vk.com/mehanat | NaN | NaN | Я заполнил форму полностью и даю согласие на о... | ГиМУ |
4 | 1/12/2016 9:33:02 | Мельник Анастасия Александровна | melnik-a-a@mail.ru | Студент | женский | 20 | 8/3/2016 | 9.00 | Москва | нет | ... | 2.0 | Не знаю | 8 | Я учусь на прикладной лингвистике, а в програм... | Facebook, VK.com (ВКонтакте), Instagram | https://vk.com/feed | NaN | NaN | Я заполнил форму полностью и даю согласие на о... | Лингвистики |
5 rows × 27 columns
# Size of the raw table as (rows, columns).
data.shape
(112, 27)
# Full list of column labels in the raw table.
data.columns
Index(['Timestamp', 'Фамилия Имя Отчество', 'E-mail адрес', 'Роль на курсе', 'Пол', 'Группа', 'Дата рождения', 'GPA (Средний балл)', 'Родной город', 'Имели опыт анализа данных до курса?', 'Есть ли у вас научные публикации / доклады?', 'Есть ли у вас публикации в ненаучных изданиях (журналы, тематические блоги и т.д.)? Перечислите издания через запятую.', 'Какой ваш уровень владения английским языком?', 'Как называется изучаемый на курсе предмет по-английски?', 'Какими языками программирования владеете?', 'Сколько будет 2+2?', 'После удержания налога на доходы Мария Константиновна получила 16530 рублей. Сколько рублей составляет заработная плата Марии Константиновны?', 'Сколько параметров имеет нормальное распределение?', 'Какой статистический критерий примените для проверки различия между двумя связанными выборками, в случае, если нельзя сделать предположение о виде распределения выборки?', 'Сколько листьев имеет полное двоичное дерево высоты 3?', 'Характеризуйте тремя предложениями, почему решили пойти на курс?', 'Какими социальными сетями пользуетесь?', 'Ссылка на личный вебсайт', 'Ссылка на профиль в LinkedIn', 'Ссылка на страницу в Facebook', 'Согласие на обработку данных', 'Факультет'], dtype='object')
# Columns that must be filled in for a response to be kept.
required_columns = [
    'E-mail адрес',
    'Пол',
    'Дата рождения',
    'GPA (Средний балл)',
    'Есть ли у вас научные публикации / доклады?',
    'Какой ваш уровень владения английским языком?',
    'Как называется изучаемый на курсе предмет по-английски?',
    'Сколько параметров имеет нормальное распределение?',
    'Какой статистический критерий примените для проверки различия между двумя связанными выборками, в случае, если нельзя сделать предположение о виде распределения выборки?',
]

# Start again from the raw file, remove the row labelled 104, then discard
# every response with a missing value in any of the required columns.
# NOTE(review): the reason row 104 is excluded is not visible here - confirm.
raw_responses = pd.read_csv('responses.csv')
data = raw_responses.drop(104).dropna(subset=required_columns)
data.shape
(76, 27)
# Start the feature table with the e-mail domain: the text segment that
# follows the first "@" of each address.
email_domains = [address.split('@')[1] for address in data['E-mail адрес']]
new_data = pd.DataFrame({'email': email_domains})
new_data.head()
email | |
---|---|
0 | gmail.com |
1 | gmail.com |
2 | gmail.com |
3 | mail.ru |
4 | edu.hse.ru |
# Copy the course role and gender answers across unchanged.  The raw value
# arrays are passed so the values are placed by position in the new table.
role_values = data["Роль на курсе"].values
gender_values = data["Пол"].values
new_data.insert(loc=1, column='course_role', value=role_values)
new_data.insert(loc=2, column='gender', value=gender_values)
new_data.head()
email | course_role | gender | |
---|---|---|---|
0 | gmail.com | Семинарист | мужской |
1 | gmail.com | Семинарист | мужской |
2 | gmail.com | Студент | мужской |
3 | mail.ru | Студент | женский |
4 | edu.hse.ru | Студент | женский |
# Keep only the third "/"-separated field of each birth date (the year in
# the month/day/year strings shown in the raw table).
birth_years = [birth_date.split('/')[2] for birth_date in data['Дата рождения']]
new_data.insert(loc=3, column='birth_date', value=birth_years)
new_data.head()
email | course_role | gender | birth_date | |
---|---|---|---|---|
0 | gmail.com | Семинарист | мужской | 1991 |
1 | gmail.com | Семинарист | мужской | 1990 |
2 | gmail.com | Студент | мужской | 1996 |
3 | mail.ru | Студент | женский | 2016 |
4 | edu.hse.ru | Студент | женский | 1996 |
# Bucket the GPA into four string codes:
#   '0': below 2.5   '1': [2.5, 5.0)   '2': [5.0, 7.5)   '3': 7.5 and above
gpa = data['GPA (Средний балл)'].values
tmp_data = np.empty(gpa.shape, dtype='object')
tmp_data[gpa < 2.5] = '0'
tmp_data[(gpa >= 2.5) & (gpa < 5.0)] = '1'
tmp_data[(gpa >= 5.0) & (gpa < 7.5)] = '2'
tmp_data[gpa >= 7.5] = '3'
tmp_data
array(['2', '1', '2', '3', '3', '3', '2', '3', '3', '3', '3', '3', '3', '3', '3', '3', '2', '3', '3', '1', '3', '3', '3', '2', '3', '3', '2', '2', '1', '2', '3', '2', '2', '2', '2', '3', '3', '3', '3', '3', '2', '2', '2', '2', '2', '2', '2', '2', '2', '3', '2', '2', '3', '2', '3', '3', '3', '3', '3', '3', '3', '2', '3', '3', '3', '3', '2', '3', '3', '3', '3', '3', '2', '3', '2', '3'], dtype=object)
# Add the GPA bucket codes as the next feature column.
new_data.insert(loc=4, column='gpa', value=tmp_data)
new_data.head()
email | course_role | gender | birth_date | gpa | |
---|---|---|---|---|---|
0 | gmail.com | Семинарист | мужской | 1991 | 2 |
1 | gmail.com | Семинарист | мужской | 1990 | 1 |
2 | gmail.com | Студент | мужской | 1996 | 2 |
3 | mail.ru | Студент | женский | 2016 | 3 |
4 | edu.hse.ru | Студент | женский | 1996 | 3 |
# Distinct answers to the publications / talks question.
set(data['Есть ли у вас научные публикации / доклады?'])
{'доклады на топовых конференциях по некоторой тематике', 'есть какие-то', 'нет'}
# Collapse the publication answers to three labels, keyed on the first
# letter of the answer: 'д' -> 'many', 'е' -> 'some', anything else -> 'none'.
paper_label_by_initial = {'д': 'many', 'е': 'some'}
paper_labels = [paper_label_by_initial.get(answer[0], 'none')
                for answer in data['Есть ли у вас научные публикации / доклады?']]
new_data.insert(loc=5, column='papers', value=paper_labels)
new_data.head()
email | course_role | gender | birth_date | gpa | papers | |
---|---|---|---|---|---|---|
0 | gmail.com | Семинарист | мужской | 1991 | 2 | some |
1 | gmail.com | Семинарист | мужской | 1990 | 1 | many |
2 | gmail.com | Студент | мужской | 1996 | 2 | none |
3 | mail.ru | Студент | женский | 2016 | 3 | none |
4 | edu.hse.ru | Студент | женский | 1996 | 3 | none |
# Distinct answers to the English proficiency question.
set(data['Какой ваш уровень владения английским языком?'])
{'Advanced', 'Beginner', 'Full proficiency', 'Intermediate', 'Upper-intermediate'}
# The English level is used as-is: the answers are already a small fixed set.
english_levels = data['Какой ваш уровень владения английским языком?'].values
new_data.insert(loc=6, column='english_level', value=english_levels)
new_data.head()
email | course_role | gender | birth_date | gpa | papers | english_level | |
---|---|---|---|---|---|---|---|
0 | gmail.com | Семинарист | мужской | 1991 | 2 | some | Upper-intermediate |
1 | gmail.com | Семинарист | мужской | 1990 | 1 | many | Upper-intermediate |
2 | gmail.com | Студент | мужской | 1996 | 2 | none | Intermediate |
3 | mail.ru | Студент | женский | 2016 | 3 | none | Upper-intermediate |
4 | edu.hse.ru | Студент | женский | 1996 | 3 | none | Upper-intermediate |
# Distinct answers to the "English name of the course subject" question.
set(data['Как называется изучаемый на курсе предмет по-английски?'])
{'Data Analysis', 'Data Mining', 'Data Science', 'Machine Learning'}
# The course-name answer is copied across unchanged.
course_names = data['Как называется изучаемый на курсе предмет по-английски?'].values
new_data.insert(loc=7, column='course_name', value=course_names)
new_data.head()
email | course_role | gender | birth_date | gpa | papers | english_level | course_name | |
---|---|---|---|---|---|---|---|---|
0 | gmail.com | Семинарист | мужской | 1991 | 2 | some | Upper-intermediate | Data Analysis |
1 | gmail.com | Семинарист | мужской | 1990 | 1 | many | Upper-intermediate | Data Analysis |
2 | gmail.com | Студент | мужской | 1996 | 2 | none | Intermediate | Data Analysis |
3 | mail.ru | Студент | женский | 2016 | 3 | none | Upper-intermediate | Data Mining |
4 | edu.hse.ru | Студент | женский | 1996 | 3 | none | Upper-intermediate | Data Mining |
# Distinct answers to the "how many parameters does the normal distribution have" question.
set(data['Сколько параметров имеет нормальное распределение?'])
{1.0, 2.0, 3.0, 42.0}
# Store the numeric answers as strings so they are later one-hot encoded
# as categories instead of being kept as a single numeric column.
normal_param_labels = [str(answer) for answer in data['Сколько параметров имеет нормальное распределение?']]
new_data.insert(loc=8, column='normal_params', value=normal_param_labels)
new_data.head()
email | course_role | gender | birth_date | gpa | papers | english_level | course_name | normal_params | |
---|---|---|---|---|---|---|---|---|---|
0 | gmail.com | Семинарист | мужской | 1991 | 2 | some | Upper-intermediate | Data Analysis | 2.0 |
1 | gmail.com | Семинарист | мужской | 1990 | 1 | many | Upper-intermediate | Data Analysis | 2.0 |
2 | gmail.com | Студент | мужской | 1996 | 2 | none | Intermediate | Data Analysis | 2.0 |
3 | mail.ru | Студент | женский | 2016 | 3 | none | Upper-intermediate | Data Mining | 2.0 |
4 | edu.hse.ru | Студент | женский | 1996 | 3 | none | Upper-intermediate | Data Mining | 2.0 |
# Distinct answers to the statistical-test question.
set(data['Какой статистический критерий примените для проверки различия между двумя связанными выборками, в случае, если нельзя сделать предположение о виде распределения выборки?'])
{'Критерий Мана-Уитни', 'Критерий Стьюдента', 'Критерий Стьюдента для связанных выборок', 'Критерий знаковых рангов Уилкоксона', 'Не знаю'}
# The chosen statistical test is copied across unchanged.
stat_crit_answers = data['Какой статистический критерий примените для проверки различия между двумя связанными выборками, в случае, если нельзя сделать предположение о виде распределения выборки?'].values
new_data.insert(loc=9, column='stat_crit', value=stat_crit_answers)
new_data.head()
email | course_role | gender | birth_date | gpa | papers | english_level | course_name | normal_params | stat_crit | |
---|---|---|---|---|---|---|---|---|---|---|
0 | gmail.com | Семинарист | мужской | 1991 | 2 | some | Upper-intermediate | Data Analysis | 2.0 | Критерий знаковых рангов Уилкоксона |
1 | gmail.com | Семинарист | мужской | 1990 | 1 | many | Upper-intermediate | Data Analysis | 2.0 | Критерий знаковых рангов Уилкоксона |
2 | gmail.com | Студент | мужской | 1996 | 2 | none | Intermediate | Data Analysis | 2.0 | Не знаю |
3 | mail.ru | Студент | женский | 2016 | 3 | none | Upper-intermediate | Data Mining | 2.0 | Не знаю |
4 | edu.hse.ru | Студент | женский | 1996 | 3 | none | Upper-intermediate | Data Mining | 2.0 | Критерий Стьюдента для связанных выборок |
# ...
# One-hot encode the categorical columns: each distinct value becomes its own
# "<column>_<value>" indicator column.
new_data = pd.get_dummies(new_data)
new_data.head()
email_bk.ru | email_edu.hse.ru | email_gmail.com | email_inbox.ru | email_mail.ru | email_outlook.com | email_yande.ru | email_yandex.ru | course_role_Лектор | course_role_Семинарист | ... | course_name_Machine Learning | normal_params_1.0 | normal_params_2.0 | normal_params_3.0 | normal_params_42.0 | stat_crit_Критерий Мана-Уитни | stat_crit_Критерий Стьюдента | stat_crit_Критерий Стьюдента для связанных выборок | stat_crit_Критерий знаковых рангов Уилкоксона | stat_crit_Не знаю | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | ... | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
1 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | ... | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
2 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
3 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
4 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
5 rows × 45 columns
# Size of the encoded table as (rows, indicator columns).
new_data.shape
(76, 45)
# conda install orange3
# pip install orange3-associate
# https://orange3-associate.readthedocs.io/en/latest/
# conda create --name py27 python=2.7 anaconda (if Python 3 is installed as the default)
# activate py27
# conda install orange
# http://docs.orange.biolab.si/2/reference/rst/Orange.associate.html
# deactivate (when you are done)
from orangecontrib.associate.fpgrowth import frequent_itemsets, association_rules
# Mine the itemsets that occur in at least half of the rows and keep them as
# a {frozenset of column indices: support count} mapping.
min_support = 0.5
found_itemsets = frequent_itemsets(new_data.values, min_support)
itemsets = dict(found_itemsets)
itemsets
{frozenset({10, 12}): 38, frozenset({10, 23, 25}): 39, frozenset({37}): 63, frozenset({25}): 65, frozenset({25, 37}): 53, frozenset({17, 25}): 46, frozenset({10}): 73, frozenset({10, 25}): 65, frozenset({10, 23}): 45, frozenset({10, 25, 37}): 53, frozenset({23}): 45, frozenset({23, 37}): 40, frozenset({33}): 41, frozenset({10, 33}): 40, frozenset({17}): 53, frozenset({17, 37}): 44, frozenset({23, 25}): 39, frozenset({10, 17}): 53, frozenset({10, 17, 25}): 46, frozenset({17, 25, 37}): 38, frozenset({10, 37}): 60, frozenset({10, 17, 37}): 44, frozenset({10, 17, 25, 37}): 38, frozenset({10, 23, 37}): 40, frozenset({31}): 40, frozenset({12}): 41, frozenset({10, 31}): 38}
# Print every encoded column next to its positional index; the mined
# itemsets refer to columns by these integer positions.
# (The loop body is indented again - it had been flattened to column 0.)
for i, item in enumerate(new_data.columns):
    print(i, item)
0 email_bk.ru 1 email_edu.hse.ru 2 email_gmail.com 3 email_inbox.ru 4 email_mail.ru 5 email_outlook.com 6 email_yande.ru 7 email_yandex.ru 8 course_role_Лектор 9 course_role_Семинарист 10 course_role_Студент 11 gender_женский 12 gender_мужской 13 birth_date_1990 14 birth_date_1991 15 birth_date_1994 16 birth_date_1995 17 birth_date_1996 18 birth_date_1997 19 birth_date_2016 20 birth_date_2041 21 gpa_1 22 gpa_2 23 gpa_3 24 papers_many 25 papers_none 26 papers_some 27 english_level_Advanced 28 english_level_Beginner 29 english_level_Full proficiency 30 english_level_Intermediate 31 english_level_Upper-intermediate 32 course_name_Data Analysis 33 course_name_Data Mining 34 course_name_Data Science 35 course_name_Machine Learning 36 normal_params_1.0 37 normal_params_2.0 38 normal_params_3.0 39 normal_params_42.0 40 stat_crit_Критерий Мана-Уитни 41 stat_crit_Критерий Стьюдента 42 stat_crit_Критерий Стьюдента для связанных выборок 43 stat_crit_Критерий знаковых рангов Уилкоксона 44 stat_crit_Не знаю
# Rules with confidence of at least 0.8, most confident first.
# Element 3 of each rule tuple is its confidence.
sorted(association_rules(itemsets, 0.8), key=lambda rule: rule[3], reverse=True)
[(frozenset({17, 25, 37}), frozenset({10}), 38, 1.0), (frozenset({23, 25}), frozenset({10}), 39, 1.0), (frozenset({25, 37}), frozenset({10}), 53, 1.0), (frozenset({17, 25}), frozenset({10}), 46, 1.0), (frozenset({23, 37}), frozenset({10}), 40, 1.0), (frozenset({17, 37}), frozenset({10}), 44, 1.0), (frozenset({25}), frozenset({10}), 65, 1.0), (frozenset({23}), frozenset({10}), 45, 1.0), (frozenset({17}), frozenset({10}), 53, 1.0), (frozenset({33}), frozenset({10}), 40, 0.975609756097561), (frozenset({37}), frozenset({10}), 60, 0.9523809523809523), (frozenset({31}), frozenset({10}), 38, 0.95), (frozenset({12}), frozenset({10}), 38, 0.926829268292683), (frozenset({10}), frozenset({25}), 65, 0.8904109589041096), (frozenset({10, 23}), frozenset({37}), 40, 0.8888888888888888), (frozenset({23}), frozenset({10, 37}), 40, 0.8888888888888888), (frozenset({23}), frozenset({37}), 40, 0.8888888888888888), (frozenset({10, 37}), frozenset({25}), 53, 0.8833333333333333), (frozenset({10, 17}), frozenset({25}), 46, 0.8679245283018868), (frozenset({17}), frozenset({10, 25}), 46, 0.8679245283018868), (frozenset({17}), frozenset({25}), 46, 0.8679245283018868), (frozenset({10, 23}), frozenset({25}), 39, 0.8666666666666667), (frozenset({23}), frozenset({10, 25}), 39, 0.8666666666666667), (frozenset({23}), frozenset({25}), 39, 0.8666666666666667), (frozenset({10, 17, 37}), frozenset({25}), 38, 0.8636363636363636), (frozenset({17, 37}), frozenset({10, 25}), 38, 0.8636363636363636), (frozenset({17, 37}), frozenset({25}), 38, 0.8636363636363636), (frozenset({37}), frozenset({10, 25}), 53, 0.8412698412698413), (frozenset({37}), frozenset({25}), 53, 0.8412698412698413), (frozenset({10, 17}), frozenset({37}), 44, 0.8301886792452831), (frozenset({17}), frozenset({10, 37}), 44, 0.8301886792452831), (frozenset({17}), frozenset({37}), 44, 0.8301886792452831), (frozenset({10, 17, 25}), frozenset({37}), 38, 0.8260869565217391), (frozenset({17, 25}), frozenset({10, 37}), 38, 0.8260869565217391), 
(frozenset({17, 25}), frozenset({37}), 38, 0.8260869565217391), (frozenset({10}), frozenset({37}), 60, 0.821917808219178), (frozenset({10, 25}), frozenset({37}), 53, 0.8153846153846154), (frozenset({25}), frozenset({10, 37}), 53, 0.8153846153846154), (frozenset({25}), frozenset({37}), 53, 0.8153846153846154)]
# Repeat the mining with a lower support threshold (0.3) to obtain more,
# rarer itemsets; the result replaces the previous mapping.
min_support = 0.3
found_itemsets = frequent_itemsets(new_data.values, min_support)
itemsets = dict(found_itemsets)
itemsets
{frozenset({10, 17, 31}): 30, frozenset({10, 17, 23, 25}): 28, frozenset({10, 17, 23, 37}): 30, frozenset({10, 17, 44}): 24, frozenset({10, 23, 33, 37}): 24, frozenset({11, 23, 37}): 24, frozenset({10, 23}): 45, frozenset({22}): 28, frozenset({17, 37}): 44, frozenset({17, 23, 37}): 30, frozenset({10, 44}): 31, frozenset({2, 10, 37}): 23, frozenset({17, 23, 25}): 28, frozenset({23, 31}): 25, frozenset({23, 31, 37}): 23, frozenset({25, 37}): 53, frozenset({23, 25, 37}): 35, frozenset({10, 23, 25, 37}): 35, frozenset({33}): 41, frozenset({17, 33}): 26, frozenset({10, 17, 25, 31}): 24, frozenset({25, 37, 44}): 23, frozenset({2, 25}): 25, frozenset({12}): 41, frozenset({12, 25, 37}): 25, frozenset({10, 12, 25, 37}): 25, frozenset({12, 17, 25}): 24, frozenset({10, 17, 37}): 44, frozenset({10, 25, 33, 37}): 31, frozenset({12, 17, 37}): 23, frozenset({11}): 35, frozenset({11, 37}): 31, frozenset({10, 23, 31, 37}): 23, frozenset({10, 42}): 24, frozenset({11, 25}): 32, frozenset({10, 12}): 38, frozenset({25, 31, 37}): 28, frozenset({10, 31, 37}): 34, frozenset({17}): 53, frozenset({10, 17, 25}): 46, frozenset({10, 17, 23, 25, 37}): 25, frozenset({32}): 24, frozenset({10, 33}): 40, frozenset({10, 22}): 26, frozenset({10, 11}): 35, frozenset({10, 25, 31, 37}): 28, frozenset({10, 23, 33}): 25, frozenset({10, 23, 25, 33}): 23, frozenset({23, 33, 37}): 24, frozenset({17, 25, 33}): 23, frozenset({12, 17}): 28, frozenset({10, 33, 37}): 34, frozenset({11, 25, 37}): 28, frozenset({22, 25}): 24, frozenset({23, 25, 33}): 23, frozenset({31}): 40, frozenset({17, 25, 31}): 24, frozenset({10, 31}): 38, frozenset({10, 23, 31}): 25, frozenset({10, 12, 17}): 28, frozenset({25}): 65, frozenset({10, 23, 25}): 39, frozenset({25, 44}): 28, frozenset({10, 11, 25}): 32, frozenset({17, 25}): 46, frozenset({10, 22, 25}): 24, frozenset({10, 12, 17, 37}): 23, frozenset({12, 25}): 33, frozenset({10, 25, 37}): 53, frozenset({11, 17}): 25, frozenset({10, 25, 37, 44}): 23, frozenset({17, 44}): 24, 
frozenset({10}): 73, frozenset({10, 17}): 53, frozenset({10, 12, 17, 25}): 24, frozenset({37}): 63, frozenset({10, 37}): 60, frozenset({33, 37}): 35, frozenset({10, 17, 33, 37}): 23, frozenset({42}): 24, frozenset({23, 25}): 39, frozenset({44}): 31, frozenset({10, 37, 44}): 26, frozenset({17, 23}): 34, frozenset({10, 17, 23}): 34, frozenset({17, 31}): 30, frozenset({10, 11, 17}): 25, frozenset({17, 25, 37}): 38, frozenset({23, 37}): 40, frozenset({10, 11, 23, 37}): 24, frozenset({2}): 31, frozenset({2, 10}): 28, frozenset({10, 25}): 65, frozenset({10, 25, 44}): 28, frozenset({25, 33}): 37, frozenset({10, 25, 33}): 37, frozenset({10, 11, 25, 37}): 28, frozenset({12, 37}): 32, frozenset({10, 17, 25, 33}): 23, frozenset({2, 37}): 26, frozenset({10, 12, 25}): 33, frozenset({23, 33}): 25, frozenset({11, 23}): 25, frozenset({25, 33, 37}): 31, frozenset({10, 17, 25, 37}): 38, frozenset({17, 31, 37}): 27, frozenset({10, 17, 33}): 26, frozenset({10, 12, 37}): 29, frozenset({2, 10, 25}): 25, frozenset({23}): 45, frozenset({17, 23, 25, 37}): 25, frozenset({10, 11, 37}): 31, frozenset({10, 23, 37}): 40, frozenset({10, 25, 31}): 31, frozenset({37, 44}): 26, frozenset({25, 31}): 31, frozenset({17, 33, 37}): 23, frozenset({10, 11, 23}): 25, frozenset({31, 37}): 36, frozenset({10, 17, 31, 37}): 27}
# Print every encoded column next to its positional index; the mined
# itemsets refer to columns by these integer positions.
# (The loop body is indented again - it had been flattened to column 0.)
for i, item in enumerate(new_data.columns):
    print(i, item)
0 email_bk.ru 1 email_edu.hse.ru 2 email_gmail.com 3 email_inbox.ru 4 email_mail.ru 5 email_outlook.com 6 email_yande.ru 7 email_yandex.ru 8 course_role_Лектор 9 course_role_Семинарист 10 course_role_Студент 11 gender_женский 12 gender_мужской 13 birth_date_1990 14 birth_date_1991 15 birth_date_1994 16 birth_date_1995 17 birth_date_1996 18 birth_date_1997 19 birth_date_2016 20 birth_date_2041 21 gpa_1 22 gpa_2 23 gpa_3 24 papers_many 25 papers_none 26 papers_some 27 english_level_Advanced 28 english_level_Beginner 29 english_level_Full proficiency 30 english_level_Intermediate 31 english_level_Upper-intermediate 32 course_name_Data Analysis 33 course_name_Data Mining 34 course_name_Data Science 35 course_name_Machine Learning 36 normal_params_1.0 37 normal_params_2.0 38 normal_params_3.0 39 normal_params_42.0 40 stat_crit_Критерий Мана-Уитни 41 stat_crit_Критерий Стьюдента 42 stat_crit_Критерий Стьюдента для связанных выборок 43 stat_crit_Критерий знаковых рангов Уилкоксона 44 stat_crit_Не знаю
# Report every rule with confidence of at least 0.9, most confident first,
# translating column indices back to column names.
# Each rule is a tuple: (antecedent, consequent, support count, confidence).
confident_rules = sorted(association_rules(itemsets, 0.9),
                         key=lambda r: r[3], reverse=True)
for rule in confident_rules:
    antecedent = [new_data.columns[ftr] for ftr in rule[0]]
    consequent = [new_data.columns[ftr] for ftr in rule[1]]
    print("{} --> {} with supp = {}, conf = {}".format(antecedent, consequent,
                                                       rule[2], rule[3]))
['birth_date_1996', 'normal_params_2.0', 'papers_none', 'gpa_3'] --> ['course_role_Студент'] with supp = 25, conf = 1.0 ['birth_date_1996', 'papers_none', 'gpa_3'] --> ['course_role_Студент'] with supp = 28, conf = 1.0 ['birth_date_1996', 'normal_params_2.0', 'gpa_3'] --> ['course_role_Студент'] with supp = 30, conf = 1.0 ['course_name_Data Mining', 'normal_params_2.0', 'gpa_3'] --> ['course_role_Студент'] with supp = 24, conf = 1.0 ['papers_none', 'normal_params_2.0', 'gpa_3'] --> ['course_role_Студент'] with supp = 35, conf = 1.0 ['birth_date_1996', 'papers_none', 'english_level_Upper-intermediate'] --> ['course_role_Студент'] with supp = 24, conf = 1.0 ['papers_none', 'gender_мужской', 'normal_params_2.0'] --> ['course_role_Студент'] with supp = 25, conf = 1.0 ['papers_none', 'normal_params_2.0', 'course_name_Data Mining'] --> ['course_role_Студент'] with supp = 31, conf = 1.0 ['gpa_3', 'normal_params_2.0', 'english_level_Upper-intermediate'] --> ['course_role_Студент'] with supp = 23, conf = 1.0 ['papers_none', 'normal_params_2.0', 'english_level_Upper-intermediate'] --> ['course_role_Студент'] with supp = 28, conf = 1.0 ['papers_none', 'gpa_3', 'course_name_Data Mining'] --> ['course_role_Студент'] with supp = 23, conf = 1.0 ['birth_date_1996', 'gender_мужской', 'normal_params_2.0'] --> ['course_role_Студент'] with supp = 23, conf = 1.0 ['papers_none', 'stat_crit_Не знаю', 'normal_params_2.0'] --> ['course_role_Студент'] with supp = 23, conf = 1.0 ['birth_date_1996', 'gender_мужской', 'papers_none'] --> ['course_role_Студент'] with supp = 24, conf = 1.0 ['birth_date_1996', 'normal_params_2.0', 'course_name_Data Mining'] --> ['course_role_Студент'] with supp = 23, conf = 1.0 ['gender_женский', 'normal_params_2.0', 'gpa_3'] --> ['course_role_Студент'] with supp = 24, conf = 1.0 ['papers_none', 'gender_женский', 'normal_params_2.0'] --> ['course_role_Студент'] with supp = 28, conf = 1.0 ['birth_date_1996', 'papers_none', 'course_name_Data Mining'] --> 
['course_role_Студент'] with supp = 23, conf = 1.0 ['birth_date_1996', 'normal_params_2.0', 'papers_none'] --> ['course_role_Студент'] with supp = 38, conf = 1.0 ['birth_date_1996', 'normal_params_2.0', 'english_level_Upper-intermediate'] --> ['course_role_Студент'] with supp = 27, conf = 1.0 ['birth_date_1996', 'english_level_Upper-intermediate'] --> ['course_role_Студент'] with supp = 30, conf = 1.0 ['birth_date_1996', 'stat_crit_Не знаю'] --> ['course_role_Студент'] with supp = 24, conf = 1.0 ['birth_date_1996', 'normal_params_2.0'] --> ['course_role_Студент'] with supp = 44, conf = 1.0 ['course_name_Data Mining', 'gpa_3'] --> ['course_role_Студент'] with supp = 25, conf = 1.0 ['birth_date_1996', 'papers_none'] --> ['course_role_Студент'] with supp = 46, conf = 1.0 ['gpa_3', 'english_level_Upper-intermediate'] --> ['course_role_Студент'] with supp = 25, conf = 1.0 ['birth_date_1996', 'gender_мужской'] --> ['course_role_Студент'] with supp = 28, conf = 1.0 ['papers_none', 'gpa_3'] --> ['course_role_Студент'] with supp = 39, conf = 1.0 ['papers_none', 'gender_женский'] --> ['course_role_Студент'] with supp = 32, conf = 1.0 ['papers_none', 'gpa_2'] --> ['course_role_Студент'] with supp = 24, conf = 1.0 ['papers_none', 'normal_params_2.0'] --> ['course_role_Студент'] with supp = 53, conf = 1.0 ['stat_crit_Не знаю', 'normal_params_2.0'] --> ['course_role_Студент'] with supp = 26, conf = 1.0 ['birth_date_1996', 'gpa_3'] --> ['course_role_Студент'] with supp = 34, conf = 1.0 ['birth_date_1996', 'gender_женский'] --> ['course_role_Студент'] with supp = 25, conf = 1.0 ['papers_none', 'stat_crit_Не знаю'] --> ['course_role_Студент'] with supp = 28, conf = 1.0 ['papers_none', 'course_name_Data Mining'] --> ['course_role_Студент'] with supp = 37, conf = 1.0 ['papers_none', 'gender_мужской'] --> ['course_role_Студент'] with supp = 33, conf = 1.0 ['birth_date_1996', 'course_name_Data Mining'] --> ['course_role_Студент'] with supp = 26, conf = 1.0 ['papers_none', 
'email_gmail.com'] --> ['course_role_Студент'] with supp = 25, conf = 1.0 ['gender_женский', 'normal_params_2.0'] --> ['course_role_Студент'] with supp = 31, conf = 1.0 ['normal_params_2.0', 'gpa_3'] --> ['course_role_Студент'] with supp = 40, conf = 1.0 ['papers_none', 'english_level_Upper-intermediate'] --> ['course_role_Студент'] with supp = 31, conf = 1.0 ['gender_женский', 'gpa_3'] --> ['course_role_Студент'] with supp = 25, conf = 1.0 ['gpa_3'] --> ['course_role_Студент'] with supp = 45, conf = 1.0 ['stat_crit_Не знаю'] --> ['course_role_Студент'] with supp = 31, conf = 1.0 ['stat_crit_Критерий Стьюдента для связанных выборок'] --> ['course_role_Студент'] with supp = 24, conf = 1.0 ['gender_женский'] --> ['course_role_Студент'] with supp = 35, conf = 1.0 ['birth_date_1996'] --> ['course_role_Студент'] with supp = 53, conf = 1.0 ['papers_none'] --> ['course_role_Студент'] with supp = 65, conf = 1.0 ['course_name_Data Mining'] --> ['course_role_Студент'] with supp = 40, conf = 0.975609756097561 ['course_name_Data Mining', 'normal_params_2.0'] --> ['course_role_Студент'] with supp = 34, conf = 0.9714285714285714 ['course_name_Data Mining', 'course_role_Студент', 'gpa_3'] --> ['normal_params_2.0'] with supp = 24, conf = 0.96 ['course_name_Data Mining', 'gpa_3'] --> ['course_role_Студент', 'normal_params_2.0'] with supp = 24, conf = 0.96 ['course_role_Студент', 'gender_женский', 'gpa_3'] --> ['normal_params_2.0'] with supp = 24, conf = 0.96 ['gender_женский', 'gpa_3'] --> ['course_role_Студент', 'normal_params_2.0'] with supp = 24, conf = 0.96 ['gender_женский', 'gpa_3'] --> ['normal_params_2.0'] with supp = 24, conf = 0.96 ['course_name_Data Mining', 'gpa_3'] --> ['normal_params_2.0'] with supp = 24, conf = 0.96 ['normal_params_2.0'] --> ['course_role_Студент'] with supp = 60, conf = 0.9523809523809523 ['english_level_Upper-intermediate'] --> ['course_role_Студент'] with supp = 38, conf = 0.95 ['normal_params_2.0', 'english_level_Upper-intermediate'] --> 
['course_role_Студент'] with supp = 34, conf = 0.9444444444444444 ['gpa_2'] --> ['course_role_Студент'] with supp = 26, conf = 0.9285714285714286 ['gender_мужской'] --> ['course_role_Студент'] with supp = 38, conf = 0.926829268292683 ['course_name_Data Mining', 'course_role_Студент'] --> ['papers_none'] with supp = 37, conf = 0.925 ['course_role_Студент', 'gpa_2'] --> ['papers_none'] with supp = 24, conf = 0.9230769230769231 ['course_role_Студент', 'gpa_3', 'english_level_Upper-intermediate'] --> ['normal_params_2.0'] with supp = 23, conf = 0.92 ['english_level_Upper-intermediate', 'gpa_3'] --> ['course_role_Студент', 'normal_params_2.0'] with supp = 23, conf = 0.92 ['course_name_Data Mining', 'course_role_Студент', 'gpa_3'] --> ['papers_none'] with supp = 23, conf = 0.92 ['course_name_Data Mining', 'gpa_3'] --> ['papers_none', 'course_role_Студент'] with supp = 23, conf = 0.92 ['gpa_3', 'english_level_Upper-intermediate'] --> ['normal_params_2.0'] with supp = 23, conf = 0.92 ['course_name_Data Mining', 'gpa_3'] --> ['papers_none'] with supp = 23, conf = 0.92 ['course_role_Студент', 'gender_женский'] --> ['papers_none'] with supp = 32, conf = 0.9142857142857143 ['gender_женский'] --> ['papers_none', 'course_role_Студент'] with supp = 32, conf = 0.9142857142857143 ['gender_женский'] --> ['papers_none'] with supp = 32, conf = 0.9142857142857143 ['course_name_Data Mining', 'course_role_Студент', 'normal_params_2.0'] --> ['papers_none'] with supp = 31, conf = 0.9117647058823529 ['gender_мужской', 'normal_params_2.0'] --> ['course_role_Студент'] with supp = 29, conf = 0.90625 ['papers_none', 'course_role_Студент', 'english_level_Upper-intermediate'] --> ['normal_params_2.0'] with supp = 28, conf = 0.9032258064516129 ['papers_none', 'english_level_Upper-intermediate'] --> ['course_role_Студент', 'normal_params_2.0'] with supp = 28, conf = 0.9032258064516129 ['course_role_Студент', 'gender_женский', 'normal_params_2.0'] --> ['papers_none'] with supp = 28, conf = 
0.9032258064516129 ['gender_женский', 'normal_params_2.0'] --> ['papers_none', 'course_role_Студент'] with supp = 28, conf = 0.9032258064516129 ['papers_none', 'english_level_Upper-intermediate'] --> ['normal_params_2.0'] with supp = 28, conf = 0.9032258064516129 ['gender_женский', 'normal_params_2.0'] --> ['papers_none'] with supp = 28, conf = 0.9032258064516129 ['course_role_Студент', 'stat_crit_Не знаю'] --> ['papers_none'] with supp = 28, conf = 0.9032258064516129 ['stat_crit_Не знаю'] --> ['papers_none', 'course_role_Студент'] with supp = 28, conf = 0.9032258064516129 ['stat_crit_Не знаю'] --> ['papers_none'] with supp = 28, conf = 0.9032258064516129 ['email_gmail.com'] --> ['course_role_Студент'] with supp = 28, conf = 0.9032258064516129 ['course_name_Data Mining'] --> ['papers_none', 'course_role_Студент'] with supp = 37, conf = 0.9024390243902439 ['course_name_Data Mining'] --> ['papers_none'] with supp = 37, conf = 0.9024390243902439 ['birth_date_1996', 'course_role_Студент', 'english_level_Upper-intermediate'] --> ['normal_params_2.0'] with supp = 27, conf = 0.9 ['birth_date_1996', 'english_level_Upper-intermediate'] --> ['course_role_Студент', 'normal_params_2.0'] with supp = 27, conf = 0.9 ['birth_date_1996', 'english_level_Upper-intermediate'] --> ['normal_params_2.0'] with supp = 27, conf = 0.9 ['english_level_Upper-intermediate'] --> ['normal_params_2.0'] with supp = 36, conf = 0.9
# Same report as for all rules (confidence >= 0.9, most confident first),
# restricted to rules whose antecedent is a single item.
# Each rule is a tuple: (antecedent, consequent, support count, confidence).
confident_rules = sorted(association_rules(itemsets, 0.9),
                         key=lambda r: r[3], reverse=True)
for rule in confident_rules:
    # Skip rules with more than one item on the left-hand side.
    if len(rule[0]) > 1:
        continue
    antecedent = [new_data.columns[ftr] for ftr in rule[0]]
    consequent = [new_data.columns[ftr] for ftr in rule[1]]
    print("{} --> {} with supp = {}, conf = {}".format(antecedent, consequent,
                                                       rule[2], rule[3]))
['gpa_3'] --> ['course_role_Студент'] with supp = 45, conf = 1.0 ['stat_crit_Не знаю'] --> ['course_role_Студент'] with supp = 31, conf = 1.0 ['stat_crit_Критерий Стьюдента для связанных выборок'] --> ['course_role_Студент'] with supp = 24, conf = 1.0 ['gender_женский'] --> ['course_role_Студент'] with supp = 35, conf = 1.0 ['birth_date_1996'] --> ['course_role_Студент'] with supp = 53, conf = 1.0 ['papers_none'] --> ['course_role_Студент'] with supp = 65, conf = 1.0 ['course_name_Data Mining'] --> ['course_role_Студент'] with supp = 40, conf = 0.975609756097561 ['normal_params_2.0'] --> ['course_role_Студент'] with supp = 60, conf = 0.9523809523809523 ['english_level_Upper-intermediate'] --> ['course_role_Студент'] with supp = 38, conf = 0.95 ['gpa_2'] --> ['course_role_Студент'] with supp = 26, conf = 0.9285714285714286 ['gender_мужской'] --> ['course_role_Студент'] with supp = 38, conf = 0.926829268292683 ['gender_женский'] --> ['papers_none', 'course_role_Студент'] with supp = 32, conf = 0.9142857142857143 ['gender_женский'] --> ['papers_none'] with supp = 32, conf = 0.9142857142857143 ['stat_crit_Не знаю'] --> ['papers_none', 'course_role_Студент'] with supp = 28, conf = 0.9032258064516129 ['stat_crit_Не знаю'] --> ['papers_none'] with supp = 28, conf = 0.9032258064516129 ['email_gmail.com'] --> ['course_role_Студент'] with supp = 28, conf = 0.9032258064516129 ['course_name_Data Mining'] --> ['papers_none', 'course_role_Студент'] with supp = 37, conf = 0.9024390243902439 ['course_name_Data Mining'] --> ['papers_none'] with supp = 37, conf = 0.9024390243902439 ['english_level_Upper-intermediate'] --> ['normal_params_2.0'] with supp = 36, conf = 0.9
# List the rules whose left-hand side holds exactly two items, highest
# confidence first.  0.9 is the second argument to association_rules
# (presumably the minimum-confidence threshold -- confirm against its
# definition).  rule[0] / rule[1] are printed as the two sides of the rule,
# rule[2] as its support and rule[3] as its confidence.
ranked_rules = sorted(association_rules(itemsets, 0.9),
                      key=lambda r: r[3], reverse=True)
for rule in ranked_rules:
    if len(rule[0]) == 2:
        # Translate column indices back into readable feature names.
        antecedent_names = [new_data.columns[idx] for idx in rule[0]]
        consequent_names = [new_data.columns[idx] for idx in rule[1]]
        print("{} --> {} with supp = {}, conf = {}".format(
            antecedent_names, consequent_names, rule[2], rule[3]))
['birth_date_1996', 'english_level_Upper-intermediate'] --> ['course_role_Студент'] with supp = 30, conf = 1.0 ['birth_date_1996', 'stat_crit_Не знаю'] --> ['course_role_Студент'] with supp = 24, conf = 1.0 ['birth_date_1996', 'normal_params_2.0'] --> ['course_role_Студент'] with supp = 44, conf = 1.0 ['course_name_Data Mining', 'gpa_3'] --> ['course_role_Студент'] with supp = 25, conf = 1.0 ['birth_date_1996', 'papers_none'] --> ['course_role_Студент'] with supp = 46, conf = 1.0 ['gpa_3', 'english_level_Upper-intermediate'] --> ['course_role_Студент'] with supp = 25, conf = 1.0 ['birth_date_1996', 'gender_мужской'] --> ['course_role_Студент'] with supp = 28, conf = 1.0 ['papers_none', 'gpa_3'] --> ['course_role_Студент'] with supp = 39, conf = 1.0 ['papers_none', 'gender_женский'] --> ['course_role_Студент'] with supp = 32, conf = 1.0 ['papers_none', 'gpa_2'] --> ['course_role_Студент'] with supp = 24, conf = 1.0 ['papers_none', 'normal_params_2.0'] --> ['course_role_Студент'] with supp = 53, conf = 1.0 ['stat_crit_Не знаю', 'normal_params_2.0'] --> ['course_role_Студент'] with supp = 26, conf = 1.0 ['birth_date_1996', 'gpa_3'] --> ['course_role_Студент'] with supp = 34, conf = 1.0 ['birth_date_1996', 'gender_женский'] --> ['course_role_Студент'] with supp = 25, conf = 1.0 ['papers_none', 'stat_crit_Не знаю'] --> ['course_role_Студент'] with supp = 28, conf = 1.0 ['papers_none', 'course_name_Data Mining'] --> ['course_role_Студент'] with supp = 37, conf = 1.0 ['papers_none', 'gender_мужской'] --> ['course_role_Студент'] with supp = 33, conf = 1.0 ['birth_date_1996', 'course_name_Data Mining'] --> ['course_role_Студент'] with supp = 26, conf = 1.0 ['papers_none', 'email_gmail.com'] --> ['course_role_Студент'] with supp = 25, conf = 1.0 ['gender_женский', 'normal_params_2.0'] --> ['course_role_Студент'] with supp = 31, conf = 1.0 ['normal_params_2.0', 'gpa_3'] --> ['course_role_Студент'] with supp = 40, conf = 1.0 ['papers_none', 'english_level_Upper-intermediate'] 
--> ['course_role_Студент'] with supp = 31, conf = 1.0 ['gender_женский', 'gpa_3'] --> ['course_role_Студент'] with supp = 25, conf = 1.0 ['course_name_Data Mining', 'normal_params_2.0'] --> ['course_role_Студент'] with supp = 34, conf = 0.9714285714285714 ['course_name_Data Mining', 'gpa_3'] --> ['course_role_Студент', 'normal_params_2.0'] with supp = 24, conf = 0.96 ['gender_женский', 'gpa_3'] --> ['course_role_Студент', 'normal_params_2.0'] with supp = 24, conf = 0.96 ['gender_женский', 'gpa_3'] --> ['normal_params_2.0'] with supp = 24, conf = 0.96 ['course_name_Data Mining', 'gpa_3'] --> ['normal_params_2.0'] with supp = 24, conf = 0.96 ['normal_params_2.0', 'english_level_Upper-intermediate'] --> ['course_role_Студент'] with supp = 34, conf = 0.9444444444444444 ['course_name_Data Mining', 'course_role_Студент'] --> ['papers_none'] with supp = 37, conf = 0.925 ['course_role_Студент', 'gpa_2'] --> ['papers_none'] with supp = 24, conf = 0.9230769230769231 ['english_level_Upper-intermediate', 'gpa_3'] --> ['course_role_Студент', 'normal_params_2.0'] with supp = 23, conf = 0.92 ['course_name_Data Mining', 'gpa_3'] --> ['papers_none', 'course_role_Студент'] with supp = 23, conf = 0.92 ['gpa_3', 'english_level_Upper-intermediate'] --> ['normal_params_2.0'] with supp = 23, conf = 0.92 ['course_name_Data Mining', 'gpa_3'] --> ['papers_none'] with supp = 23, conf = 0.92 ['course_role_Студент', 'gender_женский'] --> ['papers_none'] with supp = 32, conf = 0.9142857142857143 ['gender_мужской', 'normal_params_2.0'] --> ['course_role_Студент'] with supp = 29, conf = 0.90625 ['papers_none', 'english_level_Upper-intermediate'] --> ['course_role_Студент', 'normal_params_2.0'] with supp = 28, conf = 0.9032258064516129 ['gender_женский', 'normal_params_2.0'] --> ['papers_none', 'course_role_Студент'] with supp = 28, conf = 0.9032258064516129 ['papers_none', 'english_level_Upper-intermediate'] --> ['normal_params_2.0'] with supp = 28, conf = 0.9032258064516129 ['gender_женский', 
'normal_params_2.0'] --> ['papers_none'] with supp = 28, conf = 0.9032258064516129 ['course_role_Студент', 'stat_crit_Не знаю'] --> ['papers_none'] with supp = 28, conf = 0.9032258064516129 ['birth_date_1996', 'english_level_Upper-intermediate'] --> ['course_role_Студент', 'normal_params_2.0'] with supp = 27, conf = 0.9 ['birth_date_1996', 'english_level_Upper-intermediate'] --> ['normal_params_2.0'] with supp = 27, conf = 0.9