In [1]:
import numpy as np
import pandas as pd
C:\Users\ivypa\Anaconda3\lib\site-packages\pandas\computation\__init__.py:19: UserWarning: The installed version of numexpr 2.4.4 is not supported in pandas and will be not be used

  UserWarning)
In [2]:
# Load the raw survey responses from the CSV file next to the notebook.
data = pd.read_csv('responses.csv')
In [3]:
# Preview the first rows of the raw table.
# NOTE(review): the rendered preview includes the respondents' full-name and e-mail columns;
# consider clearing or redacting this output before the notebook is shared.
data.head()
Out[3]:
Timestamp Фамилия Имя Отчество E-mail адрес Роль на курсе Пол Группа Дата рождения GPA (Средний балл) Родной город Имели опыт анализа данных до курса? ... Сколько параметров имеет нормальное распределение? Какой статистический критерий примените для проверки различия между двумя связанными выборками, в случае, если нельзя сделать предположение о виде распределения выборки? Сколько листьев имеет полное двоичное дерево высоты 3? Характеризуйте тремя предложениями, почему решили пойти на курс? Какими социальными сетями пользуетесь? Ссылка на личный вебсайт Ссылка на профиль в LinkedIn Ссылка на страницу в Facebook Согласие на обработку данных Факультет
0 1/11/2016 21:16:50 Шестаков Андрей Владимирович [email protected] Семинарист мужской ИАД-3 5/17/1991 7.00 г. Бор, Нижегородская область да ... 2.0 Критерий знаковых рангов Уилкоксона 8 For. The. Science! Facebook, VK.com (ВКонтакте), LinkedIn, Github... NaN NaN NaN Я заполнил форму полностью и даю согласие на о... Компьютерных Наук
1 1/11/2016 22:13:51 Кашницкий Юрий Савельевич [email protected] Семинарист мужской аспирант 3 года 11/1/1990 4.53 Москва да ... 2.0 Критерий знаковых рангов Уилкоксона 8 Хочу получить опыт преподавания не только Pyth... Facebook, VK.com (ВКонтакте), LinkedIn, Github http://www.hse.ru/staff/ykashnitsky https://www.linkedin.com/profile/view?id=19224... https://www.facebook.com/festline Я заполнил форму полностью и даю согласие на о... Факультет Компьютерных Наук
2 1/12/2016 9:28:50 Захарова Елена Сергеевна [email protected] Студент женский 20 11/19/1996 NaN Москва нет ... NaN Не знаю Не знаю, что такое двоичное дерево Полезно. Для. Основной специальности. VK.com (ВКонтакте), Instagram NaN NaN NaN Я заполнил форму полностью и даю согласие на о... Гуманитарных наук/Лингвистика
3 1/12/2016 9:30:32 Михайлин Анатолий Владимирович [email protected] Студент мужской 13 5/26/1996 7.20 Москва да ... 2.0 Не знаю Не знаю, что такое двоичное дерево Занимаюсь разработкой web-приложений (на языке... VK.com (ВКонтакте) vk.com/mehanat NaN NaN Я заполнил форму полностью и даю согласие на о... ГиМУ
4 1/12/2016 9:33:02 Мельник Анастасия Александровна [email protected] Студент женский 20 8/3/2016 9.00 Москва нет ... 2.0 Не знаю 8 Я учусь на прикладной лингвистике, а в програм... Facebook, VK.com (ВКонтакте), Instagram https://vk.com/feed NaN NaN Я заполнил форму полностью и даю согласие на о... Лингвистики

5 rows × 27 columns

In [4]:
# Size of the raw table as (rows, columns).
data.shape
Out[4]:
(112, 27)
In [5]:
# List every column header; the headers are the survey's question texts.
data.columns
Out[5]:
Index(['Timestamp', 'Фамилия Имя Отчество', 'E-mail адрес', 'Роль на курсе',
       'Пол', 'Группа', 'Дата рождения', 'GPA (Средний балл)', 'Родной город',
       'Имели опыт анализа данных до курса?',
       'Есть ли у вас научные публикации / доклады?',
       'Есть ли у вас публикации в ненаучных изданиях (журналы, тематические блоги и т.д.)? Перечислите издания через запятую.',
       'Какой ваш уровень владения английским языком?',
       'Как называется изучаемый на курсе предмет по-английски?',
       'Какими языками программирования владеете?', 'Сколько будет 2+2?',
       'После удержания налога на доходы Мария Константиновна получила 16530 рублей. Сколько рублей составляет заработная плата Марии Константиновны?',
       'Сколько параметров имеет нормальное распределение?',
       'Какой статистический критерий примените для проверки различия между двумя связанными выборками, в случае, если нельзя сделать предположение о виде распределения выборки?',
       'Сколько листьев имеет полное двоичное дерево высоты 3?',
       'Характеризуйте тремя предложениями, почему решили пойти на курс?',
       'Какими социальными сетями пользуетесь?', 'Ссылка на личный вебсайт',
       'Ссылка на профиль в LinkedIn', 'Ссылка на страницу в Facebook',
       'Согласие на обработку данных', 'Факультет'],
      dtype='object')
In [6]:
# Rebuild `data` from the CSV on every run, then keep only the rows that answered
# all of the questions used as features below.
# NOTE(review): the row labelled 104 is dropped without explanation - presumably a
# duplicate or test submission; confirm and document the reason.
data = (
    pd.read_csv('responses.csv')
    .drop(104)
    .dropna(subset=[
        'E-mail адрес',
        'Пол',
        'Дата рождения',
        'GPA (Средний балл)',
        'Есть ли у вас научные публикации / доклады?',
        'Какой ваш уровень владения английским языком?',
        'Как называется изучаемый на курсе предмет по-английски?',
        'Сколько параметров имеет нормальное распределение?',
        'Какой статистический критерий примените для проверки различия между двумя связанными выборками, в случае, если нельзя сделать предположение о виде распределения выборки?',
    ])
)
data.shape
Out[6]:
(76, 27)
In [7]:
# Start from an empty frame; the following cells add one encoded feature column at a time.
new_data = pd.DataFrame()
In [8]:
# Keep only the mail provider: the part of each address right after the first '@'.
# `.tolist()` discards the index of `data`, so the values are stored by position.
new_data['email'] = data['E-mail адрес'].map(lambda address: address.split('@')[1]).tolist()
new_data.head()
Out[8]:
email
0 gmail.com
1 gmail.com
2 gmail.com
3 mail.ru
4 edu.hse.ru
In [9]:
# Copy the course-role and gender answers over unchanged.
# Plain column assignment is used instead of DataFrame.insert: on a top-to-bottom run the
# columns land in the same positions (1 and 2), but the cell can also be re-run without
# raising "cannot insert course_role, already exists".
# `.values` drops the index of `data` (its labels have gaps after the row drops above),
# so the answers line up with `new_data` by position.
new_data['course_role'] = data["Роль на курсе"].values
new_data['gender'] = data["Пол"].values
new_data.head()
Out[9]:
email course_role gender
0 gmail.com Семинарист мужской
1 gmail.com Семинарист мужской
2 gmail.com Студент мужской
3 mail.ru Студент женский
4 edu.hse.ru Студент женский
In [10]:
# Keep only the year of birth; the dates are written as month/day/year.
# Assignment (rather than DataFrame.insert) keeps the column in position 3 on a
# top-to-bottom run and makes the cell safe to re-run.
# NOTE(review): the column is still named 'birth_date' although it holds just the year,
# and implausible years present in the answers are passed through unchanged.
new_data['birth_date'] = [birth_date.split('/')[2] for birth_date in data['Дата рождения']]
new_data.head()
Out[10]:
email course_role gender birth_date
0 gmail.com Семинарист мужской 1991
1 gmail.com Семинарист мужской 1990
2 gmail.com Студент мужской 1996
3 mail.ru Студент женский 2016
4 edu.hse.ru Студент женский 1996
In [11]:
# Bucket the GPA into four bands, stored as string labels:
# '0' = [0, 2.5), '1' = [2.5, 5), '2' = [5, 7.5), '3' = [7.5, 10].
gpa_values = data['GPA (Средний балл)'].values
tmp_data = np.empty(gpa_values.shape, 'object')
tmp_data[gpa_values < 2.5] = '0'
tmp_data[(gpa_values >= 2.5) & (gpa_values < 5.0)] = '1'
tmp_data[(gpa_values >= 5.0) & (gpa_values < 7.5)] = '2'
tmp_data[gpa_values >= 7.5] = '3'
tmp_data
Out[11]:
array(['2', '1', '2', '3', '3', '3', '2', '3', '3', '3', '3', '3', '3',
       '3', '3', '3', '2', '3', '3', '1', '3', '3', '3', '2', '3', '3',
       '2', '2', '1', '2', '3', '2', '2', '2', '2', '3', '3', '3', '3',
       '3', '2', '2', '2', '2', '2', '2', '2', '2', '2', '3', '2', '2',
       '3', '2', '3', '3', '3', '3', '3', '3', '3', '2', '3', '3', '3',
       '3', '2', '3', '3', '3', '3', '3', '2', '3', '2', '3'], dtype=object)
In [12]:
# Store the GPA band labels computed in the previous cell.
# Assignment (rather than DataFrame.insert) keeps the column in position 4 on a
# top-to-bottom run and lets the cell be re-run without an "already exists" error.
new_data['gpa'] = tmp_data
new_data.head()
Out[12]:
email course_role gender birth_date gpa
0 gmail.com Семинарист мужской 1991 2
1 gmail.com Семинарист мужской 1990 1
2 gmail.com Студент мужской 1996 2
3 mail.ru Студент женский 2016 3
4 edu.hse.ru Студент женский 1996 3
In [13]:
# Inspect the distinct answers to the publications question before encoding them.
set(data['Есть ли у вас научные публикации / доклады?'])
Out[13]:
{'доклады на топовых конференциях по некоторой тематике',
 'есть какие-то',
 'нет'}
In [14]:
# Encode the publications answer by its first letter:
# 'д...' (talks at top conferences) -> 'many', 'е...' (has some) -> 'some', anything else -> 'none'.
# Assignment (rather than DataFrame.insert) keeps the column in position 5 on a
# top-to-bottom run and lets the cell be re-run without an "already exists" error.
papers_by_initial = {'д': 'many', 'е': 'some'}
new_data['papers'] = [papers_by_initial.get(answer[0], 'none')
                      for answer in data['Есть ли у вас научные публикации / доклады?']]
new_data.head()
Out[14]:
email course_role gender birth_date gpa papers
0 gmail.com Семинарист мужской 1991 2 some
1 gmail.com Семинарист мужской 1990 1 many
2 gmail.com Студент мужской 1996 2 none
3 mail.ru Студент женский 2016 3 none
4 edu.hse.ru Студент женский 1996 3 none
In [15]:
# Inspect the distinct self-reported English levels.
set(data['Какой ваш уровень владения английским языком?'])
Out[15]:
{'Advanced',
 'Beginner',
 'Full proficiency',
 'Intermediate',
 'Upper-intermediate'}
In [16]:
# Copy the English level over unchanged; the answers are already a small set of categories.
# Assignment (rather than DataFrame.insert) keeps the column in position 6 on a
# top-to-bottom run and lets the cell be re-run without an "already exists" error.
new_data['english_level'] = data['Какой ваш уровень владения английским языком?'].values
new_data.head()
Out[16]:
email course_role gender birth_date gpa papers english_level
0 gmail.com Семинарист мужской 1991 2 some Upper-intermediate
1 gmail.com Семинарист мужской 1990 1 many Upper-intermediate
2 gmail.com Студент мужской 1996 2 none Intermediate
3 mail.ru Студент женский 2016 3 none Upper-intermediate
4 edu.hse.ru Студент женский 1996 3 none Upper-intermediate
In [17]:
# Inspect the distinct answers to "what is the course subject called in English?".
set(data['Как называется изучаемый на курсе предмет по-английски?'])
Out[17]:
{'Data Analysis', 'Data Mining', 'Data Science', 'Machine Learning'}
In [18]:
# Copy the course-name answer over unchanged.
# Assignment (rather than DataFrame.insert) keeps the column in position 7 on a
# top-to-bottom run and lets the cell be re-run without an "already exists" error.
new_data['course_name'] = data['Как называется изучаемый на курсе предмет по-английски?'].values
new_data.head()
Out[18]:
email course_role gender birth_date gpa papers english_level course_name
0 gmail.com Семинарист мужской 1991 2 some Upper-intermediate Data Analysis
1 gmail.com Семинарист мужской 1990 1 many Upper-intermediate Data Analysis
2 gmail.com Студент мужской 1996 2 none Intermediate Data Analysis
3 mail.ru Студент женский 2016 3 none Upper-intermediate Data Mining
4 edu.hse.ru Студент женский 1996 3 none Upper-intermediate Data Mining
In [19]:
# Inspect the distinct answers to "how many parameters does the normal distribution have?".
set(data['Сколько параметров имеет нормальное распределение?'])
Out[19]:
{1.0, 2.0, 3.0, 42.0}
In [20]:
# Store the numeric answers as strings ('2.0', '42.0', ...) so that the later one-hot
# encoding treats them as categories rather than as a numeric column.
# Assignment (rather than DataFrame.insert) keeps the column in position 8 on a
# top-to-bottom run and lets the cell be re-run without an "already exists" error.
new_data['normal_params'] = [str(answer)
                             for answer in data['Сколько параметров имеет нормальное распределение?']]
new_data.head()
Out[20]:
email course_role gender birth_date gpa papers english_level course_name normal_params
0 gmail.com Семинарист мужской 1991 2 some Upper-intermediate Data Analysis 2.0
1 gmail.com Семинарист мужской 1990 1 many Upper-intermediate Data Analysis 2.0
2 gmail.com Студент мужской 1996 2 none Intermediate Data Analysis 2.0
3 mail.ru Студент женский 2016 3 none Upper-intermediate Data Mining 2.0
4 edu.hse.ru Студент женский 1996 3 none Upper-intermediate Data Mining 2.0
In [21]:
# Inspect the distinct answers to the statistical-test question.
set(data['Какой статистический критерий примените для проверки различия между двумя связанными выборками, в случае, если нельзя сделать предположение о виде распределения выборки?'])
Out[21]:
{'Критерий Мана-Уитни',
 'Критерий Стьюдента',
 'Критерий Стьюдента для связанных выборок',
 'Критерий знаковых рангов Уилкоксона',
 'Не знаю'}
In [22]:
# Copy the chosen statistical test over unchanged.
# Assignment (rather than DataFrame.insert) keeps the column in position 9 on a
# top-to-bottom run and lets the cell be re-run without an "already exists" error.
new_data['stat_crit'] = data['Какой статистический критерий примените для проверки различия между двумя связанными выборками, в случае, если нельзя сделать предположение о виде распределения выборки?'].values
new_data.head()
Out[22]:
email course_role gender birth_date gpa papers english_level course_name normal_params stat_crit
0 gmail.com Семинарист мужской 1991 2 some Upper-intermediate Data Analysis 2.0 Критерий знаковых рангов Уилкоксона
1 gmail.com Семинарист мужской 1990 1 many Upper-intermediate Data Analysis 2.0 Критерий знаковых рангов Уилкоксона
2 gmail.com Студент мужской 1996 2 none Intermediate Data Analysis 2.0 Не знаю
3 mail.ru Студент женский 2016 3 none Upper-intermediate Data Mining 2.0 Не знаю
4 edu.hse.ru Студент женский 1996 3 none Upper-intermediate Data Mining 2.0 Критерий Стьюдента для связанных выборок
In [23]:
# ...
In [24]:
# One-hot encode the categorical columns. From here on `new_data` is overwritten and holds
# one indicator column per (feature, value) pair, e.g. 'gender_женский'.
new_data = pd.get_dummies(new_data)
new_data.head()
Out[24]:
email_bk.ru email_edu.hse.ru email_gmail.com email_inbox.ru email_mail.ru email_outlook.com email_yande.ru email_yandex.ru course_role_Лектор course_role_Семинарист ... course_name_Machine Learning normal_params_1.0 normal_params_2.0 normal_params_3.0 normal_params_42.0 stat_crit_Критерий Мана-Уитни stat_crit_Критерий Стьюдента stat_crit_Критерий Стьюдента для связанных выборок stat_crit_Критерий знаковых рангов Уилкоксона stat_crit_Не знаю
0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 ... 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
1 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 ... 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
2 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
3 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
4 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0

5 rows × 45 columns

In [25]:
# Shape of the table after one-hot encoding.
new_data.shape
Out[25]:
(76, 45)
In [26]:
# conda install orange3
# pip install orange3-associate
# https://orange3-associate.readthedocs.io/en/latest/

# conda create --name py27 python=2.7 anaconda (if Python 3 is the default interpreter)
# activate py27
# conda install orange
# http://docs.orange.biolab.si/2/reference/rst/Orange.associate.html
# deactivate (when finished)
In [27]:
from orangecontrib.associate.fpgrowth import frequent_itemsets, association_rules
In [28]:
# Mine frequent itemsets over the one-hot matrix and collect them into a dict.
# The second argument is the minimum support; 0.5 presumably means "present in at least
# half of the rows" - confirm against the orange3-associate documentation.
itemsets = dict(frequent_itemsets(new_data.values, 0.5))
In [29]:
# Show the mined itemsets. Each key is a frozenset of column positions in `new_data`;
# each value appears to be the number of rows that contain the whole itemset.
itemsets
Out[29]:
{frozenset({10, 12}): 38,
 frozenset({10, 23, 25}): 39,
 frozenset({37}): 63,
 frozenset({25}): 65,
 frozenset({25, 37}): 53,
 frozenset({17, 25}): 46,
 frozenset({10}): 73,
 frozenset({10, 25}): 65,
 frozenset({10, 23}): 45,
 frozenset({10, 25, 37}): 53,
 frozenset({23}): 45,
 frozenset({23, 37}): 40,
 frozenset({33}): 41,
 frozenset({10, 33}): 40,
 frozenset({17}): 53,
 frozenset({17, 37}): 44,
 frozenset({23, 25}): 39,
 frozenset({10, 17}): 53,
 frozenset({10, 17, 25}): 46,
 frozenset({17, 25, 37}): 38,
 frozenset({10, 37}): 60,
 frozenset({10, 17, 37}): 44,
 frozenset({10, 17, 25, 37}): 38,
 frozenset({10, 23, 37}): 40,
 frozenset({31}): 40,
 frozenset({12}): 41,
 frozenset({10, 31}): 38}
In [30]:
# Legend for the itemsets: print each column position next to its one-hot column name.
for position, column_name in enumerate(new_data.columns):
    print(position, column_name)
0 email_bk.ru
1 email_edu.hse.ru
2 email_gmail.com
3 email_inbox.ru
4 email_mail.ru
5 email_outlook.com
6 email_yande.ru
7 email_yandex.ru
8 course_role_Лектор
9 course_role_Семинарист
10 course_role_Студент
11 gender_женский
12 gender_мужской
13 birth_date_1990
14 birth_date_1991
15 birth_date_1994
16 birth_date_1995
17 birth_date_1996
18 birth_date_1997
19 birth_date_2016
20 birth_date_2041
21 gpa_1
22 gpa_2
23 gpa_3
24 papers_many
25 papers_none
26 papers_some
27 english_level_Advanced
28 english_level_Beginner
29 english_level_Full proficiency
30 english_level_Intermediate
31 english_level_Upper-intermediate
32 course_name_Data Analysis
33 course_name_Data Mining
34 course_name_Data Science
35 course_name_Machine Learning
36 normal_params_1.0
37 normal_params_2.0
38 normal_params_3.0
39 normal_params_42.0
40 stat_crit_Критерий Мана-Уитни
41 stat_crit_Критерий Стьюдента
42 stat_crit_Критерий Стьюдента для связанных выборок
43 stat_crit_Критерий знаковых рангов Уилкоксона
44 stat_crit_Не знаю
In [31]:
# Derive rules with confidence of at least 0.8 and list them from most to least confident.
# Each rule is a tuple whose item 3 is the confidence; the sort is stable, so rules with
# equal confidence keep the order in which they were generated.
sorted(association_rules(itemsets, 0.8), key=lambda rule: rule[3], reverse=True)
Out[31]:
[(frozenset({17, 25, 37}), frozenset({10}), 38, 1.0),
 (frozenset({23, 25}), frozenset({10}), 39, 1.0),
 (frozenset({25, 37}), frozenset({10}), 53, 1.0),
 (frozenset({17, 25}), frozenset({10}), 46, 1.0),
 (frozenset({23, 37}), frozenset({10}), 40, 1.0),
 (frozenset({17, 37}), frozenset({10}), 44, 1.0),
 (frozenset({25}), frozenset({10}), 65, 1.0),
 (frozenset({23}), frozenset({10}), 45, 1.0),
 (frozenset({17}), frozenset({10}), 53, 1.0),
 (frozenset({33}), frozenset({10}), 40, 0.975609756097561),
 (frozenset({37}), frozenset({10}), 60, 0.9523809523809523),
 (frozenset({31}), frozenset({10}), 38, 0.95),
 (frozenset({12}), frozenset({10}), 38, 0.926829268292683),
 (frozenset({10}), frozenset({25}), 65, 0.8904109589041096),
 (frozenset({10, 23}), frozenset({37}), 40, 0.8888888888888888),
 (frozenset({23}), frozenset({10, 37}), 40, 0.8888888888888888),
 (frozenset({23}), frozenset({37}), 40, 0.8888888888888888),
 (frozenset({10, 37}), frozenset({25}), 53, 0.8833333333333333),
 (frozenset({10, 17}), frozenset({25}), 46, 0.8679245283018868),
 (frozenset({17}), frozenset({10, 25}), 46, 0.8679245283018868),
 (frozenset({17}), frozenset({25}), 46, 0.8679245283018868),
 (frozenset({10, 23}), frozenset({25}), 39, 0.8666666666666667),
 (frozenset({23}), frozenset({10, 25}), 39, 0.8666666666666667),
 (frozenset({23}), frozenset({25}), 39, 0.8666666666666667),
 (frozenset({10, 17, 37}), frozenset({25}), 38, 0.8636363636363636),
 (frozenset({17, 37}), frozenset({10, 25}), 38, 0.8636363636363636),
 (frozenset({17, 37}), frozenset({25}), 38, 0.8636363636363636),
 (frozenset({37}), frozenset({10, 25}), 53, 0.8412698412698413),
 (frozenset({37}), frozenset({25}), 53, 0.8412698412698413),
 (frozenset({10, 17}), frozenset({37}), 44, 0.8301886792452831),
 (frozenset({17}), frozenset({10, 37}), 44, 0.8301886792452831),
 (frozenset({17}), frozenset({37}), 44, 0.8301886792452831),
 (frozenset({10, 17, 25}), frozenset({37}), 38, 0.8260869565217391),
 (frozenset({17, 25}), frozenset({10, 37}), 38, 0.8260869565217391),
 (frozenset({17, 25}), frozenset({37}), 38, 0.8260869565217391),
 (frozenset({10}), frozenset({37}), 60, 0.821917808219178),
 (frozenset({10, 25}), frozenset({37}), 53, 0.8153846153846154),
 (frozenset({25}), frozenset({10, 37}), 53, 0.8153846153846154),
 (frozenset({25}), frozenset({37}), 53, 0.8153846153846154)]
In [32]:
# Mine again with a lower minimum support (0.3 instead of 0.5) to surface less common
# itemsets. This overwrites `itemsets`, so the rule cells below work on the larger set.
itemsets = dict(frequent_itemsets(new_data.values, 0.3))
itemsets
Out[32]:
{frozenset({10, 17, 31}): 30,
 frozenset({10, 17, 23, 25}): 28,
 frozenset({10, 17, 23, 37}): 30,
 frozenset({10, 17, 44}): 24,
 frozenset({10, 23, 33, 37}): 24,
 frozenset({11, 23, 37}): 24,
 frozenset({10, 23}): 45,
 frozenset({22}): 28,
 frozenset({17, 37}): 44,
 frozenset({17, 23, 37}): 30,
 frozenset({10, 44}): 31,
 frozenset({2, 10, 37}): 23,
 frozenset({17, 23, 25}): 28,
 frozenset({23, 31}): 25,
 frozenset({23, 31, 37}): 23,
 frozenset({25, 37}): 53,
 frozenset({23, 25, 37}): 35,
 frozenset({10, 23, 25, 37}): 35,
 frozenset({33}): 41,
 frozenset({17, 33}): 26,
 frozenset({10, 17, 25, 31}): 24,
 frozenset({25, 37, 44}): 23,
 frozenset({2, 25}): 25,
 frozenset({12}): 41,
 frozenset({12, 25, 37}): 25,
 frozenset({10, 12, 25, 37}): 25,
 frozenset({12, 17, 25}): 24,
 frozenset({10, 17, 37}): 44,
 frozenset({10, 25, 33, 37}): 31,
 frozenset({12, 17, 37}): 23,
 frozenset({11}): 35,
 frozenset({11, 37}): 31,
 frozenset({10, 23, 31, 37}): 23,
 frozenset({10, 42}): 24,
 frozenset({11, 25}): 32,
 frozenset({10, 12}): 38,
 frozenset({25, 31, 37}): 28,
 frozenset({10, 31, 37}): 34,
 frozenset({17}): 53,
 frozenset({10, 17, 25}): 46,
 frozenset({10, 17, 23, 25, 37}): 25,
 frozenset({32}): 24,
 frozenset({10, 33}): 40,
 frozenset({10, 22}): 26,
 frozenset({10, 11}): 35,
 frozenset({10, 25, 31, 37}): 28,
 frozenset({10, 23, 33}): 25,
 frozenset({10, 23, 25, 33}): 23,
 frozenset({23, 33, 37}): 24,
 frozenset({17, 25, 33}): 23,
 frozenset({12, 17}): 28,
 frozenset({10, 33, 37}): 34,
 frozenset({11, 25, 37}): 28,
 frozenset({22, 25}): 24,
 frozenset({23, 25, 33}): 23,
 frozenset({31}): 40,
 frozenset({17, 25, 31}): 24,
 frozenset({10, 31}): 38,
 frozenset({10, 23, 31}): 25,
 frozenset({10, 12, 17}): 28,
 frozenset({25}): 65,
 frozenset({10, 23, 25}): 39,
 frozenset({25, 44}): 28,
 frozenset({10, 11, 25}): 32,
 frozenset({17, 25}): 46,
 frozenset({10, 22, 25}): 24,
 frozenset({10, 12, 17, 37}): 23,
 frozenset({12, 25}): 33,
 frozenset({10, 25, 37}): 53,
 frozenset({11, 17}): 25,
 frozenset({10, 25, 37, 44}): 23,
 frozenset({17, 44}): 24,
 frozenset({10}): 73,
 frozenset({10, 17}): 53,
 frozenset({10, 12, 17, 25}): 24,
 frozenset({37}): 63,
 frozenset({10, 37}): 60,
 frozenset({33, 37}): 35,
 frozenset({10, 17, 33, 37}): 23,
 frozenset({42}): 24,
 frozenset({23, 25}): 39,
 frozenset({44}): 31,
 frozenset({10, 37, 44}): 26,
 frozenset({17, 23}): 34,
 frozenset({10, 17, 23}): 34,
 frozenset({17, 31}): 30,
 frozenset({10, 11, 17}): 25,
 frozenset({17, 25, 37}): 38,
 frozenset({23, 37}): 40,
 frozenset({10, 11, 23, 37}): 24,
 frozenset({2}): 31,
 frozenset({2, 10}): 28,
 frozenset({10, 25}): 65,
 frozenset({10, 25, 44}): 28,
 frozenset({25, 33}): 37,
 frozenset({10, 25, 33}): 37,
 frozenset({10, 11, 25, 37}): 28,
 frozenset({12, 37}): 32,
 frozenset({10, 17, 25, 33}): 23,
 frozenset({2, 37}): 26,
 frozenset({10, 12, 25}): 33,
 frozenset({23, 33}): 25,
 frozenset({11, 23}): 25,
 frozenset({25, 33, 37}): 31,
 frozenset({10, 17, 25, 37}): 38,
 frozenset({17, 31, 37}): 27,
 frozenset({10, 17, 33}): 26,
 frozenset({10, 12, 37}): 29,
 frozenset({2, 10, 25}): 25,
 frozenset({23}): 45,
 frozenset({17, 23, 25, 37}): 25,
 frozenset({10, 11, 37}): 31,
 frozenset({10, 23, 37}): 40,
 frozenset({10, 25, 31}): 31,
 frozenset({37, 44}): 26,
 frozenset({25, 31}): 31,
 frozenset({17, 33, 37}): 23,
 frozenset({10, 11, 23}): 25,
 frozenset({31, 37}): 36,
 frozenset({10, 17, 31, 37}): 27}
In [33]:
# Repeat the legend next to the new itemsets: column position followed by column name.
for position, column_name in enumerate(new_data.columns):
    print(position, column_name)
0 email_bk.ru
1 email_edu.hse.ru
2 email_gmail.com
3 email_inbox.ru
4 email_mail.ru
5 email_outlook.com
6 email_yande.ru
7 email_yandex.ru
8 course_role_Лектор
9 course_role_Семинарист
10 course_role_Студент
11 gender_женский
12 gender_мужской
13 birth_date_1990
14 birth_date_1991
15 birth_date_1994
16 birth_date_1995
17 birth_date_1996
18 birth_date_1997
19 birth_date_2016
20 birth_date_2041
21 gpa_1
22 gpa_2
23 gpa_3
24 papers_many
25 papers_none
26 papers_some
27 english_level_Advanced
28 english_level_Beginner
29 english_level_Full proficiency
30 english_level_Intermediate
31 english_level_Upper-intermediate
32 course_name_Data Analysis
33 course_name_Data Mining
34 course_name_Data Science
35 course_name_Machine Learning
36 normal_params_1.0
37 normal_params_2.0
38 normal_params_3.0
39 normal_params_42.0
40 stat_crit_Критерий Мана-Уитни
41 stat_crit_Критерий Стьюдента
42 stat_crit_Критерий Стьюдента для связанных выборок
43 stat_crit_Критерий знаковых рангов Уилкоксона
44 stat_crit_Не знаю
In [34]:
# Print every rule with confidence of at least 0.9, most confident first, translating
# column positions back into readable one-hot column names.
ranked_rules = sorted(association_rules(itemsets, 0.9), key=lambda rule: rule[3], reverse=True)
for antecedent, consequent, support, confidence in ranked_rules:
    antecedent_names = [new_data.columns[position] for position in antecedent]
    consequent_names = [new_data.columns[position] for position in consequent]
    print("{} --> {} with supp = {}, conf = {}".format(antecedent_names, consequent_names,
                                                      support, confidence))
['birth_date_1996', 'normal_params_2.0', 'papers_none', 'gpa_3'] --> ['course_role_Студент'] with supp = 25, conf = 1.0
['birth_date_1996', 'papers_none', 'gpa_3'] --> ['course_role_Студент'] with supp = 28, conf = 1.0
['birth_date_1996', 'normal_params_2.0', 'gpa_3'] --> ['course_role_Студент'] with supp = 30, conf = 1.0
['course_name_Data Mining', 'normal_params_2.0', 'gpa_3'] --> ['course_role_Студент'] with supp = 24, conf = 1.0
['papers_none', 'normal_params_2.0', 'gpa_3'] --> ['course_role_Студент'] with supp = 35, conf = 1.0
['birth_date_1996', 'papers_none', 'english_level_Upper-intermediate'] --> ['course_role_Студент'] with supp = 24, conf = 1.0
['papers_none', 'gender_мужской', 'normal_params_2.0'] --> ['course_role_Студент'] with supp = 25, conf = 1.0
['papers_none', 'normal_params_2.0', 'course_name_Data Mining'] --> ['course_role_Студент'] with supp = 31, conf = 1.0
['gpa_3', 'normal_params_2.0', 'english_level_Upper-intermediate'] --> ['course_role_Студент'] with supp = 23, conf = 1.0
['papers_none', 'normal_params_2.0', 'english_level_Upper-intermediate'] --> ['course_role_Студент'] with supp = 28, conf = 1.0
['papers_none', 'gpa_3', 'course_name_Data Mining'] --> ['course_role_Студент'] with supp = 23, conf = 1.0
['birth_date_1996', 'gender_мужской', 'normal_params_2.0'] --> ['course_role_Студент'] with supp = 23, conf = 1.0
['papers_none', 'stat_crit_Не знаю', 'normal_params_2.0'] --> ['course_role_Студент'] with supp = 23, conf = 1.0
['birth_date_1996', 'gender_мужской', 'papers_none'] --> ['course_role_Студент'] with supp = 24, conf = 1.0
['birth_date_1996', 'normal_params_2.0', 'course_name_Data Mining'] --> ['course_role_Студент'] with supp = 23, conf = 1.0
['gender_женский', 'normal_params_2.0', 'gpa_3'] --> ['course_role_Студент'] with supp = 24, conf = 1.0
['papers_none', 'gender_женский', 'normal_params_2.0'] --> ['course_role_Студент'] with supp = 28, conf = 1.0
['birth_date_1996', 'papers_none', 'course_name_Data Mining'] --> ['course_role_Студент'] with supp = 23, conf = 1.0
['birth_date_1996', 'normal_params_2.0', 'papers_none'] --> ['course_role_Студент'] with supp = 38, conf = 1.0
['birth_date_1996', 'normal_params_2.0', 'english_level_Upper-intermediate'] --> ['course_role_Студент'] with supp = 27, conf = 1.0
['birth_date_1996', 'english_level_Upper-intermediate'] --> ['course_role_Студент'] with supp = 30, conf = 1.0
['birth_date_1996', 'stat_crit_Не знаю'] --> ['course_role_Студент'] with supp = 24, conf = 1.0
['birth_date_1996', 'normal_params_2.0'] --> ['course_role_Студент'] with supp = 44, conf = 1.0
['course_name_Data Mining', 'gpa_3'] --> ['course_role_Студент'] with supp = 25, conf = 1.0
['birth_date_1996', 'papers_none'] --> ['course_role_Студент'] with supp = 46, conf = 1.0
['gpa_3', 'english_level_Upper-intermediate'] --> ['course_role_Студент'] with supp = 25, conf = 1.0
['birth_date_1996', 'gender_мужской'] --> ['course_role_Студент'] with supp = 28, conf = 1.0
['papers_none', 'gpa_3'] --> ['course_role_Студент'] with supp = 39, conf = 1.0
['papers_none', 'gender_женский'] --> ['course_role_Студент'] with supp = 32, conf = 1.0
['papers_none', 'gpa_2'] --> ['course_role_Студент'] with supp = 24, conf = 1.0
['papers_none', 'normal_params_2.0'] --> ['course_role_Студент'] with supp = 53, conf = 1.0
['stat_crit_Не знаю', 'normal_params_2.0'] --> ['course_role_Студент'] with supp = 26, conf = 1.0
['birth_date_1996', 'gpa_3'] --> ['course_role_Студент'] with supp = 34, conf = 1.0
['birth_date_1996', 'gender_женский'] --> ['course_role_Студент'] with supp = 25, conf = 1.0
['papers_none', 'stat_crit_Не знаю'] --> ['course_role_Студент'] with supp = 28, conf = 1.0
['papers_none', 'course_name_Data Mining'] --> ['course_role_Студент'] with supp = 37, conf = 1.0
['papers_none', 'gender_мужской'] --> ['course_role_Студент'] with supp = 33, conf = 1.0
['birth_date_1996', 'course_name_Data Mining'] --> ['course_role_Студент'] with supp = 26, conf = 1.0
['papers_none', 'email_gmail.com'] --> ['course_role_Студент'] with supp = 25, conf = 1.0
['gender_женский', 'normal_params_2.0'] --> ['course_role_Студент'] with supp = 31, conf = 1.0
['normal_params_2.0', 'gpa_3'] --> ['course_role_Студент'] with supp = 40, conf = 1.0
['papers_none', 'english_level_Upper-intermediate'] --> ['course_role_Студент'] with supp = 31, conf = 1.0
['gender_женский', 'gpa_3'] --> ['course_role_Студент'] with supp = 25, conf = 1.0
['gpa_3'] --> ['course_role_Студент'] with supp = 45, conf = 1.0
['stat_crit_Не знаю'] --> ['course_role_Студент'] with supp = 31, conf = 1.0
['stat_crit_Критерий Стьюдента для связанных выборок'] --> ['course_role_Студент'] with supp = 24, conf = 1.0
['gender_женский'] --> ['course_role_Студент'] with supp = 35, conf = 1.0
['birth_date_1996'] --> ['course_role_Студент'] with supp = 53, conf = 1.0
['papers_none'] --> ['course_role_Студент'] with supp = 65, conf = 1.0
['course_name_Data Mining'] --> ['course_role_Студент'] with supp = 40, conf = 0.975609756097561
['course_name_Data Mining', 'normal_params_2.0'] --> ['course_role_Студент'] with supp = 34, conf = 0.9714285714285714
['course_name_Data Mining', 'course_role_Студент', 'gpa_3'] --> ['normal_params_2.0'] with supp = 24, conf = 0.96
['course_name_Data Mining', 'gpa_3'] --> ['course_role_Студент', 'normal_params_2.0'] with supp = 24, conf = 0.96
['course_role_Студент', 'gender_женский', 'gpa_3'] --> ['normal_params_2.0'] with supp = 24, conf = 0.96
['gender_женский', 'gpa_3'] --> ['course_role_Студент', 'normal_params_2.0'] with supp = 24, conf = 0.96
['gender_женский', 'gpa_3'] --> ['normal_params_2.0'] with supp = 24, conf = 0.96
['course_name_Data Mining', 'gpa_3'] --> ['normal_params_2.0'] with supp = 24, conf = 0.96
['normal_params_2.0'] --> ['course_role_Студент'] with supp = 60, conf = 0.9523809523809523
['english_level_Upper-intermediate'] --> ['course_role_Студент'] with supp = 38, conf = 0.95
['normal_params_2.0', 'english_level_Upper-intermediate'] --> ['course_role_Студент'] with supp = 34, conf = 0.9444444444444444
['gpa_2'] --> ['course_role_Студент'] with supp = 26, conf = 0.9285714285714286
['gender_мужской'] --> ['course_role_Студент'] with supp = 38, conf = 0.926829268292683
['course_name_Data Mining', 'course_role_Студент'] --> ['papers_none'] with supp = 37, conf = 0.925
['course_role_Студент', 'gpa_2'] --> ['papers_none'] with supp = 24, conf = 0.9230769230769231
['course_role_Студент', 'gpa_3', 'english_level_Upper-intermediate'] --> ['normal_params_2.0'] with supp = 23, conf = 0.92
['english_level_Upper-intermediate', 'gpa_3'] --> ['course_role_Студент', 'normal_params_2.0'] with supp = 23, conf = 0.92
['course_name_Data Mining', 'course_role_Студент', 'gpa_3'] --> ['papers_none'] with supp = 23, conf = 0.92
['course_name_Data Mining', 'gpa_3'] --> ['papers_none', 'course_role_Студент'] with supp = 23, conf = 0.92
['gpa_3', 'english_level_Upper-intermediate'] --> ['normal_params_2.0'] with supp = 23, conf = 0.92
['course_name_Data Mining', 'gpa_3'] --> ['papers_none'] with supp = 23, conf = 0.92
['course_role_Студент', 'gender_женский'] --> ['papers_none'] with supp = 32, conf = 0.9142857142857143
['gender_женский'] --> ['papers_none', 'course_role_Студент'] with supp = 32, conf = 0.9142857142857143
['gender_женский'] --> ['papers_none'] with supp = 32, conf = 0.9142857142857143
['course_name_Data Mining', 'course_role_Студент', 'normal_params_2.0'] --> ['papers_none'] with supp = 31, conf = 0.9117647058823529
['gender_мужской', 'normal_params_2.0'] --> ['course_role_Студент'] with supp = 29, conf = 0.90625
['papers_none', 'course_role_Студент', 'english_level_Upper-intermediate'] --> ['normal_params_2.0'] with supp = 28, conf = 0.9032258064516129
['papers_none', 'english_level_Upper-intermediate'] --> ['course_role_Студент', 'normal_params_2.0'] with supp = 28, conf = 0.9032258064516129
['course_role_Студент', 'gender_женский', 'normal_params_2.0'] --> ['papers_none'] with supp = 28, conf = 0.9032258064516129
['gender_женский', 'normal_params_2.0'] --> ['papers_none', 'course_role_Студент'] with supp = 28, conf = 0.9032258064516129
['papers_none', 'english_level_Upper-intermediate'] --> ['normal_params_2.0'] with supp = 28, conf = 0.9032258064516129
['gender_женский', 'normal_params_2.0'] --> ['papers_none'] with supp = 28, conf = 0.9032258064516129
['course_role_Студент', 'stat_crit_Не знаю'] --> ['papers_none'] with supp = 28, conf = 0.9032258064516129
['stat_crit_Не знаю'] --> ['papers_none', 'course_role_Студент'] with supp = 28, conf = 0.9032258064516129
['stat_crit_Не знаю'] --> ['papers_none'] with supp = 28, conf = 0.9032258064516129
['email_gmail.com'] --> ['course_role_Студент'] with supp = 28, conf = 0.9032258064516129
['course_name_Data Mining'] --> ['papers_none', 'course_role_Студент'] with supp = 37, conf = 0.9024390243902439
['course_name_Data Mining'] --> ['papers_none'] with supp = 37, conf = 0.9024390243902439
['birth_date_1996', 'course_role_Студент', 'english_level_Upper-intermediate'] --> ['normal_params_2.0'] with supp = 27, conf = 0.9
['birth_date_1996', 'english_level_Upper-intermediate'] --> ['course_role_Студент', 'normal_params_2.0'] with supp = 27, conf = 0.9
['birth_date_1996', 'english_level_Upper-intermediate'] --> ['normal_params_2.0'] with supp = 27, conf = 0.9
['english_level_Upper-intermediate'] --> ['normal_params_2.0'] with supp = 36, conf = 0.9
In [35]:
# Same listing as above, restricted to rules whose left-hand side has at most one item.
ranked_rules = sorted(association_rules(itemsets, 0.9), key=lambda rule: rule[3], reverse=True)
for antecedent, consequent, support, confidence in ranked_rules:
    if len(antecedent) <= 1:
        antecedent_names = [new_data.columns[position] for position in antecedent]
        consequent_names = [new_data.columns[position] for position in consequent]
        print("{} --> {} with supp = {}, conf = {}".format(antecedent_names, consequent_names,
                                                          support, confidence))
['gpa_3'] --> ['course_role_Студент'] with supp = 45, conf = 1.0
['stat_crit_Не знаю'] --> ['course_role_Студент'] with supp = 31, conf = 1.0
['stat_crit_Критерий Стьюдента для связанных выборок'] --> ['course_role_Студент'] with supp = 24, conf = 1.0
['gender_женский'] --> ['course_role_Студент'] with supp = 35, conf = 1.0
['birth_date_1996'] --> ['course_role_Студент'] with supp = 53, conf = 1.0
['papers_none'] --> ['course_role_Студент'] with supp = 65, conf = 1.0
['course_name_Data Mining'] --> ['course_role_Студент'] with supp = 40, conf = 0.975609756097561
['normal_params_2.0'] --> ['course_role_Студент'] with supp = 60, conf = 0.9523809523809523
['english_level_Upper-intermediate'] --> ['course_role_Студент'] with supp = 38, conf = 0.95
['gpa_2'] --> ['course_role_Студент'] with supp = 26, conf = 0.9285714285714286
['gender_мужской'] --> ['course_role_Студент'] with supp = 38, conf = 0.926829268292683
['gender_женский'] --> ['papers_none', 'course_role_Студент'] with supp = 32, conf = 0.9142857142857143
['gender_женский'] --> ['papers_none'] with supp = 32, conf = 0.9142857142857143
['stat_crit_Не знаю'] --> ['papers_none', 'course_role_Студент'] with supp = 28, conf = 0.9032258064516129
['stat_crit_Не знаю'] --> ['papers_none'] with supp = 28, conf = 0.9032258064516129
['email_gmail.com'] --> ['course_role_Студент'] with supp = 28, conf = 0.9032258064516129
['course_name_Data Mining'] --> ['papers_none', 'course_role_Студент'] with supp = 37, conf = 0.9024390243902439
['course_name_Data Mining'] --> ['papers_none'] with supp = 37, conf = 0.9024390243902439
['english_level_Upper-intermediate'] --> ['normal_params_2.0'] with supp = 36, conf = 0.9
In [36]:
# Print the mined rules whose left-hand side holds exactly two items,
# ordered by descending rule[3] (the value reported as "conf").
# Column indices stored in each rule are mapped back to new_data column names.
rule_template = "{} --> {} with supp = {}, conf = {}"
ranked_rules = sorted(association_rules(itemsets, 0.9), key=lambda r: -r[3])
for rule in ranked_rules:
    if len(rule[0]) == 2:
        antecedent = [new_data.columns[ftr] for ftr in rule[0]]
        consequent = [new_data.columns[ftr] for ftr in rule[1]]
        print(rule_template.format(antecedent, consequent, rule[2], rule[3]))
['birth_date_1996', 'english_level_Upper-intermediate'] --> ['course_role_Студент'] with supp = 30, conf = 1.0
['birth_date_1996', 'stat_crit_Не знаю'] --> ['course_role_Студент'] with supp = 24, conf = 1.0
['birth_date_1996', 'normal_params_2.0'] --> ['course_role_Студент'] with supp = 44, conf = 1.0
['course_name_Data Mining', 'gpa_3'] --> ['course_role_Студент'] with supp = 25, conf = 1.0
['birth_date_1996', 'papers_none'] --> ['course_role_Студент'] with supp = 46, conf = 1.0
['gpa_3', 'english_level_Upper-intermediate'] --> ['course_role_Студент'] with supp = 25, conf = 1.0
['birth_date_1996', 'gender_мужской'] --> ['course_role_Студент'] with supp = 28, conf = 1.0
['papers_none', 'gpa_3'] --> ['course_role_Студент'] with supp = 39, conf = 1.0
['papers_none', 'gender_женский'] --> ['course_role_Студент'] with supp = 32, conf = 1.0
['papers_none', 'gpa_2'] --> ['course_role_Студент'] with supp = 24, conf = 1.0
['papers_none', 'normal_params_2.0'] --> ['course_role_Студент'] with supp = 53, conf = 1.0
['stat_crit_Не знаю', 'normal_params_2.0'] --> ['course_role_Студент'] with supp = 26, conf = 1.0
['birth_date_1996', 'gpa_3'] --> ['course_role_Студент'] with supp = 34, conf = 1.0
['birth_date_1996', 'gender_женский'] --> ['course_role_Студент'] with supp = 25, conf = 1.0
['papers_none', 'stat_crit_Не знаю'] --> ['course_role_Студент'] with supp = 28, conf = 1.0
['papers_none', 'course_name_Data Mining'] --> ['course_role_Студент'] with supp = 37, conf = 1.0
['papers_none', 'gender_мужской'] --> ['course_role_Студент'] with supp = 33, conf = 1.0
['birth_date_1996', 'course_name_Data Mining'] --> ['course_role_Студент'] with supp = 26, conf = 1.0
['papers_none', 'email_gmail.com'] --> ['course_role_Студент'] with supp = 25, conf = 1.0
['gender_женский', 'normal_params_2.0'] --> ['course_role_Студент'] with supp = 31, conf = 1.0
['normal_params_2.0', 'gpa_3'] --> ['course_role_Студент'] with supp = 40, conf = 1.0
['papers_none', 'english_level_Upper-intermediate'] --> ['course_role_Студент'] with supp = 31, conf = 1.0
['gender_женский', 'gpa_3'] --> ['course_role_Студент'] with supp = 25, conf = 1.0
['course_name_Data Mining', 'normal_params_2.0'] --> ['course_role_Студент'] with supp = 34, conf = 0.9714285714285714
['course_name_Data Mining', 'gpa_3'] --> ['course_role_Студент', 'normal_params_2.0'] with supp = 24, conf = 0.96
['gender_женский', 'gpa_3'] --> ['course_role_Студент', 'normal_params_2.0'] with supp = 24, conf = 0.96
['gender_женский', 'gpa_3'] --> ['normal_params_2.0'] with supp = 24, conf = 0.96
['course_name_Data Mining', 'gpa_3'] --> ['normal_params_2.0'] with supp = 24, conf = 0.96
['normal_params_2.0', 'english_level_Upper-intermediate'] --> ['course_role_Студент'] with supp = 34, conf = 0.9444444444444444
['course_name_Data Mining', 'course_role_Студент'] --> ['papers_none'] with supp = 37, conf = 0.925
['course_role_Студент', 'gpa_2'] --> ['papers_none'] with supp = 24, conf = 0.9230769230769231
['english_level_Upper-intermediate', 'gpa_3'] --> ['course_role_Студент', 'normal_params_2.0'] with supp = 23, conf = 0.92
['course_name_Data Mining', 'gpa_3'] --> ['papers_none', 'course_role_Студент'] with supp = 23, conf = 0.92
['gpa_3', 'english_level_Upper-intermediate'] --> ['normal_params_2.0'] with supp = 23, conf = 0.92
['course_name_Data Mining', 'gpa_3'] --> ['papers_none'] with supp = 23, conf = 0.92
['course_role_Студент', 'gender_женский'] --> ['papers_none'] with supp = 32, conf = 0.9142857142857143
['gender_мужской', 'normal_params_2.0'] --> ['course_role_Студент'] with supp = 29, conf = 0.90625
['papers_none', 'english_level_Upper-intermediate'] --> ['course_role_Студент', 'normal_params_2.0'] with supp = 28, conf = 0.9032258064516129
['gender_женский', 'normal_params_2.0'] --> ['papers_none', 'course_role_Студент'] with supp = 28, conf = 0.9032258064516129
['papers_none', 'english_level_Upper-intermediate'] --> ['normal_params_2.0'] with supp = 28, conf = 0.9032258064516129
['gender_женский', 'normal_params_2.0'] --> ['papers_none'] with supp = 28, conf = 0.9032258064516129
['course_role_Студент', 'stat_crit_Не знаю'] --> ['papers_none'] with supp = 28, conf = 0.9032258064516129
['birth_date_1996', 'english_level_Upper-intermediate'] --> ['course_role_Студент', 'normal_params_2.0'] with supp = 27, conf = 0.9
['birth_date_1996', 'english_level_Upper-intermediate'] --> ['normal_params_2.0'] with supp = 27, conf = 0.9