import numpy as np
import pandas as pd
pd.set_option("display.max_columns", 999)
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances
import plotly.graph_objects as go
import plotly.express as px
from sklearn.tree import DecisionTreeClassifier, _tree
from IPython.display import display, HTML
# Load the Telco churn dataset; force MonthlyCharges to float up front so the
# numeric dtype is guaranteed regardless of the parser's inference.
df = pd.read_csv(
    "WA_Fn-UseC_-Telco-Customer-Churn.csv",
    dtype={"MonthlyCharges": float},
)
df.head()
customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7590-VHVEG | Female | 0 | Yes | No | 1 | No | No phone service | DSL | No | Yes | No | No | No | No | Month-to-month | Yes | Electronic check | 29.85 | 29.85 | No |
1 | 5575-GNVDE | Male | 0 | No | No | 34 | Yes | No | DSL | Yes | No | Yes | No | No | No | One year | No | Mailed check | 56.95 | 1889.5 | No |
2 | 3668-QPYBK | Male | 0 | No | No | 2 | Yes | No | DSL | Yes | Yes | No | No | No | No | Month-to-month | Yes | Mailed check | 53.85 | 108.15 | Yes |
3 | 7795-CFOCW | Male | 0 | No | No | 45 | No | No phone service | DSL | Yes | No | Yes | Yes | No | No | One year | No | Bank transfer (automatic) | 42.30 | 1840.75 | No |
4 | 9237-HQITU | Female | 0 | No | No | 2 | Yes | No | Fiber optic | No | No | No | No | No | No | Month-to-month | Yes | Electronic check | 70.70 | 151.65 | Yes |
def to_num(total_charges, monthly_charges, tenure):
    """Coerce a raw TotalCharges value to float.

    The CSV stores TotalCharges as text and some rows hold non-numeric
    values (e.g. a blank string for zero-tenure customers). When the value
    cannot be parsed, fall back to the reconstruction monthly_charges * tenure.

    The original used a bare `except:`, which also swallows KeyboardInterrupt
    and SystemExit; only the conversion failures are expected here.
    """
    try:
        return float(total_charges)
    except (ValueError, TypeError):
        return monthly_charges * tenure
# Repair TotalCharges row by row, reconstructing unparseable entries.
fixed_total = df.apply(
    lambda row: to_num(row["TotalCharges"], row["MonthlyCharges"], row["tenure"]),
    axis=1,
)
df_with_fixed_types = df.assign(TotalCharges=fixed_total)
# Keep only the numeric columns (ints and floats).
df_num = df_with_fixed_types.select_dtypes(include=[int, float])
df_num.columns
Index(['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges'], dtype='object')
# Categorical columns only; the customer identifier carries no signal.
df_cat = (
    df_with_fixed_types
    .select_dtypes(include="object")
    .drop(columns="customerID")
)
df_cat.columns
Index(['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'Churn'], dtype='object')
# One-hot encode every categorical column; drop_first removes one level per
# variable to avoid perfect collinearity among the dummies.
df_dummies = pd.get_dummies(data=df_cat, drop_first=True)
df_dummies.head()
gender_Male | Partner_Yes | Dependents_Yes | PhoneService_Yes | MultipleLines_No phone service | MultipleLines_Yes | InternetService_Fiber optic | InternetService_No | OnlineSecurity_No internet service | OnlineSecurity_Yes | OnlineBackup_No internet service | OnlineBackup_Yes | DeviceProtection_No internet service | DeviceProtection_Yes | TechSupport_No internet service | TechSupport_Yes | StreamingTV_No internet service | StreamingTV_Yes | StreamingMovies_No internet service | StreamingMovies_Yes | Contract_One year | Contract_Two year | PaperlessBilling_Yes | PaymentMethod_Credit card (automatic) | PaymentMethod_Electronic check | PaymentMethod_Mailed check | Churn_Yes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 |
1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
2 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
3 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 |
# Recombine numeric features and dummies side by side (both frames share the
# original row index, so this aligns one-to-one).
df_features_and_target = pd.concat([df_num, df_dummies], axis=1)
df_features_and_target.head()
SeniorCitizen | tenure | MonthlyCharges | TotalCharges | gender_Male | Partner_Yes | Dependents_Yes | PhoneService_Yes | MultipleLines_No phone service | MultipleLines_Yes | InternetService_Fiber optic | InternetService_No | OnlineSecurity_No internet service | OnlineSecurity_Yes | OnlineBackup_No internet service | OnlineBackup_Yes | DeviceProtection_No internet service | DeviceProtection_Yes | TechSupport_No internet service | TechSupport_Yes | StreamingTV_No internet service | StreamingTV_Yes | StreamingMovies_No internet service | StreamingMovies_Yes | Contract_One year | Contract_Two year | PaperlessBilling_Yes | PaymentMethod_Credit card (automatic) | PaymentMethod_Electronic check | PaymentMethod_Mailed check | Churn_Yes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 29.85 | 29.85 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 |
1 | 0 | 34 | 56.95 | 1889.50 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
2 | 0 | 2 | 53.85 | 108.15 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
3 | 0 | 45 | 42.30 | 1840.75 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 0 | 2 | 70.70 | 151.65 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 |
# Separate the label from the feature matrix.
target = "Churn_Yes"
y = df_features_and_target[target]
x = df_features_and_target.drop(columns=target)
y.describe()
count 7043.000000 mean 0.265370 std 0.441561 min 0.000000 25% 0.000000 50% 0.000000 75% 1.000000 max 1.000000 Name: Churn_Yes, dtype: float64
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1)
# min_samples_leaf=20 regularises the trees; leaves stay large enough that
# the per-leaf class distributions are meaningful.
rf = RandomForestClassifier(
    n_estimators=200,
    min_samples_leaf=20,
    n_jobs=8,
    random_state=123,
)
rf.fit(x_train, y_train)
RandomForestClassifier(min_samples_leaf=20, n_estimators=200, n_jobs=8, random_state=123)
rf.score(x_train, y_train), rf.score(x_test, y_test)
(0.8202903124013885, 0.7971631205673759)
feature_importances = rf.feature_importances_
# Bar chart of the forest's feature importances.
importance_bar = go.Bar(
    name='Features importances',
    x=x.columns,
    y=feature_importances,
)
fig = go.Figure(data=[importance_bar])
fig.show()
n_clusters = 5
# Restrict the clustering to customers the forest considers at risk
# (predicted churn probability above 0.3).
churn_probability = rf.predict_proba(x)[:, 1]
df_with_likely_churn = df_features_and_target[churn_probability > 0.3]
x_with_likely_churn = df_with_likely_churn.drop(columns=target)
# Standardize so no single feature dominates the euclidean distances.
x_scaled = StandardScaler().fit_transform(x_with_likely_churn)
# Baseline: average-linkage agglomerative clustering on the scaled features
# (euclidean distance by default).
aggl_clustering_euclidian = AgglomerativeClustering(
    linkage="average",
    n_clusters=n_clusters,
)
aggl_clustering_euclidian.fit(x_scaled)
AgglomerativeClustering(linkage='average', n_clusters=5)
# Weight each standardized column by its forest importance before clustering;
# broadcasting row-wise over columns is equivalent to the transpose trick
# (x_scaled.T * w.reshape(-1, 1)).T.
x_with_rf_feature_importance = x_scaled * feature_importances
aggl_clustering_feature_importance = AgglomerativeClustering(
    n_clusters=n_clusters,
    linkage="average",
)
aggl_clustering_feature_importance.fit(x_with_rf_feature_importance)
AgglomerativeClustering(linkage='average', n_clusters=5)
# Represent each customer by the leaf index it reaches in every tree of the
# forest, then define distance as the fraction of trees in which two
# customers land in different leaves (hamming on the leaf encoding).
encoding = rf.apply(x_with_likely_churn)
distance_matrix = pairwise_distances(encoding, metric="hamming")
aggl_clustering_from_rf = AgglomerativeClustering(
    n_clusters=n_clusters,
    linkage="average",
    affinity="precomputed",
)
aggl_clustering_from_rf.fit(distance_matrix)
AgglomerativeClustering(affinity='precomputed', linkage='average', n_clusters=5)
# Attach the three cluster assignments as string columns so plotly treats
# them as discrete categories rather than a continuous scale.
df_clusters = df_with_likely_churn.assign(
    cluster_rf=[str(label) for label in aggl_clustering_from_rf.labels_],
    cluster_eucl=[str(label) for label in aggl_clustering_euclidian.labels_],
    cluster_fi=[str(label) for label in aggl_clustering_feature_importance.labels_],
)
# Functions retrieved from https://towardsdatascience.com/the-easiest-way-to-interpret-clustering-result-8137e488a127
def pretty_print(df):
    """Render a DataFrame as notebook HTML, turning escaped newlines into <br> tags."""
    html = df.to_html().replace("\\n", "<br>")
    return display(HTML(html))
def get_class_rules(tree: DecisionTreeClassifier, feature_names: list):
    """Extract human-readable decision rules per predicted class.

    Walks the fitted tree depth-first; every leaf contributes one rule string
    (the conjunction of the split conditions on the root-to-leaf path),
    assigned to the leaf's majority class together with that class's
    proportion in the leaf.

    Returns a dict mapping class label -> list of (rule_string, probability).
    """
    inner_tree: _tree.Tree = tree.tree_
    classes = tree.classes_
    class_rules_dict = dict()

    # current_rule defaults to an immutable tuple: the original used a mutable
    # default argument (current_rule=[]), a classic Python pitfall.
    def tree_dfs(node_id=0, current_rule=()):
        # feature[i] holds the feature to split on, for the internal node i.
        split_feature = inner_tree.feature[node_id]
        if split_feature != _tree.TREE_UNDEFINED:  # internal node
            name = feature_names[split_feature]
            threshold = inner_tree.threshold[node_id]
            # left child: condition holds (<= threshold)
            tree_dfs(inner_tree.children_left[node_id],
                     current_rule + ("({} <= {})".format(name, threshold),))
            # right child: condition fails (> threshold)
            tree_dfs(inner_tree.children_right[node_id],
                     current_rule + ("({} > {})".format(name, threshold),))
        else:  # leaf: normalise the class counts and record the path
            dist = inner_tree.value[node_id][0]
            dist = dist / dist.sum()
            max_idx = dist.argmax()
            if len(current_rule) == 0:
                rule_string = "ALL"  # degenerate tree: root is a leaf
            else:
                rule_string = " and ".join(current_rule)
            # register new rule to dictionary
            selected_class = classes[max_idx]
            class_probability = dist[max_idx]
            class_rules = class_rules_dict.get(selected_class, [])
            class_rules.append((rule_string, class_probability))
            class_rules_dict[selected_class] = class_rules

    tree_dfs()  # start from root, node_id = 0
    return class_rules_dict
def cluster_report(data: pd.DataFrame, clusters, min_samples_leaf=50, pruning_level=0.01):
    """Fit a surrogate decision tree predicting the cluster labels, then
    display each cluster's size alongside the rules that characterise it."""
    # Surrogate model: a cost-complexity-pruned tree over the features.
    surrogate = DecisionTreeClassifier(min_samples_leaf=min_samples_leaf,
                                       ccp_alpha=pruning_level)
    surrogate.fit(data, clusters)

    # Collapse each class's rules into one display string per cluster.
    rules_by_class = get_class_rules(surrogate, data.columns)
    report_class_list = []
    for class_name, rule_list in rules_by_class.items():
        combined_string = "".join(
            "[{}] {}\n\n".format(probability, rule)
            for rule, probability in rule_list
        )
        report_class_list.append((class_name, combined_string))

    # Cluster sizes; left merge keeps clusters even when the surrogate tree
    # produced no dedicated rule for them (rule_list becomes NaN).
    cluster_instance_df = pd.Series(clusters).value_counts().reset_index()
    cluster_instance_df.columns = ["class_name", "instance_count"]
    report_df = pd.DataFrame(report_class_list, columns=["class_name", "rule_list"])
    report_df = pd.merge(cluster_instance_df, report_df, on="class_name", how="left")
    pretty_print(report_df.sort_values(by="class_name")
                 .reset_index(drop=True)[["class_name", "instance_count", "rule_list"]])
cluster_report(x_with_likely_churn, aggl_clustering_euclidian.labels_, min_samples_leaf=20, pruning_level=0.01)
class_name | instance_count | rule_list | |
---|---|---|---|
0 | 0 | 2197 | [0.9995446265938069] (MonthlyCharges > 43.70000076293945) and (Contract_One year <= 0.5) and (PhoneService_Yes > 0.5) |
1 | 1 | 217 | [1.0] (MonthlyCharges <= 43.70000076293945) and (PhoneService_Yes <= 0.5) [1.0] (MonthlyCharges > 43.70000076293945) and (Contract_One year <= 0.5) and (PhoneService_Yes <= 0.5) |
2 | 2 | 141 | [0.986013986013986] (MonthlyCharges <= 43.70000076293945) and (PhoneService_Yes > 0.5) |
3 | 3 | 1 | NaN |
4 | 4 | 79 | [0.9875] (MonthlyCharges > 43.70000076293945) and (Contract_One year > 0.5) |
cluster_report(x_with_likely_churn, aggl_clustering_feature_importance.labels_, min_samples_leaf=20, pruning_level=0.01)
class_name | instance_count | rule_list | |
---|---|---|---|
0 | 0 | 17 | [0.7727272727272727] (tenure > 20.5) and (TotalCharges <= 5715.474853515625) and (MonthlyCharges <= 69.2750015258789) |
1 | 1 | 1946 | [0.9896854048478597] (tenure <= 20.5) |
2 | 2 | 66 | [0.9672131147540983] (tenure > 20.5) and (TotalCharges > 5715.474853515625) |
3 | 3 | 1 | NaN |
4 | 4 | 605 | [0.9494290375203915] (tenure > 20.5) and (TotalCharges <= 5715.474853515625) and (MonthlyCharges > 69.2750015258789) |
cluster_report(x_with_likely_churn, aggl_clustering_from_rf.labels_, min_samples_leaf=10, pruning_level=0.01)
class_name | instance_count | rule_list | |
---|---|---|---|
0 | 0 | 593 | [1.0] (InternetService_Fiber optic <= 0.5) and (TechSupport_No internet service <= 0.5) |
1 | 1 | 1127 | [0.9725177304964538] (InternetService_Fiber optic > 0.5) and (tenure <= 17.5) |
2 | 2 | 541 | [0.9460966542750929] (InternetService_Fiber optic > 0.5) and (tenure > 17.5) and (PaymentMethod_Electronic check > 0.5) |
3 | 3 | 141 | [1.0] (InternetService_Fiber optic <= 0.5) and (TechSupport_No internet service > 0.5) |
4 | 4 | 233 | [0.9787234042553191] (InternetService_Fiber optic > 0.5) and (tenure > 17.5) and (PaymentMethod_Electronic check <= 0.5) |
def get_distinct_legend_elements_for_cluster_and_payement(fig):
    """De-duplicate the cluster legend entries created by the color+symbol
    cross-product and add two grey marker entries explaining the symbols."""
    seen_clusters = []
    for trace in fig.data:
        # px names combined traces "cluster, symbol"; keep the cluster part.
        cluster_label = trace.name.split(',')[0]
        if cluster_label in seen_clusters:
            trace["showlegend"] = False
        else:
            trace["name"] = "Cluster " + cluster_label
            seen_clusters.append(cluster_label)
    # Dummy single-point-less traces serve purely as legend entries.
    fig.add_trace(go.Scatter(y=[None], mode="markers",
                             marker=dict(symbol="square", color="grey"),
                             name="Payment by electronic check",
                             ))
    fig.add_trace(go.Scatter(y=[None], mode="markers",
                             marker=dict(symbol="x", color="grey"),
                             name="Payment with other methods",
                             ))
    fig.update_layout(legend_title_text="Clusters and payment methods")
# Euclidean clustering: charges vs tenure, symbol encodes electronic check.
fig_eucl = px.scatter(
    df_clusters.rename(columns={"cluster_eucl": "cluster"}),
    x="MonthlyCharges",
    y="tenure",
    color="cluster",
    symbol="PaymentMethod_Electronic check",
    symbol_map={0: "square", 1: "x"},
)
get_distinct_legend_elements_for_cluster_and_payement(fig_eucl)
fig_eucl.show()
# Importance-weighted clustering, same axes and symbol encoding.
fig_fi = px.scatter(
    df_clusters.rename(columns={"cluster_fi": "cluster"}),
    x="MonthlyCharges",
    y="tenure",
    color="cluster",
    symbol="PaymentMethod_Electronic check",
    symbol_map={0: "square", 1: "x"},
)
get_distinct_legend_elements_for_cluster_and_payement(fig_fi)
fig_fi.show()
# Forest-leaf (hamming) clustering, same axes and symbol encoding.
fig_rf = px.scatter(
    df_clusters.rename(columns={"cluster_rf": "cluster"}),
    x="MonthlyCharges",
    y="tenure",
    color="cluster",
    symbol="PaymentMethod_Electronic check",
    symbol_map={0: "square", 1: "x"},
)
get_distinct_legend_elements_for_cluster_and_payement(fig_rf)
fig_rf.show()