import numpy as np
import pandas as pd
pd.set_option("display.max_columns", 999)
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances
import plotly.graph_objects as go
import plotly.express as px
from sklearn.tree import DecisionTreeClassifier, _tree
from IPython.display import display, HTML
# Load the Telco churn dataset; force MonthlyCharges to float up front so the
# numeric dtype is guaranteed regardless of the parser's inference.
df = pd.read_csv(
    "WA_Fn-UseC_-Telco-Customer-Churn.csv",
    dtype={"MonthlyCharges": float},
)
df.head()
customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7590-VHVEG | Female | 0 | Yes | No | 1 | No | No phone service | DSL | No | Yes | No | No | No | No | Month-to-month | Yes | Electronic check | 29.85 | 29.85 | No |
1 | 5575-GNVDE | Male | 0 | No | No | 34 | Yes | No | DSL | Yes | No | Yes | No | No | No | One year | No | Mailed check | 56.95 | 1889.5 | No |
2 | 3668-QPYBK | Male | 0 | No | No | 2 | Yes | No | DSL | Yes | Yes | No | No | No | No | Month-to-month | Yes | Mailed check | 53.85 | 108.15 | Yes |
3 | 7795-CFOCW | Male | 0 | No | No | 45 | No | No phone service | DSL | Yes | No | Yes | Yes | No | No | One year | No | Bank transfer (automatic) | 42.30 | 1840.75 | No |
4 | 9237-HQITU | Female | 0 | No | No | 2 | Yes | No | Fiber optic | No | No | No | No | No | No | Month-to-month | Yes | Electronic check | 70.70 | 151.65 | Yes |
def to_num(total_charges, monthly_charges, tenure):
    """Coerce a raw TotalCharges value to float.

    The CSV stores TotalCharges as text and some rows hold non-numeric
    values (e.g. a blank string for zero-tenure customers). When the value
    cannot be parsed, fall back to the reconstruction monthly_charges * tenure.

    The original used a bare `except:`, which also swallows KeyboardInterrupt
    and SystemExit; only the conversion failures are expected here.
    """
    try:
        return float(total_charges)
    except (ValueError, TypeError):
        return monthly_charges * tenure
# Repair TotalCharges row by row, reconstructing unparseable entries.
fixed_total = df.apply(
    lambda row: to_num(row["TotalCharges"], row["MonthlyCharges"], row["tenure"]),
    axis=1,
)
df_with_fixed_types = df.assign(TotalCharges=fixed_total)
# Keep only the numeric columns (ints and floats).
df_num = df_with_fixed_types.select_dtypes(include=[int, float])
df_num.columns
Index(['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges'], dtype='object')
# Categorical columns only; the customer identifier carries no signal.
df_cat = (
    df_with_fixed_types
    .select_dtypes(include="object")
    .drop(columns="customerID")
)
df_cat.columns
Index(['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'Churn'], dtype='object')
# One-hot encode every categorical column; drop_first removes one level per
# variable to avoid perfect collinearity among the dummies.
df_dummies = pd.get_dummies(data=df_cat, drop_first=True)
df_dummies.head()
gender_Male | Partner_Yes | Dependents_Yes | PhoneService_Yes | MultipleLines_No phone service | MultipleLines_Yes | InternetService_Fiber optic | InternetService_No | OnlineSecurity_No internet service | OnlineSecurity_Yes | OnlineBackup_No internet service | OnlineBackup_Yes | DeviceProtection_No internet service | DeviceProtection_Yes | TechSupport_No internet service | TechSupport_Yes | StreamingTV_No internet service | StreamingTV_Yes | StreamingMovies_No internet service | StreamingMovies_Yes | Contract_One year | Contract_Two year | PaperlessBilling_Yes | PaymentMethod_Credit card (automatic) | PaymentMethod_Electronic check | PaymentMethod_Mailed check | Churn_Yes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 |
1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
2 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
3 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 |
# Recombine numeric features and dummies side by side (both frames share the
# original row index, so this aligns one-to-one).
df_features_and_target = pd.concat([df_num, df_dummies], axis=1)
df_features_and_target.head()
SeniorCitizen | tenure | MonthlyCharges | TotalCharges | gender_Male | Partner_Yes | Dependents_Yes | PhoneService_Yes | MultipleLines_No phone service | MultipleLines_Yes | InternetService_Fiber optic | InternetService_No | OnlineSecurity_No internet service | OnlineSecurity_Yes | OnlineBackup_No internet service | OnlineBackup_Yes | DeviceProtection_No internet service | DeviceProtection_Yes | TechSupport_No internet service | TechSupport_Yes | StreamingTV_No internet service | StreamingTV_Yes | StreamingMovies_No internet service | StreamingMovies_Yes | Contract_One year | Contract_Two year | PaperlessBilling_Yes | PaymentMethod_Credit card (automatic) | PaymentMethod_Electronic check | PaymentMethod_Mailed check | Churn_Yes | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 29.85 | 29.85 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 |
1 | 0 | 34 | 56.95 | 1889.50 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
2 | 0 | 2 | 53.85 | 108.15 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
3 | 0 | 45 | 42.30 | 1840.75 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 0 | 2 | 70.70 | 151.65 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 |
# Separate the label from the feature matrix.
target = "Churn_Yes"
y = df_features_and_target[target]
x = df_features_and_target.drop(columns=target)
y.describe()
count 7043.000000 mean 0.265370 std 0.441561 min 0.000000 25% 0.000000 50% 0.000000 75% 1.000000 max 1.000000 Name: Churn_Yes, dtype: float64
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1)
# min_samples_leaf=20 regularises the trees; leaves stay large enough that
# the per-leaf class distributions are meaningful.
rf = RandomForestClassifier(
    n_estimators=200,
    min_samples_leaf=20,
    n_jobs=8,
    random_state=123,
)
rf.fit(x_train, y_train)
RandomForestClassifier(min_samples_leaf=20, n_estimators=200, n_jobs=8, random_state=123)
rf.score(x_train, y_train), rf.score(x_test, y_test)
(0.8202903124013885, 0.7971631205673759)
feature_importances = rf.feature_importances_
# Bar chart of the forest's feature importances.
importance_bar = go.Bar(
    name='Features importances',
    x=x.columns,
    y=feature_importances,
)
fig = go.Figure(data=[importance_bar])
fig.show()
n_clusters = 5
# Restrict the clustering to customers the forest considers at risk
# (predicted churn probability above 0.3).
churn_probability = rf.predict_proba(x)[:, 1]
df_with_likely_churn = df_features_and_target[churn_probability > 0.3]
x_with_likely_churn = df_with_likely_churn.drop(columns=target)
# Standardize so no single feature dominates the euclidean distances.
x_scaled = StandardScaler().fit_transform(x_with_likely_churn)
# Baseline: average-linkage agglomerative clustering on the scaled features
# (euclidean distance by default).
aggl_clustering_euclidian = AgglomerativeClustering(
    linkage="average",
    n_clusters=n_clusters,
)
aggl_clustering_euclidian.fit(x_scaled)
AgglomerativeClustering(linkage='average', n_clusters=5)
# Weight each standardized column by its forest importance before clustering;
# broadcasting row-wise over columns is equivalent to the transpose trick
# (x_scaled.T * w.reshape(-1, 1)).T.
x_with_rf_feature_importance = x_scaled * feature_importances
aggl_clustering_feature_importance = AgglomerativeClustering(
    n_clusters=n_clusters,
    linkage="average",
)
aggl_clustering_feature_importance.fit(x_with_rf_feature_importance)
AgglomerativeClustering(linkage='average', n_clusters=5)
# Represent each customer by the leaf index it reaches in every tree of the
# forest, then define distance as the fraction of trees in which two
# customers land in different leaves (hamming on the leaf encoding).
encoding = rf.apply(x_with_likely_churn)
distance_matrix = pairwise_distances(encoding, metric="hamming")
aggl_clustering_from_rf = AgglomerativeClustering(
    n_clusters=n_clusters,
    linkage="average",
    affinity="precomputed",
)
aggl_clustering_from_rf.fit(distance_matrix)
AgglomerativeClustering(affinity='precomputed', linkage='average', n_clusters=5)
# Attach the three cluster assignments as string columns so plotly treats
# them as discrete categories rather than a continuous scale.
df_clusters = df_with_likely_churn.assign(
    cluster_rf=[str(label) for label in aggl_clustering_from_rf.labels_],
    cluster_eucl=[str(label) for label in aggl_clustering_euclidian.labels_],
    cluster_fi=[str(label) for label in aggl_clustering_feature_importance.labels_],
)
# Functions retrieved from https://towardsdatascience.com/the-easiest-way-to-interpret-clustering-result-8137e488a127
def pretty_print(df):
    """Render a DataFrame as notebook HTML, turning escaped newlines into <br> tags."""
    html = df.to_html().replace("\\n", "<br>")
    return display(HTML(html))
def get_class_rules(tree: DecisionTreeClassifier, feature_names: list):
    """Extract human-readable decision rules per predicted class.

    Walks the fitted tree depth-first; every leaf contributes one rule string
    (the conjunction of the split conditions on the root-to-leaf path),
    assigned to the leaf's majority class together with that class's
    proportion in the leaf.

    Returns a dict mapping class label -> list of (rule_string, probability).
    """
    inner_tree: _tree.Tree = tree.tree_
    classes = tree.classes_
    class_rules_dict = dict()

    # current_rule defaults to an immutable tuple: the original used a mutable
    # default argument (current_rule=[]), a classic Python pitfall.
    def tree_dfs(node_id=0, current_rule=()):
        # feature[i] holds the feature to split on, for the internal node i.
        split_feature = inner_tree.feature[node_id]
        if split_feature != _tree.TREE_UNDEFINED:  # internal node
            name = feature_names[split_feature]
            threshold = inner_tree.threshold[node_id]
            # left child: condition holds (<= threshold)
            tree_dfs(inner_tree.children_left[node_id],
                     current_rule + ("({} <= {})".format(name, threshold),))
            # right child: condition fails (> threshold)
            tree_dfs(inner_tree.children_right[node_id],
                     current_rule + ("({} > {})".format(name, threshold),))
        else:  # leaf: normalise the class counts and record the path
            dist = inner_tree.value[node_id][0]
            dist = dist / dist.sum()
            max_idx = dist.argmax()
            if len(current_rule) == 0:
                rule_string = "ALL"  # degenerate tree: root is a leaf
            else:
                rule_string = " and ".join(current_rule)
            # register new rule to dictionary
            selected_class = classes[max_idx]
            class_probability = dist[max_idx]
            class_rules = class_rules_dict.get(selected_class, [])
            class_rules.append((rule_string, class_probability))
            class_rules_dict[selected_class] = class_rules

    tree_dfs()  # start from root, node_id = 0
    return class_rules_dict
def cluster_report(data: pd.DataFrame, clusters, min_samples_leaf=50, pruning_level=0.01):
    """Fit a surrogate decision tree predicting the cluster labels, then
    display each cluster's size alongside the rules that characterise it."""
    # Surrogate model: a cost-complexity-pruned tree over the features.
    surrogate = DecisionTreeClassifier(min_samples_leaf=min_samples_leaf,
                                       ccp_alpha=pruning_level)
    surrogate.fit(data, clusters)

    # Collapse each class's rules into one display string per cluster.
    rules_by_class = get_class_rules(surrogate, data.columns)
    report_class_list = []
    for class_name, rule_list in rules_by_class.items():
        combined_string = "".join(
            "[{}] {}\n\n".format(probability, rule)
            for rule, probability in rule_list
        )
        report_class_list.append((class_name, combined_string))

    # Cluster sizes; left merge keeps clusters even when the surrogate tree
    # produced no dedicated rule for them (rule_list becomes NaN).
    cluster_instance_df = pd.Series(clusters).value_counts().reset_index()
    cluster_instance_df.columns = ["class_name", "instance_count"]
    report_df = pd.DataFrame(report_class_list, columns=["class_name", "rule_list"])
    report_df = pd.merge(cluster_instance_df, report_df, on="class_name", how="left")
    pretty_print(report_df.sort_values(by="class_name")
                 .reset_index(drop=True)[["class_name", "instance_count", "rule_list"]])
cluster_report(x_with_likely_churn, aggl_clustering_euclidian.labels_, min_samples_leaf=20, pruning_level=0.01)
class_name | instance_count | rule_list | |
---|---|---|---|
0 | 0 | 2197 | [0.9995446265938069] (MonthlyCharges > 43.70000076293945) and (Contract_One year <= 0.5) and (PhoneService_Yes > 0.5) |
1 | 1 | 217 | [1.0] (MonthlyCharges <= 43.70000076293945) and (PhoneService_Yes <= 0.5) [1.0] (MonthlyCharges > 43.70000076293945) and (Contract_One year <= 0.5) and (PhoneService_Yes <= 0.5) |
2 | 2 | 141 | [0.986013986013986] (MonthlyCharges <= 43.70000076293945) and (PhoneService_Yes > 0.5) |
3 | 3 | 1 | NaN |
4 | 4 | 79 | [0.9875] (MonthlyCharges > 43.70000076293945) and (Contract_One year > 0.5) |
cluster_report(x_with_likely_churn, aggl_clustering_feature_importance.labels_, min_samples_leaf=20, pruning_level=0.01)
class_name | instance_count | rule_list | |
---|---|---|---|
0 | 0 | 17 | [0.7727272727272727] (tenure > 20.5) and (TotalCharges <= 5715.474853515625) and (MonthlyCharges <= 69.2750015258789) |
1 | 1 | 1946 | [0.9896854048478597] (tenure <= 20.5) |
2 | 2 | 66 | [0.9672131147540983] (tenure > 20.5) and (TotalCharges > 5715.474853515625) |
3 | 3 | 1 | NaN |
4 | 4 | 605 | [0.9494290375203915] (tenure > 20.5) and (TotalCharges <= 5715.474853515625) and (MonthlyCharges > 69.2750015258789) |
cluster_report(x_with_likely_churn, aggl_clustering_from_rf.labels_, min_samples_leaf=10, pruning_level=0.01)
class_name | instance_count | rule_list | |
---|---|---|---|
0 | 0 | 593 | [1.0] (InternetService_Fiber optic <= 0.5) and (TechSupport_No internet service <= 0.5) |
1 | 1 | 1127 | [0.9725177304964538] (InternetService_Fiber optic > 0.5) and (tenure <= 17.5) |
2 | 2 | 541 | [0.9460966542750929] (InternetService_Fiber optic > 0.5) and (tenure > 17.5) and (PaymentMethod_Electronic check > 0.5) |
3 | 3 | 141 | [1.0] (InternetService_Fiber optic <= 0.5) and (TechSupport_No internet service > 0.5) |
4 | 4 | 233 | [0.9787234042553191] (InternetService_Fiber optic > 0.5) and (tenure > 17.5) and (PaymentMethod_Electronic check <= 0.5) |
def get_distinct_legend_elements_for_cluster_and_payement(fig):
    """De-duplicate the cluster legend entries created by the color+symbol
    cross-product and add two grey marker entries explaining the symbols."""
    seen_clusters = []
    for trace in fig.data:
        # px names combined traces "cluster, symbol"; keep the cluster part.
        cluster_label = trace.name.split(',')[0]
        if cluster_label in seen_clusters:
            trace["showlegend"] = False
        else:
            trace["name"] = "Cluster " + cluster_label
            seen_clusters.append(cluster_label)
    # Dummy single-point-less traces serve purely as legend entries.
    fig.add_trace(go.Scatter(y=[None], mode="markers",
                             marker=dict(symbol="square", color="grey"),
                             name="Payment by electronic check",
                             ))
    fig.add_trace(go.Scatter(y=[None], mode="markers",
                             marker=dict(symbol="x", color="grey"),
                             name="Payment with other methods",
                             ))
    fig.update_layout(legend_title_text="Clusters and payment methods")
# Euclidean clustering: charges vs tenure, symbol encodes electronic check.
fig_eucl = px.scatter(
    df_clusters.rename(columns={"cluster_eucl": "cluster"}),
    x="MonthlyCharges",
    y="tenure",
    color="cluster",
    symbol="PaymentMethod_Electronic check",
    symbol_map={0: "square", 1: "x"},
)
get_distinct_legend_elements_for_cluster_and_payement(fig_eucl)
fig_eucl.show()
# Importance-weighted clustering, same axes and symbol encoding.
fig_fi = px.scatter(
    df_clusters.rename(columns={"cluster_fi": "cluster"}),
    x="MonthlyCharges",
    y="tenure",
    color="cluster",
    symbol="PaymentMethod_Electronic check",
    symbol_map={0: "square", 1: "x"},
)
get_distinct_legend_elements_for_cluster_and_payement(fig_fi)
fig_fi.show()
# Forest-leaf (hamming) clustering, same axes and symbol encoding.
fig_rf = px.scatter(
    df_clusters.rename(columns={"cluster_rf": "cluster"}),
    x="MonthlyCharges",
    y="tenure",
    color="cluster",
    symbol="PaymentMethod_Electronic check",
    symbol_map={0: "square", 1: "x"},
)
get_distinct_legend_elements_for_cluster_and_payement(fig_rf)
fig_rf.show()