In [34]:

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn import preprocessing
pd.options.display.float_format = '{:.2f}'.format           
from matplotlib import pyplot as plt
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as sch
from scipy.spatial import distance_matrix
from scipy.cluster import hierarchy
from sklearn import manifold

Data Load¶

In [35]:

df_full = pd.read_csv('df_full.CSV', engine='python', encoding='utf-8', index_col=0)

In [36]:

df_full.head()

Out[36]:

	obj_constructionPhase	geo_bln	Generated_index	obj_purchasePrice
0	no_information	Bayern	1	540000.00
1	completed	Nordrhein_Westfalen	2	408000.00
2	no_information	Nordrhein_Westfalen	3	184000.00
3	completed	Nordrhein_Westfalen	4	729000.00
4	no_information	Saarland	5	119000.00

Example - Table¶

In [37]:

grouped = df_full.groupby(['obj_constructionPhase','geo_bln'])['obj_purchasePrice'].mean()
grouped_count = df_full.groupby(['obj_constructionPhase','geo_bln'])['Generated_index'].count()
grouped = pd.merge(grouped, grouped_count, left_on=['obj_constructionPhase','geo_bln'], right_on=['obj_constructionPhase','geo_bln'])
grouped.rename( columns={'Generated_index':'Amount_offers'}, inplace=True )
grouped= pd.DataFrame(grouped) 
grouped.reset_index(inplace=True)
grouped = grouped[['obj_constructionPhase','geo_bln','Amount_offers']]
grouped=grouped.pivot(index='geo_bln', columns='obj_constructionPhase', values='Amount_offers')
grouped.fillna(0, inplace=True)
grouped

Out[37]:

obj_constructionPhase	completed	no_information	projected	under_construction
geo_bln
Baden_Württemberg	5787	15117	5465	207
Bayern	5407	16221	6163	623
Berlin	1060	3499	537	64
Brandenburg	2161	6317	3008	153
Bremen	331	1277	88	29
Hamburg	312	2043	369	46
Hessen	3993	12279	3240	225
Mecklenburg_Vorpommern	1124	3991	587	54
Niedersachsen	5492	19352	3120	219
Nordrhein_Westfalen	8525	28071	7469	340
Rheinland_Pfalz	4322	11601	4636	136
Saarland	729	2672	399	8
Sachsen	1875	6678	3800	118
Sachsen_Anhalt	1112	5358	1644	35
Schleswig_Holstein	2302	7657	1032	128
Thüringen	473	3247	843	17

In [38]:

#First chart - what is a clustermap
h=sns.clustermap(grouped, cmap="Blues", fmt="d", linewidth=.5, method="average", annot=True)

In [39]:

#Chart with standard scaler - Row
h=sns.clustermap(grouped, cmap="Blues", linewidth=.5, standard_scale=0,  method="average", annot=True)

In [40]:

h=sns.clustermap(grouped, cmap="Blues", fmt="d", linewidth=.5, method="single", annot=True, col_cluster=False)

In [41]:

grouped = df_full.groupby(['geo_bln'])['obj_purchasePrice'].count()
grtouped = grouped.rename( columns={'obj_purchasePrice':'Amount_offers'}, inplace=True )
grouped = pd.DataFrame(grouped) 
grouped.reset_index(inplace=True)
grouped = grouped.set_index('geo_bln')
grouped

Out[41]:

	0
geo_bln
Baden_Württemberg	26576
Bayern	28414
Berlin	5160
Brandenburg	11639
Bremen	1725
Hamburg	2770
Hessen	19737
Mecklenburg_Vorpommern	5756
Niedersachsen	28183
Nordrhein_Westfalen	44405
Rheinland_Pfalz	20695
Saarland	3808
Sachsen	12471
Sachsen_Anhalt	8149
Schleswig_Holstein	11119
Thüringen	4580

In [42]:

h=sns.clustermap(grouped, cmap="Blues", fmt="d", linewidth=.5, method="single", annot=True, col_cluster=False, figsize=(11,9))

In [43]:

h=sns.clustermap(grouped, cmap="Blues", fmt="d", linewidth=.5, method="complete", annot=True, col_cluster=False, figsize=(11,9))

In [44]:

h=sns.clustermap(grouped, cmap="Blues", fmt="d", linewidth=.5, method="average", annot=True, col_cluster=False, figsize=(11,9))

In [45]:

h=sns.clustermap(grouped, cmap="Blues", fmt="d", linewidth=.5, method="ward", annot=True, col_cluster=False, figsize=(11,9))

In [46]:

distance_matrix = pd.DataFrame(distance_matrix(grouped.values, grouped.values), index=grouped.index, columns=grouped.index)
distance_matrix.to_csv('distance_matrix.CSV',sep=',')
distance_matrix

Out[46]:

geo_bln	Baden_Württemberg	Bayern	Berlin	Brandenburg	Bremen	Hamburg	Hessen	Mecklenburg_Vorpommern	Niedersachsen	Nordrhein_Westfalen	Rheinland_Pfalz	Saarland	Sachsen	Sachsen_Anhalt	Schleswig_Holstein	Thüringen
geo_bln
Baden_Württemberg	0.00	1838.00	21416.00	14937.00	24851.00	23806.00	6839.00	20820.00	1607.00	17829.00	5881.00	22768.00	14105.00	18427.00	15457.00	21996.00
Bayern	1838.00	0.00	23254.00	16775.00	26689.00	25644.00	8677.00	22658.00	231.00	15991.00	7719.00	24606.00	15943.00	20265.00	17295.00	23834.00
Berlin	21416.00	23254.00	0.00	6479.00	3435.00	2390.00	14577.00	596.00	23023.00	39245.00	15535.00	1352.00	7311.00	2989.00	5959.00	580.00
Brandenburg	14937.00	16775.00	6479.00	0.00	9914.00	8869.00	8098.00	5883.00	16544.00	32766.00	9056.00	7831.00	832.00	3490.00	520.00	7059.00
Bremen	24851.00	26689.00	3435.00	9914.00	0.00	1045.00	18012.00	4031.00	26458.00	42680.00	18970.00	2083.00	10746.00	6424.00	9394.00	2855.00
Hamburg	23806.00	25644.00	2390.00	8869.00	1045.00	0.00	16967.00	2986.00	25413.00	41635.00	17925.00	1038.00	9701.00	5379.00	8349.00	1810.00
Hessen	6839.00	8677.00	14577.00	8098.00	18012.00	16967.00	0.00	13981.00	8446.00	24668.00	958.00	15929.00	7266.00	11588.00	8618.00	15157.00
Mecklenburg_Vorpommern	20820.00	22658.00	596.00	5883.00	4031.00	2986.00	13981.00	0.00	22427.00	38649.00	14939.00	1948.00	6715.00	2393.00	5363.00	1176.00
Niedersachsen	1607.00	231.00	23023.00	16544.00	26458.00	25413.00	8446.00	22427.00	0.00	16222.00	7488.00	24375.00	15712.00	20034.00	17064.00	23603.00
Nordrhein_Westfalen	17829.00	15991.00	39245.00	32766.00	42680.00	41635.00	24668.00	38649.00	16222.00	0.00	23710.00	40597.00	31934.00	36256.00	33286.00	39825.00
Rheinland_Pfalz	5881.00	7719.00	15535.00	9056.00	18970.00	17925.00	958.00	14939.00	7488.00	23710.00	0.00	16887.00	8224.00	12546.00	9576.00	16115.00
Saarland	22768.00	24606.00	1352.00	7831.00	2083.00	1038.00	15929.00	1948.00	24375.00	40597.00	16887.00	0.00	8663.00	4341.00	7311.00	772.00
Sachsen	14105.00	15943.00	7311.00	832.00	10746.00	9701.00	7266.00	6715.00	15712.00	31934.00	8224.00	8663.00	0.00	4322.00	1352.00	7891.00
Sachsen_Anhalt	18427.00	20265.00	2989.00	3490.00	6424.00	5379.00	11588.00	2393.00	20034.00	36256.00	12546.00	4341.00	4322.00	0.00	2970.00	3569.00
Schleswig_Holstein	15457.00	17295.00	5959.00	520.00	9394.00	8349.00	8618.00	5363.00	17064.00	33286.00	9576.00	7311.00	1352.00	2970.00	0.00	6539.00
Thüringen	21996.00	23834.00	580.00	7059.00	2855.00	1810.00	15157.00	1176.00	23603.00	39825.00	16115.00	772.00	7891.00	3569.00	6539.00	0.00

In [47]:

data = pd.read_csv("distance_matrix.csv", index_col='geo_bln')

mds = manifold.MDS(n_components=2, dissimilarity="precomputed", n_init=50, max_iter=1000, random_state=1)
results = mds.fit(data.values)

Geo_bln = data.columns
coords = results.embedding_

fig = plt.figure(figsize=(12,10))

plt.subplots_adjust(bottom = 0.1)
plt.scatter(coords[:, 0], coords[:, 1])

for label, x, y in zip(Geo_bln, coords[:, 0], coords[:, 1]):
    plt.annotate(
        label,
        xy = (x, y), 
        xytext = (-20, 20),
        textcoords = 'offset points'
    )
plt.show()

Clusters - matrix data¶

In [48]:

# Define labels for the dataset header
matriz_headers = ['completed','no_information','projected','under_construction']

# Create frame-based difference matrix
matriz_diferencias = [
    [],                         
    [11.35],                       
    [0.61, 11.96],                   
    [4.65, 16, 4.04],                   
    ]
matriz_headers, matriz_diferencias

Out[48]:

(['completed', 'no_information', 'projected', 'under_construction'],
 [[], [11.35], [0.61, 11.96], [4.65, 16, 4.04]])

In [49]:

# Functions
# Returns the coordinates (column, row) containing the lowest value of the matrix
def posicion_con_valor_mas_bajo(matriz):
    # Cell value with minimum value
    valor_minimo = float('inf')
    posicion_x = -1
    posicion_y = -1
    
    # Traverse the entire matrix to find the smallest value
    for i in range(len(matriz)):
        for j in range(len(matriz[i])):
            if(matriz[i][j] < valor_minimo):
                valor_minimo = matriz[i][j]
                posicion_x = i
                posicion_y = j
                
    return posicion_x, posicion_y

# Debug
# x, y = posicion_con_valor_mas_bajo(matriz_diferencias)
# print( 'Posicion:', x, ',', y, '-> Valor',  matriz_diferencias[x][y])

In [50]:

# Combine two labels into one and erase the position with less value
# params: List of labels, positions
def juntar_headers(headers, a, b):
    if b < a:
        a, b = b, a
        
    # Join labels in the first position
    headers[a] = "(" + headers[a] + "," + headers[b] + ")"

    # Delete label from second position
    del headers[b]

# Debug
# juntar_headers(matriz_headers, 1, 2)
# matriz_headers

In [51]:

# Join two columns of the matrix and save the result (average) in the first column
# Params: matrix of differences, position of the columns
def juntar_matriz(matriz, a, b):
    if b < a:
        a, b = b, a

    # Recalculate row for smaller index
    fila = []
    for i in range(0, a):
        fila.append((matriz[a][i] + matriz[b][i])/2)
    matriz[a] = fila
    
    # Recalculate entire column
    for i in range(a+1, b):
        matriz[i][a] = (matriz[i][a] + matriz[b][i])/2
        
    # get the rest of the values in row i
    for i in range(b+1, len(matriz)):
        matriz[i][a] = (matriz[i][a] + matriz[i][b])/2
        # Delete the second column
        del matriz[i][b]

    # Delete the second row
    del matriz[b]

In [52]:

# UPGMA Core
def upgma(matriz, headers):
    # Stop when all labels are joined
    print('Matrix of Initial Differences: ', matriz)
    
    iteracion = 0
    while len(headers) > 1:        
        print('\nIteration: ', iteracion)
        print('#Clusters: ', len(matriz))
                
        # Locate the position in the matrix containing the lowest value
        x, y = posicion_con_valor_mas_bajo(matriz)
        print('Position with lower value:', x, ',', y, '-> Valor = ',  matriz[x][y])
        
        # Actualizar los headers
        juntar_headers(headers, x, y)        
        print('Labels updated:', headers)
        
        # Junta dos columnas de la matriz de diferencias en las coordenadas de la posicion
        juntar_matriz(matriz, x, y)
        print('Array values joined:', matriz)
        iteracion += 1

    # Final result stored in the first position
    return headers[0]
#Source: https://github.com/nextco/clustering-algorithms/blob/master/upgma.ipynb

In [53]:

upgma(matriz_diferencias, matriz_headers)

Matrix of Initial Differences:  [[], [11.35], [0.61, 11.96], [4.65, 16, 4.04]]

Iteration:  0
#Clusters:  4
Position with lower value: 2 , 0 -> Valor =  0.61
Labels updated: ['(completed,projected)', 'no_information', 'under_construction']
Array values joined: [[], [11.655000000000001], [4.345000000000001, 16]]

Iteration:  1
#Clusters:  3
Position with lower value: 2 , 0 -> Valor =  4.345000000000001
Labels updated: ['((completed,projected),under_construction)', 'no_information']
Array values joined: [[], [13.8275]]

Iteration:  2
#Clusters:  2
Position with lower value: 1 , 0 -> Valor =  13.8275
Labels updated: ['(((completed,projected),under_construction),no_information)']
Array values joined: [[]]

Out[53]:

'(((completed,projected),under_construction),no_information)'

In [ ]: