In [34]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn import preprocessing
pd.options.display.float_format = '{:.2f}'.format           
from matplotlib import pyplot as plt
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as sch
from scipy.spatial import distance_matrix
from scipy.cluster import hierarchy
from sklearn import manifold

Data Load

In [35]:
df_full = pd.read_csv('df_full.CSV', engine='python', encoding='utf-8', index_col=0)
In [36]:
df_full.head()
Out[36]:
obj_constructionPhase geo_bln Generated_index obj_purchasePrice
0 no_information Bayern 1 540000.00
1 completed Nordrhein_Westfalen 2 408000.00
2 no_information Nordrhein_Westfalen 3 184000.00
3 completed Nordrhein_Westfalen 4 729000.00
4 no_information Saarland 5 119000.00

Example - Table

In [37]:
grouped = df_full.groupby(['obj_constructionPhase','geo_bln'])['obj_purchasePrice'].mean()
grouped_count = df_full.groupby(['obj_constructionPhase','geo_bln'])['Generated_index'].count()
grouped = pd.merge(grouped, grouped_count, left_on=['obj_constructionPhase','geo_bln'], right_on=['obj_constructionPhase','geo_bln'])
grouped.rename( columns={'Generated_index':'Amount_offers'}, inplace=True )
grouped= pd.DataFrame(grouped) 
grouped.reset_index(inplace=True)
grouped = grouped[['obj_constructionPhase','geo_bln','Amount_offers']]
grouped=grouped.pivot(index='geo_bln', columns='obj_constructionPhase', values='Amount_offers')
grouped.fillna(0, inplace=True)
grouped
Out[37]:
obj_constructionPhase completed no_information projected under_construction
geo_bln
Baden_Württemberg 5787 15117 5465 207
Bayern 5407 16221 6163 623
Berlin 1060 3499 537 64
Brandenburg 2161 6317 3008 153
Bremen 331 1277 88 29
Hamburg 312 2043 369 46
Hessen 3993 12279 3240 225
Mecklenburg_Vorpommern 1124 3991 587 54
Niedersachsen 5492 19352 3120 219
Nordrhein_Westfalen 8525 28071 7469 340
Rheinland_Pfalz 4322 11601 4636 136
Saarland 729 2672 399 8
Sachsen 1875 6678 3800 118
Sachsen_Anhalt 1112 5358 1644 35
Schleswig_Holstein 2302 7657 1032 128
Thüringen 473 3247 843 17
In [38]:
#First chart - what is a clustermap
h=sns.clustermap(grouped, cmap="Blues", fmt="d", linewidth=.5, method="average", annot=True)
In [39]:
#Chart with standard scaler - Row
h=sns.clustermap(grouped, cmap="Blues", linewidth=.5, standard_scale=0,  method="average", annot=True)
In [40]:
h=sns.clustermap(grouped, cmap="Blues", fmt="d", linewidth=.5, method="single", annot=True, col_cluster=False)
In [41]:
grouped = df_full.groupby(['geo_bln'])['obj_purchasePrice'].count()
grtouped = grouped.rename( columns={'obj_purchasePrice':'Amount_offers'}, inplace=True )
grouped = pd.DataFrame(grouped) 
grouped.reset_index(inplace=True)
grouped = grouped.set_index('geo_bln')
grouped
Out[41]:
0
geo_bln
Baden_Württemberg 26576
Bayern 28414
Berlin 5160
Brandenburg 11639
Bremen 1725
Hamburg 2770
Hessen 19737
Mecklenburg_Vorpommern 5756
Niedersachsen 28183
Nordrhein_Westfalen 44405
Rheinland_Pfalz 20695
Saarland 3808
Sachsen 12471
Sachsen_Anhalt 8149
Schleswig_Holstein 11119
Thüringen 4580
In [42]:
h=sns.clustermap(grouped, cmap="Blues", fmt="d", linewidth=.5, method="single", annot=True, col_cluster=False, figsize=(11,9))
In [43]:
h=sns.clustermap(grouped, cmap="Blues", fmt="d", linewidth=.5, method="complete", annot=True, col_cluster=False, figsize=(11,9))
In [44]:
h=sns.clustermap(grouped, cmap="Blues", fmt="d", linewidth=.5, method="average", annot=True, col_cluster=False, figsize=(11,9))
In [45]:
h=sns.clustermap(grouped, cmap="Blues", fmt="d", linewidth=.5, method="ward", annot=True, col_cluster=False, figsize=(11,9))
In [46]:
distance_matrix = pd.DataFrame(distance_matrix(grouped.values, grouped.values), index=grouped.index, columns=grouped.index)
distance_matrix.to_csv('distance_matrix.CSV',sep=',')
distance_matrix
Out[46]:
geo_bln Baden_Württemberg Bayern Berlin Brandenburg Bremen Hamburg Hessen Mecklenburg_Vorpommern Niedersachsen Nordrhein_Westfalen Rheinland_Pfalz Saarland Sachsen Sachsen_Anhalt Schleswig_Holstein Thüringen
geo_bln
Baden_Württemberg 0.00 1838.00 21416.00 14937.00 24851.00 23806.00 6839.00 20820.00 1607.00 17829.00 5881.00 22768.00 14105.00 18427.00 15457.00 21996.00
Bayern 1838.00 0.00 23254.00 16775.00 26689.00 25644.00 8677.00 22658.00 231.00 15991.00 7719.00 24606.00 15943.00 20265.00 17295.00 23834.00
Berlin 21416.00 23254.00 0.00 6479.00 3435.00 2390.00 14577.00 596.00 23023.00 39245.00 15535.00 1352.00 7311.00 2989.00 5959.00 580.00
Brandenburg 14937.00 16775.00 6479.00 0.00 9914.00 8869.00 8098.00 5883.00 16544.00 32766.00 9056.00 7831.00 832.00 3490.00 520.00 7059.00
Bremen 24851.00 26689.00 3435.00 9914.00 0.00 1045.00 18012.00 4031.00 26458.00 42680.00 18970.00 2083.00 10746.00 6424.00 9394.00 2855.00
Hamburg 23806.00 25644.00 2390.00 8869.00 1045.00 0.00 16967.00 2986.00 25413.00 41635.00 17925.00 1038.00 9701.00 5379.00 8349.00 1810.00
Hessen 6839.00 8677.00 14577.00 8098.00 18012.00 16967.00 0.00 13981.00 8446.00 24668.00 958.00 15929.00 7266.00 11588.00 8618.00 15157.00
Mecklenburg_Vorpommern 20820.00 22658.00 596.00 5883.00 4031.00 2986.00 13981.00 0.00 22427.00 38649.00 14939.00 1948.00 6715.00 2393.00 5363.00 1176.00
Niedersachsen 1607.00 231.00 23023.00 16544.00 26458.00 25413.00 8446.00 22427.00 0.00 16222.00 7488.00 24375.00 15712.00 20034.00 17064.00 23603.00
Nordrhein_Westfalen 17829.00 15991.00 39245.00 32766.00 42680.00 41635.00 24668.00 38649.00 16222.00 0.00 23710.00 40597.00 31934.00 36256.00 33286.00 39825.00
Rheinland_Pfalz 5881.00 7719.00 15535.00 9056.00 18970.00 17925.00 958.00 14939.00 7488.00 23710.00 0.00 16887.00 8224.00 12546.00 9576.00 16115.00
Saarland 22768.00 24606.00 1352.00 7831.00 2083.00 1038.00 15929.00 1948.00 24375.00 40597.00 16887.00 0.00 8663.00 4341.00 7311.00 772.00
Sachsen 14105.00 15943.00 7311.00 832.00 10746.00 9701.00 7266.00 6715.00 15712.00 31934.00 8224.00 8663.00 0.00 4322.00 1352.00 7891.00
Sachsen_Anhalt 18427.00 20265.00 2989.00 3490.00 6424.00 5379.00 11588.00 2393.00 20034.00 36256.00 12546.00 4341.00 4322.00 0.00 2970.00 3569.00
Schleswig_Holstein 15457.00 17295.00 5959.00 520.00 9394.00 8349.00 8618.00 5363.00 17064.00 33286.00 9576.00 7311.00 1352.00 2970.00 0.00 6539.00
Thüringen 21996.00 23834.00 580.00 7059.00 2855.00 1810.00 15157.00 1176.00 23603.00 39825.00 16115.00 772.00 7891.00 3569.00 6539.00 0.00
In [47]:
data = pd.read_csv("distance_matrix.csv", index_col='geo_bln')

mds = manifold.MDS(n_components=2, dissimilarity="precomputed", n_init=50, max_iter=1000, random_state=1)
results = mds.fit(data.values)

Geo_bln = data.columns
coords = results.embedding_

fig = plt.figure(figsize=(12,10))

plt.subplots_adjust(bottom = 0.1)
plt.scatter(coords[:, 0], coords[:, 1])

for label, x, y in zip(Geo_bln, coords[:, 0], coords[:, 1]):
    plt.annotate(
        label,
        xy = (x, y), 
        xytext = (-20, 20),
        textcoords = 'offset points'
    )
plt.show()

Clusters - matrix data

In [48]:
# Define labels for the dataset header
matriz_headers = ['completed','no_information','projected','under_construction']

# Create frame-based difference matrix
matriz_diferencias = [
    [],                         
    [11.35],                       
    [0.61, 11.96],                   
    [4.65, 16, 4.04],                   
    ]
matriz_headers, matriz_diferencias
Out[48]:
(['completed', 'no_information', 'projected', 'under_construction'],
 [[], [11.35], [0.61, 11.96], [4.65, 16, 4.04]])
In [49]:
# Functions
# Returns the coordinates (column, row) containing the lowest value of the matrix
def posicion_con_valor_mas_bajo(matriz):
    # Cell value with minimum value
    valor_minimo = float('inf')
    posicion_x = -1
    posicion_y = -1
    
    # Traverse the entire matrix to find the smallest value
    for i in range(len(matriz)):
        for j in range(len(matriz[i])):
            if(matriz[i][j] < valor_minimo):
                valor_minimo = matriz[i][j]
                posicion_x = i
                posicion_y = j
                
    return posicion_x, posicion_y

# Debug
# x, y = posicion_con_valor_mas_bajo(matriz_diferencias)
# print( 'Posicion:', x, ',', y, '-> Valor',  matriz_diferencias[x][y])
In [50]:
# Combine two labels into one and erase the position with less value
# params: List of labels, positions
def juntar_headers(headers, a, b):
    if b < a:
        a, b = b, a
        
    # Join labels in the first position
    headers[a] = "(" + headers[a] + "," + headers[b] + ")"

    # Delete label from second position
    del headers[b]

# Debug
# juntar_headers(matriz_headers, 1, 2)
# matriz_headers
In [51]:
# Join two columns of the matrix and save the result (average) in the first column
# Params: matrix of differences, position of the columns
def juntar_matriz(matriz, a, b):
    if b < a:
        a, b = b, a

    # Recalculate row for smaller index
    fila = []
    for i in range(0, a):
        fila.append((matriz[a][i] + matriz[b][i])/2)
    matriz[a] = fila
    
    # Recalculate entire column
    for i in range(a+1, b):
        matriz[i][a] = (matriz[i][a] + matriz[b][i])/2
        
    # get the rest of the values in row i
    for i in range(b+1, len(matriz)):
        matriz[i][a] = (matriz[i][a] + matriz[i][b])/2
        # Delete the second column
        del matriz[i][b]

    # Delete the second row
    del matriz[b]
In [52]:
# UPGMA Core
def upgma(matriz, headers):
    # Stop when all labels are joined
    print('Matrix of Initial Differences: ', matriz)
    
    iteracion = 0
    while len(headers) > 1:        
        print('\nIteration: ', iteracion)
        print('#Clusters: ', len(matriz))
                
        # Locate the position in the matrix containing the lowest value
        x, y = posicion_con_valor_mas_bajo(matriz)
        print('Position with lower value:', x, ',', y, '-> Valor = ',  matriz[x][y])
        
        # Actualizar los headers
        juntar_headers(headers, x, y)        
        print('Labels updated:', headers)
        
        # Junta dos columnas de la matriz de diferencias en las coordenadas de la posicion
        juntar_matriz(matriz, x, y)
        print('Array values joined:', matriz)
        iteracion += 1

    # Final result stored in the first position
    return headers[0]
#Source: https://github.com/nextco/clustering-algorithms/blob/master/upgma.ipynb
In [53]:
upgma(matriz_diferencias, matriz_headers)
Matrix of Initial Differences:  [[], [11.35], [0.61, 11.96], [4.65, 16, 4.04]]

Iteration:  0
#Clusters:  4
Position with lower value: 2 , 0 -> Valor =  0.61
Labels updated: ['(completed,projected)', 'no_information', 'under_construction']
Array values joined: [[], [11.655000000000001], [4.345000000000001, 16]]

Iteration:  1
#Clusters:  3
Position with lower value: 2 , 0 -> Valor =  4.345000000000001
Labels updated: ['((completed,projected),under_construction)', 'no_information']
Array values joined: [[], [13.8275]]

Iteration:  2
#Clusters:  2
Position with lower value: 1 , 0 -> Valor =  13.8275
Labels updated: ['(((completed,projected),under_construction),no_information)']
Array values joined: [[]]
Out[53]:
'(((completed,projected),under_construction),no_information)'
In [ ]: