import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn import preprocessing
pd.options.display.float_format = '{:.2f}'.format
from matplotlib import pyplot as plt
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as sch
from scipy.spatial import distance_matrix
from scipy.cluster import hierarchy
from sklearn import manifold
df_full = pd.read_csv('df_full.CSV', engine='python', encoding='utf-8', index_col=0)
df_full.head()
obj_constructionPhase | geo_bln | Generated_index | obj_purchasePrice | |
---|---|---|---|---|
0 | no_information | Bayern | 1 | 540000.00 |
1 | completed | Nordrhein_Westfalen | 2 | 408000.00 |
2 | no_information | Nordrhein_Westfalen | 3 | 184000.00 |
3 | completed | Nordrhein_Westfalen | 4 | 729000.00 |
4 | no_information | Saarland | 5 | 119000.00 |
grouped = df_full.groupby(['obj_constructionPhase','geo_bln'])['obj_purchasePrice'].mean()
grouped_count = df_full.groupby(['obj_constructionPhase','geo_bln'])['Generated_index'].count()
grouped = pd.merge(grouped, grouped_count, left_on=['obj_constructionPhase','geo_bln'], right_on=['obj_constructionPhase','geo_bln'])
grouped.rename( columns={'Generated_index':'Amount_offers'}, inplace=True )
grouped= pd.DataFrame(grouped)
grouped.reset_index(inplace=True)
grouped = grouped[['obj_constructionPhase','geo_bln','Amount_offers']]
grouped=grouped.pivot(index='geo_bln', columns='obj_constructionPhase', values='Amount_offers')
grouped.fillna(0, inplace=True)
grouped
obj_constructionPhase | completed | no_information | projected | under_construction |
---|---|---|---|---|
geo_bln | ||||
Baden_Württemberg | 5787 | 15117 | 5465 | 207 |
Bayern | 5407 | 16221 | 6163 | 623 |
Berlin | 1060 | 3499 | 537 | 64 |
Brandenburg | 2161 | 6317 | 3008 | 153 |
Bremen | 331 | 1277 | 88 | 29 |
Hamburg | 312 | 2043 | 369 | 46 |
Hessen | 3993 | 12279 | 3240 | 225 |
Mecklenburg_Vorpommern | 1124 | 3991 | 587 | 54 |
Niedersachsen | 5492 | 19352 | 3120 | 219 |
Nordrhein_Westfalen | 8525 | 28071 | 7469 | 340 |
Rheinland_Pfalz | 4322 | 11601 | 4636 | 136 |
Saarland | 729 | 2672 | 399 | 8 |
Sachsen | 1875 | 6678 | 3800 | 118 |
Sachsen_Anhalt | 1112 | 5358 | 1644 | 35 |
Schleswig_Holstein | 2302 | 7657 | 1032 | 128 |
Thüringen | 473 | 3247 | 843 | 17 |
#First chart - what is a clustermap
h=sns.clustermap(grouped, cmap="Blues", fmt="d", linewidth=.5, method="average", annot=True)
#Chart with standard scaler - Row
h=sns.clustermap(grouped, cmap="Blues", linewidth=.5, standard_scale=0, method="average", annot=True)
h=sns.clustermap(grouped, cmap="Blues", fmt="d", linewidth=.5, method="single", annot=True, col_cluster=False)
grouped = df_full.groupby(['geo_bln'])['obj_purchasePrice'].count()
grtouped = grouped.rename( columns={'obj_purchasePrice':'Amount_offers'}, inplace=True )
grouped = pd.DataFrame(grouped)
grouped.reset_index(inplace=True)
grouped = grouped.set_index('geo_bln')
grouped
0 | |
---|---|
geo_bln | |
Baden_Württemberg | 26576 |
Bayern | 28414 |
Berlin | 5160 |
Brandenburg | 11639 |
Bremen | 1725 |
Hamburg | 2770 |
Hessen | 19737 |
Mecklenburg_Vorpommern | 5756 |
Niedersachsen | 28183 |
Nordrhein_Westfalen | 44405 |
Rheinland_Pfalz | 20695 |
Saarland | 3808 |
Sachsen | 12471 |
Sachsen_Anhalt | 8149 |
Schleswig_Holstein | 11119 |
Thüringen | 4580 |
h=sns.clustermap(grouped, cmap="Blues", fmt="d", linewidth=.5, method="single", annot=True, col_cluster=False, figsize=(11,9))
h=sns.clustermap(grouped, cmap="Blues", fmt="d", linewidth=.5, method="complete", annot=True, col_cluster=False, figsize=(11,9))
h=sns.clustermap(grouped, cmap="Blues", fmt="d", linewidth=.5, method="average", annot=True, col_cluster=False, figsize=(11,9))
h=sns.clustermap(grouped, cmap="Blues", fmt="d", linewidth=.5, method="ward", annot=True, col_cluster=False, figsize=(11,9))
distance_matrix = pd.DataFrame(distance_matrix(grouped.values, grouped.values), index=grouped.index, columns=grouped.index)
distance_matrix.to_csv('distance_matrix.CSV',sep=',')
distance_matrix
geo_bln | Baden_Württemberg | Bayern | Berlin | Brandenburg | Bremen | Hamburg | Hessen | Mecklenburg_Vorpommern | Niedersachsen | Nordrhein_Westfalen | Rheinland_Pfalz | Saarland | Sachsen | Sachsen_Anhalt | Schleswig_Holstein | Thüringen |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
geo_bln | ||||||||||||||||
Baden_Württemberg | 0.00 | 1838.00 | 21416.00 | 14937.00 | 24851.00 | 23806.00 | 6839.00 | 20820.00 | 1607.00 | 17829.00 | 5881.00 | 22768.00 | 14105.00 | 18427.00 | 15457.00 | 21996.00 |
Bayern | 1838.00 | 0.00 | 23254.00 | 16775.00 | 26689.00 | 25644.00 | 8677.00 | 22658.00 | 231.00 | 15991.00 | 7719.00 | 24606.00 | 15943.00 | 20265.00 | 17295.00 | 23834.00 |
Berlin | 21416.00 | 23254.00 | 0.00 | 6479.00 | 3435.00 | 2390.00 | 14577.00 | 596.00 | 23023.00 | 39245.00 | 15535.00 | 1352.00 | 7311.00 | 2989.00 | 5959.00 | 580.00 |
Brandenburg | 14937.00 | 16775.00 | 6479.00 | 0.00 | 9914.00 | 8869.00 | 8098.00 | 5883.00 | 16544.00 | 32766.00 | 9056.00 | 7831.00 | 832.00 | 3490.00 | 520.00 | 7059.00 |
Bremen | 24851.00 | 26689.00 | 3435.00 | 9914.00 | 0.00 | 1045.00 | 18012.00 | 4031.00 | 26458.00 | 42680.00 | 18970.00 | 2083.00 | 10746.00 | 6424.00 | 9394.00 | 2855.00 |
Hamburg | 23806.00 | 25644.00 | 2390.00 | 8869.00 | 1045.00 | 0.00 | 16967.00 | 2986.00 | 25413.00 | 41635.00 | 17925.00 | 1038.00 | 9701.00 | 5379.00 | 8349.00 | 1810.00 |
Hessen | 6839.00 | 8677.00 | 14577.00 | 8098.00 | 18012.00 | 16967.00 | 0.00 | 13981.00 | 8446.00 | 24668.00 | 958.00 | 15929.00 | 7266.00 | 11588.00 | 8618.00 | 15157.00 |
Mecklenburg_Vorpommern | 20820.00 | 22658.00 | 596.00 | 5883.00 | 4031.00 | 2986.00 | 13981.00 | 0.00 | 22427.00 | 38649.00 | 14939.00 | 1948.00 | 6715.00 | 2393.00 | 5363.00 | 1176.00 |
Niedersachsen | 1607.00 | 231.00 | 23023.00 | 16544.00 | 26458.00 | 25413.00 | 8446.00 | 22427.00 | 0.00 | 16222.00 | 7488.00 | 24375.00 | 15712.00 | 20034.00 | 17064.00 | 23603.00 |
Nordrhein_Westfalen | 17829.00 | 15991.00 | 39245.00 | 32766.00 | 42680.00 | 41635.00 | 24668.00 | 38649.00 | 16222.00 | 0.00 | 23710.00 | 40597.00 | 31934.00 | 36256.00 | 33286.00 | 39825.00 |
Rheinland_Pfalz | 5881.00 | 7719.00 | 15535.00 | 9056.00 | 18970.00 | 17925.00 | 958.00 | 14939.00 | 7488.00 | 23710.00 | 0.00 | 16887.00 | 8224.00 | 12546.00 | 9576.00 | 16115.00 |
Saarland | 22768.00 | 24606.00 | 1352.00 | 7831.00 | 2083.00 | 1038.00 | 15929.00 | 1948.00 | 24375.00 | 40597.00 | 16887.00 | 0.00 | 8663.00 | 4341.00 | 7311.00 | 772.00 |
Sachsen | 14105.00 | 15943.00 | 7311.00 | 832.00 | 10746.00 | 9701.00 | 7266.00 | 6715.00 | 15712.00 | 31934.00 | 8224.00 | 8663.00 | 0.00 | 4322.00 | 1352.00 | 7891.00 |
Sachsen_Anhalt | 18427.00 | 20265.00 | 2989.00 | 3490.00 | 6424.00 | 5379.00 | 11588.00 | 2393.00 | 20034.00 | 36256.00 | 12546.00 | 4341.00 | 4322.00 | 0.00 | 2970.00 | 3569.00 |
Schleswig_Holstein | 15457.00 | 17295.00 | 5959.00 | 520.00 | 9394.00 | 8349.00 | 8618.00 | 5363.00 | 17064.00 | 33286.00 | 9576.00 | 7311.00 | 1352.00 | 2970.00 | 0.00 | 6539.00 |
Thüringen | 21996.00 | 23834.00 | 580.00 | 7059.00 | 2855.00 | 1810.00 | 15157.00 | 1176.00 | 23603.00 | 39825.00 | 16115.00 | 772.00 | 7891.00 | 3569.00 | 6539.00 | 0.00 |
data = pd.read_csv("distance_matrix.csv", index_col='geo_bln')
mds = manifold.MDS(n_components=2, dissimilarity="precomputed", n_init=50, max_iter=1000, random_state=1)
results = mds.fit(data.values)
Geo_bln = data.columns
coords = results.embedding_
fig = plt.figure(figsize=(12,10))
plt.subplots_adjust(bottom = 0.1)
plt.scatter(coords[:, 0], coords[:, 1])
for label, x, y in zip(Geo_bln, coords[:, 0], coords[:, 1]):
plt.annotate(
label,
xy = (x, y),
xytext = (-20, 20),
textcoords = 'offset points'
)
plt.show()
# Define labels for the dataset header
matriz_headers = ['completed','no_information','projected','under_construction']
# Create frame-based difference matrix
matriz_diferencias = [
[],
[11.35],
[0.61, 11.96],
[4.65, 16, 4.04],
]
matriz_headers, matriz_diferencias
(['completed', 'no_information', 'projected', 'under_construction'], [[], [11.35], [0.61, 11.96], [4.65, 16, 4.04]])
# Functions
# Returns the coordinates (column, row) containing the lowest value of the matrix
def posicion_con_valor_mas_bajo(matriz):
# Cell value with minimum value
valor_minimo = float('inf')
posicion_x = -1
posicion_y = -1
# Traverse the entire matrix to find the smallest value
for i in range(len(matriz)):
for j in range(len(matriz[i])):
if(matriz[i][j] < valor_minimo):
valor_minimo = matriz[i][j]
posicion_x = i
posicion_y = j
return posicion_x, posicion_y
# Debug
# x, y = posicion_con_valor_mas_bajo(matriz_diferencias)
# print( 'Posicion:', x, ',', y, '-> Valor', matriz_diferencias[x][y])
# Combine two labels into one and erase the position with less value
# params: List of labels, positions
def juntar_headers(headers, a, b):
if b < a:
a, b = b, a
# Join labels in the first position
headers[a] = "(" + headers[a] + "," + headers[b] + ")"
# Delete label from second position
del headers[b]
# Debug
# juntar_headers(matriz_headers, 1, 2)
# matriz_headers
# Join two columns of the matrix and save the result (average) in the first column
# Params: matrix of differences, position of the columns
def juntar_matriz(matriz, a, b):
if b < a:
a, b = b, a
# Recalculate row for smaller index
fila = []
for i in range(0, a):
fila.append((matriz[a][i] + matriz[b][i])/2)
matriz[a] = fila
# Recalculate entire column
for i in range(a+1, b):
matriz[i][a] = (matriz[i][a] + matriz[b][i])/2
# get the rest of the values in row i
for i in range(b+1, len(matriz)):
matriz[i][a] = (matriz[i][a] + matriz[i][b])/2
# Delete the second column
del matriz[i][b]
# Delete the second row
del matriz[b]
# UPGMA Core
def upgma(matriz, headers):
# Stop when all labels are joined
print('Matrix of Initial Differences: ', matriz)
iteracion = 0
while len(headers) > 1:
print('\nIteration: ', iteracion)
print('#Clusters: ', len(matriz))
# Locate the position in the matrix containing the lowest value
x, y = posicion_con_valor_mas_bajo(matriz)
print('Position with lower value:', x, ',', y, '-> Valor = ', matriz[x][y])
# Actualizar los headers
juntar_headers(headers, x, y)
print('Labels updated:', headers)
# Junta dos columnas de la matriz de diferencias en las coordenadas de la posicion
juntar_matriz(matriz, x, y)
print('Array values joined:', matriz)
iteracion += 1
# Final result stored in the first position
return headers[0]
#Source: https://github.com/nextco/clustering-algorithms/blob/master/upgma.ipynb
upgma(matriz_diferencias, matriz_headers)
Matrix of Initial Differences: [[], [11.35], [0.61, 11.96], [4.65, 16, 4.04]] Iteration: 0 #Clusters: 4 Position with lower value: 2 , 0 -> Valor = 0.61 Labels updated: ['(completed,projected)', 'no_information', 'under_construction'] Array values joined: [[], [11.655000000000001], [4.345000000000001, 16]] Iteration: 1 #Clusters: 3 Position with lower value: 2 , 0 -> Valor = 4.345000000000001 Labels updated: ['((completed,projected),under_construction)', 'no_information'] Array values joined: [[], [13.8275]] Iteration: 2 #Clusters: 2 Position with lower value: 1 , 0 -> Valor = 13.8275 Labels updated: ['(((completed,projected),under_construction),no_information)'] Array values joined: [[]]
'(((completed,projected),under_construction),no_information)'