# Cluster stores on type, assortment and competition attributes.
#
# Pipeline:
#   1. Load store.csv and keep the columns used for clustering.
#   2. Box-Cox transform CompetitionDistance and drop incomplete rows.
#   3. Build a Gower dissimilarity and pick k via PAM silhouette widths.
#   4. Fit PAM with k = 6, summarise the clusters and show the medoids.
#   5. Embed the dissimilarity with t-SNE, plot it, and print the Store ids
#      that fall in the upper-right quadrant of the embedding.

# Packages ----
library(dplyr)
library(cluster)
library(Rtsne)
library(ggplot2)
library(gpairs)
library(corrplot)
library(gplots)
library(car)

# Seed for the stochastic steps below (Rtsne).
set.seed(1680)

# Data ----
# daisy() needs categorical columns to be factors; since R 4.0 read.csv()
# leaves strings as character, so request factors explicitly.
store <- read.csv("store.csv", stringsAsFactors = TRUE)
store <- store[, c(
  "Store", "StoreType", "Assortment",
  "CompetitionDistance", "CompetitionOpenSinceMonth"
)]
head(store)

# Box-Cox transformation ----
# Estimate the Box-Cox power for both numeric competition variables.
lambda_CompetitionDistance <- coef(powerTransform(store$CompetitionDistance))
lambda_CompetitionOpenSinceMonth <-
  coef(powerTransform(store$CompetitionOpenSinceMonth))
lambda_CompetitionDistance
lambda_CompetitionOpenSinceMonth

# Compare the distribution of CompetitionDistance before and after transform.
par(mfrow = c(2, 1))
hist(
  store$CompetitionDistance,
  xlab = "Original variable",
  main = "Histogram of original variable"
)
hist(
  bcPower(store$CompetitionDistance, lambda_CompetitionDistance),
  xlab = "Box-Cox Transform",
  ylab = "New Distribution",
  main = "Transformed Distribution"
)

# Only CompetitionDistance is replaced by its transformed values; the lambda
# for CompetitionOpenSinceMonth is reported above but not applied.
store$CompetitionDistance <-
  bcPower(store$CompetitionDistance, lambda_CompetitionDistance)

# Drop rows with any missing value. Row names keep their original values, so
# rows must be addressed by position (not by label) from here on.
store <- na.omit(store)

# Gower dissimilarity ----
# NOTE(review): the Store identifier column is part of the dissimilarity.
# Consider excluding it; k and perplexity below would then need re-tuning.
gower.dist <- daisy(store, metric = "gower")
summary(gower.dist)

# Choose the number of clusters ----
# Average silhouette width for k = 2..10; position 1 stays NA because a
# silhouette is undefined for a single cluster.
sil_width <- rep(NA_real_, 10)
for (k in 2:10) {
  pam_fit <- pam(gower.dist, diss = TRUE, k = k)
  sil_width[k] <- pam_fit$silinfo$avg.width
}
# Shows the last fit produced by the loop (k = 10).
pam_fit

par(mfrow = c(1, 1))
plot(
  1:10, sil_width,
  xlab = "Number of clusters",
  ylab = "Silhouette Width"
)
lines(1:10, sil_width)

# Final PAM fit ----
pam_fit <- pam(gower.dist, diss = TRUE, k = 6)

# Per-cluster summary() of every column.
pam_results <- store %>%
  mutate(cluster = pam_fit$clustering) %>%
  group_by(cluster) %>%
  do(the_summary = summary(.))
pam_results$the_summary

# Medoid rows. id.med holds positional indices into the dissimilarity, which
# line up with the rows of `store` regardless of its row names.
store[pam_fit$id.med, ]

df_clusters <- store
df_clusters[pam_fit$id.med, ]
df_clusters$cluster <- factor(pam_fit$clustering)

# t-SNE visualisation ----
tsne_obj <- Rtsne(gower.dist, is_distance = TRUE, perplexity = 28)

tsne_data <- tsne_obj$Y %>%
  data.frame() %>%
  setNames(c("X", "Y")) %>%
  mutate(
    cluster = factor(pam_fit$clustering),
    name = store$Store
  )

ggplot(aes(x = X, y = Y), data = tsne_data) +
  geom_point(aes(color = cluster))

# Stores in the upper-right quadrant of the embedding ----
data <- tsne_data %>%
  filter(X > 0, Y > 0)
head(data)
dim(data)

# Rename `name` to `Store` so the selection can be joined back to `store`.
colnames(data) <- c("X", "Y", "cluster", "Store")
print(
  data %>%
    left_join(store, by = "Store") %>%
    pull(Store)
)