#!/usr/bin/env python
# coding: utf-8
# # AutoPandas
#
#
#
#
# Process, visualize and use data easily.
# ### Table of contents
#
# * [1. Read data](#section1)
# * [2. Processing](#section2)
# * [3. Visualization](#section3)
# * [4. Benchmark](#section4)
# * [5. Comparison](#section5)
# * [6. Generation](#section6)
# In[1]:
import autopandas as apd
# IPython autoreload magics: re-import autopandas automatically when its source
# changes. `get_ipython` only exists inside IPython/Jupyter, so guard it to keep
# this exported script runnable as plain Python as well.
try:
    get_ipython().run_line_magic('reload_ext', 'autoreload')
    get_ipython().run_line_magic('autoreload', '2')
except NameError:
    pass  # not running under IPython; the magics are a no-op concern here
#
# # 1. Read data
# #### Easily load CSV, AutoML or pd.DataFrame.
#
# apd.AutoData is a subclass of pd.DataFrame representing 2D data frames:
# * Examples are in rows
# * Features are in columns
#
# You can load a dataset in CSV format by calling `read_csv` method. It should automatically detect the separator.
# In[3]:
input_file = 'autopandas/data/wine.csv' # wine, adult, diabetes, iris, mushrooms, seeds, squares1-2, titanic, boston
data = apd.read_csv(input_file)
# You can also load some toy datasets from `datasets` module
# In[2]:
# NOTE: this rebinds `data`, discarding the CSV frame loaded above — the rest
# of the notebook operates on the wine toy dataset.
data = apd.datasets.load_wine()
# In[5]:
data.head()
# **Keys:**
# * **Row**: train, valid, test, header
# * **Column**: X, y, categorical, numerical
# In[7]:
# show index entry for 'numerical'
data.indexes['numerical']
# try also: categorical, X, y, train, test, X_categorical, y_test, etc.
# #### Get a subset of data
# In[8]:
# calling the object itself is equivalent to calling "get_data" method
data('header')
#data('X_header') # five first rows of X (examples without the class target)
#data('categorical_header') # five first rows of categorical variables
#data('numerical_header') # five first rows of numerical variables
#data('y_test') # test set of target
#data.get_data('y') # the target variable
# #### Set the target variable (class) if needed
#
# You can set a continuous or categorical variable. AutoPandas will detect if it is a **classification or regression** problem.
#
# You can also set a **list of targets**.
# In[9]:
data.set_class('quality')
#data.set_class(['age', 'fnlwgt']) #['income', 'gender'] # multiclass
# #### Split data in train and test sets
#
# When loading the data, train/test split is done.
#
# You can re-do the train_test split with different parameters if needed.
# In[7]:
# data.train_test_split(test_size=0.3) # already done automatically
# _Remark: you can also define train/test sets by hand by calling the index_
#
# data.indexes['train'] = [0, 1, 4, 5, 6] # indexes of train rows
# ### Descriptors
# In[10]:
data.descriptors() # AutoPandas descriptors
# data.describe() # Pandas describe method
#
# # 2. Processing
#
#
# Parameters: **method** and **key** (targeted set).
# **Missing values imputation**
#
# Different methods available: mean, median, remove, most.
#
# Imputation with a predictive model is currently being tested.
# In[3]:
data = data.imputation() # mean, median, remove, most
# **Encoding**
#
# Various encodings are available: label, one_hot, likelihood, target, count and more.
# In[4]:
data = data.encoding('label', 'categorical') # encode categorical variables
#data = data.encoding('one_hot', 'categorical',) # one_hot, likelihood, count, target, etc.
#data.encoding('drop', 'numerical') # simply drop numerical columns
# **Normalization**
#
# Min-max or standard normalizations.
# In[5]:
data = data.normalization('standard', 'numerical') # min-max, standard
#data2 = data.normalization('min-max', 'numerical')
# In[17]:
data.min(axis=0)
# #### Synthetic Data Vault
#
# Apply encoding and decoding processing method from the paper "Synthetic Data Vault" by Neha Patki, Roy Wedge and Kalyan Veeramachaneni.
# In[14]:
# encode
sdv_data, limits, min_max = apd.sdv.encode(data)
# do something (placeholder: identity transform here, so the round-trip below
# should reconstruct the original frame)
new_data = sdv_data
# decode
data = apd.sdv.decode(new_data, data, limits, min_max)
# **Dimensionality reduction**
#
# Dimensionality reduction methods: PCA, LDA, T-SNE, random projection, feature_hashing.
# In[15]:
#data = data.reduction()
data.reduction(method='hashing')('header') # pca, lda, tsne, feature_hashing
#
# # 3. Visualization
# #### Class distribution
# In[19]:
data.plot('y') # alias data('y').plot()
# #### 2D PCA plot
# In[20]:
# for class coloration: c=data.get_data('y')
# class is used for coloration by default but the dimensionality reduction erased the class column
data.pca(n_components=2).plot(c=data.get_data('y'))
# #### T-SNE
# In[21]:
data.tsne().plot(c=data.get_data('y'))
# #### Linear Discriminant Analysis
# In[37]:
# LDA is supervised, so it is fitted on the train split and colored by its labels
data('train').lda(n_components=2).plot(c=data('y_train'))
# #### Heatmap
# In[38]:
data.plot() # alias data.heatmap()
# #### Correlation matrix plot
# In[39]:
data.corr().plot() # alias data.correlation()
# #### Features pairplot
# In[40]:
data[data.columns[:3]].pairplot() # max_features is set to 12 by default
# #### Features boxplot
# In[41]:
data.boxplot()
#
# # 4. Benchmark
# #### Compute a model's score on the task
#
# By default, the method naturally train model on train set and test it on test set.
# In[56]:
#data.set_class('income')
data.score()
# In[22]:
# mean and variance on several runs
data.score_error_bars()
# In[37]:
# score on another target
#data.set_class('pH')
#data.score()
# #### Compute score with custom model and scoring function
# In[57]:
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
# any sklearn-style estimator/metric pair can be plugged in
data.score(model=LogisticRegression(), metric=f1_score)
# #### Call auto-sklearn
# In[40]:
# data.score(method='automatic')
#
# # 5. Comparison
# #### Two similar datasets (subsets of the same distribution)
# In[15]:
ad1 = apd.read_csv('autopandas/data/squares1.csv') # CSV separator is inferred automatically
ad2 = apd.read_csv('autopandas/data/squares2.csv')
# ### 3 types of distances:
# #### 1) Between points/columns
# * L0, Euclidean and more
# * Kolmogorov-Smirnov, Jensen-Shannon, Mutual information
# #### 2) Between distributions (datasets)
# #### Default: nn_discrepancy
# In[35]:
ad1.distance(ad2)
# #### Nearest Neighbors Adversarial Accuracy
# In[46]:
ad1[:300].distance(ad2[:300], method='nnaa')
# * Euclidean?
# * MMD?
# * etc.
# #### Adversarial score / Binary classification / Discriminant / Classifier score / Discrepancy metric
# In[36]:
print(ad1.distance(ad2, method='discriminant')) # or 'discrepancy'
from sklearn.neural_network import MLPClassifier
# a custom discriminator model can be supplied for the adversarial distance
model = MLPClassifier(hidden_layer_sizes=(20, 20))
print(ad1.distance(ad2, method='discriminant', model=model))
# #### Task score / Utility
# Some possible parameters: model, metric.
# In[37]:
ad1.set_class('0.0.29')
ad2.set_class('0.0.29')
print(ad1.score(verbose=True)) # Trained on ad1 and tested on ad1 (with split)
print(ad2.score()) # Trained on ad2 and tested on ad2 (with split)
print(ad1.score(test=ad2)) # Trained on ad1 and tested on ad2
print(ad2.score(test=ad1)) # Trained on ad2 and tested on ad1
# ### Overlay plot
# In[38]:
pca1 = ad1.pca(n_components=2)
pca2 = ad2.pca(n_components=2)
pca1.plot(ad=pca2) # alias ad.plot(pca1, pca2)
# ### Marginal plots
# In[25]:
ad1.compare_marginals(ad2, method='all', target='0.0') # if no target, it uses the defined class
#
# # 6. Generation
# #### Copy
# In[14]:
gen = apd.generators.Copycat()
gen.fit(data)
gen.sample(n=5)
# In[15]:
# a perfect copy should give a near-zero distance to the original data
gendata = gen.sample(data.shape[0])
data.distance(gendata)
# #### Additive Noise Model
# In[20]:
gen = apd.generators.ANM() # use 'model' parameter to use custom model for imputation
gen.fit(data)
gen.sample(n=5)
# In[21]:
# p is the probability of replacing original data
# If p is small a lot of data is copied and nnaa can be under 0.5
generated = gen.sample(n=500, p=0.1)
print(data[:500].distance(generated, method='nnaa'))
generated = gen.sample(n=500, p=1)
print(data[:500].distance(generated, method='nnaa'))
# In[25]:
# project original and generated data with the SAME fitted PCA model so the
# two point clouds live in a comparable 2D space
pca_data_1, pca_model = data.pca(n_components=2, return_param=True)
pca_data_1 = pca_data_1[:500]
pca_data_2 = gen.sample(n=500).pca(model=pca_model)
# FIX: the original cell plotted pca1/pca2 — stale variables from section 5 —
# leaving pca_data_1/pca_data_2 unused; plot the freshly computed projections.
pca_data_1.plot(ad=pca_data_2, names=['original', 'generated'])
# #### Copula
# In[6]:
gen = apd.generators.Copula() # use 'model' parameter to use custom model generation (between copula tricks)
gen.fit(data)
# compare first and second moments of real vs. generated marginals
data.compare_marginals(gen.sample(n=500), method='mean')
data.compare_marginals(gen.sample(n=500), method='std')
# In[ ]:
# Also:
# - VAE (apd.generators.VAE)
# - GMM (apd.generators.GMM)
# - KDE (apd.generators.KDE)
# #### Artificial data
# In[3]:
gen = apd.generators.Artificial()
gen.sample(n=5)
# In[7]:
gen.sample(n=100, noise=0.05).get_data('X').plot()
# In[8]:
gen = apd.generators.Artificial(method='blobs')
gen.sample(n=5)
# In[9]:
gen.sample(n=100).get_data('X').plot()
# In[ ]: