#!/usr/bin/env python
# coding: utf-8

# # AutoPandas

# Process, visualize and use data easily.

# ### Table of contents
#
# * [1. Read data](#section1)
# * [2. Processing](#section2)
# * [3. Visualization](#section3)
# * [4. Benchmark](#section4)
# * [5. Comparison](#section5)
# * [6. Generation](#section6)

# In[1]:

import autopandas as apd
get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

# <a id="section1"></a>
# # 1. Read data

# #### Easily load CSV, AutoML or pd.DataFrame.
#
# apd.AutoData is a subclass of pd.DataFrame representing 2D data frames:
# * Examples are in rows
# * Features are in columns
#
# You can load a dataset in CSV format by calling the `read_csv` method. It automatically detects the separator.

# In[3]:

input_file = 'autopandas/data/wine.csv' # wine, adult, diabetes, iris, mushrooms, seeds, squares1-2, titanic, boston
data = apd.read_csv(input_file)

# You can also load toy datasets from the `datasets` module.

# In[2]:

data = apd.datasets.load_wine()

# In[5]:

data.head()

# **Keys:**
# * **Row**: train, valid, test, header
# * **Column**: X, y, categorical, numerical

# In[7]:

# show the index entry for 'numerical'
data.indexes['numerical']
# try also: categorical, X, y, train, test, X_categorical, y_test, etc.

# #### Get a subset of data

# In[8]:

# calling the object itself is equivalent to calling the "get_data" method
data('header')
#data('X_header')           # first five rows of X (examples without the class target)
#data('categorical_header') # first five rows of the categorical variables
#data('numerical_header')   # first five rows of the numerical variables
#data('y_test')             # test set of the target
#data.get_data('y')         # the target variable

# #### Set the target variable (class) if needed
#
# You can set a continuous or a categorical variable. AutoPandas will detect whether it is a **classification or regression** problem.
#
# You can also set a **list of targets**.

# In[9]:

data.set_class('quality')
#data.set_class(['age', 'fnlwgt']) #['income', 'gender'] # multiclass

# #### Split data in train and test sets
#
# A train/test split is done when the data is loaded.
#
# You can re-do the train/test split with different parameters if needed.

# In[7]:

# data.train_test_split(test_size=0.3) # already done automatically

# _Remark: you can also define the train/test sets by hand by setting the index:_
#
#     data.indexes['train'] = [0, 1, 4, 5, 6] # indexes of the train rows

# ### Descriptors

# In[10]:

data.descriptors() # AutoPandas descriptors
# data.describe()  # Pandas describe method

# <a id="section2"></a>
# # 2. Processing
#
# Processing methods share two parameters: **method** and **key** (the targeted set).

# **Missing values imputation**
#
# Several methods are available: mean, median, remove, most.
#
# Imputation with a predictive model is currently being tested.

# In[3]:

data = data.imputation() # mean, median, remove, most

# **Encoding**
#
# Various encodings are available: label, one_hot, likelihood, target, count and more.

# In[4]:

data = data.encoding('label', 'categorical') # encode the categorical variables
#data = data.encoding('one_hot', 'categorical') # one_hot, likelihood, count, target, etc.
#data.encoding('drop', 'numerical') # simply drop the numerical columns

# **Normalization**
#
# Min-max or standard normalization.

# In[5]:

data = data.normalization('standard', 'numerical') # min-max, standard
#data2 = data.normalization('min-max', 'numerical')

# In[17]:

data.min(axis=0)

# #### Synthetic Data Vault
#
# Apply the encoding and decoding method from the paper "The Synthetic Data Vault" by Neha Patki, Roy Wedge and Kalyan Veeramachaneni.

# In[14]:

# encode
sdv_data, limits, min_max = apd.sdv.encode(data)
# do something
new_data = sdv_data
# decode
data = apd.sdv.decode(new_data, data, limits, min_max)

# **Dimensionality reduction**
#
# Dimensionality reduction methods: PCA, LDA, T-SNE, random projection, feature_hashing.

# In[15]:

#data = data.reduction()
data.reduction(method='hashing')('header') # pca, lda, tsne, feature_hashing
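# _The cell below is an illustrative sketch, not AutoPandas internals: it shows
# what the label and one-hot encodings above conceptually do to a categorical
# column, using plain pandas on a hypothetical toy frame._

# In[ ]:

import pandas as pd

toy = pd.DataFrame({'color': ['red', 'white', 'red', 'rose']}) # hypothetical data

# label encoding: map each category to an integer code
toy['color_label'] = toy['color'].astype('category').cat.codes

# one-hot encoding: one indicator column per category
one_hot = pd.get_dummies(toy['color'], prefix='color')

print(toy)
print(one_hot)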
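# _Similarly, a sketch of the two normalizations written out by hand (again
# plain pandas, not the library code): standard scaling maps each column to
# zero mean and unit variance, min-max scaling maps it to the [0, 1] range._

# In[ ]:

import pandas as pd

num = pd.DataFrame({'alcohol': [9.4, 9.8, 11.2, 12.0]}) # hypothetical column

standard = (num - num.mean()) / num.std()              # (x - mean) / std
min_max = (num - num.min()) / (num.max() - num.min())  # (x - min) / (max - min)

print(standard)
print(min_max)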
# <a id="section3"></a>
# # 3. Visualization

# #### Class distribution

# In[19]:

data.plot('y') # alias: data('y').plot()

# #### 2D PCA plot

# In[20]:

# for class coloration: c=data.get_data('y')
# the class is used for coloration by default, but the dimensionality reduction erased the class column
data.pca(n_components=2).plot(c=data.get_data('y'))

# #### T-SNE

# In[21]:

data.tsne().plot(c=data.get_data('y'))

# #### Linear Discriminant Analysis

# In[37]:

data('train').lda(n_components=2).plot(c=data('y_train'))

# #### Heatmap

# In[38]:

data.plot() # alias: data.heatmap()

# #### Correlation matrix plot

# In[39]:

data.corr().plot() # alias: data.correlation()

# #### Features pairplot

# In[40]:

data[data.columns[:3]].pairplot() # max_features is set to 12 by default

# #### Features boxplot

# In[41]:

data.boxplot()

# <a id="section4"></a>
# # 4. Benchmark

# #### Compute a model's score on the task
#
# By default, the method trains a model on the train set and tests it on the test set.

# In[56]:

#data.set_class('income')
data.score()

# In[22]:

# mean and variance over several runs
data.score_error_bars()

# In[37]:

# score on another target
#data.set_class('pH')
#data.score()

# #### Compute the score with a custom model and scoring function

# In[57]:

from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression

data.score(model=LogisticRegression(), metric=f1_score)

# #### Call auto-sklearn

# In[40]:

# data.score(method='automatic')

# <a id="section5"></a>
# # 5. Comparison

# #### Two similar datasets (subsets of the same distribution)

# In[15]:

ad1 = apd.read_csv('autopandas/data/squares1.csv') # the CSV separator is inferred automatically
ad2 = apd.read_csv('autopandas/data/squares2.csv')

# ### 3 types of distances:

# #### 1) Between points/columns
# * L0, Euclidean and more
# * Kolmogorov-Smirnov, Jensen-Shannon, Mutual information

# #### 2) Between distributions (datasets)

# #### Default: nn_discrepancy

# In[35]:

ad1.distance(ad2)

# #### Nearest Neighbors Adversarial Accuracy

# In[46]:

ad1[:300].distance(ad2[:300], method='nnaa')

# * Euclidean?
# * MMD?
# * etc.

# #### Adversarial score / Binary classification / Discriminant / Classifier score / Discrepancy metric

# In[36]:

from sklearn.neural_network import MLPClassifier

print(ad1.distance(ad2, method='discriminant')) # or 'discrepancy'
model = MLPClassifier(hidden_layer_sizes=(20, 20))
print(ad1.distance(ad2, method='discriminant', model=model))

# #### 3) Task score / Utility
#
# Some possible parameters: model, metric.

# In[37]:

ad1.set_class('0.0.29')
ad2.set_class('0.0.29')
print(ad1.score(verbose=True)) # trained on ad1 and tested on ad1 (with split)
print(ad2.score())             # trained on ad2 and tested on ad2 (with split)
print(ad1.score(test=ad2))     # trained on ad1 and tested on ad2
print(ad2.score(test=ad1))     # trained on ad2 and tested on ad1

# ### Overlay plot

# In[38]:

pca1 = ad1.pca(n_components=2)
pca2 = ad2.pca(n_components=2)
pca1.plot(ad=pca2) # alias: ad.plot(pca1, pca2)

# ### Marginal plots

# In[25]:

ad1.compare_marginals(ad2, method='all', target='0.0') # if no target is given, the defined class is used
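# _A sketch of what a Nearest Neighbors Adversarial Accuracy computation can
# look like (illustrative, not the AutoPandas implementation; it assumes two
# equal-sized, purely numerical datasets such as ad1 and ad2 above): each
# point votes for the set containing its nearest neighbor, and 0.5 is the
# ideal score._

# In[ ]:

import numpy as np
from sklearn.neighbors import NearestNeighbors

def nnaa_sketch(a, b):
    def nn_dist(x, y, same_set):
        # distance from each point of x to its nearest neighbor in y;
        # when x is y, skip the point itself (always at distance 0)
        k = 2 if same_set else 1
        dist, _ = NearestNeighbors(n_neighbors=k).fit(y).kneighbors(x)
        return dist[:, -1]
    acc_a = np.mean(nn_dist(a, a, True) < nn_dist(a, b, False))
    acc_b = np.mean(nn_dist(b, b, True) < nn_dist(b, a, False))
    return (acc_a + acc_b) / 2

print(nnaa_sketch(ad1[:300].values, ad2[:300].values))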
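# _Likewise, a minimal sketch of the idea behind the discriminant/discrepancy
# distance (again not the library code; LogisticRegression is an arbitrary
# stand-in): train a classifier to tell the two datasets apart, and read a
# held-out accuracy close to 0.5 as "hard to distinguish"._

# In[ ]:

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

a, b = ad1.get_data('X').values, ad2.get_data('X').values
X = np.vstack([a, b])
y = np.concatenate([np.zeros(len(a)), np.ones(len(b))]) # 0 = ad1, 1 = ad2
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0)
clf = LogisticRegression(max_iter=1000).fit(X_tr, y_tr)
print(clf.score(X_te, y_te)) # ~0.5 when the two datasets are indistinguishable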
# <a id="section6"></a>
# # 6. Generation

# #### Copy

# In[14]:

gen = apd.generators.Copycat()
gen.fit(data)
gen.sample(n=5)

# In[15]:

gendata = gen.sample(data.shape[0])
data.distance(gendata)

# #### Additive Noise Model

# In[20]:

gen = apd.generators.ANM() # use the 'model' parameter for a custom imputation model
gen.fit(data)
gen.sample(n=5)

# In[21]:

# p is the probability of replacing an original value
# If p is small, much of the data is copied verbatim; copies sit at distance
# zero from their originals, so the NNAA score can drop below 0.5
generated = gen.sample(n=500, p=0.1)
print(data[:500].distance(generated, method='nnaa'))
generated = gen.sample(n=500, p=1)
print(data[:500].distance(generated, method='nnaa'))

# In[25]:

pca_data_1, pca_model = data.pca(n_components=2, return_param=True)
pca_data_1 = pca_data_1[:500]
pca_data_2 = gen.sample(n=500).pca(model=pca_model)
pca_data_1.plot(ad=pca_data_2, names=['original', 'generated'])

# #### Copula

# In[6]:

gen = apd.generators.Copula() # use the 'model' parameter for a custom generation model (between copula tricks)
gen.fit(data)
data.compare_marginals(gen.sample(n=500), method='mean')
data.compare_marginals(gen.sample(n=500), method='std')

# In[ ]:

# Also:
# - VAE (apd.generators.VAE)
# - GMM (apd.generators.GMM)
# - KDE (apd.generators.KDE)

# #### Artificial data

# In[3]:

gen = apd.generators.Artificial()
gen.sample(n=5)

# In[7]:

gen.sample(n=100, noise=0.05).get_data('X').plot()

# In[8]:

gen = apd.generators.Artificial(method='blobs')
gen.sample(n=5)

# In[9]:

gen.sample(n=100).get_data('X').plot()

# In[ ]:
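# _A closing sketch tying the Copy generator to the NNAA score: resampling
# original rows with replacement (plain pandas below; it assumes `sample`
# preserves the AutoData type) yields copies at distance zero from their
# twins, so each copy's nearest neighbor falls in the other set and the
# score drops well below 0.5._

# In[ ]:

copied = data[:500].sample(n=500, replace=True) # resample real rows with replacement
print(data[:500].distance(copied, method='nnaa')) # expected well below 0.5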