#!/usr/bin/env python # coding: utf-8 # # Manipulation Techniques # # There are lots of ways to change the shape and data in your `DataFrame`. # # Let's explore the popular options. # In[1]: # Setup from datetime import datetime import os import numpy as np import pandas as pd users = pd.read_csv(os.path.join('data', 'users.csv'), index_col=0) transactions = pd.read_csv(os.path.join('data', 'transactions.csv'), index_col=0) # Pop out a quick sanity check (users.shape, transactions.shape) # ## Assigning values # Let's assume that we work on the CashBox Customer Support team and the user with name of **`Adrian Fang`** called and told us about an error in his balance. It should be **`$35`**, but is currently **`$30.01`** # # ### With Chaining (don't do this) # Let's walk ourselves right into a common problem... # In[2]: # First let's make sure there is only one Adrian Fang users[(users.first_name == "Adrian") & (users.last_name == "Fang")] # > [Yo, Adrian! I did it!](https://www.youtube.com/watch?v=_61IZp_RNYg) # # Our goal is to update the balance, so the common thought process here, usually leads for us to just chain off the returned `DataFrame` like so... # In[3]: users[(users.first_name == "Adrian") & (users.last_name == "Fang")]['balance'] # ... and since that appears to work, maybe we'll go ahead and set it to the new value. # In[4]: users[(users.first_name == "Adrian") & (users.last_name == "Fang")]['balance'] = 35.00 # Uh oh! As you can see in the **SettingWithCopyWarning** above that you should not attempt to chain and assign this way. # # ### Using `loc` # The [solution](http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy) is to use the `DataFrame.loc` object to locate the row and specific column to update. # In[5]: users.loc[(users.first_name == "Adrian") & (users.last_name == "Fang"), 'balance'] = 35.00 # Display our updated user with the new value assigned users.loc['adrian'] # ### Using `at` # # You can also use the `DataFrame.at` method to quickly set scalar values # In[6]: users.at['adrian', 'balance'] = 35.00 # ## Adding Rows # # So we changed the **`balance`** variable for Adrian, and now we need to track that the transaction occurred. # # Let's take a quick peek at the **`transcactions`** DataFrame. # In[7]: transactions.head() # In[8]: # Let's build a new record record = dict(sender=np.nan, receiver='adrian', amount=4.99, sent_date=datetime.now().date()) # ### Appending with [`DataFrame.append`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.append.html) # # The `DataFrame.append`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.append.html) method adds a new row to a new dataset. This method doesn't change the original dataset it returns a copy of the DataFrame with the new row(s) appended. # # The index for our **`transactions`** is auto assigned, so we'll set the **`ignore_index`** keyword argument to `True`, so it gets generated. # # In[9]: # Remember this is returning a copy... transactions.append(record, ignore_index=True).tail() # If you are appending multiple rows, the more effective way to get the job done is by using the [`pandas.concat`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.concat.htm) method. # ### Setting With Enlargement # # If you assign to a non-existent index key, the DataFrame will be enlarged automatically, the row will just be added. # # There is a slight problem here, as the index in the **`transactions`** DataFrame is autogenerated. A popular workaround is to figure out the last used index, and increment it. # In[10]: # Largest current record, incremented next_key = transactions.index.max() + 1 transactions.loc[next_key] = record # Make sure it got added transactions.tail() # ## Adding Columns # # You can add columns much like you do rows, missing values will be set to `np.nan`. # ### Setting With Enlargement # In[11]: latest_id = transactions.index.max() # Add a new column named notes transactions.at[latest_id, 'notes'] = 'Adrian called customer support to report billing error.' transactions.tail() # The column can be added and assigned from an expression. # In[12]: # Add a new column called large. This is a bad name and use of a column ;) transactions['large'] = transactions.amount > 70 # In[13]: transactions.head() # ## Renaming Columns # Renaming columns can be acheived using the [`DataFrame.rename`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.rename.html) method. You specify the current name(s) as the key(s) and the new name(s) as the value(s). # # By default this returns a copy, but you can use the `inplace` parameter to change the existing `DataFrame`. # In[14]: transactions.rename(columns={'large': 'big_sender'}, inplace=True) transactions.head() # ## Deleting Columns # # In addition to slicing a `DataFrame` to simply not include a specific existing column. You can also drop columns by name. Let's remove the two that we added, in place. # In[15]: transactions.drop(columns=['notes'], inplace=True) transactions.head() # You might have seen this done before using the `axis` parameter. # # Let's get rid of the oddly named **`big_sender`** column. Why'd you let me name it that way? # In[16]: transactions.drop(['big_sender'], axis='columns', inplace=True) transactions.head() # ## Deleting Rows # You can see also use the [`DataFrame.drop`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.drop.html) method to remove row(s) by index. # In[17]: last_key = transactions.index.max() transactions.drop(index=[last_key], inplace=True) transactions.tail()