#!/usr/bin/env python
# coding: utf-8

# # Handling Duplicated and Missing Data

# In[1]:

# Setup
from datetime import datetime
import os

import numpy as np
import pandas as pd

from utils import render  # project-local helper for notebook display

# Load the three source tables; column 0 of each CSV is the index
# (username for users, ids for transactions/requests).
users = pd.read_csv(os.path.join('data', 'users.csv'), index_col=0)
transactions = pd.read_csv(os.path.join('data', 'transactions.csv'), index_col=0)
requests = pd.read_csv(os.path.join('data', 'requests.csv'), index_col=0)

# Perform the merge from the previous notebook
# (s2n06-combining-dataframes.ipynb).
# A request's `from_user` asks `to_user` for money, so in the matching
# transaction the requester is the `receiver` and the requestee is the
# `sender` — hence the crossed key mapping below.
successful_requests = requests.merge(
    transactions,
    left_on=['from_user', 'to_user', 'amount'],
    right_on=['receiver', 'sender', 'amount']
)

# Statement from previous notebook
"Wow! ${:,.2f} has passed through the request system in {} transactions!!!".format(
    successful_requests.amount.sum(),
    len(successful_requests),
)

# ## Duplicated Data
#
# We realized in the previous notebook (s2n6-combining-dataframes.ipynb) that
# the **`requests`** `DataFrame` had duplicates. Unfortunately this means that
# our **`successful_requests`** also contains duplicates because we merged
# those same values with a transaction, even though in actuality, only one of
# those duplicated requests should be deemed "successful".
#
# We should correct our `DataFrame` by removing the duplicate requests,
# keeping only the last one, as that is really the one that triggered the
# actual transaction. The great news is that there is a method named
# [`drop_duplicates`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.drop_duplicates.html)
# that does just that.
#
# Like `duplicated` there is a `keep` parameter that works similarly: you
# tell it which of the duplicates to keep.

# In[2]:

# Let's get our records sorted chronologically, so that "last" below means
# the most recent request for each (from_user, to_user, amount) group.
successful_requests.sort_values('request_date', inplace=True)

# And then we'll drop dupes keeping only the last one.
# Note the use of the inplace keyword: the subset of columns that defines a
# "duplicate" is the request key, and keep='last' retains the most recent
# (rows were sorted by request_date in the previous cell).
successful_requests.drop_duplicates(('from_user', 'to_user', 'amount'), keep='last', inplace=True)

# Statement from previous notebook
"Wow! ${:,.2f} has passed through the request system in {} transactions!!!".format(
    successful_requests.amount.sum(),
    len(successful_requests),
)

# Yikes, that little bit of difference can be pretty important, especially if
# that data is being used to make a decision. Always make sure to check for
# duplicates in situations where you are merging one or more `DataFrame`s
# together.

# ## Locating records not found in another DataFrame using
# [`isin`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.isin.html)
#
# Now that we have seen how successful the requests have been, it would be
# nice to locate all the users who **have not** yet made a request to
# another user.
#
# Basically we want to ask, for each user in the **`users`** `DataFrame`,
# whether they appear in the **`requests`** `DataFrame` as a **`from_user`**.
#
# The method `isin` is available for
# [`Series`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.isin.html),
# [`DataFrame`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.isin.html),
# and even an
# [`Index`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Index.isin.html).
# The method takes an iterable and returns a boolean index of whether or not
# the value is contained in the owning `Series` or `DataFrame`.

# In[3]:

# Create a boolean array where we check to see if the label (username)
# is in the `from_user` Series.
made_request_index = users.index.isin(requests.from_user)

# This will get us a list of all users who **have** made a request
users[made_request_index].head()

# Because we have a boolean index, we can negate it, using the `~` bitwise
# operator, to get to what we were looking for: users who have not yet made
# a request.
# In[4]:

# Invert the boolean index: users who have NOT made a request.
users[~made_request_index].head()

# ## Locating Missing Data
#
# As I was looking at these people who hadn't made requests I noticed that a
# few of them had a NaN (Not A Number) for a **`last_name`**.
#
# We can get a quick overview of how many non-blank values we have by using
# the [`DataFrame.count`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.count.html)

# In[5]:

# count() reports non-missing values per column, so a column with NaNs
# shows a smaller number than the total row count.
users.count()

# [`Series.isna`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.isna.html)
# is a method that allows you to check for missing values, such as `None` or
# `np.nan`. It returns a boolean array which can be used as an index.

# In[6]:

# Gather all users where the last name is missing
users[users.last_name.isna()].head()

# ### Filling Missing Values
#
# You can also set all unknown values to a specific value using the
# [`fillna`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.fillna.html)
# method. By default it will return a new `DataFrame`; use the `inplace`
# parameter to update the existing one.

# In[9]:

# Make a copy of the DataFrame with "Unknown" as the last name where it is
# missing.
# NOTE(review): this fills NaNs in EVERY column, not just last_name —
# presumably last_name is the only column with missing values here; verify,
# or use users.fillna({'last_name': 'Unknown'}) to be explicit.
users_with_unknown = users.fillna('Unknown')

# Make sure we got 'em all
users_with_unknown[users_with_unknown.last_name.isna()]

# ### Dropping Rows with Missing Data
#
# Another option is to just drop rows that contain missing data. That is the
# [`DataFrame.dropna`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.dropna.html)
# method. Use the `inplace` parameter to update the existing `DataFrame`.
#
# In this situation it's a little drastic, but you can imagine cases where
# missing data makes what you are trying to accomplish invalid.

# In[10]:

users_with_last_names = users.dropna()

# Row counts of the original vs. the version with incomplete rows dropped
(len(users), len(users_with_last_names))