#!/usr/bin/env python # coding: utf-8 # [Back to the GitHub repository](https://github.com/rasbt/python_reference) # In[1]: get_ipython().run_line_magic('load_ext', 'watermark') get_ipython().run_line_magic('watermark', "-a 'Sebastian Raschka' -v -d -p pandas") # [More information](http://nbviewer.ipython.org/github/rasbt/python_reference/blob/master/ipython_magic/watermark.ipynb) about the `watermark` magic command extension. #
#
# # Things in Pandas I Wish I'd Known Earlier # This is just a small but growing collection of pandas snippets that I find occasionally and particularly useful -- consider it as my personal notebook. Suggestions, tips, and contributions are very, very welcome! # ### Sections # - [Loading Some Example Data](#Loading-Some-Example-Data) # - [Renaming Columns](#Renaming-Columns) # - [Converting Column Names to Lowercase](#Converting-Column-Names-to-Lowercase) # - [Renaming Particular Columns](#Renaming-Particular-Columns) # - [Applying Computations Rows-wise](#Applying-Computations-Rows-wise) # - [Changing Values in a Column](#Changing-Values-in-a-Column) # - [Adding a New Column](#Adding-a-New-Column) # - [Applying Functions to Multiple Columns](#Applying-Functions-to-Multiple-Columns) # - [Missing Values aka NaNs](#Missing-Values-aka-NaNs) # - [Counting Rows with NaNs](#Counting-Rows-with-NaNs) # - [Selecting NaN Rows](#Selecting-NaN-Rows) # - [Selecting non-NaN Rows](#Selecting-non-NaN-Rows) # - [Filling NaN Rows](#Filling-NaN-Rows) # - [Appending Rows to a DataFrame](#Appending-Rows-to-a-DataFrame) # - [Sorting and Reindexing DataFrames](#Sorting-and-Reindexing-DataFrames) # - [Updating Columns](#Updating-Columns) # - [Chaining Conditions - Using Bitwise Operators](#Chaining-Conditions---Using-Bitwise-Operators) # - [Column Types](#Column-Types) # - [Printing Column Types](#Printing-Column-Types) # - [Selecting by Column Type](#Selecting-by-Column-Type) # - [Converting Column Types](#Converting-Column-Types) # - [If-tests](#If-tests) #
#
# # Loading Some Example Data # [[back to section overview](#Sections)] # I am heavily into sports prediction (via a machine learning approach) these days. So, let us use a (very) small subset of the soccer data that I am just working with. # In[2]: import pandas as pd df = pd.read_csv('https://raw.githubusercontent.com/rasbt/python_reference/master/Data/some_soccer_data.csv') df #
#
# # Renaming Columns # [[back to section overview](#Sections)] #
#
# ### Converting Column Names to Lowercase # In[3]: # Converting column names to lowercase df.columns = [c.lower() for c in df.columns] # or # df.rename(columns=lambda x : x.lower()) df.tail(3) #
#
# ### Renaming Particular Columns # In[4]: df = df.rename(columns={'p': 'points', 'gp': 'games', 'sot': 'shots_on_target', 'g': 'goals', 'ppg': 'points_per_game', 'a': 'assists',}) df.tail(3) #
#
# # Applying Computations Rows-wise # [[back to section overview](#Sections)] #
#
# ### Changing Values in a Column # In[5]: # Processing `salary` column df['salary'] = df['salary'].apply(lambda x: x.strip('$m')) df.tail() #
#
# ### Adding a New Column # In[6]: df['team'] = pd.Series('', index=df.index) # or df.insert(loc=8, column='position', value='') df.tail(3) # In[7]: # Processing `player` column def process_player_col(text): name, rest = text.split('\n') position, team = [x.strip() for x in rest.split(' — ')] return pd.Series([name, team, position]) df[['player', 'team', 'position']] = df.player.apply(process_player_col) # modified after tip from reddit.com/user/hharison # # Alternative (inferior) approach: # #for idx,row in df.iterrows(): # name, position, team = process_player_col(row['player']) # df.ix[idx, 'player'], df.ix[idx, 'position'], df.ix[idx, 'team'] = name, position, team df.tail(3) #
#
# ### Applying Functions to Multiple Columns # In[8]: cols = ['player', 'position', 'team'] df[cols] = df[cols].applymap(lambda x: x.lower()) df.head() #
#
# # Missing Values aka NaNs # [[back to section overview](#Sections)] #
#
# ### Counting Rows with NaNs # In[9]: nans = df.shape[0] - df.dropna().shape[0] print('%d rows have missing values' % nans) #
#
# ### Selecting NaN Rows # In[10]: # Selecting all rows that have NaNs in the `assists` column df[df['assists'].isnull()] #
#
# ### Selecting non-NaN Rows # In[11]: df[df['assists'].notnull()] #
#
# ### Filling NaN Rows # In[12]: # Filling NaN cells with default value 0 df.fillna(value=0, inplace=True) df #
#
# # Appending Rows to a DataFrame # [[back to section overview](#Sections)] # In[13]: # Adding an "empty" row to the DataFrame import numpy as np df = df.append(pd.Series( [np.nan]*len(df.columns), # Fill cells with NaNs index=df.columns), ignore_index=True) df.tail(3) # In[14]: # Filling cells with data df.loc[df.index[-1], 'player'] = 'new player' df.loc[df.index[-1], 'salary'] = 12.3 df.tail(3) #
#
# # Sorting and Reindexing DataFrames # [[back to section overview](#Sections)] # In[15]: # Sorting the DataFrame by a certain column (from highest to lowest) df.sort('goals', ascending=False, inplace=True) df.head() # In[16]: # Optional reindexing of the DataFrame after sorting df.index = range(1,len(df.index)+1) df.head() #
#
# # Updating Columns # [[back to section overview](#Sections)] # In[17]: # Creating a dummy DataFrame with changes in the `salary` column df_2 = df.copy() df_2.loc[0:2, 'salary'] = [20.0, 15.0] df_2.head(3) #
#
# In[18]: # Temporarily use the `player` columns as indices to # apply the update functions df.set_index('player', inplace=True) df_2.set_index('player', inplace=True) df.head(3) #
#
# In[19]: # Update the `salary` column df.update(other=df_2['salary'], overwrite=True) df.head(3) #
#
# In[20]: # Reset the indices df.reset_index(inplace=True) df.head(3) #
#
# # Chaining Conditions - Using Bitwise Operators # [[back to section overview](#Sections)] # In[21]: # Selecting only those players that either playing for Arsenal or Chelsea df[ (df['team'] == 'arsenal') | (df['team'] == 'chelsea') ] # In[22]: # Selecting forwards from Arsenal only df[ (df['team'] == 'arsenal') & (df['position'] == 'forward') ] #
#
# # Column Types # [[back to section overview](#Sections)] #
#
# ### Printing Column Types # In[23]: types = df.columns.to_series().groupby(df.dtypes).groups types #
#
# ### Selecting by Column Type # In[24]: # select string columns df.loc[:, (df.dtypes == np.dtype('O')).values].head() #
#
# ### Converting Column Types # In[25]: df['salary'] = df['salary'].astype(float) # In[26]: types = df.columns.to_series().groupby(df.dtypes).groups types #
#
# # If-tests # [[back to section overview](#Sections)] # I was recently asked how to do an if-test in pandas, that is, how to create an array of 1s and 0s depending on a condition, e.g., if `val` less than 0.5 -> 0, else -> 1. Using the boolean mask, that's pretty simple since `True` and `False` are integers after all. # In[1]: int(True) # In[2]: import pandas as pd a = [[2., .3, 4., 5.], [.8, .03, 0.02, 5.]] df = pd.DataFrame(a) df # In[3]: df = df <= 0.05 df # In[4]: df.astype(int) # In[ ]: