#!/usr/bin/env python
# coding: utf-8
# [Back to the GitHub repository](https://github.com/rasbt/python_reference)
# In[1]:
get_ipython().run_line_magic('load_ext', 'watermark')
get_ipython().run_line_magic('watermark', "-a 'Sebastian Raschka' -v -d -p pandas")
# [More information](http://nbviewer.ipython.org/github/rasbt/python_reference/blob/master/ipython_magic/watermark.ipynb) about the `watermark` magic command extension.
#
#
# # Things in Pandas I Wish I'd Known Earlier
# This is just a small but growing collection of pandas snippets that I find occasionally and particularly useful -- consider it as my personal notebook. Suggestions, tips, and contributions are very, very welcome!
# ### Sections
# - [Loading Some Example Data](#Loading-Some-Example-Data)
# - [Renaming Columns](#Renaming-Columns)
# - [Converting Column Names to Lowercase](#Converting-Column-Names-to-Lowercase)
# - [Renaming Particular Columns](#Renaming-Particular-Columns)
# - [Applying Computations Rows-wise](#Applying-Computations-Rows-wise)
# - [Changing Values in a Column](#Changing-Values-in-a-Column)
# - [Adding a New Column](#Adding-a-New-Column)
# - [Applying Functions to Multiple Columns](#Applying-Functions-to-Multiple-Columns)
# - [Missing Values aka NaNs](#Missing-Values-aka-NaNs)
# - [Counting Rows with NaNs](#Counting-Rows-with-NaNs)
# - [Selecting NaN Rows](#Selecting-NaN-Rows)
# - [Selecting non-NaN Rows](#Selecting-non-NaN-Rows)
# - [Filling NaN Rows](#Filling-NaN-Rows)
# - [Appending Rows to a DataFrame](#Appending-Rows-to-a-DataFrame)
# - [Sorting and Reindexing DataFrames](#Sorting-and-Reindexing-DataFrames)
# - [Updating Columns](#Updating-Columns)
# - [Chaining Conditions - Using Bitwise Operators](#Chaining-Conditions---Using-Bitwise-Operators)
# - [Column Types](#Column-Types)
# - [Printing Column Types](#Printing-Column-Types)
# - [Selecting by Column Type](#Selecting-by-Column-Type)
# - [Converting Column Types](#Converting-Column-Types)
# - [If-tests](#If-tests)
#
#
# # Loading Some Example Data
# [[back to section overview](#Sections)]
# I am heavily into sports prediction (via a machine learning approach) these days. So, let us use a (very) small subset of the soccer data that I am just working with.
# In[2]:
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/rasbt/python_reference/master/Data/some_soccer_data.csv')
df
#
#
# # Renaming Columns
# [[back to section overview](#Sections)]
#
#
# ### Converting Column Names to Lowercase
# In[3]:
# Converting column names to lowercase
df.columns = [c.lower() for c in df.columns]
# or
# df.rename(columns=lambda x : x.lower())
df.tail(3)
#
#
# ### Renaming Particular Columns
# In[4]:
df = df.rename(columns={'p': 'points',
'gp': 'games',
'sot': 'shots_on_target',
'g': 'goals',
'ppg': 'points_per_game',
'a': 'assists',})
df.tail(3)
#
#
# # Applying Computations Rows-wise
# [[back to section overview](#Sections)]
#
#
# ### Changing Values in a Column
# In[5]:
# Processing `salary` column
df['salary'] = df['salary'].apply(lambda x: x.strip('$m'))
df.tail()
#
#
# ### Adding a New Column
# In[6]:
df['team'] = pd.Series('', index=df.index)
# or
df.insert(loc=8, column='position', value='')
df.tail(3)
# In[7]:
# Processing `player` column
def process_player_col(text):
name, rest = text.split('\n')
position, team = [x.strip() for x in rest.split(' — ')]
return pd.Series([name, team, position])
df[['player', 'team', 'position']] = df.player.apply(process_player_col)
# modified after tip from reddit.com/user/hharison
#
# Alternative (inferior) approach:
#
#for idx,row in df.iterrows():
# name, position, team = process_player_col(row['player'])
# df.ix[idx, 'player'], df.ix[idx, 'position'], df.ix[idx, 'team'] = name, position, team
df.tail(3)
#
#
# ### Applying Functions to Multiple Columns
# In[8]:
cols = ['player', 'position', 'team']
df[cols] = df[cols].applymap(lambda x: x.lower())
df.head()
#
#
# # Missing Values aka NaNs
# [[back to section overview](#Sections)]
#
#
# ### Counting Rows with NaNs
# In[9]:
nans = df.shape[0] - df.dropna().shape[0]
print('%d rows have missing values' % nans)
#
#
# ### Selecting NaN Rows
# In[10]:
# Selecting all rows that have NaNs in the `assists` column
df[df['assists'].isnull()]
#
#
# ### Selecting non-NaN Rows
# In[11]:
df[df['assists'].notnull()]
#
#
# ### Filling NaN Rows
# In[12]:
# Filling NaN cells with default value 0
df.fillna(value=0, inplace=True)
df
#
#
# # Appending Rows to a DataFrame
# [[back to section overview](#Sections)]
# In[13]:
# Adding an "empty" row to the DataFrame
import numpy as np
df = df.append(pd.Series(
[np.nan]*len(df.columns), # Fill cells with NaNs
index=df.columns),
ignore_index=True)
df.tail(3)
# In[14]:
# Filling cells with data
df.loc[df.index[-1], 'player'] = 'new player'
df.loc[df.index[-1], 'salary'] = 12.3
df.tail(3)
#
#
# # Sorting and Reindexing DataFrames
# [[back to section overview](#Sections)]
# In[15]:
# Sorting the DataFrame by a certain column (from highest to lowest)
df.sort('goals', ascending=False, inplace=True)
df.head()
# In[16]:
# Optional reindexing of the DataFrame after sorting
df.index = range(1,len(df.index)+1)
df.head()
#
#
# # Updating Columns
# [[back to section overview](#Sections)]
# In[17]:
# Creating a dummy DataFrame with changes in the `salary` column
df_2 = df.copy()
df_2.loc[0:2, 'salary'] = [20.0, 15.0]
df_2.head(3)
#
#
# In[18]:
# Temporarily use the `player` columns as indices to
# apply the update functions
df.set_index('player', inplace=True)
df_2.set_index('player', inplace=True)
df.head(3)
#
#
# In[19]:
# Update the `salary` column
df.update(other=df_2['salary'], overwrite=True)
df.head(3)
#
#
# In[20]:
# Reset the indices
df.reset_index(inplace=True)
df.head(3)
#
#
# # Chaining Conditions - Using Bitwise Operators
# [[back to section overview](#Sections)]
# In[21]:
# Selecting only those players that either playing for Arsenal or Chelsea
df[ (df['team'] == 'arsenal') | (df['team'] == 'chelsea') ]
# In[22]:
# Selecting forwards from Arsenal only
df[ (df['team'] == 'arsenal') & (df['position'] == 'forward') ]
#
#
# # Column Types
# [[back to section overview](#Sections)]
#
#
# ### Printing Column Types
# In[23]:
types = df.columns.to_series().groupby(df.dtypes).groups
types
#
#
# ### Selecting by Column Type
# In[24]:
# select string columns
df.loc[:, (df.dtypes == np.dtype('O')).values].head()
#
#
# ### Converting Column Types
# In[25]:
df['salary'] = df['salary'].astype(float)
# In[26]:
types = df.columns.to_series().groupby(df.dtypes).groups
types
#
#
# # If-tests
# [[back to section overview](#Sections)]
# I was recently asked how to do an if-test in pandas, that is, how to create an array of 1s and 0s depending on a condition, e.g., if `val` less than 0.5 -> 0, else -> 1. Using the boolean mask, that's pretty simple since `True` and `False` are integers after all.
# In[1]:
int(True)
# In[2]:
import pandas as pd
a = [[2., .3, 4., 5.], [.8, .03, 0.02, 5.]]
df = pd.DataFrame(a)
df
# In[3]:
df = df <= 0.05
df
# In[4]:
df.astype(int)
# In[ ]: