#!/usr/bin/env python
# coding: utf-8

# ## Tips for Selecting Columns in a DataFrame
#
# Notebook to accompany this [post](https://pbpython.com/selecting-columns.html).
#
# This file is a notebook export: every `# In[n]:` marker starts one cell, and
# a bare expression at the end of a cell is the value the notebook displays.

# In[1]:


import numpy as np
import pandas as pd


# In[2]:


# The same CSV export is read twice below (once in full, once with `usecols`),
# so the URL is defined in a single place.
DATA_URL = (
    'https://data.cityofnewyork.us/api/views/vfnx-vebw/rows.csv'
    '?accessType=DOWNLOAD&bom=true&format=true'
)

df = pd.read_csv(DATA_URL)


# Build a mapping list so we can see the positional index of every column

# In[3]:


col_mapping = [f"{position}:{name}" for position, name in enumerate(df.columns)]


# In[4]:


col_mapping


# We can also build a dictionary that maps position -> column name

# In[5]:


col_mapping_dict = dict(enumerate(df.columns))


# In[6]:


col_mapping_dict


# Use iloc to select a single column by position. Positions are zero-based, so
# index 2 is the third column (Unique Squirrel ID)

# In[7]:


df.iloc[:, 2]


# Pass a list of integers to select multiple columns by index

# In[8]:


df.iloc[:, [0, 1, 2]]


# We can also pass a slice object to select a range of columns

# In[9]:


df.iloc[:, 0:3]


# If we want to combine the list and slice notation, we need to use numpy's
# np.r_ to turn the mix of slices and scalars into one flat integer array.

# In[10]:


np.r_[0:3, 15:19, 24, 25]


# We can pass the output of np.r_ to .iloc to use multiple selection approaches

# In[11]:


df.iloc[:, np.r_[0:3, 15:19, 24, 25]]


# We can use the same notation when reading in a csv as well

# In[12]:


df_2 = pd.read_csv(
    DATA_URL,
    usecols=np.r_[1, 2, 5:8, 15:25],
)


# In[13]:


df_2.head()


# We can also select columns using a boolean array

# In[14]:


run_cols = df.columns.str.contains('run', case=False)
run_cols


# In[15]:


df.iloc[:, run_cols].head()


# A lambda function can be useful for combining into 1 line.
# .iloc calls the lambda with the DataFrame being indexed; the parameter is
# named `frame` so it does not shadow the module-level `df`.

# In[16]:


df.iloc[:, lambda frame: frame.columns.str.contains('run', case=False)].head()


# A more complex example: the pattern is a regex alternation, so any column
# whose name contains one of the three words is selected

# In[17]:


df.iloc[
    :,
    lambda frame: frame.columns.str.contains(
        'district|precinct|boundaries', case=False
    ),
].head()


# Combining index and boolean arrays

# In[18]:


location_cols = df.columns.str.contains('district|precinct|boundaries', case=False)
location_cols


# In[19]:


# Convert the boolean mask into the integer positions of its True entries so
# that it can be combined with a positional slice through np.r_.
location_indices = [
    position for position, is_match in enumerate(location_cols) if is_match
]
location_indices


# In[20]:


df.iloc[:, np.r_[0:3, location_indices]].head()


# In[ ]: