#!/usr/bin/env python
# coding: utf-8

# ## Tips for Selecting Columns in a DataFrame
# 
# Notebook to accompany this [post](https://pbpython.com/selecting-columns.html).
# 
# 

# In[1]:


import pandas as pd
import numpy as np


# In[2]:


# Download the CSV export from data.cityofnewyork.us and parse it into a
# DataFrame. This is a live HTTP request, so the cell needs network access.
df = pd.read_csv(
    'https://data.cityofnewyork.us/api/views/vfnx-vebw/rows.csv?accessType=DOWNLOAD&bom=true&format=true'
)


# Build a mapping list so we can see the index of all the columns

# In[3]:


# Each entry has the form "<position>:<column name>", so the zero-based
# position needed by .iloc can be read straight off the list.
col_mapping = [
    f"{position}:{name}" for position, name in enumerate(df.columns)
]


# In[4]:


# Display the labelled list.
col_mapping


# We can also build a dictionary

# In[5]:


# enumerate() yields (position, name) pairs, which dict() turns directly into
# a {position: column name} lookup.
col_mapping_dict = dict(enumerate(df.columns))


# In[6]:


# Display the lookup table.
col_mapping_dict


# Use iloc to select just the third column (Unique Squirrel ID), which sits at zero-based index 2

# In[7]:


# `:` keeps every row; .iloc positions are zero-based, so 2 is the third column.
df.iloc[:, 2]


# Pass a list of integers to select multiple columns by index

# In[8]:


# A list of positions selects exactly those columns, in the order listed.
df.iloc[:, [0,1,2]]


# We can also pass a slice object to select a range of columns

# In[9]:


# The slice end is exclusive, so 0:3 picks positions 0, 1 and 2.
df.iloc[:, 0:3]


# If we want to combine the list and slice notation, we need to use numpy's np.r_ to process the data into an appropriate format.

# In[10]:


# np.r_ expands each slice and concatenates the pieces into one integer array:
# [0, 1, 2, 15, 16, 17, 18, 24, 25].
np.r_[0:3,15:19,24,25]


# We can pass the output of np.r_ to .iloc to use multiple selection approaches

# In[11]:


# The np.r_ result is used directly as the column selector: positions 0-2,
# 15-18, 24 and 25.
df.iloc[:, np.r_[0:3,15:19,24,25]]


# We can use the same notation when reading in a csv as well

# In[12]:


# usecols takes the integer positions produced by np.r_, so only columns
# 1, 2, 5-7 and 15-24 of the download are loaded. This is another live HTTP
# request to the same URL.
df_2 = pd.read_csv(
    'https://data.cityofnewyork.us/api/views/vfnx-vebw/rows.csv?accessType=DOWNLOAD&bom=true&format=true',
    usecols=np.r_[1,2,5:8,15:25],
)


# In[13]:


# Preview the first rows of the reduced frame.
df_2.head()


# We can also select columns using a boolean array

# In[14]:


# Boolean array with one entry per column: True where the column name
# contains "run", ignoring case.
run_cols = df.columns.str.contains('run', case=False)
run_cols


# In[15]:


# The boolean array acts as a column mask: only the True columns are kept.
df.iloc[:, run_cols].head()


# A lambda function can be useful for combining into 1 line.

# In[16]:


# .iloc calls the lambda with the DataFrame being indexed, so the column mask
# is built inline. The parameter is named `frame` so it does not shadow the
# global `df`.
df.iloc[
    :, lambda frame: frame.columns.str.contains('run', case=False)
].head()


# A more complex example

# In[17]:


# The pattern is a regex alternation: keep every column whose name contains
# "district", "precinct" or "boundaries", ignoring case.
df.iloc[
    :,
    lambda frame: frame.columns.str.contains(
        'district|precinct|boundaries', case=False
    )
].head()


# Combining index and boolean arrays

# In[18]:


# Boolean mask over the columns: True where the name contains "district",
# "precinct" or "boundaries" (case-insensitive regex alternation).
location_cols = df.columns.str.contains(
    'district|precinct|boundaries', case=False
)
location_cols


# In[19]:


# Turn the boolean mask into the list of integer positions where it is True,
# so it can be concatenated with other positions.
location_indices = [
    position for position, is_match in enumerate(location_cols) if is_match
]
location_indices


# In[20]:


# np.r_ joins the 0:3 slice with the list of matching positions, so one
# selection returns columns 0-2 plus every location-related column.
df.iloc[:, np.r_[0:3,location_indices]].head()


# In[ ]: