#!/usr/bin/env python
# coding: utf-8

# ![Pandas Logo](http://pandas.pydata.org/_static/pandas_logo.png)
#
# # Week 08 - Introduction to Pandas
#
# ## Today's Agenda
# - Pandas: Introduction
# - Series
# - DataFrames
# - Indexing, Selecting, Filtering
# - Drop columns
# - Handling missing Data

# In[1]:

# Importing modules
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
sns.set_context("notebook")

# # Series
# A _Series_ is a one-dimensional array-like object containing an array of
# data and an associated array of __data labels__.
# One can use any NumPy data type to assign to the _Series_
#
# Creating a Series:

# In[2]:

series_1 = pd.Series(np.random.random(10))
series_1

# One can get a NumPy array from the Series, by typing:

# In[3]:

series_1.values

# # Reindexing
# One can also get the indices of each element, by typing:

# In[4]:

series_1.index.values

# One can also have a custom set of indices:

# In[5]:

import string
# FIX: `string.lowercase` exists only in Python 2; `string.ascii_lowercase`
# is the portable spelling and works on Python 2 and 3.
alphabet = string.ascii_lowercase
alphabet = np.array([x for x in alphabet])[0:10]
alphabet

# In[6]:

series_2 = pd.Series(np.random.random(len(alphabet)), index=alphabet)
series_2

# One can select only a subsample of the _Series_

# In[7]:

series_1[[1, 3, 4]]

# In[8]:

series_2[['a', 'd', 'h']]

# # Arithmetic and function Mapping
# You can also perform numerical expressions

# In[9]:

series_1[1]**2

# Or find values greater than some value '__x__'

# In[10]:

x = 0.5
series_1[series_1 >= x]

# You can apply functions to a column, and save it as a _new_ Series

# In[11]:

import sys


def exponentials(arr, basis=10.):
    """
    Uses the array `arr` as the exponents for `basis`

    Parameters
    ----------
    arr : numpy array, list, or pandas Series; shape (N,)
        array to be used as exponents of `basis`

    basis : int or float, optional (default = 10)
        number used as the basis

    Returns
    -------
    exp_arr : numpy array, pandas Series, or list; shape (N,)
        array of values for `basis`**`arr`
    """
    if isinstance(arr, list):
        # Plain Python list: exponentiate element by element.
        exp_arr = [basis**x for x in arr]
        return exp_arr
    elif isinstance(arr, (np.ndarray, pd.Series)):
        # NumPy arrays and pandas Series support vectorized exponentiation.
        exp_arr = basis**arr
        return exp_arr
    else:
        cmd = ">>>> `arr` is not a list nor a numpy array"
        cmd += "\n>>>> Please give the correct type of object"
        print(cmd)
        sys.exit(1)


# In[12]:

exponentials(series_1[series_1 >= x])

# You can also __create__ a _Series_ using a _dictionary_ (we talked about
# these on __Week 4__)

# In[13]:

labels_arr = ['foo', 'bar', 'baz']
data_arr = [100, 200, 300]
dict_1 = dict(zip(labels_arr, data_arr))
dict_1

# In[14]:

series_3 = pd.Series(dict_1)
series_3

# # Handling Missing Data
# One of the most useful features of pandas is that it __can handle missing
# data__ quite easily:

# In[15]:

index = ['foo', 'bar', 'baz', 'qux']
series_4 = pd.Series(dict_1, index=index)
series_4

# In[16]:

pd.isnull(series_4)

# In[17]:

series_3 + series_4

# So using a Series is powerful, but __DataFrames__ are probably what gets
# used the most since it represents a _tabular data structure_ containing an
# ordered collection of __columns__ and __rows__.

# # DataFrames
# A DataFrame is a "tabular data structure" containing an _ordered collection
# of columns_. Each column can have a __different__ data type.
#
# Row and column operations are treated roughly symmetrically.
# One can obtain a DataFrame from a normal dictionary, or by reading a file
# with columns and rows.
# Creating a DataFrame

# In[18]:

data_1 = {'state': ['VA', 'VA', 'VA', 'MD', 'MD'],
          'year': [2012, 2013, 2014, 2014, 2015],
          'popu': [5.0, 5.1, 5.2, 4.0, 4.1]}
df_1 = pd.DataFrame(data_1)
df_1

# This DataFrame has 5 rows and 3 columns by the name "_popu_", "_state_",
# and "_year_".
#
# The way to __access__ a DataFrame is quite similar to that of accessing a
# _Series_.
# To access a __column__, one writes the name of the `column`, as in the following example: # In[19]: df_1['popu'] # In[20]: df_1.popu # One can also handle __missing data__ with DataFrames. # Like Series, columns that are not present in the data are NaNs: # In[21]: df_2 = pd.DataFrame(data_1, columns=['year', 'state', 'popu', 'unempl']) df_2 # In[22]: df_2['state'] # One can __retrieve a row__ by: # In[23]: df_2.ix[1] # Editing a DataFrame is quite easy to do. One can _assign_ a Series to a column of the DataFrame. If the Series is a list or an array, __the length must match the DataFrame__. # In[24]: unempl = pd.Series([1.0, 2.0, 10.], index=[1,3,5]) unempl # In[25]: df_2['unempl'] = unempl df_2 # In[26]: df_2.unempl.isnull() # You can also __transpose__ a DataFrame, i.e. switch rows by columns, and columns by rows # In[27]: df_2.T # Now, let's say you want to show __only the 'year' and 'popu' columns__. # You can do it by: # In[28]: df_2[['year', 'popu']] # # Dropping Entries # Let's say you only need a subsample of the table that you have, and you need to __drop__ a column from the DataFrame. # You can do that by using the '_drop_' option: # In[29]: df_2 # In[30]: df_3 = df_2.drop('unempl', axis=1) df_3 # You can also __drop certain rows__: # In[31]: df_4 = df_2.drop([1,4]) df_4 # __Look at this carefully__! The DataFrame _kept_ preserved the same indices as for __df_2__. # # If you can to __reset__ the indices, you can do that by: # In[32]: df_4.reset_index(inplace=True) df_4 # # Gaia Dataset # ![Gaia](http://pbs.twimg.com/profile_images/1266601315/gaiaLogo_reasonably_small.png) # # Pandas is great at reading Data tables and CSV files, and other kinds of documents. # For the remainder of this notebook, we will be using the [Gaia](https://www.cosmos.esa.int/web/gaia)'s DR1 catalogue. 
# In[33]: gaia_df = pd.read_csv('http://1016243957.rsc.cdn77.org/Gaia/gaia_source/csv/GaiaSource_000-000-000.csv.gz', compression='gzip') # In[34]: gaia_df # ### Shape, Columns and Rows # You can get the shape of the "gaia_df" DataFrame by typing: # In[35]: gaia_df.shape # That means there are __218453 rows__ and __57 columns__. # To get an array of the columns available, one could write: # In[36]: gaia_df.columns.values.sort() gaia_df.columns.values # Let's say you only want a DataFrame with the the colums: # - ra (right ascension) # - dec (declination) # - l (galactic longitude) # - b (galactic latitude) # # You do this by using the __loc__ option for the DataFrame: # In[37]: gaia_df_2 = gaia_df.loc[:,['ra','dec','l','b']] gaia_df_2 # This selects all of the rows, and only the selected columns in the list. # You can also select only a subsample of the rows as well, as in the following example. # Let's say I just want a random subsample of __10%__ of the galaxies in the Gaia DR1 catalogue. 
I can do that by: # In[38]: # Defining indices to select from the DataFrame import random nrows = len(gaia_df_2) random_idx = random.sample(np.arange(nrows), int(0.01*nrows)) random_idx = np.sort(random_idx) # In[39]: gaia_df_3 = gaia_df_2.loc[random_idx,:] gaia_df_3 # I'm __re-normalizing__ the indices of this DataFrame # In[40]: gaia_df_3.reset_index(inplace=True, drop=True) gaia_df_3 # You can produce __plots__ directly from the DataFrame # In[41]: gaia_df_3.plot('ra','dec',kind='scatter', label='Gaia', title='Right Ascension and Declination for Gaia') # Or even __Scatterplot Matrices__: # In[42]: sns.pairplot(gaia_df_3) # In[43]: sns.jointplot(gaia_df_3.l, gaia_df_3.b, color='green') # # Indexing, Selecting, Filtering Data # # Now I want to filter the data based on __ra__ and __dec__: # # I want to select all the stars within: # - 45 < RA < 50 # - 5 < Dec < 10 # # Normally, you would could do in numpy using the __`np.where`__ function, like in the following example: # In[44]: ra_arr = gaia_df.ra.values dec_arr = gaia_df.dec.values # In[45]: # Just showing the first 25 elements np.column_stack((ra_arr, dec_arr))[0:25] # In[46]: ## Numpy way of finding the stars that meet the criteria # RA critera ra_idx = np.where((ra_arr >= 45) & (ra_arr <= 50))[0] # Dec criteria dec_idx = np.where((dec_arr >= 5) & (dec_arr <= 10))[0] # Finding `intersecting' indices that meet both criteria radec_idx = np.intersect1d(ra_idx, dec_idx) # Selecting the values from only those indices ra_new = ra_arr[radec_idx] dec_new = dec_arr[radec_idx] # Printing out ra and dec for corresponding indices print(np.column_stack((ra_new, dec_new))) # This is rather convoluted and __long__, and one can easily make a mistake if s/he doesn't keep track of which arrays s/he is using! 
# # In Pandas, this is __much easier!!__ # In[47]: gaia_df_4 = gaia_df.loc[( (gaia_df.ra >= 45) & (gaia_df.ra <= 50) & (gaia_df.dec >= 5) & (gaia_df.dec <= 10))] gaia_df_4[['ra','dec']] # # Future of Pandas # Pandas is a __great__ for handling data, especially comma-delimited or space-separated data. Pandas is also compatible with many other packages, like __seaborn__, __astropy__, NumPy, etc. # # We will have another lecture on Pandas that will cover much more advanced aspects of Pandas. __Make sure you keep checking the schedule!__ # # Resources # - [12 Useful Pandas Techniques in Python for Data Manipulation](https://www.analyticsvidhya.com/blog/2016/01/12-pandas-techniques-python-data-manipulation/) # - [Datacamp Pandas Tutorial](https://www.datacamp.com/community/tutorials/pandas-tutorial-dataframe-python) # - [Top 8 resources for learning data analysis with pandas](http://www.dataschool.io/best-python-pandas-resources/)