#!/usr/bin/env python # coding: utf-8 # This notebook was prepared by [Donne Martin](http://donnemartin.com). Source and license info is on [GitHub](https://github.com/donnemartin/data-science-ipython-notebooks). # # Pandas # # Credits: The following are notes taken while working through [Python for Data Analysis](http://www.amazon.com/Python-Data-Analysis-Wrangling-IPython/dp/1449319793) by Wes McKinney # # * Series # * DataFrame # * Reindexing # * Dropping Entries # * Indexing, Selecting, Filtering # * Arithmetic and Data Alignment # * Function Application and Mapping # * Sorting and Ranking # * Axis Indices with Duplicate Values # * Summarizing and Computing Descriptive Statistics # * Cleaning Data (Under Construction) # * Input and Output (Under Construction) # In[1]: from pandas import Series, DataFrame import pandas as pd import numpy as np # ## Series # # A Series is a one-dimensional array-like object containing an array of data and an associated array of data labels. The data can be any NumPy data type and the labels are the Series' index. # Create a Series: # In[2]: ser_1 = Series([1, 1, 2, -3, -5, 8, 13]) ser_1 # Get the array representation of a Series: # In[3]: ser_1.values # Index objects are immutable and hold the axis labels and metadata such as names and axis names. # # Get the index of the Series: # In[4]: ser_1.index # Create a Series with a custom index: # In[5]: ser_2 = Series([1, 1, 2, -3, -5], index=['a', 'b', 'c', 'd', 'e']) ser_2 # Get a value from a Series: # In[6]: ser_2[4] == ser_2['e'] # Get a set of values from a Series by passing in a list: # In[7]: ser_2[['c', 'a', 'b']] # Get values great than 0: # In[8]: ser_2[ser_2 > 0] # Scalar multiply: # In[9]: ser_2 * 2 # Apply a numpy math function: # In[10]: import numpy as np np.exp(ser_2) # A Series is like a fixed-length, ordered dict. # # Create a series by passing in a dict: # In[11]: dict_1 = {'foo' : 100, 'bar' : 200, 'baz' : 300} ser_3 = Series(dict_1) ser_3 # Re-order a Series by passing in an index (indices not found are NaN): # In[12]: index = ['foo', 'bar', 'baz', 'qux'] ser_4 = Series(dict_1, index=index) ser_4 # Check for NaN with the pandas method: # In[13]: pd.isnull(ser_4) # Check for NaN with the Series method: # In[14]: ser_4.isnull() # Series automatically aligns differently indexed data in arithmetic operations: # In[15]: ser_3 + ser_4 # Name a Series: # In[16]: ser_4.name = 'foobarbazqux' # Name a Series index: # In[17]: ser_4.index.name = 'label' # In[18]: ser_4 # Rename a Series' index in place: # In[19]: ser_4.index = ['fo', 'br', 'bz', 'qx'] ser_4 # ## DataFrame # # A DataFrame is a tabular data structure containing an ordered collection of columns. Each column can have a different type. DataFrames have both row and column indices and is analogous to a dict of Series. Row and column operations are treated roughly symmetrically. Columns returned when indexing a DataFrame are views of the underlying data, not a copy. To obtain a copy, use the Series' copy method. # # Create a DataFrame: # In[20]: data_1 = {'state' : ['VA', 'VA', 'VA', 'MD', 'MD'], 'year' : [2012, 2013, 2014, 2014, 2015], 'pop' : [5.0, 5.1, 5.2, 4.0, 4.1]} df_1 = DataFrame(data_1) df_1 # Create a DataFrame specifying a sequence of columns: # In[21]: df_2 = DataFrame(data_1, columns=['year', 'state', 'pop']) df_2 # Like Series, columns that are not present in the data are NaN: # In[22]: df_3 = DataFrame(data_1, columns=['year', 'state', 'pop', 'unempl']) df_3 # Retrieve a column by key, returning a Series: # # In[23]: df_3['state'] # Retrive a column by attribute, returning a Series: # In[24]: df_3.year # Retrieve a row by position: # In[25]: df_3.ix[0] # Update a column by assignment: # In[26]: df_3['unempl'] = np.arange(5) df_3 # Assign a Series to a column (note if assigning a list or array, the length must match the DataFrame, unlike a Series): # In[27]: unempl = Series([6.0, 6.0, 6.1], index=[2, 3, 4]) df_3['unempl'] = unempl df_3 # Assign a new column that doesn't exist to create a new column: # In[28]: df_3['state_dup'] = df_3['state'] df_3 # Delete a column: # In[29]: del df_3['state_dup'] df_3 # Create a DataFrame from a nested dict of dicts (the keys in the inner dicts are unioned and sorted to form the index in the result, unless an explicit index is specified): # In[30]: pop = {'VA' : {2013 : 5.1, 2014 : 5.2}, 'MD' : {2014 : 4.0, 2015 : 4.1}} df_4 = DataFrame(pop) df_4 # Transpose the DataFrame: # In[31]: df_4.T # Create a DataFrame from a dict of Series: # In[32]: data_2 = {'VA' : df_4['VA'][1:], 'MD' : df_4['MD'][2:]} df_5 = DataFrame(data_2) df_5 # Set the DataFrame index name: # In[33]: df_5.index.name = 'year' df_5 # Set the DataFrame columns name: # In[34]: df_5.columns.name = 'state' df_5 # Return the data contained in a DataFrame as a 2D ndarray: # In[35]: df_5.values # If the columns are different dtypes, the 2D ndarray's dtype will accomodate all of the columns: # In[36]: df_3.values # ## Reindexing # Create a new object with the data conformed to a new index. Any missing values are set to NaN. # In[37]: df_3 # Reindexing rows returns a new frame with the specified index: # In[38]: df_3.reindex(list(reversed(range(0, 6)))) # Missing values can be set to something other than NaN: # In[39]: df_3.reindex(range(6, 0), fill_value=0) # Interpolate ordered data like a time series: # In[40]: ser_5 = Series(['foo', 'bar', 'baz'], index=[0, 2, 4]) # In[41]: ser_5.reindex(range(5), method='ffill') # In[42]: ser_5.reindex(range(5), method='bfill') # Reindex columns: # In[43]: df_3.reindex(columns=['state', 'pop', 'unempl', 'year']) # Reindex rows and columns while filling rows: # In[44]: df_3.reindex(index=list(reversed(range(0, 6))), fill_value=0, columns=['state', 'pop', 'unempl', 'year']) # Reindex using ix: # In[45]: df_6 = df_3.ix[range(0, 7), ['state', 'pop', 'unempl', 'year']] df_6 # ## Dropping Entries # Drop rows from a Series or DataFrame: # In[46]: df_7 = df_6.drop([0, 1]) df_7 # Drop columns from a DataFrame: # In[47]: df_7 = df_7.drop('unempl', axis=1) df_7 # ## Indexing, Selecting, Filtering # Series indexing is similar to NumPy array indexing with the added bonus of being able to use the Series' index values. # In[48]: ser_2 # Select a value from a Series: # In[49]: ser_2[0] == ser_2['a'] # Select a slice from a Series: # In[50]: ser_2[1:4] # Select specific values from a Series: # In[51]: ser_2[['b', 'c', 'd']] # Select from a Series based on a filter: # In[52]: ser_2[ser_2 > 0] # Select a slice from a Series with labels (note the end point is inclusive): # In[53]: ser_2['a':'b'] # Assign to a Series slice (note the end point is inclusive): # In[54]: ser_2['a':'b'] = 0 ser_2 # Pandas supports indexing into a DataFrame. # In[55]: df_6 # Select specified columns from a DataFrame: # In[56]: df_6[['pop', 'unempl']] # Select a slice from a DataFrame: # In[57]: df_6[:2] # Select from a DataFrame based on a filter: # In[58]: df_6[df_6['pop'] > 5] # Perform a scalar comparison on a DataFrame: # In[59]: df_6 > 5 # Perform a scalar comparison on a DataFrame, retain the values that pass the filter: # In[60]: df_6[df_6 > 5] # Select a slice of rows from a DataFrame (note the end point is inclusive): # In[61]: df_6.ix[2:3] # Select a slice of rows from a specific column of a DataFrame: # In[62]: df_6.ix[0:2, 'pop'] # Select rows based on an arithmetic operation on a specific row: # In[63]: df_6.ix[df_6.unempl > 5.0] # ## Arithmetic and Data Alignment # Adding Series objects results in the union of index pairs if the pairs are not the same, resulting in NaN for indices that do not overlap: # In[64]: np.random.seed(0) ser_6 = Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e']) ser_6 # In[65]: np.random.seed(1) ser_7 = Series(np.random.randn(5), index=['a', 'c', 'e', 'f', 'g']) ser_7 # In[66]: ser_6 + ser_7 # Set a fill value instead of NaN for indices that do not overlap: # In[67]: ser_6.add(ser_7, fill_value=0) # Adding DataFrame objects results in the union of index pairs for rows and columns if the pairs are not the same, resulting in NaN for indices that do not overlap: # In[68]: np.random.seed(0) df_8 = DataFrame(np.random.rand(9).reshape((3, 3)), columns=['a', 'b', 'c']) df_8 # In[69]: np.random.seed(1) df_9 = DataFrame(np.random.rand(9).reshape((3, 3)), columns=['b', 'c', 'd']) df_9 # In[70]: df_8 + df_9 # Set a fill value instead of NaN for indices that do not overlap: # In[71]: df_10 = df_8.add(df_9, fill_value=0) df_10 # Like NumPy, pandas supports arithmetic operations between DataFrames and Series. # # Match the index of the Series on the DataFrame's columns, broadcasting down the rows: # In[72]: ser_8 = df_10.ix[0] df_11 = df_10 - ser_8 df_11 # Match the index of the Series on the DataFrame's columns, broadcasting down the rows and union the indices that do not match: # In[73]: ser_9 = Series(range(3), index=['a', 'd', 'e']) ser_9 # In[74]: df_11 - ser_9 # Broadcast over the columns and match the rows (axis=0) by using an arithmetic method: # In[75]: df_10 # In[76]: ser_10 = Series([100, 200, 300]) ser_10 # In[77]: df_10.sub(ser_10, axis=0) # ## Function Application and Mapping # NumPy ufuncs (element-wise array methods) operate on pandas objects: # In[78]: df_11 = np.abs(df_11) df_11 # Apply a function on 1D arrays to each column: # In[79]: func_1 = lambda x: x.max() - x.min() df_11.apply(func_1) # Apply a function on 1D arrays to each row: # In[80]: df_11.apply(func_1, axis=1) # Apply a function and return a DataFrame: # In[81]: func_2 = lambda x: Series([x.min(), x.max()], index=['min', 'max']) df_11.apply(func_2) # Apply an element-wise Python function to a DataFrame: # In[82]: func_3 = lambda x: '%.2f' %x df_11.applymap(func_3) # Apply an element-wise Python function to a Series: # In[83]: df_11['a'].map(func_3) # ## Sorting and Ranking # In[84]: ser_4 # Sort a Series by its index: # In[85]: ser_4.sort_index() # Sort a Series by its values: # In[86]: ser_4.sort_values() # In[87]: df_12 = DataFrame(np.arange(12).reshape((3, 4)), index=['three', 'one', 'two'], columns=['c', 'a', 'b', 'd']) df_12 # Sort a DataFrame by its index: # In[88]: df_12.sort_index() # Sort a DataFrame by columns in descending order: # In[89]: df_12.sort_index(axis=1, ascending=False) # Sort a DataFrame's values by column: # In[90]: df_12.sort_values(by=['d', 'c']) # Ranking is similar to numpy.argsort except that ties are broken by assigning each group the mean rank: # In[91]: ser_11 = Series([7, -5, 7, 4, 2, 0, 4, 7]) ser_11 = ser_11.sort_values() ser_11 # In[92]: ser_11.rank() # Rank a Series according to when they appear in the data: # In[93]: ser_11.rank(method='first') # Rank a Series in descending order, using the maximum rank for the group: # In[94]: ser_11.rank(ascending=False, method='max') # DataFrames can rank over rows or columns. # In[95]: df_13 = DataFrame({'foo' : [7, -5, 7, 4, 2, 0, 4, 7], 'bar' : [-5, 4, 2, 0, 4, 7, 7, 8], 'baz' : [-1, 2, 3, 0, 5, 9, 9, 5]}) df_13 # Rank a DataFrame over rows: # In[96]: df_13.rank() # Rank a DataFrame over columns: # In[97]: df_13.rank(axis=1) # ## Axis Indexes with Duplicate Values # Labels do not have to be unique in Pandas: # In[98]: ser_12 = Series(range(5), index=['foo', 'foo', 'bar', 'bar', 'baz']) ser_12 # In[99]: ser_12.index.is_unique # Select Series elements: # In[100]: ser_12['foo'] # Select DataFrame elements: # In[101]: df_14 = DataFrame(np.random.randn(5, 4), index=['foo', 'foo', 'bar', 'bar', 'baz']) df_14 # In[102]: df_14.ix['bar'] # ## Summarizing and Computing Descriptive Statistics # Unlike NumPy arrays, Pandas descriptive statistics automatically exclude missing data. NaN values are excluded unless the entire row or column is NA. # In[103]: df_6 # In[104]: df_6.sum() # Sum over the rows: # In[105]: df_6.sum(axis=1) # Account for NaNs: # In[106]: df_6.sum(axis=1, skipna=False) # ## Cleaning Data (Under Construction) # * Replace # * Drop # * Concatenate # In[107]: from pandas import Series, DataFrame import pandas as pd # Setup a DataFrame: # In[108]: data_1 = {'state' : ['VA', 'VA', 'VA', 'MD', 'MD'], 'year' : [2012, 2013, 2014, 2014, 2015], 'population' : [5.0, 5.1, 5.2, 4.0, 4.1]} df_1 = DataFrame(data_1) df_1 # ### Replace # Replace all occurrences of a string with another string, in place (no copy): # In[109]: df_1.replace('VA', 'VIRGINIA', inplace=True) df_1 # In a specified column, replace all occurrences of a string with another string, in place (no copy): # In[110]: df_1.replace({'state' : { 'MD' : 'MARYLAND' }}, inplace=True) df_1 # ### Drop # Drop the 'population' column and return a copy of the DataFrame: # In[111]: df_2 = df_1.drop('population', axis=1) df_2 # ### Concatenate # Concatenate two DataFrames: # In[112]: data_2 = {'state' : ['NY', 'NY', 'NY', 'FL', 'FL'], 'year' : [2012, 2013, 2014, 2014, 2015], 'population' : [6.0, 6.1, 6.2, 3.0, 3.1]} df_3 = DataFrame(data_2) df_3 # In[113]: df_4 = pd.concat([df_1, df_3]) df_4 # ## Input and Output (Under Construction) # * Reading # * Writing # In[114]: from pandas import Series, DataFrame import pandas as pd # ### Reading # Read data from a CSV file into a DataFrame (use sep='\t' for TSV): # In[115]: df_1 = pd.read_csv("../data/ozone.csv") # Get a summary of the DataFrame: # In[116]: df_1.describe() # List the first five rows of the DataFrame: # In[117]: df_1.head() # ### Writing # Create a copy of the CSV file, encoded in UTF-8 and hiding the index and header labels: # In[118]: df_1.to_csv('../data/ozone_copy.csv', encoding='utf-8', index=False, header=False) # View the data directory: # In[119]: get_ipython().system('ls -l ../data/')