#!/usr/bin/env python # coding: utf-8 # # pandas Dataframe - Basic Operativity # In[1]: import addutils.toc ; addutils.toc.js(ipy_notebook=True) # In[2]: from addutils import css_notebook css_notebook() # See [pandas documentation]() for more information and examples. Run the code at the end of the Notebook to generate the example files. # In[3]: import numpy as np import pandas as pd from numpy import NaN from addutils import side_by_side2 from addutils import css_notebook from addutils import read_txt from IPython.core.display import HTML from faker import Factory css_notebook() # ## 1 File I/O and DataFrame Generation # Pandas reads and format data from many different file formats: `txt, csv, web, xls, mat`. In this case we use `read_csv` to read two textual data files. # # First have a look to the file in its original form: # In[4]: import warnings warnings.filterwarnings('ignore') from utilities.generators import generate_all generate_all() # In[5]: read_txt('temp/p01_prices.txt') # ### 1.1 Create DataFrames with read_csv # This file can be read and formatted at the same time using `read_csv`. Lets read the two files `p01_prices.txt` and `p01_volumes.txt` # In[ ]: prices = pd.read_csv('temp/p01_prices.txt', index_col=0, parse_dates=[0]) volumes = pd.read_csv('temp/p01_volumes.txt', index_col=0, parse_dates=[0]) HTML(side_by_side2(prices, volumes)) # Both "prices" and "volumes" datasets are 2D DataFrame objects: # In[ ]: type(prices) # ### 1.2 Create DataFrames from Python Dictionaries # In[ ]: fakeIT = Factory.create('it_IT') data = {'Name' : [fakeIT.name() for i in range(5)], 'Company' : [fakeIT.company() for i in range(5)], 'City' : [fakeIT.city() for i in range(5)]} # In[ ]: df = pd.DataFrame(data, columns = ['Name','Company','City']) df # ### 1.3 Create DataFrames from Items # In[ ]: df = pd.DataFrame.from_items([('Name', [fakeIT.name() for i in range(5)]), ('Company', [fakeIT.company() for i in range(5)])]) df # ### 1.4 Create DataFrames fron Numpy Arrays # In[ ]: df = pd.DataFrame(np.array([[2,5],[3,6]]).T, index=list('ab'), columns=['ONE','TWO']) df # ### 1.5 DataFrames can be converted in Numpy Arrays # In[ ]: np.asarray(df) # ### 1.6 DataFrames, Series and Panels # In Pandas there are 3 main data structure types ("data container" objects): # # * **Series:** for one-dimensional data # * **DataFrames:** for bi-dimensional data (matrices) # * **Panels:** for 3D or nD data # # For simplicity, in this course we will describe only pandas Series and DataFrames. # ## 2 Automatic Data Alignment # As you can see the dates has been interpreted correctly and used as row index. Notice that the rows in the two datafiles are misaligned, this is not a problem in pandas because the *Automatic Data Alignment* feature: an operation involving the two datasets will simply use `NaN` for the undefined (misaligned) values # In[ ]: prices*volumes # Which can be better formatted to a "2 decimal places float number" with comma as thousands separator (see Package Options): # In[ ]: pd.set_option('display.float_format', lambda x: '{:,.1f}'.format(x)) # formatting (prices*volumes).replace('nan', '-') # replacing NaN # Example: calculate the volume-weighted average price # In[ ]: vwap = (prices*volumes).sum()/volumes.sum() vwap.dropna() # ## 3 Indexing # ### 3.1 Label-Based Indexing # `.loc` is strictly label based, will raise KeyError when the items are not found, allowed inputs are: # # * A single label # * A list or array of labels [’a’, ’b’, ’c’] # * A slice object with labels [’a’:’f’] (note that contrary to usual python slices, both the start and the stop are included!) # In[ ]: HTML(side_by_side2(prices, prices.loc['2012-11-21':'2012-11-27',['AAPL', 'GOOG']])) # Columns can be selected without specifying the index: # In[ ]: HTML(side_by_side2(prices, prices[['AAPL', 'GOOG']])) # ### 3.2 Position-Based Indexing # `.iloc` is strictly position based, will raise KeyError when the items are out of bounds: # In[ ]: HTML(side_by_side2(prices, prices.iloc[1:5,[0, 1]])) # ## 4 DataFrame Basic Operations # ### 4.1 Reindex/Reorder rows and columns # In[ ]: data = np.array([[2,5,8,11],[3,6,9,12]]) d1 = pd.DataFrame(data.T, index=list('abce'), columns=['K','W']) HTML(side_by_side2(d1, pd.DataFrame(d1, index=list('baez'), columns=['W','K','T'])) ) # ### 4.2 Calculate new columns # In[ ]: d1['Z'] = d1['W']-d1['K'] d1['B'] = d1['W']>4 d1 # ### 4.3 Deleting rows and columns # In[ ]: d1['SUM'] = d1.sum(axis=1) HTML(side_by_side2(d1, d1.drop(['b', 'c'], axis=0), d1.drop(['Z', 'B'], axis=1))) # ### 4.4 Inserting colums in a specific position # In[ ]: d2 = d1.copy() # .copy() method is needed to create a new object. d2.insert(1, 'Exp(W)', np.exp(d1['W'])) HTML(side_by_side2(d1, d2, space=10)) # Example: Indexing rows to create a new column with empty values, then use the Forward Fill Padding to fill the gaps # In[ ]: d1['part'] = d1['K'].iloc[:2] d1['part'] = d1['part'].fillna(method='ffill') d1 # ### 4.5 Check if a value or a list of given values are contained in a specific column # In[ ]: # TODO: Upgrade side_by_side2 to include series HTML(side_by_side2(d1, d1['K'].isin([3, 8]))) # ### 4.6 Rename columns # In[ ]: HTML(side_by_side2(d1, d1.rename(columns={'K':'ONE','W':'TWO','Z':'THREE'}))) # ### 4.7 Iterate efficiently through rows # `iterrows` returns an iterator yielding each index value along with a Series containing the data in each row: # In[ ]: for row_index, row in d1.iterrows(): print(row_index, '**', ' - '.join([str(item) for item in row])) # `itertuples` returns an iterator yielding a tuple for each row in the DataFrame. The first element of the tuple is the row’s corresponding index value, while the remaining elements are the row values: # In[ ]: for t in d1.itertuples(): print(t) # ## 5 Duplicated Data # ### 5.1 Find duplicated data in columns # In[ ]: d3 = pd.read_csv('temp/p01_d2.csv', index_col=0) d3['a dup'] = d3.duplicated(['a']) d3['a+b dup'] = d3.duplicated(['a', 'b']) d3['a+b dup - take last'] = d3.duplicated(['a', 'b'], keep='last') d3 # ### 5.2 Remove Duplicates # In[ ]: d3.drop_duplicates(['a', 'b'],keep='last') # ## 6 Working with Large Arrays # ### 6.1 Control the DataFrame memory occupation # Let's start by generating a DataFrame from a Numpy Array. We'll see than there is no memory overhead on DataFrame Values: # In[ ]: rows, cols = 100, 100 np_array = np.array(np.random.randn(rows, cols), dtype=np.float64) d4 = pd.DataFrame(np_array) print ('Rows x Cols x 8: ', rows*cols*8) print ('np Array Memory Occupation: ', np_array.nbytes) print ('Dataframe Values Memory Occupation: ', d4.values.nbytes) print ('Dataframe Index Memory Occupation: ', d4.index.nbytes) print ('Dataframe Columns Memory Occupation: ', d4.columns.nbytes) # To reduce the memory occupation it's possible to change the value's dtype: # In[ ]: d4 = d4.astype(dtype=np.float16) print ('Dataframe Values Memory Occupation: ', d4.values.nbytes) # If the data is sparse the Dataframe can be sparsified as well to save further resources with the `to_sparse()` method: # In[ ]: d4.iloc[2:,4:] = np.nan print ('Dataframe Values Memory Occupation: ', d4.values.nbytes) d4 = d4.to_sparse() print ('Dataframe Values Memory Occupation: ', d4.values.nbytes) # In this case rows and colums are np.int64 arrays: # In[ ]: d4.columns # ### 6.2 Explore large arrays # Working with large arrays: in Excel is difficult to explore arrays with thousands of lines and columns. Explore the pandas capabilities with the following code. The first line visualize the firts two lines, while the second actually load the whole file. Try to do the same in Excel for comparison. # In[ ]: d3 = pd.read_csv('example_data/p01_d3.csv.gz', compression='gzip') # In[ ]: for col in d3.columns: print (col, end=' - ') # Try by yourself: # # d3.head() # d3[d3.columns[:3]].head() # d3[d3.columns[-4:-1]].tail() # d3.ix[1000:1010, :7] # d3.ix[:, 'Abitanti'].describe() # In[ ]: d3[d3.columns[-4:-1]].tail() # ## 7 Column pct_change and shift # It is possibile to add multiple new columns to a `DataFrame`. # In[ ]: data = np.array([[3, 5, 7, 10, 13, 16, 56, 72], [8, 16, 28, 37, 45, 57, 69, 90], [3, 6, NaN, NaN, 15, 18, NaN, NaN], [1, 2, 4, 7, 11, 16, 65, 88], [NaN, NaN, NaN, NaN, 16, 19, 82, 91]]) d4 = pd.DataFrame(data.T, columns=['one', 'two', 'three', 'four', 'five']) d4 # In[ ]: d4[['one ret','two ret']] = d4[['one','two']].pct_change()+1 d4 # In[ ]: d4['four var'] = np.log(d4['four'] - d4['four'].shift()) d4 # ## 8 Reindex # `DataFrame.reindex` method conforms a `DataFrame` to a new index, filling cells with no values. It is possible to use this method to rearrange columns. # In[ ]: d5 = d4.reindex(columns=['one','one ret','two','two ret','four','four var']) d5 # Notice that `DataFrame.reindex` gives a new view, hence `d4` isn't changed. # In[ ]: d4 # ## 9 More on Indexing: Multi Index # In[ ]: d6 = pd.read_csv('temp/p01_d4.csv', index_col=['Country', 'Number', 'Dir']) d6 = d6.sort_index(level=0) d6 # Try by yourself: # # d6.loc['Fra'] # d6.loc['Fra', 'two'] # d6.loc['Fra':'Ger'] # d6.reorder_levels([2,1,0], axis=0).sortlevel(0) # d6.reset_index(level=1) # In[ ]: d6.loc['Fra'] # In[ ]: d6 = d6.sort_index(level=0) d6 # ## 10 Package Options # The way the DataFrames are displayed can be customized ijn many ways: ([See documentation](http://pandas.pydata.org/pandas-docs/stable/basics.html#working-with-package-options)): # In[ ]: print ('Display Max Rows: \t', pd.get_option('display.max_rows')) pd.describe_option('display.max_rows') # In[ ]: pd.set_option('display.max_rows', 10) pd.get_option('display.max_rows') # In[ ]: pd.reset_option('display.max_rows') pd.get_option('display.max_rows') # **Try by yourself**: # # pd.describe_option('display.chop_threshold') # pd.describe_option('display.colheader_justify') # pd.describe_option('display.column_space') # pd.describe_option('display.date_dayfirst') # pd.describe_option('display.date_yearfirst') # pd.describe_option('display.encoding') # pd.describe_option('display.expand_frame_repr') # pd.describe_option('display.float_format') # pd.describe_option('display.max_columns') # pd.describe_option('display.max_colwidth') # pd.describe_option('display.max_rows') # pd.describe_option('display.notebook_repr_html') # pd.describe_option('display.precision') # # for more options see documentation... # In[ ]: pd.set_option('display.precision', 2) pd.set_option('display.notebook_repr_html', False) prices # In[ ]: pd.set_option('display.notebook_repr_html', True) prices # --- # # Visit [www.add-for.com]() for more tutorials and updates. # # This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.