#!/usr/bin/env python
# coding: utf-8

# ChEn-1070: Introduction to Chemical Engineering Spring 2019 UMass Lowell; Profs. Manohar and de Almeida **26Mar2019**
#
# # 04. Tabular Numerics
# ---
# ## Table of Contents
# * [Introduction](#introduction)
#   + Importing the `pandas` package
# * [Series](#series)
# * [Data Frame](#data-frame)
# ---

# ## Introduction
# A significant `Python` package that supports tabular (spreadsheet-like) calculations is called `Pandas`:
#
#  + [Home page](https://pandas.pydata.org/),
#  + [Documentation](http://pandas.pydata.org/pandas-docs/stable/).
#
# We will use `Pandas` to build tabular data and perform statistical analysis. We will access `Pandas` with the `import` keyword:
#
#  + `import pandas`
#
#

# In[1]:


'''Import pandas'''

import pandas as pd


# ## Series (1-Dimensional)
#
# It is a one-dimensional *labeled* array capable of holding any data type (integers, strings, floating point numbers, Python objects, etc.). The axis labels are collectively referred to as the index.
# For example, note how to create a `pandas` series:

# In[2]:


'''Create basic Pandas series'''

data = [1.10134,2.29211,3.383457,4.678454]
s0 = pd.Series(data)
print(s0)


# Note that the overall type is `float64` (*i.e.* floating point with double precision), and the default labels are added: 0,1,2,3

# In[3]:


'''Display number of significant digits'''

# NOTE: this is a global display option; it affects every float printed by pandas from here on.
pd.options.display.float_format = '{:.2e}'.format
print(s0)


# There are various [statistics](http://pandas.pydata.org/pandas-docs/stable/getting_started/basics.html#descriptive-statistics) in `pandas`:

# In[4]:


'''Statistics'''

print('s0 mean =',s0.mean())
print('s0 sum =',s0.sum())
print('s0 std =',s0.std())
print('s0 var =',s0.var())
print('s0 sem =',s0.sem())


#
# |Function   |Description|
# |-----------|-----------|
# |count      | Number of non-NA observations |
# |sum        | Sum of values |
# |mean       | Mean of values |
# |mad        | Mean absolute deviation (removed in pandas 2.0) |
# |median     | Arithmetic median of values |
# |min        | Minimum |
# |max        | Maximum |
# |mode       | Mode |
# |abs        | Absolute Value |
# |prod       | Product of values |
# |std        | Bessel-corrected sample standard deviation |
# |var        | Unbiased variance |
# |sem        | Standard error of the mean |
# |skew       | Sample skewness (3rd moment) |
# |kurt       | Sample kurtosis (4th moment) |
# |quantile   | Sample quantile (value at %) |
# |cumsum     | Cumulative sum |
# |cumprod    | Cumulative product |
# |cummax     | Cumulative maximum |
# |cummin     | Cumulative minimum |
#
#
# There is a convenient `describe()` function which computes a variety of summary statistics:

# In[5]:


'''Describe statistics summary of data'''

s0.describe()


# In[6]:


'''Data access with index operator'''

print("s0[3] = ",s0[3])


# In[7]:


'''Data assignment with index operator'''

# Writing a string into a float64 series is an implicit upcast, which pandas
# deprecated in 2.1 (PDEP-6). Convert to `object` explicitly first so the
# assignment is valid on every pandas version.
s0 = s0.astype(object)
s0[2] = 'hello'
print(s0)


# Note that the overall type is now `object` because of the mixed data type.

# In[8]:


'''Fast data access with integer index'''

s0.iat[-1]


# In[16]:


'''Slower than iat[], more generic (see below in Data Frames) access with integer index'''

s0.iloc[-2]


# In[10]:


'''Create a Pandas series w/ a dictionary'''

data = {'a':1,'b':2,'c':3,'d':4}
s1 = pd.Series(data)
print(s1)


# The overall type is `int64` (*i.e.* a 64-bit integer), but notice the labels are taken from the data dictionary (the keys are made as labels).

# In[11]:


'''Create a Pandas series w/ the same value'''

s2 = pd.Series( 8.0, index=['a', 'b', 'c', 'd'] )
print(s2)


# The data can be a repeated value (a scalar).

# In[12]:


'''Data access as a dictionary using the index operator'''

print("s1['b'] = ",s1['b'])
print("s2['d'] = ",s2['d'])


# In[18]:


'''Fast data access with integer index'''

s2.iat[-1]


# In[19]:


'''Create a Pandas series using lists'''

# `pd` is already imported at the top of the script; no second import needed.
labels = ['symbol','atomic number','group','period','mass number']

data = ['H',1,1,1,1]  # hydrogen
h1 = pd.Series( data, index=labels)
print(h1)
print('')

data = ['D',1,1,1,2]  # deuterium
h2 = pd.Series( data, index=labels)
print(h2)
print('')

data = ['Be-9',4,2,2,9]  # beryllium
be_9 = pd.Series( data, index=labels)
print(be_9)
print('')

data = ['Be-10',4,2,2,10]  # beryllium
be_10 = pd.Series( data, index=labels)
print(be_10)


# The overall type is `object` because the data types are mixed. Notice the index labels are more elaborate and provided as a list of `str`.

# ## Data Frame (2-Dimensional)
#
# It is a two-dimensional (table) *labeled* data structure with various data types. The **row** labels are collectively referred to as the index. The **column** labels appear as a new structure.
# For example, note how to create a `pandas` data frame:

# In[20]:


'''Using existing series as columns'''

data = {'S1':s1,'S2':s2}
df1 = pd.DataFrame( data )
print(df1)


# In[21]:


df1.columns


# In[22]:


'''Change column labels'''

df1.columns = ['x','y']
print(df1)


# In[23]:


'''Using a dictionary to create a table'''

# Each element is a dictionary of properties; the outer keys become the
# columns of the table and the inner keys become the row labels.
hydrogen  = {'symbol':'H',  'atomic_number':1, 'group':1,  'period':1, 'isotopes':[1,2,3]}
helium    = {'symbol':'He', 'atomic_number':2, 'group':18, 'period':1, 'isotopes':[3,4]}
lithium   = {'symbol':'Li', 'atomic_number':3, 'group':1,  'period':2, 'isotopes':[6,7]}
beryllium = {'symbol':'Be', 'atomic_number':4, 'group':2,  'period':2, 'isotopes':[9,10]}
boron     = {'symbol':'B',  'atomic_number':5, 'group':13, 'period':2, 'isotopes':[10,11]}

data = {'hydrogen':hydrogen,'helium':helium,'lithium':lithium,'beryllium':beryllium,'boron':boron}
df2 = pd.DataFrame(data)


# In[24]:


df2


# In[57]:


'''Column selection'''

type(df2['helium'])


# In[26]:


'''Rows of a column'''

# Use .iloc so the slice is explicitly positional; the column's index holds
# string labels, so a plain [1:-1] only works through a positional fallback.
df2['helium'].iloc[1:-1]


# Some data access operations:
#
# |Operation | Syntax    | Result|
# |----------|-----------|----------|
# |Select column | df[col] | Series |
# |Select row by label | df.loc[label] | Series |
# |Select row by integer location | df.iloc[loc] | Series |
# |Slice rows | df[5:10] | DataFrame |
# |Select rows by boolean vector | df[bool_vec] | DataFrame |
#
# There is more on data access and [slicing](http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#indexing).

# In[27]:


'''Access of "isotopes" row by label or also called index'''

df2.loc['isotopes']


# In[53]:


'''Access of a table entry using row and column labels'''

df2.loc['isotopes','boron']


# In[32]:


'''Access of "isotopes" row by index'''

# The row order of a table built from a dict of dicts has varied across pandas
# releases (sorted labels vs. insertion order), so look up the integer
# position of the "isotopes" row instead of hard-coding it.
isotopes_pos = df2.index.get_loc('isotopes')
df2.iloc[isotopes_pos]


# In[36]:


'''Access the last two rows by index'''

df2.iloc[-2:]


# In[43]:


'''Note that iat will not work'''

# iat needs one scalar row position AND one scalar column position;
# slices or a single position are rejected:
#df2.iat[-2:]
#df2.iat[-2]


# In[45]:


'''This pure index access will work'''

df2.iat[1,3] # row index 1, column index 3, that is: second row, fourth column


# In[ ]: