#!/usr/bin/env python
# coding: utf-8
"""Walk-through of basic pandas DataFrame and Series operations.

Exported from a notebook: each ``# In[n]:`` marker is one original cell.
Results are printed explicitly so the script shows the same information
when run outside a notebook.
"""

# In[1]:

# import
import pandas as pd
import numpy as np

# #### Possible data inputs to DataFrame constructor
# ```
# 2D ndarray                      A matrix of data, passing optional row and column labels
# dict of arrays, lists, tuples   Each sequence becomes a column in the DataFrame. All sequences must be the same length.
# NumPy structured/record array   Treated as the "dict of arrays" case
# dict of Series                  Each value becomes a column. Indexes from each Series are unioned together to form
#                                 the result's row index if no explicit index is passed.
# dict of dicts                   Each inner dict becomes a column. Keys are unioned to form the row index as in the
#                                 "dict of Series" case.
# list of dicts or Series         Each item becomes a row in the DataFrame. Union of dict keys or Series indexes
#                                 become the DataFrame's column labels
# List of lists or tuples         Treated as the "2D ndarray" case
# Another DataFrame               The DataFrame's indexes are used unless different ones are passed
# NumPy MaskedArray               Like the "2D ndarray" case except masked values become NA/missing in the
#                                 DataFrame result
# ```

# In[2]:

state = ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada']
year = [2000, 2001, 2002, 2001, 2002]
pop = [1.5, 1.7, 3.6, 2.4, 2.9]
print(type(state), type(year), type(pop))

# In[3]:

# creating dataframe
df = pd.DataFrame({'state': state, 'year': year, 'pop': pop})
# info() writes its summary itself and returns None, so it is not wrapped in print()
df.info()

# In[4]:

print(df)

# In[5]:

sdata = {'state': state, 'year': year, 'pop': pop}
print(sdata, "\n", type(sdata))

# In[6]:

# `columns=` does not rename anything: it selects keys from the dict, and
# names that are not keys of sdata simply become columns without data
df = pd.DataFrame(sdata, columns=['pop1', 'state1', 'year1'])
print(df)

# In[7]:

# only the names that match keys of sdata ('state', 'year') pick up data
df = pd.DataFrame(sdata, columns=['pop1', 'state', 'year'])
print(df)

# In[8]:

df = pd.DataFrame(sdata)
print(df.columns)

# In[9]:

# renaming columns and index
# Rename through an explicit mapping: assigning a bare list to df.columns is
# positional and would mislabel the data, because the columns follow the
# dict's insertion order (state, year, pop), not alphabetical order.
df = df.rename(columns={'pop': 'pop1', 'state': 'state1', 'year': 'year1'})
df.index = ['one', 'two', 'three', 'four', 'five']
print(df)

# In[10]:

# stats about dataframe
print(df.index, "\n", df.shape, "\n", df.columns)

# In[11]:

# a scalar is broadcast to every row
df['pop1'] = 1.5
print(df)

# In[12]:

df['pop1'] = range(5)
print(df)

# In[13]:

# can access the data as
print(df['state1'])

# In[14]:

print(df.state1)

# In[15]:

# for deleting any columns
del df['pop1']
print(df)

# In[16]:

# transpose the dataframe
dft = df.T
print(dft)

# In[17]:

# using columns as an index
df.index = df['year1']
del df['year1']
print(df)

# In[18]:

print(df.columns.name, df.index.name)

# In[19]:

print(df.columns)

# In[20]:

# printing values
print(df.values)

# #### Index methods and properties
# ```
# append                    Concatenate with additional Index objects, producing a new Index
# difference                Compute set difference as an Index
# intersection              Compute set intersection
# union                     Compute set union
# isin                      Compute boolean array indicating whether each value is contained in the passed collection
# delete                    Compute new Index with element at index i deleted
# drop                      Compute new index by deleting passed values
# insert                    Compute new Index by inserting element at index i
# is_monotonic_increasing   Returns True if each element is greater than or equal to the previous element
# is_unique                 Returns True if the Index has no duplicate values
# unique                    Compute the array of unique values in the Index
# ```

# In[21]:

# Index objects are immutable: individual labels cannot be assigned in place
print(df.index)

# In[22]:

# df.index[2] = 2009   # raises TypeError, an Index does not support item assignment

# #### Reindex Series or DataFrame
# ```
# index        New sequence to use as index. Can be Index instance or any other sequence-like Python data
#              structure. An Index will be used exactly as is without any copying
# method       Interpolation (fill) method, e.g. 'ffill' or 'bfill'
# fill_value   Substitute value to use when introducing missing data by reindexing
# limit        When forward- or backfilling, maximum size gap to fill
# level        Match simple Index on level of MultiIndex, otherwise select subset of it
# copy         Do not copy underlying data if new index is equivalent to old index. True by default
#              (i.e. always copy data).
# ```

# In[23]:

print(df)

# In[24]:

print(df.index)

# In[25]:

# df.reindex([2000, 2001, 2002, 2001, 2002, 2009]) raises a ValueError here,
# because reindexing needs the labels of the axis being reindexed to be unique

# In[26]:

frame = pd.DataFrame(
    np.arange(9).reshape((3, 3)),
    index=['a', 'c', 'd'],
    columns=['Ohio', 'Texas', 'California'],
)
print(frame)

# In[27]:

frame2 = frame.reindex(['a', 'b', 'c', 'd'])
print(frame2)

# In[28]:

# likewise let's revert the df
df['year'] = df.index
df.index = [0, 1, 2, 3, 4]
print(df)

# In[29]:

# now we can reindex this df
# reindex keeps the labels that already exist (1, 2, 3, 4), drops the ones
# not requested (0) and creates all-NaN rows for the new ones (5, 6, 7)
df2 = df.reindex([1, 2, 3, 4, 5, 6, 7])
print(df2)

# In[30]:

# when every requested label already exists, label-based selection with
# .loc is the direct way to pick the rows
df3 = df2.loc[[1, 2, 3, 4, 6]]
print(df3)

# In[31]:

# can alter the columns as well
# 'population' does not exist yet, so reindex is used: .loc raises KeyError
# on missing labels, while reindex creates the column filled with NaN
new_columns = ['state1', 'year', 'population']
df4 = df3.reindex(index=[1, 2, 3, 4, 6], columns=new_columns)
print(df4)

# In[32]:

print(df4.columns)

# In[33]:

# renaming columns
df4.columns = ['state', 'year', 'pop']
print(df4)

# In[34]:

# dropping index or columns
df5 = df4.drop([3])
print(df5)

# In[35]:

df5 = df5.drop(['pop'], axis=1)
print(df5)

# ### Indexing, selection, and filtering

# In[36]:

print(df4)

# In[37]:

print(df4[df4['state'] == 'Ohio'])

# In[38]:

print(df4[['state', 'year']])

# In[39]:

# single .loc assignment instead of chained indexing (df4['year'][mask] = ...),
# which may write to a temporary copy and leave df4 unchanged
df4.loc[df4['state'] == 'Ohio', 'year'] = 2004

# In[40]:

print(df4)

# In[41]:

# .loc selects a subset of the rows and columns by axis labels,
# .iloc does the same by integer position
print(df4.loc[[1, 2], ['state']])

# In[42]:

# rows by label, columns by position (translated to labels via df4.columns)
print(df4.loc[[3, 6], df4.columns[[0, 2]]])

# In[43]:

# boolean row mask combined with positional column selection
print(df4.loc[df4['year'] < 2003, df4.columns[[0, 2]]])

# #### Indexing options with DataFrame
# ```
# obj[val]               Select single column or sequence of columns from the DataFrame. Special case
#                        conveniences: boolean array (filter rows), slice (slice rows), or boolean DataFrame
#                        (set values based on some criterion).
# obj.loc[val]           Selects single row or subset of rows from the DataFrame by label.
# obj.loc[:, val]        Selects single column or subset of columns by label.
# obj.loc[val1, val2]    Select both rows and columns by label.
# obj.iloc[...]          Same selections as .loc, but by integer position.
# reindex method         Conform one or more axes to new indexes.
# xs method              Select single row or column as a Series by label.
# at, iat                Select single value by row and column label (at) or integer position (iat).
# ```

# ### Arithmetic and data alignment

# In[44]:

s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])

# In[45]:

# labels present in only one of the two series give NaN
print(s1 + s2)

# In[46]:

df1 = pd.DataFrame(
    np.arange(9.).reshape((3, 3)),
    columns=list('bcd'),
    index=['Ohio', 'Texas', 'Colorado'],
)
df2 = pd.DataFrame(
    np.arange(12.).reshape((4, 3)),
    columns=list('bde'),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
)

# In[47]:

print(df1 + df2)

# #### Arithmetic methods with fill values

# In[48]:

print(df1.add(df2, fill_value=0))

# In[49]:

# when reindexing a Series or DataFrame, you can also specify a different fill value
print(df1.reindex(columns=df2.columns, fill_value=0))

# #### Flexible arithmetic methods
# ```
# add   Method for addition (+)
# sub   Method for subtraction (-)
# div   Method for division (/)
# mul   Method for multiplication (*)
# ```

# #### Operations between DataFrame and Series

# In[50]:

frame = pd.DataFrame(
    np.arange(12.).reshape((4, 3)),
    columns=list('bde'),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
)
print(frame)

# In[51]:

# picking the first row by position
series = frame.iloc[0]
print(series)

# In[52]:

print(frame * series)

# In[53]:

# By default, arithmetic between DataFrame and Series matches the index of the
# Series on the DataFrame's columns, broadcasting down the rows:
print(frame - series)

# In[54]:

series2 = pd.Series(range(3), index=['b', 'e', 'f'])
print(frame * series2)

# #### Function application and mapping

# In[55]:


def value_range(x):
    """Return the spread (max - min) of the values in ``x``."""
    return x.max() - x.min()


# In[56]:

frame = pd.DataFrame(
    np.random.randn(4, 3),
    columns=list('bde'),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
)
print(frame)

# In[57]:

# applied to each column
print(frame.apply(value_range))

# In[58]:

# applied to each row
print(frame.apply(value_range, axis=1))

# In[59]:


# defining a func
def f(x):
    """Return the max and min of ``x`` as a Series labelled 'max' and 'min'."""
    return pd.Series([x.max(), x.min()], index=['max', 'min'])


# In[60]:

print(frame.apply(f))

# In[61]:

print(frame.apply(f, axis=1))

# In[62]:


def two_decimals(x):
    """Format ``x`` as a string with two digits after the decimal point."""
    return '%.2f' % x


# element-wise formatting; Series.map per column is used because
# DataFrame.applymap is deprecated in recent pandas releases
print(frame.apply(lambda column: column.map(two_decimals)))

# #### Sorting and ranking

# In[63]:

obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])
print(obj)

# In[64]:

# sorting on index
print(obj.sort_index())

# In[65]:

frame = pd.DataFrame(
    np.arange(8).reshape((2, 4)),
    index=['three', 'one'],
    columns=['d', 'a', 'b', 'c'],
)
print(frame)

# In[66]:

print(frame.sort_index())

# In[67]:

print(frame.sort_index(axis=1))

# In[68]:

print(frame.sort_index(axis=1).sort_index())

# In[69]:

print(frame.sort_index(axis=1, ascending=False))

# In[70]:

# To sort a Series by its values, use its sort_values method
# (the values here are strings, so they are compared as text, not as numbers)
sr = pd.Series(['2', np.nan, '-3', '5'])
print(sr)

# In[71]:

# sorting by value
print(sr.sort_values())

# In[72]:

frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
print(frame)

# In[73]:

print(frame.sort_values(by='b'))

# In[74]:

print(frame.sort_values(by=['a', 'b']))

# In[75]:

# ranking
# Explore more
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
print(obj)

# In[76]:

print(obj.rank())

# #### Axis indexes with duplicate values

# In[77]:

obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
print(obj)

# In[78]:

# get unique index
print(obj.index.unique())

# In[79]:

# check if index are unique
print(obj.index.is_unique)

# In[80]:

df = pd.DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])
print(df)

# In[81]:

print(df.index.is_unique)

# In[82]:

# .loc selects rows by index label; a duplicated label returns every matching row
print(df.loc['a'])

# In[83]:

# .iloc selects rows by integer position
print(df.iloc[0])

# In[ ]: