#!/usr/bin/env python
# coding: utf-8
#
# Pandas walkthrough (exported from a Jupyter notebook): object creation,
# viewing/selecting data, missing data, stats, merge/join, grouping,
# reshaping, time series, categoricals, plotting and file I/O.
# Each bare expression is a notebook cell output; under plain `python`
# those values are computed and discarded.

import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# '%matplotlib inline' only exists under IPython/Jupyter; skip it instead of
# crashing with NameError when the script is run with plain `python`.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

# #### Object Creation - Series and DataFrame

# Creating a Series (1-D labelled array).
# FIX: the original passed a nested list ([[...]]), which builds a
# one-element Series holding a Python list instead of six numeric entries.
s1 = pd.Series([1, 3, 5, np.nan, 6, 8])
s1

# Creating DataFrames: first a DatetimeIndex for the row labels.
dates = pd.date_range('20130101', periods=6)
dates

df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
df

# Creating a DataFrame by passing a dict of objects that can be converted
# to series-like.
df2 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo',
})
df2

# Checking data types (one dtype per column).
df2.dtypes

# Use df2.<TAB> for column-name completion as well as attributes which can
# work on the DataFrame.

# #### Viewing Data

df.head()      # first 5 records
df.tail(3)     # last 3 records
df.index       # row labels
df.columns     # column names
df.values      # underlying numpy array

# #### Summary of Data

df.info()      # information about df: dtypes, non-null counts, memory
df.describe()  # descriptive statistics summary

# #### Transposing and Sorting the data

df.T

# Sorting by labels: axis=1 orders the COLUMN labels, not the rows.
df.sort_index(axis=1, ascending=False)

# Sorting by values
df.sort_values(by="B")
df.sort_values(by=["B", "A"])

# #### Selection of Data
#
# In pandas, data can be accessed with these methods:
# .at, .iat, .loc, .iloc (and the long-deprecated .ix).

df['A']                         # selecting column A
df[0:3]                         # slicing rows by position
df['2013-01-01':'2013-01-03']   # slicing rows by index label

# #### Selection by label

df.loc[dates[0]]                # cross-section using a label
df.loc[:, ['A', 'B']]           # [row, column]

# Showing label slicing: BOTH endpoints are included.
df.loc['20130102':'20130104', ['A', 'B']]
df.loc['20130102', ['A', 'B']]
df.loc['20130102', ['A']]       # list of columns -> still a Series
df.loc['20130102', 'B']         # single label pair -> scalar value
# For faster scalar access:
# df.at['20130102', 'A']

# #### Selection by Position

df.iloc[3]                 # row at integer position 3
df.iloc[3:5, 2:4]          # rows 3-4, columns 2-3 (numpy-style slices)
df.iloc[[1, 3, 4], 2:4]    # explicit lists of integer positions
df.iloc[1:3, :]            # slicing rows explicitly
df.iloc[:, 1:3]            # slicing columns explicitly
df.iloc[1, 1]              # getting a value explicitly
df.loc['2013-01-02', 'B']
df.iat[1, 1]               # fast scalar access (equivalent to the prior)

# #### Boolean Indexing

df.B > 0
df[df.B > 0]
df[df > 0]

# Using the isin() method for filtering:
df3 = df.copy()
df3['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
df3
df3[df3['E'].isin(['two', 'four'])]

# #### Setting a new column

s1 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20130102', periods=6))
s1
df['F'] = s1   # aligned on the index; the 2013-01-01 row gets NaN
df

# Setting values by label
df.at[dates[0], 'A'] = 0
df

# Setting values by position
df.iat[0, 1] = 0
df

# Setting by assigning with a numpy array
df.loc[:, 'D'] = np.array([5] * len(df))
df

df2 = df.copy()
df2

# Replace every positive value with its negative...
df2[df2 > 0] = -df2
df2
# ...then flip the (now all non-positive) values back.
df2[df2 < 0] = -df2
df2

# #### Missing Data

# Reindexing allows you to change/add/delete the index on a specified axis.
# This returns a copy of the data.
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1
df1.loc[dates[1], 'E'] = 1
df1

# Dropping rows that have missing data:
df1.dropna(how='any')   # drop a row if ANY column is NULL/NaN
df1.dropna()            # same thing: how='any' is the default
df1.dropna(how='all')   # drop a row only if ALL columns are NULL/NaN

# #### Filling missing data

df1.fillna(3)
df1.fillna(value=4)
df1.fillna({'F': 3, 'E': 2.9})   # fill F column with 3 and E column with 2.9
pd.isnull(df1)                   # boolean mask of missing values

# #### Stats

df.info()
df.describe()
df.count()
df.mean()     # column-wise
df.mean(1)    # row-wise
df.std()
df.std(1)

s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates)
s
s = s.shift(2)   # shifting the content down by 2 index positions
s
df.sub(s, axis='index')

# #### Apply

df.apply(np.cumsum)
df.apply(lambda x: x.max() - x.min())

# #### Histogramming

s = pd.Series(np.random.randint(0, 7, size=10))
s
s.value_counts()   # checking unique value counts

# #### String Methods

s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
s
s.str.lower()

# #### Merge
# 1. Concat

df = pd.DataFrame(np.random.randn(10, 4))
df
pieces = [df[:3], df[3:7], df[7:]]
pieces
pd.concat(pieces)           # concat row-wise
pd.concat(pieces, axis=1)   # concat column-wise

# #### Join
#
# pd.merge(left, right, how='inner', on=None, left_on=None, right_on=None,
#          left_index=False, right_index=False, sort=True,
#          suffixes=('_x', '_y'), copy=True, indicator=False)

left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})
left
right
pd.merge(left, right, on='key')   # duplicate keys -> cartesian product (4 rows)

left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]})
left
pd.merge(left, right, on='key')

# #### Append

df = pd.DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
df
s = df.iloc[3]
s
# FIX: DataFrame.append() was deprecated and removed in pandas 2.0, and the
# original discarded its return value anyway; pd.concat is the supported
# spelling for appending a row.
pd.concat([df, s.to_frame().T], ignore_index=True)
df

# #### Grouping
#
# By "group by" we are referring to a process involving one or more of the
# following steps:
#   1. Splitting the data into groups based on some criteria
#   2. Applying a function to each group independently
#   3. Combining the results into a data structure

df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
                   'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
                   'C': np.random.randn(8),
                   'D': np.random.randn(8)})
df
df.groupby('A').sum()
df.groupby(['A', 'B']).sum()

# #### Reshaping

tuples = list(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
                    ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]))
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
index
df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
df

# The stack() method "compresses" a level in the DataFrame's columns.
df.stack()

# With a "stacked" DataFrame or Series (having a MultiIndex as the index),
# the inverse operation of stack() is unstack(), which by default unstacks
# the last level:
df.unstack()            # unstacks the inner index (last level)
df.unstack('first')     # unstacking by label
df.unstack('second')
df.unstack(0)           # unstacking by integer level

# #### Pivot Tables

df = pd.DataFrame({'A': ['one', 'one', 'two', 'three'] * 3,
                   'B': ['A', 'B', 'C'] * 4,
                   'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
                   'D': np.random.randn(12),
                   'E': np.random.randn(12)})
df
pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])

# #### Time Series

rng = pd.date_range('1/1/2012', periods=100, freq='S')
rng[:5]
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
ts[:8]
ts.sum()
ts.resample('5Min').sum()   # downsample second-level data into 5-minute bins
ts.resample('5Min')         # the bare Resampler object (no aggregation yet)

# Time zone representation
rng = pd.date_range('3/6/2012 00:00', periods=5, freq='D')
rng
ts = pd.Series(np.random.randn(len(rng)), rng)
ts
ts_utc = ts.tz_localize('UTC')
ts_utc
# Convert to another time zone
ts_utc.tz_convert('Asia/Calcutta')

# Converting between time-span representations
rng = pd.date_range('1/1/2012', periods=5, freq='M')
ts = pd.Series(np.random.randn(len(rng)), index=rng)
ts
ps = ts.to_period()
ps
ps.to_timestamp()

# Converting between period and timestamp enables some convenient arithmetic
# functions to be used. In the following example, we convert a quarterly
# frequency with year ending in November to 9am of the end of the month
# following the quarter end:
prng = pd.period_range('1990Q1', '2000Q4', freq='Q-NOV')
ts = pd.Series(np.random.randn(len(prng)), prng)
ts.head()
ts.index = (prng.asfreq('M', 'e') + 1).asfreq('H', 's') + 9
ts.head()

# #### Categoricals

df = pd.DataFrame({"id": [1, 2, 3, 4, 5, 6],
                   "raw_grade": ['a', 'b', 'b', 'a', 'a', 'e']})
df
df.dtypes
df["grade"] = df["raw_grade"].astype("category")
df["grade"]

# Rename the categories to more meaningful names.
# FIX: assigning to Series.cat.categories in place was deprecated and has
# been removed; rename_categories() is the supported API.
df["grade"] = df["grade"].cat.rename_categories(["very good", "good", "very bad"])
df

# Reorder the categories and simultaneously add the missing categories
# (methods under Series.cat return a new Series by default).
df["grade"] = df["grade"].cat.set_categories(
    ["very bad", "bad", "medium", "good", "very good"])
df

# Sorting is per order in the categories, not lexical order.
df.sort_values(by="grade")

# Grouping by a categorical column shows also empty categories.
df.groupby("grade").size()

# #### Plotting

ts = pd.Series(np.random.randn(1000),
               index=pd.date_range('1/1/2000', periods=1000))
ts = ts.cumsum()
ts.plot()

# On a DataFrame, plot() is a convenience to plot all columns with labels:
df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index,
                  columns=['A', 'B', 'C', 'D'])
df = df.cumsum()
df.plot(grid=True)

# #### Data Read and Write

df[:4]

# CSV: to_csv and read_csv
# FIX: to_csv raises if the target folder does not exist, so create it first.
os.makedirs("dataset", exist_ok=True)
df.to_csv("dataset/df_as_csv.csv")
df2 = pd.read_csv("dataset/df_as_csv.csv")
df2.head()

# HDF5: to_hdf and read_hdf (requires the optional 'tables' package)
df.to_hdf('foo.h5', 'df')
pd.read_hdf('foo.h5', 'df').head()

# Excel: to_excel and read_excel (requires an Excel engine, e.g. openpyxl)
df.to_excel('foo.xlsx', sheet_name='Sheet1')
pd.read_excel('foo.xlsx', 'Sheet1', index_col=None, na_values=['NA']).head()