#!/usr/bin/env python
# coding: utf-8

# In[1]:


# import
import pandas as pd
import numpy as np


# #### Possible data inputs to DataFrame constructor
# ```
# 2D ndarray                             A matrix of data, passing optional row and column labels
# dict of arrays, lists, or tuples       Each sequence becomes a column in the DataFrame. All sequences must be the same length.
# NumPy structured/record array          Treated as the “dict of arrays” case
# dict of Series                         Each value becomes a column. Indexes from each Series are unioned together to form the result’s row index if no explicit index is passed.
# dict of dicts                          Each inner dict becomes a column. Keys are unioned to form the row index as in the “dict of Series” case.
# list of dicts or Series                Each item becomes a row in the DataFrame. Union of dict keys or Series indexes become the DataFrame’s column labels
# List of lists or tuples                Treated as the “2D ndarray” case
# Another DataFrame                      The DataFrame’s indexes are used unless different ones are passed
# NumPy MaskedArray                      Like the “2D ndarray” case except masked values become NA/missing in the DataFrame result
# ```

# In[2]:


# One (state, year, population) record per row; transposed below into the
# three parallel column lists used by the following cells.
_records = [
    ('Ohio', 2000, 1.5),
    ('Ohio', 2001, 1.7),
    ('Ohio', 2002, 3.6),
    ('Nevada', 2001, 2.4),
    ('Nevada', 2002, 2.9),
]
state, year, pop = (list(column) for column in zip(*_records))

# all three are plain Python lists
print(*(type(column) for column in (state, year, pop)))


# In[3]:


# creating dataframe: each list becomes one column, keyed by the dict key
df = pd.DataFrame({'state': state, 'year': year, 'pop': pop})
# info() writes its summary to stdout itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
df.info()


# In[4]:


print(df)


# In[5]:


# the same three columns, collected in a plain dict keyed by column name
sdata = dict(state=state, year=year, pop=pop)
print(sdata, type(sdata), sep=" \n ")


# In[6]:


# `columns=` selects and orders keys of the dict; it does not rename them.
# None of these labels is a key of sdata, so the resulting frame carries the
# three column labels but none of sdata's values.
unmatched_labels = ['pop1', 'state1', 'year1']
df = pd.DataFrame(sdata, columns=unmatched_labels)
print(df)


# In[7]:


# 'state' and 'year' are keys of sdata, so their data is picked up;
# 'pop1' matches nothing and is created without data.
partly_matched_labels = ['pop1', 'state', 'year']
df = pd.DataFrame(sdata, columns=partly_matched_labels)
print(df)


# In[8]:


df = pd.DataFrame(sdata)
print(df.columns)


# In[9]:


# renaming columns and index
# Columns are renamed through an explicit old -> new mapping instead of a
# positional list: the column order of a dict-built frame is not the same on
# every Python/pandas version (insertion order vs. sorted keys), so a
# positional list can silently put e.g. the 'pop1' label on the state column.
df = df.rename(columns={'pop': 'pop1', 'state': 'state1', 'year': 'year1'})
# the index has no old labels worth mapping, so it is replaced wholesale
df.index = ['one', 'two', 'three', 'four', 'five']
print(df)


# In[10]:


# row labels, (rows, columns) shape and column labels of the frame
print(df.index, df.shape, df.columns, sep=" \n ")


# In[11]:


# a scalar assigned to a column is repeated on every row
df['pop1'] = 1.5
print(df)


# In[12]:


# a sequence assigned to a column supplies one value per row (five here)
df['pop1'] = range(len(df.index))
print(df)


# In[13]:


# a column can be read with dict-style lookup ...
print(df['state1'])


# In[14]:


# ... or as an attribute
print(df.state1)


# In[15]:


# removing a column
df = df.drop('pop1', axis=1)
print(df)


# In[16]:


# transpose the dataframe (rows become columns and vice versa)
dft = df.T
print(dft)


# In[17]:


# using columns as an index
# set_index moves the column into the index in one step; the index keeps the
# column's name ('year1') and the column disappears from the frame.
df = df.set_index('year1')
print(df)


# In[18]:


# Results are printed explicitly from here on: a bare expression only shows
# output in a notebook and is a silent no-op when this file runs as a script.
print(df.columns.name, df.index.name)


# In[19]:


print(df.columns)


# In[20]:


# printing values (the underlying data as a NumPy array)
print(df.values)


# #### Index methods and properties
# ```
# append               Concatenate with additional Index objects, producing a new Index
# difference           Compute set difference as an Index (named diff in early pandas versions)
# intersection         Compute set intersection
# union                Compute set union
# isin                 Compute boolean array indicating whether each value is contained in the passed collection
# delete               Compute new Index with element at index i deleted
# drop                 Compute new index by deleting passed values
# insert               Compute new Index by inserting element at index i
# is_monotonic_increasing   Returns True if each element is greater than or equal to the previous element (formerly is_monotonic)
# is_unique            Returns True if the Index has no duplicate values
# unique               Compute the array of unique values in the Index
# ```

# In[21]:


# Series and DataFrame indexes are immutable: individual labels cannot be
# changed in place (the whole index can still be replaced, as done earlier).
print(df.index)


# In[22]:


# df.index[2] = 2009  # raises TypeError: Index does not support mutable operations


# #### Reindex Series or DataFrame
# ```
# index            New sequence to use as index. Can be Index instance or any other sequence-like Python data structure. An Index will be used exactly as is without any copying
# method           Interpolation (fill) method, see Table 5-4 for options.
# fill_value       Substitute value to use when introducing missing data by reindexing
# limit            When forward- or backfilling, maximum size gap to fill
# level            Match simple Index on level of MultiIndex, otherwise select subset of
# copy             Do not copy underlying data if new index is equivalent to old index. True by default (i.e. always copy data).
# ```

# In[23]:


print(df)


# In[24]:


# printed explicitly: a bare `df.index` shows nothing when run as a script
print(df.index)


# In[25]:


# df2 = df.reindex([2000, 2001, 2002, 2001, 2002, 2009]) 
# this raises a ValueError: reindex only works when the existing index has no duplicate labels,
# and the year-based index set above repeats 2001 and 2002


# In[26]:


# 3x3 integer frame whose row labels deliberately skip 'b'
frame = pd.DataFrame(
    np.arange(9).reshape(3, 3),
    index=list('acd'),
    columns=['Ohio', 'Texas', 'California'],
)
print(frame)


# In[27]:


# reindexing to a..d keeps the existing rows and adds 'b' as an all-NaN row
frame2 = frame.reindex(index=list('abcd'))
print(frame2)


# In[28]:


# likewise let's revert the df: copy the index values back into a 'year'
# column and go back to plain positional row labels
df['year'] = df.index
df.index = list(range(5))
print(df)


# In[29]:


# now we can reindex this df
# reindex keeps the rows whose labels already exist (1-4), leaves out the ones
# not asked for (0) and adds empty rows for labels that are new (5, 6, 7)
wanted_labels = list(range(1, 8))
df2 = df.reindex(wanted_labels)
print(df2)


# In[30]:


# Label-based row selection. Every requested label exists in df2, so .loc is
# sufficient (.ix, used here originally, was removed in pandas 1.0).
df3 = df2.loc[[1, 2, 3, 4, 6]]
print(df3)


# In[31]:


# CAN ALter the columns as well
# 'population' is not an existing column. reindex creates it filled with NaN,
# which is what .ix used to do; .loc would raise KeyError for a missing label.
new_columns = ['state1', 'year', 'population']
df4 = df3.reindex(index=[1, 2, 3, 4, 6], columns=new_columns)
print(df4)


# In[32]:


# printed explicitly: a bare `df4.columns` shows nothing when run as a script
print(df4.columns)


# In[33]:


# renaming columns
# explicit old -> new mapping; 'year' keeps its name so it is not listed
df4 = df4.rename(columns={'state1': 'state', 'population': 'pop'})
print(df4)


# In[34]:


# dropping index or columns
# drop returns a new frame; with the default axis the labels refer to rows
df5 = df4.drop([3])
print(df5)


# In[35]:


# axis=1 makes the labels refer to columns instead
df5 = df5.drop(['pop'], axis=1)
print(df5)


# ### Indexing, selection, and filtering

# In[36]:


# Results are printed explicitly: bare expressions only display in a notebook.
print(df4)


# In[37]:


# a boolean Series inside [] filters rows
print(df4[df4['state'] == 'Ohio'])


# In[38]:


# a list of labels inside [] selects columns
print(df4[['state', 'year']])


# In[39]:


# One .loc call takes the row mask and the column label together, so the
# assignment lands in df4 itself. The chained form df4['year'][mask] = ...
# assigns into an intermediate object: it triggers SettingWithCopyWarning and
# leaves df4 unchanged when pandas copy-on-write is active.
df4.loc[df4['state'] == 'Ohio', 'year'] = 2004


# In[40]:


print(df4)


# In[41]:


# .loc selects a subset of the rows and columns from a DataFrame by label,
# with NumPy-like notation (it replaces .ix, which was removed in pandas 1.0)
print(df4.loc[[1, 2], ['state']])


# In[42]:


# rows by label, columns by position: .ix accepted that mix directly; with
# .loc the positions 0 and 2 are first translated into column labels
print(df4.loc[[3, 6], df4.columns[[0, 2]]])


# In[43]:


# rows by boolean mask, columns again by position 0 and 2
print(df4.loc[df4['year'] < 2003, df4.columns[[0, 2]]])


# #### Indexing options with DataFrame
# ```
# obj[val]              Select single column or sequence of columns from the DataFrame. Special case conveniences: boolean array (filter rows), slice (slice rows), or boolean DataFrame (set values based on some criterion).
# obj.ix[val]           Selects single row or subset of rows from the DataFrame.
# obj.ix[:, val]        Selects single column or subset of columns.
# obj.ix[val1, val2]    Select both rows and columns. 
# reindex method        Conform one or more axes to new indexes. 
# xs method             Select single row or column as a Series by label.
# icol, irow methods    Select single column or row, respectively, as a Series by integer location.
# get_value, set_value methods             Select single value by row and column label.
# NOTE: ix, icol, irow, get_value and set_value have been removed from current pandas; use loc (label-based), iloc (position-based), at and iat instead.
# ```

# ### Arithmetic and data alignment

# In[44]:


# two Series whose indexes only partly overlap ('a', 'c' and 'e' are shared)
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])


# In[45]:


# Addition aligns on index labels; a label found in only one of the Series
# gets NaN. Printed explicitly, since a bare expression shows nothing when
# this file runs as a script.
print(s1 + s2)


# In[46]:


# two frames that share only some row labels (Ohio, Texas) and some column
# labels ('b', 'd')
df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'), index=['Ohio', 'Texas', 'Colorado'])
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])


# In[47]:


# Alignment happens on both axes; any cell missing from either frame is NaN.
# Results are printed explicitly: bare expressions only display in a notebook.
print(df1 + df2)


# #### Arithmetic methods with fill values

# In[48]:


# fill_value=0 stands in for a cell missing from ONE frame; cells missing
# from both frames are still NaN
print(df1.add(df2, fill_value=0))


# In[49]:


# when reindexing a Series or DataFrame, you can also specify a different fill value
print(df1.reindex(columns=df2.columns, fill_value=0))


# #### Flexible arithmetic methods
# ```
# add   Method for addition (+)
# sub   Method for subtraction (-)
# div   Method for division (/)
# mul   Method for multiplication (*)
# ```

# #### Operations between DataFrame and Series

# In[50]:


# 4x3 float frame: rows are states, columns are 'b', 'd', 'e'
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
print(frame)


# In[51]:


# picking the first row by position (.iloc replaces .ix, removed in pandas 1.0)
series = frame.iloc[0]
print(series)


# In[52]:


print(frame * series)


# In[53]:


# By default, arithmetic between DataFrame and Series matches the index of the Series on the DataFrame's columns,
# broadcasting down the rows:
print(frame - series)


# In[54]:


# labels present in only one operand ('d' in frame, 'f' in series2) end up
# as all-NaN columns in the result
series2 = pd.Series(range(3), index=['b', 'e', 'f'])
print(frame * series2)


# #### Function application and mapping

# In[55]:


def f(x):
    """Return the spread of ``x``: its maximum minus its minimum."""
    return x.max() - x.min()


# In[56]:


# random values, so the printed numbers differ from run to run
frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
print(frame)


# In[57]:


# default axis: f receives each column, giving one spread per column.
# Printed explicitly, since bare expressions only display in a notebook.
print(frame.apply(f))


# In[58]:


# axis=1: f receives each row, giving one spread per row
print(frame.apply(f, axis=1))


# In[59]:


# defining a func
def f(x):
    """Return the largest and smallest value of ``x`` as a Series labelled 'max' and 'min'."""
    labels = ['max', 'min']
    # call x.max() and x.min() in label order
    extremes = [getattr(x, label)() for label in labels]
    return pd.Series(extremes, index=labels)


# In[60]:


# f returns a Series here, so apply assembles a DataFrame: one 'max'/'min'
# pair per column. Printed explicitly, since bare expressions only display
# in a notebook.
print(frame.apply(f))


# In[61]:


# axis=1: one 'max'/'min' pair per row
print(frame.apply(f, axis=1))


# In[62]:


def two_decimals(value):
    """Format a number as a string with two decimal places."""
    # named two_decimals rather than `format` so the builtin is not shadowed
    return '%.2f' % value


# Element-wise formatting, done column by column with Series.map.
# DataFrame.applymap, used here originally, is deprecated in recent pandas
# (renamed DataFrame.map); Series.map is available on every version.
print(frame.apply(lambda column: column.map(two_decimals)))


# #### Sorting and ranking

# In[63]:


# Results in this section are printed explicitly: bare expressions only
# display in a notebook and do nothing when this file runs as a script.
obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])
print(obj)


# In[64]:


# sorting on index (returns a new, sorted object)
print(obj.sort_index())


# In[65]:


frame = pd.DataFrame(np.arange(8).reshape((2, 4)), index=['three', 'one'], columns=['d', 'a', 'b', 'c'])
print(frame)


# In[66]:


# default axis: sort by row labels
print(frame.sort_index())


# In[67]:


# axis=1: sort by column labels
print(frame.sort_index(axis=1))


# In[68]:


# both: columns first, then rows
print(frame.sort_index(axis=1).sort_index())


# In[69]:


# column labels in descending order
print(frame.sort_index(axis=1, ascending=False))


# In[70]:


# To sort a Series by its values, use its sort_values method (called `order`
# in early pandas versions). The values here are strings, so they sort as
# text, not as numbers.
sr = pd.Series(['2', np.nan, '-3', '5'])
print(sr)


# In[71]:


# sorting by value; the missing value ends up at the end of the result
print(sr.sort_values())


# In[72]:


frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
print(frame)


# In[73]:


# sort rows by the values of one column (printed explicitly: a bare
# expression shows nothing when this file runs as a script)
print(frame.sort_values(by='b'))


# In[74]:


# sort by 'a' first; 'b' breaks ties between rows with equal 'a'
print(frame.sort_values(by=['a', 'b']))


# In[75]:


# ranking   # Explore more
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
print(obj)


# In[76]:


# rank 1 is the smallest value; tied values share the mean of their ranks
# (e.g. the two 7s get 6.5). Printed explicitly, since a bare expression
# shows nothing when this file runs as a script.
print(obj.rank())


# #### Axis indexes with duplicate values

# In[77]:


# index labels may repeat: 'a' and 'b' each appear twice
obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
print(obj)


# In[78]:


# get unique index (printed explicitly: bare expressions only display in a
# notebook)
print(obj.index.unique())


# In[79]:


# check if index are unique
print(obj.index.is_unique)


# In[80]:


# random 4x3 frame whose row labels repeat ('a' twice, 'b' twice)
df = pd.DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])
print(df)


# In[81]:


print(df.index.is_unique)


# In[82]:


# .loc selects rows by label; with a duplicated label it returns every
# matching row (.ix, used here originally, was removed in pandas 1.0)
print(df.loc['a'])


# In[83]:


# .iloc selects by integer position: the first row, whatever its label
print(df.iloc[0])


# In[ ]: