#!/usr/bin/env python
# coding: utf-8

# # 十分钟上手 Pandas

# `pandas` 是一个 `Python Data Analysis Library`。
# 
# 安装请参考官网的教程，如果安装了 `Anaconda`，则不需要安装 `pandas` 库。

# In[1]:


# IPython magic: draw matplotlib figures inline in the notebook output.
# `get_ipython` is only defined when running under IPython/Jupyter.
get_ipython().run_line_magic('matplotlib', 'inline')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# ## 产生 Pandas 对象

# `pandas` 中有三种基本结构：
# 
# - `Series`
#     - 1D labeled homogeneously-typed array
# - `DataFrame`
#     - General 2D labeled, size-mutable tabular structure with potentially heterogeneously-typed columns
# - `Panel`
#     - General 3D labeled, also size-mutable array（注意：`Panel` 自 pandas 0.20 起已弃用，并在 0.25 中被移除；三维数据可改用带 `MultiIndex` 的 `DataFrame`）

# ### Series

# 一维 `Series` 可以用一维列表初始化：

# In[2]:


# Build a 1-D Series from a plain list; np.nan marks a missing value.
s = pd.Series([1, 3, 5, np.nan, 6, 8])

# print() with a single argument behaves the same on Python 2 and Python 3,
# whereas the old `print s` statement is a SyntaxError on Python 3.
print(s)


# 默认情况下，`Series` 的下标都是数字（可以使用额外参数指定），类型是统一的。
# 
# ### DataFrame
# 
# `DataFrame` 则是个二维结构，这里首先构造一组时间序列，作为我们第一维的下标：

# In[3]:


# Six consecutive calendar days starting at 2013-01-01, used later as a row index.
dates = pd.date_range('20130101', periods=6)

# print() form so the cell runs on Python 3 as well as Python 2.
print(dates)


# 然后创建一个 `DataFrame` 结构：

# In[4]:


# 6x4 frame of standard-normal samples: rows labelled by `dates`, columns A-D.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)

df


# 默认情况下，如果不指定 `index` 参数和 `columns`，那么它们的值将用从 `0` 开始的数字替代。
# 
# 除了向 `DataFrame` 中传入二维数组，我们也可以使用字典传入数据：

# In[5]:


# One dict entry per column. Scalar entries (A, B, F) are repeated for every
# row; the row count (4) comes from the array-like entries.
df2 = pd.DataFrame({
    'A': 1.0,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3, 3, 3, 3], dtype='int32'),
    'E': pd.Categorical(["test", "train"] * 2),
    'F': 'foo',
})

df2


# 字典的每个 `key` 代表一列，其 `value` 可以是各种能够转化为 `Series` 的对象。
# 
# 与 `Series` 要求所有的类型都一致不同，`DataFrame` 只要求每一列数据的格式相同：

# In[6]:


# Show the dtype of every column in df2.
df2.dtypes


# ## 查看数据

# ### 头尾数据

# `head` 和 `tail` 方法可以分别查看最前面几行和最后面几行的数据（默认为 5）：

# In[7]:


# First rows of df (head() shows 5 rows when no count is given).
df.head()


# 最后 3 行：

# In[8]:


# Last 3 rows of df.
df.tail(3)


# ### 下标，列标，数据

# 下标使用 `index` 属性查看：

# In[9]:


# Row labels of df.
df.index


# 列标使用 `columns` 属性查看：

# In[10]:


# Column labels of df.
df.columns


# 数据值使用 `values` 查看：

# In[11]:


# Data of df as a NumPy array, without the row/column labels.
df.values


# ### 统计数据

# 查看简单的统计数据：

# In[12]:


# Summary statistics for each column.
df.describe()


# ### 转置

# In[13]:


# Transpose: rows become columns and columns become rows.
df.T


# ## 排序

# `sort_index(axis=0, ascending=True)` 方法按照下标大小进行排序，`axis=0` 表示按第 0 维进行排序。

# In[14]:


# Sort rows by index label, largest first.
df.sort_index(ascending=False)


# In[15]:


# Sort columns by column label (axis=1), largest first.
df.sort_index(axis=1, ascending=False)


# `sort_values(by, axis=0, ascending=True)` 方法按照 `by` 的值的大小进行排序，例如按照 `B` 列的大小：

# In[16]:


# Sort rows by the values in column B.
df.sort_values(by="B")


# ## 索引

# 虽然 `DataFrame` 支持 `Python/Numpy` 的索引语法，但是推荐使用 `.at, .iat, .loc 和 .iloc` 方法进行索引（早期版本中的 `.ix` 已在 pandas 1.0 中移除）。

# ### 读取数据

# 选择单列数据：

# In[17]:


# Select column A as a Series.
df["A"]


# 也可以用 `df.A`：

# In[18]:


# Attribute-style access to the same column A.
df.A


# 使用切片读取多行：

# In[19]:


# An integer slice on the frame selects rows by position (the first three rows).
df[0:3]


# `index` 名字也可以进行切片：

# In[20]:


# Slice rows by index label; label slices include both endpoints.
df["20130101":"20130103"]


# ### 使用 `label` 索引

# `loc` 可以方便的使用 `label` 进行索引：

# In[21]:


# Row whose label is the first date, returned as a Series.
df.loc[dates[0]]


# 多列数据：

# In[22]:


# All rows, columns A and B.
df.loc[:,['A','B']]


# 选择多行多列：

# In[23]:


# Rows labelled 2013-01-02 through 2013-01-04 (both ends included), columns A and B.
df.loc['20130102':'20130104',['A','B']]


# 数据降维：

# In[24]:


# A single row label plus a list of columns yields a 1-D Series.
df.loc['20130102',['A','B']]


# 得到标量值：

# In[25]:


# Scalar at row dates[0], column B.
df.loc[dates[0],'B']


# 不过得到标量值可以用 `at`，速度更快：

# In[26]:


# Time label-based scalar access through .loc against the scalar-only .at accessor.
get_ipython().run_line_magic('timeit', "-n100 df.loc[dates[0],'B']")
get_ipython().run_line_magic('timeit', "-n100 df.at[dates[0],'B']")

# print() with one argument works on both Python 2 and Python 3; the old
# `print x` statement is a SyntaxError on Python 3.
print(df.at[dates[0], 'B'])


# ### 使用位置索引

# `iloc` 使用位置进行索引：

# In[27]:


# Row at integer position 3 (the fourth row).
df.iloc[3]


# 连续切片：

# In[28]:


# Rows at positions 3-4 and columns at positions 0-1 (slice end is excluded).
df.iloc[3:5,0:2]


# 索引不连续的部分：

# In[29]:


# Explicit position lists: rows 1, 2, 4 and columns 0, 2.
df.iloc[[1,2,4],[0,2]]


# 索引整行：

# In[30]:


# Rows at positions 1-2, every column.
df.iloc[1:3,:]


# 整列：

# In[31]:


# Every row, columns at positions 1-2.
df.iloc[:, 1:3]


# 标量值：

# In[32]:


# Scalar at row position 1, column position 1.
df.iloc[1,1]


# 当然，使用 `iat` 索引标量值更快：

# In[33]:


# Time positional scalar access through .iloc against the scalar-only .iat accessor.
get_ipython().run_line_magic('timeit', '-n100 df.iloc[1,1]')
get_ipython().run_line_magic('timeit', '-n100 df.iat[1,1]')

# Value at row position 1, column position 1.
df.iat[1,1]


# ### 布尔型索引

# 所有 `A` 列大于 0 的行：

# In[34]:


# Boolean mask built from column A keeps only the rows where A > 0.
df[df.A > 0]


# 只留下所有大于 0 的数值：

# In[35]:


# Element-wise mask: entries that are not > 0 are shown as NaN.
df[df > 0]


# 使用 `isin` 方法做 `filter` 过滤：

# In[36]:


# Copy df so the original stays unchanged, then add a string column E.
df2 = df.copy()
df2['E'] = ['one', 'one','two','three','four','three']

df2


# In[37]:


# Keep the rows whose E value is 'two' or 'four'.
df2[df2['E'].isin(['two','four'])]


# ### 设定数据的值

# In[38]:


# Integers 1..6 labelled by six consecutive days starting at 2013-01-02.
s1 = pd.Series(
    data=list(range(1, 7)),
    index=pd.date_range(start='20130102', periods=6),
)

s1


# 像字典一样，直接指定 `F` 列的值为 `s1`，此时以 `df` 已有的 `index` 为标准将二者进行合并，`s1` 中没有的 `index` 项设为 `NaN`，多余的项舍去：

# In[39]:


# Column assignment aligns s1 on df's index: labels of df that are absent
# from s1 get NaN, and labels only present in s1 are dropped.
df['F'] = s1

df


# 或者使用 `at` 或 `iat` 修改单个值：

# In[40]:


# Set a single cell by label: row dates[0], column A.
df.at[dates[0],'A'] = 0

df


# In[41]:


# Set a single cell by position: row 0, column 1.
df.iat[0, 1] = 0

df


# 设定一整列：

# In[42]:


# Overwrite every row of column D with 5 (one array element per row of df).
df.loc[:,'D'] = np.array([5] * len(df))

df


# 设定满足条件的数值：

# In[43]:


# Work on a copy so df itself is not modified.
df2 = df.copy()

# Wherever an entry is positive, replace it with its negation.
df2[df2 > 0] = -df2

df2


# ## 缺失数据

# In[44]:


# Keep only the first four dates and append a new column E, which starts out as NaN.
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
# Fill E for the first two dates (label slices include both ends).
df1.loc[dates[0]:dates[1],'E'] = 1

df1


# 丢弃所有缺失数据的行得到的新数据：

# In[45]:


# Return a frame without the rows that contain at least one missing value.
df1.dropna(how='any')


# 填充缺失数据：

# In[46]:


# Return a frame in which missing values are replaced by 5.
df1.fillna(value=5)


# 检查缺失数据的位置：

# In[47]:


# Boolean frame that is True wherever df1 has a missing value.
pd.isnull(df1)


# ## 计算操作

# ### 统计信息

# 每一列的均值：

# In[48]:


# Mean of each column.
df.mean()


# 每一行的均值：

# In[49]:


# Mean of each row: axis=1 averages across the columns.
df.mean(axis=1)


# 多个对象之间的操作，如果维度不对，`pandas` 会自动调用 `broadcasting` 机制：

# In[50]:


# Series on the `dates` index, shifted down two positions so the first two
# entries become NaN.
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2)

# print() form so the cell runs on Python 3 as well as Python 2.
print(s)


# 相减 `df - s`：

# In[51]:


# Subtract s from every column of df, matching s against df's row index.
df.sub(s, axis='index')


# ### apply 操作

# 与 `R` 中的 `apply` 操作类似，接收一个函数，默认将该函数作用到每一列上：

# In[52]:


# Apply np.cumsum to each column (apply works column-wise by default).
df.apply(np.cumsum)


# 求每列最大最小值之差：

# In[53]:


# Range (max minus min) of each column.
df.apply(lambda x: x.max() - x.min())


# ### 直方图

# In[54]:


# Ten random integers drawn from 0..6 (the upper bound 7 is exclusive).
s = pd.Series(np.random.randint(0, 7, size=10))
# print() form so the cell runs on Python 3 as well as Python 2.
print(s)


# 直方图信息：

# In[55]:


# Count how often each distinct value occurs in s. print() is used as a
# function so the cell runs on Python 3 as well as Python 2.
print(s.value_counts())


# 绘制直方图信息：

# In[56]:


# Draw a histogram of s; the return value is kept in h.
h = s.hist()


# ### 字符串方法

# 当 `Series` 或者 `DataFrame` 的某一列是字符串时，我们可以用 `.str` 对这个字符串数组进行字符串的基本操作： 

# In[57]:


# Series of strings with one missing entry (np.nan).
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])

# .str applies the string method element-wise. print() is used as a function
# so the cell runs on Python 3 as well as Python 2.
print(s.str.lower())


# ## 合并

# ### 连接

# In[58]:


# 10x4 frame of standard-normal samples with default integer row/column labels.
df = pd.DataFrame(data=np.random.randn(10, 4))

df


# 可以使用 `pd.concat` 函数将多个 `pandas` 对象进行连接：

# In[59]:


# Three positional row slices of df: rows 0-1, row 4, and rows 7 to the end.
pieces = [df.iloc[:2], df.iloc[4:5], df.iloc[7:]]

# Stack the slices back together row-wise.
pd.concat(pieces)


# ### 数据库中的 Join

# `merge` 可以实现数据库中的 `join` 操作：

# In[60]:


# Two small frames that share a 'key' column, used to demonstrate merge.
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})

# print() with one argument works on both Python 2 and Python 3; the old
# `print x` statement is a SyntaxError on Python 3.
print(left)
print(right)


# In[61]:


# Join the two frames on their shared 'key' column; each matching left row is
# paired with each matching right row.
pd.merge(left, right, on='key')


# ### append

# 向 `DataFrame` 中添加行：

# In[62]:


# 8x4 frame of standard-normal samples with columns A-D and a default integer index.
df = pd.DataFrame(
    data=np.random.randn(8, 4),
    columns=list('ABCD'),
)

df


# 将第四行（位置下标为 3）的值添加到最后：

# In[63]:


# Row at position 3 (the fourth row) as a Series.
s = df.iloc[3]

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0. Turn the
# Series back into a one-row frame and concatenate it instead; ignore_index
# renumbers the rows 0..n so the added row gets the next integer label.
pd.concat([df, s.to_frame().T], ignore_index=True)


# ### Grouping

# In[64]:


# Two string key columns (A, B) and two columns of standard-normal samples
# (C, D), eight rows in total.
df = pd.DataFrame({
    'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
    'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
    'C': np.random.randn(8),
    'D': np.random.randn(8),
})

df


# 按照 `A` 的值进行分类：

# In[65]:


# Group the rows by the value of A and sum within each group.
df.groupby('A').sum()


# 按照 `A, B` 的值进行分类：

# In[66]:


# Group by the (A, B) pair and sum within each group.
df.groupby(['A', 'B']).sum()


# ## 改变形状

# ### Stack

# 产生一个多 `index` 的 `DataFrame`：

# In[67]:


# Pair the two label lists element-wise into (first, second) tuples.
tuples = list(zip(
    ['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
    ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'],
))

# Two-level row index built from the tuples, with named levels.
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
df = pd.DataFrame(
    data=np.random.randn(8, 2),
    index=index,
    columns=['A', 'B'],
)

df


# `stack` 方法将 `columns` 变成一个新的 `index` 部分：

# In[68]:


# First four rows of df, selected by position.
df2 = df.iloc[:4]

# stack() moves the column labels into a new innermost index level.
stacked = df2.stack()

stacked


# 可以使用 `unstack()` 将最后一级 `index` 放回 `column`：

# In[69]:


# Move the innermost index level back into the columns.
stacked.unstack()


# 也可以指定其他的级别：

# In[70]:


# Unstack index level 1 instead of the innermost level.
stacked.unstack(1)


# ## 时间序列

# 金融分析中常用到时间序列数据：

# In[71]:


# Five daily timestamps starting at midnight on 2012-03-06.
rng = pd.date_range(start='3/6/2012 00:00', periods=5, freq='D')
# One standard-normal sample per timestamp.
ts = pd.Series(data=np.random.randn(len(rng)), index=rng)

ts


# 标准时间表示：

# In[72]:


# Attach the UTC time zone to the index of ts (created above without a time zone).
ts_utc = ts.tz_localize('UTC')

ts_utc


# In[ ]:


# 改变时区表示：

# In[73]:


# Express the same instants in the US/Eastern time zone.
ts_utc.tz_convert('US/Eastern')


# ## Categoricals

# In[74]:


# Six records, each with an integer id and a one-letter raw grade.
df = pd.DataFrame({
    "id": list(range(1, 7)),
    "raw_grade": ['a', 'b', 'b', 'a', 'a', 'e'],
})

df


# 可以将 `raw_grade` 转换为类别类型，并保存为新的 `grade` 列：

# In[75]:


# Store a categorical version of raw_grade in a new column named grade.
df["grade"] = df["raw_grade"].astype("category")

df["grade"]


# 将类别的表示转化为有意义的字符：

# In[76]:


# Assigning to `.cat.categories` was deprecated and then removed in pandas 2.0.
# rename_categories returns a new Series whose existing categories are
# relabelled in order, so the result is written back to the column.
df["grade"] = df["grade"].cat.rename_categories(["very good", "good", "very bad"])

df["grade"]


# 添加缺失的类别：

# In[77]:


# Set the complete list of categories; it may include labels that do not
# occur in the data.
df["grade"] = df["grade"].cat.set_categories(["very bad", "bad", "medium", "good", "very good"])
df["grade"]


# 使用 `grade` 分组：

# In[78]:


# Count rows per category. observed=False keeps categories that never occur
# in the data (reported with size 0); it is passed explicitly because newer
# pandas releases are changing the default for categorical group keys.
df.groupby("grade", observed=False).size()


# ## 绘图

# 使用 `ggplot` 风格：

# In[79]:


# Switch matplotlib to the 'ggplot' style sheet for the plots that follow.
plt.style.use('ggplot')


# `Series` 绘图：

# In[80]:


# 1000 standard-normal samples on a date index of 1000 periods starting 2000-01-01.
ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000))

# Plot the running total of the series; the return value is kept in p.
p = ts.cumsum().plot()


# `DataFrame` 按照 `columns` 绘图：

# In[81]:


# Four columns of standard-normal samples sharing the index of ts.
df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index,
                  columns=['A', 'B', 'C', 'D'])

# Plot the running total of every column on one set of axes.
df.cumsum().plot()
# Let matplotlib choose the legend position.
p = plt.legend(loc="best")


# ## 文件读写

# ### csv

# 写入文件：

# In[82]:


# Write df to foo.csv (relative path, so it lands in the working directory).
df.to_csv('foo.csv')


# 从文件中读取：

# In[83]:


# Read the CSV file back and show its first rows.
pd.read_csv('foo.csv').head()


# ### hdf5

# 写入文件：

# In[84]:


# Store df in foo.h5 under the key "df". The key is passed by keyword because
# recent pandas releases warn when it is given positionally.
df.to_hdf("foo.h5", key="df")


# 读取文件：

# In[85]:


# Load the object stored under key 'df' in foo.h5 and show its first rows.
pd.read_hdf('foo.h5','df').head()


# ### excel

# 写入文件：

# In[86]:


# Write df to the sheet named 'Sheet1' of foo.xlsx.
df.to_excel('foo.xlsx', sheet_name='Sheet1')


# 读取文件：

# In[87]:


# Read 'Sheet1' back, treating the string 'NA' as a missing value, and show
# the first rows.
pd.read_excel('foo.xlsx', 'Sheet1', index_col=None, na_values=['NA']).head()


# 清理生成的临时文件：

# In[88]:


import glob
import os

# Delete only the three files written by the cells above. A "foo*" glob would
# also remove any unrelated file in the working directory whose name happens
# to start with "foo".
for f in ("foo.csv", "foo.h5", "foo.xlsx"):
    # Skip files that were never created (e.g. an earlier write cell failed).
    if os.path.exists(f):
        os.remove(f)