#!/usr/bin/env python
# coding: utf-8

# # 二维数据结构：DataFrame

# In[1]:


import numpy as np
import pandas as pd


# `DataFrame` 是 `pandas` 中的二维数据结构，可以看成一个 `Excel` 中的工作表，或者一个 `SQL` 表，或者一个存储 `Series` 对象的字典。
# 
# `DataFrame(data, index, columns)` 中的 `data` 可以接受很多数据类型：
# 
# - 一个存储一维数组，字典，列表或者 `Series` 的字典
# - 2-D 数组
# - 结构或者记录数组
# - 一个 `Series`
# - 另一个 `DataFrame`
# 
# `index` 用于指定行的 `label`，`columns` 用于指定列的 `label`，如果参数不传入，那么会按照传入的内容进行设定。

# ## 从 Series 字典中构造

# 可以使用值为 `Series` 的字典进行构造：

# In[2]:


d = {'one' : pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
     'two' : pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}


# 如果没有传入 `columns` 的值，那么 `columns` 的值默认为字典 `key`，`index` 默认为所有 `value` 中 `index` 的并集。

# In[3]:


df = pd.DataFrame(d)

df


# 如果指定了 `index` 值，`index` 为指定的 `index` 值：

# In[4]:


pd.DataFrame(d, index=['d', 'b', 'a'])


# 如果指定了 `columns` 值，会去字典中寻找，找不到的值为 `NaN`：

# In[5]:


pd.DataFrame(d, index=['d', 'b', 'a'], columns=['two', 'three'])


# 查看 `index` 和 `columns`：

# In[6]:


df.index


# In[7]:


df.columns


# ## 从 ndarray 或者 list 字典中构造

# 如果字典是 `ndarray` 或者 `list`，那么它们的长度要严格保持一致：

# In[8]:


d = {'one' : [1., 2., 3., 4.],
     'two' : [4., 3., 2., 1.]}


# `index` 默认为 `range(n)`，其中 `n` 为数组长度： 

# In[9]:


pd.DataFrame(d)


# 如果传入 `index` 参数，那么它必须与数组等长：

# In[10]:


pd.DataFrame(d, index=['a', 'b', 'c', 'd'])


# ## 从结构数组中构造

# `numpy` 支持结构数组的构造：

# In[11]:


data = np.zeros((2,), dtype=[('A', 'i4'),('B', 'f4'),('C', 'a10')])
data[:] = [(1,2.,'Hello'), (2,3.,"World")]

data


# 参数处理的方式与数组字典类似：

# In[12]:


pd.DataFrame(data)


# In[13]:


pd.DataFrame(data, index=['first', 'second'])


# In[14]:


pd.DataFrame(data, columns=['C', 'A', 'B'])


# ## 从字典列表中构造

# 字典中同一个键的值会被合并到同一列：

# In[15]:


data2 = [{'a': 1, 'b': 2}, {'a': 5, 'b': 10, 'c': 20}]

pd.DataFrame(data2)


# In[16]:


pd.DataFrame(data2, index=['first', 'second'])


# In[17]:


pd.DataFrame(data2, columns=['a', 'b'])


# ## 从 Series 中构造

# 相当于将 Series 二维化。

# ## 其他构造方法

# `DataFrame.from_dict` 从现有的一个字典中构造，`DataFrame.from_records` 从现有的一个记录数组中构造：

# In[18]:


pd.DataFrame.from_records(data, index='C')


# `DataFrame.from_items` 从字典的 `item` 对构造：

# In[19]:


pd.DataFrame.from_items([('A', [1, 2, 3]), ('B', [4, 5, 6])])


# ## 列操作

# `DataFrame` 可以类似于字典一样对列进行操作：

# In[20]:


df["one"]


# 添加新列：

# In[21]:


df['three'] = df['one'] * df['two']

df['flag'] = df['one'] > 2

df


# 可以像字典一样删除：

# In[22]:


del df["two"]

three = df.pop("three")

df


# 给一行赋单一值：

# In[23]:


df['foo'] = 'bar'

df


# 如果 `index` 不一致，那么会只保留公共的部分：

# In[24]:


df['one_trunc'] = df['one'][:2]

df


# 也可以直接插入一维数组，但是数组的长度必须与 `index` 一致。
# 
# 默认新列插入位置在最后，也可以指定位置插入：

# In[25]:


df.insert(1, 'bar', df['one'])

df


# 添加一个 `test` 新列：

# In[28]:


df.assign(test=df["one"] + df["bar"])


# ## 索引和选择

# 基本操作：
# 
# | Operation	| Syntax | Result |
# | ---- | ----- | ---- |
# | Select column	| df[col] | Series |
# | Select row by label | df.loc[label] | Series |
# | Select row by integer location | df.iloc[loc] | Series |
# | Slice rows | df[5:10]	| DataFrame |
# | Select rows by boolean vector	| df[bool_vec] | DataFrame |