#!/usr/bin/env python
# coding: utf-8

# ##内容索引
# 1. 计算股票收益率----
# diff做差分、std求标准差、where函数返回满足条件的数组索引
# 
# 2. 日期分析例子----
# 载入文件对日期的处理、where和take函数、argmax和argmin函数
# 
# 3. 周汇总例子----
# apply_along_axis函数会调用一个自定义函数，作用于数组沿指定轴的每一个一维切片上
# 
# 4. 线性模型预测价格----
# linalg.lstsq函数
# 
# 5. 绘制趋势线例子----
# 
# 6. ndarray补充----
# clip函数，修剪；
# compress函数，压缩；
# prod函数，相乘；
# cumprod函数，累积乘积。

# 上一小节，介绍了numpy中读取数据，计算均值方差，最大最小值等基本操作。这一节中，我将继续学习numpy常用函数，用来做更加深入的股票分析。
# 这里股票分析只是一个例子，不必在意概念细节，只是针对现有的数据来熟悉numpy的操作。

# In[2]:


import numpy as np


# ##1. 股票收益率

# 简单收益率是指相邻两个价格之间的变化率，而对数收益率是指所有价格取对数后两两之间的差值。由于对数收益率是两两价格相除再取对数，所以其**可以用来衡量价格的变化率。**
# 
# 投资者最感兴趣的是收益率的方差或标准差，这代表了投资风险的大小。

# ###1.1 计算简单收益率
# diff函数计算离散差分，计算收益率，需要用差值除以前一天的价格。

# In[3]:


# cp holds the closing prices, read from column index 6 of data.csv.
cp = np.loadtxt('data.csv', delimiter=',', usecols=(6,), unpack=True)
# Simple return: each day-over-day change divided by the earlier price.
cp_diff = cp[1:] - cp[:-1]
rate_of_returns = cp_diff / cp[:-1]


# In[4]:


# Notebook echo of the closing prices; a bare expression, so it produces no
# output when the file is run as a script.
cp


# In[5]:


# Drop the last element:
# slice from the start through the second-to-last entry.
cp[:-1]


# In[6]:


# Notebook echo of the simple returns; a bare expression with no effect in a
# script.
rate_of_returns


# In[7]:


# Standard deviation of the simple returns. A single-argument print() call is
# valid on both Python 2 and 3 and reproduces the text of the old print
# statement exactly (including its two spaces after "=").
print("standard deviation =  %s" % (np.std(rate_of_returns),))


# ###1.2 对数收益率

# 取对数要注意事先检查数据中不包含0和负值，确保输入满足定义域条件

# In[8]:


# Log returns: differences between consecutive log prices.
log_prices = np.log(cp)
log_rate_of_returns = np.diff(log_prices)


# In[9]:


# Notebook echo of the log returns; a bare expression with no effect in a
# script.
log_rate_of_returns


# 得到收益率为正的情况，where函数可以根据指定条件返回数组的索引值

# In[10]:


# np.where with only a condition returns a tuple of index arrays, here the
# positions whose log return is positive.
pos_return_indices = np.where(log_rate_of_returns > 0)
# Single-argument print() works on Python 2 and 3; the tuple is wrapped so
# that %-formatting prints it as one value.
print('Indices with positive returns %s' % (pos_return_indices,))


# ###1.3 计算波动率
# 波动率volatility是对价格变动的一种度量。**年波动率等于对数收益率的标准差除以其均值，再除以交易日倒数的平方根，通常交易日取252天。**

# In[11]:


# Annualised volatility as defined in the text above: standard deviation of
# the log returns over their mean, divided by sqrt(1/252).
annual_volatility = np.std(log_rate_of_returns) / np.mean(log_rate_of_returns)
annual_volatility = annual_volatility / np.sqrt(1. / 252.)
# print(x) with one argument behaves identically on Python 2 and 3.
print(annual_volatility)


# sqrt函数中的除法运算必须使用浮点数才能得到正确结果。

# In[12]:


# Rescale the annual figure by sqrt(1/12). Single-argument print() keeps the
# Python 2 output unchanged and is also valid on Python 3.
print("Monthly volatility %s" % (annual_volatility * np.sqrt(1. / 12.),))


# ##2. 日期分析
# 我们读入收盘价数据，根据星期几来切分数据，计算平均价格，最后找出一周的哪一天的平均收盘价最高

# **Numpy是面向浮点数运算的，对日期处理需要专门的方法**

# loadtxt函数中有一个特定的参数converters，这是数据列到转换函数之间映射的字典。

# In[13]:


#convert function
import datetime
def datestr2num(s):
    """Convert a "dd-mm-yyyy" date string to its weekday number.

    Used in this file as a ``converters`` callback for ``np.loadtxt``. The
    text is parsed with the ``%d-%m-%Y`` format into a datetime, reduced to
    a date, and mapped to a number with ``weekday()``.

    Args:
        s: Date text in ``%d-%m-%Y`` form, either ``str`` or ASCII ``bytes``.

    Returns:
        int: 0 for Monday through 6 for Sunday.

    Raises:
        ValueError: If ``s`` does not match ``%d-%m-%Y`` (or is not ASCII).
    """
    # np.loadtxt may hand converters raw bytes on Python 3 (depending on the
    # NumPy version / encoding argument); strptime only accepts text there.
    if isinstance(s, bytes):
        s = s.decode("ascii")
    return datetime.datetime.strptime(s, "%d-%m-%Y").date().weekday()


# In[14]:


# Column 1 is run through datestr2num (date text -> weekday number);
# column 6 is loaded as cp, the closing prices.
dates, cp = np.loadtxt(
    'data.csv',
    delimiter=',',
    usecols=(1, 6),
    converters={1: datestr2num},
    unpack=True,
)
# Single-argument print(): same output as the Python 2 print statement
# (two spaces after "="), and valid on Python 3.
print("Dates =  %s" % (dates,))


# In[15]:


# One slot per working day (Monday..Friday) for the average closing price.
averages = np.zeros((5,), dtype=float)


# **where函数会根据指定的条件返回所有满足条件的数组元素的索引值；take函数根据这些索引值从数组中取出相应的元素。**

# In[16]:


# For each weekday number 0..4: find the matching rows, pull their closing
# prices, and store the mean. range() replaces the Python-2-only xrange().
for i in range(5):
    # np.where returns a tuple of index arrays; np.take accepts it as-is
    # (the result then has a leading axis of length 1).
    indices = np.where(dates == i)
    prices = np.take(cp, indices)
    avg = prices.mean()
    # One formatted argument keeps the Python 2 output and runs on Python 3.
    print("Day %s prices %s Average %s" % (i, prices, avg))
    averages[i] = avg


# In[17]:


# Find which weekday has the highest and which the lowest average close.
# argmax/argmin give the position in averages, i.e. the weekday number.
top = averages.max()
bottom = averages.min()
# Single-argument print() calls: identical text on Python 2, valid on 3.
print("Highest average %s" % (top,))
print("Top day of the week %s" % (np.argmax(averages),))
print("Lowest average %s" % (bottom,))
print("Bottom day of the week %s" % (np.argmin(averages),))


# **argmin函数返回的是averages数组中最小元素的索引值，argmax同理**

# ##3. 周汇总

# In[18]:


# Load weekday numbers plus the columns at indices 3, 4, 5 and 6 (named
# op, hp, lp, cp here).
dates, op, hp, lp, cp = np.loadtxt(
    'data.csv',
    delimiter=',',
    usecols=(1, 3, 4, 5, 6),
    converters={1: datestr2num},
    unpack=True,
)
# Only the first 16 rows are used for the weekly summary.
cp = cp[:16]
dates = dates[:16]
# Locate the first Monday (weekday number 0).
first_monday = np.ravel(np.where(dates == 0))[0]
# Single-argument print(): same text as the Python 2 statement, valid on 3.
print("The first Monday index is  %s" % (first_monday,))


# In[19]:


# Locate the last Friday (weekday number 4).
last_friday = np.ravel(np.where(dates == 4))[-1]
# Single-argument print(): same text as the Python 2 statement, valid on 3.
print("The last Friday index is  %s" % (last_friday,))


# In[20]:


# Row indices of every day from the first Monday through the last Friday.
weeks_indices = np.arange(first_monday, last_friday + 1)
# Single-argument print(): same text as the Python 2 statement, valid on 3.
print("Weeks indices initial  %s" % (weeks_indices,))


# In[21]:


# Split the index run into three equal sub-arrays, one per week. np.split
# raises ValueError if the length is not divisible by 3.
weeks_indices = np.split(weeks_indices, 3)
# Single-argument print(): same text as the Python 2 statement, valid on 3.
print("Weeks indices after split  %s" % (weeks_indices,))


# **apply_along_axis函数会调用一个自定义函数，作用于数组沿指定轴的每一个一维切片上。**在调用apply_along_axis时提供我们自定义的函数summarize，并指定要作用的轴或维度的编号以及函数的参数

# In[22]:


def summarize(indices, open, high, low, close):
    """Summarise one week of price data.

    Args:
        indices: Positions of the week's rows (first entry is treated as
            Monday, last entry as Friday).
        open: Opening prices, indexed by position.
        high: Daily highs.
        low: Daily lows.
        close: Closing prices, indexed by position.

    Returns:
        tuple: ("APPL", open at the first index, maximum high over the
        indices, minimum low over the indices, close at the last index).
    """
    first, last = indices[0], indices[-1]
    week_highs = np.take(high, indices)
    week_lows = np.take(low, indices)
    return ("APPL", open[first], week_highs.max(), week_lows.min(), close[last])


# In[23]:


# Axis 1 makes apply_along_axis call summarize once per row of
# weeks_indices, so there is one result row per week; op, hp, lp and cp are
# forwarded to summarize as extra positional arguments.
week_summary = np.apply_along_axis(summarize, 1, weeks_indices, op, hp, lp, cp)
# Single-argument print(): same text as the Python 2 statement, valid on 3.
print("Week summary %s" % (week_summary,))


# ##4. 用线性模型预测价格
# NumPy的linalg包是专门用于线性代数计算的，我们可以根据N个之前的价格利用线性模型进行预测计算。

#  我们假设，一个股价可以用之前股价的线性组合表示出来，即，这个股价等于之前股价的加权和的结果。用线性代数的角度，这是解一个最小二乘法的问题。

# In[24]:


# 1. Build the vector b from N closing prices.
cp = np.loadtxt('data.csv', delimiter=',', usecols=(6,), unpack=True)
N = 5
# Take the last N entries of cp.
b = cp[-N:]
# print(x) with one argument behaves identically on Python 2 and 3.
print(b)
# Reverse b so the last row of the file comes first (the newest price,
# assuming data.csv is in chronological order).
b = b[::-1]
print(b)


# In[25]:


# 2. Start from an N-by-N matrix of float zeros.
A = np.zeros((N, N), float)
# The label ends in a newline, so the Python 2 statement printed no
# separating space; the single-argument form reproduces that exactly.
print("Zeros N by N\n%s" % (A,))


# In[26]:


# 3. Fill A row by row: row i holds the N closing prices that immediately
#    precede the last i + 1 entries of cp.
for i in range(N):
    A[i] = cp[-N - 1 - i: -1 - i]

# Single-argument print(): same text as the Python 2 statement, valid on 3.
print("A:\n%s" % (A,))


# In[27]:


# 4. Solve the least-squares problem A x = b with numpy.linalg.lstsq, which
#    returns the solution, the residuals, the rank of A and its singular
#    values.
x, residuals, rank, s = np.linalg.lstsq(A, b)
# Each label ends in a newline, so no separating space is emitted; these
# single-argument calls match the Python 2 output and run on Python 3.
print('x:\n%s' % (x,))
print('residuals:\n%s' % (residuals,))
print('rank:\n%s' % (rank,))
print('s:\n%s' % (s,))


# **返回的元组中包含稍后要用到的系数向量x，一个残差数组，A的秩以及A的奇异值**

# In[28]:


# 5. Predicted price: dot product of the coefficient vector x with b, the
#    vector of the N most recent prices.
# print(x) with one argument behaves identically on Python 2 and 3.
print(np.dot(b, x))


# ##5. 绘制趋势线
# 趋势线，是根据股价走势图上很多所谓的枢轴点绘成的曲线，即描绘的是价格变化的趋势。

# In[29]:


# Runs the "%matplotlib inline" line magic. get_ipython is not defined in
# this file, so this presumably only works when executed by IPython/Jupyter
# -- NOTE(review): confirm before running with a plain interpreter.
get_ipython().run_line_magic('matplotlib', 'inline')
from matplotlib.pyplot import plot
from matplotlib.pyplot import show


# In[30]:


# Pivot points are taken to be the mean of the high, low and closing price.
hp, lp, cp = np.loadtxt('data.csv', delimiter=',', usecols=(4, 5, 6), unpack=True)
pivots = (hp + lp + cp) / 3
# Single-argument print(): same text as the Python 2 statement, valid on 3.
print("Pivots:\n%s" % (pivots,))


# 从枢轴点出发，我们可以推导出阻力位和支撑位。
# > 阻力位是指股价上升时遇到阻力，在反转前的最高价格
# 
# > 支撑位是股价下跌时遇到支撑，在反弹前的最低价格
# 
# 他们都是估计量，根据这些估计量，我们可以绘制阻力位和支撑位的趋势线。

# In[31]:


def fit_line(t, y):
    """Fit the straight line y = a*t + b by least squares.

    The line is written as y = A x with A = [t 1] and x = [a b].

    Args:
        t: 1-D sequence of sample positions.
        y: 1-D sequence of observed values, same length as ``t``.

    Returns:
        The least-squares solution from ``np.linalg.lstsq``: slope first,
        intercept second.
    """
    # Design matrix: one column of t values, one column of ones.
    design = np.column_stack([t, np.ones_like(t)])
    solution = np.linalg.lstsq(design, y)
    return solution[0]


# In[32]:


# Assume the support level lies below the pivots and the resistance level
# above them, each offset by the day's range (hp - lp).
t = np.arange(cp.shape[0])
sa, sb = fit_line(t, pivots - (hp - lp))
support = sa * t + sb
ra, rb = fit_line(t, pivots + (hp - lp))
resistance = ra * t + rb


# In[33]:


# Flag the closing prices that lie strictly between the two trend lines.
condition = (cp > support) & (cp < resistance)
# Single-argument print() calls: identical text on Python 2, valid on 3.
print("Conditions:\n%s" % (condition,))
# np.where returns a tuple of index arrays; ravel flattens it for display.
between_bands = np.where(condition)
print(between_bands)
print(np.ravel(between_bands))


# In[34]:


# Double-check: show support, close and resistance at the selected indices.
# print(x) with one argument behaves identically on Python 2 and 3.
print(support[between_bands])
print(cp[between_bands])
print(resistance[between_bands])


# In[35]:


# Draw the closing prices, then the support line, then the resistance line.
for series in (cp, support, resistance):
    plot(t, series)


# In[36]:


# Count the points between the bands and express that as a share of all
# closing prices.
n_betweens_band = len(np.ravel(between_bands))
# Single-argument print() calls: identical text on Python 2 (two spaces
# after the colon), valid on Python 3.
print("Number points between bands:  %s" % (n_betweens_band,))
print("Ratio between bands:  %s" % (float(n_betweens_band) / len(cp),))


# In[37]:


# A second way to count the points between support and resistance:
# boolean indexing with [] selects by condition, and intersect1d keeps what
# the two selections have in common.
a1 = cp[cp > support]
a2 = cp[cp < resistance]
# intersect1d returns sorted UNIQUE values, so intersecting the prices
# themselves (a1, a2) under-counts repeated prices and can match equal
# prices from different days. Intersecting the index arrays counts days.
above_support = np.where(cp > support)[0]
below_resistance = np.where(cp < resistance)[0]
n_between_2nd = len(np.intersect1d(above_support, below_resistance))
# Single-argument print(): same label text as before, valid on Python 2/3.
print("Number of points between band2 2nd approach:  %s" % (n_between_2nd,))


# ##补充： ndarray对象的几种方法
# > clip，修剪
# 
# > compress，压缩
# 
# > prod，相乘
# 
# > cumprod，累积乘积

# In[38]:


# clip: every element above the given maximum is set to that maximum, and
# every element below the given minimum is set to that minimum.
a = np.arange(5)
# Single-argument print() calls: identical text on Python 2, valid on 3.
print("a =  %s" % (a,))
print("Clipped: %s" % (a.clip(1, 3),))


# In[40]:


# compress: returns the elements selected by the given condition.
a = np.arange(5)
# Single-argument print() calls: identical text on Python 2, valid on 3.
print(a)
print("Compressed:  %s" % (a.compress(a > 2),))


# In[42]:


# prod multiplies all elements of the array together;
# over 1..n this yields the factorial n!.
b = np.arange(1, 6)
# Single-argument print() calls: identical text on Python 2, valid on 3.
print("b =  %s" % (b,))
print("factorial:  %s" % (b.prod(),))


# In[43]:


# cumprod returns the running (cumulative) product of the array elements.
# Single-argument print(): same text as the Python 2 statement, valid on 3.
print("factorials array:  %s" % (b.cumprod(),))