import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# A Series built from a plain list; the NaN entry makes the dtype float64
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s
0 1 1 3 2 5 3 NaN 4 6 5 8 dtype: float64
# Create a DataFrame of random normals: six consecutive days as the index,
# four columns labelled A-D
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(
    np.random.randn(6, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)
df
A | B | C | D | |
---|---|---|---|---|
2013-01-01 | 0.709714 | -0.570070 | -0.550845 | -1.057212 |
2013-01-02 | -0.628737 | 0.391646 | -1.091479 | 0.537669 |
2013-01-03 | -0.294052 | -1.296279 | -0.912759 | 0.441580 |
2013-01-04 | 0.528582 | 0.532051 | -1.274615 | 0.146372 |
2013-01-05 | 0.501640 | 1.164900 | 2.032659 | 0.443303 |
2013-01-06 | 1.862346 | 2.191615 | -0.349397 | -0.309474 |
# Data type of each column
df.dtypes
A float64 B float64 C float64 D float64 dtype: object
# First two rows of the frame
df.head(2)
A | B | C | D | |
---|---|---|---|---|
2013-01-01 | 0.709714 | -0.570070 | -0.550845 | -1.057212 |
2013-01-02 | -0.628737 | 0.391646 | -1.091479 | 0.537669 |
# Last two rows of the frame
df.tail(2)
A | B | C | D | |
---|---|---|---|---|
2013-01-05 | 0.501640 | 1.164900 | 2.032659 | 0.443303 |
2013-01-06 | 1.862346 | 2.191615 | -0.349397 | -0.309474 |
# Display the index, columns, and the underlying numpy data
df.index    # row labels
df.columns  # column labels
df.values   # the data as a numpy array
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04', '2013-01-05', '2013-01-06'], dtype='datetime64[ns]', freq='D')
# Show descriptive statistics (count, mean, std, min, quartiles, max) per column
df.describe()
A | B | C | D | |
---|---|---|---|---|
count | 6.000000 | 6.000000 | 6.000000 | 6.000000 |
mean | 0.446582 | 0.402311 | -0.357739 | 0.033706 |
std | 0.870429 | 1.235381 | 1.219708 | 0.617847 |
min | -0.628737 | -1.296279 | -1.274615 | -1.057212 |
25% | -0.095129 | -0.329641 | -1.046799 | -0.195513 |
50% | 0.515111 | 0.461849 | -0.731802 | 0.293976 |
75% | 0.664431 | 1.006688 | -0.399759 | 0.442872 |
max | 1.862346 | 2.191615 | 2.032659 | 0.537669 |
# Date-indexed series shifted down by two periods: the first two slots become
# NaN and the last two values fall off the end
s = pd.Series(
    [1, 3, 5, np.nan, 6, 8],
    index=dates,
).shift(periods=2)
s
2013-01-01 NaN 2013-01-02 NaN 2013-01-03 1 2013-01-04 3 2013-01-05 5 2013-01-06 NaN Freq: D, dtype: float64
# Apply np.cumsum to each column (axis=0), giving a running total down the rows
df.apply(np.cumsum, axis=0)
A | B | C | D | |
---|---|---|---|---|
2013-01-01 | 0.709714 | -0.570070 | -0.550845 | -1.057212 |
2013-01-02 | 0.080977 | -0.178424 | -1.642323 | -0.519543 |
2013-01-03 | -0.213076 | -1.474703 | -2.555082 | -0.077962 |
2013-01-04 | 0.315507 | -0.942652 | -3.829697 | 0.068409 |
2013-01-05 | 0.817147 | 0.222248 | -1.797039 | 0.511712 |
2013-01-06 | 2.679493 | 2.413864 | -2.146435 | 0.202238 |
# Concat function
# Split the frame into three consecutive row slices and glue them back together.
# The last slice is open-ended (df[3:]); a slice such as df[3:1] has its start
# past its stop and is therefore empty, which would silently drop every row
# from position 3 onward.
piece = [df[:2], df[2:3], df[3:]]
pd.concat(piece)
A | B | C | D | |
---|---|---|---|---|
2013-01-01 | 0.709714 | -0.570070 | -0.550845 | -1.057212 |
2013-01-02 | -0.628737 | 0.391646 | -1.091479 | 0.537669 |
2013-01-03 | -0.294052 | -1.296279 | -0.912759 | 0.441580 |
# Merge function
# Both frames repeat the key 'foo', so joining on 'key' pairs every left row
# with every right row; the clashing 'lval' columns get _x / _y suffixes.
left = pd.DataFrame({'key': ['foo'] * 2, 'lval': [1, 2]})
right = pd.DataFrame({'key': ['foo'] * 2, 'lval': [4, 5]})
left.merge(right, on='key')
key | lval_x | lval_y | |
---|---|---|---|
0 | foo | 1 | 4 |
1 | foo | 1 | 5 |
2 | foo | 2 | 4 |
3 | foo | 2 | 5 |
# Append function
df = pd.DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
s = df.iloc[3]  # the row at position 3, as a Series labelled by column name
print(df)
print(s)
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
# Turning the row into a one-row frame and concatenating gives the same
# result: the row is added at the bottom and the index is renumbered 0..8.
pd.concat([df, s.to_frame().T], ignore_index=True)
A B C D 0 0.307377 0.587502 -1.498826 -1.413681 1 0.606972 -0.949604 0.330130 0.221957 2 -0.224912 -1.682801 0.376430 0.132976 3 0.177087 1.054522 1.107316 0.857462 4 -1.378083 0.585530 0.877582 -1.344324 5 -0.288912 0.378269 -0.178293 1.931936 6 -0.196315 -0.300350 2.258136 0.007789 7 -0.863666 0.695809 -2.187412 -0.436914 A 0.177087 B 1.054522 C 1.107316 D 0.857462 Name: 3, dtype: float64
A | B | C | D | |
---|---|---|---|---|
0 | 0.307377 | 0.587502 | -1.498826 | -1.413681 |
1 | 0.606972 | -0.949604 | 0.330130 | 0.221957 |
2 | -0.224912 | -1.682801 | 0.376430 | 0.132976 |
3 | 0.177087 | 1.054522 | 1.107316 | 0.857462 |
4 | -1.378083 | 0.585530 | 0.877582 | -1.344324 |
5 | -0.288912 | 0.378269 | -0.178293 | 1.931936 |
6 | -0.196315 | -0.300350 | 2.258136 | 0.007789 |
7 | -0.863666 | 0.695809 | -2.187412 | -0.436914 |
8 | 0.177087 | 1.054522 | 1.107316 | 0.857462 |
# Two string key columns (A, B) and two columns of random normals (C, D)
df = pd.DataFrame({
    'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
    'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
    'C': np.random.randn(8),
    'D': np.random.randn(8),
})
print(df)
# Grouping and then applying a function sum to the resulting groups.
# The numeric columns are selected explicitly: since pandas 2.0 a bare
# groupby('A').sum() no longer drops the string column B but concatenates
# its values, so the result would not be the C/D table intended here.
df.groupby('A')[['C', 'D']].sum()
A B C D 0 foo one -1.010950 -1.443440 1 bar one -1.023700 0.052766 2 foo two -0.642614 -2.438775 3 bar three -0.442711 0.525121 4 foo two -1.176690 -0.230537 5 bar two -0.771803 -0.347051 6 foo one -1.313567 -1.211388 7 foo three 0.779921 -1.279009
C | D | |
---|---|---|
A | ||
bar | -2.238214 | 0.230836 |
foo | -3.363901 | -6.603149 |
# Time Series
# 100 timestamps one second apart, starting at 2012-01-01 00:00:00.
# The lowercase 's' alias is used because uppercase 'S' is deprecated
# as of pandas 2.2.
rng = pd.date_range('1/1/2012', periods=100, freq='s')
# One random integer in [0, 500) per timestamp
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
# Downsample to 5-minute bins, summing the values that fall in each bin
ts.resample('5min').sum()
263.22
# Random walk: cumulative sum of 1000 standard-normal steps on a date index
# starting 2000-01-01, drawn as a line plot
ts = pd.Series(
    np.random.randn(1000),
    index=pd.date_range('1/1/2000', periods=1000),
).cumsum()
ts.plot()
/Users/ulson_hu/anaconda/lib/python2.7/site-packages/matplotlib/__init__.py:830: MatplotlibDeprecationWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter. mplDeprecation)
<matplotlib.axes._subplots.AxesSubplot at 0x11262d250>