import pandas as pd
from pandas import Timedelta as td
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import datetime as dt
from pykalman import KalmanFilter
# Date range passed to the pricing query.
start = '2002-1-1'
end = '2015, 12, 1'

# p: pattern length (number of observations in the pattern to match).
p = 125

# o: outcome length (number of observations examined after each matched pattern).
o = 25
def ret_index(prices):
    '''
    Cumulative return of `prices`, in percent.

    The period-over-period returns are compounded into the growth of one
    unit invested at the first observation; the result is that growth minus
    one, times 100. The first element is NaN because the first observation
    has no preceding price to compute a return from.
    '''
    growth = (prices.pct_change() + 1).cumprod()
    return (growth - 1) * 100
def zscore(x):
    '''
    Center `x` on its mean and scale it by its standard deviation.

    The standard deviation is whatever `x.std()` returns for the given
    container type.
    '''
    return (x - x.mean()) / x.std()
# Price history for SPY over [start, end].
# NOTE(review): get_pricing is neither defined nor imported in this file --
# presumably it is injected by the hosting research environment; confirm.
px = get_pricing('SPY', start, end, fields='price')

# Position of the last observation in px.
i = len(px.index) - 1

# Sanity check: show the date and value of the last observation.
current_date = px.index[i]
current_close = px.iloc[i]
print('%s %s' % (current_date, current_close))

# Current pattern: the p observations immediately before position i
# (the observation at position i itself is not part of the slice).
cp = px.iloc[i - p:i]

# Historical data in which to search for similar patterns: everything that
# precedes the current pattern.
h = px[:i - p]
def performance(x):
    '''
    Percentage change from the first to the last element of `x`,
    relative to the absolute value of the first element.
    '''
    first, last = x[0], x[-1]
    return (last - first) / abs(first) * 100
def similar(x):
    '''
    Score how closely one historical window matches the current pattern.

    x: a window of historical prices; it is correlated element by element
       with the module-level current pattern `cp`, so it is expected to have
       the same length as `cp`.
    return: the Pearson correlation coefficient between `cp` and `x`, or NaN
       when that coefficient is smaller than the p-value reported alongside
       it (the window is then treated as "not similar").
    '''
    corr_value, p_value = stats.pearsonr(cp.values, x)
    # Acceptance rule kept from the original: the coefficient has to be at
    # least as large as its p-value. A NaN coefficient fails the comparison
    # and is returned unchanged, i.e. still NaN.
    if corr_value < p_value:
        # np.nan is the spelling available in every NumPy release
        # (the np.NAN alias no longer exists in NumPy 2).
        return np.nan
    return corr_value
# Apply `similar` to every rolling window of p observations of the historical
# series, then discard the NaN entries (among them, the windows rejected by
# the Pearson correlation check inside `similar`).
# NOTE(review): pd.rolling_apply is the module-level rolling API of older
# pandas releases; newer releases expose Series.rolling(...).apply instead.
correlation = pd.rolling_apply(h, p, similar).dropna()
correlation.name = 'corr'

# Every accepted correlation value, as red dots.
correlation.plot(style='ro')
# Table of accepted correlations, used to group matches that lie close
# together in time into clusters.
df = pd.DataFrame(correlation).dropna()
df['date'] = df.index
df.columns = ['corr', 'date']

# Gap, in days, between each accepted date and the previous accepted date.
df['delta'] = (df['date'] - df['date'].shift(1)) / pd.Timedelta('1 days')

# The first row has no predecessor; p + 1 is larger than the 7-day threshold
# below, so the first row always opens a cluster. The filled column is
# assigned back because fillna(inplace=True) on a column selection is a
# chained in-place call and is not guaranteed to update df.
df['delta'] = df['delta'].fillna(p + 1)

# A gap of more than 7 days starts a new cluster, identified by its first
# date; the remaining rows inherit the identifier of the cluster they follow.
df['eval'] = df.apply(
    lambda row: row['date'] if row['delta'] > 7 else np.nan, axis=1)
df = df.ffill()

# Notebook-style preview of the first rows (no effect when run as a script).
df[:5]
# Dates at which the correlation is highest within each cluster.
# idxmax is used because it returns the index label (the date) in every
# pandas release, whereas Series.argmax returns an integer position in
# current releases.
pat = df['corr'].groupby(df['eval']).apply(lambda s: s.idxmax()).values
# Notebook-style preview (no effect when run as a script).
pat

# Keep the ten highest cluster peaks, ordered from highest to lowest
# correlation. nlargest replaces the removed Series.order(ascending=False)
# followed by a [:10] slice.
max_corr = df['corr'].loc[list(pat)].nlargest(10)
pat_names = max_corr.index

# All accepted correlations as faint red dots, selected peaks as black dots.
correlation.plot(style='ro', alpha=.2)
max_corr.plot(style='ko')
# Locate every selected date in px, walking px in chronological order.
# Each entry keeps the position together with its date so that the column
# labels assigned below follow the same order as the collected windows.
# (pat_names is ordered by correlation, not by date, so using it directly as
# the column labels would attach dates to the wrong windows.)
selected = [(k, date) for k, date in enumerate(px.index) if date in pat_names]
matched_dates = [date for _, date in selected]

# Pattern: the p observations ending on, and including, the matched date.
pat_list = [px.iloc[(k + 1) - p:k + 1] for k, _ in selected]
# Outcome: the o observations that follow the matched date.
out_list = [px.iloc[(k + 1):(k + 1) + o] for k, _ in selected]

# One column per matched date.
df_pat = pd.concat(pat_list, axis=1)
df_out = pd.concat(out_list, axis=1)
df_pat.columns = matched_dates
df_out.columns = matched_dates
# Cumulative percent return of the current pattern as a plain array; the
# leading NaN produced by ret_index is replaced with 0.
ri= ret_index(cp).fillna(0).values
# Draw every selected historical pattern together with the outcome that
# followed it.
# NOTE(review): the indentation of this section is missing in this copy of
# the file. The statements that use d, q_p, q_o, r_p and r_o have to belong
# to the loop body; whether the plot of ri and the first plt.show() run once
# per pattern or once after the loop cannot be determined from here --
# confirm against the original notebook before re-indenting.
for d in pat_names:
# Pattern and outcome columns for date d, without their NaN rows
# (presumably the rows contributed by the other dates' windows when the
# columns were concatenated -- verify).
q_p = df_pat[d].dropna()
q_o = df_out[d].dropna()
# Replace the date index with 0..n-1.
q_p = q_p.reset_index(drop=True)
q_o = q_o.reset_index(drop=True)
# Cumulative percent returns, leading NaN replaced with 0.
r_p= ret_index(q_p).fillna(0)
r_o = ret_index(q_o).fillna(0)
r_p= r_p.values
r_o = r_o.values
# Pattern on x = 0..p-1, shifted vertically so that its last point is 0.
plt.plot(range(p), r_p-r_p[-1])
# Outcome on x = p-1..p+o-2, shifted so that its first point is 0; its
# first point therefore sits on the pattern's last point.
plt.plot([_+(p-1) for _ in range(o)], r_o-r_o[0])
# Current pattern in black, shifted so that its last point is 0.
plt.plot(ri-ri[-1], 'k')
plt.show()
# The selected pattern windows as stored in df_pat, one line per column.
df_pat.plot()
plt.show()