Notebook

The goal of this project is to backtest a system that, for every bar:

1) looks back 1 month and calls this pattern 'CP'
2) looks back over the entire history of the stock and finds patterns correlated with CP
3) if those patterns have generated similar outcomes, they are considered valid predictors
4) trades the current pattern expecting a similar outcome (a sketch of this loop follows below)
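
The cells below work through steps 1) and 2) for a single bar (the most recent one). As a minimal sketch, the full backtest would repeat the same procedure at every bar; find_similar_patterns and aggregate_outcomes are hypothetical helpers standing in for the code developed in this notebook:

In [ ]:
# Sketch only: find_similar_patterns and aggregate_outcomes are hypothetical
# placeholders for the pattern-matching and outcome-aggregation cells below.
def backtest(px, p=125, o=25):
    signals = {}
    for i in range(p, len(px) - o):
        cp = px.iloc[i-p:i]                                # 1) current pattern
        matches = find_similar_patterns(cp, px[:i-p], p)   # 2) correlated history
        expected = aggregate_outcomes(matches, px, o)      # 3) outcomes of matches
        signals[px.index[i]] = expected                    # 4) expected move to trade on
    return signals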
In [63]:
import pandas as pd
from pandas import Timedelta as td
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import datetime as dt
from pykalman import KalmanFilter
In [64]:
start = '2002-1-1'
end = '2015-12-1'

# p = pattern length (in bars)
p = 125
# o = outcome length (in bars)
o = 25
In [65]:
def ret_index(prices):
    '''
    cumulative percent return of 1 dollar invested in the instrument
    '''
    rets = prices.pct_change()
    index = (rets+1).cumprod()
    return (index-1)*100
In [66]:
def zscore(x):
    '''
    series x demeaned and normalized by its standard deviation
    '''
    demeaned = x-x.mean()
    return demeaned/x.std()
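
A quick sanity check of the two helpers on a toy series (my own example, not part of the original run):

In [ ]:
# ret_index turns prices into cumulative % returns; zscore has unit std by construction
toy = pd.Series([100., 101., 99., 102.])
print(ret_index(toy).values)   # approximately [nan, 1.0, -1.0, 2.0]
print(zscore(toy).std())       # 1.0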

1) identify the current pattern

In [67]:
px = get_pricing('SPY', start, end, fields= 'price')
i = len(px.index)-1
In [68]:
#verify
current_close = px.iloc[i]
current_date = px.index[i]
print current_date, current_close
2015-12-01 00:00:00+00:00 210.74
In [69]:
# current pattern
cp = px.iloc[i-p:i]

2) look back in history and find all patterns correlated to the current one

In [70]:
# historical data within which to search for similar patterns
h = px[:i-p]
In [71]:
def performance(x):
    # percent change from the first to the last value
    return (x[-1]-x[0])/abs(x[0])*100
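
A one-line check (my own example): a move from 100 to 110 is a performance of +10.

In [ ]:
print(performance(np.array([100., 110.])))   # -> 10.0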

This is how I check whether two patterns are similar:

In [72]:
def similar(x):
    '''
    condition: the window has to be correlated with cp according to the
    pearson correlation, and the correlation value has to exceed its p-value
    return: the correlation value, or NaN if the condition fails
    '''
    sp_corr_value, sp_corr_pvalue = stats.pearsonr(cp.values, x)

    if sp_corr_value < sp_corr_pvalue: # the correlation returned by pearsonr has to be greater than its p-value
        return np.NAN
    else:
        return sp_corr_value
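
To see what this filter accepts and rejects, a small synthetic check (my own example; it relies on cp already being defined above):

In [ ]:
# a noisy copy of cp should pass the filter; pure noise should usually fail
np.random.seed(0)
noisy_copy = cp.values + np.random.normal(0, cp.std(), p)
noise_only = np.random.normal(0, cp.std(), p)
print(similar(noisy_copy))   # high correlation, tiny p-value -> value returned
print(similar(noise_only))   # typically nan: correlation below its p-value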
In [73]:
correlation = pd.rolling_apply(h, p, similar) # apply the similar function to every rolling window of p bars in the history
correlation.name = 'corr'
In [74]:
correlation = correlation.dropna()
# drop the NaN values returned by the pearson correlation check
correlation.plot(style='ro')
Out[74]:
[plot: correlation of each historical p-bar window with the current pattern]

From this graph I see that many similar patterns have overlapping date ranges, so I want to group them and extract from each group the date with the maximum correlation value, so that every pattern covers unique dates. The next cell does this by marking a date as the start of a new group whenever the gap from the previous match exceeds 7 days, then padding that marker forward.
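
The same trick on a toy index (my own synthetic example):

In [ ]:
# two clusters of dates -> two group labels, padded forward
toy = pd.Series([0.1, 0.2, 0.3, 0.15],
                index=pd.to_datetime(['2002-01-01', '2002-01-02',
                                      '2002-03-01', '2002-03-02']))
gaps = toy.index.to_series().diff() / pd.Timedelta('1 days')
starts = toy.index.to_series().where(gaps.isnull() | (gaps > 7))
print(starts.fillna(method='pad').values)   # groups: 2002-01-01 and 2002-03-01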

In [75]:
df = pd.DataFrame(correlation).dropna()
df['date'] = df.index
df.columns = ['corr', 'date']
# gap in days between consecutive matches
df['delta'] = (df['date']-df['date'].shift(1))/pd.Timedelta('1 days')
# the first match has no predecessor: give it a gap large enough to start a group
df['delta'].fillna(p+1, inplace=True)
# a gap of more than 7 days starts a new group; pad the group label forward
df['eval'] = df.apply(lambda x: x['date'] if x['delta'] > 7 else np.NAN, axis=1)
df.fillna(method='pad', inplace=True)
df[:5]
Out[75]:
                               corr        date  delta        eval
2002-08-23 00:00:00+00:00  0.145151  2002-08-23    126  2002-08-23
2002-08-26 00:00:00+00:00  0.170073  2002-08-26      3  2002-08-23
2002-08-27 00:00:00+00:00  0.194336  2002-08-27      1  2002-08-23
2002-08-28 00:00:00+00:00  0.212647  2002-08-28      1  2002-08-23
2002-08-29 00:00:00+00:00  0.233133  2002-08-29      1  2002-08-23
In [76]:
# list of dates corresponding to the maximum correlation value of each group
# (in this pandas version Series.argmax returns the index label, i.e. idxmax)
pat = df['corr'].groupby(df['eval']).apply(lambda x: x.argmax()).values
pat
Out[76]:
array([Timestamp('2002-12-10 00:00:00+0000', tz='UTC'),
       Timestamp('2003-05-08 00:00:00+0000', tz='UTC'),
       Timestamp('2004-07-21 00:00:00+0000', tz='UTC'),
       Timestamp('2004-10-13 00:00:00+0000', tz='UTC'),
       Timestamp('2005-06-27 00:00:00+0000', tz='UTC'),
       Timestamp('2005-12-29 00:00:00+0000', tz='UTC'),
       Timestamp('2006-09-18 00:00:00+0000', tz='UTC'),
       Timestamp('2007-06-05 00:00:00+0000', tz='UTC'),
       Timestamp('2007-11-01 00:00:00+0000', tz='UTC'),
       Timestamp('2008-05-14 00:00:00+0000', tz='UTC'),
       Timestamp('2008-09-26 00:00:00+0000', tz='UTC'),
       Timestamp('2009-01-16 00:00:00+0000', tz='UTC'),
       Timestamp('2009-05-27 00:00:00+0000', tz='UTC'),
       Timestamp('2010-05-04 00:00:00+0000', tz='UTC'),
       Timestamp('2010-08-30 00:00:00+0000', tz='UTC'),
       Timestamp('2011-11-14 00:00:00+0000', tz='UTC'),
       Timestamp('2012-08-23 00:00:00+0000', tz='UTC'),
       Timestamp('2013-02-06 00:00:00+0000', tz='UTC'),
       Timestamp('2014-12-19 00:00:00+0000', tz='UTC')], dtype=object)
In [77]:
max_corr = df['corr'].loc[list(pat)]
# keep the 10 most correlated patterns (order() is the legacy name of sort_values())
max_corr.order(ascending=False, inplace=True)
max_corr = max_corr[:10]
pat_names = max_corr.index
In [78]:
correlation.plot(style='ro', alpha= .2)
max_corr.plot(style= 'ko')
Out[78]:
[plot: all correlation values (faded red) with the 10 retained maxima in black]

For each of those dates, identify its pattern (the p bars behind it) and its outcome (the o bars after it):

In [79]:
# for each matched date: the p bars up to and including it (the pattern)...
pat_list = [px.iloc[(i+1)-p:i+1] for i, date in enumerate(px.index) if date in pat_names]
# ...and the o bars that follow it (the outcome)
out_list = [px.iloc[(i+1):(i+1)+o] for i, date in enumerate(px.index) if date in pat_names]

DataFrames of patterns and outcomes:

In [80]:
df_pat = pd.concat(pat_list, axis= 1)
df_out = pd.concat(out_list, axis=1)
df_pat.columns= pat_names
df_out.columns= pat_names
In [81]:
ri = ret_index(cp).fillna(0).values
for d in pat_names:
    q_p = df_pat[d].dropna()
    q_o = df_out[d].dropna()
    q_p = q_p.reset_index(drop=True)
    q_o = q_o.reset_index(drop=True)
    r_p = ret_index(q_p).fillna(0)
    r_o = ret_index(q_o).fillna(0)
    r_p = r_p.values
    r_o = r_o.values
    # anchor each pattern so it ends at 0 and each outcome so it starts at 0
    plt.plot(range(p), r_p-r_p[-1])
    plt.plot([_+(p-1) for _ in range(o)], r_o-r_o[0])
# the current pattern, also anchored to end at 0, in black
plt.plot(ri-ri[-1], 'k')
plt.show()
df_pat.plot()
plt.show()
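
The notebook stops at visualizing the matched patterns and their outcomes. Step 3) of the plan would check how consistent those outcomes are before trusting them; a minimal sketch of that aggregation using the performance helper defined above (my own addition, not part of the original run):

In [ ]:
# aggregate the matched outcomes: their dispersion measures how consistent
# the signal is, their mean is the expected % move over the next o bars
out_perf = df_out.apply(lambda col: performance(col.dropna().values))
print(out_perf.describe())
print(out_perf.mean())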