Pravin Bezwada
Feb 2018
Classical PCA based statistical arbitrage is described here: https://www.math.nyu.edu/faculty/avellane/AvellanedaLeeStatArb071008.pdf
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
import pandas as pd
import scipy as sp
import statsmodels.stats.api as smapi
from statsmodels.tsa.stattools import adfuller
from sklearn.decomposition import PCA, FastICA
from sklearn.linear_model import LinearRegression, MultiTaskLassoCV
from quantopian.pipeline.data import morningstar
from quantopian.pipeline.filters import QTradableStocksUS
from quantopian.pipeline.factors.morningstar import MarketCap
from quantopian.pipeline.classifiers.morningstar import Sector
from quantopian.pipeline import Pipeline
from quantopian.research import run_pipeline
from quantopian.pipeline.data.builtin import USEquityPricing
study_date = "2018-02-01"
minprice = USEquityPricing.close.latest > 10
maxprice = USEquityPricing.close.latest < 200
pipe = Pipeline(columns= { 'market_cap': MarketCap(), 'sector': Sector() },
screen=QTradableStocksUS() & minprice & maxprice)
res = run_pipeline(pipe, study_date, study_date)
res.sort_values('market_cap', ascending=False, inplace=True)
stocks = res.index.droplevel(0) # drop the single date from the multi-index
print stocks.shape
# select top 250 stocks
pricing = get_pricing(symbols=stocks, fields='close_price',
start_date=pd.Timestamp(study_date) - pd.DateOffset(months=24),
end_date=pd.Timestamp(study_date))
returns = pricing.pct_change()
returns = returns.iloc[1:,:].dropna(axis=1)
returns = np.log1p(returns)
print returns.shape
oSample = 400
returnsIn = returns.values[:oSample, :]
returnsOut = returns.values[oSample:, :]
from sklearn.covariance import OAS
cov = OAS().fit(returnsIn).covariance_
e,v = np.linalg.eigh(cov)
comp = v[:, :25].real
factors= np.dot(returnsIn, comp)
print factors.shape
lrObject = LinearRegression(fit_intercept=False).fit(factors, returnsIn)
residuals = returnsIn - lrObject.predict(factors)
print residuals.shape
residual_sums = residuals.cumsum(axis=0)
pvals = {} # store standard deviations of residual sums
for i in range(0, residual_sums.shape[1]):
pval = adfuller(residual_sums[:, i], regression='nc')[1]
if pval < 0.0001:
pvals[i] = pval
print len(pvals)
selection = len(pvals)
import operator
sorted_indices = sorted(pvals.items(), key=operator.itemgetter(1))
plt.figure(1)
for i in range(0, 2):
print sorted_indices[i][1]
plt.subplot(2*100+10+i+1)
plt.plot(residual_sums[:, sorted_indices[i][0]]);
W = np.zeros((selection, returns.shape[1])) # weight of 3 portfolios
for i in range(0, selection):
K = sorted_indices[i][0]
betas = lrObject.coef_[K, :]
W[i, K] = 1.
C = comp.T
for j in range(0, C.shape[0]):
W[i, :] -= betas[j] * C[j, :]
W[i, :] /= np.sum(np.abs(W[i, :]))
plt.figure(1)
for i in range(0, selection):
r_oos = np.dot(returnsOut.cumsum(axis=0), W[i, :])
if i < 2:
plt.subplot(2*100+10+i+1)
plt.plot(r_oos);
plt.plot(np.sum(np.dot(returnsOut.cumsum(axis=0), W.T), axis=1), color='red');