Notebook

Classical statistical arbitrage and maximum mean reversion

Pravin Bezwada

Feb 2018

Classical PCA based statistical arbitrage is described here: https://www.math.nyu.edu/faculty/avellane/AvellanedaLeeStatArb071008.pdf

In [122]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm

import numpy as np
import pandas as pd
import scipy as sp
import statsmodels.stats.api as smapi
from statsmodels.tsa.stattools import adfuller
from sklearn.decomposition import PCA, FastICA
from sklearn.linear_model import LinearRegression, MultiTaskLassoCV

from quantopian.pipeline.data import morningstar
from quantopian.pipeline.filters import QTradableStocksUS
from quantopian.pipeline.factors.morningstar import MarketCap
from quantopian.pipeline.classifiers.morningstar import Sector
from quantopian.pipeline import Pipeline
from quantopian.research import run_pipeline
from quantopian.pipeline.data.builtin import USEquityPricing

Build universe of stock returns

In [116]:
study_date = "2018-02-01"
In [117]:
minprice = USEquityPricing.close.latest > 10
maxprice = USEquityPricing.close.latest < 200
pipe = Pipeline(columns= { 'market_cap': MarketCap(), 'sector': Sector() }, 
                screen=QTradableStocksUS() & minprice & maxprice)

res = run_pipeline(pipe, study_date, study_date)
res.sort_values('market_cap', ascending=False, inplace=True)
stocks = res.index.droplevel(0)  # drop the single date from the multi-index
print stocks.shape
(1900,)
In [168]:
# select top 250 stocks
pricing = get_pricing(symbols=stocks, fields='close_price',
                      start_date=pd.Timestamp(study_date) - pd.DateOffset(months=24),
                      end_date=pd.Timestamp(study_date))

returns = pricing.pct_change()
returns = returns.iloc[1:,:].dropna(axis=1)
returns = np.log1p(returns)
print returns.shape
(505, 1780)

decompose returns and find factors

In [176]:
oSample = 400

returnsIn = returns.values[:oSample, :]
returnsOut = returns.values[oSample:, :]

from sklearn.covariance import OAS

cov = OAS().fit(returnsIn).covariance_
e,v = np.linalg.eigh(cov)
comp = v[:, :25].real
factors=  np.dot(returnsIn, comp)
print factors.shape
(400, 25)

Regress returns on factors

In [177]:
lrObject = LinearRegression(fit_intercept=False).fit(factors, returnsIn)
residuals = returnsIn - lrObject.predict(factors)
print residuals.shape
(400, 1780)

Verify stationarity of cumulative residual sums

In [178]:
residual_sums = residuals.cumsum(axis=0)

pvals = {} # store standard deviations of residual sums

for i in range(0, residual_sums.shape[1]):
    pval = adfuller(residual_sums[:, i], regression='nc')[1] 
    if  pval < 0.0001:
        pvals[i] = pval
        
print len(pvals)
selection = len(pvals)
3

Plot the top few residual sums

In [179]:
import operator
sorted_indices = sorted(pvals.items(), key=operator.itemgetter(1))


plt.figure(1)
for i in range(0, 2):
    print sorted_indices[i][1]
    plt.subplot(2*100+10+i+1)
    plt.plot(residual_sums[:, sorted_indices[i][0]]);
4.43476631354e-06
9.56538161988e-06

Compute portfolio weights for each residual sum

In [180]:
W = np.zeros((selection, returns.shape[1])) # weight of 3 portfolios

for i in range(0, selection):
    K = sorted_indices[i][0]
    betas = lrObject.coef_[K, :]
    W[i, K] = 1.
    C = comp.T
    for j in range(0, C.shape[0]):
        W[i, :] -= betas[j] * C[j, :] 
    W[i, :] /= np.sum(np.abs(W[i, :]))

Plot out of sample reversion for top portfolios

In [181]:
plt.figure(1)
for i in range(0, selection):
    r_oos = np.dot(returnsOut.cumsum(axis=0), W[i, :])

    if i < 2:
        plt.subplot(2*100+10+i+1)
        plt.plot(r_oos);

Combine all portfolio and check OOS

In [182]:
plt.plot(np.sum(np.dot(returnsOut.cumsum(axis=0), W.T), axis=1), color='red');