Notebook

Smooth PCA

An experiment to see if smoothing of prices affects PCA. The test applies classic statistical arbitrage strategy on smoothed prices and raw prices. We notice that there is significant increase in IC after smoothing prices also from the graphs the residuals appear to be more mean reverting when prices are smoothed.

In [112]:
from quantopian.pipeline.data import morningstar
from quantopian.pipeline.filters import QTradableStocksUS
from quantopian.pipeline.factors.morningstar import MarketCap
from quantopian.pipeline.classifiers.morningstar import Sector
from quantopian.pipeline import Pipeline
from quantopian.research import run_pipeline
from quantopian.pipeline.data.builtin import USEquityPricing
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA

study_date = "2019-01-01"
minprice = USEquityPricing.close.latest > 10
maxprice = USEquityPricing.close.latest < 100
pipe = Pipeline(columns= { 'market_cap': MarketCap(), 'sector': Sector() }, 
                screen=QTradableStocksUS() & minprice & maxprice)

res = run_pipeline(pipe, study_date, study_date)
res.sort_values('market_cap', ascending=False, inplace=True)

stocks = res[res.sector==Sector.ENERGY].index.droplevel(0)  # drop the single date from the multi-index
stocks = stocks
pricing = get_pricing(symbols=stocks, fields='close_price',
                      start_date=pd.Timestamp(study_date) - pd.DateOffset(months=36),
                      end_date=pd.Timestamp(study_date)).dropna(axis=1)

prices = pricing.dropna(axis=1)
In [113]:
returns = prices.pct_change().dropna().values
factors = PCA(0.85).fit_transform(returns)
sm_returns = prices.ewm(span=15).mean().dropna().pct_change().dropna().values
sm_factors = PCA(0.85).fit(sm_returns).transform(sm_returns)
In [114]:
print factors.shape
print sm_factors.shape
(753, 17)
(753, 12)
In [115]:
import statsmodels.api as smapi

model = smapi.OLS(returns, smapi.add_constant(factors)).fit()
sm_model = smapi.OLS(sm_returns, smapi.add_constant(sm_factors)).fit()

import matplotlib.pyplot as plt
plt.subplot(211)
plt.plot(model.resid[:, 1].cumsum())
plt.subplot(212)
plt.plot(sm_model.resid[:, 1].cumsum())
Out[115]:
[<matplotlib.lines.Line2D at 0x7fe7f9b3f310>]
In [116]:
K = 90
W = np.zeros((prices.shape[0] - K, prices.shape[1]))
sm_W = np.zeros((prices.shape[0] - K, prices.shape[1]))

for ij in range(K, prices.shape[0]):
    P = prices.iloc[ij - K:ij+1, :]
    R = P.pct_change().dropna().values
    sm_R = P.ewm(span=15).mean().dropna().pct_change().dropna().values
    F = PCA(.75).fit_transform(R)
    sm_F = PCA(.75).fit_transform(sm_R)
    
    m = smapi.OLS(R, smapi.add_constant(F)).fit()
    sm_m = smapi.OLS(sm_R, smapi.add_constant(sm_F)).fit()
    
    res = m.resid.cumsum(axis=0)
    sm_res = sm_m.resid.cumsum(axis=0)
    
    scores = np.zeros(res.shape[1])
    sm_scores = np.zeros(sm_res.shape[1])
    
    for i in range(0, len(scores)):
        m = smapi.OLS(res[1:, i], smapi.add_constant(res[:-1, i])).fit()
        mu = m.params[0] / (1. + m.params[1])
        scores[i] = mu - res[-1, i]
        
        m = smapi.OLS(sm_res[1:, i], smapi.add_constant(sm_res[:-1, i])).fit()
        mu = m.params[0] / (1. + m.params[1])
        sm_scores[i] = mu - sm_res[-1, i]
    W[ij-K] = scores
    sm_W[ij-K] = sm_scores
In [117]:
import alphalens

factors = pd.DataFrame(W[:-1], columns=prices.columns, index=prices.index[K+1:]).stack()
factor_data = alphalens.utils.get_clean_factor_and_forward_returns(
    factors,
    prices,
    quantiles=2,
    periods=(1,), 
    filter_zscore=None)
alphalens.tears.create_information_tear_sheet(factor_data)
Dropped 0.2% entries from factor data: 0.2% in forward returns computation and 0.0% in binning phase (set max_loss=0 to see potentially suppressed Exceptions).
max_loss is 35.0%, not exceeded: OK!
Information Analysis
1D
IC Mean 0.008
IC Std. 0.140
Risk-Adjusted IC 0.060
t-stat(IC) 1.551
p-value(IC) 0.121
IC Skew -0.034
IC Kurtosis 0.316
<matplotlib.figure.Figure at 0x7fe7fa29a150>
In [118]:
factors = pd.DataFrame(sm_W[:-1], columns=prices.columns, index=prices.index[K+1:]).stack()
factor_data = alphalens.utils.get_clean_factor_and_forward_returns(
    factors,
    prices,
    quantiles=2,
    periods=(1,), 
    filter_zscore=None)
alphalens.tears.create_information_tear_sheet(factor_data)
Dropped 0.2% entries from factor data: 0.2% in forward returns computation and 0.0% in binning phase (set max_loss=0 to see potentially suppressed Exceptions).
max_loss is 35.0%, not exceeded: OK!
Information Analysis
1D
IC Mean 0.010
IC Std. 0.144
Risk-Adjusted IC 0.072
t-stat(IC) 1.849
p-value(IC) 0.065
IC Skew -0.041
IC Kurtosis 0.194
<matplotlib.figure.Figure at 0x7fe7faae0290>