An experiment to see whether smoothing prices affects PCA. The test applies a classic statistical-arbitrage strategy to both smoothed prices and raw prices. We notice a significant increase in IC after smoothing the prices; in addition, the graphs suggest that the residuals are more mean-reverting when prices are smoothed.
from quantopian.pipeline.data import morningstar
from quantopian.pipeline.filters import QTradableStocksUS
from quantopian.pipeline.factors.morningstar import MarketCap
from quantopian.pipeline.classifiers.morningstar import Sector
from quantopian.pipeline import Pipeline
from quantopian.research import run_pipeline
from quantopian.pipeline.data.builtin import USEquityPricing
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
# Date of the universe snapshot; it is also the last day of the price history.
study_date = "2019-01-01"

# Price band for the screen: latest close strictly between 10 and 100.
latest_close = USEquityPricing.close.latest
minprice = latest_close > 10
maxprice = latest_close < 100

# One-day pipeline that reports market cap and sector for every stock that
# passes the tradable-universe filter and the price band.
universe_screen = QTradableStocksUS() & minprice & maxprice
pipe = Pipeline(
    columns={
        'market_cap': MarketCap(),
        'sector': Sector(),
    },
    screen=universe_screen,
)
res = run_pipeline(pipe, study_date, study_date)
res = res.sort_values('market_cap', ascending=False)

# Energy-sector rows only, ordered by descending market cap.
energy_rows = res[res.sector == Sector.ENERGY]
stocks = energy_rows.index.droplevel(0)  # drop the single date from the multi-index

# Close prices over the 36 months that end on the study date.
history_end = pd.Timestamp(study_date)
history_start = history_end - pd.DateOffset(months=36)
pricing = get_pricing(
    symbols=stocks,
    fields='close_price',
    start_date=history_start,
    end_date=history_end,
).dropna(axis=1)  # discard any column that has a missing price

# `pricing` has no NaN columns left, so this second pass returns an equal copy.
prices = pricing.dropna(axis=1)
# Simple returns of the raw close prices as a plain (days x assets) array.
returns = prices.pct_change().dropna().values

# Keep as many principal components as are needed to explain 85% of the
# variance; the transformed data are used as factor return series.
factors = PCA(0.85).fit_transform(returns)

# Same pipeline on prices smoothed with a 15-span exponentially weighted mean.
smoothed_prices = prices.ewm(span=15).mean().dropna()
sm_returns = smoothed_prices.pct_change().dropna().values
sm_factors = PCA(0.85).fit(sm_returns).transform(sm_returns)

# Parenthesised single-argument form prints identically under Python 2 and is
# also valid Python 3, unlike the bare `print x` statement.
print(factors.shape)
print(sm_factors.shape)
import statsmodels.api as smapi
import matplotlib.pyplot as plt

# Regress every asset's returns on the PCA factors (plus an intercept); the
# residuals are the part of each return series the factors do not explain.
raw_design = smapi.add_constant(factors)
model = smapi.OLS(returns, raw_design).fit()

smoothed_design = smapi.add_constant(sm_factors)
sm_model = smapi.OLS(sm_returns, smoothed_design).fit()

# Cumulative residual of column 1 (the second asset column): raw prices in
# the top panel, smoothed prices in the bottom panel.
for panel, fitted in enumerate((model, sm_model), start=1):
    plt.subplot(2, 1, panel)
    plt.plot(fitted.resid[:, 1].cumsum())
def _ar1_reversion_score(path):
    """Return the gap between the AR(1) equilibrium of `path` and its last value.

    Fits ``path[t + 1] = a + b * path[t] + noise`` by OLS. The stationary mean
    of that process is ``a / (1 - b)``. A positive score means the path
    currently sits below its estimated equilibrium.
    """
    fit = smapi.OLS(path[1:], smapi.add_constant(path[:-1])).fit()
    intercept, slope = fit.params[0], fit.params[1]
    # The original divided by (1 + b), which is not the AR(1) mean.
    equilibrium = intercept / (1. - slope)
    return equilibrium - path[-1]


def _window_scores(window_returns):
    """Return one mean-reversion score per asset for a window of returns.

    PCA factors explaining 75% of the variance are regressed out of the
    returns; the cumulative residual of each asset is then scored with
    `_ar1_reversion_score`.
    """
    window_factors = PCA(.75).fit_transform(window_returns)
    fit = smapi.OLS(window_returns, smapi.add_constant(window_factors)).fit()
    cum_resid = fit.resid.cumsum(axis=0)
    return np.array([_ar1_reversion_score(cum_resid[:, col])
                     for col in range(cum_resid.shape[1])])


# Look-back length, in return observations, of each rolling window.
K = 90

# Row t holds the scores computed from the window that ends on price row K + t.
n_days, n_assets = prices.shape
W = np.zeros((n_days - K, n_assets))
sm_W = np.zeros((n_days - K, n_assets))

for ij in range(K, n_days):
    # K + 1 consecutive prices give K returns.
    P = prices.iloc[ij - K:ij + 1, :]
    R = P.pct_change().dropna().values
    # The smoothing restarts inside every window, so it only sees window data.
    sm_R = P.ewm(span=15).mean().dropna().pct_change().dropna().values

    W[ij - K] = _window_scores(R)
    sm_W[ij - K] = _window_scores(sm_R)
import alphalens

# The last score row is dropped and the rest are labelled with
# prices.index[K + 1:], so the score built from data through price row K + t
# is dated K + t + 1, one bar after the data it used.
signal_dates = prices.index[K + 1:]

# Information tear sheet for the raw-price signal first, then the smoothed
# one. After the loop `factors` and `factor_data` hold the smoothed variant.
for weights in (W, sm_W):
    factors = pd.DataFrame(
        weights[:-1],
        columns=prices.columns,
        index=signal_dates,
    ).stack()
    factor_data = alphalens.utils.get_clean_factor_and_forward_returns(
        factors,
        prices,
        quantiles=2,
        periods=(1,),
        filter_zscore=None,
    )
    alphalens.tears.create_information_tear_sheet(factor_data)