This analysis computes the Spearman rank correlation between an alpha factor and 1-, 5-, and 10-day forward price movements. We disaggregate by sector to isolate sector-specific trends.
from __future__ import division
from quantopian.pipeline import Pipeline
from quantopian.pipeline import CustomFactor
from quantopian.research import run_pipeline
from quantopian.pipeline.data import morningstar
from quantopian.pipeline.data.builtin import USEquityPricing
from quantopian.pipeline.factors import Latest
import numpy as np
import pandas as pd
import scipy as sp
import pyfolio as pf
import matplotlib.pyplot as plt
# Analysis window: pipeline output and IC statistics are computed over these dates.
start_date = pd.to_datetime('2014-10-1')
end_date = pd.to_datetime('2015-2-1')
Here's the alpha we are looking to analyze:
alpha_name = "alpha101"
class Alpha101(CustomFactor):
    """Daily price-action factor: (close - open) / (high - low).

    The previous close stands in for today's open (workaround for a
    known open-price bug), and .001 is added to the denominator to
    avoid dividing by zero on zero-range days.
    """
    inputs = [USEquityPricing.close, USEquityPricing.high, USEquityPricing.low]
    window_length = 2

    def compute(self, today, assets, out, close, high, low):
        # close[-2] (yesterday's close) proxies for today's open.
        move = close[-1] - close[-2]
        day_range = high[-1] - low[-1]
        out[:] = move / (day_range + .001)
We'll also use a five day liquidity factor to ensure we are only dealing with liquid names:
class Liquidity(CustomFactor):
    """Average daily dollar volume over the trailing five sessions."""
    inputs = [USEquityPricing.volume, USEquityPricing.close]
    window_length = 5

    def compute(self, today, assets, out, volume, close):
        # (window_length, n_assets) dollar volume, averaged down the window.
        dollar_volume = volume * close
        out[:] = dollar_volume.mean(axis=0)
Pull in sector codes for sector analysis:
class Sector(CustomFactor):
"""Most recent Morningstar sector code for each asset."""
inputs = [morningstar.asset_classification.morningstar_sector_code]
window_length = 1
def compute(self, today, assets, out, msc):
# msc has one row per day in the window; take the latest value.
out[:] = msc[-1]
# Instantiate the three factors defined above.
alpha101 = Alpha101()
sector = Sector()
liquidity = Liquidity()

# Rank 1 = most liquid name.
liquidity_rank = liquidity.rank(ascending=False)

# Restrict to the 999 most liquid names.  The alpha101.eq(alpha101) /
# sector.eq(sector) comparisons are a handy trick to filter out rows
# containing NaNs (NaN != NaN).
ok_universe = (1000 > liquidity_rank) & alpha101.eq(alpha101) & sector.eq(sector)

alpha101_rank = alpha101.rank(mask=ok_universe, ascending=False)

pipe = Pipeline()
pipe.add(alpha101, alpha_name)
pipe.add(sector, 'sector_code')
pipe.set_screen(ok_universe)
pipe.show_graph(format='png')

daily_alpha = run_pipeline(pipe, start_date=start_date, end_date=end_date)
# Flatten the (date, security) MultiIndex into plain columns.
daily_alpha = daily_alpha.reset_index().rename(columns={'level_0': 'date', 'level_1': 'sid'})
daily_alpha.head()
# BUG FIX: the original ended this cell with `len(equities)`, which raised a
# NameError -- `equities` is not defined until the next cell.  Removed.
Now pull in pricing data and compute 1, 5, and 10 day forward price movements.
# Integer sids for every security that appears in the pipeline output.
equities = daily_alpha.sid.unique()
equities = [asset.sid for asset in equities]

# Fetch pricing a few weeks past end_date so the 10-day forward window
# is populated for the final dates of the analysis period.
lookahead = pd.Timedelta(days=25)
prices = get_pricing(equities, start_date=start_date,
                     end_date=end_date + lookahead, fields='open_price')

# Template for the forward-return column names.
col_n = '%s_day_fwd_price_pct_change'

daily_alpha = daily_alpha.set_index(['date', 'sid'])
for horizon in [1, 5, 10]:
    # pct_change(h).shift(-h) aligns the h-day forward return with day t.
    fwd_move = prices.pct_change(horizon).shift(-horizon)
    daily_alpha[col_n % horizon] = fwd_move.stack()
daily_alpha = daily_alpha.reset_index()
daily_alpha.head()
Now compute the Spearman Rank correlation within each day and sector.
# Template for the IC column names.
cn = "%s_day_IC"

def src_fn(x):
    # `x` is one forward-return column restricted to a single
    # (date, sector) group; x.name is the name of that column.
    # Correlate the alpha values for those rows against the returns.
    factor_vals = daily_alpha.loc[x.index, alpha_name]
    fwd_returns = daily_alpha.loc[x.index, x.name]
    return sp.stats.spearmanr(factor_vals, fwd_returns)[0]

# Spearman rank IC within each day and sector, one column per horizon.
src = (daily_alpha
       .groupby(['date', 'sector_code'])
       .agg({col_n % h: src_fn for h in (1, 5, 10)}))
src = src.rename(columns={col_n % h: cn % h for h in (1, 5, 10)})
src = src[[cn % 1, cn % 5, cn % 10]]
src = src.reset_index()
# Morningstar sector code -> human-readable sector name (11 sectors total).
sector_names = {
101: 'Basic Materials',
102: 'Consumer Cyclical',
103: 'Financial Services',
104: 'Real Estate',
205: 'Consumer Defensive',
206: 'Healthcare',
207: 'Utilities',
308: 'Communication Services',
309: 'Energy',
310: 'Industrials',
311: 'Technology' ,
}
# One time-series plot of the daily IC per sector.
# FIX: corrected "Coeficient" -> "Coefficient" in the plot title.
for sc, cor in src.groupby('sector_code'):
    t = alpha_name + ": " + sector_names[sc] + " Information Coefficient"
    cor.drop('sector_code', axis=1).set_index('date').plot(title=t)
def src_std_error(rho, n):
    """Approximate standard error of a rank correlation estimate.

    rho : correlation value(s) -- scalar or array/DataFrame.
    n   : number of observations behind each estimate (same shape as rho
          or broadcastable to it).
    """
    numerator = 1 - rho ** 2
    return numerator / np.sqrt(n - 1)
# Collapse each date to the first day of its month so rows can be grouped
# into calendar months.
z = lambda x: x.replace(day=1)
c = daily_alpha.reset_index()['date'].apply(z)
c.index = daily_alpha.index
daily_alpha['y_m'] = c
monthly_src = pd.DataFrame()
# Spearman rank IC of the alpha vs. one forward-return column; `x` is that
# column restricted to a single (month, sector) group, and x.name is the
# name of the column being aggregated.
src_fn = lambda x: sp.stats.spearmanr(daily_alpha.loc[x.index, alpha_name],
daily_alpha.loc[x.index, x.name])[0]
monthly_src = daily_alpha.groupby(['y_m', 'sector_code']).agg({col_n%1: src_fn,
col_n%5: src_fn,
col_n%10: src_fn})
# Observation counts per (month, sector) feed the standard-error formula.
obs_count = daily_alpha.groupby(['y_m', 'sector_code']).count().drop(alpha_name, axis=1)
src_std_error_df = src_std_error(monthly_src, obs_count)
# Rename the forward-return columns to their IC names and fix column order.
monthly_src = monthly_src.rename(columns={col_n%1: cn%1, col_n%5: cn%5,col_n%10: cn%10})
monthly_src = monthly_src[[cn%1, cn%5, cn%10]]
monthly_src = monthly_src.reset_index()
src_std_error_df = src_std_error_df.rename(columns={col_n%1: cn%1, col_n%5: cn%5,col_n%10: cn%10})
src_std_error_df = src_std_error_df[[cn%1, cn%5, cn%10]]
src_std_error_df = src_std_error_df.reset_index()
# Bar-chart the monthly IC per sector: a 6x2 grid of subplots (11 sectors,
# last axis unused), with the approximate standard error as error bars.
f, axes = plt.subplots(6,2, sharex=True, sharey=True, figsize=(20,45))
axes = axes.flatten()
i = 0
for sc, cor in monthly_src.groupby(['sector_code']):
# Standard errors for this sector, aligned on the month index.
err = src_std_error_df[src_std_error_df.sector_code == sc].set_index('y_m')
cor.drop('sector_code', axis=1).set_index('y_m').plot(kind='bar',
title=sector_names[sc],
ax=axes[i],
yerr=err)
i+=1
fig = plt.gcf()
fig.suptitle(alpha_name + " Monthly Information Coefficient by Sector", fontsize=24, x=.5, y=.93)
all_src = pd.DataFrame()
# Spearman rank IC of the alpha vs. one forward-return column, computed over
# the entire sample period per sector (x.name is the aggregated column).
src_fn = lambda x: sp.stats.spearmanr(daily_alpha.loc[x.index, alpha_name],
daily_alpha.loc[x.index, x.name])[0]
all_src = daily_alpha.groupby(['sector_code']).agg({col_n%1: src_fn,
col_n%5: src_fn,
col_n%10: src_fn})
# Observation counts per sector feed the standard-error formula.
obs_count = daily_alpha.groupby(['sector_code']).count().drop(alpha_name, axis=1)
src_std_error_df = src_std_error(all_src, obs_count)
# Rename the forward-return columns to their IC names and fix column order.
all_src = all_src.rename(columns={col_n%1: cn%1, col_n%5: cn%5,col_n%10: cn%10})
all_src = all_src[[cn%1, cn%5, cn%10]]
all_src = all_src.reset_index()
src_std_error_df = src_std_error_df.rename(columns={col_n%1: cn%1, col_n%5: cn%5,col_n%10: cn%10})
src_std_error_df = src_std_error_df[[cn%1, cn%5, cn%10]]
src_std_error_df = src_std_error_df.reset_index()
# Replace numeric sector codes with readable names for the chart axis.
all_src.sector_code = all_src.sector_code.apply(lambda x: sector_names[x])
src_std_error_df.sector_code = src_std_error_df.sector_code.apply(lambda x: sector_names[x])
t = "{} Information Coefficient ({} - {})".format(alpha_name,
start_date.strftime('%m/%d/%Y'),
end_date.strftime('%m/%d/%Y'))
all_src.set_index('sector_code').plot(kind='bar',
title=t,
yerr=src_std_error_df.set_index('sector_code'))
# pd.qcut(x, 5, labels=False) couldn't get this working...
def quint_bucket(x):
    """Map a percentile value in [0, 1] to a quintile bucket 1-5.

    Values above .8 -> 5, above .6 -> 4, above .4 -> 3, above .2 -> 2,
    and everything else (including NaN) -> 1.
    """
    for cutoff, bucket in ((.8, 5), (.6, 4), (.4, 3), (.2, 2)):
        if x > cutoff:
            return bucket
    return 1
# Sort so the (date, sector_code) groups are contiguous, then compute each
# stock's within-group percentile of the alpha value.
daily_alpha = daily_alpha.sort(['date', 'sector_code']).set_index(['date', 'sector_code'])
# BUG FIX: the original used transform(lambda x: quint_bucket(np.percentile(x))),
# which (a) raises a TypeError because np.percentile requires a percentile
# argument `q`, and (b) would have assigned a single scalar bucket to every row
# of the group before bucketing *again* at the apply() below.  rank(pct=True)
# gives each row its own percentile in (0, 1], which is what quint_bucket expects.
daily_alpha['alpha_percentile'] = daily_alpha.alpha101.groupby(
    level=['date', 'sector_code']).transform(lambda x: x.rank(pct=True))
daily_alpha = daily_alpha.reset_index()
# Bucket the per-row percentiles into quintiles 1-5.
daily_alpha['quint_bucket'] = daily_alpha.alpha_percentile.apply(quint_bucket)
# Mean forward return for each (sector, quintile) cell, one column per horizon.
ret_cols = [col_n % 1, col_n % 5, col_n % 10]
mean_ret_by_q = daily_alpha.groupby(['sector_code', 'quint_bucket']).mean()[ret_cols]

# 6x2 grid of bar charts (11 sectors, last axis unused).
f, axes = plt.subplots(6, 2, sharex=False, sharey=True, figsize=(20, 45))
axes = axes.flatten()
for i, (sc, cor) in enumerate(mean_ret_by_q.groupby(level='sector_code')):
    panel = cor.reset_index().drop('sector_code', axis=1).set_index('quint_bucket')
    panel.plot(kind='bar', title=sector_names[sc], ax=axes[i])
    axes[i].set_xlabel('factor quintile')
    axes[i].set_ylabel('mean price % change')

fig = plt.gcf()
fig.suptitle(alpha_name + ": Mean Return By Factor Quintile", fontsize=24, x=.5, y=.93)