
101 Alphas Project: Pipeline Factor Information Coefficient

This analysis computes the Spearman rank correlation between an alpha factor and 1-, 5-, and 10-day forward price movements. We disaggregate by sector to isolate sector-specific trends.
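
As a quick illustration (a minimal sketch on made-up numbers, separate from the pipeline below): the information coefficient for a single day is just the Spearman rank correlation between that day's factor values and the subsequent returns.

In [ ]:
import scipy as sp
import scipy.stats
# toy factor values and forward returns for five hypothetical assets
factor_values = [0.9, 0.1, -0.3, 0.5, -0.7]
fwd_returns = [0.02, 0.00, -0.01, 0.01, -0.02]
ic, p_value = sp.stats.spearmanr(factor_values, fwd_returns)
print(ic)  # 1.0 here, since the two rank orderings agree perfectly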

In [188]:
from __future__ import division
from quantopian.pipeline import Pipeline
from quantopian.pipeline import CustomFactor
from quantopian.research import run_pipeline
from quantopian.pipeline.data import morningstar
from quantopian.pipeline.data.builtin import USEquityPricing
from quantopian.pipeline.factors import Latest
import numpy as np
import pandas as pd
import scipy as sp
import pyfolio as pf
import matplotlib.pyplot as plt
In [162]:
start_date = pd.to_datetime('2014-10-1')
end_date = pd.to_datetime('2015-2-1')

Here's the alpha we are looking to analyze:

In [163]:
alpha_name = "alpha101"
class Alpha101(CustomFactor):
    inputs = [USEquityPricing.close, USEquityPricing.high, USEquityPricing.low]
    window_length = 2
    
    def compute(self, today, assets, out, close, high, low):
        # Using the close of t-1 as the open of t; workaround for a known bug in the open field.
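        # Daily change scaled by the day's high-low range; the .001 guards against a zero range.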
        out[:] = (close[-1] - close[-2]) / ((high[-1] - low[-1]) + .001)

We'll also use a five-day liquidity factor to ensure we only deal with liquid names:

In [164]:
class Liquidity(CustomFactor):   
    inputs = [USEquityPricing.volume, USEquityPricing.close] 
    window_length = 5

    def compute(self, today, assets, out, volume, close): 
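        # Average daily dollar volume (close * volume) over the trailing window.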
        out[:] = (volume * close).mean(axis=0)

Pull in sector codes for sector analysis:

In [165]:
class Sector(CustomFactor):
    inputs = [morningstar.asset_classification.morningstar_sector_code]
    window_length = 1
    def compute(self, today, assets, out, msc):
        out[:] = msc[-1]
In [166]:
alpha101 = Alpha101()
sector = Sector()
liquidity = Liquidity()

liquidity_rank = liquidity.rank(ascending=False)
# the alpha101.eq(alpha101) bit is a handy trick to filter out rows containing NaNs.
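# (1000 > liquidity_rank) keeps roughly the 1,000 most liquid names (rank 1 = highest dollar volume).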
ok_universe = (1000 > liquidity_rank)  & alpha101.eq(alpha101) & sector.eq(sector)

alpha101_rank = alpha101.rank(mask=ok_universe,  ascending=False)
In [167]:
pipe = Pipeline()
pipe.add(alpha101, alpha_name)
pipe.add(sector, 'sector_code')
pipe.set_screen(ok_universe)
In [168]:
pipe.show_graph(format='png')
Out[168]:
In [169]:
daily_alpha = run_pipeline(pipe, start_date=start_date, end_date=end_date)
daily_alpha = daily_alpha.reset_index().rename(columns={'level_0': 'date', 'level_1':'sid'})
daily_alpha.head()
Out[169]:
date sid alpha101 sector_code
0 2014-10-01 00:00:00+00:00 Equity(2 [AA]) 0.788382 101
1 2014-10-01 00:00:00+00:00 Equity(24 [AAPL]) 0.809102 311
2 2014-10-01 00:00:00+00:00 Equity(62 [ABT]) -0.705596 206
3 2014-10-01 00:00:00+00:00 Equity(64 [ABX]) -0.996979 101
4 2014-10-01 00:00:00+00:00 Equity(67 [ADSK]) 0.177936 311
In [170]:
len(equities)
Out[170]:
1685

Now pull in pricing data and compute the 1-, 5-, and 10-day forward price movements.

In [210]:
equities = daily_alpha.sid.unique()
equities = map(lambda x: x.sid, equities)
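# Pad the pricing window past end_date so 10-day forward returns exist for the last alpha dates.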
td = pd.Timedelta(days=25)
prices = get_pricing(equities, start_date=start_date, end_date=end_date+td, fields='open_price')
In [211]:
col_n = '%s_day_fwd_price_pct_change'

daily_alpha = daily_alpha.set_index(['date', 'sid'])
for i in [1, 5, 10]:
    delta = prices.pct_change(i).shift(-i)
    daily_alpha[col_n%i] = delta.stack()
daily_alpha = daily_alpha.reset_index()
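
To see why pct_change(i).shift(-i) gives the i-day forward return aligned with today's row, here is a minimal sketch on a toy price series (separate from the pipeline data):

In [ ]:
toy = pd.Series([100., 101., 103., 102., 105.])
# pct_change(2) is the trailing 2-day return; shifting it back 2 rows aligns each
# value with the row where the 2-day holding period starts
print(toy.pct_change(2).shift(-2))
# row 0 -> (103 - 100) / 100 = 0.03, row 1 -> (102 - 101) / 101, and NaN at the end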
In [212]:
daily_alpha.head()
Out[212]:
date sid alpha101 sector_code 1_day_fwd_price_pct_change 5_day_fwd_price_pct_change 10_day_fwd_price_pct_change y_m
0 2014-10-01 00:00:00+00:00 Equity(2 [AA]) 0.788382 101 -0.025418 -0.009919 -0.132052 2014-10-01 00:00:00+00:00
1 2014-10-01 00:00:00+00:00 Equity(24 [AAPL]) 0.809102 311 -0.012327 -0.018193 -0.026046 2014-10-01 00:00:00+00:00
2 2014-10-01 00:00:00+00:00 Equity(62 [ABT]) -0.705596 206 -0.019738 -0.014507 -0.039715 2014-10-01 00:00:00+00:00
3 2014-10-01 00:00:00+00:00 Equity(64 [ABX]) -0.996979 101 0.005420 -0.058266 -0.077236 2014-10-01 00:00:00+00:00
4 2014-10-01 00:00:00+00:00 Equity(67 [ADSK]) 0.177936 311 0.062699 0.056148 -0.082725 2014-10-01 00:00:00+00:00

Now compute the Spearman rank correlation within each day and sector.

In [213]:
cn = "%s_day_IC"
src = pd.DataFrame()
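# In the agg() call below, x is one forward-return column within a (date, sector) group;
# x.name is that column's label and x.index are the group's rows, so each horizon's
# returns are rank-correlated with the factor values for the same rows.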
src_fn = lambda x: sp.stats.spearmanr(daily_alpha.loc[x.index, alpha_name],
                                   daily_alpha.loc[x.index, x.name])[0]

src = daily_alpha.groupby(['date', 'sector_code']).agg({col_n%1: src_fn,
                                                        col_n%5: src_fn,
                                                        col_n%10: src_fn})

src = src.rename(columns={col_n%1: cn%1, col_n%5: cn%5,col_n%10: cn%10})
src = src[[cn%1, cn%5, cn%10]]
src = src.reset_index()
In [214]:
sector_names = {
 101: 'Basic Materials',
 102: 'Consumer Cyclical',
 103: 'Financial Services',
 104: 'Real Estate',
 205: 'Consumer Defensive',
 206: 'Healthcare',
 207: 'Utilities',
 308: 'Communication Services',
 309: 'Energy',
 310: 'Industrials',
 311: 'Technology' ,
}
In [215]:
for sc, cor in src.groupby('sector_code'):
    t = alpha_name + ": " + sector_names[sc] + " Information Coefficient"
    cor.drop('sector_code', axis=1).set_index('date').plot(title=t)

Now calculate the monthly IC by sector, with standard errors for the error bars.

In [216]:
def src_std_error(rho, n):
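    # Approximate standard error of a rank correlation estimated from n observations.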
    return (1-rho**2)/np.sqrt(n-1)

z = lambda x: x.replace(day=1)
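# y_m: each row's date snapped to the first of its month, used as the monthly grouping key.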
c = daily_alpha.reset_index()['date'].apply(z)
c.index = daily_alpha.index
daily_alpha['y_m'] = c
monthly_src = pd.DataFrame()
src_fn = lambda x: sp.stats.spearmanr(daily_alpha.loc[x.index, alpha_name],
                                   daily_alpha.loc[x.index, x.name])[0]

monthly_src = daily_alpha.groupby(['y_m', 'sector_code']).agg({col_n%1: src_fn,
                                                               col_n%5: src_fn,
                                                               col_n%10: src_fn})
obs_count = daily_alpha.groupby(['y_m', 'sector_code']).count().drop(alpha_name, axis=1)
src_std_error_df = src_std_error(monthly_src, obs_count)

monthly_src = monthly_src.rename(columns={col_n%1: cn%1, col_n%5: cn%5,col_n%10: cn%10})
monthly_src = monthly_src[[cn%1, cn%5, cn%10]]
monthly_src = monthly_src.reset_index()

src_std_error_df = src_std_error_df.rename(columns={col_n%1: cn%1, col_n%5: cn%5,col_n%10: cn%10})
src_std_error_df = src_std_error_df[[cn%1, cn%5, cn%10]]
src_std_error_df = src_std_error_df.reset_index()
In [229]:
f, axes = plt.subplots(6,2, sharex=True, sharey=True, figsize=(20,45))
axes = axes.flatten()
i = 0
for sc, cor in monthly_src.groupby(['sector_code']):
    err = src_std_error_df[src_std_error_df.sector_code == sc].set_index('y_m')
    cor.drop('sector_code', axis=1).set_index('y_m').plot(kind='bar', 
                                                          title=sector_names[sc], 
                                                          ax=axes[i],
                                                          yerr=err)
    i+=1
fig = plt.gcf()
fig.suptitle(alpha_name + " Monthly Information Coefficient by Sector", fontsize=24, x=.5, y=.93)
Out[229]:
<matplotlib.text.Text at 0x7f240530dbd0>
In [235]:
all_src = pd.DataFrame()
src_fn = lambda x: sp.stats.spearmanr(daily_alpha.loc[x.index, alpha_name],
                                      daily_alpha.loc[x.index, x.name])[0]

all_src = daily_alpha.groupby(['sector_code']).agg({col_n%1: src_fn,
                                                    col_n%5: src_fn,
                                                    col_n%10: src_fn})
obs_count = daily_alpha.groupby(['sector_code']).count().drop(alpha_name, axis=1)
src_std_error_df = src_std_error(all_src, obs_count)

all_src = all_src.rename(columns={col_n%1: cn%1, col_n%5: cn%5,col_n%10: cn%10})
all_src = all_src[[cn%1, cn%5, cn%10]]
all_src = all_src.reset_index()

src_std_error_df = src_std_error_df.rename(columns={col_n%1: cn%1, col_n%5: cn%5,col_n%10: cn%10})
src_std_error_df = src_std_error_df[[cn%1, cn%5, cn%10]]
src_std_error_df = src_std_error_df.reset_index()
all_src.sector_code = all_src.sector_code.apply(lambda x: sector_names[x])
src_std_error_df.sector_code = src_std_error_df.sector_code.apply(lambda x: sector_names[x])
In [245]:
t =  "{} Information Coefficient ({} - {})".format(alpha_name, 
                                                   start_date.strftime('%m/%d/%Y'),
                                                   end_date.strftime('%m/%d/%Y'))
all_src.set_index('sector_code').plot(kind='bar',
                                      title=t, 
                                      yerr=src_std_error_df.set_index('sector_code'))
Out[245]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f23ffb51790>
In [320]:
# pd.qcut(x, 5, labels=False): couldn't get this working (see the qcut sketch after this cell), so bucket by hand instead.
def quint_bucket(x):
    if x > .8:
        return 5
    elif x > .6:
        return 4
    elif x > .4:
        return 3
    elif x > .2:
        return 2
    else:
        return 1

daily_alpha = daily_alpha.sort(['date', 'sector_code']).set_index(['date', 'sector_code'])
daily_alpha['alpha_percentile'] = daily_alpha.alpha101.groupby(
            level=['date', 'sector_code']).transform(lambda x: x.rank(pct=True))
daily_alpha  = daily_alpha.reset_index()
daily_alpha['quint_bucket'] = daily_alpha.alpha_percentile.apply(quint_bucket)

mean_ret_by_q = daily_alpha.groupby(['sector_code', 'quint_bucket']).mean()[[col_n%1, col_n%5, col_n%10]]
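
As an aside, here is a minimal sketch (on made-up numbers) of how pd.qcut can assign quintile labels; ranking first sidesteps the duplicate-bin-edge errors that raw factor values can trigger, which may be why the qcut attempt above failed.

In [ ]:
x = pd.Series([0.1, 0.1, 0.2, 0.5, 0.9, -0.3, 0.0, 0.4, 0.7, 0.3])
# rank first so ties don't collapse bin edges, then cut into 5 equal-count buckets labeled 1..5
print(pd.qcut(x.rank(method='first'), 5, labels=False) + 1)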
In [330]:
f, axes = plt.subplots(6,2, sharex=False, sharey=True, figsize=(20,45))
axes = axes.flatten()
i = 0
for sc, cor in mean_ret_by_q.groupby(level='sector_code'):
    cor = cor.reset_index().drop('sector_code', axis=1).set_index('quint_bucket')
    cor.plot(kind='bar', title=sector_names[sc], ax=axes[i])
    axes[i].set_xlabel('factor quintile')
    axes[i].set_ylabel('mean price % change')
    i+=1
fig = plt.gcf()
fig.suptitle(alpha_name + ": Mean Return By Factor Quintile", fontsize=24, x=.5, y=.93)
Out[330]:
<matplotlib.text.Text at 0x7f23fa308b10>
In [ ]: