From the paper 101 Formulaic Alphas
$ (-1 * correlation(rank(delta(log(volume), 2)), rank(((close - open) / open)), 6)) $
This factor returns a negative value if the change in volume is highly correlated with intraday return. In other words, if volume increases (decreases) by a lot on days where the intraday return is high (low), this factor is negative.
I am postulating that the idea behind this factor is that large moves with heavy volume are liquidity demanding trades (ideally by uninformed traders). Traders providing liquidity in these instances would demand a premium/discount to take the other side to compensate for the risk that they may be trading with an informed trader or the risk of being stuck with an inventory too large. Note, this is quite the opposite of how technical analysis generally looks at the volume/price relationships (although I am oversimplifying a bit with this statement).
My in-sample data for this runs from 2003 to 2012. However, it should be noted that this paper was published in 2015. Therefore, any out-of-sample testing should be done on data after 2015, once the researcher gets to that stage. 2012 to 2015 could possibly be used as a sort of cross-validation set to tune hyperparameters if any kind of machine learning is used to tweak the factor.
In this notebook, I will perform a bit of parameter optimization, in part to see what the best parameters are for performance. However, I am more interested in seeing how sensitive the performance of the factor is to changes in the input parameters. If performance is super sensitive to small changes in the inputs, then I would give a higher likelihood that the researchers overfit this factor.
To keep things simple for the moment, I will only adjust the correlation lookback window in the optimization. In the future, I may work on tweaking other parameters if I can find an efficient workflow for doing so.
# Typical imports for use with Pipeline
from quantopian.pipeline import Pipeline, CustomFactor
from quantopian.research import run_pipeline
from quantopian.pipeline.data.builtin import USEquityPricing
from quantopian.pipeline.data import Fundamentals
from quantopian.pipeline.classifiers.fundamentals import Sector
from quantopian.pipeline.filters import QTradableStocksUS, Q500US
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import alphalens as al
class VolumeChange(CustomFactor):
    """Change in log volume versus (window_length - 1) days ago.

    Because log differences approximate percent changes, this is
    essentially the percent change in volume over the window. The
    default window_length=3 matches delta(log(volume), 2) from the
    alpha #2 formula in "101 Formulaic Alphas".
    """
    inputs = [USEquityPricing.volume]
    window_length = 3
    # Output is only ever ranked downstream, so it is window-safe.
    window_safe = True

    def compute(self, today, asset_ids, out, volume):
        # Compare against the first row of the window so the lag tracks
        # window_length as the docstring states. (The original hard-coded
        # volume[-3], which equals volume[0] only at the default
        # window_length == 3 and would be wrong for other windows.)
        out[:] = np.log(volume[-1]) - np.log(volume[0])
class IntradayReturn(CustomFactor):
    """Single-day return from the session's open to its close."""
    inputs = [USEquityPricing.open, USEquityPricing.close]
    window_length = 1
    # Output is only ever ranked downstream, so it is window-safe.
    window_safe = True

    def compute(self, today, asset_ids, out, open_, close):
        # close / open - 1, i.e. the intraday gross return minus one.
        gross = close / open_
        out[:] = gross - 1
def make_alpha_2(mask, window_length=6):
    """Construct Alpha #2: the negative of the trailing correlation between
    the daily cross-sectional ranks of volume change and intraday return.

    Parameters
    ----------
    mask : Filter
        Filter representing what assets get included in factor computation.
    window_length : int, optional
        Number of trailing days over which the correlation is computed.
        Exposed as a parameter so it can be swept during optimization.

    Returns
    -------
    Factor

    Notes
    -----
    This is a measure of whether returns are correlated with volume. It is
    negative when volume is stronger on up moves and light on down moves. It
    is positive when volume is stronger on down moves and lighter on up moves.
    """
    class Alpha2(CustomFactor):
        def compute(self, today, asset_ids, out, volume_change, intraday_return):
            # Each input arrives as a (window_length x n_assets) array of
            # daily cross-sectional ranks. corrwith pairs up the columns
            # (assets) and correlates each pair down the time axis.
            volume_change_df = pd.DataFrame(volume_change)
            intraday_return_df = pd.DataFrame(intraday_return)
            out[:] = -volume_change_df.corrwith(intraday_return_df)

    # Rank both inputs cross-sectionally each day before correlating,
    # per the formula: -corr(rank(delta(log(volume), 2)), rank(intraday), 6).
    return Alpha2(mask=mask,
                  inputs=[VolumeChange(mask=mask).rank(),
                          IntradayReturn(mask=mask).rank()],
                  window_length=window_length)
def make_pipeline(corr_param_range):
    """Build a Pipeline with one Alpha #2 column per correlation window.

    Parameters
    ----------
    corr_param_range : iterable of int
        Correlation lookback windows to sweep; each produces a column
        named 'alpha_2_<window>'.

    Returns
    -------
    Pipeline
        One factor column per window plus 'sector_code', screened to the
        QTradableStocksUS universe with closed-end funds excluded.
    """
    base_universe = QTradableStocksUS()
    # Exclude closed-end funds: their share class description starts with 'CE'.
    closed_end_funds = Fundamentals.share_class_description.latest.startswith('CE')
    universe = base_universe & ~closed_end_funds

    factor_dict = {'alpha_2_{}'.format(i): make_alpha_2(universe, i)
                   for i in corr_param_range}
    factor_dict['sector_code'] = Sector(mask=universe)
    return Pipeline(columns=factor_dict, screen=universe)
# In-sample period: 2003-2012 (the paper was published in 2015, so data
# after 2015 is reserved for out-of-sample testing).
start_date = '2003-01-01'
end_date = '2012-12-31'
# end_date = '2003-01-10'
# Correlation lookback windows to sweep in the sensitivity analysis.
corr_param_range = [4,6,8,10,12,14,16,18,20]
# chunksize=504 trading days (~2 years) keeps each pipeline run manageable.
result = run_pipeline(make_pipeline(corr_param_range), start_date, end_date, chunksize=504)
col_order = []
# Reorder Columns: factor columns in window order, sector_code last.
for i in corr_param_range:
    col_order.append('alpha_2_{}'.format(i))
col_order.append('sector_code')
result = result[col_order]
result.head()
## Factor Data
def get_al_prices(result, periods=(1,5,21)):
    """Fetch open prices covering the factor dates plus the longest horizon.

    Parameters
    ----------
    result : pd.DataFrame
        Pipeline output indexed by a (date, asset) MultiIndex.
    periods : tuple of int, optional
        Forward-return horizons in trading days; the price window is
        extended past the last factor date by max(periods) business days
        so the longest forward return can still be computed.

    Returns
    -------
    pd.DataFrame
        Open prices (dates x assets) from Quantopian's get_pricing.
    """
    assets = result.index.levels[1].unique()
    start_date = result.index.get_level_values(0)[0]
    # Extend the end date so forward returns exist for the final factor dates.
    end_date = result.index.get_level_values(0)[-1] + max(periods) * pd.tseries.offsets.BDay()
    pricing = get_pricing(assets, start_date, end_date, fields="open_price")
    return pricing
def get_factor_data(result,
                    factor_col,
                    prices,
                    forward_returns,
                    quantiles=5,
                    bins=None,
                    groupby=None,
                    binning_by_group=False,
                    groupby_labels=None,
                    max_loss=0.35):
    """Build Alphalens-formatted factor data for one factor column.

    Parameters
    ----------
    result : pd.DataFrame
        Pipeline output indexed by a (date, asset) MultiIndex.
    factor_col : str
        Column of `result` to use as the factor values.
    prices : pd.DataFrame
        Unused here; retained for interface compatibility with an earlier
        version that computed forward returns internally.
    forward_returns : pd.DataFrame
        Precomputed forward returns (al.utils.compute_forward_returns),
        shared across all factor columns so the expensive return
        computation only happens once.
    quantiles, bins, groupby, binning_by_group, groupby_labels, max_loss
        Passed straight through to al.utils.get_clean_factor.

    Returns
    -------
    pd.DataFrame
        Cleaned factor data from al.utils.get_clean_factor.
    """
    factor_data = al.utils.get_clean_factor(result[factor_col],
                                            forward_returns,
                                            groupby=groupby,
                                            binning_by_group=binning_by_group,
                                            groupby_labels=groupby_labels,
                                            quantiles=quantiles,
                                            bins=bins,
                                            max_loss=max_loss)
    return factor_data
# Holding periods (in trading days) for the forward returns.
periods=(1,3,5,7,10,12,15,20)
prices = get_al_prices(result, periods)
# Compute forward returns once and reuse them for every factor column;
# the factor values themselves do not affect the return calculation, so
# any column works as the index template here.
forward_returns = al.utils.compute_forward_returns(result[result.columns[0]], prices, periods)
forward_returns.head()
# factor_data={}
ic_dict={}
# For each candidate correlation window, build Alphalens factor data and
# record the mean information coefficient per holding period.
# (NOTE: Python 2 print-statement syntax.)
for factor_col in result.columns:
    if factor_col != 'sector_code':
        print "-"*30 + "\nGetting Factor Data for '{}'".format(factor_col)
        factor_data = get_factor_data(result,
                                      factor_col,
                                      prices,
                                      forward_returns)
        print "-"*30 + "\nCalculating ICs for '{}'".format(factor_col)
        ic_dict[factor_col] = al.performance.mean_information_coefficient(factor_data)
# Collect the mean ICs into a DataFrame, columns ordered by correlation
# window (col_order[:-1] drops the trailing 'sector_code' entry).
ic_df = pd.DataFrame.from_dict(ic_dict)[col_order[:-1]]
ic_df
# Which correlation window gives the best 5-day IC?
ic_df.loc['5D'].idxmax()
ic_df.plot();
import seaborn as sns
# Heatmap of IC by (holding period x correlation window) to eyeball how
# sensitive performance is to the parameter choice.
sns.heatmap(ic_df, annot=True, cmap='RdBu', vmin=-.01, vmax=.01)
### Correlation window = 6 days
# prices, factor_data = get_factor_data(result, 'alpha_2')
# NOTE(review): passing a one-element list here selects a one-column
# DataFrame rather than a Series (the IC loop above passes a plain
# string) — confirm Alphalens accepts this shape.
factor_data = get_factor_data(result,
                              ['alpha_2_6'],
                              prices,
                              forward_returns)
al.tears.create_full_tear_sheet(factor_data, long_short=True, group_neutral=False )
### Correlation window = 16 days
result.columns
factor_data = get_factor_data(result,
                              ['alpha_2_16'],
                              prices,
                              forward_returns)
factor_data.head()
# Drop the intermediate holding periods so the tear sheet stays readable,
# keeping only the 1D, 5D, and 10D horizons.
al.tears.create_full_tear_sheet(factor_data.drop(['3D', '7D', '12D', '15D', '20D'], axis=1),
                                long_short=True, group_neutral=False )