by Jeremy Muhia
The objective of this study was to use the Alpha Vertex data set to enhance a short-term mean-reversion strategy. This data set contains 5-day future returns as predicted by machine learning algorithms. The predictions were used to screen stocks in or out of the portfolio based on future-return quantiles. The rest of this notebook focuses on describing the Alpha Vertex data.
from quantopian.pipeline import Pipeline
from quantopian.pipeline.filters import Q500US
from quantopian.pipeline.factors import CustomFactor
from quantopian.pipeline.data.builtin import USEquityPricing
from quantopian.pipeline.classifiers.morningstar import Sector
from quantopian.research import run_pipeline
import alphalens
# These imports can be found in the store panel for each dataset
# (https://www.quantopian.com/data). Note that not all store datasets
# can be used in pipeline yet.
from quantopian.pipeline.data.alpha_vertex import (
# Top 100 Securities
precog_top_100 as dataset_100,
# Top 500 Securities
precog_top_500 as dataset_500
)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
class PredictionQuality(CustomFactor):
    """
    Custom factor measuring prediction quality for each stock in the
    universe.

    Compares the sign of the Alpha Vertex 5-day predicted log return
    against the sign of the realized 5-day log return over a rolling
    window (3 trading weeks) and reports an exponentially weighted
    share of correct-sign predictions in [0, 1] (1.0 = every
    prediction had the correct sign).
    """
    # predicted 5-day log returns, plus close prices from which the
    # realized 5-day log returns are computed
    inputs = [dataset_500.predicted_five_day_log_return, USEquityPricing.close]
    # 15 trading days = 3 weeks of daily predictions
    window_length = 15

    def compute(self, today, assets, out, pred_ret, px_close):
        # realized 5-day log returns from close prices
        px_close_df = pd.DataFrame(data=px_close)
        pred_ret_df = pd.DataFrame(data=pred_ret)
        log_ret5_df = np.log(px_close_df) - np.log(px_close_df.shift(5))
        # first 5 rows are NaN from the shift; drop them and re-index
        log_ret5_df = log_ret5_df.iloc[5:].reset_index(drop=True)

        n = len(log_ret5_df)

        # align predictions with the realized returns they forecast
        pred_ret_df = pred_ret_df.iloc[:n]

        # 1.0 where predicted and realized signs disagree, 0.0 where
        # they agree (|sign difference| is 0 or 2, hence the / 2.0)
        err_df = (np.sign(log_ret5_df) - np.sign(pred_ret_df)).abs() / 2.0

        # exponentially weighted fraction of correct signs; pd.ewma was
        # deprecated in pandas 0.18 and removed in 0.23, so use the
        # equivalent DataFrame.ewm(...).mean() accessor instead
        pred_quality = (1 - err_df.ewm(com=n, min_periods=n).mean()).iloc[-1].values

        out[:] = pred_quality
class NormalizedReturn(CustomFactor):
    """
    Custom factor computing the normalized forward return.

    Cross-sectionally standardizes the latest predicted 5-day log
    return: each stock's prediction minus the mean prediction across
    stocks, divided by the standard deviation across stocks
    (NaN-aware z-score).
    """
    # predicted 5-day log returns plus close prices
    inputs = [dataset_500.predicted_five_day_log_return, USEquityPricing.close]
    # change this to what you want
    window_length = 10

    def compute(self, today, assets, out, pred_ret, px_close):
        # most recent row of predictions (one value per asset)
        latest_pred = pred_ret[-1]
        # cross-sectional location and dispersion, ignoring NaNs
        mu = np.nanmean(latest_pred, axis=0)
        sigma = np.nanstd(latest_pred, axis=0)
        # z-score each stock's prediction
        out[:] = (latest_pred - mu) / sigma
# Back-test window for the pipeline run.
START = '2016-01-01'
END = '2017-01-01'

# Morningstar sector code -> human-readable sector name
# (-1 marks securities with no sector assignment).
MORNINGSTAR_SECTOR_CODES = {
    -1: 'Misc',
    101: 'Basic Materials',
    102: 'Consumer Cyclical',
    103: 'Financial Services',
    104: 'Real Estate',
    205: 'Consumer Defensive',
    206: 'Healthcare',
    207: 'Utilities',
    308: 'Communication Services',
    309: 'Energy',
    310: 'Industrials',
    311: 'Technology',
}
We start by creating a universe of stocks that are included in the Q500 and also have a recent prediction in the Alpha Vertex Data.
Then, the PredictionQuality
class is used to later create a filter that can be used to calculate normalized returns for stocks whose prediction is above a given quality threshold.
Finally, this normalized return value is used to create a pipeline from the beginning of 2016 to the beginning of 2017.
# Universe: stocks in the Q500 that have a recent (non-null) Alpha
# Vertex prediction.
covered_stocks = Q500US() & dataset_500.predicted_five_day_log_return.latest.notnull()

# Hit-rate of predicted return signs over the trailing 3 weeks; only
# stocks above the 0.65 quality threshold get a normalized return.
prediction_quality = PredictionQuality(mask=covered_stocks)
quality = prediction_quality > 0.65
normalized_return = NormalizedReturn(mask=quality)

# Pipeline restricted to the covered universe; 'normalized returns'
# is NaN for stocks that fail the quality filter.
pipe = Pipeline(
    columns={
        'predicted 5 day returns' : dataset_500.predicted_five_day_log_return.latest,
        'normalized returns': normalized_return,
        'sector' : Sector(mask=covered_stocks)
    },
    screen=covered_stocks
)

# Run over the full study window (daily rows, MultiIndex of date x asset).
pipe_output = run_pipeline(pipe, start_date=START, end_date=END)
Below is a sample of the Pipeline. Also, note that there are 457 unique stocks in the entire pipeline.
# Peek at the first few (date, asset) rows of the pipeline output.
pipe_output.head()
# Number of unique securities in the dataframe over the entire year
# that the pipeline is run (level 1 of the MultiIndex is the asset).
len(pipe_output.index.get_level_values(1).unique())
The figure below shows how many stocks have a normalized return prediction (blue line), a non-normalized return prediction (green line), and a valid sector code (red line).
Notice that there are far fewer stocks with a normalized return prediction. My assumption is that because normalized returns are only calculated for stocks whose predictions overcome a certain prediction-quality threshold, this results in only a fraction of the stocks having normalized return predictions. Also, the upper and lower bounds for stocks with normalized return predictions tighten over time; I'm not sure why this happens.
# Count, per date, how many stocks have a valid value in each column:
# green = valid predicted returns, blue = valid normalized returns.
# The large gap between the two is due to the prediction-quality
# filter imposed on the normalized returns.
dates = pipe_output.index.get_level_values(0)
daily_counts = pipe_output.groupby(dates).count()
daily_counts.plot()
Below, the figure shows a plot of the range for non-normalized predicted returns over the timeline of the pipeline.
# Plot every (date, asset) predicted 5-day return; the vertical spread
# at each date shows the cross-sectional range of predictions over time.
pipe_output['predicted 5 day returns'].plot()
Because some stocks fail the prediction-quality threshold and therefore have only a non-normalized return prediction, we need to drop the NaN
values in the dataframe before summarizing it.
# Summary statistics over rows with no NaNs (i.e. stocks that passed
# the quality filter and have a normalized return).
pipe_output.dropna().describe()
Finally, the tear sheet is below.
# Every security that appeared in the pipeline output.
assets = pipe_output.index.levels[1].unique()

# Fetch prices past END so 5- and 10-day forward returns exist for the
# final pipeline dates.
pricing = get_pricing(assets, START, '2017-02-28', fields='open_price')

# Build the Alphalens input: quintiles of the predicted 5-day returns,
# grouped by sector, with 1/5/10-day forward returns.
predicted_returns = pipe_output['predicted 5 day returns']
sector_labels = pipe_output['sector']
factor_data = alphalens.utils.get_clean_factor_and_forward_returns(
    predicted_returns,
    pricing,
    quantiles=5,
    groupby=sector_labels,
    periods=(1, 5, 10)
)

alphalens.tears.create_full_tear_sheet(factor_data)