from quantopian.research import run_pipeline
from quantopian.pipeline import Pipeline
from quantopian.pipeline import factors, filters, classifiers
from quantopian.pipeline.factors import CustomFactor, Returns, AverageDollarVolume, SimpleMovingAverage
from quantopian.pipeline.filters import StaticAssets, Q500US, Q1500US, Q3000US
from quantopian.pipeline.experimental import QTradableStocksUS
from quantopian.pipeline.filters.fundamentals import IsPrimaryShare
from quantopian.pipeline.classifiers.fundamentals import Sector
from quantopian.pipeline.data.builtin import USEquityPricing
import math
import datetime
import numpy as np
import pandas as pd
## Helper functions
def high_volume_universe(top_liquid, min_price = None, min_volume = None):
    """
    Build a tradable-universe Filter of the `top_liquid` most liquid US
    equities, optionally screening out low-priced / thinly-traded names.

    Parameters
    ----------
    top_liquid : int
        Target universe size; 500, 1500 and 3000 map to the pre-built
        Q500US/Q1500US/Q3000US universes, any other size is built from
        average dollar volume with a sector-weight cap.
    min_price : float, optional
        Minimum 21-day average close price.
    min_volume : float, optional
        Minimum 21-day average daily share volume.

    Returns
    -------
    high_volume_tradable : zipline.pipeline.Filter
    """
    # Prefer a pre-built Quantopian universe when the size matches one.
    builtin_universes = {500: Q500US, 1500: Q1500US, 3000: Q3000US}
    if top_liquid in builtin_universes:
        universe = builtin_universes[top_liquid]()
    else:
        universe = filters.make_us_equity_universe(
            target_size=top_liquid,
            rankby=factors.AverageDollarVolume(window_length=200),
            mask=filters.default_us_equity_universe_mask(),
            groupby=Sector(),
            max_group_weight=0.3,
            smoothing_func=lambda f: f.downsample('month_start'),
        )
    # Optional screens on 21-day average price and volume.
    if min_price is not None:
        avg_price = SimpleMovingAverage(inputs=[USEquityPricing.close],
                                        window_length=21, mask=universe)
        universe &= (avg_price >= min_price)
    if min_volume is not None:
        avg_volume = SimpleMovingAverage(inputs=[USEquityPricing.volume],
                                         window_length=21, mask=universe)
        universe &= (avg_volume >= min_volume)
    return universe
def construct_factor_history(factor_cls, start_date='2015-10-1', end_date='2016-2-1',
                             factor_name='factor', top_liquid=500,
                             sector_column=None):
    """
    Run a pipeline over a liquidity-constrained universe and return a
    DataFrame of daily factor values (plus, optionally, sector codes)
    suitable for the factor tear sheet.
    """
    universe = high_volume_universe(top_liquid)
    columns = {factor_name: factor_cls(mask=universe)}
    if sector_column is not None:
        # NOTE: adding sector codes noticeably slows the pipeline run.
        columns[sector_column] = Sector(mask=universe)
    pipe = Pipeline(columns=columns, screen=universe)
    result = run_pipeline(pipe, start_date=start_date, end_date=end_date, chunksize=300)
    return result.dropna()
def get_daily_price(sid_universe, start_date, end_date, extra_days_before=0, extra_days_after=0):
    """
    Fetch daily open prices for `sid_universe`, widening the requested
    [start_date, end_date] window by the given number of extra trading
    days on each side (converted to calendar days).
    """
    def pad(trading_days):
        # Convert trading days to calendar days, plus slack "just to be sure".
        return datetime.timedelta(days=math.ceil(trading_days * 365.0 / 252.0) + 3)

    start = datetime.datetime.strptime(start_date, "%Y-%m-%d") - pad(extra_days_before)
    end = datetime.datetime.strptime(end_date, "%Y-%m-%d") + pad(extra_days_after)
    return get_pricing(sid_universe,
                       start_date=start.strftime("%Y-%m-%d"),
                       end_date=end.strftime("%Y-%m-%d"),
                       fields='open_price')
#
# 'run_tear_sheet' glues all the function together to make life easier to run the tear sheet on a pipeline factor
#
import alphalens
def run_tear_sheet(factor,
                   factor_name,
                   start_date,
                   end_date,
                   top_liquid,
                   show_sector_plots,
                   avgretplot,
                   periods,
                   quantiles,
                   bins,
                   filter_zscore,
                   long_short,
                   prices_cache = None):
    """
    Glue function: compute the factor history, fetch pricing and run the
    Alphalens tear sheet for a pipeline factor.

    Parameters
    ----------
    factor : callable
        Takes a `mask` Filter and returns a pipeline Factor.
    factor_name : str
        Column name for the factor in the pipeline output.
    start_date, end_date : str
        'YYYY-MM-DD' analysis window.
    top_liquid : int
        Universe size, passed to high_volume_universe.
    show_sector_plots : bool
        If True, also compute sector codes and group plots by sector.
    avgretplot : tuple or None
        (days_before, days_after) for the average-returns plot, or None
        to skip it.
    periods : tuple of int
        Forward-return horizons for Alphalens.
    quantiles, bins
        Bucketing options passed through to Alphalens.
    filter_zscore : float or None
        Forward-returns outlier filter passed through to Alphalens.
    long_short : bool
        Whether the returns analysis treats the factor as a demeaned
        long/short portfolio.
    prices_cache : pd.DataFrame, optional
        Previously fetched pricing; only missing sids are downloaded.

    Returns
    -------
    (prices, factor, factor_data)
        Pricing (reusable as `prices_cache`), the factor DataFrame and
        the Alphalens factor_data DataFrame.
    """
    sector_column = 'sector_code' if show_sector_plots else None
    days_before, days_after = (0, 0)
    if avgretplot is not None:
        days_before, days_after = avgretplot
        # Pricing must extend past end_date far enough for the longest
        # forward-return horizon.
        days_after = max(days_after, max(periods) + 1)
    #
    ## Run the Pipeline
    #
    print('construct factor history')
    factor = construct_factor_history(factor, start_date=start_date, end_date=end_date,
                                      factor_name=factor_name, top_liquid=top_liquid,
                                      sector_column=sector_column)
    #
    ## Get prices
    #
    sid_universe = set(factor.index.levels[1].unique())
    if prices_cache is not None:
        # Only download sids we don't already have pricing for.
        sid_universe -= set(prices_cache.columns)
    print('Get pricing for %d entries' % len(sid_universe))
    if sid_universe:
        prices = get_daily_price(sid_universe, start_date=start_date, end_date=end_date,
                                 extra_days_before=days_before, extra_days_after=days_after)
        if prices_cache is not None:
            prices = pd.concat([prices, prices_cache], axis=1)
    else:
        prices = prices_cache
    #
    ## Use Alphalens to create a factor tear sheet
    #
    print('Alphalens')
    # BUG FIX: the original test was `len(np.isinf(...)) > 0`, which is True
    # for ANY non-empty series because np.isinf returns a boolean array of the
    # same length. `.any()` checks whether an infinite value is really present.
    if np.isinf(factor[factor_name]).any():
        print('Dropping inf or -inf values from factor')
        factor[factor_name] = factor[factor_name].replace([np.inf, -np.inf], np.nan)
    sectors_series = factor[sector_column] if show_sector_plots else None
    factor_data = alphalens.utils.get_clean_factor_and_forward_returns(
        factor=factor[factor_name],
        prices=prices,
        groupby=sectors_series,
        by_group=False,
        quantiles=quantiles,
        bins=bins,
        periods=periods,
        filter_zscore=filter_zscore,
        groupby_labels=Sector.SECTOR_NAMES)
    alphalens.plotting.plot_quantile_statistics_table(factor_data)
    alphalens.tears.create_returns_tear_sheet(factor_data=factor_data,
                                              long_short=long_short,
                                              by_group=show_sector_plots)
    if avgretplot:
        alphalens.tears.create_event_returns_tear_sheet(factor_data=factor_data,
                                                        prices=prices,
                                                        avgretplot=avgretplot,
                                                        long_short=long_short,
                                                        by_group=show_sector_plots)
    return prices, factor, factor_data
class HurstExp(CustomFactor):
    """
    Estimate the Hurst exponent of each asset's log open-price series.

    The Hurst exponent helps test whether a time series is:
    (1) A random walk   (H ~ 0.5)
    (2) Trending        (H > 0.5)
    (3) Mean reverting  (H < 0.5)
    """
    inputs = [USEquityPricing.open]
    window_length = int(252 * 0.5)  # roughly half a trading year

    def Hurst(self, ts):
        """Return the Hurst exponent estimate for a single 1-D series."""
        # Range of lag values over which dispersion is measured.
        lags = np.arange(2, 20)
        # sqrt of the std-dev of the lagged differences, per lag.
        tau = [np.sqrt(np.std(np.subtract(ts[lag:], ts[:-lag]))) for lag in lags]
        # Closed-form slope of the 1st-degree fit of log(tau) vs log(lags).
        # Much faster than np.polyfit; credit Derek M. Tishler, source:
        # http://stackoverflow.com/questions/28237428/fully-vectorise-numpy-polyfit
        n = len(lags)
        x = np.log(lags)
        y = np.log(tau)
        slope = (n*(x*y).sum() - x.sum()*y.sum()) / (n*(x*x).sum() - x.sum()*x.sum())
        # The Hurst exponent is twice the slope of that fit.
        return slope * 2.0

    def compute(self, today, assets, out, OPEN):
        # Use log prices so that lagged differences are log returns
        # (adjustment suggested by @Villa).
        series = np.log(np.nan_to_num(OPEN))
        # BUG FIX (Py3 compatibility): `map` returns a lazy iterator on
        # Python 3, which np.nan_to_num cannot consume; a list comprehension
        # behaves identically on Python 2 and correctly on Python 3.
        hurst_per_asset = [self.Hurst(series[:, col].flatten())
                           for col in range(series.shape[1])]
        out[:] = np.nan_to_num(hurst_per_asset)
# Tear-sheet configuration.
factor_name = 'HurstExp'        # pipeline column name for the factor
start_date = '2005-01-01'       # analysis window start
end_date = '2017-05-01'         # analysis window end
top_liquid = 1500               # universe size (maps to Q1500US in high_volume_universe)
show_sector_plots = False       # True also groups the tear sheet by sector (slower)
# alphalens specific
periods = (1, 3, 5, 10)         # forward-return horizons, in trading days
#
# The Hurst Exponent tells you whether a series is:
# - Geometric random walk (H=0.5)
# - Mean-reverting series (H<0.5)
# - Trending Series (H>0.5)
#
bins = None                     # None -> bucket by quantiles below instead of fixed bins
quantiles = 5                   # number of quantile buckets for Alphalens
avgretplot = None #(5, 20) # use None to avoid plotting or (days_before, days_after)
filter_zscore = None            # no forward-returns outlier filtering
long_short = True               # analyze as a demeaned long/short portfolio
prices_cache = None # this saves lots of time when running tear sheet multiple times
# The Hurst exponent helps test whether the time series is mean reverting,
# trending, or a random walk.
#
# After running Alphalens on HurstExp with the following binning options:
#   bins = [0., 0.15, 0.25, 0.30, 0.35, 0.40, 0.45, 0.55, 0.65, 1.0]; quantiles = None
# we can see how the values are distributed across the bins:
#
#   quantile    min     max      count   count %
#   1          0.00    0.15      41819     0.9
#   2          0.15    0.25     247678     5.3   <--- mean rev
#   3          0.25    0.30     333692     7.2   <--- mean rev
#   4          0.30    0.35     546991    11.8   <--- mean rev
#   5          0.35    0.40     759547    16.4   <--- mean rev
#   6          0.40    0.45     870952    18.8   <--- mean rev
#   7          0.45    0.55    1375851    29.8   <--- Random walk
#   8          0.55    0.65     412128     8.9   <--- Trending
#   9          0.65    0.78      27618     0.5
# Hurst-exponent cutoffs: assets with H below the first value are treated as
# mean reverting, assets with H above the second as trending.
mean_reverting_threshold = 0.35
trending_threshold = 0.55
def factor(mask):
    """
    Mean-reversion factor: the negated ~6-month average of daily returns,
    restricted to assets whose Hurst exponent is below
    `mean_reverting_threshold`.
    """
    he = HurstExp(mask=mask)
    mean_reverting_mask = he < mean_reverting_threshold
    returns = Returns(window_length=2, mask=mask)
    # Negated: among mean reverters, recent underperformers are expected
    # to bounce back, so they get the higher factor value.
    # (The unused trending-side computation from the original was removed;
    # the trending variant is defined separately below.)
    return -SimpleMovingAverage(inputs=[returns],
                                window_length=int(252 * 0.5),
                                mask=mean_reverting_mask)
# Run the full tear sheet on the mean-reversion factor; the returned pricing
# is kept as a cache so subsequent runs skip re-downloading it.
prices_cache, factor, factor_data = \
run_tear_sheet( factor = factor,
                factor_name = factor_name,
                start_date = start_date,
                end_date = end_date,
                top_liquid = top_liquid,
                show_sector_plots = show_sector_plots,
                avgretplot = avgretplot,
                periods = periods,
                quantiles = quantiles,
                bins = bins,
                filter_zscore = filter_zscore,
                long_short = long_short,
                prices_cache = prices_cache)
def factor(mask):
    """
    Trend-following factor: the ~6-month average of daily returns,
    restricted to assets whose Hurst exponent is above
    `trending_threshold`.
    """
    he = HurstExp(mask=mask)
    trending_mask = he > trending_threshold
    returns = Returns(window_length=2, mask=mask)
    # Among trending assets, recent winners are expected to keep winning,
    # so the average return is used directly (not negated).
    # (The unused mean-reversion computation from the original was removed;
    # that variant is defined separately above.)
    return SimpleMovingAverage(inputs=[returns],
                               window_length=int(252 * 0.5),
                               mask=trending_mask)
# Run the tear sheet again on the trend-following factor, reusing the pricing
# cached by the previous run.
prices_cache, factor, factor_data = \
run_tear_sheet( factor = factor,
                factor_name = factor_name,
                start_date = start_date,
                end_date = end_date,
                top_liquid = top_liquid,
                show_sector_plots = show_sector_plots,
                avgretplot = avgretplot,
                periods = periods,
                quantiles = quantiles,
                bins = bins,
                filter_zscore = filter_zscore,
                long_short = long_short,
                prices_cache = prices_cache)