Notebook

Alphalens boilerplate

In [1]:
from quantopian.research import run_pipeline
from quantopian.pipeline import Pipeline
from quantopian.pipeline import factors, filters, classifiers
from quantopian.pipeline.factors import CustomFactor, Returns, AverageDollarVolume, SimpleMovingAverage
from quantopian.pipeline.filters import StaticAssets, Q500US, Q1500US, Q3000US
from quantopian.pipeline.experimental import QTradableStocksUS
from quantopian.pipeline.filters.fundamentals import IsPrimaryShare
from quantopian.pipeline.classifiers.fundamentals import Sector  
from quantopian.pipeline.data.builtin import USEquityPricing

import math
import datetime
import numpy as np
import pandas as pd

## Helper functions

def high_volume_universe(top_liquid, min_price = None, min_volume = None):  
    """
    Build a tradable universe of the `top_liquid` most liquid US equities,
    optionally screening out low-priced and thinly traded names.

    Parameters
    ----------
    top_liquid : int
        Target universe size. 500 / 1500 / 3000 map to the built-in
        Q500US / Q1500US / Q3000US universes; any other value builds a
        custom dollar-volume-ranked universe.
    min_price : float, optional
        Minimum 21-day average close price required to stay in.
    min_volume : float, optional
        Minimum 21-day average share volume required to stay in.

    Returns
    -------
    high_volume_tradable - zipline.pipeline.filter
    """
    builtin_universes = {500: Q500US, 1500: Q1500US, 3000: Q3000US}

    if top_liquid in builtin_universes:
        universe = builtin_universes[top_liquid]()
    else:
        # Custom universe ranked by 200-day average dollar volume,
        # sector-capped at 30% and refreshed monthly to limit turnover.
        universe = filters.make_us_equity_universe(
            target_size=top_liquid,
            rankby=factors.AverageDollarVolume(window_length=200),
            mask=filters.default_us_equity_universe_mask(),
            groupby=Sector(),
            max_group_weight=0.3,
            smoothing_func=lambda f: f.downsample('month_start'),
        )

    if min_price is not None:
        avg_price = SimpleMovingAverage(inputs=[USEquityPricing.close],
                                        window_length=21, mask=universe)
        universe &= (avg_price >= min_price)

    if min_volume is not None:
        avg_volume = SimpleMovingAverage(inputs=[USEquityPricing.volume],
                                         window_length=21, mask=universe)
        universe &= (avg_volume >= min_volume)

    return universe
      
def construct_factor_history(factor_cls, start_date='2015-10-1', end_date='2016-2-1', 
                             factor_name='factor', top_liquid=500,
                             sector_column=None):
    """
    Run a pipeline computing daily factor values (and, optionally, sector
    codes) over a liquidity-constrained universe.

    Returns
    -------
    pd.DataFrame
        (date, asset) MultiIndex frame with the factor column — plus the
        sector column when `sector_column` is given — with NaN rows
        dropped. Can be used as input to the factor tear sheet.
    """
    universe = high_volume_universe(top_liquid)

    pipe = Pipeline()
    pipe.add(factor_cls(mask=universe), factor_name)
    if sector_column is not None:
        # Sector classification is expensive; only compute when requested.
        pipe.add(Sector(mask=universe), sector_column)
    pipe.set_screen(universe)

    daily_factor = run_pipeline(pipe, start_date=start_date, end_date=end_date,
                                chunksize=300)
    return daily_factor.dropna()

def get_daily_price(sid_universe, start_date, end_date, extra_days_before=0, extra_days_after=0):
    """
    Fetch daily open prices for `sid_universe`, widening the requested
    window by extra days on either side.

    The pads are given in trading days and converted to calendar days
    (x 365/252) plus a 3-day safety margin, so the padded window always
    covers the required number of trading sessions.
    """
    def _pad(trading_days):
        # calendar days needed to cover `trading_days` trading sessions
        return math.ceil(trading_days * 365.0 / 252.0) + 3  # just to be sure

    fmt = "%Y-%m-%d"
    start = datetime.datetime.strptime(start_date, fmt) - datetime.timedelta(days=_pad(extra_days_before))
    end = datetime.datetime.strptime(end_date, fmt) + datetime.timedelta(days=_pad(extra_days_after))

    return get_pricing(sid_universe,
                       start_date=start.strftime(fmt),
                       end_date=end.strftime(fmt),
                       fields='open_price')

#
# 'run_tear_sheet' glues all the function together to make life easier to run the tear sheet on a pipeline factor
#

import alphalens

def run_tear_sheet(factor,
                   factor_name,
                   start_date,
                   end_date,
                   top_liquid,
                   show_sector_plots,
                   avgretplot,
                   periods,
                   quantiles,
                   bins,
                   filter_zscore,
                   long_short,
                   prices_cache = None):
     
    sector_column = 'sector_code' if show_sector_plots else None
    days_before, days_after = (0,0)

    if avgretplot is not None:   
        days_before, days_after = avgretplot
        days_after = max(days_after, max(periods) + 1)
    
    #
    ## Run the Pipeline
    #
    print 'construct factor history'
    factor = construct_factor_history(factor, start_date=start_date, end_date=end_date, 
                                      factor_name=factor_name, top_liquid=top_liquid,
                                      sector_column=sector_column)
    #
    ## Get prices
    #
    sid_universe = set( factor.index.levels[1].unique() )
    if prices_cache is not None:
        cached_sids = set(prices_cache.columns)
        sid_universe -= cached_sids
        
    print 'Get pricing for %d entries' % len(sid_universe)
    if sid_universe:
        prices = get_daily_price(sid_universe, start_date=start_date, end_date=end_date, 
                                 extra_days_before=days_before, extra_days_after=days_after)
        if prices_cache is not None:
            prices = pd.concat([prices, prices_cache], axis=1)
    else:
        prices = prices_cache

    #
    ## Use Alphalens to create a factor tear sheet
    #
    print 'Alphalens'
    
    if len(np.isinf(factor[factor_name])) > 0:
        print 'Dropping inf or -inf values from factor'
        factor[factor_name] = factor[factor_name].replace([np.inf, -np.inf], np.nan)
    
    sectors_series = factor[sector_column] if show_sector_plots else None
    factor_data = alphalens.utils.get_clean_factor_and_forward_returns(factor=factor[factor_name],
                                                                       prices=prices,
                                                                       groupby=sectors_series,
                                                                       by_group=False,
                                                                       quantiles=quantiles,
                                                                       bins=bins,
                                                                       periods=periods,
                                                                       filter_zscore=filter_zscore,
                                                                       groupby_labels=Sector.SECTOR_NAMES)

    alphalens.plotting.plot_quantile_statistics_table(factor_data)
    alphalens.tears.create_returns_tear_sheet(factor_data=factor_data,
                                              long_short=long_short,
                                              by_group=show_sector_plots)

    if avgretplot:
        alphalens.tears.create_event_returns_tear_sheet(factor_data=factor_data,
                                                        prices=prices,
                                                        avgretplot=avgretplot,
                                                        long_short=long_short,
                                                        by_group=show_sector_plots)
    
    return prices, factor, factor_data

Define our factor

In [2]:
class HurstExp(CustomFactor):  
    """
    Per-asset estimate of the Hurst exponent over half a year of daily
    open prices.

    The Hurst exponent helps test whether the time series is:
    (1) A Random Walk (H ~ 0.5)
    (2) Trending (H > 0.5)
    (3) Mean reverting (H < 0.5)
    """
    inputs = [USEquityPricing.open]  
    window_length = int(252*0.5)

    def Hurst(self, ts):   #Fast
        """Fast Hurst-exponent estimate for a single 1-d series `ts`."""
        # Lag range over which the scaling behaviour is measured.
        lags = np.arange(2, 20)
        # sqrt of the std-dev of the lagged differences, one entry per lag.
        tau = [np.sqrt(np.std(ts[lag:] - ts[:-lag])) for lag in lags]

        # Slope of log(tau) vs log(lag) via the closed-form least-squares
        # formula for a degree-1 fit — equivalent to but faster than
        # np.polyfit(np.log(lags), np.log(tau), 1)[0].
        # From: Derek M. Tishler - dmtishler@gmail.com
        # source: http://stackoverflow.com/questions/28237428/fully-vectorise-numpy-polyfit
        x = np.log(lags)
        y = np.log(tau)
        n = len(lags)
        slope = (n*(x*y).sum() - x.sum()*y.sum()) / (n*(x*x).sum() - x.sum()*x.sum())

        # The Hurst exponent is twice the fitted slope.
        return slope * 2.0

    def compute(self, today, assets, out,  OPEN):
        # Work in log prices so that lagged differences are log returns
        # (adjustment suggested by @Villa).
        # NOTE(review): nan_to_num maps NaN prices to 0, so log() yields -inf
        # there; downstream code drops the resulting inf factor values.
        log_prices = np.log(np.nan_to_num(OPEN))
        hurst_per_asset = [self.Hurst(log_prices[:, col].flatten())
                           for col in np.arange(log_prices.shape[1])]
        out[:] = np.nan_to_num(hurst_per_asset)

Define settings

In [3]:
# Name of the factor column in the pipeline output.
factor_name = 'HurstExp'

# Backtest window and universe size for the factor history pipeline.
start_date  = '2005-01-01'
end_date    = '2017-05-01'
top_liquid  = 1500
show_sector_plots = False

# alphalens specific
# Forward-return horizons (in trading days) analyzed by Alphalens.
periods = (1, 3, 5, 10)

#
# The Hurst Exponent tells you whether a series is:
# - Geometric random walk (H=0.5)
# - Mean-reverting series (H<0.5)
# - Trending Series (H>0.5)
#
# Equal-sized quantiles rather than fixed bins (bins=None, quantiles=5).
bins      = None
quantiles = 5


avgretplot  = None #(5, 20)  # use None to avoid plotting or (days_before, days_after)
filter_zscore = None
long_short  = True

prices_cache = None # this saves lots of time when running tear sheet multiple times

Run the tear sheet

Choosing the thresholds

Hurst exponent helps test whether the time series is:

  • A Random Walk (H ~ 0.5)
  • Trending (H > 0.5)
  • Mean reverting (H < 0.5)

After running Alphalens on HurstExp with the following binning options:

bins = [0., 0.15, 0.25, 0.30, 0.35, 0.40, 0.45, 0.55, 0.65, 1.0] quantiles = None

We can see how the bins are distributed:

quantile min max count count %, 1 0.00 0.15 41819 0.9 2 0.15 0.25 247678 5.3 <--- mean rev 3 0.25 0.30 333692 7.2 <--- mean rev 4 0.30 0.35 546991 11.8 <--- mean rev 5 0.35 0.40 759547 16.4 <--- mean rev 6 0.40 0.45 870952 18.8 <--- mean rev 7 0.45 0.55 1375851 29.8 <--- Random walk 8 0.55 0.65 412128 8.9 <--- Trending 9 0.65 0.78 27618 0.5

In [ ]:
# Cutoffs chosen from the bin distribution above: H < 0.35 is treated as
# mean reverting, H > 0.55 as trending (H ~ 0.5 is a random walk).
mean_reverting_threshold = 0.35
trending_threshold = 0.55
In [4]:
def factor(mask):
    """
    Mean-reversion leg: negated 6-month average of daily returns,
    restricted to assets whose Hurst exponent classifies them as mean
    reverting (H < mean_reverting_threshold).
    """
    he = HurstExp(mask=mask)
    mean_reverting_mask = he < mean_reverting_threshold

    returns = Returns(window_length=2, mask=mask)
    # Negated: mean-reverting names are expected to reverse their recent
    # drift. (The unused symmetric `trending` term from the original cell
    # has been removed; see the next cell for the trending leg.)
    meanrev = -SimpleMovingAverage(inputs=[returns],
                                   window_length=int(252*0.5),
                                   mask=mean_reverting_mask)
    return meanrev

# Run the full tear sheet for the mean-reversion leg; keep the returned
# prices as a cache so later runs skip re-downloading pricing data.
prices_cache, factor, factor_data = \
run_tear_sheet( factor       = factor,
                factor_name  = factor_name,
                start_date   = start_date,
                end_date     = end_date,
                top_liquid   = top_liquid,
                show_sector_plots = show_sector_plots,
                avgretplot   = avgretplot,               
                periods      = periods,
                quantiles    = quantiles,
                bins         = bins,
                filter_zscore = filter_zscore,
                long_short   = long_short,
                prices_cache = prices_cache)
construct factor history
Get pricing for 3357 entries
Alphalens
Dropping inf or -inf values from factor
Quantiles Statistics
min max mean std count count %
factor_quantile
1 -0.200141 0.001447 -0.003375 0.003408 243478 20.102877
2 -0.007467 0.003171 -0.001434 0.001009 241622 19.949635
3 -0.005074 0.004957 -0.000693 0.000999 241608 19.948479
4 -0.003772 0.007920 0.000063 0.001132 241622 19.949635
5 -0.002251 0.223492 0.001900 0.003144 242830 20.049374
Returns Analysis
1 3 5 10
Ann. alpha 0.027 0.028 0.027 0.018
beta 0.108 0.127 0.138 0.155
Mean Period Wise Return Top Quantile (bps) 2.068 6.241 10.415 18.465
Mean Period Wise Return Bottom Quantile (bps) -1.668 -5.392 -9.180 -17.359
Mean Period Wise Spread (bps) 2.385 2.708 2.829 2.541
/usr/local/lib/python2.7/dist-packages/alphalens/plotting.py:727: FutureWarning: pd.rolling_apply is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(center=False,min_periods=1,window=3).apply(args=<tuple>,func=<function>,kwargs=<dict>)
  min_periods=1, args=(period,))
/usr/local/lib/python2.7/dist-packages/alphalens/plotting.py:767: FutureWarning: pd.rolling_apply is deprecated for DataFrame and will be removed in a future version, replace with 
	DataFrame.rolling(center=False,min_periods=1,window=3).apply(args=<tuple>,func=<function>,kwargs=<dict>)
  min_periods=1, args=(period,))
/usr/local/lib/python2.7/dist-packages/alphalens/plotting.py:727: FutureWarning: pd.rolling_apply is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(center=False,min_periods=1,window=5).apply(args=<tuple>,func=<function>,kwargs=<dict>)
  min_periods=1, args=(period,))
/usr/local/lib/python2.7/dist-packages/alphalens/plotting.py:767: FutureWarning: pd.rolling_apply is deprecated for DataFrame and will be removed in a future version, replace with 
	DataFrame.rolling(center=False,min_periods=1,window=5).apply(args=<tuple>,func=<function>,kwargs=<dict>)
  min_periods=1, args=(period,))
/usr/local/lib/python2.7/dist-packages/alphalens/plotting.py:727: FutureWarning: pd.rolling_apply is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(center=False,min_periods=1,window=10).apply(args=<tuple>,func=<function>,kwargs=<dict>)
  min_periods=1, args=(period,))
/usr/local/lib/python2.7/dist-packages/alphalens/plotting.py:767: FutureWarning: pd.rolling_apply is deprecated for DataFrame and will be removed in a future version, replace with 
	DataFrame.rolling(center=False,min_periods=1,window=10).apply(args=<tuple>,func=<function>,kwargs=<dict>)
  min_periods=1, args=(period,))
/usr/local/lib/python2.7/dist-packages/alphalens/plotting.py:519: FutureWarning: pd.rolling_mean is deprecated for Series and will be removed in a future version, replace with 
	Series.rolling(window=22,center=False).mean()
  pd.rolling_mean(mean_returns_spread_bps, 22).plot(color='orangered',
<matplotlib.figure.Figure at 0x7f6ec7f13050>
In [5]:
def factor(mask):
    """
    Trend-following leg: 6-month average of daily returns, restricted to
    assets whose Hurst exponent classifies them as trending
    (H > trending_threshold).
    """
    he = HurstExp(mask=mask)
    trending_mask = he > trending_threshold

    returns = Returns(window_length=2, mask=mask)
    # (The unused symmetric `meanrev` term from the original cell has been
    # removed; the previous cell covers the mean-reversion leg.)
    trending = SimpleMovingAverage(inputs=[returns],
                                   window_length=int(252*0.5),
                                   mask=trending_mask)
    return trending

# Run the tear sheet for the trending leg, reusing the price cache from
# the previous run (hence "Get pricing for 0 entries" in the output).
prices_cache, factor, factor_data = \
run_tear_sheet( factor       = factor,
                factor_name  = factor_name,
                start_date   = start_date,
                end_date     = end_date,
                top_liquid   = top_liquid,
                show_sector_plots = show_sector_plots,
                avgretplot   = avgretplot,               
                periods      = periods,
                quantiles    = quantiles,
                bins         = bins,
                filter_zscore = filter_zscore,
                long_short   = long_short,
                prices_cache = prices_cache)
construct factor history
Get pricing for 0 entries
Alphalens
Dropping inf or -inf values from factor
Quantiles Statistics
min max mean std count count %
factor_quantile
1 -0.094601 0.002337 -0.002673 0.003102 85965 20.288449
2 -0.009780 0.004596 -0.000468 0.001210 84141 19.857970
3 -0.006875 0.006043 0.000351 0.001055 84086 19.844990
4 -0.004926 0.009052 0.001158 0.001055 84141 19.857970
5 -0.003087 0.202222 0.003200 0.003206 85381 20.150620
Returns Analysis
1 3 5 10
Ann. alpha -0.039 -0.036 -0.026 -0.018
beta -0.169 -0.190 -0.194 -0.193
Mean Period Wise Return Top Quantile (bps) 0.054 0.305 2.061 11.353
Mean Period Wise Return Bottom Quantile (bps) 0.062 -0.432 -3.040 -12.730
Mean Period Wise Spread (bps) -1.971 -1.681 -1.102 0.113
<matplotlib.figure.Figure at 0x7f6eabdc4890>