Japan challenge submission template

In [1]:
from quantopian.pipeline import Pipeline, CustomFactor
from quantopian.pipeline.data import EquityPricing, factset
from quantopian.pipeline.factors import Returns, SimpleMovingAverage
from quantopian.pipeline.domain import (
    AT_EQUITIES, # Austria
    AU_EQUITIES, # Australia
    BE_EQUITIES, # Belgium
    BR_EQUITIES, # Brazil
    CA_EQUITIES, # Canada
    CH_EQUITIES, # Switzerland
    CN_EQUITIES, # China
    DE_EQUITIES, # Germany
    DK_EQUITIES, # Denmark
    ES_EQUITIES, # Spain
    FI_EQUITIES, # Finland
    FR_EQUITIES, # France
    GB_EQUITIES, # Great Britain
    HK_EQUITIES, # Hong Kong
    IE_EQUITIES, # Ireland
    IN_EQUITIES, # India
    IT_EQUITIES, # Italy
    JP_EQUITIES, # Japan
    KR_EQUITIES, # South Korea
    NL_EQUITIES, # Netherlands
    NO_EQUITIES, # Norway
    NZ_EQUITIES, # New Zealand
    PT_EQUITIES, # Portugal
    SE_EQUITIES, # Sweden
    SG_EQUITIES, # Singapore
    US_EQUITIES, # United States
)
from quantopian.research import run_pipeline

import pandas as pd
import numpy as np

import time

import matplotlib.pyplot as plt
import seaborn as sns

import empyrical as ep
import alphalens as al
import pyfolio as pf

Helper functions

In [2]:
# Returns 1 for any asset that passed the input filter at least once over
# the trailing window (boolean filter inputs arrive as 0/1 values).
class FilterExtend(CustomFactor):
    def compute(self, today, asset_ids, out, values):
        out[:] = np.max(values, axis=0)

def evaluate_factor(factor, 
                    domain, 
                    start_date, 
                    end_date,
                    factor_screen=None,
                    quantiles=5,
                    returns_lengths=(1, 5, 10)):
    """Analyze a Pipeline Factor using Alphalens.
    
    Parameters
    ----------
    factor : quantopian.pipeline.factors.Factor
        Factor producing scores to be evaluated.
    domain : quantopian.pipeline.domain.Domain
        Domain on which the factor should be evaluated.
    start_date : str or pd.Timestamp
        Start date for evaluation period.
    end_date : str or pd.Timestamp
        End date for evaluation period.
    factor_screen : quantopian.pipeline.filters.Filter, optional
        Filter defining which assets ``factor`` should be evaluated on.
        Default is ``factor.notnull()``.
    quantiles : int, optional
        Number of buckets to use for quantile groups. Default is 5.
    returns_lengths : sequence[int]
        Forward-returns horizons to use when evaluating ``factor``. 
        Default is 1-day, 5-day, and 10-day returns.
        
    Returns
    -------
    factor_data : pd.DataFrame
        A (date, asset)-indexed DataFrame with the following columns:
            'factor': float64
                Values produced by ``factor``.
            'factor_quantile': int64
                Daily quantile label for each (date, asset) pair.
            '1D', '5D', ...: float64
                Forward returns over the horizons in ``returns_lengths``.
    """
    calendar = domain.calendar
    # Roll input dates to the next trading session.
    start_date = calendar.minute_to_session_label(pd.Timestamp(start_date, tz='UTC'))
    end_date = calendar.minute_to_session_label(pd.Timestamp(end_date, tz='UTC'))
    
    if factor_screen is None:
        factor_screen = factor.notnull()
        
    # Run pipeline to get factor values and quantiles.
    factor_pipe = Pipeline(
        {'factor': factor, 
         'factor_quantile': factor.quantiles(quantiles, mask=factor_screen)},
        screen=factor_screen,
        domain=domain,
    )
    factor_results = run_pipeline(factor_pipe, start_date, end_date, chunksize=250)
        
    returns_pipe = Pipeline(
        columns={'daily_returns': Returns(window_length=2)}, 
        domain=domain, 
        screen=FilterExtend(
            inputs=[factor_screen], 
            window_length=max(returns_lengths)+1
        ).eq(1)
    )
    
    # Compute returns for the period after the factor pipeline, then 
    # shift the results back to align with our factor values.
    returns_start_date = start_date
    returns_end_date = end_date + domain.calendar.day * max(returns_lengths)
    raw_returns = run_pipeline(returns_pipe, returns_start_date, returns_end_date, chunksize=500)
    
    shifted_returns = {}
    column_order = []
    daily_returns = raw_returns['daily_returns']
    for length in returns_lengths:
        # Shift 1-day returns back by a day, 5-day returns back by 5 days, etc.
        colname = '{}D'.format(length)
        column_order.append(colname)
        shifted_returns[colname] = backshift_returns_series(daily_returns, length)
        
    # Merge backshifted returns into a single frame indexed like our desired output.
    merged_returns = pd.DataFrame(
        data=shifted_returns, 
        index=factor_results.index, 
        columns=column_order,
    )
    
    # Concat factor results and forward returns column-wise.
    merged = pd.concat([factor_results, merged_returns], axis=1)
    merged.index.set_names(['date', 'asset'], inplace=True)
    
    # Drop NaNs
    merged = merged.dropna(how='any')
    
    # Attach a business-day frequency to the date level of the index
    # (some Alphalens functions expect the date index to carry a freq).
    merged.index.levels[0].freq = pd.tseries.offsets.BDay()
    
    return merged

def backshift_returns_series(series, N):
    """Shift a multi-indexed series backwards by N observations in the first level.
    
    This can be used to convert backward-looking returns into a forward-returns series.
    """
    ix = series.index
    dates, sids = ix.levels
    date_labels, sid_labels = map(np.array, ix.labels)
    # Output date labels will contain all but the last N dates.
    new_dates = dates[:-N]
    # Output data will remove the first M rows, where M is the number of
    # rows whose date is one of the first N dates.
    cutoff = date_labels.searchsorted(N)
    new_date_labels = date_labels[cutoff:] - N
    new_sid_labels = sid_labels[cutoff:]
    new_values = series.values[cutoff:]
    assert new_date_labels[0] == 0
    new_index = pd.MultiIndex(
        levels=[new_dates, sids],
        labels=[new_date_labels, new_sid_labels],
        sortorder=1,
        names=ix.names,
    )
    return pd.Series(data=new_values, index=new_index)
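
# For intuition, a tiny sketch on hypothetical toy data: the trailing
# return reported on 2015-01-05 moves back onto 2015-01-02, turning
# backward-looking returns into forward returns.
toy_dates = pd.to_datetime(['2015-01-02', '2015-01-05', '2015-01-06'])
toy_ix = pd.MultiIndex.from_product([toy_dates, ['A', 'B']], names=['date', 'asset'])
toy_returns = pd.Series(np.arange(6, dtype=float), index=toy_ix)
# Shifting back by 1 drops the last date; e.g. the value at
# (2015-01-05, 'A') now sits at (2015-01-02, 'A').
toy_forward = backshift_returns_series(toy_returns, 1)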
In [3]:
def compute_turnover(df):
    # Pivot to a dates x assets frame of weights, treat exited names as a
    # weight of zero, and sum absolute day-over-day weight changes per date.
    return df.dropna().unstack().dropna(how='all').fillna(0).diff().abs().sum(1)

def get_max_median_position_concentration(expos):
    # Per-date distribution of position weights: the 5th, 25th, 50th, 75th,
    # and 95th percentiles of the weights held on each date.
    return expos.groupby(level=0).quantile([.05, .25, .5, .75, .95]).unstack()

def compute_factor_stats(factor_data_total, periods=range(1, 15)):
    portfolio_returns_total = al.performance.factor_returns(factor_data_total)
    portfolio_returns_total.columns = portfolio_returns_total.columns.map(lambda x: int(x[:-1]))
    # Shift each horizon's returns to their realization dates, so column i
    # represents trading the signal with an i-day delay.
    for i in portfolio_returns_total.columns:
        portfolio_returns_total[i] = portfolio_returns_total[i].shift(i)

    delay_sharpes_total = portfolio_returns_total.apply(ep.sharpe_ratio)
    
    factor = factor_data_total.factor
    turnover = compute_turnover(factor)
    n_holdings = factor.groupby(level=0).count()
    perc_holdings = get_max_median_position_concentration(factor)
    
    return {'factor_data_total': factor_data_total, 
            'portfolio_returns_total': portfolio_returns_total,
            'delay_sharpes_total': delay_sharpes_total,
            'turnover': turnover,
            'n_holdings': n_holdings,
            'perc_holdings': perc_holdings,
    }

def plot_overview_tear_sheet(factor_data, periods=range(1, 15)):
    # We treat factor scores as portfolio weights, so normalize them such
    # that absolute scores sum to 1 on each date.
    factor_data['factor'] = factor_data.factor.div(factor_data.abs().groupby(level='date').sum()['factor'])
    
    fig = plt.figure(figsize=(16, 16))
    gs = plt.GridSpec(3, 4)
    ax1 = plt.subplot(gs[0:2, 0:2])
    
    factor_stats = compute_factor_stats(factor_data, periods=periods)
                         
    pd.DataFrame({'total': factor_stats['delay_sharpes_total']}).plot.bar(ax=ax1)
    ax1.set(xlabel='delay', ylabel='IR')

    ax2a = plt.subplot(gs[0:2, 2:4])
    delay_cum_rets_total = factor_stats['portfolio_returns_total'][list(range(1, 5))].apply(ep.cum_returns)
    delay_cum_rets_total.plot(ax=ax2a)
    ax2a.set(title='Total returns', ylabel='Cumulative returns')
    
    ax6 = plt.subplot(gs[-1, 0:2])
    factor_stats['n_holdings'].plot(color='b', ax=ax6)
    ax6.set_ylabel('# holdings', color='b')
    ax6.tick_params(axis='y', labelcolor='b')
    
    ax62 = ax6.twinx()
    factor_stats['turnover'].plot(color='r', ax=ax62)
    ax62.set_ylabel('turnover', color='r')
    ax62.tick_params(axis='y', labelcolor='r')
    
    ax7 = plt.subplot(gs[-1, 2:4])
    factor_stats['perc_holdings'].plot(ax=ax7)
    ax7.set(ylabel='Long/short perc holdings')
    
    gs.tight_layout(fig)
    
    return fig, factor_stats

Universe definition

In [4]:
# Custom factor that gets the minimum volume traded over the last two weeks.
class MinVolume(CustomFactor):
    inputs=[EquityPricing.volume]
    window_length=10
    def compute(self, today, asset_ids, out, values):
        # Take the column-wise minimum volume over the window. NaNs
        # propagate, so assets with missing volume fall out of the
        # filter below.
        out[:] = np.min(values, axis=0)

# Create a volume and price filter that filters for stocks in the top 30%.
# We multiply by price to rule out penny stocks that trade in huge volume.
volume_min = MinVolume()
price = EquityPricing.close.latest
univ_filter = ((price * volume_min).percentile_between(70, 100, mask=(volume_min > 0)))
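
# Optional sanity check (a sketch; the dates below are arbitrary): run the
# screen on its own to count how many names pass on each session.
universe_pipe = Pipeline({'close': price}, screen=univ_filter, domain=JP_EQUITIES)
# run_pipeline(universe_pipe, '2018-09-03', '2018-09-07').groupby(level=0).size()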

Enter your alpha factor here. Make sure to replace the example factor in the following cell with your own before making your submission!

In [5]:
# Our alpha factor is a size-based factor.
alpha_factor = -factset.Fundamentals.mkt_val.latest.log1p()
alpha_winsorized = alpha_factor.winsorize(min_percentile=0.05,
                                          max_percentile=0.95,
                                          mask=univ_filter)
    
# Zscore to get long and short (positive and negative) alphas to use as weights
alpha_zscore = alpha_winsorized.zscore()
In [6]:
# Call evaluate_factor on our factor to get Alphalens-formatted data.
al_data = evaluate_factor(
    alpha_zscore, 
    JP_EQUITIES, 
    '2015-06-1', 
    '2018-10-1', 
    factor_screen=univ_filter,
    returns_lengths=range(1, 15),
)

Pipeline Execution Time: 9.49 Seconds

Pipeline Execution Time: 2.10 Seconds
In [7]:
plot_overview_tear_sheet(al_data);