The Estimize Signal¶

The Estimize Signal is a cross sectional score which captures several predictive factors based on Estimize’s proprietary database of crowdsourced earnings estimates. These factors include pre-earnings measures such as the difference between Estimize and Wall Street earnings forecasts, as well as post-earnings factors such as recent earnings surprises as benchmarked against Estimize forecasts. In constructing the Estimize Signal, we leveraged the research from our white paper, "Generating Abnormal Returns Using Crowdsourced Earnings Forecasts fromEstimize," first written in 2014. The signal construction process included rigorous in-and out-of-sample testing, and represents a fairly parsimonious use of the Estimize data set.

Backtest Results¶

In the following part of the notebook, we have codified an example of how the Estimize Signal could be traded in a single factor model, and show yearly as well as multiyear performance. The data used are pricing data (to calculate daily returns) provided by Quantopian, and the Estimize Signal data, which are split into yearly CSV files. To run this notebook you will need to download the Estimize Signal CSVs and place them into your research data directory on Quantopian. These files can be downloaded from the following links:

There are seven parameters that can be changed to control how the algo operates. With the default parameter settings, we have a universe consisting of U.S. equities with market caps >= $100mm, average daily trading volumes of > $1mm, and prices (split unadjusted) of > $4. On each trade we create a long portfolio which is equally weighted among all stocks in the top 10% of stocks in this universe according to the Signal. Similarly the short portfolio is equally weighted among the stocks in the bottom 10%. Our dollar neutral return is the difference between the long portfolio’s return and the short portfolio’s return. In addition we are using a 2-to-1 leverage ratio.

# Import libraries

import pandas as pd
import numpy as np
import scipy.stats as ss
import statsmodels.api as sm
import matplotlib.pyplot as plt
import math

from datetime import timedelta
from statsmodels import regression
from statsmodels.discrete.discrete_model import Logit
from scipy.stats import mstats

from quantopian.research import run_pipeline, symbols
from quantopian.pipeline import CustomFactor, Pipeline
from quantopian.pipeline.data.builtin import USEquityPricing
from quantopian.pipeline.factors import AverageDollarVolume, CustomFactor
from quantopian.pipeline.factors.morningstar import MarketCap
from quantopian.pipeline.filters import CustomFilter

TIMEZONE = 'US/Eastern'

# Asset universe parameters:
MIN_MARKET_CAP = 100e6
MIN_AVG_DOLLAR_VOL = 1e6
MIN_PRICE = 4

# Signal parameters:
POST_SIGNAL_ONLY = False # If True, only post-earnings signal is used

# Trading parameters:
SIGNAL_QUANTILE = 0.10 # Upper and lower signal quantile to select assets
MIN_ASSETS = 20 # Min number of total long and short assets, below which no trades occur
LEVERAGE_RATIO = 2

def get_assets(df):
    return df.reset_index()['symbol'].unique().tolist()

def universe(start_date, end_date):
    market_cap = MarketCap()
    adv = AverageDollarVolume(window_length=20)
    last_close = USEquityPricing.close.latest
    
    min_market_cap = market_cap >= MIN_MARKET_CAP
    min_adv = adv >= MIN_AVG_DOLLAR_VOL
    min_last_close = last_close >= MIN_PRICE
    screen = (min_market_cap & min_adv & min_last_close)
    
    pipeline = Pipeline(screen=screen)
    df = run_pipeline(pipeline, start_date, end_date)
    
    df.index.tz = TIMEZONE
    df.reset_index(inplace=True)
    df.rename(columns={'level_0': 'as_of_date', 'level_1': 'symbol'}, inplace=True)
    df['as_of_date'] = df['as_of_date'].dt.date
    df.set_index(['as_of_date', 'symbol'], inplace=True)
    
    return df

def day_and_night_returns(start_date, end_date, assets):
    pl = get_pricing(assets, fields=['open_price', 'close_price'], start_date=start_date, end_date=end_date)
    op = pl['open_price']
    cp = pl['close_price']
    
    dr = ((cp / op) - 1.0).stack()
    nr = ((op.shift(-1) / cp) - 1.0).stack()
    df = pd.DataFrame(dict(dayret=dr, nightret=nr))
    
    df.reset_index(inplace=True)
    df.rename(columns={'level_0': 'as_of_date', 'level_1': 'symbol'}, inplace=True)
    df['as_of_date'] = df['as_of_date'].dt.date
    df.set_index(['as_of_date', 'symbol'], inplace=True)

    return df

def signals(year):
    if year != 2018:
        file_name = 'df{}.csv'.format(year)
    else:
        file_name = 'df2018q1.csv'
    
    df = local_csv(file_name, symbol_column='ticker', date_column='as_of')
    df.index.tz = TIMEZONE
    
    if POST_SIGNAL_ONLY:
        df = df[df['type'] == 'post']
    
    df.reset_index(inplace=True)
    df.dropna(inplace=True)
    df.rename(columns={'ticker': 'symbol', 'as_of': 'as_of'}, inplace=True)
    
    df['as_of_date'] = df['as_of'].dt.date
    df['hour'] = df['as_of'].dt.hour
    df.drop(['cusip', 'fiscal_date', 'reports_at', 'as_of', 'type'], axis=1, inplace=True)
    df.set_index(['as_of_date', 'symbol'], inplace=True)
    
    return df

def longs_and_shorts(sdf):
    df = sdf.reset_index()
    gdf = df.groupby(['as_of_date', 'hour'])
    df.set_index(['as_of_date', 'hour'], inplace=True)

    botq = gdf.quantile(SIGNAL_QUANTILE)
    botq.rename(columns={'signal': 'bottom_quantile'}, inplace=True)

    topq = gdf.quantile(1 - SIGNAL_QUANTILE)
    topq.rename(columns={'signal': 'top_quantile'}, inplace=True)

    df = df.join([botq, topq], how='inner')
    df['short'] = df['signal'] <= df['bottom_quantile']
    df['long'] = df['signal'] >= df['top_quantile']
    df = df[df['short'] | df['long']]
    df.drop(['bottom_quantile', 'top_quantile', 'signal'], axis=1, inplace=True)
    
    df['count'] = df.reset_index().groupby(['as_of_date', 'hour'])['symbol'].count()
    df = df[df['count'] >= MIN_ASSETS]
    df.drop(['count'], axis=1, inplace=True)
    df.reset_index(inplace=True)
    df.set_index(['as_of_date', 'symbol'], inplace=True)
    
    return df

def signal_returns(year):
    start_date = '{}-01-01'.format(year)
    
    if year != 2018:
        end_date = '{}-01-01'.format(year + 1)
    else:
        end_date = '2018-04-01'
    
    udf = universe(start_date, end_date)
    sdf = signals(year)
    fsdf = sdf.join(udf, how='inner')
    
    lsdf = longs_and_shorts(fsdf)
    assets = get_assets(lsdf)
    
    if len(assets) == 0:
        return pd.DataFrame(columns=['as_of_date', 'return'])
    
    rdf = day_and_night_returns(start_date, end_date, assets)
    
    df = lsdf.join(rdf, how='inner')
    df.loc[df['short'] == True, ['dayret', 'nightret']] *= -1
    df.drop(['long', 'short'], axis=1, inplace=True)
    df.reset_index(inplace=True)
    
    mdf = df[df['hour'] == 7].copy()
    mdf.drop(['nightret'], axis=1, inplace=True)
    mdf.rename(columns={'dayret': 'return'}, inplace=True)
    
    ndf = df[df['hour'] == 14].copy()
    ndf.drop(['dayret'], axis=1, inplace=True)
    ndf.rename(columns={'nightret': 'return'}, inplace=True)
    
    # Calculate morning and overnight returns
    df = pd.concat([mdf, ndf])
    df = df.groupby(['as_of_date', 'hour']).mean()
    
    # Calculate daily returns
    df.reset_index(inplace=True)
    df.drop(['hour'], axis=1, inplace=True)
    df = df.groupby(['as_of_date']).sum()
    
    return df
    
def show_cumulative_returns(df):
    if not df.empty:
        cdf = ((df + 1).cumprod() - 1) * 100
        cdf.plot()
        plt.xlabel('')
        plt.ylabel('')
        plt.legend(
            ['Cumulative Return'],
            bbox_to_anchor=(0.50, 0.96),
            loc="upper center",
            borderaxespad=0,
            ncol=1
        )
        plt.show()
    
def performance_summary(df):
    if df.empty:
        return None
    
    df.dropna(inplace=True)
    numday = df['return'].count()
    avgret = df['return'].mean()
    stdret = np.std(df['return'])
    
    data = []
    
    if numday > 0 and stdret > 0:
        cumret = ((df['return'] + 1).prod() - 1)
        annret = avgret * 252
        volatl = stdret * math.sqrt(252)
        sharpe = math.sqrt(252) * (avgret / stdret)
        
        data = np.array([df.index[0].year, cumret, avgret, numday, annret, sharpe, volatl]).reshape(1, 7)
        
    columns = [
        'Year',
        'Cumulative Return',
        'Avg Daily Return',
        'Num Trading Days',
        'Return (Annualized)',
        'Sharpe Ratio',
        'Volatility'
    ]
    
    pdf = pd.DataFrame(data=data, columns=columns)
    pdf['Year'] = pdf['Year'].astype(int)
    
    return pdf

def show_yearly_performance(years):
    ardf = None
    aperf = None
    
    for year in years:
        srdf = signal_returns(year)
        srdf['return'] *= LEVERAGE_RATIO

        perf = performance_summary(srdf)
        display(perf)
        
        show_cumulative_returns(srdf)
        
        if ardf is not None:
            ardf = pd.concat([ardf, srdf], copy=False)
            aperf = pd.concat([aperf, perf], copy=False)
        else:
            ardf = srdf
            aperf = perf
            
    show_cumulative_returns(ardf)
    
    aperf.drop(['Year'], axis=1, inplace=True)
    aperf = aperf.mean().to_frame().transpose()
    display(aperf)

    return ardf

def show_yearly_performance_as_table(ardf):
    cdf = ((ardf + 1).cumprod() - 1) * 100

    print(len(cdf))

    group_size = 60
    num_groups = int(math.ceil(float(len(cdf)) / group_size))

    for i in range(0, num_groups):
        start = i * group_size
        end = start + 60

        if end > len(cdf):
            end = -1

        pdf = cdf.iloc[start:end]
        display(pdf)

df = show_yearly_performance(range(2012, 2018))

# show_yearly_performance_as_table(df)