The Estimize Signal is a cross-sectional score which captures several predictive factors based on Estimize’s proprietary database of crowdsourced earnings estimates. These factors include pre-earnings measures such as the difference between Estimize and Wall Street earnings forecasts, as well as post-earnings factors such as recent earnings surprises as benchmarked against Estimize forecasts. In constructing the Estimize Signal, we leveraged the research from our white paper, "Generating Abnormal Returns Using Crowdsourced Earnings Forecasts from Estimize," first written in 2014. The signal construction process included rigorous in- and out-of-sample testing, and represents a fairly parsimonious use of the Estimize data set.
In the following part of the notebook, we have codified an example of how the Estimize Signal could be traded in a single factor model, and show yearly as well as multiyear performance. The data used are pricing data (to calculate daily returns) provided by Quantopian, and the Estimize Signal data, which are split into yearly CSV files. To run this notebook you will need to download the Estimize Signal CSVs and place them into your research data directory on Quantopian. These files can be downloaded from the following links:
There are seven parameters that can be changed to control how the algo operates. With the default parameter settings, we have a universe consisting of U.S. equities with market caps >= \$100mm, average daily trading volumes of > \$1mm, and prices (split unadjusted) of > \$4. On each trade we create a long portfolio which is equally weighted among all stocks in the top 10% of stocks in this universe according to the Signal. Similarly the short portfolio is equally weighted among the stocks in the bottom 10%. Our dollar neutral return is the difference between the long portfolio’s return and the short portfolio’s return. In addition we are using a 2-to-1 leverage ratio.
# Import libraries
import pandas as pd
import numpy as np
import scipy.stats as ss
import statsmodels.api as sm
import matplotlib.pyplot as plt
import math
from datetime import timedelta
from statsmodels import regression
from statsmodels.discrete.discrete_model import Logit
from scipy.stats import mstats
from quantopian.research import run_pipeline, symbols
from quantopian.pipeline import CustomFactor, Pipeline
from quantopian.pipeline.data.builtin import USEquityPricing
from quantopian.pipeline.factors import AverageDollarVolume, CustomFactor
from quantopian.pipeline.factors.morningstar import MarketCap
from quantopian.pipeline.filters import CustomFilter
# Timezone in which signal and pricing timestamps are interpreted (NYSE local time).
TIMEZONE = 'US/Eastern'
# Asset universe parameters:
MIN_MARKET_CAP = 100e6  # Minimum market cap in dollars ($100mm)
MIN_AVG_DOLLAR_VOL = 1e6  # Minimum 20-day average daily dollar volume ($1mm)
MIN_PRICE = 4  # Minimum (split-unadjusted) last close price in dollars
# Signal parameters:
POST_SIGNAL_ONLY = False # If True, only post-earnings signal is used
# Trading parameters:
SIGNAL_QUANTILE = 0.10 # Upper and lower signal quantile to select assets
MIN_ASSETS = 20 # Min number of total long and short assets, below which no trades occur
LEVERAGE_RATIO = 2  # Leverage multiplier applied to the daily return series (2-to-1)
def get_assets(df):
    """Return the unique symbols found in *df*'s 'symbol' index level as a list.

    Order follows first appearance in the frame.
    """
    symbols = df.reset_index()['symbol'].unique()
    return list(symbols)
def universe(start_date, end_date):
    """Build the tradable asset universe between start_date and end_date.

    Screens U.S. equities on minimum market cap, 20-day average dollar
    volume, and last close price, then returns an (otherwise column-less)
    frame indexed by (as_of_date, symbol).
    """
    cap_ok = MarketCap() >= MIN_MARKET_CAP
    volume_ok = AverageDollarVolume(window_length=20) >= MIN_AVG_DOLLAR_VOL
    price_ok = USEquityPricing.close.latest >= MIN_PRICE
    pipe = Pipeline(screen=cap_ok & volume_ok & price_ok)
    result = run_pipeline(pipe, start_date, end_date)
    result.index.tz = TIMEZONE
    # Normalize the pipeline's (timestamp, asset) index to (date, symbol).
    result.reset_index(inplace=True)
    result.rename(columns={'level_0': 'as_of_date', 'level_1': 'symbol'},
                  inplace=True)
    result['as_of_date'] = result['as_of_date'].dt.date
    result.set_index(['as_of_date', 'symbol'], inplace=True)
    return result
def day_and_night_returns(start_date, end_date, assets):
    """Compute intraday and overnight returns for *assets* over a date range.

    'dayret' is the open-to-close return for each day; 'nightret' is the
    close-to-next-day-open return.  Result is indexed by
    (as_of_date, symbol).
    """
    prices = get_pricing(assets, fields=['open_price', 'close_price'],
                         start_date=start_date, end_date=end_date)
    opens = prices['open_price']
    closes = prices['close_price']
    intraday = (closes / opens - 1.0).stack()
    # shift(-1) pairs today's close with tomorrow's open.
    overnight = (opens.shift(-1) / closes - 1.0).stack()
    out = pd.DataFrame({'dayret': intraday, 'nightret': overnight})
    out.reset_index(inplace=True)
    out.rename(columns={'level_0': 'as_of_date', 'level_1': 'symbol'},
               inplace=True)
    out['as_of_date'] = out['as_of_date'].dt.date
    out.set_index(['as_of_date', 'symbol'], inplace=True)
    return out
def signals(year):
    """Load one year of Estimize Signal data from its local CSV file.

    The 2018 file covers only Q1, hence its special name.  Returns a frame
    indexed by (as_of_date, symbol) with the signal columns plus an 'hour'
    column taken from the original timestamp.
    """
    file_name = 'df2018q1.csv' if year == 2018 else 'df{}.csv'.format(year)
    df = local_csv(file_name, symbol_column='ticker', date_column='as_of')
    df.index.tz = TIMEZONE
    if POST_SIGNAL_ONLY:
        # Keep only the post-earnings component of the signal.
        df = df[df['type'] == 'post']
    df.reset_index(inplace=True)
    df.dropna(inplace=True)
    df.rename(columns={'ticker': 'symbol'}, inplace=True)
    # Split the timestamp into a calendar date and an hour-of-day bucket.
    df['as_of_date'] = df['as_of'].dt.date
    df['hour'] = df['as_of'].dt.hour
    df.drop(['cusip', 'fiscal_date', 'reports_at', 'as_of', 'type'],
            axis=1, inplace=True)
    df.set_index(['as_of_date', 'symbol'], inplace=True)
    return df
def longs_and_shorts(sdf):
    """Flag the long and short legs within each (date, hour) signal bucket.

    Expects *sdf* indexed by (as_of_date, symbol) with 'signal' and 'hour'
    columns.  Returns a frame indexed by (as_of_date, symbol) carrying
    boolean 'long'/'short' columns plus 'hour'; any (date, hour) bucket
    whose total selected-name count is below MIN_ASSETS is dropped.
    """
    df = sdf.reset_index()
    gdf = df.groupby(['as_of_date', 'hour'])
    df.set_index(['as_of_date', 'hour'], inplace=True)
    # Per-bucket signal quantiles; only the numeric 'signal' column
    # survives groupby().quantile(), hence the rename below.
    botq = gdf.quantile(SIGNAL_QUANTILE)
    botq.rename(columns={'signal': 'bottom_quantile'}, inplace=True)
    topq = gdf.quantile(1 - SIGNAL_QUANTILE)
    topq.rename(columns={'signal': 'top_quantile'}, inplace=True)
    # Align each row with its bucket's thresholds via the (as_of_date, hour)
    # index that both sides now share.
    df = df.join([botq, topq], how='inner')
    df['short'] = df['signal'] <= df['bottom_quantile']
    df['long'] = df['signal'] >= df['top_quantile']
    df = df[df['short'] | df['long']]
    df.drop(['bottom_quantile', 'top_quantile', 'signal'], axis=1, inplace=True)
    # Count of selected names per bucket; the assignment aligns on the
    # shared (as_of_date, hour) index.
    df['count'] = df.reset_index().groupby(['as_of_date', 'hour'])['symbol'].count()
    df = df[df['count'] >= MIN_ASSETS]
    df.drop(['count'], axis=1, inplace=True)
    df.reset_index(inplace=True)
    df.set_index(['as_of_date', 'symbol'], inplace=True)
    return df
def signal_returns(year):
    """Compute the daily dollar-neutral signal return series for one year.

    Joins the Estimize signal with the tradable universe, picks the long and
    short legs, and combines intraday/overnight returns into a single daily
    return.  Returns a DataFrame indexed by as_of_date with one 'return'
    column (or an empty frame if no assets qualify).
    """
    start_date = '{}-01-01'.format(year)
    if year != 2018:
        end_date = '{}-01-01'.format(year + 1)
    else:
        # The 2018 data set only covers Q1.
        end_date = '2018-04-01'
    udf = universe(start_date, end_date)
    sdf = signals(year)
    # Keep only signals for assets inside the tradable universe.
    fsdf = sdf.join(udf, how='inner')
    lsdf = longs_and_shorts(fsdf)
    assets = get_assets(lsdf)
    if len(assets) == 0:
        return pd.DataFrame(columns=['as_of_date', 'return'])
    rdf = day_and_night_returns(start_date, end_date, assets)
    df = lsdf.join(rdf, how='inner')
    # Short positions earn the negative of the asset's return.
    df.loc[df['short'] == True, ['dayret', 'nightret']] *= -1
    df.drop(['long', 'short'], axis=1, inplace=True)
    df.reset_index(inplace=True)
    # Signals stamped at hour 7 (presumably pre-market ET — confirm against
    # the signal files) are traded over the day session (open-to-close)...
    mdf = df[df['hour'] == 7].copy()
    mdf.drop(['nightret'], axis=1, inplace=True)
    mdf.rename(columns={'dayret': 'return'}, inplace=True)
    # ...while hour-14 signals are held overnight (close-to-next-open).
    ndf = df[df['hour'] == 14].copy()
    ndf.drop(['dayret'], axis=1, inplace=True)
    ndf.rename(columns={'nightret': 'return'}, inplace=True)
    # Calculate morning and overnight returns: equal-weight mean across
    # assets within each (date, hour) bucket.
    df = pd.concat([mdf, ndf])
    df = df.groupby(['as_of_date', 'hour']).mean()
    # Calculate daily returns: sum the two hourly buckets for each date.
    df.reset_index(inplace=True)
    df.drop(['hour'], axis=1, inplace=True)
    df = df.groupby(['as_of_date']).sum()
    return df
def show_cumulative_returns(df):
    """Plot the compounded cumulative return of *df*, in percent.

    Does nothing when the frame is empty.
    """
    if df.empty:
        return
    cumulative_pct = ((df + 1).cumprod() - 1) * 100
    cumulative_pct.plot()
    plt.xlabel('')
    plt.ylabel('')
    plt.legend(['Cumulative Return'],
               bbox_to_anchor=(0.50, 0.96),
               loc="upper center",
               borderaxespad=0,
               ncol=1)
    plt.show()
def performance_summary(df):
    """Build a one-row summary table of daily-return statistics.

    Expects *df* indexed by date-like values with a 'return' column.
    Returns None for an empty frame; otherwise a DataFrame with cumulative
    return, average daily return, day count, annualized return, Sharpe
    ratio, and annualized volatility.  NOTE: NaN rows are dropped in place,
    mutating the caller's frame (matches the original behavior).
    """
    if df.empty:
        return None
    df.dropna(inplace=True)
    returns = df['return']
    n_days = returns.count()
    mean_ret = returns.mean()
    std_ret = np.std(returns)  # population std (ddof=0), as in the original
    rows = []
    if n_days > 0 and std_ret > 0:
        total_ret = (returns + 1).prod() - 1
        ann_ret = mean_ret * 252
        ann_vol = std_ret * math.sqrt(252)
        sharpe = math.sqrt(252) * (mean_ret / std_ret)
        rows = np.array(
            [df.index[0].year, total_ret, mean_ret, n_days,
             ann_ret, sharpe, ann_vol]
        ).reshape(1, 7)
    summary = pd.DataFrame(
        data=rows,
        columns=[
            'Year',
            'Cumulative Return',
            'Avg Daily Return',
            'Num Trading Days',
            'Return (Annualized)',
            'Sharpe Ratio',
            'Volatility',
        ],
    )
    summary['Year'] = summary['Year'].astype(int)
    return summary
def show_yearly_performance(years):
    """Display per-year performance summaries and cumulative-return plots.

    Runs the single-factor model for each year in *years*, applies the
    leverage ratio, shows the yearly tables and plots, then the combined
    multi-year plot and the average of the yearly summary rows.  Returns
    the concatenated daily return series.
    """
    all_returns = None
    all_perf = None
    for year in years:
        yearly = signal_returns(year)
        yearly['return'] *= LEVERAGE_RATIO
        summary = performance_summary(yearly)
        display(summary)
        show_cumulative_returns(yearly)
        if all_returns is None:
            all_returns = yearly
            all_perf = summary
        else:
            all_returns = pd.concat([all_returns, yearly], copy=False)
            all_perf = pd.concat([all_perf, summary], copy=False)
    show_cumulative_returns(all_returns)
    # Average the yearly rows (Year itself is meaningless averaged, so drop it).
    all_perf.drop(['Year'], axis=1, inplace=True)
    all_perf = all_perf.mean().to_frame().transpose()
    display(all_perf)
    return all_returns
def show_yearly_performance_as_table(ardf):
    """Display the cumulative return series (in percent) as paginated tables.

    Compounds *ardf* into a cumulative percentage series and displays it in
    groups of up to ``group_size`` rows.

    Bug fixed: the original set ``end = -1`` whenever the final partial
    group ran past ``len(cdf)``, so ``iloc[start:-1]`` silently dropped the
    last row.  ``iloc`` clamps past-the-end stops, so slicing to
    ``start + group_size`` is both simpler and correct (it also removes the
    hard-coded 60 that shadowed ``group_size``).
    """
    cdf = ((ardf + 1).cumprod() - 1) * 100
    print(len(cdf))
    group_size = 60
    num_groups = int(math.ceil(float(len(cdf)) / group_size))
    for i in range(num_groups):
        start = i * group_size
        display(cdf.iloc[start:start + group_size])
# Run the single-factor backtest for calendar years 2012-2017
# (range() end is exclusive; pass a range including 2018 for the Q1-2018 data).
df = show_yearly_performance(range(2012, 2018))
# Optional: render the cumulative return series as paginated tables.
# show_yearly_performance_as_table(df)