import numpy as np
from pandas.tseries.offsets import CustomBusinessDay
from scipy.stats import mode
def compute_forward_scores(factor,
scores,
periods=(1, 5, 10),
filter_zscore=None):
"""
Finds the N period forward returns (as percent change) for each asset
provided.
Parameters
----------
factor : pd.Series - MultiIndex
A MultiIndex Series indexed by timestamp (level 0) and asset
(level 1), containing the values for a single alpha factor.
- See full explanation in utils.get_clean_factor_and_forward_returns
scores : pd.DataFrame
Pricing data to use in forward price calculation.
Assets as columns, dates as index. Pricing data must
span the factor analysis time period plus an additional buffer window
that is greater than the maximum number of expected periods
in the forward returns calculations.
periods : sequence[int]
periods to compute forward returns on.
filter_zscore : int or float, optional
Sets forward returns greater than X standard deviations
from the the mean to nan. Set it to 'None' to avoid filtering.
Caution: this outlier filtering incorporates lookahead bias.
Returns
-------
forward_returns : pd.DataFrame - MultiIndex
A MultiIndex DataFrame indexed by timestamp (level 0) and asset
(level 1), containing the forward returns for assets.
Forward returns column names follow the format accepted by
pd.Timedelta (e.g. '1D', '30m', '3h15m', '1D1h', etc).
'date' index freq property (forward_returns.index.levels[0].freq)
will be set to a trading calendar (pandas DateOffset) inferred
from the input data (see infer_trading_calendar for more details).
"""
factor_dateindex = factor.index.levels[0]
if factor_dateindex.tz != scores.index.tz:
raise NonMatchingTimezoneError("The timezone of 'factor' is not the "
"same as the timezone of 'scores'. See "
"the pandas methods tz_localize and "
"tz_convert.")
freq = infer_trading_calendar(factor_dateindex, scores.index)
factor_dateindex = factor_dateindex.intersection(scores.index)
if len(factor_dateindex) == 0:
raise ValueError("Factor and scores indices don't match: make sure "
"they have the same convention in terms of datetimes "
"and symbol-names")
forward_returns = pd.DataFrame(index=pd.MultiIndex.from_product(
[factor_dateindex, scores.columns], names=['date', 'asset']))
forward_returns.index.levels[0].freq = freq
for period in sorted(periods):
#
# build forward returns
#
fwdret = (scores
#.shift(-period)
.reindex(factor_dateindex)
)
if filter_zscore is not None:
mask = abs(fwdret - fwdret.mean()) > (filter_zscore * fwdret.std())
fwdret[mask] = np.nan
# Find the period length, which will be the column name
# Becase the calendar inferred from factor and prices doesn't take
# into consideration holidays yet, there could be some non-trading days
# in between the trades so we'll test several entries to find out the
# correct period length
#
entries_to_test = min(10, len(fwdret.index), len(scores.index)-period)
days_diffs = []
for i in range(entries_to_test):
p_idx = scores.index.get_loc(fwdret.index[i])
start = scores.index[p_idx]
end = scores.index[p_idx+period]
period_len = diff_custom_calendar_timedeltas(start, end, freq)
days_diffs.append(period_len.components.days)
delta_days = period_len.components.days - mode(days_diffs).mode[0]
period_len -= pd.Timedelta(days=delta_days)
# Finally use period_len as column name
column_name = timedelta_to_string(period_len)
forward_returns[column_name] = fwdret.stack()
forward_returns.index = forward_returns.index.rename(['date', 'asset'])
return forward_returns
def get_clean_factor_and_forward_scores(factor,
scores,
groupby=None,
binning_by_group=False,
quantiles=5,
bins=None,
periods=(1, 5, 10),
filter_zscore=20,
groupby_labels=None,
max_loss=10.0):
forward_scores = compute_forward_scores(factor, scores, periods,
filter_zscore)
factor_data = get_clean_factor(factor, forward_scores, groupby=groupby,
groupby_labels=groupby_labels,
quantiles=quantiles, bins=bins,
binning_by_group=binning_by_group,
max_loss=max_loss)
return factor_data
def infer_trading_calendar(factor_idx, prices_idx):
"""
Infer the trading calendar from factor and price information.
Parameters
----------
factor_idx : pd.DatetimeIndex
The factor datetimes for which we are computing the forward returns
prices_idx : pd.DatetimeIndex
The prices datetimes associated withthe factor data
Returns
-------
calendar : pd.DateOffset
"""
full_idx = factor_idx.union(prices_idx)
# drop days of the week that are not used
days_to_keep = []
days_of_the_week = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
for day, day_str in enumerate(days_of_the_week):
if (full_idx.dayofweek == day).any():
days_to_keep.append(day_str)
days_to_keep = ' '.join(days_to_keep)
# we currently don't infer holidays, but CustomBusinessDay class supports
# custom holidays. So holidays could be inferred too eventually
return CustomBusinessDay(weekmask=days_to_keep)
def diff_custom_calendar_timedeltas(start, end, freq):
"""
Compute the difference between two pd.Timedelta taking into consideration
custom frequency, which is used to deal with custom calendars, such as a
trading calendar
Parameters
----------
start : pd.Timestamp
end : pd.Timestamp
freq : DateOffset, optional
Returns
-------
pd.Timedelta
end - start
"""
actual_days = pd.date_range(start, end, freq=freq).shape[0] - 1
timediff = end - start
delta_days = timediff.components.days - actual_days
return timediff - pd.Timedelta(days=delta_days)
def get_clean_factor(factor,
forward_returns,
groupby=None,
binning_by_group=False,
quantiles=5,
bins=None,
groupby_labels=None,
max_loss=0.35):
initial_amount = float(len(factor.index))
factor = factor.copy()
factor.index = factor.index.rename(['date', 'asset'])
merged_data = forward_returns.copy()
merged_data['factor'] = factor
if groupby is not None:
if isinstance(groupby, dict):
diff = set(factor.index.get_level_values(
'asset')) - set(groupby.keys())
if len(diff) > 0:
raise KeyError(
"Assets {} not in group mapping".format(
list(diff)))
ss = pd.Series(groupby)
groupby = pd.Series(index=factor.index,
data=ss[factor.index.get_level_values(
'asset')].values)
if groupby_labels is not None:
diff = set(groupby.values) - set(groupby_labels.keys())
if len(diff) > 0:
raise KeyError(
"groups {} not in passed group names".format(
list(diff)))
sn = pd.Series(groupby_labels)
groupby = pd.Series(index=groupby.index,
data=sn[groupby.values].values)
merged_data['group'] = groupby.astype('category')
merged_data = merged_data.dropna()
fwdret_amount = float(len(merged_data.index))
no_raise = False if max_loss == 0 else True
merged_data['factor_quantile'] = quantize_factor(merged_data,
quantiles,
bins,
binning_by_group,
no_raise)
merged_data = merged_data.dropna()
binning_amount = float(len(merged_data.index))
tot_loss = (initial_amount - binning_amount) / initial_amount
fwdret_loss = (initial_amount - fwdret_amount) / initial_amount
bin_loss = tot_loss - fwdret_loss
# print("Dropped %.1f%% entries from factor data: %.1f%% in forward "
# "returns computation and %.1f%% in binning phase "
# "(set max_loss=0 to see potentially suppressed Exceptions)." %
# (tot_loss * 100, fwdret_loss * 100, bin_loss * 100))
if tot_loss > max_loss:
message = ("max_loss (%.1f%%) exceeded %.1f%%, consider increasing it."
% (max_loss * 100, tot_loss * 100))
raise MaxLossExceededError(message)
else:
# print("max_loss is %.1f%%, not exceeded: OK!" % (max_loss * 100))
pass
return merged_data
def timedelta_to_string(timedelta):
c = timedelta.components
format = ''
if c.days != 0:
format += '%dD' % c.days
if c.hours > 0:
format += '%dh' % c.hours
if c.minutes > 0:
format += '%dm' % c.minutes
if c.seconds > 0:
format += '%ds' % c.seconds
if c.milliseconds > 0:
format += '%dms' % c.milliseconds
if c.microseconds > 0:
format += '%dus' % c.microseconds
if c.nanoseconds > 0:
format += '%dns' % c.nanoseconds
return format
def add_custom_calendar_timedelta(inputs, timedelta, freq):
days = timedelta.components.days
offset = timedelta - pd.Timedelta(days=days)
return inputs + freq * days + offset
def non_unique_bin_edges_error(func):
message = """
An error occurred while computing bins/quantiles on the input provided.
This usually happens when the input contains too many identical
values and they span more than one quantile. The quantiles are choosen
to have the same number of records each, but the same value cannot span
multiple quantiles. Possible workarounds are:
1 - Decrease the number of quantiles
2 - Specify a custom quantiles range, e.g. [0, .50, .75, 1.] to get unequal
number of records per quantile
3 - Use 'bins' option instead of 'quantiles', 'bins' chooses the
buckets to be evenly spaced according to the values themselves, while
'quantiles' forces the buckets to have the same number of records.
4 - for factors with discrete values use the 'bins' option with custom
ranges and create a range for each discrete value
Please see utils.get_clean_factor_and_forward_returns documentation for
full documentation of 'bins' and 'quantiles' options.
"""
def dec(*args, **kwargs):
try:
return func(*args, **kwargs)
except ValueError as e:
if 'Bin edges must be unique' in str(e):
rethrow(e, message)
raise
return dec
@non_unique_bin_edges_error
def quantize_factor(factor_data,
quantiles=5,
bins=None,
by_group=False,
no_raise=False):
if not ((quantiles is not None and bins is None) or
(quantiles is None and bins is not None)):
raise ValueError('Either quantiles or bins should be provided')
def quantile_calc(x, _quantiles, _bins, _no_raise):
try:
if _quantiles is not None and _bins is None:
return pd.qcut(x, _quantiles, labels=False) + 1
elif _bins is not None and _quantiles is None:
return pd.cut(x, _bins, labels=False) + 1
except Exception as e:
if _no_raise:
return pd.Series(index=x.index)
raise e
grouper = [factor_data.index.get_level_values('date')]
if by_group:
grouper.append('group')
factor_quantile = factor_data.groupby(grouper)['factor'] \
.apply(quantile_calc, quantiles, bins, no_raise)
factor_quantile.name = 'factor_quantile'
return factor_quantile.dropna()
def new_is_positive():
return False
import pandas as pd
df = local_csv('DKSalaries.csv')
df['game_datetime'] = pd.to_datetime(df['game_date'])
df = df.set_index(['game_date', 'player_id'])
df.head()
df.iloc[10]
Alphalens (Factor Analysis) Alphalens is a tool on Quantopian for analyzing the predictve ability of a factor. In this notebook, we will use Alphalens to analyze the predictive ability of a fantasy points per game factor.
The first step to analzing our factor is to define it. Here, we will define the fantasy points per game factor, dk_score to be the rolling mean fantasy points per game (using DK scoring rules)
# player fantasy score per game.
dk_score = df['dk_score']
dk_score.head()
Optimize (Portfolio Optimization) Now that we have identified some strong predictors of fantasy points, let's use the data to build an optimal portfolio each day. Specifically, we will pick a team of players (construct a portfolio) based on their mean points per game over the last 20 days.
On Quantopian, we can use the Optimize API to help us calculate an optimal 'portfolio' each day. In finance, this means picking the stocks that we should hold each day based on an objective function and a set of constraints. In daily fantasy sports, the problem is the same: we want to pick a lineup based on an objective function (maximize DK points) subject to a series of constraints (based on the rules of the game).
Generally speaking, constraints are the easy part. The rules are set before every game and we know that our lineup has to follow these rules. Specifically, NFL contests at DraftKings has the following rules:
The lineup must be composed of exactly 9 players. The sum of the fictional player salaries in a lineup must be less than 50,000. A lineup must include players from at least two NFL games. A lineup must include at least one player classified in each of the following positions:
QB RB RB WR WR WR TE DST (Defense / Special Teams) FLEX
The challenging part is creating an objective function. We know that we want to maximize the number of DK points that we get on a given day, but without time travel, it's difficult to know exactly what the function looks like. In practice, this means we need to define an 'expected value' objective that represents our expected performance of each player. Earlier, we saw that our trailing points per game factor was a good predictor of DK points, so let's use that as our objective function.
As an optimization problem, this means that we want to pick a team of players subject to the DK rules (salary cap, positions, etc.) that maximizes the total trailing points per game factor.
Let's define our problem using the Optimize API.
# Import the Optimize namespace.
import quantopian.optimize as opt
# Player costs.
player_costs = pd.DataFrame(df['dk_salary']).dropna().sort_index(level=['game_date', 'player_id'])
# Player positions.
player_positions = df['dk_position'].dropna()
# Define the game date that we want to test. Change this and re-run
# cells below to build a lineup for another date.
TEST_DATE = '2019-10-21'
# Format our expected value factor (dk_score) for use in Optimize.
expected_value = dk_score[player_costs.index].dropna()
expected_value = expected_value[expected_value.index.get_level_values(0) == TEST_DATE]
# Objective function to be fed into Optimize later on.
obj = opt.MaximizeAlpha(expected_value)
# Salary cap constraint. The total cost of our lineup has to be less than 50,000.
# We will define this as a FactorExposure constraint.
cost_limit = opt.FactorExposure(
player_costs,
min_exposures={'dk_salary': 0},
max_exposures={'dk_salary': 50000}
)
# Map from each player to its position.
labels = df['dk_position'].apply(lambda x: x[0])
# # Maps from each position to its min/max exposure.
min_exposures = {'WR/FLEX': 3, 'RB/FLEX': 2, 'QB': 1, 'TE/FLEX':1, 'DST':1}
max_exposures = {'WR/FLEX': 4, 'RB/FLEX': 3, 'QB': 1, 'TE/FLEX':2, 'DST':1}
player_position_constraint = opt.NetGroupExposure(labels, min_exposures, max_exposures)
# This constraints tells the Optimizer than we can hold at most 1 of each player
discretize_players = opt.PositionConcentration(
pd.Series([]),
pd.Series([]),
default_min_weight=0,
default_max_weight=1,
)
max_players = opt.MaxGrossExposure(9)
result = opt.calculate_optimal_portfolio(
objective=obj,
constraints=[
discretize_players,
player_position_constraint,
max_players,
cost_limit,
]
)