Notebook
In [ ]:
import numpy as np
import pandas as pd

from pandas.tseries.offsets import CustomBusinessDay
from scipy.stats import mode
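
# The helper functions below reference a few names that this notebook never
# defines (in the original library they live in alphalens.utils). Minimal
# stand-ins, assumed equivalent to the originals, so the cell runs on its own:
class NonMatchingTimezoneError(Exception):
    pass


class MaxLossExceededError(Exception):
    pass


def rethrow(exception, additional_message):
    """Re-raise `exception` with `additional_message` appended to its args."""
    e = exception
    m = additional_message
    if not e.args:
        e.args = (m,)
    else:
        e.args = (e.args[0] + m,) + e.args[1:]
    raise e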

def compute_forward_scores(factor,
                            scores,
                            periods=(1, 5, 10),
                            filter_zscore=None):
    """
    Finds the N period forward returns (as percent change) for each asset
    provided.
    Parameters
    ----------
    factor : pd.Series - MultiIndex
        A MultiIndex Series indexed by timestamp (level 0) and asset
        (level 1), containing the values for a single alpha factor.
        - See full explanation in utils.get_clean_factor_and_forward_returns
    scores : pd.DataFrame
        Pricing data to use in forward price calculation.
        Assets as columns, dates as index. Pricing data must
        span the factor analysis time period plus an additional buffer window
        that is greater than the maximum number of expected periods
        in the forward returns calculations.
    periods : sequence[int]
        periods to compute forward returns on.
    filter_zscore : int or float, optional
        Sets forward returns greater than X standard deviations
        from the the mean to nan. Set it to 'None' to avoid filtering.
        Caution: this outlier filtering incorporates lookahead bias.
    Returns
    -------
    forward_returns : pd.DataFrame - MultiIndex
        A MultiIndex DataFrame indexed by timestamp (level 0) and asset
        (level 1), containing the forward returns for assets.
        Forward returns column names follow the format accepted by
        pd.Timedelta (e.g. '1D', '30m', '3h15m', '1D1h', etc).
        'date' index freq property (forward_returns.index.levels[0].freq)
        will be set to a trading calendar (pandas DateOffset) inferred
        from the input data (see infer_trading_calendar for more details).
    """

    factor_dateindex = factor.index.levels[0]
    if factor_dateindex.tz != scores.index.tz:
        raise NonMatchingTimezoneError("The timezone of 'factor' is not the "
                                       "same as the timezone of 'scores'. See "
                                       "the pandas methods tz_localize and "
                                       "tz_convert.")

    freq = infer_trading_calendar(factor_dateindex, scores.index)

    factor_dateindex = factor_dateindex.intersection(scores.index)

    if len(factor_dateindex) == 0:
        raise ValueError("Factor and scores indices don't match: make sure "
                         "they have the same convention in terms of datetimes "
                         "and symbol-names")

    forward_returns = pd.DataFrame(index=pd.MultiIndex.from_product(
        [factor_dateindex, scores.columns], names=['date', 'asset']))

    forward_returns.index.levels[0].freq = freq

    for period in sorted(periods):
        #
        # build forward scores
        #
        fwdret = (scores
                  # note: alphalens' .shift(-period) is left disabled here,
                  # so the raw scores are used as-is
                  .reindex(factor_dateindex)
                  )

        if filter_zscore is not None:
            mask = abs(fwdret - fwdret.mean()) > (filter_zscore * fwdret.std())
            fwdret[mask] = np.nan

        # Find the period length, which will be the column name.
        # Because the calendar inferred from factor and scores doesn't take
        # holidays into consideration yet, there could be some non-trading
        # days in between the trades, so we'll test several entries to find
        # the correct period length.
        #
        entries_to_test = min(10, len(fwdret.index), len(scores.index)-period)
        days_diffs = []
        for i in range(entries_to_test):
            p_idx = scores.index.get_loc(fwdret.index[i])
            start = scores.index[p_idx]
            end = scores.index[p_idx+period]
            period_len = diff_custom_calendar_timedeltas(start, end, freq)
            days_diffs.append(period_len.components.days)

        delta_days = period_len.components.days - mode(days_diffs).mode[0]
        period_len -= pd.Timedelta(days=delta_days)

        # Finally use period_len as column name
        column_name = timedelta_to_string(period_len)
        forward_returns[column_name] = fwdret.stack()

    forward_returns.index = forward_returns.index.rename(['date', 'asset'])

    return forward_returns

def get_clean_factor_and_forward_scores(factor,
                                         scores,
                                         groupby=None,
                                         binning_by_group=False,
                                         quantiles=5,
                                         bins=None,
                                         periods=(1, 5, 10),
                                         filter_zscore=20,
                                         groupby_labels=None,
                                         max_loss=10.0):
    
    forward_scores = compute_forward_scores(factor, scores, periods,
                                              filter_zscore)

    factor_data = get_clean_factor(factor, forward_scores, groupby=groupby,
                                   groupby_labels=groupby_labels,
                                   quantiles=quantiles, bins=bins,
                                   binning_by_group=binning_by_group,
                                   max_loss=max_loss)

    return factor_data


def infer_trading_calendar(factor_idx, prices_idx):
    """
    Infer the trading calendar from factor and price information.
    Parameters
    ----------
    factor_idx : pd.DatetimeIndex
        The factor datetimes for which we are computing the forward returns
    prices_idx : pd.DatetimeIndex
        The prices datetimes associated with the factor data
    Returns
    -------
    calendar : pd.DateOffset
    """
    full_idx = factor_idx.union(prices_idx)

    # drop days of the week that are not used
    days_to_keep = []
    days_of_the_week = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
    for day, day_str in enumerate(days_of_the_week):
        if (full_idx.dayofweek == day).any():
            days_to_keep.append(day_str)

    days_to_keep = ' '.join(days_to_keep)

    # we currently don't infer holidays, but CustomBusinessDay class supports
    # custom holidays. So holidays could be inferred too eventually
    return CustomBusinessDay(weekmask=days_to_keep)
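
# Illustrative example (not in the original notebook): a weekday-only index
# infers a Mon-Fri calendar, e.g.
# idx = pd.date_range('2019-10-14', periods=5, freq='B')
# infer_trading_calendar(idx, idx)
# -> CustomBusinessDay with weekmask 'Mon Tue Wed Thu Fri'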

def diff_custom_calendar_timedeltas(start, end, freq):
    """
    Compute the difference between two pd.Timedelta taking into consideration
    custom frequency, which is used to deal with custom calendars, such as a
    trading calendar
    Parameters
    ----------
    start : pd.Timestamp
    end : pd.Timestamp
    freq : DateOffset, optional
    Returns
    -------
    pd.Timedelta
        end - start
    """
    actual_days = pd.date_range(start, end, freq=freq).shape[0] - 1
    timediff = end - start
    delta_days = timediff.components.days - actual_days
    return timediff - pd.Timedelta(days=delta_days)
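
# Illustrative example (not in the original notebook): with a Mon-Fri
# calendar, Friday -> Monday counts as one trading day, not three:
# freq = CustomBusinessDay(weekmask='Mon Tue Wed Thu Fri')
# diff_custom_calendar_timedeltas(pd.Timestamp('2019-10-18'),
#                                 pd.Timestamp('2019-10-21'), freq)
# -> Timedelta('1 days 00:00:00')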

def get_clean_factor(factor,
                     forward_returns,
                     groupby=None,
                     binning_by_group=False,
                     quantiles=5,
                     bins=None,
                     groupby_labels=None,
                     max_loss=0.35):

    initial_amount = float(len(factor.index))

    factor = factor.copy()
    factor.index = factor.index.rename(['date', 'asset'])

    merged_data = forward_returns.copy()
    merged_data['factor'] = factor

    if groupby is not None:
        if isinstance(groupby, dict):
            diff = set(factor.index.get_level_values(
                'asset')) - set(groupby.keys())
            if len(diff) > 0:
                raise KeyError(
                    "Assets {} not in group mapping".format(
                        list(diff)))

            ss = pd.Series(groupby)
            groupby = pd.Series(index=factor.index,
                                data=ss[factor.index.get_level_values(
                                    'asset')].values)

        if groupby_labels is not None:
            diff = set(groupby.values) - set(groupby_labels.keys())
            if len(diff) > 0:
                raise KeyError(
                    "groups {} not in passed group names".format(
                        list(diff)))

            sn = pd.Series(groupby_labels)
            groupby = pd.Series(index=groupby.index,
                                data=sn[groupby.values].values)

        merged_data['group'] = groupby.astype('category')

    merged_data = merged_data.dropna()

    fwdret_amount = float(len(merged_data.index))

    no_raise = max_loss != 0
    merged_data['factor_quantile'] = quantize_factor(merged_data,
                                                     quantiles,
                                                     bins,
                                                     binning_by_group,
                                                     no_raise)

    merged_data = merged_data.dropna()

    binning_amount = float(len(merged_data.index))

    tot_loss = (initial_amount - binning_amount) / initial_amount
    fwdret_loss = (initial_amount - fwdret_amount) / initial_amount
    bin_loss = tot_loss - fwdret_loss

#     print("Dropped %.1f%% entries from factor data: %.1f%% in forward "
#           "returns computation and %.1f%% in binning phase "
#           "(set max_loss=0 to see potentially suppressed Exceptions)." %
#           (tot_loss * 100, fwdret_loss * 100,  bin_loss * 100))

    if tot_loss > max_loss:
        message = ("max_loss (%.1f%%) exceeded %.1f%%, consider increasing it."
                   % (max_loss * 100, tot_loss * 100))
        raise MaxLossExceededError(message)
    else:
#         print("max_loss is %.1f%%, not exceeded: OK!" % (max_loss * 100))
        pass

    return merged_data

def timedelta_to_string(timedelta):
    """
    Convert a pd.Timedelta to a pd.Timedelta-compatible string (e.g. '1D2h').
    """
    c = timedelta.components
    fmt = ''
    if c.days != 0:
        fmt += '%dD' % c.days
    if c.hours > 0:
        fmt += '%dh' % c.hours
    if c.minutes > 0:
        fmt += '%dm' % c.minutes
    if c.seconds > 0:
        fmt += '%ds' % c.seconds
    if c.milliseconds > 0:
        fmt += '%dms' % c.milliseconds
    if c.microseconds > 0:
        fmt += '%dus' % c.microseconds
    if c.nanoseconds > 0:
        fmt += '%dns' % c.nanoseconds
    return fmt
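
# Illustrative example (not in the original notebook):
# timedelta_to_string(pd.Timedelta('1 days 02:30:00'))  # -> '1D2h30m'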


def add_custom_calendar_timedelta(inputs, timedelta, freq):
    """
    Add 'timedelta' to 'inputs', counting the day component in units of the
    custom calendar 'freq' (e.g. trading days) rather than calendar days.
    """
    days = timedelta.components.days
    offset = timedelta - pd.Timedelta(days=days)
    return inputs + freq * days + offset
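
# Illustrative example (not in the original notebook): stepping forward one
# trading day over a Mon-Fri calendar skips the weekend:
# freq = CustomBusinessDay(weekmask='Mon Tue Wed Thu Fri')
# add_custom_calendar_timedelta(pd.Timestamp('2019-10-18'),  # a Friday
#                               pd.Timedelta(days=1), freq)
# -> Timestamp('2019-10-21')  (the following Monday)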

def non_unique_bin_edges_error(func):
    
    message = """
    An error occurred while computing bins/quantiles on the input provided.
    This usually happens when the input contains too many identical
    values and they span more than one quantile. The quantiles are chosen
    to have the same number of records each, but the same value cannot span
    multiple quantiles. Possible workarounds are:
    1 - Decrease the number of quantiles
    2 - Specify a custom quantiles range, e.g. [0, .50, .75, 1.] to get unequal
        number of records per quantile
    3 - Use 'bins' option instead of 'quantiles', 'bins' chooses the
        buckets to be evenly spaced according to the values themselves, while
        'quantiles' forces the buckets to have the same number of records.
    4 - for factors with discrete values use the 'bins' option with custom
        ranges and create a range for each discrete value
    Please see utils.get_clean_factor_and_forward_returns documentation for
    full documentation of 'bins' and 'quantiles' options.
"""

    def dec(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except ValueError as e:
            if 'Bin edges must be unique' in str(e):
                rethrow(e, message)
            raise
    return dec

@non_unique_bin_edges_error
def quantize_factor(factor_data,
                    quantiles=5,
                    bins=None,
                    by_group=False,
                    no_raise=False):
    
    if not ((quantiles is not None and bins is None) or
            (quantiles is None and bins is not None)):
        raise ValueError('Either quantiles or bins should be provided')

    def quantile_calc(x, _quantiles, _bins, _no_raise):
        try:
            if _quantiles is not None and _bins is None:
                return pd.qcut(x, _quantiles, labels=False) + 1
            elif _bins is not None and _quantiles is None:
                return pd.cut(x, _bins, labels=False) + 1
        except Exception as e:
            if _no_raise:
                return pd.Series(index=x.index)
            raise e

    grouper = [factor_data.index.get_level_values('date')]
    if by_group:
        grouper.append('group')

    factor_quantile = factor_data.groupby(grouper)['factor'] \
        .apply(quantile_calc, quantiles, bins, no_raise)
    factor_quantile.name = 'factor_quantile'

    return factor_quantile.dropna()
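
# Illustrative example (not in the original notebook): quintile labels for a
# toy factor with one date and ten assets -> values 1..5, two assets each:
# toy = pd.DataFrame(
#     {'factor': np.arange(10) / 10.},
#     index=pd.MultiIndex.from_product(
#         [pd.to_datetime(['2019-10-20']), range(10)],
#         names=['date', 'asset']))
# quantize_factor(toy)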
In [4]:
def new_is_positive():
    return False
In [19]:
import pandas as pd
In [20]:
df = local_csv('DKSalaries.csv')
In [21]:
df['game_datetime'] = pd.to_datetime(df['game_date'])
In [22]:
df = df.set_index(['game_date', 'player_id'])
In [23]:
df.head()
Out[23]:
                                player dk_position  dk_salary team_id  dk_score game_datetime
game_date player_id
10-20-19  13581406      Saquon Barkley     RB/FLEX       8900     NYG     18.03    2019-10-20
          13581130     Ezekiel Elliott     RB/FLEX       8100     DAL     19.85    2019-10-20
          13581251         Dalvin Cook     RB/FLEX       8000     MIN     24.93    2019-10-20
          13580796         Julio Jones     WR/FLEX       8000     ATL     18.97    2019-10-20
          13581039      Michael Thomas     WR/FLEX       7900      NO     23.37    2019-10-20
In [24]:
df.iloc[10]
Out[24]:
player                  Amari Cooper
dk_position                  WR/FLEX
dk_salary                       7200
team_id                          DAL
dk_score                       20.08
game_datetime    2019-10-20 00:00:00
Name: (10-20-19, 13581026), dtype: object

Alphalens (Factor Analysis)

Alphalens is a tool on Quantopian for analyzing the predictive ability of a factor. In this notebook, we will use Alphalens to analyze the predictive ability of a fantasy points per game factor.

The first step to analyzing our factor is to define it. Here, we will define the fantasy points per game factor, dk_score, to be the rolling mean fantasy points per game (using DK scoring rules).

In [25]:
# player fantasy score per game.
dk_score = df['dk_score']
In [26]:
dk_score.head()
Out[26]:
game_date  player_id
10-20-19   13581406     18.03
           13581130     19.85
           13581251     24.93
           13580796     18.97
           13581039     23.37
Name: dk_score, dtype: float64
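
With the factor defined, a minimal sketch of pushing it through the Alphalens-style helpers in the first cell might look like the following. This is an assumption about intended usage, not part of the original analysis: it requires the game_date level to be parsed to datetimes first, and the wide scores frame plays the role alphalens normally assigns to prices.

In [ ]:
# Hypothetical usage sketch of the helpers defined above.
factor = df['dk_score'].copy()
factor.index = factor.index.set_levels(
    pd.to_datetime(factor.index.levels[0]), level=0)
scores_wide = factor.unstack('player_id')
factor_data = get_clean_factor_and_forward_scores(
    factor, scores_wide, quantiles=5, periods=(1,))
factor_data.head()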

Optimize (Portfolio Optimization)

Now that we have identified some strong predictors of fantasy points, let's use the data to build an optimal portfolio each day. Specifically, we will pick a team of players (construct a portfolio) based on their mean points per game over the last 20 days.

On Quantopian, we can use the Optimize API to help us calculate an optimal 'portfolio' each day. In finance, this means picking the stocks that we should hold each day based on an objective function and a set of constraints. In daily fantasy sports, the problem is the same: we want to pick a lineup based on an objective function (maximize DK points) subject to a series of constraints (based on the rules of the game).

Generally speaking, constraints are the easy part. The rules are set before every game and we know that our lineup has to follow these rules. Specifically, NFL contests at DraftKings have the following rules:

- The lineup must be composed of exactly 9 players.
- The sum of the fictional player salaries in a lineup must be less than 50,000.
- A lineup must include players from at least two NFL games.
- A lineup must include at least one player classified in each of the following positions: QB, RB, RB, WR, WR, WR, TE, DST (Defense / Special Teams), FLEX.

The challenging part is creating an objective function. We know that we want to maximize the number of DK points that we get on a given day, but without time travel, it's difficult to know exactly what the function looks like. In practice, this means we need to define an 'expected value' objective that represents our expected performance of each player. Earlier, we saw that our trailing points per game factor was a good predictor of DK points, so let's use that as our objective function.

As an optimization problem, this means that we want to pick a team of players subject to the DK rules (salary cap, positions, etc.) that maximizes the total trailing points per game factor.

Let's define our problem using the Optimize API.

In [27]:
# Import the Optimize namespace.
import quantopian.optimize as opt
In [28]:
# Player costs.
player_costs = pd.DataFrame(df['dk_salary']).dropna().sort_index(level=['game_date', 'player_id'])

# Player positions.
player_positions = df['dk_position'].dropna()
In [29]:
# Define the game date that we want to test. Change this and re-run
# cells below to build a lineup for another date. Note that the string
# must match the format of the 'game_date' index level (e.g. '10-20-19').
TEST_DATE = '2019-10-21'
In [30]:
# Format our expected value factor (dk_score) for use in Optimize.
expected_value = dk_score[player_costs.index].dropna()
expected_value = expected_value[expected_value.index.get_level_values(0) == TEST_DATE]
In [31]:
# Objective function to be fed into Optimize later on.
obj = opt.MaximizeAlpha(expected_value)
In [65]:
# Salary cap constraint. The total cost of our lineup has to be less than 50,000.
# We will define this as a FactorExposure constraint.
cost_limit = opt.FactorExposure(
    player_costs, 
    min_exposures={'dk_salary': 0}, 
    max_exposures={'dk_salary': 50000}
)
In [66]:
# Map from each player to its position. The group labels must match the
# keys of the exposure maps below, so use the full position string
# (e.g. 'RB/FLEX'); applying x[0] would keep only its first character.
labels = df['dk_position'].dropna()

# Map from each position to its min/max exposure.
min_exposures = {'WR/FLEX': 3, 'RB/FLEX': 2, 'QB': 1, 'TE/FLEX': 1, 'DST': 1}
max_exposures = {'WR/FLEX': 4, 'RB/FLEX': 3, 'QB': 1, 'TE/FLEX': 2, 'DST': 1}

player_position_constraint = opt.NetGroupExposure(labels, min_exposures, max_exposures)
In [67]:
# This constraint tells the Optimizer that we can hold at most 1 of each player.
discretize_players = opt.PositionConcentration(
    pd.Series([]),
    pd.Series([]),
    default_min_weight=0,
    default_max_weight=1,
)
In [68]:
# The lineup can hold at most 9 players: each player's weight is capped at 1
# by discretize_players, so a gross exposure limit of 9 bounds the number of
# players held.
max_players = opt.MaxGrossExposure(9)
In [74]:
result = opt.calculate_optimal_portfolio(
    objective=obj,
    constraints=[
        discretize_players,
        player_position_constraint,
        max_players,
        cost_limit,
    ]
)

ValueError                                Traceback (most recent call last)
<ipython-input-74-76a02eaf953d> in <module>()
      5         player_position_constraint,
      6         max_players,
----> 7         cost_limit,
      8     ]
      9 )

/build/src/qexec_repo/qexec/optimize/api.py in calculate_optimal_portfolio(objective, constraints, current_portfolio)
    164                                     constraints,
    165                                     current_portfolio=None):
--> 166         result = run_optimization(objective, constraints, current_portfolio)
    167         result.raise_for_status()
    168         return result.new_weights

/build/src/qexec_repo/qexec/optimize/api.py in run_optimization(objective, constraints, current_portfolio)
    156             constraints,
    157             current_portfolio,
--> 158             optimize_context,
    159         )
    160 

/build/src/qexec_repo/qexec/optimize/optimizer.pyc in optimize_unchecked(self, objective, constraints, current_weights, optimize_context)
    204             constraints,
    205             current_weights,
--> 206             optimize_context,
    207         )
    208         cvx_constraints = list(concat(constraint_map.values()))

/build/src/qexec_repo/qexec/optimize/optimizer.pyc in _get_problem_inputs(self, objective, constraints, current_weights, optimize_context)
    291         all_constraints = list(constraints) + external_constraints
    292 
--> 293         cvx_objective = objective.to_cvxpy(current_weights, new_weights)
    294 
    295         constraint_map = {

/build/src/qexec_repo/qexec/optimize/objectives.pyc in to_cvxpy(self, old_weights, new_weights)
    248     def to_cvxpy(self, old_weights, new_weights):
    249         alphas = self.effective_alphas(old_weights).values
--> 250         total_alpha_expr = new_weights.T * alphas
    251 
    252         # Penalize holding assets for which we don't have an alpha value.

/usr/local/lib/python2.7/dist-packages/cvxpy/expressions/expression.pyc in cast_op(self, other)
     38         """A wrapped binary operator that can handle non-Expression arguments.
     39         """
---> 40         other = self.cast_to_const(other)
     41         return binary_op(self, other)
     42     return cast_op

/usr/local/lib/python2.7/dist-packages/cvxpy/expressions/expression.pyc in cast_to_const(expr)
    240         """Converts a non-Expression to a Constant.
    241         """
--> 242         return expr if isinstance(expr, Expression) else cvxtypes.constant()(expr)
    243 
    244     @_cast_other

/usr/local/lib/python2.7/dist-packages/cvxpy/expressions/constants/constant.pyc in __init__(self, value)
     41         # Set DCP attributes.
     42         self._size = intf.size(self.value)
---> 43         self._is_pos, self._is_neg = intf.sign(self.value)
     44         super(Constant, self).__init__()
     45 

/usr/local/lib/python2.7/dist-packages/cvxpy/interface/matrix_utilities.pyc in sign(constant)
    208     else:  # Convert to Numpy array.
    209         mat = INTERFACES[np.ndarray].const_to_matrix(constant)
--> 210         max_val = mat.max()
    211         min_val = mat.min()
    212     return (min_val >= 0, max_val <= 0)

/usr/local/lib/python2.7/dist-packages/numpy/core/_methods.pyc in _amax(a, axis, out, keepdims)
     24 # small reductions
     25 def _amax(a, axis=None, out=None, keepdims=False):
---> 26     return umr_maximum(a, axis, None, out, keepdims)
     27 
     28 def _amin(a, axis=None, out=None, keepdims=False):

ValueError: zero-size array to reduction operation maximum which has no identity

The error most likely stems from expected_value being empty: TEST_DATE ('2019-10-21') never matches the string-formatted game_date index level (which uses the '10-20-19' style seen in Out[23]), so the filter in In [30] keeps no rows and MaximizeAlpha hands cvxpy a zero-size alpha array.
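
A plausible fix, assuming the diagnosis above: re-run the expected-value cell with a date string in the same format as the index, then recompute the optimization.

In [ ]:
# Hypothetical fix sketch (date format inferred from the data in Out[23]):
TEST_DATE = '10-20-19'  # must match the game_date level's string format
expected_value = dk_score[player_costs.index].dropna()
expected_value = expected_value[expected_value.index.get_level_values(0) == TEST_DATE]
obj = opt.MaximizeAlpha(expected_value)
result = opt.calculate_optimal_portfolio(
    objective=obj,
    constraints=[discretize_players, player_position_constraint,
                 max_players, cost_limit],
)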
In [ ]: