In [1]:
from __future__ import print_function
import numpy as np
In [2]:
from quantopian.pipeline import Pipeline
from quantopian.pipeline.experimental import QTradableStocksUS
from quantopian.research import returns, run_pipeline, symbols
In [3]:
def tradeable_for(day):
    """Return the assets in the QTradableStocksUS universe on `day`."""
    pipe = Pipeline(screen=QTradableStocksUS())
    # run_pipeline returns a frame indexed by (date, asset); level 1 holds the assets.
    return run_pipeline(pipe, day, day).index.get_level_values(1)
In [4]:
universe = tradeable_for('2011-01')
In [5]:
rets = returns(universe, '2009', '2011').dropna(how='any', axis=1)
rets.head()
Out[5]:
Equity(2 [ARNC]) Equity(24 [AAPL]) Equity(41 [ARCB]) Equity(52 [ABM]) Equity(62 [ABT]) Equity(64 [ABX]) Equity(67 [ADSK]) Equity(76 [TAP]) Equity(88 [ACI]) Equity(107 [ACV]) ... Equity(36371 [OCSL]) Equity(36372 [SNI]) Equity(36448 [LPS]) Equity(36628 [GTAT]) Equity(36714 [RAX]) Equity(36733 [HSNI]) Equity(36763 [WPRT]) Equity(36930 [DISC_A]) Equity(37686 [LOPE]) Equity(37775 [CLW])
2009-01-02 00:00:00+00:00 0.075573 0.062573 0.017290 -0.013132 0.001320 -0.021220 0.049771 0.010627 0.139399 0.032727 ... -0.001280 0.072630 -0.011282 0.253472 0.104869 0.035813 0.058824 0.039007 0.010684 0.136527
2009-01-05 00:00:00+00:00 -0.018124 0.043229 -0.029752 0.003200 -0.015700 -0.045570 0.010643 -0.019419 0.039508 -0.032459 ... 0.002564 0.016928 0.017967 0.102493 0.020339 -0.057181 0.111111 0.010239 -0.020613 0.160169
2009-01-06 00:00:00+00:00 0.020208 -0.016913 0.001665 0.009067 -0.033626 -0.020963 -0.001915 -0.056773 0.051035 0.000419 ... -0.023973 0.012072 -0.046197 0.062814 0.069767 0.019746 0.055000 0.012838 -0.028602 0.112625
2009-01-07 00:00:00+00:00 -0.099897 -0.021505 -0.039722 -0.020633 -0.004914 -0.071970 -0.038849 -0.013798 -0.111928 -0.009838 ... 0.000000 -0.023018 -0.053398 -0.052009 -0.049689 -0.093361 -0.069510 -0.029353 -0.053333 -0.151020
2009-01-08 00:00:00+00:00 0.042203 0.019780 -0.024884 0.031319 0.010868 0.066342 -0.041916 -0.021963 0.059774 0.005793 ... -0.001310 -0.013879 0.025178 0.022444 -0.042484 -0.089245 0.022071 0.038488 -0.002347 -0.026923

5 rows × 1732 columns

In [6]:
all_returns = rets.values
spy_returns = returns('SPY', '2009', '2011').values
In [7]:
def original_vectorized_beta(spy, assets):
    # De-mean each asset's returns and SPY's returns.
    asset_residuals = assets - assets.mean(axis=0)
    spy_residuals = spy - spy.mean()
    # Covariance of each asset with SPY, divided by SPY's variance.
    # (The 1/(T-1) normalizers cancel, so plain sums suffice.)
    covariances = (asset_residuals * spy_residuals).sum(axis=0)
    spy_variance = (spy_residuals ** 2).sum()
    return covariances / spy_variance
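
For reference, each entry of the result is the standard OLS beta of an asset against SPY: the sample covariance of the asset's returns with SPY's, divided by the sample variance of SPY's returns:

$$\beta_i = \frac{\sum_t (r_{i,t} - \bar{r}_i)(s_t - \bar{s})}{\sum_t (s_t - \bar{s})^2}$$

where $s_t$ is SPY's return on day $t$.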
In [8]:
original = original_vectorized_beta(spy_returns[:, None], all_returns)
original
Out[8]:
array([ 1.83116649,  0.95349333,  1.40241192, ...,  0.93017304,
        0.4938906 ,  1.48053737])

Sanity Check that Values Still Match

In [9]:
from scipy.stats import linregress

aapl_returns = returns('AAPL', '2009', '2011').values
scipy_aapl_beta = linregress(spy_returns, aapl_returns).slope
vectorized_aapl_beta = original[rets.columns.get_loc(symbols('AAPL'))]

print("Scipy:     ", scipy_aapl_beta)
print("Vectorized:", vectorized_aapl_beta)
Scipy:      0.953493328272
Vectorized: 0.953493328272
In [10]:
def fastest_vectorized_beta_I_can_muster_v2(spy, assets):
    buf = np.empty(assets.shape[1])
    # We only need to de-mean one of these arrays, and SPY is a lot less work...
    # NOTE: this mutates `spy` in place; pass a copy if the caller needs it intact.
    spy -= spy.mean()
    spy.dot(assets, out=buf)
    np.divide(buf, spy.dot(spy), out=buf)
    return buf
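
The de-meaning shortcut is exact, not an approximation: because the SPY residuals sum to zero, the asset means drop out of the covariance, i.e. $\sum_t (a_t - \bar{a})(s_t - \bar{s}) = \sum_t a_t (s_t - \bar{s})$. As a quick illustration (not part of the original notebook), the identity can be checked on synthetic data:

# Check the shortcut on random data: de-meaning only one side of the
# product leaves the covariance sums unchanged.
rng = np.random.RandomState(0)
s = rng.randn(500)          # stand-in for SPY returns
a = rng.randn(500, 100)     # stand-in for asset returns

s_resid = s - s.mean()
full = ((a - a.mean(axis=0)) * s_resid[:, None]).sum(axis=0)
shortcut = s_resid.dot(a)   # skips de-meaning the assets entirely

print(np.allclose(full, shortcut))  # True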
In [11]:
new = fastest_vectorized_beta_I_can_muster_v2(spy_returns, all_returns)
new
Out[11]:
array([ 1.83116649,  0.95349333,  1.40241192, ...,  0.93017304,
        0.4938906 ,  1.48053737])

New Vectorized Beta Matches Old to Within ~$10^{-15}$

In [12]:
np.abs(new - original).max()
Out[12]:
2.6645352591003757e-15
In [13]:
import time

def timeit(f):
    """Take a function f and return a new function that calls f 5000 times,
    returning the last result and the average runtime over those calls,
    ignoring the ten slowest and ten fastest. This gives a more reliable
    estimate for fast functions.
    """
    _time = time.time
    def timed_f(*args, **kwargs):
        times = []
        for _ in range(5000):
            start = _time()
            result = f(*args, **kwargs)
            end = _time()
            times.append(end - start)
        # Average the middle 4980 samples to smooth out variance from cache
        # effects and scheduler noise.
        average_time = sum(sorted(times)[10:-10]) / (len(times) - 20)
        return result, average_time
    return timed_f
In [14]:
_, duration_old = timeit(original_vectorized_beta)(spy_returns[:, None], all_returns)
print("It took {} seconds to calculate betas the old way.".format(duration_old))
It took 0.00288552261261 seconds to calculate betas the old way.
In [15]:
_, duration_new = timeit(fastest_vectorized_beta_I_can_muster_v2)(spy_returns, all_returns)
print("It took {} seconds to calculate betas the new way.".format(duration_new))
print("The new version is {}x faster than the original notebook version.".format(duration_old / duration_new))
It took 0.000153615867278 seconds to calculate betas the new way.
The new version is 18.7840140719x faster than the original notebook version.