
Volume-Based Activity Bar Construction

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
In [2]:
start_date = pd.Timestamp('2018-08-01')
end_date = pd.Timestamp('2018-08-23')
spy_prices = get_pricing('SPY', start_date, end_date, frequency='minute').tz_convert('US/Eastern')
In [3]:
spy_prices.head(5)
Out[3]:
open_price high low close_price volume price
2018-08-01 09:31:00-04:00 281.56 281.64 281.480 281.580 623016.0 281.580
2018-08-01 09:32:00-04:00 281.58 281.61 281.410 281.433 218768.0 281.433
2018-08-01 09:33:00-04:00 281.43 281.46 281.330 281.400 135222.0 281.400
2018-08-01 09:34:00-04:00 281.41 281.71 281.400 281.660 276663.0 281.660
2018-08-01 09:35:00-04:00 281.65 281.73 281.565 281.730 170005.0 281.730

One way to choose the aggregation level is to decide how many bars per day you would like to form, on average, and then divide the average daily volume by that target.

NOTE: This example suffers from look-ahead bias: you don't know the average daily volume of the days on which the bars are formed until after they are formed. Since this notebook is focused simply on building volume bars, I calculated the mean from the same data just to get a sense of a reasonable value. This can be corrected by calculating the average daily volume over a period prior to forming the bars.

In [4]:
TARGETED_BARS_PER_DAY = 6
spy_prices.groupby(spy_prices.index.date)['volume'].sum().mean() / TARGETED_BARS_PER_DAY
Out[4]:
6705967.333333333
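
To remove the look-ahead bias noted above, the threshold could instead be estimated from a window that ends before the bars are formed. A minimal sketch, assuming the same get_pricing API and an arbitrarily chosen July lookback window:

# Estimate the threshold from the month *before* the bar-formation period
lookback_prices = get_pricing('SPY', pd.Timestamp('2018-07-02'),
                              pd.Timestamp('2018-07-31'), frequency='minute')
volume_threshold = (lookback_prices.groupby(lookback_prices.index.date)['volume']
                    .sum().mean() / TARGETED_BARS_PER_DAY)
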
In [5]:
def volume_bars(ohlcv, volume_threshold):
    """Create 'volume-base' activity ohlc bars.
    
    Parameters
    ----------
    ohlcv: pd.DataFrame
        columns = open_price, high, low, close_price, volume;  index = datetime
    volume_threshold: int
        Number of shares traded per bar
        
    Returns
    -------
    pd.DataFrame
        DataFrame containing OHLCV data. Indexed by datetime at end of bar.
    """
    vol_sum = 0
    bar_high = np.nan
    bar_low = np.nan
    bar_open = np.nan
    volume_ohlcv = {}
    for row_datetime, row_data in ohlcv.iterrows():
        if np.isnan(bar_open):
            # First row of a new bar sets the open
            bar_open = row_data['open_price']
        vol_sum += row_data['volume']
        # nanmax/nanmin ignore the NaN seed values on a bar's first row
        bar_high = np.nanmax((bar_high, row_data['high']))
        bar_low = np.nanmin((bar_low, row_data['low']))
        if vol_sum > volume_threshold:
            # Compile OHLCV Data and Close Bar
            volume_ohlcv[row_datetime] = {'open_price': bar_open, 
                                          'high': bar_high,
                                          'low': bar_low,
                                          'close_price': row_data['close_price'],
                                          'volume': vol_sum}
            bar_open = np.nan
            bar_high = np.nan
            bar_low = np.nan
            vol_sum = 0
            
    return pd.DataFrame.from_dict(volume_ohlcv, orient='index')
    
ohlcv_loop = volume_bars(spy_prices, 6e6)
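
As a quick sanity check of the looped version (a sketch): since a bar only closes once vol_sum exceeds the threshold, every bar's volume should be above 6 million shares.

# Every looped bar should exceed the 6e6 threshold by construction
print((ohlcv_loop['volume'] > 6e6).all())
ohlcv_loop.head()
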
In [6]:
def volume_bars_vectorized(ohlcv, volume_threshold):
    """Create 'volume-base' activity ohlc bars using pandas and numpy to 
    make computations more efficient.
    
    Parameters
    ----------
    ohlcv: pd.DataFrame
        columns = open_price, high, low, close_price, volume;  index = datetime
    volume_threshold: int
        Number of shares traded per bar
        
    Returns
    -------
    pd.DataFrame
        DataFrame containing OHLCV data. Indexed by datetime at end of bar.
    """
    cum_vol = ohlcv['volume'].cumsum()
    grouper = cum_vol // volume_threshold
    
    # Each threshold-crossing row starts a new group under integer
    # division; shifting those rows back by one group makes each bar
    # close on (and include) the minute that crossed the threshold.
    mask = grouper != grouper.shift(1)
    mask.iloc[0] = False  # keep the first row in the first group
    grouper = (grouper - mask.astype(int)).values
  
    volume_ohlcv = (ohlcv.reset_index().groupby(grouper)
                    .agg({'open_price': 'first', 'high': 'max', 
                          'low': 'min', 'close_price': 'last', 
                          'volume': 'sum', 'index': 'last'})).set_index('index')
    volume_ohlcv = volume_ohlcv[['open_price', 'high', 'low', 'close_price', 'volume']]
    volume_ohlcv.index.name = None
    return volume_ohlcv
    

ohlcv_vect = volume_bars_vectorized(spy_prices, 6e6)    
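
The grouper arithmetic is easiest to see on a toy example. A minimal sketch with made-up volumes and a threshold of 10, showing how each threshold-crossing row gets shifted back into the bar it completes:

demo_vol = pd.Series([4, 3, 5, 2, 6])
cum_vol = demo_vol.cumsum()                 # 4, 7, 12, 14, 20
grouper = cum_vol // 10                     # 0, 0, 1, 1, 2
mask = grouper != grouper.shift(1)          # True, False, True, False, True
mask.iloc[0] = False                        # keep the first row in group 0
print((grouper - mask.astype(int)).values)  # [0 0 0 1 1]

Rows 0 through 2 (12 shares) form the first bar, closing on the row that crossed the threshold; rows 3 and 4 form the next.
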
In [7]:
ohlcv_loop.volume.plot(label='Looped Version')
ohlcv_vect.volume.plot(label='Vectorized Version')
plt.axhline(6e6, color='r', label='Targeted 6 million share threshold')
plt.title('Comparison by Method of Number of Shares Traded per Bar')
plt.ylim([4e6,8e6])
plt.legend(loc='best');

Note: There will be deviations from the targeted 6 million shares per bar because the input has 1-minute rather than tick resolution. The looped implementation only deviates above the targeted share count (by design: a bar closes only once the threshold is exceeded). Since the vectorized implementation does not restart counting from zero when a new bar starts, it can also deviate below the targeted value.
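
Counting the bars on each side of the threshold makes the asymmetry concrete (a quick check using the bars built above). Note also that the loop drops any partially filled final bar, while the vectorized groupby keeps it.

print('loop bars below threshold:      ', (ohlcv_loop['volume'] < 6e6).sum())
print('vectorized bars below threshold:', (ohlcv_vect['volume'] < 6e6).sum())
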

Computation Performance Test

In [8]:
import time
def time_it(func, *args):
    """Return the wall-clock seconds taken by one call of func(*args)."""
    start_time = time.time()
    func(*args)
    return time.time() - start_time
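
For reference, the standard-library timeit module measures the same thing; a sketch equivalent to one use of time_it, averaged over 10 runs:

import timeit
# Average seconds per call of the vectorized version over 10 runs
print(timeit.timeit(lambda: volume_bars_vectorized(spy_prices, 1e6), number=10) / 10)
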
In [9]:
number_of_runs = 100
loop_implement = []
vectorized_implement = []
for i in range(number_of_runs):
    loop_implement.append(time_it(volume_bars, spy_prices, 1e6))
    vectorized_implement.append(time_it(volume_bars_vectorized, spy_prices, 1e6))
In [10]:
fig, (ax1, ax2) = plt.subplots(nrows=2, sharex=False)

ax1.hist(loop_implement, bins=30)
ax2.hist(vectorized_implement, bins=30)

ax1.set_title('Compute Time Comparison: \n Loop Implementation')
ax2.set_title('Compute Time Comparison: \n Vectorized Implementation')
ax2.set(xlabel='Compute time per run (seconds)')

fig.tight_layout()

print "Mean Compute Time"
print "------------------"
print "Loop Implementation {:0.3f} seconds per run".format(np.mean(loop_implement))
print "Vectorized Implementation: {:0.3f} seconds per run".format(np.mean(vectorized_implement))
Mean Compute Time
------------------
Loop Implementation: 0.934 seconds per run
Vectorized Implementation: 0.007 seconds per run

Note: the prior plot is a bit misleading because of the x-axis scaling. However, sharing the x-axis scale scrunches the data up too much, even though it highlights the difference in speed.
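
One middle ground (a sketch): a shared log-scaled x-axis with log-spaced bins keeps both clusters visible while still showing the gap in speed.

all_times = loop_implement + vectorized_implement
log_bins = np.logspace(np.log10(min(all_times)), np.log10(max(all_times)), 40)
fig, ax = plt.subplots()
ax.hist(loop_implement, bins=log_bins, label='Loop')
ax.hist(vectorized_implement, bins=log_bins, label='Vectorized')
ax.set_xscale('log')
ax.set(xlabel='Compute time per run (seconds)')
ax.legend(loc='best');
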

In [11]:
fig, (ax1, ax2) = plt.subplots(nrows=2, sharex=True)

ax1.hist(loop_implement, bins=30)
ax2.hist(vectorized_implement, bins=30)

ax1.set_title('Compute Time Comparison: \n Loop Implementation')
ax2.set_title('Compute Time Comparison: \n Vectorized Implementation')
ax2.set(xlabel='Compute time per run (seconds)')

fig.tight_layout()

print "Mean Compute Time"
print "------------------"
print "Loop Implementation {:0.3f} seconds per run".format(np.mean(loop_implement))
print "Vectorized Implementation: {:0.3f} seconds per run".format(np.mean(vectorized_implement))
Mean Compute Time
------------------
Loop Implementation: 0.934 seconds per run
Vectorized Implementation: 0.007 seconds per run

Distribution of Bar Returns

In [12]:
start_date = pd.Timestamp('2017-01-01')
end_date = pd.Timestamp('2017-08-23')
spy_prices = get_pricing('SPY', start_date, end_date, frequency='minute').tz_convert('US/Eastern').dropna()
spy_prices_30 = spy_prices.close_price.resample('30min').last().dropna()
ohlcv_vect = volume_bars_vectorized(spy_prices, 6e6)    
In [13]:
volume_returns = ohlcv_vect.close_price.pct_change()[1:]
time_returns_1 = spy_prices.close_price.pct_change()[1:]
time_returns_30 = spy_prices_30.pct_change()[1:]
In [14]:
def plot_return_distributions(returns, title):
    fig, (ax1, ax2) = plt.subplots(ncols=2)
    sns.distplot(returns, ax=ax1)
    stats.probplot(returns, plot=ax2)
    fig.suptitle(title)
In [15]:
plot_return_distributions(volume_returns, 'Volume Bar Returns')
print(stats.describe(volume_returns))
DescribeResult(nobs=1336, minmax=(-0.0084634717418855532, 0.0099851912722666025), mean=7.0744367390269654e-05, variance=2.3981354111028867e-06, skewness=0.18858421111699528, kurtosis=3.9100645072986406)
In [18]:
plot_return_distributions(time_returns_30, "30-min Time Bar Returns")
print(stats.describe(time_returns_30))
DescribeResult(nobs=2261, minmax=(-0.0098311256199384855, 0.010297223090794505), mean=4.1694203746075618e-05, variance=1.4203657336911521e-06, skewness=0.29648440843486107, kurtosis=11.838651950039093)
In [19]:
plot_return_distributions(time_returns_1, "1-min Time Bar Returns")
print(stats.describe(time_returns_1))
DescribeResult(nobs=62998, minmax=(-0.0093613730586642463, 0.010914028467281289), mean=1.498323252734087e-06, variance=5.4846965635669746e-08, skewness=1.5426148465168095, kurtosis=220.99930444504568)

Based on this small sample, the volume bar returns do appear to have lower skewness and kurtosis than the time bar returns at either resolution.
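
As a numeric follow-up (a sketch), the Jarque-Bera statistic, which grows with skewness and excess kurtosis, summarizes the same comparison in one number per series:

for name, r in [('volume', volume_returns),
                ('30-min', time_returns_30),
                ('1-min', time_returns_1)]:
    jb_stat, jb_pvalue = stats.jarque_bera(r)
    print('{} bars JB statistic: {:0.1f}'.format(name, jb_stat))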