import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
# Sample period for building intraday volume bars.
# pd.datetime was deprecated and removed in pandas 2.0; pd.Timestamp is the
# drop-in replacement (it subclasses datetime, so get_pricing accepts it).
start_date = pd.Timestamp(2018, 8, 1)
end_date = pd.Timestamp(2018, 8, 23)
# get_pricing is a Quantopian research-environment builtin; convert the
# minute bars to US/Eastern so timestamps align with exchange hours.
spy_prices = get_pricing('SPY', start_date, end_date, frequency='minute').tz_convert('US/Eastern')
spy_prices.head(5)
One way to choose the aggregation level is to decide how many bars per day you would like to form on average, then divide the average daily volume by that targeted number of bars. <br> <br> NOTE: This example suffers from look-ahead bias: the average daily volume of the days on which the bars are formed is not known until after they are formed. Since this notebook is focused simply on building volume bars, I calculated the mean from the same data just to get an idea of a reasonable value. This can be corrected by calculating the average daily volume prior to forming the bars.
# Pick a per-bar volume threshold that yields roughly this many bars per session.
TARGETED_BARS_PER_DAY = 6

# Average daily traded volume, divided by the desired bar count per day.
daily_volume = spy_prices.groupby(spy_prices.index.date)['volume'].sum()
daily_volume.mean() / TARGETED_BARS_PER_DAY
def volume_bars(ohlcv, volume_threshold):
    """Create 'volume-based' activity OHLC bars (loop implementation).

    Parameters
    ----------
    ohlcv : pd.DataFrame
        columns = open_price, high, low, close_price, volume, index = datetime
    volume_threshold : int
        Number of shares traded per bar.

    Returns
    -------
    pd.DataFrame
        DataFrame containing OHLCV data, indexed by the datetime of the
        source row that closes each volume bar.

    Notes
    -----
    A bar closes on the first row whose accumulated volume strictly exceeds
    ``volume_threshold``, so bars only overshoot the target, never undershoot.
    Trailing rows that never reach the threshold are dropped (no partial
    final bar is emitted).
    """
    vol_sum = 0
    bar_high = np.nan
    bar_low = np.nan
    bar_open = np.nan  # NaN doubles as the "no open bar" sentinel
    volume_ohlcv = {}
    for row_datetime, row_data in ohlcv.iterrows():
        if np.isnan(bar_open):
            # First row of a new bar: record its open.
            bar_open = row_data['open_price']
        vol_sum += row_data['volume']
        # nanmax/nanmin ignore the NaN sentinel on the first row of a bar.
        bar_high = np.nanmax((bar_high, row_data['high']))
        bar_low = np.nanmin((bar_low, row_data['low']))
        if vol_sum > volume_threshold:
            # Compile OHLCV data and close the bar.
            volume_ohlcv[row_datetime] = {'open_price': bar_open,
                                          'high': bar_high,
                                          'low': bar_low,
                                          'close_price': row_data['close_price'],
                                          'volume': vol_sum}
            # Reset accumulation state for the next bar.
            bar_open = np.nan
            bar_high = np.nan
            bar_low = np.nan
            vol_sum = 0
    return pd.DataFrame.from_dict(volume_ohlcv, orient='index')
# Build volume bars targeting ~6 million shares each (the threshold estimated above).
ohlcv_loop = volume_bars(spy_prices, 6e6)
def volume_bars_vectorized(ohlcv, volume_threshold):
    """Create 'volume-based' activity OHLC bars using pandas and numpy to
    make computations more efficient.

    Parameters
    ----------
    ohlcv : pd.DataFrame
        columns = open_price, high, low, close_price, volume, index = datetime
    volume_threshold : int
        Number of shares traded per bar.

    Returns
    -------
    pd.DataFrame
        DataFrame containing OHLCV data, indexed by the datetime of the
        source row that closes each volume bar.

    Notes
    -----
    Unlike the loop implementation, cumulative volume is not reset to zero
    at each bar boundary, so individual bars can fall below the threshold.
    """
    cum_vol = ohlcv['volume'].cumsum()
    # Integer-division bucket of cumulative volume: changes on the row that
    # crosses a threshold multiple.
    grouper = cum_vol // volume_threshold
    # Rows where the bucket just changed are the crossing rows; shifting them
    # back one group makes each bar close on (and include) that row.
    mask = grouper != grouper.shift(1)
    # FIX: use positional assignment. The index is a DatetimeIndex, so
    # `mask[0] = False` relied on the integer-fallback behavior that was
    # deprecated and removed in modern pandas.
    mask.iloc[0] = False
    grouper = (grouper - mask.astype(int)).values
    volume_ohlcv = (ohlcv.reset_index().groupby(grouper)
                    .agg({'open_price': 'first', 'high': 'max',
                          'low': 'min', 'close_price': 'last',
                          'volume': 'sum', 'index': 'last'})).set_index('index')
    # Restore conventional column order and an anonymous index.
    volume_ohlcv = volume_ohlcv[['open_price', 'high', 'low', 'close_price', 'volume']]
    volume_ohlcv.index.name = None
    return volume_ohlcv
# Overlay per-bar share counts from both implementations against the target
# so any systematic deviation from the 6M-share threshold is visible.
ohlcv_vect = volume_bars_vectorized(spy_prices, 6e6)
ohlcv_loop.volume.plot(label='Looped Version')
ohlcv_vect.volume.plot(label='Vectorized Version')
plt.axhline(6e6, color='r', label='Targeted 6 million share threshold')
plt.title('Comparison by Method of Number of Shares Traded per Bar')
plt.ylim([4e6,8e6])
plt.legend(loc='best');
Note: There will be deviations from the targeted 6 million shares per bar due to the use of 1-minute resolution as opposed to tick resolution. The looped implementation only has positive deviations from the targeted share count (by design). Since the vectorized implementation does not restart counting from 0 when starting a new bar, it can also have deviations below the targeted value.
import time
def time_it(func, *args):
    """Return the wall-clock seconds taken by one call of ``func(*args)``.

    The call's return value is intentionally discarded; only the elapsed
    time is reported.
    NOTE(review): for pure benchmarking, ``time.perf_counter`` (Python 3)
    would be the preferred monotonic clock; ``time.time`` is kept here for
    Python 2 compatibility with the rest of this notebook.
    """
    start_time = time.time()
    func(*args)  # result deliberately unused — we only measure duration
    return time.time() - start_time
# Time both implementations repeatedly so we compare distributions of run
# times rather than a single noisy measurement.
number_of_runs = 100
loop_implement = []
vectorized_implement = []
for i in range(number_of_runs):
    loop_implement.append(time_it(volume_bars, spy_prices, 1e6))
    vectorized_implement.append(time_it(volume_bars_vectorized, spy_prices, 1e6))

# Independent x-axes: each histogram is readable on its own scale.
fig, (ax1, ax2) = plt.subplots(nrows=2, sharex=False)
ax1.hist(loop_implement, bins=30)
ax2.hist(vectorized_implement, bins=30)
ax1.set_title('Compute Time Comparison: \n Loop Implementation')
ax2.set_title('Compute Time Comparison: \n Vectorized Implementation')
ax2.set(xlabel='Compute Time per run(seconds)')
fig.tight_layout()

# Use print as a function so the cell is valid Python 3 as well; a single
# parenthesized argument prints identically under Python 2.
# (Also added the missing colon after "Loop Implementation" for consistency
# with the vectorized line.)
print("Mean Compute Time")
print("------------------")
print("Loop Implementation: {:0.3f} seconds per run".format(np.mean(loop_implement)))
print("Vectorized Implementation: {:0.3f} seconds per run".format(np.mean(vectorized_implement)))
Note: the prior plot is a little misleading due to the scaling on the x-axis. However, if you share the scaling on the x-axis, it scrunches the data up too much, although it does highlight the difference in speed.
# Same histograms as above, but with a shared x-axis so the speed gap
# between the two implementations is directly comparable.
fig, (ax1, ax2) = plt.subplots(nrows=2, sharex=True)
ax1.hist(loop_implement, bins=30)
ax2.hist(vectorized_implement, bins=30)
ax1.set_title('Compute Time Comparison: \n Loop Implementation')
ax2.set_title('Compute Time Comparison: \n Vectorized Implementation')
ax2.set(xlabel='Compute Time per run(seconds)')
fig.tight_layout()

# print as a function: valid Python 3, identical output under Python 2.
# (Colon added after "Loop Implementation" for label consistency.)
print("Mean Compute Time")
print("------------------")
print("Loop Implementation: {:0.3f} seconds per run".format(np.mean(loop_implement)))
print("Vectorized Implementation: {:0.3f} seconds per run".format(np.mean(vectorized_implement)))
# Longer sample for comparing return distributions across bar types.
# pd.datetime was removed in pandas 2.0; pd.Timestamp is the replacement.
start_date = pd.Timestamp(2017, 1, 1)
end_date = pd.Timestamp(2017, 8, 23)
spy_prices = get_pricing('SPY', start_date, end_date, frequency='minute').tz_convert('US/Eastern').dropna()
# 30-minute time bars built from the last minute close in each half hour.
spy_prices_30 = spy_prices.close_price.resample('30min').last().dropna()
ohlcv_vect = volume_bars_vectorized(spy_prices, 6e6)
# Drop the first observation of each series: pct_change yields NaN there.
volume_returns = ohlcv_vect.close_price.pct_change()[1:]
time_returns_1 = spy_prices.close_price.pct_change()[1:]
time_returns_30 = spy_prices_30.pct_change()[1:]
def plot_return_distributions(returns, title):
    """Plot a distribution plot and a normal probability plot for *returns*.

    Parameters
    ----------
    returns : pd.Series
        Return series to visualize.
    title : str
        Figure-level title.
    """
    fig, (ax1, ax2) = plt.subplots(ncols=2)
    # BUG FIX: the original plotted the global `volume_returns` regardless of
    # the argument, so the 30-min and 1-min charts silently showed volume-bar
    # returns. Plot the `returns` parameter instead.
    # NOTE(review): sns.distplot is deprecated in modern seaborn —
    # sns.histplot(..., kde=True) is the replacement; kept for compatibility
    # with this notebook's seaborn version.
    sns.distplot(returns, ax=ax1)
    stats.probplot(returns, plot=ax2)
    fig.suptitle(title)
# Visualize and summarize each return series. print is written as a function
# so this is valid Python 3; a single argument prints identically in Python 2.
plot_return_distributions(volume_returns, 'Volume Bar Returns')
print(stats.describe(volume_returns))
plot_return_distributions(time_returns_30, "30-min Time Bar Returns")
print(stats.describe(time_returns_30))
plot_return_distributions(time_returns_1, "1-min Time Bar Returns")
print(stats.describe(time_returns_1))
Based on this small sample, the volume-bar returns do appear to have lower skewness and kurtosis than the time-bar returns.