Notebook
In [186]:
import pandas as pd
from scipy import stats
from pytz import timezone
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
In [206]:
# Pull minute-frequency volume for SPY and SH, keeping only the rows that
# survive dropna() on the returned frame.
data_cached = get_pricing(
    ['SPY', 'SH'],
    start_date='2010-01-1',
    end_date='2014-12-31',
    fields='volume',
    frequency='minute',
).dropna()

# The pricing frame is indexed by whatever symbols() returns; look each one up once.
spy_volume = data_cached[symbols('SPY')]
sh_volume = data_cached[symbols('SH')]

data = pd.DataFrame()
data['SPY'] = spy_volume
data['SH'] = sh_volume

# SH volume expressed as a multiple of SPY volume for the same bar.
data['volume_ratio'] = sh_volume / spy_volume
In [207]:
# Derive both calendar columns from the same US/Eastern view of the index so
# that 'time' and 'date' always describe the same local timestamp.  Previously
# 'date' was read from the unconverted index while 'time' used Eastern time,
# so the two could disagree for any bar whose calendar date differs between
# the index's own timezone and US/Eastern.
eastern_index = data.index.tz_convert(timezone('US/Eastern'))
data['time'] = eastern_index.time
data['date'] = eastern_index.date
In [208]:
# Bucket the ratio by minute-of-day so each rolling statistic below only ever
# sees observations that share the same clock time.
grouped_volumes = data.groupby('time')['volume_ratio']

# 40-observation rolling mean / standard deviation within each minute bucket;
# min_periods=40 leaves the result NaN until a full window is available.
# NOTE(review): pd.rolling_mean / pd.rolling_std were removed from later pandas
# releases; they are kept here to match the API this notebook was written for.
data['grouped_means'] = grouped_volumes.apply(
    lambda minute_ratio: pd.rolling_mean(minute_ratio, 40, min_periods=40))
data['grouped_std'] = grouped_volumes.apply(
    lambda minute_ratio: pd.rolling_std(minute_ratio, 40, min_periods=40))

# z-score of each minute's volume ratio, measured longitudinally against the
# rolling window for that same minute (the window ends on the current row).
# NOTE(review): the trailing dropna() has no lasting effect, because assigning
# the result as a column realigns it to data's index.
deviation = data['volume_ratio'] - data['grouped_means']
data['z'] = (deviation / data['grouped_std']).dropna()
In [209]:
# Heat map of z-scores: one row per date, one column per minute of the day.
# Date/minute cells with no value are filled with 0.
ht_map = data.pivot_table(values='z', index=data['date'], columns=data['time'], fill_value=0)

plt.pcolor(ht_map, cmap='coolwarm')
plt.colorbar()

# Pin the colour scale to the extreme values present in the table.
z_low = ht_map.min().min()
z_high = ht_map.max().max()
plt.clim(z_low, z_high)
# Z-scores of each minute through time.  Not particularly illuminating, except
# for those broad white stripes back in November 2011.
In [213]:
# Simple intraday volume profile of SPY: mean volume for each minute of the day.
spy_profile = data.groupby('time')['SPY'].mean().dropna()
spy_profile.plot()
Out[213]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f57a8ee4d50>
In [214]:
# Simple intraday volume profile of SH: mean volume for each minute of the day.
sh_profile = data.groupby('time')['SH'].mean().dropna()
sh_profile.plot()
Out[214]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f57a8e0a610>
In [234]:
# Intraday profile of the SH/SPY volume ratio: mean ratio for each minute of
# the day.  Doesn't seem very helpful.
ratio_profile = data.groupby('time')['volume_ratio'].mean().dropna()
ratio_profile.plot()
Out[234]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f578986a110>
In [256]:
# Time-series history of the per-date mean of the z-scores of the
# time-bucketed volume ratios (whew).
daily_z = data.groupby('date')['z']
mean_zes = daily_z.mean().dropna()
mean_zes.plot()
Out[256]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f57808ccf90>
In [245]:
# What is the intraday SH volume profile on abnormal-ratio days, compared with
# the profile averaged over every day in the sample (abnormal days included)?
# A date counts as abnormal when its mean z-score exceeds 0.5, and as
# abnormally quiet when it falls below -0.25.
abnormal_dates = mean_zes[mean_zes > 0.5].index
abnormally_quiet_dates = mean_zes[mean_zes < -0.25].index

# Label both curves: without a legend the two overlaid lines cannot be told apart.
on_abnormal_days = data[data['date'].isin(abnormal_dates)]
on_abnormal_days.groupby('time')['SH'].mean().plot(label='abnormal-ratio days', legend=True)
data.groupby('time')['SH'].mean().plot(label='all days', legend=True)
# Seems like abnormality is mostly concentrated in the mornings and afternoons?  What about SPY?
Out[245]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f5780f6ed10>
In [247]:
# What is the intraday SPY volume profile on abnormal-ratio days, compared with
# the profile averaged over every day in the sample (abnormal days included)?
# Label both curves: without a legend the two overlaid lines cannot be told apart.
on_abnormal_days = data[data['date'].isin(abnormal_dates)]
on_abnormal_days.groupby('time')['SPY'].mean().plot(label='abnormal-ratio days', legend=True)
data.groupby('time')['SPY'].mean().plot(label='all days', legend=True)
# Seems like abnormality is mostly concentrated in the mornings and afternoons?
Out[247]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f5780ddc950>
In [254]:
# Let's check the quiet-ratio days too: SH volume profile on abnormally quiet
# days against the profile averaged over every day in the sample.
# Label both curves: without a legend the two overlaid lines cannot be told apart.
on_quiet_days = data[data['date'].isin(abnormally_quiet_dates)]
on_quiet_days.groupby('time')['SH'].mean().plot(label='abnormally quiet days', legend=True)
data.groupby('time')['SH'].mean().plot(label='all days', legend=True)
Out[254]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f5780a3b510>
In [253]:
# Let's check the quiet-ratio days too: SPY volume profile on abnormally quiet
# days against the profile averaged over every day in the sample.
# Label both curves: without a legend the two overlaid lines cannot be told apart.
on_quiet_days = data[data['date'].isin(abnormally_quiet_dates)]
on_quiet_days.groupby('time')['SPY'].mean().plot(label='abnormally quiet days', legend=True)
data.groupby('time')['SPY'].mean().plot(label='all days', legend=True)
Out[253]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f5780ae3590>
In [252]:
# Much closer.  Could this mean that changes in the SH/SPY volume ratio are
# being driven mostly by changes in the volume of SH?  That would make sense, especially if
# "activity" is correlated with negative shocks: people get more interested in hedging.

# Further work: plot the activity in the afternoons conditional on the activity in the mornings.
In [244]:
 
In [ ]: