import pandas as pd
from scipy import stats
from pytz import timezone
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
# Load minute-frequency volume for SPY and SH (2010-01-01 through 2014-12-31)
# and drop every row that has a missing value.
# NOTE(review): get_pricing and symbols are not defined or imported in this
# file -- presumably supplied by the research environment; confirm.
data_cached = get_pricing(
    ['SPY', 'SH'],
    start_date='2010-01-1',
    end_date='2014-12-31',
    fields='volume',
    frequency='minute'
).dropna()

spy_volume = data_cached[symbols('SPY')]
sh_volume = data_cached[symbols('SH')]

# Working frame: raw volumes, the SH/SPY volume ratio, and the time-of-day /
# calendar-date keys used for grouping further down.
data = pd.DataFrame()
data['SPY'] = spy_volume
data['SH'] = sh_volume
data['volume_ratio'] = sh_volume / spy_volume

# Time of day is expressed in US/Eastern.
eastern_index = data.index.tz_convert(timezone('US/Eastern'))
data['time'] = eastern_index.time
# NOTE(review): the date is read from the index *without* the Eastern
# conversion applied to 'time' above -- confirm the two always agree.
data['date'] = data.index.date
# Rolling z-score of the SH/SPY volume ratio, computed longitudinally and
# separately for each minute of the day: every observation is compared with
# the rolling mean and standard deviation of the same minute's ratio.
# Note that the rolling window ends at (and therefore includes) the current
# observation, so the statistics are not based purely on previous days.

# Number of observations in each rolling window. It is also used as
# min_periods, so the first ZSCORE_WINDOW - 1 rows of every minute-of-day
# bucket come out as NaN.
ZSCORE_WINDOW = 40

grouped_volumes = data.groupby('time')['volume_ratio']

# pd.rolling_mean / pd.rolling_std no longer exist in current pandas;
# Series.rolling(...) is their replacement. transform() returns a result
# aligned with data's own index, so it can be assigned directly as a column.
data['grouped_means'] = grouped_volumes.transform(
    lambda ratios: ratios.rolling(
        ZSCORE_WINDOW, min_periods=ZSCORE_WINDOW).mean())
data['grouped_std'] = grouped_volumes.transform(
    lambda ratios: ratios.rolling(
        ZSCORE_WINDOW, min_periods=ZSCORE_WINDOW).std())

# No dropna() here: assigning to a column realigns to data's index, which
# would put the NaN rows straight back. Consumers below drop or skip NaNs.
data['z'] = (data['volume_ratio'] - data['grouped_means']) / data['grouped_std']
# Heat map of the z-scores of each minute through time: one row per date, one
# column per minute of the day. (date, time) cells without a z-score are
# filled with 0.
# Not particularly illuminating, except for those broad white stripes back in
# November 2011.
ht_map = pd.pivot_table(
    data,
    values='z',
    index=data['date'],
    columns=data['time'],
    fill_value=0)

plt.pcolor(ht_map, cmap='coolwarm')
plt.colorbar()

# Pin the colour limits to the smallest and largest values in the table.
z_floor = ht_map.min().min()
z_ceiling = ht_map.max().max()
plt.clim(z_floor, z_ceiling)
# Simple intraday volume profiles of SPY and SH: the mean volume at each
# minute of the day, taken across all dates.
# NOTE(review): no new figure is created between plots, so outside a notebook
# these draw onto whatever axes are current.
spy_profile = data.groupby('time')['SPY'].mean()
spy_profile.dropna().plot()

sh_profile = data.groupby('time')['SH'].mean()
sh_profile.dropna().plot()

# Profile of the SH/SPY volume ratio over the day. Doesn't seem very helpful.
ratio_profile = data.groupby('time')['volume_ratio'].mean()
ratio_profile.dropna().plot()
# Time-series history of the daily mean of the z-scores of the time-bucketed
# volume ratios (whew): average z over all minutes of each date, dropping
# dates for which no z-score is available.
z_by_date = data['z'].groupby(data['date'])
mean_zes = z_by_date.mean().dropna()
mean_zes.plot()
# What is the intraday profile of SH on abnormal-ratio days?
# "Abnormal" dates have a daily mean z above 0.5; "abnormally quiet" dates
# have one below -0.25 (the two thresholds are not symmetric).
abnormal_dates = mean_zes.index[mean_zes > 0.5]
abnormally_quiet_dates = mean_zes.index[mean_zes < -0.25]

# Mean SH volume per minute on the abnormal dates, followed by the same
# profile over all dates as the baseline (the baseline includes the abnormal
# dates themselves).
abnormal_day_rows = data[data['date'].isin(abnormal_dates)]
abnormal_day_rows.groupby('time')['SH'].mean().plot()
data.groupby('time')['SH'].mean().plot()
# Seems like abnormality is mostly concentrated in the mornings and
# afternoons? What about SPY?
# What is the intraday profile of SPY on abnormal-ratio days?
# Mean SPY volume per minute on the abnormal dates, followed by the same
# profile over all dates as the baseline.
abnormal_day_rows = data[data['date'].isin(abnormal_dates)]
abnormal_day_rows.groupby('time')['SPY'].mean().plot()
data.groupby('time')['SPY'].mean().plot()
# Seems like abnormality is mostly concentrated in the mornings and
# afternoons?
# NOTE(review): this remark repeats the one made after the SH comparison
# (originally including its trailing "what about SPY?"), so it looks
# copy-pasted -- confirm it actually describes the SPY plot.
# Let's check the quiet-ratio days too: the mean per-minute volume on the
# abnormally quiet dates, each followed by the all-dates profile as the
# baseline -- first for SH, then for SPY.
quiet_day_rows = data[data['date'].isin(abnormally_quiet_dates)]

quiet_day_rows.groupby('time')['SH'].mean().plot()
data.groupby('time')['SH'].mean().plot()

quiet_day_rows.groupby('time')['SPY'].mean().plot()
data.groupby('time')['SPY'].mean().plot()
# Much closer. Could this mean that changes in the SH/SPY volume ratio are
# mostly driven by changes in the volume of SH? That would make sense,
# especially if "activity" is correlated with negative shocks, when people get
# more interested in hedging.
# Further work: plot the activity in the afternoons conditional on the
# activity in the mornings.