Hello Thomas,
Here's a rough cut. I get a build error "59 Error Runtime exception: ValueError: array must not contain infs or NaNs" triggered by the line:
edge_model.fit(X)
Any idea what's going on? My understanding is that the code is actually run in some fashion as part of the build, right? However, it's challenging to debug, since the log output is suppressed.
Grant
from sklearn import cluster, covariance
import numpy as np
from collections import defaultdict
# Based on the scikit-learn stock-market structure example at:
# http://scikit-learn.org/stable/auto_examples/applications/plot_stock_market.html
# Intended for use in the quick backtester.
#
# Interval between cluster updates; handle_data compares it against
# context.elapsed_minutes.
# NOTE(review): 390 is presumably the number of minutes in one regular
# trading session, which would make this five sessions -- confirm.
update_period = 5*390 # update clusters at this period in minutes
def initialize(context):
    """Attach the security universe and the elapsed-bar counter to ``context``.

    Sets ``context.sids`` to the short list of securities to cluster and
    resets ``context.elapsed_minutes`` to zero.
    """
    context.elapsed_minutes = 0

    # Alternative universe, kept for reference: Nasdaq 100 from
    # https://www.quantopian.com/posts/list-of-nasdaq-100-sids-to-use-in-your-algo
    # context.sids = [sid(24), sid(114), sid(122), sid(630), sid(67),
    #                 sid(20680), sid(328), sid(14328), sid(368), sid(16841),
    #                 sid(9883), sid(337), sid(38650), sid(739), sid(27533),
    #                 sid(3806), sid(18529), sid(1209), sid(40207), sid(1419),
    #                 sid(15101), sid(17632), sid(39095), sid(1637), sid(1900),
    #                 sid(32301), sid(18870), sid(14014), sid(25317), sid(36930),
    #                 sid(12652), sid(26111), sid(24819), sid(24482), sid(2618),
    #                 sid(2663), sid(27543), sid(1787), sid(2696), sid(42950),
    #                 sid(20208), sid(2853), sid(8816), sid(12213), sid(3212),
    #                 sid(9736), sid(23906), sid(26578), sid(22316), sid(13862),
    #                 sid(3951), sid(8655), sid(25339), sid(4246), sid(43405),
    #                 sid(27357), sid(32046), sid(4485), sid(43919), sid(4668),
    #                 sid(8677), sid(22802), sid(3450), sid(5061), sid(5121),
    #                 sid(5149), sid(5166), sid(23709), sid(13905), sid(19926),
    #                 sid(19725), sid(8857), sid(5767), sid(5787), sid(19917),
    #                 sid(6295), sid(6413), sid(6546), sid(20281), sid(6683),
    #                 sid(26169), sid(6872), sid(11901), sid(13940), sid(7061),
    #                 sid(15581), sid(24518), sid(7272), sid(39840), sid(7671),
    #                 sid(27872), sid(8017), sid(38817), sid(8045), sid(8132),
    #                 sid(8158), sid(24124), sid(8344), sid(8352), sid(14848)]

    # A handful of sids to look at.  Each sid() call keeps a literal integer
    # argument, exactly as in the original append calls.
    context.sids = [
        sid(24),
        sid(18522),
        sid(5061),
        sid(20486),
        sid(5885),
        sid(4707),
        sid(3149),
    ]
# @batch_transform(refresh_period=5, window_length=12)
def batch_cluster(attribute, context):
    """Group securities whose price moves co-fluctuate.

    Follows the scikit-learn stock-market example: each column is scaled by
    its standard deviation, a sparse covariance is learned with
    ``GraphLassoCV``, and that covariance is clustered with affinity
    propagation.

    Columns that contain NaN/inf values, or whose standard deviation is
    zero, are dropped before fitting.  Either case puts NaN/inf into the
    scaled matrix and makes ``edge_model.fit`` raise
    "array must not contain infs or NaNs".

    Args:
        attribute: DataFrame-like object whose ``.values`` is a 2-D numeric
            array with one column per security.  Column ``i`` is taken to
            correspond to ``context.sids[i]``.
        context: algorithm context carrying the ``sids`` list.

    Returns:
        A ``defaultdict`` mapping cluster label to the list of ticker symbols
        in that cluster, or ``None`` when fewer than two usable columns (or
        fewer than two rows) are available.
    """
    # Float copy, so the caller's data is never modified.
    X = np.array(attribute.values, dtype=float)
    if X.ndim != 2 or X.shape[0] < 2:
        return None

    # Compute the spread only for fully finite columns; the rest stay at 0
    # and are rejected by the same "> 0" test as constant columns.
    finite = np.isfinite(X).all(axis=0)
    spread = np.zeros(X.shape[1])
    spread[finite] = X[:, finite].std(axis=0)
    keep = np.flatnonzero(finite & (spread > 0))
    if keep.size < 2:
        # Nothing meaningful to cluster; the caller checks for None.
        return None
    X = X[:, keep] / spread[keep]

    # tell it we're looking for a graph structure
    edge_model = covariance.GraphLassoCV()
    edge_model.fit(X)

    # now process into clusters based on co-fluctuation
    _, labels = cluster.affinity_propagation(edge_model.covariance_)
    log.debug("Found {0} groups from {1} complete histories".format(max(labels)+1,len(attribute)))

    # Filter the sids into groups.  ``keep`` holds the original column
    # positions, so each label is mapped back to the matching entry of
    # context.sids even when some columns were dropped.
    groups = defaultdict(list)
    for col, grp_idx in zip(keep, labels):
        groups[grp_idx].append(context.sids[col].symbol)
    return groups
def handle_data(context, data):
    """Re-cluster the universe every ``update_period`` calls and print the result.

    Args:
        context: algorithm context; ``elapsed_minutes`` is read and advanced
            on every call, and the object is passed on to ``batch_cluster``.
        data: current bar data supplied by the platform (unused here).

    Returns:
        None.  The cluster listing is written with ``print``.
    """
    # Advance the counter on every call, including the ones that are skipped.
    # The increment used to sit after the early return, so the counter went
    # 0 -> 1 and then never moved again: clustering ran exactly once.
    elapsed = context.elapsed_minutes
    context.elapsed_minutes += 1
    if elapsed % update_period != 0:
        return

    # Request 13 daily bars and drop the last row.
    # NOTE(review): the dropped row is presumably the current, still
    # incomplete day -- confirm against the history() documentation.
    prices_open = history(13, '1d', 'open_price', ffill=True)[0:-1]
    prices_close = history(13, '1d', 'close_price', ffill=True)[0:-1]
    prices_delta = prices_close - prices_open

    # Note that the model won't work if securities have different numbers
    # of entries.
    groups = batch_cluster(prices_delta, context)

    # ...ADD ORDER CODE HERE...

    # Display the stock symbols that co-fluctuate, one line per cluster.
    parts = ['------------------\n']
    if groups is not None:
        for i, g in groups.items():
            parts.append('Cluster %i: %s\n' % ((i + 1), ", ".join([str(s) for s in g])))
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("".join(parts))