It's good to know what values you're working with. Try this to keep an eye on them.
Provides an overview of pipeline output (or any series or dataframe) one time at the beginning of a backtest (or anywhere).
Or set to run every day or conditionally.
It is simpler than it looks: just set its main argument to your pipeline output (or any series or dataframe), replacing the context.out currently used.
To limit logging to only selected columns, set their names in the 'fields' list argument, like the line currently commented out.
If you have long column names, for vertical alignment, you might find it useful to simply shorten them.
Also this shows starting capital and start/stop dates like this:
2014-10-29 06:31 log_data:117 INFO $10,000,000 2014-10-29 to 2018-03-28
Code
Updated 2019-10-24
def log_data(context, z, num, fields=None):
    '''Log summary info about pipeline output; z can be any DataFrame or Series.

    Parameters:
      context  algorithm context; used only to remember one-time init
               via context.log_data_done.
      z        pandas DataFrame or Series to summarize (e.g. pipeline output).
               Index entries are assumed to be equity objects with a
               .symbol attribute.
      num      number of high/low rows to log per column.
      fields   optional list of column names to limit the detail section to;
               None means all columns.

    https://quantopian.com/posts/overview-of-pipeline-content-easy-to-add-to-your-backtest
    '''
    if not len(z):
        log.info('Empty pipe')
        return

    # First-call-only flag (the original post also logged starting capital
    # and the date range here on the first call).
    try:
        context.log_data_done
    except AttributeError:  # attribute absent means this is the first call
        context.log_data_done = 1

    # Options
    log_nan_only        = 0  # Only log if nans are present.
    show_sectors        = 0  # If sectors, see them or not.
    show_sorted_details = 1  # [num] high & low securities sorted, each column.
    padmax              = 6  # num characters for each field, starting point.

    def out(lines):
        '''Log the given text lines, batching them into ~1 KB chunks so a
        long report costs only a handful of log.info() calls.'''
        buffer_len = 1024  # max characters per logged chunk
        chunk = ':'
        for line in lines:
            if line is None or not len(line):
                continue  # skip if empty string for example
            if len(chunk) + len(line) < buffer_len:
                # Add to chunk if it will still be under buffer_len.
                chunk += '\n{}'.format(line)
            else:  # Or log chunk and start over with new line.
                log.info(chunk)
                chunk = ':\n{}'.format(line)
        if len(chunk) > 2:  # if anything remaining
            log.info(chunk)

    if 'dict' in str(type(z)):
        log.info('Not set up to handle a dictionary, only dataframe & series, bailing out of log_data()')
        return
    elif 'MultiIndex' in str(type(z.index)):
        log.info('Found MultiIndex, not set up to handle it, bailing out of log_data()')
        return

    # Change index to just symbols for readability, meanwhile, right-aligned.
    z = z.rename(index=dict(zip(z.index.tolist(),
                                [i.symbol.rjust(6) for i in z.index.tolist()])))

    # Series ......
    if 'Series' in str(type(z)):  # is Series, not DataFrame
        nan_count = len(z[z != z])  # NaN != NaN, so this counts NaNs
        nan_count = 'NaNs {}/{}'.format(nan_count, len(z)) if nan_count else ''
        if (log_nan_only and nan_count) or not log_nan_only:
            pad = max(padmax, len('%.5f' % z.max()))
            log.info('{}{}{} Series len {}'.format(
                'min'.rjust(pad + 5),
                'mean'.rjust(pad + 5),
                'max'.rjust(pad + 5),
                len(z)))
            log.info('{}{}{} {}'.format(
                ('%.5f' % z.round(6).min()).rjust(pad + 5),
                ('%.5f' % z.round(6).mean()).rjust(pad + 5),
                ('%.5f' % z.round(6).max()).rjust(pad + 5),
                nan_count
            ))
            log.info('High\n{}'.format(z.sort_values(ascending=False).head(num)))
            log.info('Low\n{}'.format(z.sort_values(ascending=False).tail(num)))
        return

    # DataFrame ......
    content_min_max = [['', 'min', 'mean', 'max', '']]
    content = []
    for col in z.columns:
        try:
            z[col].max()
        except Exception:
            continue  # skip non-numeric columns
        if col == 'sector' and not show_sectors:
            continue
        nan_count = len(z[col][z[col] != z[col]])  # NaN != NaN
        nan_count = 'NaNs {}/{}'.format(nan_count, len(z)) if nan_count else ''
        padmax = max(padmax, len(str(z[col].max())))
        mean_ = ''
        if len(str(z[col].max())) > 8 and 'float' in str(z[col].dtype):
            z[col] = z[col].round(6)  # Reduce decimal places for long floats.
        if 'float' in str(z[col].dtype):
            mean_ = str(round(z[col].mean(), 6))
        elif 'int' in str(z[col].dtype):
            mean_ = str(round(z[col].mean(), 1))
        content_min_max.append(
            [col, str(z[col].min()), mean_, str(z[col].max()), nan_count])

    if (log_nan_only and nan_count) or not log_nan_only:
        log.info('Rows: {} Columns: {}'.format(z.shape[0], z.shape[1]))
        if len(z.columns) == 1:
            content.append('Rows: {}'.format(z.shape[0]))
        paddings = [6 for i in range(4)]
        for lst in content_min_max:  # set max lengths
            for i, val in enumerate(lst[:4]):  # value in each sub-list
                paddings[i] = max(paddings[i], len(str(val)))
        headr = content_min_max[0]
        content.append(('{}{}{}{}{}'.format(
            headr[0].rjust(paddings[0]),
            (headr[1]).rjust(paddings[1] + 5),
            (headr[2]).rjust(paddings[2] + 5),
            (headr[3]).rjust(paddings[3] + 5),
            ''
        )))
        for lst in content_min_max[1:]:  # populate content using max lengths
            content.append(('{}{}{}{} {}'.format(
                lst[0].rjust(paddings[0]),
                lst[1].rjust(paddings[1] + 5),
                lst[2].rjust(paddings[2] + 5),
                lst[3].rjust(paddings[3] + 5),
                lst[4],
            )))
        out(content)

    if not show_sorted_details:
        return
    if len(z.columns) == 1:
        return  # skip detail if only 1 column

    # FIX: original left `details` undefined (NameError) whenever a `fields`
    # list was passed; honor the requested column subset.
    details = z.columns if fields is None else fields

    content = []
    for detail in details:
        if detail == 'sector' and not show_sectors:
            continue
        hi = z[details].sort_values(by=detail, ascending=False).head(num)
        lo = z[details].sort_values(by=detail, ascending=False).tail(num)
        # FIX: test for NaNs BEFORE appending; the original appended first,
        # making this skip a no-op when log_nan_only was set.
        if log_nan_only and not len(lo[lo[detail] != lo[detail]]):
            continue  # skip if no nans
        content.append(('_ _ _ {} _ _ _'.format(detail)))
        content.append(('{} highs ...\n{}'.format(detail, str(hi))))
        content.append(('{} lows ...\n{}'.format(detail, str(lo))))
    out(content)
Output
(partial output just for illustration, here, everything was being ranked)
2016-03-24 05:45 log_pipe:306 INFO Rows: 400 Columns: 7
min mean max
alpha 358.5 1364.53625 2376.0
bbr 2.0 419.02 838.0
dir 1.0 210.855 419.0
fcf 2.0 209.8625 419.0
rev 1.0 209.5075 419.0
rsi 0.5 105.06375 209.5
yld 1.0 210.2275 419.0
2016-03-24 05:45 log_pipe:321 INFO _ _ _ alpha _ _ _
... alpha highs
alpha bbr dir fcf rev rsi yld
AMBC 2376.0 804.0 266.0 371.0 360.0 156.0 419.0
SALE 2268.5 738.0 332.0 373.0 365.0 181.5 279.0
BPT 2265.5 724.0 415.0 350.0 166.0 194.5 416.0
RTRX 2261.0 802.0 377.0 149.0 340.0 181.0 412.0
... alpha lows
alpha bbr dir fcf rev rsi yld
ARNA 472.5 102.0 14.0 32.0 243.0 39.5 42.0
HES 395.0 128.0 137.0 48.0 3.0 29.0 50.0
AMD 359.0 16.0 36.0 49.0 162.0 60.0 36.0
DVN 358.5 90.0 145.0 78.0 14.0 29.5 2.0