Compare the speed and results of two custom factors, each calculating trailing-twelve-month (TTM) values. The first is a numpy implementation by Doug Baldwin (https://www.quantopian.com/posts/trailing-twelve-months-ttm-with-as-of-date). The second is a pandas implementation.
# First, we need to import the basic pipeline methods
from quantopian.pipeline import Pipeline, CustomFactor
from quantopian.research import run_pipeline
# Also get the built-in filters and/or factors to use
from quantopian.pipeline.filters import QTradableStocksUS, Q500US, StaticAssets
# Finally get any data we want to use
from quantopian.pipeline.data.morningstar import Fundamentals as ms
from quantopian.pipeline.domain import US_EQUITIES
# Import numpy and pandas because they rock
import numpy as np
import pandas as pd
# First the numpy version
class TrailingTwelveMonths(CustomFactor):
    """
    Sum the trailing twelve months (TTM) of reported values using numpy.

    Inputs (in this order):
        1. a fundamentals value field
        2. the associated asof_date field

    For each asset column, keeps only rows whose asof_date falls within 52
    weeks of the most recent asof_date in the window, de-duplicates by
    asof_date (pipeline forward-fills, so each asof_date repeats the same
    value), and sums one value per unique date.

    NOTE(review): if fewer than 4 quarters were reported inside the window,
    this still sums whatever is available (and yields 0 when nothing is).
    The pandas version below returns NaN in that case instead.
    """
    # ~315 trading days ≈ 5 quarters (252/4 x 5) to be sure 4 quarters fit
    window_length = 315
    # Output is a plain sum of raw fundamentals, safe to window over again
    window_safe = True
    # NOTE: removed invalid class attribute `mask=symbols` -- `symbols` is
    # undefined at module scope, and `mask` is passed at instantiation anyway.

    def compute(self, today, assets, out, values, dates):
        out[:] = [
            # values whose asof_date lies within 52 weeks of the latest date,
            # indexed at the first occurrence of each unique asof_date
            (v[d + np.timedelta64(52, 'W') > d[-1]])[
                np.unique(
                    d[d + np.timedelta64(52, 'W') > d[-1]],
                    return_index=True
                )[1]
            ].sum()
            for v, d in zip(values.T, dates.T)
        ]
# Next a version which uses pandas methods
# A little easier to understand but probably less efficient
class TrailingTwelveMonths_Pandas(CustomFactor):
    """
    Gets the sum of the last 4 reported values using pandas.

    Uses a window length of ~5 quarters (252/4 x 5) to ensure we have at
    least 4 quarters of data in the window.

    Requires 2 inputs, in this order: the dataset values and the associated
    asof_dates.

    Outputs:
        value: sum of the 4 most recent uniquely-dated values per asset,
            or NaN when fewer than 4 unique asof_dates are in the window
        unique_values: count of unique asof_dates found (capped at 4)
    """
    window_length = 315
    outputs = ['value', 'unique_values']

    def compute(self, today, assets, out, values, asof_date):
        # Make a single multi-index dataframe with asof_date and values as columns
        # The days are the index level=0 and securities are level=1
        date_df = pd.DataFrame(asof_date).stack().rename('date')
        value_df = pd.DataFrame(values).stack().rename('value')
        df = pd.concat([date_df, value_df], axis=1)
        # Add a column with the asset to make drop_duplicates easier
        df['asset'] = df.index.get_level_values(level=1)
        # Drop duplicate dates, keep only the latest value for each date
        df_dropped = df.drop_duplicates(['date', 'asset'], keep='last')
        # Take only the latest 4 dates. Then, filter out any groups with less than 4 qtrs of data
        df_top_4 = df_dropped.groupby(level=1).tail(4)
        df_exactly_4 = df_top_4.groupby(level=1).filter(lambda group: group.date.size == 4)
        # Sum the values where there are exactly 4 unique asof_dates to get TTM. Get counts.
        value_sum = df_exactly_4.groupby(level=1).value.sum()
        counts = df_top_4.groupby(level=1).date.count()
        # Make a series of nans to hold our output values (will default to nan if no TTM value)
        output_sum = pd.Series(np.nan, index=range(len(assets)))
        output_counts = pd.Series(np.nan, index=range(len(assets)))
        # Put our TTM values into the series
        # (the groupby index is the level=1 integer asset position, so it
        # aligns with the positional range index created above)
        output_sum.loc[value_sum.index] = value_sum
        output_counts.loc[counts.index] = counts
        out.value[:] = output_sum
        out.unique_values[:] = output_counts
Let's first run a pipeline using the numpy version and check the time it takes to complete. Use the dividend field to get the TTM dividends.
# Build the numpy TTM factor from the dividend field and its asof_date.
# (Inputs must be passed in this order: values first, asof_dates second.)
# NOTE(review): removed the unused `etrn_filter = StaticAssets(symbols(['ETRN']))`
# line -- `symbols` is never imported and the filter was never referenced.
dividends_ttm_np = TrailingTwelveMonths(
    inputs=[
        ms.dividend_per_share_earnings_reports,
        ms.dividend_per_share_earnings_reports_asof_date
    ],
    mask=QTradableStocksUS()
)
pipeline_numpy = Pipeline(
    columns={'dividends_ttm_np': dividends_ttm_np},
    screen=QTradableStocksUS()
)
# Single-day run so the timing comparison between versions is apples-to-apples
start = '2019-10-28'
end = '2019-10-28'
results = run_pipeline(pipeline_numpy, start, end)
results.head()
Second, run a pipeline using the pandas version and check the time it takes to complete. Use the same dividend field to get the TTM dividends.
# Build the pandas TTM factor with the same inputs as the numpy version.
dividends_ttm_pd = TrailingTwelveMonths_Pandas(
    inputs=[
        ms.dividend_per_share_earnings_reports,
        ms.dividend_per_share_earnings_reports_asof_date
    ],
    mask=QTradableStocksUS()
)
# Renamed from the copy-pasted `pipeline_numpy` -- this one holds the pandas
# factor (plus its unique-date count output).
pipeline_pandas = Pipeline(
    columns={'dividends_ttm_pd': dividends_ttm_pd.value,
             'count': dividends_ttm_pd.unique_values},
    screen=QTradableStocksUS()
)
# Same single day as the numpy run so the timings are comparable
start = '2019-10-28'
end = '2019-10-28'
results = run_pipeline(pipeline_pandas, start, end)
results.head()
This is roughly what was expected: the pandas version takes over 50% more time, though it's still fast. Next, let's run both factors in the same pipeline and compare their results in a single dataframe.
# Re-create both factors and run them side by side in one pipeline so the
# outputs land in a single dataframe for comparison.
dividends_ttm_np = TrailingTwelveMonths(
    inputs=[
        ms.dividend_per_share_earnings_reports,
        ms.dividend_per_share_earnings_reports_asof_date
    ],
    mask=QTradableStocksUS()
)
dividends_ttm_pd = TrailingTwelveMonths_Pandas(
    inputs=[
        ms.dividend_per_share_earnings_reports,
        ms.dividend_per_share_earnings_reports_asof_date
    ],
    mask=QTradableStocksUS()
)
# Renamed from the copy-pasted `pipeline_numpy` -- this pipeline holds both
# implementations plus the pandas version's unique-date count.
pipeline_combined = Pipeline(
    columns={'dividends_ttm_np': dividends_ttm_np,
             'dividends_ttm_pd': dividends_ttm_pd.value,
             'count': dividends_ttm_pd.unique_values,
             },
    screen=QTradableStocksUS()
)
start = '2019-10-28'
end = '2019-10-28'
results = run_pipeline(pipeline_combined, start, end)
results.head()
# Find every row where the two implementations disagree
# (boolean-mask indexing; NaN != NaN, so all-NaN pandas rows show up too)
diffs = results[results['dividends_ttm_np'] != results['dividends_ttm_pd']]
diffs
# The differences look like securities with no reported dividends and those
# with fewer than 4 qtrs of data. Select rows with a non-null count != 4;
# .notna() is equivalent to the `count == count` nan-trick in query syntax.
have_count = results['count'].notna()
not_four_qtrs = results['count'] != 4
diffs = results[not_four_qtrs & have_count]
diffs
# Check for rows where a full 4 qtrs of data exist yet the versions disagree.
# (count == 4) already excludes NaN, since NaN compares False to everything.
# BUG FIX: the original also required (count != count), i.e. count is NaN,
# which contradicts (count == 4) and made this query always return empty.
diffs = results.query('(count == 4) and (dividends_ttm_np != dividends_ttm_pd)')
diffs
It appears the two versions give the same results if there are 4 or more qtrs of data. The numpy version outputs 0 for securities without any quarters of data while the pandas version outputs nan.
However, there is an issue with the numpy version. It returns a value even if there are less than 4 qtrs of data. It will just sum what it has. This probably could be fixed.
So, the numpy version is concise and fast (and has a little bug) while the pandas version is a little more verbose and slower. The pandas version may be optimized a bit but will probably still be slower than numpy.