Notebook

Zscore producing too many Nans? Am I doing something wrong?

I would like to study the book to price ratio on tech stocks. For some reason the z-score is producing a lot of nans and alphales is droping nearly 30% of the data.

Am I doing something wrong? Thanks.

In [1]:
# Import Libraries

import numpy as np
import statsmodels.api as sm
import scipy.stats as stats
from statsmodels import regression
import matplotlib.pyplot as plt
import pandas as pd


from quantopian.pipeline import Pipeline
from quantopian.pipeline.data import Fundamentals
from quantopian.pipeline.data.builtin import USEquityPricing
from quantopian.pipeline.factors import CustomFactor, Returns
from quantopian.pipeline.classifiers.fundamentals import Sector  
from quantopian.research import run_pipeline
from quantopian.pipeline.filters import QTradableStocksUS
from quantopian.pipeline.filters import Q500US

import alphalens as al
In [2]:
# Book to Price Ratio Factor
class BookToPrice(CustomFactor):
    # pb = price to book, we'll need to take the reciprocal later
    inputs = [Fundamentals.pb_ratio.latest]
    window_length = 1
    
    def compute(self, today, assets, out, pb):
        out[:] = 1/pb
In [4]:
# Sector information
MORNINGSTAR_SECTOR_CODES = {
     -1: 'Misc',
    101: 'Basic Materials',
    102: 'Consumer Cyclical',
    103: 'Financial Services',
    104: 'Real Estate',
    205: 'Consumer Defensive',
    206: 'Healthcare',
    207: 'Utilities',
    308: 'Communication Services',
    309: 'Energy',
    310: 'Industrials',
    311: 'Technology' ,    
}
tech = Sector().eq(Sector.TECHNOLOGY)


begin_period = '2003-01-01'
end_period = '2018-01-01'
In [5]:
# Define Universe
universe = Q500US()

# Make Pipeline
def make_pipeline():

    # Create Pipeline
    pipe = Pipeline()

    # My factor
    myfactor = BookToPrice()
    
    # Returns
    returns = Returns(inputs=[USEquityPricing.close], window_length=2)
    
    # Factor Rank
    myfactor_rank = myfactor.zscore()

    # Pipeline to return    
    return Pipeline(
      columns={
          'myfactor': myfactor,
          'myfactor_rank': myfactor_rank,
          'sector' : Sector()
      },
      screen  = (universe & tech)
    )

results = run_pipeline(make_pipeline(), begin_period, end_period)
results.fillna(value=0);

Pipeline Execution Time: 1 Minute, 26.68 Seconds
In [7]:
print ("The zscore is producing a lot of NaNs.")
results.head()
The zscore is producing a lot of NaNs.
Out[7]:
myfactor myfactor_rank sector
2003-01-02 00:00:00+00:00 Equity(24 [AAPL]) 0.799808 NaN 311
Equity(67 [ADSK]) 0.361102 NaN 311
Equity(114 [ADBE]) 0.113600 NaN 311
Equity(122 [ADI]) 0.329402 NaN 311
Equity(328 [ALTR]) 0.239601 NaN 311
In [8]:
# Get sector, asset list, and pricing
sectors = results['sector']
asset_list = results.index.levels[1]
prices = get_pricing(asset_list, start_date=begin_period, end_date=end_period, fields='open_price')
prices.head(5)
Out[8]:
Equity(24 [AAPL]) Equity(67 [ADSK]) Equity(114 [ADBE]) Equity(115 [ADCT]) Equity(122 [ADI]) Equity(328 [ALTR]) Equity(337 [AMAT]) Equity(351 [AMD]) Equity(417 [ANDW]) Equity(465 [APH]) ... Equity(49139 [FIT]) Equity(49288 [LITE]) Equity(49506 [HPE]) Equity(49610 [SQ]) Equity(50049 [FTV]) Equity(50077 [TWLO]) Equity(50242 [DVMT]) Equity(50338 [NTNX]) Equity(50683 [SNAP]) Equity(50716 [DXC])
2003-01-02 00:00:00+00:00 0.920 7.116 12.462 14.84 17.315 11.378 10.699 6.56 10.30 4.453 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2003-01-03 00:00:00+00:00 0.948 7.195 12.711 16.24 18.207 11.884 11.105 7.01 10.66 4.690 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2003-01-06 00:00:00+00:00 0.962 6.992 13.134 19.18 18.921 12.174 11.876 7.05 10.65 4.670 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2003-01-07 00:00:00+00:00 0.947 7.190 13.664 19.74 19.942 13.177 12.290 7.25 11.07 4.740 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2003-01-08 00:00:00+00:00 0.934 7.507 14.063 18.55 19.806 12.725 11.972 7.06 10.74 4.846 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

5 rows × 252 columns

In [9]:
# Get clean factor and forward retuens for factor
periods = (1, 5, 10)
factor_data = al.utils.get_clean_factor_and_forward_returns(factor=results['myfactor_rank'],
                                                            prices=prices,
                                                            groupby=sectors,
                                                            groupby_labels=MORNINGSTAR_SECTOR_CODES,
                                                            periods=periods,
                                                            binning_by_group=True)
factor_data.head()
Dropped 28.9% entries from factor data: 28.9% in forward returns computation and 0.0% in binning phase (set max_loss=0 to see potentially suppressed Exceptions).
max_loss is 35.0%, not exceeded: OK!
Out[9]:
1D 5D 10D factor group factor_quantile
date asset
2005-09-20 00:00:00+00:00 Equity(24 [AAPL]) -0.000442 0.018860 0.035656 -0.273535 Technology 2
Equity(67 [ADSK]) 0.006731 0.028846 0.117308 -0.346485 Technology 1
Equity(114 [ADBE]) -0.036250 -0.016731 0.042872 -0.286838 Technology 1
Equity(122 [ADI]) -0.020921 -0.032457 0.004303 -0.197683 Technology 3
Equity(328 [ALTR]) 0.015387 0.006871 0.034475 -0.277710 Technology 1

Though 29% is under the 35% max, it seems a lot. Did I do something wrong with the zscore? Thanks.

In [ ]: