Notebook
In [1]:
from quantopian.pipeline.data import Fundamentals
from quantopian.research import run_pipeline
from quantopian.pipeline import Pipeline
from quantopian.pipeline.factors import Latest
from quantopian.pipeline.factors import CustomFactor
from quantopian.pipeline.experimental import QTradableStocksUS
from collections import OrderedDict


import pandas as pd
import numpy as np
from sklearn import preprocessing
In [2]:
class Nullfilling_minmaxscaled(CustomFactor):
    """Null-filled input, min-max scaled across the assets of each day.

    The input window is passed through ``np.nan_to_num`` and the last row of
    the window is rescaled with a default ``preprocessing.MinMaxScaler``.
    """
    window_length = 1

    def compute(self, today, assets, out, values):
        """Write the scaled, null-filled last row of ``values`` into ``out``.

        ``today`` and ``assets`` are accepted for the CustomFactor interface
        but are not used here.
        """
        latest = np.nan_to_num(values)[-1]
        # scikit-learn scalers expect a 2-D (n_samples, n_features) array.
        # Passing the 1-D row directly is deprecated in old releases and an
        # error in newer ones, so present the assets as one feature column
        # and flatten the result back to the shape of ``out``.
        scaled = preprocessing.MinMaxScaler().fit_transform(latest.reshape(-1, 1))
        out[:] = scaled.ravel()
            
class Nullindicator_minmaxscaled(CustomFactor):
    """NaN indicator of the input, min-max scaled across the assets of each day.

    The indicator is 1.0 where the input is NaN and 0.0 elsewhere; the last
    row of the window is then rescaled with a default
    ``preprocessing.MinMaxScaler``.
    """
    window_length = 1

    def compute(self, today, assets, out, values):
        """Write the scaled NaN indicator of the last row of ``values`` into ``out``.

        ``today`` and ``assets`` are accepted for the CustomFactor interface
        but are not used here.
        """
        indicator = np.where(np.isnan(values), 1.0, 0.0)[-1]
        # scikit-learn scalers expect a 2-D (n_samples, n_features) array.
        # Passing the 1-D row directly is deprecated in old releases and an
        # error in newer ones, so present the assets as one feature column
        # and flatten the result back to the shape of ``out``.
        scaled = preprocessing.MinMaxScaler().fit_transform(indicator.reshape(-1, 1))
        out[:] = scaled.ravel()

class Nullfilling_standardscaled(CustomFactor):
    """Null-filled input, standard-scaled across the assets of each day.

    The input window is passed through ``np.nan_to_num`` and the last row of
    the window is standardised with a default ``preprocessing.StandardScaler``.
    """
    window_length = 1

    def compute(self, today, assets, out, values):
        """Write the standardised, null-filled last row of ``values`` into ``out``.

        ``today`` and ``assets`` are accepted for the CustomFactor interface
        but are not used here.
        """
        latest = np.nan_to_num(values)[-1]
        # scikit-learn scalers expect a 2-D (n_samples, n_features) array.
        # Passing the 1-D row directly is deprecated in old releases and an
        # error in newer ones, so present the assets as one feature column
        # and flatten the result back to the shape of ``out``.
        scaled = preprocessing.StandardScaler().fit_transform(latest.reshape(-1, 1))
        out[:] = scaled.ravel()
            
class Nullindicator_standardscaled(CustomFactor):
    """NaN indicator of the input, standard-scaled across the assets of each day.

    The indicator is 1.0 where the input is NaN and 0.0 elsewhere; the last
    row of the window is then standardised with a default
    ``preprocessing.StandardScaler``.
    """
    window_length = 1

    def compute(self, today, assets, out, values):
        """Write the standardised NaN indicator of the last row into ``out``.

        ``today`` and ``assets`` are accepted for the CustomFactor interface
        but are not used here.
        """
        indicator = np.where(np.isnan(values), 1.0, 0.0)[-1]
        # scikit-learn scalers expect a 2-D (n_samples, n_features) array.
        # Passing the 1-D row directly is deprecated in old releases and an
        # error in newer ones, so present the assets as one feature column
        # and flatten the result back to the shape of ``out``.
        scaled = preprocessing.StandardScaler().fit_transform(indicator.reshape(-1, 1))
        out[:] = scaled.ravel()
        
class Nullindicator(CustomFactor):
    """Unscaled NaN indicator: 1.0 where the input is NaN, 0.0 elsewhere."""
    window_length = 1

    def compute(self, today, assets, out, values):
        """Write the NaN indicator of the last row of ``values`` into ``out``."""
        missing = np.isnan(values)
        # Booleans become 1.0 / 0.0 when cast to float64.
        flags = missing.astype(np.float64)
        out[:] = flags[-1]
In [3]:
def make_factors(mask):
    """Build the enterprise-value factor constructors for a pipeline.

    Parameters
    ----------
    mask
        Passed as ``mask=`` to every custom factor created by the returned
        constructors (the ``*_original`` column does not use it).

    Returns
    -------
    collections.OrderedDict
        Maps column name to a zero-argument callable that creates the
        corresponding pipeline term. Nothing is instantiated until a callable
        is invoked.
    """

    def enterprise_value_original():
        return Fundamentals.enterprise_value.latest

    def enterprise_value_minmaxscaled():
        return Nullfilling_minmaxscaled([Fundamentals.enterprise_value], mask=mask)

    def enterprise_value_standardscaled():
        return Nullfilling_standardscaled([Fundamentals.enterprise_value], mask=mask)

    def enterprise_value_nulls():
        return Nullindicator([Fundamentals.enterprise_value], mask=mask)

    def enterprise_value_nulls_minmaxscaled():
        return Nullindicator_minmaxscaled([Fundamentals.enterprise_value], mask=mask)

    def enterprise_value_nulls_standardscaled():
        return Nullindicator_standardscaled([Fundamentals.enterprise_value], mask=mask)

    # An OrderedDict (still a dict for callers) keeps the columns in the order
    # declared here; a plain dict iterates in arbitrary order on Python 2.
    all_factors = OrderedDict([
        ('enterprise_value_original', enterprise_value_original),
        ('enterprise_value_minmaxscaled', enterprise_value_minmaxscaled),
        ('enterprise_value_standardscaled', enterprise_value_standardscaled),
        ('enterprise_value_nulls', enterprise_value_nulls),
        ('enterprise_value_nulls_minmaxscaled', enterprise_value_nulls_minmaxscaled),
        ('enterprise_value_nulls_standardscaled', enterprise_value_nulls_standardscaled),
    ])

    return all_factors
In [4]:
def make_ml_pipeline(factors, universe):
    """Create a Pipeline with one column per entry of ``factors``.

    Parameters
    ----------
    factors : dict
        Maps column name to a zero-argument callable returning the term for
        that column; each callable is invoked once, in iteration order.
    universe
        Passed to ``Pipeline`` as its ``screen``.

    Returns
    -------
    Pipeline
    """
    columns = OrderedDict()
    # ``items()`` works on both Python 2 and 3; ``iteritems()`` is Python 2 only.
    for name, build_factor in factors.items():
        columns[name] = build_factor()

    return Pipeline(screen=universe, columns=columns)
In [5]:
# The factors are masked with the same filter that screens the pipeline,
# so every column is computed over the same 'universe'.
ml_pipe = make_ml_pipeline(
    factors=make_factors(mask=QTradableStocksUS()),
    universe=QTradableStocksUS()
)
In [6]:
# Keep the window short: a much longer time-period runs out of RAM.
start = pd.Timestamp("2015-01-01")
end = pd.Timestamp("2016-03-01")

results = run_pipeline(
    ml_pipe,
    start_date=start,
    end_date=end
)

Pipeline Execution Time: 14.63 Seconds

Results for timestamp 1 - Min Max Scaled

Minimum value:

In [7]:
# Minimum of the min-max scaled columns on the second pipeline date.
results.loc[
    results.index.get_level_values(0) == results.index.get_level_values(0).unique()[1],
    ["enterprise_value_minmaxscaled", "enterprise_value_nulls_minmaxscaled"]
].min()
Out[7]:
enterprise_value_minmaxscaled          0.0
enterprise_value_nulls_minmaxscaled    0.0
dtype: float64

When min-max scaling, the minimum value should always be 0. <font color=red>Why is enterprise_value_minmaxscaled not 0?</font>

Maximum value:

In [8]:
# Maximum of the min-max scaled columns on the second pipeline date.
results.loc[
    results.index.get_level_values(0) == results.index.get_level_values(0).unique()[1],
    ["enterprise_value_minmaxscaled", "enterprise_value_nulls_minmaxscaled"]
].max()
Out[8]:
enterprise_value_minmaxscaled          1.0
enterprise_value_nulls_minmaxscaled    0.0
dtype: float64

As expected, the maximum is 1 for enterprise_value_minmaxscaled. For enterprise_value_nulls_minmaxscaled it is 0: no imputation of nulls was performed, so every indicator value is 0, which is fine.

Results for timestamp 1 - Standard Scaled

Mean value:

In [9]:
# Mean of the standard-scaled columns on the second pipeline date.
results.loc[
    results.index.get_level_values(0) == results.index.get_level_values(0).unique()[1],
    ["enterprise_value_standardscaled", "enterprise_value_nulls_standardscaled"]
].mean()
Out[9]:
enterprise_value_standardscaled         -1.455749e-17
enterprise_value_nulls_standardscaled    0.000000e+00
dtype: float64

When standard scaling mean value should be 0 for enterprise_value_standardscaled. <font color=red> Why is enterprise value standardscaled not 0?</font> As for enterprise_value_nulls_standardscaled, the calculation should not be possible due to the fact that no value was imputed and therefore the column should only have 0 values inside. Therefore, standard deviation should be 0. Division by 0 is impossible. <font color=red> Why is the mean -0.757141?</font>

Standard deviation value:

In [10]:
# Standard deviation of the standard-scaled columns on the second pipeline date.
results.loc[
    results.index.get_level_values(0) == results.index.get_level_values(0).unique()[1],
    ["enterprise_value_standardscaled", "enterprise_value_nulls_standardscaled"]
].std()
Out[10]:
enterprise_value_standardscaled          1.000238
enterprise_value_nulls_standardscaled    0.000000
dtype: float64

When standard scaling standard deviation value should be 1 for enterprise_value_standardscaled. <font color=red> Why is enterprise value standardscaled not 1?</font> As for enterprise_value_nulls_standardscaled, the calculation should not be possible due to the fact that no value was imputed and therefore the column should only have 0 values inside. Therefore, standard deviation should be 0. Division by 0 is impossible. <font color=red> Why is the standard deviation 2.665169e-15?</font>

In [ ]: