from quantopian.pipeline.data import Fundamentals
from quantopian.research import run_pipeline
from quantopian.pipeline import Pipeline
from quantopian.pipeline.factors import Latest
from quantopian.pipeline.factors import CustomFactor
from quantopian.pipeline.experimental import QTradableStocksUS
from collections import OrderedDict


import pandas as pd
import numpy as np
from sklearn import preprocessing

class Nullfilling_minmaxscaled(CustomFactor):
    """Zero-fill missing inputs, then min-max scale the latest row.

    The last row of the input window is passed through ``np.nan_to_num``
    and then through scikit-learn's ``MinMaxScaler``; the scaled row is
    written to ``out``.
    """
    window_length = 1

    def compute(self, today, assets, out, values):
        """Fill ``out`` with the min-max-scaled last row of ``values``.

        ``today`` and ``assets`` are not used.
        NOTE(review): assumes ``values`` is a 2-D window whose last row
        holds one entry per asset, matching ``out`` -- confirm against
        the Pipeline ``CustomFactor`` contract.
        """
        # NaN becomes 0.0; nan_to_num also maps +/-inf to large finite
        # numbers, which would then dominate the scaling range.
        latest = np.nan_to_num(values[-1])
        if latest.size == 0:
            # No assets in this cross-section: the scaler cannot be fitted
            # on zero samples, and there is nothing to scale anyway.
            out[:] = latest
            return
        # scikit-learn scalers take (n_samples, n_features). Present the
        # cross-section as a single feature column so the scaling runs
        # across assets and 1-D input is never handed to the scaler.
        column = latest.reshape(-1, 1)
        scaled = preprocessing.MinMaxScaler().fit_transform(column)
        out[:] = scaled.ravel()
            
class Nullindicator_minmaxscaled(CustomFactor):
    """Missing-value indicator (1.0 = NaN, 0.0 = present), min-max scaled.

    The indicator is built from the last row of the input window and then
    passed through scikit-learn's ``MinMaxScaler``.
    """
    window_length = 1

    def compute(self, today, assets, out, values):
        """Fill ``out`` with the scaled NaN indicator of the last row.

        ``today`` and ``assets`` are not used.
        NOTE(review): assumes ``values`` is a 2-D window whose last row
        holds one entry per asset, matching ``out`` -- confirm against
        the Pipeline ``CustomFactor`` contract.
        NOTE(review): when the row has no NaNs (or only NaNs) the indicator
        is constant; scikit-learn is documented to use a scale of 1 for
        zero-range features, so the result should be all zeros rather than
        a division by zero -- confirm for the installed version.
        """
        indicator = np.where(np.isnan(values[-1]), 1.0, 0.0)
        if indicator.size == 0:
            # No assets in this cross-section: the scaler cannot be fitted
            # on zero samples, and there is nothing to scale anyway.
            out[:] = indicator
            return
        # scikit-learn scalers take (n_samples, n_features). Present the
        # cross-section as a single feature column so the scaling runs
        # across assets and 1-D input is never handed to the scaler.
        column = indicator.reshape(-1, 1)
        scaled = preprocessing.MinMaxScaler().fit_transform(column)
        out[:] = scaled.ravel()

class Nullfilling_standardscaled(CustomFactor):
    """Zero-fill missing inputs, then standard-scale the latest row.

    The last row of the input window is passed through ``np.nan_to_num``
    and then through scikit-learn's ``StandardScaler``; the scaled row is
    written to ``out``.
    """
    window_length = 1

    def compute(self, today, assets, out, values):
        """Fill ``out`` with the standard-scaled last row of ``values``.

        ``today`` and ``assets`` are not used.
        NOTE(review): assumes ``values`` is a 2-D window whose last row
        holds one entry per asset, matching ``out`` -- confirm against
        the Pipeline ``CustomFactor`` contract.
        """
        # NaN becomes 0.0; nan_to_num also maps +/-inf to large finite
        # numbers, which would then dominate the mean and variance.
        latest = np.nan_to_num(values[-1])
        if latest.size == 0:
            # No assets in this cross-section: the scaler cannot be fitted
            # on zero samples, and there is nothing to scale anyway.
            out[:] = latest
            return
        # scikit-learn scalers take (n_samples, n_features). Present the
        # cross-section as a single feature column so the scaling runs
        # across assets and 1-D input is never handed to the scaler.
        column = latest.reshape(-1, 1)
        scaled = preprocessing.StandardScaler().fit_transform(column)
        out[:] = scaled.ravel()
            
class Nullindicator_standardscaled(CustomFactor):
    """Missing-value indicator (1.0 = NaN, 0.0 = present), standard-scaled.

    The indicator is built from the last row of the input window and then
    passed through scikit-learn's ``StandardScaler``.
    """
    window_length = 1

    def compute(self, today, assets, out, values):
        """Fill ``out`` with the scaled NaN indicator of the last row.

        ``today`` and ``assets`` are not used.
        NOTE(review): assumes ``values`` is a 2-D window whose last row
        holds one entry per asset, matching ``out`` -- confirm against
        the Pipeline ``CustomFactor`` contract.
        NOTE(review): when the row has no NaNs (or only NaNs) the indicator
        has zero variance; scikit-learn is documented to use a scale of 1
        in that case, so the result should be all zeros rather than a
        division by zero -- confirm for the installed version.
        """
        indicator = np.where(np.isnan(values[-1]), 1.0, 0.0)
        if indicator.size == 0:
            # No assets in this cross-section: the scaler cannot be fitted
            # on zero samples, and there is nothing to scale anyway.
            out[:] = indicator
            return
        # scikit-learn scalers take (n_samples, n_features). Present the
        # cross-section as a single feature column so the scaling runs
        # across assets and 1-D input is never handed to the scaler.
        column = indicator.reshape(-1, 1)
        scaled = preprocessing.StandardScaler().fit_transform(column)
        out[:] = scaled.ravel()
        
class Nullindicator(CustomFactor):
    """Unscaled missing-value indicator: 1.0 where the input is NaN, else 0.0."""
    window_length = 1

    def compute(self, today, assets, out, values):
        """Write the NaN indicator of the last row of ``values`` into ``out``.

        ``today`` and ``assets`` are not used.
        """
        missing = np.isnan(values)
        flags = np.where(missing, 1.0, 0.0)
        # Only the final row of the window is emitted.
        last_row = flags[-1]
        out[:] = last_row

def make_factors(mask):
    """Return an ordered mapping of column name -> zero-argument factor builder.

    Each builder constructs one pipeline term derived from
    ``Fundamentals.enterprise_value`` when it is called; nothing is
    constructed by ``make_factors`` itself.

    Parameters
    ----------
    mask
        Passed as ``mask=`` to every custom factor built here. The
        ``enterprise_value_original`` builder does not use it.

    Returns
    -------
    collections.OrderedDict
        Builders keyed by column name, in the order listed below. An
        ``OrderedDict`` (a ``dict`` subclass) is used so the order is the
        same on every run, including on interpreters where a plain
        ``dict`` does not keep insertion order.
    """

    def enterprise_value_original():
        # Raw latest value, with no null handling and no scaling.
        return Fundamentals.enterprise_value.latest

    def enterprise_value_minmaxscaled():
        return Nullfilling_minmaxscaled([Fundamentals.enterprise_value], mask=mask)

    def enterprise_value_standardscaled():
        return Nullfilling_standardscaled([Fundamentals.enterprise_value], mask=mask)

    def enterprise_value_nulls():
        return Nullindicator([Fundamentals.enterprise_value], mask=mask)

    def enterprise_value_nulls_minmaxscaled():
        return Nullindicator_minmaxscaled([Fundamentals.enterprise_value], mask=mask)

    def enterprise_value_nulls_standardscaled():
        return Nullindicator_standardscaled([Fundamentals.enterprise_value], mask=mask)

    all_factors = OrderedDict([
        ('enterprise_value_original', enterprise_value_original),
        ('enterprise_value_minmaxscaled', enterprise_value_minmaxscaled),
        ('enterprise_value_standardscaled', enterprise_value_standardscaled),
        ('enterprise_value_nulls', enterprise_value_nulls),
        ('enterprise_value_nulls_minmaxscaled', enterprise_value_nulls_minmaxscaled),
        ('enterprise_value_nulls_standardscaled', enterprise_value_nulls_standardscaled),
    ])

    return all_factors

def make_ml_pipeline(factors, universe):
    """Build a ``Pipeline`` with one column per entry of ``factors``.

    Parameters
    ----------
    factors
        Mapping of column name -> zero-argument callable returning the
        pipeline term for that column (as produced by ``make_factors``).
    universe
        Passed to ``Pipeline`` as its ``screen``.

    Returns
    -------
    Pipeline
        Its columns follow the iteration order of ``factors``.
    """
    factors_pipe = OrderedDict()
    # .items() exists on both Python 2 and Python 3 mappings, whereas
    # .iteritems() is Python 2 only and raises AttributeError on Python 3.
    for name, build in factors.items():
        factors_pipe[name] = build()

    return Pipeline(screen=universe, columns=factors_pipe)

# Date range for the run. A much longer period cannot be used because the
# run then exhausts the available RAM.
start = pd.Timestamp("2015-01-01")
end = pd.Timestamp("2016-03-01")

# The factors are masked with the same filter that screens the pipeline, so
# every column is computed over the same 'universe'.
ml_pipe = make_ml_pipeline(
    factors=make_factors(mask=QTradableStocksUS()),
    universe=QTradableStocksUS(),
)

results = run_pipeline(ml_pipe, start_date=start, end_date=end)

Results for timestamp 1 - Min-Max Scaled

Minimum value:

# Column-wise minimum of the two min-max-scaled columns, restricted to the
# rows whose first index level equals the entry at position 1 of that
# level's unique values.
results.loc[
    results.index.get_level_values(0) == results.index.get_level_values(0).unique()[1],
    ["enterprise_value_minmaxscaled", "enterprise_value_nulls_minmaxscaled"]
].min()

enterprise_value_minmaxscaled          0.0
enterprise_value_nulls_minmaxscaled    0.0
dtype: float64

With min-max scaling, the minimum value should always be 0. Why is the minimum of enterprise_value_minmaxscaled not 0?

Maximum value:

# Column-wise maximum of the two min-max-scaled columns, restricted to the
# rows whose first index level equals the entry at position 1 of that
# level's unique values.
results.loc[
    results.index.get_level_values(0) == results.index.get_level_values(0).unique()[1],
    ["enterprise_value_minmaxscaled", "enterprise_value_nulls_minmaxscaled"]
].max()

enterprise_value_minmaxscaled          1.0
enterprise_value_nulls_minmaxscaled    0.0
dtype: float64

As expected, the maximum is 1 for enterprise_value_minmaxscaled. It is 0 for enterprise_value_nulls_minmaxscaled because no nulls were imputed, so every indicator value is 0, which is fine.

Results for timestamp 1 - Standard Scaled

Mean value:

# Column-wise mean of the two standard-scaled columns, restricted to the
# rows whose first index level equals the entry at position 1 of that
# level's unique values.
results.loc[
    results.index.get_level_values(0) == results.index.get_level_values(0).unique()[1],
    ["enterprise_value_standardscaled", "enterprise_value_nulls_standardscaled"]
].mean()

enterprise_value_standardscaled         -1.455749e-17
enterprise_value_nulls_standardscaled    0.000000e+00
dtype: float64

With standard scaling, the mean should be 0 for enterprise_value_standardscaled. Why is the mean of enterprise_value_standardscaled not 0? As for enterprise_value_nulls_standardscaled, the calculation should not be possible: no value was imputed, so the column should contain only 0 values and its standard deviation should be 0. Division by 0 is impossible. Why is the mean -0.757141?

Standard deviation value:

# Column-wise standard deviation of the two standard-scaled columns,
# restricted to the rows whose first index level equals the entry at
# position 1 of that level's unique values.
results.loc[
    results.index.get_level_values(0) == results.index.get_level_values(0).unique()[1],
    ["enterprise_value_standardscaled", "enterprise_value_nulls_standardscaled"]
].std()

enterprise_value_standardscaled          1.000238
enterprise_value_nulls_standardscaled    0.000000
dtype: float64

With standard scaling, the standard deviation should be 1 for enterprise_value_standardscaled. Why is the standard deviation of enterprise_value_standardscaled not 1? As for enterprise_value_nulls_standardscaled, the calculation should not be possible: no value was imputed, so the column should contain only 0 values and its standard deviation should be 0. Division by 0 is impossible. Why is the standard deviation 2.665169e-15?