Notebook
In [1]:
# Thanh Duong 2018.02.07
# www.quantopian.com/posts/k-means-clustering-help

# Nick Lupica 2018.02.07
# www.quantopian.com/posts/k-means-clustering-help#5a7aa3c8e9f07d0dc9adec1d
In [2]:
from quantopian.pipeline.data.builtin import USEquityPricing
from quantopian.pipeline.factors import SimpleMovingAverage
from quantopian.pipeline.classifiers.fundamentals import Sector 
from quantopian.research import run_pipeline
from quantopian.pipeline import Pipeline
from quantopian.pipeline.data import Fundamentals
from quantopian.pipeline.filters import Q1500US, Q500US

import pandas as pd
import numpy as np
import random as random
from itertools import combinations
from sklearn.cluster import KMeans

def make_pipeline():
    
    sector_filter = Sector()
    financial_sector_filter = sector_filter.eq(103)
    
    market_cap = Fundamentals.market_cap.latest
    
    enterprise_value = Fundamentals.enterprise_value.latest
    
    dps_growth = Fundamentals.dps_growth.latest
    
    sustain_growth = Fundamentals.sustainable_growth_rate.latest
    
    working_capital_per_share = Fundamentals.working_capital_per_share.latest
    
    ROA = Fundamentals.roa.latest
    
    ROE = Fundamentals.roe.latest
    
    ROIC = Fundamentals.roic.latest
    
    EV_EBITDA = Fundamentals.ev_to_ebitda.latest
    
    return Pipeline(
      columns={
          #'EV/EBITDA': EV_EBITDA,
          'enterprise value': enterprise_value,
          'market_cap': market_cap,
          'sustain growth': sustain_growth,
          'ROA' : ROA,
          'ROE' : ROE,
          'ROIC' : ROIC
    }, screen = financial_sector_filter
    )

result = run_pipeline(make_pipeline(), '2015-05-05', '2015-05-05')

result = result.dropna(axis=0)
result.head(5)
Out[2]:
ROA ROE ROIC enterprise value market_cap sustain growth
2015-05-05 00:00:00+00:00 Equity(21 [AAME]) 0.003682 0.011316 0.010876 9.892274e+07 8.150474e+07 0.0346
Equity(66 [AB]) 0.028198 0.028258 0.028258 3.247275e+09 3.247275e+09 0.0038
Equity(157 [AEG]) -0.000153 -0.002354 0.000615 2.605216e+10 2.124709e+10 0.0005
Equity(185 [AFL]) 0.005498 0.035853 0.029553 2.930033e+10 2.756033e+10 0.1282
Equity(192 [ATAX]) 0.001303 0.003155 0.007914 5.693063e+08 3.344038e+08 -0.0613
In [3]:
result_array = result.values #change from DataFrame to array to use k-means library 
In [4]:
result_array
Out[4]:
array([[  3.68200000e-03,   1.13160000e-02,   1.08760000e-02,
          9.89227430e+07,   8.15047430e+07,   3.46000000e-02],
       [  2.81980000e-02,   2.82580000e-02,   2.82580000e-02,
          3.24727510e+09,   3.24727510e+09,   3.80000000e-03],
       [ -1.53000000e-04,  -2.35400000e-03,   6.15000000e-04,
          2.60521645e+10,   2.12470910e+10,   5.00000000e-04],
       ..., 
       [  1.49100000e-03,   1.01940000e-02,   9.11800000e-03,
          2.16061994e+10,   1.43571994e+10,   4.21000000e-02],
       [  3.31100000e-03,   4.13979000e-01,   1.10787000e-01,
          6.58138244e+08,   4.31409244e+08,   1.50300000e-01],
       [  8.12000000e-04,   1.68980000e-02,   1.49610000e-02,
         -2.92384211e+10,   7.21025955e+10,   4.96000000e-02]])
In [5]:
kmeans = KMeans(n_clusters=50).fit(result_array) #fit into 50 clusters
cluster_label = kmeans.labels_
print cluster_label
[28 32  9 21 28 31 17  0 21 28 32 28 31  1 30 23 46  4 42 28 32 28 34  0 32
  0 32 32 28 28  0  9 32  0  1 32  0 28 28 42 42 17 28  0 28  0 28 28 28  0
 32 32 28 28 28 28 28 32 28  0  0  0 28  0  9 28  0 28  0 28  0  0 28 28 17
  0 32 28 28 28  0 28  0 28 28 13 28 48 42 48  9 43 28 49 28 46 21 32 28 32
 28  9  0 28  0 28 28 42 28  0 32  0 32 28 48 28  8 28 28 28 42  0 48  3 42
 32  0  0 28 42 32 32 21 28 28 20 16 28 21 28 32  0 28 28 42  0  9 28 42  0
 28 28  0 17 32 28 28 24 32  6  0  0 28 42  0 28  3 28 28 28 28 28 28 28 42
 28 28 28 32 28 28  0 28 32  0  0 28 28 49 28 28  0 49 28 28  0 28 28 28 28
 28 28 28  4 28 28 36  0 32 28 28 28 28 32 48 32 28 32 42 32 42 44 28 32 28
  9 28 28 28 28 28 28 28  0 28 28  0 28 28 28 15 28 28 28 28  0 28 28 28  9
  9  0  0 13 28 28  7 19 32 32 28 28 28 22 28 28 28 28 28 28 28 28  0 28 28
 28 32  0 28 32 28 32 32 28  0 28 28 28 28 28 28 28 32 28 28 28 28 28  0 32
 28 28 32 32 28 15 28 28 32 32 32 28 28  5 17 28 45 28  0  0 28  0 28 28 28
 28 28 28 28 33  9 45 28  0 28  0 28 32 28 28 45 28  0 28 28 28 28 48 28 28
 28 28 28 28 28 28 25 28 28 28 28 28 28  0 17 28  3 28 28 28 38 28 36 48 12
  0 28 28 28 28 28 20 33 28 48 19  0 28 28  0 28 28 28  0 28 28 28  0 28 28
 22 28 49 32  0 28  0 28  0 21 28 14 41 28 32 28 28 28 32 13 28 28 28  0  0
 26 40 28 28 32  0 28 28 28  8  0 32 32 28  0 32 28 28 28 28 28 32  0 28 42
 28 28 28 28 28 28 32 28 32 28 28 28 28 28 28 42 28 28  0 28 28 28 42 28 28
  0 28 28 49 28 28 28 21 28 28 28 32 21 28 28  0 28 28 28 28 28 28 28 28 28
 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28  0 28
  2 28  0 28 32 28 28  0 28 17 28 28 28  0 28 37  0 32 28 28 28  0 28 32 28
 28 28 28 32 42 28 21  3 28 28 32  0 28 27 28 48 28 28 28 28 28 28 28 11 28
  0 28 28  0 28 29 28 28 28 28 33 28 48 28 32 28  0 28  0  0 28 32 32 21 28
 28 28 28 10  0 32 28 17 28 28 28 28 32 28 42 28  0 28 28 28 28 28 28 28 28
 28  8 28 35 32 32 28 28 48 28 28 28 28 28 28 28 28 39 48 28  0  0  0 28 28
 28 28 28 28 28  0 28 29 28  0  0 32 18 28 28 28 28 28 48 32 28 48  9 28 47]
In [6]:
cluster = np.array(cluster_label)
cluster = cluster.reshape((-1, 1))
#result_array = np.append(cluster,result_array,axis=1) #append cluster ID to array 
#pd.DataFrame(result_array) #change array back to DataFrame
In [7]:
result['Cluster'] = cluster
result.head()
Out[7]:
ROA ROE ROIC enterprise value market_cap sustain growth Cluster
2015-05-05 00:00:00+00:00 Equity(21 [AAME]) 0.003682 0.011316 0.010876 9.892274e+07 8.150474e+07 0.0346 28
Equity(66 [AB]) 0.028198 0.028258 0.028258 3.247275e+09 3.247275e+09 0.0038 32
Equity(157 [AEG]) -0.000153 -0.002354 0.000615 2.605216e+10 2.124709e+10 0.0005 9
Equity(185 [AFL]) 0.005498 0.035853 0.029553 2.930033e+10 2.756033e+10 0.1282 21
Equity(192 [ATAX]) 0.001303 0.003155 0.007914 5.693063e+08 3.344038e+08 -0.0613 28
In [8]:
result['Cluster'] = np.array(KMeans(n_clusters=50).fit(result.values).labels_).reshape((-1, 1))
result.head()
Out[8]:
ROA ROE ROIC enterprise value market_cap sustain growth Cluster
2015-05-05 00:00:00+00:00 Equity(21 [AAME]) 0.003682 0.011316 0.010876 9.892274e+07 8.150474e+07 0.0346 0
Equity(66 [AB]) 0.028198 0.028258 0.028258 3.247275e+09 3.247275e+09 0.0038 23
Equity(157 [AEG]) -0.000153 -0.002354 0.000615 2.605216e+10 2.124709e+10 0.0005 9
Equity(185 [AFL]) 0.005498 0.035853 0.029553 2.930033e+10 2.756033e+10 0.1282 29
Equity(192 [ATAX]) 0.001303 0.003155 0.007914 5.693063e+08 3.344038e+08 -0.0613 0