# Thanh Duong 2018.02.07
# www.quantopian.com/posts/k-means-clustering-help
# Nick Lupica 2018.02.07
# www.quantopian.com/posts/k-means-clustering-help#5a7aa3c8e9f07d0dc9adec1d
from quantopian.pipeline.data.builtin import USEquityPricing
from quantopian.pipeline.factors import SimpleMovingAverage
from quantopian.pipeline.classifiers.fundamentals import Sector
from quantopian.research import run_pipeline
from quantopian.pipeline import Pipeline
from quantopian.pipeline.data import Fundamentals
from quantopian.pipeline.filters import Q1500US, Q500US
import pandas as pd
import numpy as np
import random as random
from itertools import combinations
from sklearn.cluster import KMeans
def make_pipeline():
    """Build the pipeline that supplies the features used for clustering.

    Returns:
        A ``Pipeline`` with six columns taken from the latest value of the
        corresponding ``Fundamentals`` fields ('enterprise value',
        'market_cap', 'sustain growth', 'ROA', 'ROE', 'ROIC'), screened to
        assets whose ``Sector`` classifier equals 103.
    """
    # NOTE(review): 103 is presumably the sector code for financials, going
    # by the original variable name -- confirm against the Sector classifier.
    financial_sector_filter = Sector().eq(103)

    # EV/EBITDA is currently left out of the feature set. To include it, add
    #   'EV/EBITDA': Fundamentals.ev_to_ebitda.latest
    # to the columns below.
    columns = {
        'enterprise value': Fundamentals.enterprise_value.latest,
        'market_cap': Fundamentals.market_cap.latest,
        'sustain growth': Fundamentals.sustainable_growth_rate.latest,
        'ROA': Fundamentals.roa.latest,
        'ROE': Fundamentals.roe.latest,
        'ROIC': Fundamentals.roic.latest,
    }
    return Pipeline(columns=columns, screen=financial_sector_filter)
# Run the pipeline for a single day and drop every row that has a missing
# value, so the array handed to KMeans below contains no NaNs.
result = run_pipeline(make_pipeline(), '2015-05-05', '2015-05-05')
result = result.dropna(axis=0)
result.head(5)

# Record the feature columns *before* the 'Cluster' label column is added.
# Every fit below selects exactly these columns, so labels from an earlier
# fit are never fed back in as an extra feature.
feature_columns = list(result.columns)
result_array = result[feature_columns].values  # DataFrame -> array for KMeans
result_array

kmeans = KMeans(n_clusters=50).fit(result_array)  # fit into 50 clusters
cluster_label = kmeans.labels_
print(cluster_label)  # single-argument form works on Python 2 and 3

# Column-vector view of the labels, kept for code that appends it to
# result_array along axis=1.
cluster = np.array(cluster_label)
cluster = cluster.reshape((-1, 1))

# A DataFrame column takes the 1-D label vector directly.
result['Cluster'] = cluster_label
result.head()

# Compact one-line form of the steps above. It fits on the feature columns
# only; using result.values here would include the 'Cluster' column that was
# just added. No random_state is passed, so the label ids of this second fit
# may differ from the first one.
result['Cluster'] = KMeans(n_clusters=50).fit(result[feature_columns].values).labels_
result.head()