
Hierarchical Clustering

Here is a step-by-step guide on building a hierarchical clustering and dendrogram from our time series using SciPy. Note that scikit-learn (a powerful data analysis library built on top of SciPy) also has many other clustering algorithms implemented.

A more thorough tutorial on hierarchical clustering in Python: https://joernhees.de/blog/2015/08/26/scipy-hierarchical-clustering-and-dendrogram-tutorial/
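
For reference, essentially the same agglomerative clustering can be run through scikit-learn as well. A minimal sketch, assuming a correlation-based precomputed distance (older scikit-learn releases call the parameter affinity; newer ones use metric):

import numpy as np
from sklearn.cluster import AgglomerativeClustering

def sklearn_clusters(data, n_clusters):
    dist = 1 - np.corrcoef(data)  # distance matrix: 1 - correlation between rows
    model = AgglomerativeClustering(n_clusters=n_clusters,
                                    affinity='precomputed',
                                    linkage='average')
    return model.fit_predict(dist)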

Example with synthetic data

First we build some synthetic time series to work with: six groups of correlated series, which we expect the hierarchical clustering to recover.

In [1]:
import numpy as np
import seaborn as sns
import pandas as pd
from scipy import stats
import scipy.cluster.hierarchy as hac
from scipy.cluster.hierarchy import fcluster

import matplotlib.pyplot as plt

#
# build 6 time series groups for testing, called: a, b, c, d, e, f
#

num_samples = 61
group_size = 10

#
# create the main time series for each group
#

x = np.linspace(0, 5, num_samples)
scale = 4

a = scale * np.sin(x)
b = scale * (np.cos(1+x*3) + np.linspace(0, 1, num_samples))
c = scale * (np.sin(2+x*6) + np.linspace(0, -1, num_samples))
d = scale * (np.cos(3+x*9) + np.linspace(0, 4, num_samples))
e = scale * (np.sin(4+x*12) + np.linspace(0, -4, num_samples))
f = scale * np.cos(x)

#
# from each main series build 'group_size' series
#

timeSeries = pd.DataFrame()
ax = None
for arr in [a,b,c,d,e,f]:
    arr = arr + np.random.rand(group_size, num_samples) + np.random.randn(group_size, 1)
    df = pd.DataFrame(arr)
    timeSeries = pd.concat([timeSeries, df])

    # We use seaborn to plot what we have
    #ax = sns.tsplot(ax=ax, data=df.values, ci=[68, 95])
    ax = sns.tsplot(ax=ax, data=df.values, err_style="unit_traces")

plt.show()

print(timeSeries.shape)
(60, 61)

Now we do the clustering and plot it:

In [2]:
Z = hac.linkage(timeSeries, 'single', 'correlation')
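
As an optional sanity check (a common diagnostic, not part of the original run), the cophenetic correlation coefficient measures how faithfully the dendrogram preserves the original pairwise distances; values close to 1 are good:

from scipy.cluster.hierarchy import cophenet
from scipy.spatial.distance import pdist

c, coph_dists = cophenet(Z, pdist(timeSeries, 'correlation'))
print(c)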
In [3]:
plt.figure(figsize=(25, 10))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('sample index')
plt.ylabel('distance')
hac.dendrogram(
    Z,
    leaf_rotation=90.,  # rotates the x axis labels
    leaf_font_size=8.,  # font size for the x axis labels
)
plt.show()

To retrieve the clusters we can use the fcluster function. It can be run with several criteria (check the documentation); in this example we pass the number of clusters we want as the target:

In [31]:
def print_clusters(timeSeries, Z, k, plot=False, plot_threshold=5):
    # k Number of clusters I'd like to extract
    results = fcluster(Z, k, criterion='maxclust')

    # check the results
    s = pd.Series(results)
    clusters = s.unique()

    for c in clusters:
        cluster_indices = s[s==c].index
        cluster_size = len(cluster_indices)
        print("Cluster %d number of entries %d" % (c, cluster_size))
        if plot and cluster_size > plot_threshold:
            cluster_series = timeSeries.T.iloc[:,cluster_indices]
            #cluster_series.plot(legend=False)
            sns.tsplot(data=cluster_series.T.values, err_style="unit_traces")
            plt.show()
In [5]:
print_clusters(timeSeries, Z, 6, plot=True)
Cluster 4 number of entries 10
Cluster 6 number of entries 10
Cluster 1 number of entries 10
Cluster 5 number of entries 10
Cluster 3 number of entries 10
Cluster 2 number of entries 10
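
fcluster can also cut the tree at a fixed height instead of a target cluster count. A sketch (the 0.7 threshold is illustrative only; read a sensible cut height off the dendrogram):

max_d = 0.7  # illustrative cut height
results = fcluster(Z, max_d, criterion='distance')
print(pd.Series(results).value_counts())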

If we already have the correlation matrix, or if we want to choose which kind of correlation to apply, we can do the following:

In [6]:
correlation_matrix = timeSeries.T.corr(method='spearman')
correlation_matrix.shape
Out[6]:
(60, 60)
In [7]:
sns.heatmap(correlation_matrix)
Out[7]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f1e5c6cb150>
In [8]:
distance = (1 - correlation_matrix) # Compute the distance from the correlation
Z = hac.linkage(distance, method='average', metric='euclidean')  
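
One subtlety worth noting: linkage treats a square 2-D input as a matrix of observations, so the call above clusters each row of the distance matrix by euclidean distance (i.e. each series' correlation profile) rather than using the precomputed distances themselves. To cluster on the precomputed distances directly, condense the matrix with squareform first; a minimal sketch:

from scipy.spatial.distance import squareform

dist_array = distance.values.copy()
np.fill_diagonal(dist_array, 0)  # self-distance must be exactly 0
Z_alt = hac.linkage(squareform(dist_array, checks=False), method='average')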
In [9]:
plt.figure(figsize=(25, 10))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('sample index')
plt.ylabel('distance')
hac.dendrogram(
    Z,
    leaf_rotation=90.,  # rotates the x axis labels
    leaf_font_size=8.,  # font size for the x axis labels
)
plt.show()
In [10]:
print_clusters(timeSeries, Z, 6, plot=False)
Cluster 4 number of entries 10
Cluster 6 number of entries 10
Cluster 1 number of entries 10
Cluster 5 number of entries 10
Cluster 3 number of entries 10
Cluster 2 number of entries 10

Finally, if we need to create our own distance function, we can do the following:

In [11]:
# Custom metric: Pearson correlation turned into a distance
def my_metric(x, y):
    r = stats.pearsonr(x, y)[0]
    return 1 - r  # correlation to distance: range 0 to 2

# Do the clustering    
Z = hac.linkage(timeSeries,  method='single', metric=my_metric)
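
One caveat, not from the original: a Python callable is evaluated once per pair of rows, which gets slow as the dataset grows. Since pdist's built-in 'correlation' metric already computes 1 - r, the same linkage can be obtained vectorized:

from scipy.spatial.distance import pdist

condensed = pdist(timeSeries, metric='correlation')  # 1 - Pearson r, all pairs at once
Z = hac.linkage(condensed, method='single')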
In [12]:
plt.figure(figsize=(25, 10))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('sample index')
plt.ylabel('distance')
hac.dendrogram(
    Z,
    leaf_rotation=90.,  # rotates the x axis labels
    leaf_font_size=8.,  # font size for the x axis labels
)
plt.show()
In [13]:
print_clusters(timeSeries, Z, 6, plot=False)
Cluster 4 number of entries 10
Cluster 6 number of entries 10
Cluster 1 number of entries 10
Cluster 5 number of entries 10
Cluster 3 number of entries 10
Cluster 2 number of entries 10

Example with market data

In [68]:
from quantopian.pipeline import Pipeline
from quantopian.research import run_pipeline
from quantopian.pipeline.factors import Returns, CustomFactor
from quantopian.pipeline.filters import StaticAssets
from quantopian.pipeline.data.builtin import USEquityPricing
from quantopian.pipeline.filters.morningstar import Q1500US, Q500US


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns
from scipy import stats
import scipy.cluster.hierarchy as hac

import math
import datetime
In [69]:
#
# Select period of interest and universe of stocks
#
start_date = "2014-01-1"
end_date   = "2016-01-1"

universe   = Q500US()
In [70]:
#
# fetch the securities at start_date and use them for the full period
#
pipe = Pipeline(screen=universe)
pipe_out = run_pipeline(pipe, start_date, start_date)
assets = pipe_out.index.get_level_values(1)
print(len(assets))
500
In [71]:
universe = StaticAssets(assets)
pipe = Pipeline(screen=universe)

#
# we can use price or returns for the clustering
#
#pipe.add(Returns(mask=universe, window_length=2),'returns')  # Returns
pipe.add(USEquityPricing.close.latest,'price')              # Price

pipe_out = run_pipeline(pipe, start_date, end_date)
In [72]:
pipe_out = pipe_out.unstack()
pipe_out.columns = pipe_out.columns.droplevel()
pipe_out = pipe_out.fillna(method='ffill', axis=0).fillna(method='bfill', axis=0)
print('Basket of stocks %d, output shape %s' % (len(pipe_out.columns), pipe_out.shape))
Basket of stocks 500, output shape (505, 500)
In [73]:
#
# build the cluster
#
def my_metric(x, y):
    r = stats.pearsonr(x, y)[0]
    similarity = r - 1.           # range -2 (anti-correlated) to 0 (identical)
    distance = -similarity / 2.0  # rescale to a distance in [0, 1]
    return distance

Z = hac.linkage(pipe_out.T,  method='average', metric=my_metric)
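
This maps a correlation r in [-1, 1] to a distance (1 - r) / 2 in [0, 1]. A quick illustrative sanity check:

t = np.arange(10, dtype=float)
print(my_metric(t, t))   # perfectly correlated -> ~0.0
print(my_metric(t, -t))  # perfectly anti-correlated -> ~1.0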
In [74]:
plt.figure(figsize=(25, 10))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('sample index')
plt.ylabel('distance')
hac.dendrogram(
    Z,
    leaf_rotation=90.,  # rotates the x axis labels
    leaf_font_size=8.,  # font size for the x axis labels
)
plt.show()
In [75]:
cluster_series = pipe_out

#
# build the cumulative returns series from RETURNS
#
#cluster_series = cluster_series.add(1).cumprod()

#
# build the cumulative returns series from PRICES
#
cluster_series  = (cluster_series / cluster_series.iloc[0, :]) - 1

number_of_clusters_to_use = [50, ]
for k in number_of_clusters_to_use:
    print_clusters(cluster_series.T, Z, k, plot=True)
Cluster 35 number of entries 50
Cluster 31 number of entries 8
Cluster 15 number of entries 195
Cluster 7 number of entries 3
Cluster 45 number of entries 39
Cluster 38 number of entries 1
Cluster 36 number of entries 91
Cluster 16 number of entries 1
Cluster 34 number of entries 7
Cluster 28 number of entries 5
Cluster 26 number of entries 3
Cluster 42 number of entries 3
Cluster 39 number of entries 6
Cluster 41 number of entries 4
Cluster 40 number of entries 3
Cluster 17 number of entries 3
Cluster 13 number of entries 3
Cluster 11 number of entries 4
Cluster 20 number of entries 6
Cluster 27 number of entries 2
Cluster 24 number of entries 3
Cluster 44 number of entries 3
Cluster 12 number of entries 2
Cluster 46 number of entries 1
Cluster 37 number of entries 3
Cluster 21 number of entries 1
Cluster 10 number of entries 3
Cluster 4 number of entries 8
Cluster 48 number of entries 3
Cluster 3 number of entries 4
Cluster 6 number of entries 1
Cluster 18 number of entries 1
Cluster 33 number of entries 2
Cluster 2 number of entries 1
Cluster 5 number of entries 2
Cluster 32 number of entries 1
Cluster 14 number of entries 2
Cluster 22 number of entries 4
Cluster 25 number of entries 2
Cluster 30 number of entries 3
Cluster 1 number of entries 2
Cluster 23 number of entries 2
Cluster 49 number of entries 2
Cluster 43 number of entries 1
Cluster 8 number of entries 1
Cluster 47 number of entries 1
Cluster 19 number of entries 1
Cluster 50 number of entries 1
Cluster 9 number of entries 1
Cluster 29 number of entries 1
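
To act on these clusters (for example to shortlist candidate pairs or baskets), the labels can be mapped back to the assets. A minimal sketch using the variables above:

# map each asset in the basket to its cluster label
labels = hac.fcluster(Z, 50, criterion='maxclust')
cluster_members = pd.Series(labels, index=pipe_out.columns)

# e.g. list the members of one of the larger clusters
print(cluster_members[cluster_members == 35].index.tolist())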