
COVID-19 growth analysis

In [1]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib

Load data

Download the CSV file and upload it to your data directory, then load it with local_csv.
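local_csv is specific to the Quantopian research environment. Outside of it, the same file could be loaded with plain pandas (a minimal sketch, assuming the file sits in the working directory):

import pandas as pd

# Read the CSV and parse the first column as a datetime index,
# matching what the cell below does with local_csv + pd.to_datetime
df = pd.read_csv('covid19_March_13_2020.csv', index_col=0, parse_dates=True)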

In [2]:
df = local_csv('covid19_March_13_2020.csv', index_col=0)
df.index = pd.to_datetime(df.index)
df.head()
Out[2]:
country state type cases
date
2020-01-22 Thailand NaN confirmed 2
2020-01-23 Thailand NaN confirmed 3
2020-01-24 Thailand NaN confirmed 5
2020-01-25 Thailand NaN confirmed 7
2020-01-26 Thailand NaN confirmed 8

Preprocess data

In [3]:
df_confirmed = df.loc[lambda x: x.type == 'confirmed']

# Estimated critical cases
p_crit = .05
df_confirmed = df_confirmed.assign(cases_crit=df_confirmed.cases*p_crit)

# Compute days relative to the date each country crossed 100 confirmed cases
df_confirmed.loc[:, 'days_since_100'] = np.nan
for country in df_confirmed.country.unique():
    df_confirmed.loc[(df_confirmed.country == country), 'days_since_100'] = \
        np.arange(-len(df_confirmed.loc[(df_confirmed.country == country) & (df_confirmed.cases < 100)]), 
                  len(df_confirmed.loc[(df_confirmed.country == country) & (df_confirmed.cases >= 100)]))
    
annotate_kwargs = dict(
    s='Based on COVID Data Repository by Johns Hopkins CSSE ({})\nBy Thomas Wiecki'.format(df_confirmed.index.max().strftime('%B %d, %Y')), 
    xy=(0.05, 0.01), xycoords='figure fraction', fontsize=10)
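
The loop above assigns, per country, negative day counts before the 100-case threshold is reached and 0, 1, 2, ... afterwards. The same result can be sketched with a groupby (assuming, as the loop does, that rows are date-sorted within each country):

def add_days_since_100(g):
    # The number of rows below the threshold determines where the count starts
    below = int((g['cases'] < 100).sum())
    g = g.copy()
    g['days_since_100'] = np.arange(-below, len(g) - below)
    return g

df_confirmed = df_confirmed.groupby('country', group_keys=False).apply(add_days_since_100)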

Select countries of interest

In [4]:
european_countries = ['Italy', 'Germany', 'France (total)', 'Spain', 'United Kingdom (total)', 
                      'Iran']
large_engl_countries = ['US (total)', 'Canada (total)', 'Australia (total)']
asian_countries = ['Singapore', 'Japan', 'Korea, South', 'Hong Kong']
south_american_countries = ['Argentina', 'Brazil', 'Colombia', 'Chile']

country_groups = [european_countries, large_engl_countries, asian_countries, south_american_countries]
line_styles = ['-', ':', '--', '-.']
In [5]:
def plot_countries(df, countries, min_cases=100, ls='-', col='cases'):
    for country in countries:
        df_country = df.loc[(df.country == country) & (df.cases >= min_cases)]
        if len(df_country) == 0:
            continue
        df_country.reset_index()[col].plot(label=country, ls=ls)
        
sns.set_palette(sns.hls_palette(10, l=.45, s=.8))  # 10 distinct hues; colors cycle across country groups
fig, ax = plt.subplots(figsize=(12, 8))

for countries, ls in zip(country_groups, line_styles):
    plot_countries(df_confirmed, countries, ls=ls)

x = np.linspace(0, plt.xlim()[1] - 1)
ax.plot(x, 100 * (1.33) ** x, ls='--', color='k', label='33% daily growth')

ax.set(yscale='log',
       title='Exponential growth of COVID-19 across countries',
       xlabel='Days from first 100 confirmed cases',
       ylabel='Confirmed cases (log scale)')
#ax.get_yaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter())
ax.legend(loc=0)
ax.annotate(**annotate_kwargs)
sns.despine();
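
For reference, the dashed line plots 100 * 1.33**x, i.e. 33% daily growth starting from 100 cases, which corresponds to a doubling time of log(2) / log(1.33) ≈ 2.4 days:

doubling_time = np.log(2) / np.log(1.33)
print('Doubling time at 33% daily growth: {:.2f} days'.format(doubling_time))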
In [6]:
fig, ax = plt.subplots(figsize=(12, 8))

for countries, ls in zip(country_groups, line_styles):
    plot_countries(df_confirmed, countries, ls=ls)

x = np.linspace(0, plt.xlim()[1] - 1)
ax.plot(x, 100 * (1.33) ** x, ls='--', color='k', label='33% daily growth')

ax.set(title='Exponential growth of COVID-19 across countries',
       xlabel='Days from first 100 confirmed cases',
       ylabel='Confirmed cases', ylim=(0, 30000))
ax.legend(loc=0)
ax.annotate(**annotate_kwargs)
sns.despine();
In [7]:
covid19_df = df.groupby(level=0).agg({'cases': 'sum'})
covid19_df['growth_1d'] = (covid19_df - covid19_df.shift(1)) / covid19_df.shift(1)

countries_df = df.groupby([df.index.get_level_values(0), 'country']).agg({'cases': 'sum'})
us_df = countries_df[countries_df.index.get_level_values(1) == 'US']
us_df.index = us_df.index.droplevel(1)

covid19_df['us_proportion'] = us_df['cases'] / covid19_df['cases']
covid19_df = covid19_df.tz_localize('UTC').asfreq('C')
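
growth_1d above is the day-over-day percentage change in worldwide cases; the shift-based expression is equivalent to pandas' pct_change (a sketch producing the same column as above):

# (cases_t - cases_{t-1}) / cases_{t-1}, equivalent to the shift() expression above
covid19_df['growth_1d'] = covid19_df['cases'].pct_change()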
In [8]:
covid19_df.tail(5)
Out[8]:
cases growth_1d us_proportion
date
2020-03-09 00:00:00+00:00 325888 0.023582 0.001878
2020-03-10 00:00:00+00:00 335739 0.030228 0.002964
2020-03-11 00:00:00+00:00 348912 0.039236 0.003798
2020-03-12 00:00:00+00:00 354662 0.016480 0.004836
2020-03-13 00:00:00+00:00 378039 0.065913 0.005920
In [9]:
covid19_df.index
Out[9]:
DatetimeIndex(['2020-01-22', '2020-01-23', '2020-01-24', '2020-01-27',
               '2020-01-28', '2020-01-29', '2020-01-30', '2020-01-31',
               '2020-02-03', '2020-02-04', '2020-02-05', '2020-02-06',
               '2020-02-07', '2020-02-10', '2020-02-11', '2020-02-12',
               '2020-02-13', '2020-02-14', '2020-02-17', '2020-02-18',
               '2020-02-19', '2020-02-20', '2020-02-21', '2020-02-24',
               '2020-02-25', '2020-02-26', '2020-02-27', '2020-02-28',
               '2020-03-02', '2020-03-03', '2020-03-04', '2020-03-05',
               '2020-03-06', '2020-03-09', '2020-03-10', '2020-03-11',
               '2020-03-12', '2020-03-13'],
              dtype='datetime64[ns, UTC]', name='date', freq='C')
In [10]:
# Module Imports
# --------------------
import quantopian.optimize as opt
from quantopian.pipeline import Pipeline
from quantopian.pipeline.factors import CustomFactor, Returns
from quantopian.pipeline.data.builtin import USEquityPricing
from quantopian.pipeline.data.morningstar import Fundamentals

from quantopian.pipeline.filters import QTradableStocksUS

import numpy as np
import pandas as pd
from datetime import datetime
from scipy.stats import gmean

MORNINGSTAR_SECTOR_CODES = {
     -1: 'Misc',
    101: 'Basic Materials',
    102: 'Consumer Cyclical',
    103: 'Financial Services',
    104: 'Real Estate',
    205: 'Consumer Defensive',
    206: 'Healthcare',
    207: 'Utilities',
    308: 'Communication Services',
    309: 'Energy',
    310: 'Industrials',
    311: 'Technology',
}


# Environment Settings
# --------------------
## Production 
universe = QTradableStocksUS()
mask = {'mask': universe}

## Development
# universe = QTradableStocksUS()
# mask = {'mask': universe}


# Global Configuration
# --------------------

# None, 'industry', 'sector'
SCALE_BY = 'sector'
PIPE_NORMALIZE = True
CLIP_OUTLIERS = False
CLIP_THRESHOLD =  0.025

# When true, for each day, get some % of shorts and some % of longs
USE_EXTREMES = False
EXTREMES_BOTTOM = 0.03
EXTREMES_TOP = 0.07
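
These flags are declared but not exercised in this notebook. A sketch of how they would typically be applied to a per-day factor Series (the helper below is hypothetical, not part of the original pipeline):

def postprocess_factor(factor):
    # Hypothetical helper illustrating the configuration flags above
    if CLIP_OUTLIERS:
        # Winsorize at the CLIP_THRESHOLD / (1 - CLIP_THRESHOLD) quantiles
        lower, upper = factor.quantile([CLIP_THRESHOLD, 1 - CLIP_THRESHOLD])
        factor = factor.clip(lower, upper)
    if USE_EXTREMES:
        # Keep only the bottom EXTREMES_BOTTOM fraction (shorts)
        # and the top EXTREMES_TOP fraction (longs) each day
        shorts = factor <= factor.quantile(EXTREMES_BOTTOM)
        longs = factor >= factor.quantile(1 - EXTREMES_TOP)
        factor = factor.where(shorts | longs)
    return factor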
In [11]:
from quantopian.research import run_pipeline

start_date = datetime.strptime('01/22/2020', '%m/%d/%Y')
end_date = datetime.strptime('03/13/2020', '%m/%d/%Y')

def normalize(data):
    """ Normalize long/short positions
    """
    result = pd.Series(data)
    result = result - np.nanmean(data)
        
    denom = result.abs().sum()
    if denom == 0:
        denom = 1
    
    return result / denom
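
# Example (hypothetical): normalize(pd.Series([1., 2., 3.])) demeans to [-1, 0, 1]
# and divides by the absolute sum (2), giving weights [-0.5, 0.0, 0.5], so the
# long and short sides each carry 0.5 of absolute exposure.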


def make_alpha_factors():
    factors = {}
    # Create factors here
    

    # Not an alpha factor, but useful for later in the grouped tear sheet analysis
    factors['sector'] = Fundamentals.morningstar_sector_code.latest
    
    return factors
                                        

def make_pipeline():
    alpha_factors = make_alpha_factors()
    columns = {a: alpha_factors[a] for a in alpha_factors}
    
    columns['return_d2'] = Returns(window_length=2, **mask)
    columns['return_d3'] = Returns(window_length=3, **mask)
    columns['return_d4'] = Returns(window_length=4, **mask)
    columns['return_d5'] = Returns(window_length=5, **mask)
    
    pipe = Pipeline(columns=columns, screen=universe)
    return pipe


pipe = make_pipeline()
mdf = run_pipeline(pipe, start_date, end_date).dropna(how='all')
mdf.head(5)

Pipeline Execution Time: 4.13 Seconds
Out[11]:
return_d2 return_d3 return_d4 return_d5 sector
2020-01-22 00:00:00+00:00 Equity(2 [ARNC]) 0.000171 -0.022337 0.004797 -0.008956 310
Equity(24 [AAPL]) -0.006370 0.004664 0.016501 0.012632 311
Equity(41 [ARCB]) -0.025454 -0.048157 -0.035752 -0.028911 310
Equity(52 [ABM]) -0.007541 -0.022772 0.004325 0.004580 310
Equity(53 [ABMD]) 0.047330 0.046897 0.067169 0.099172 206
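
As a reading aid: Returns(window_length=N) measures the percent change in close price over a trailing window of N bars, so return_d2 is roughly a 1-day return as of each pipeline date. The analogous quantity in plain pandas would be (a sketch, with a hypothetical closes DataFrame of dates x assets):

def trailing_returns(closes, window_length):
    # Percent change from the first to the last close in an N-bar window
    return closes.pct_change(periods=window_length - 1)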

1. Grouped by Sectors

In [13]:
grouped_mdf = mdf.groupby([mdf.index.get_level_values(0), 'sector']) \
              .agg({'return_d2': 'mean',
                    'return_d3': 'mean',
                    'return_d4': 'mean',
                    'return_d5': 'mean',})
gjoint_mdf = pd.merge(grouped_mdf.reset_index(), covid19_df.reset_index(),
                     left_on='level_0', right_on='date', how='inner') \
              .set_index(['level_0'])
gjoint_mdf.index.names = [None]
gjoint_mdf.head(5)
Out[13]:
sector return_d5 return_d4 return_d3 return_d2 date cases growth_1d us_proportion
2020-01-22 00:00:00+00:00 101 -0.006931 -0.014206 -0.014190 -0.009719 2020-01-22 00:00:00+00:00 1194 NaN 0.000838
2020-01-22 00:00:00+00:00 102 0.006954 0.005280 -0.007810 -0.007968 2020-01-22 00:00:00+00:00 1194 NaN 0.000838
2020-01-22 00:00:00+00:00 103 0.004891 0.005975 -0.005693 -0.008490 2020-01-22 00:00:00+00:00 1194 NaN 0.000838
2020-01-22 00:00:00+00:00 104 0.023148 0.014086 0.007540 0.006535 2020-01-22 00:00:00+00:00 1194 NaN 0.000838
2020-01-22 00:00:00+00:00 205 0.002305 0.000335 -0.007134 -0.003571 2020-01-22 00:00:00+00:00 1194 NaN 0.000838
In [14]:
pal = sns.color_palette('Paired', len(MORNINGSTAR_SECTOR_CODES.items()))

def draw_plot(x, y, hue, data, ax, pal, legend=False):
    sector_keys = sorted(MORNINGSTAR_SECTOR_CODES.keys())
    legend_elements = []
    for i in range(len(sector_keys)):
        df = data[data[hue] == sector_keys[i]]
        color = pal[i]
        if df.shape[0] > 0:
            facet = sns.regplot(x=x, y=y, data=df, color=color, ax=ax)
            facet.set(axis_bgcolor='grey')
            if legend:
                legend_elements.append(
                    plt.Line2D([0], [0], marker='o', color='w',
                               label=MORNINGSTAR_SECTOR_CODES[sector_keys[i]],
                               markerfacecolor=color, markersize=10)
                )
    if legend:
        ax.legend(handles=legend_elements, bbox_to_anchor=(-0.15, 1.3),
                  ncol=3, loc='upper center')

fig = plt.figure(figsize=(10, 10))
ax1 = fig.add_subplot(2,2,1)
draw_plot(x='growth_1d', y='return_d2', hue='sector', data=gjoint_mdf, ax=ax1, pal=pal)
ax2 = fig.add_subplot(2,2,2)
draw_plot(x='growth_1d', y='return_d3', hue='sector', data=gjoint_mdf, ax=ax2, pal=pal, legend=True)
ax3 = fig.add_subplot(2,2,3)
draw_plot(x='growth_1d', y='return_d4', hue='sector', data=gjoint_mdf, ax=ax3, pal=pal)
ax4 = fig.add_subplot(2,2,4)
draw_plot(x='growth_1d', y='return_d5', hue='sector', data=gjoint_mdf, ax=ax4, pal=pal)
In [15]:
fig = plt.figure(figsize=(10, 10))
ax1 = fig.add_subplot(2,2,1)
draw_plot(x='us_proportion', y='return_d2', hue='sector', data=gjoint_mdf, ax=ax1, pal=pal)
ax2 = fig.add_subplot(2,2,2)
draw_plot(x='us_proportion', y='return_d3', hue='sector', data=gjoint_mdf, ax=ax2, pal=pal, legend=True)
ax3 = fig.add_subplot(2,2,3)
draw_plot(x='us_proportion', y='return_d4', hue='sector', data=gjoint_mdf, ax=ax3, pal=pal)
ax4 = fig.add_subplot(2,2,4)
draw_plot(x='us_proportion', y='return_d5', hue='sector', data=gjoint_mdf, ax=ax4, pal=pal)

Conclusion:

Interestingly, there is a negative correlation between the US's proportion of worldwide COVID-19 cases and future returns. This correlation was not seen with the growth_1d factor, since the larger growth rates occurred on later dates.
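
One way to put numbers on the regression plots is to look at the correlations directly (a quick sketch over the columns defined above):

return_cols = ['return_d2', 'return_d3', 'return_d4', 'return_d5']
# Pearson correlation of each return horizon with the two COVID-19 factors
print(gjoint_mdf[return_cols + ['growth_1d', 'us_proportion']].corr()
      .loc[return_cols, ['growth_1d', 'us_proportion']])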

2. Ungrouped

In [16]:
joint_mdf = pd.merge(mdf.reset_index(), covid19_df.reset_index(),
                     left_on='level_0', right_on='date', how='inner') \
              .set_index(['level_0', 'level_1'])
joint_mdf.index.names = [None, None]
joint_mdf.head(5)
Out[16]:
return_d2 return_d3 return_d4 return_d5 sector date cases growth_1d us_proportion
2020-01-22 00:00:00+00:00 Equity(2 [ARNC]) 0.000171 -0.022337 0.004797 -0.008956 310 2020-01-22 00:00:00+00:00 1194 NaN 0.000838
Equity(24 [AAPL]) -0.006370 0.004664 0.016501 0.012632 311 2020-01-22 00:00:00+00:00 1194 NaN 0.000838
Equity(41 [ARCB]) -0.025454 -0.048157 -0.035752 -0.028911 310 2020-01-22 00:00:00+00:00 1194 NaN 0.000838
Equity(52 [ABM]) -0.007541 -0.022772 0.004325 0.004580 310 2020-01-22 00:00:00+00:00 1194 NaN 0.000838
Equity(53 [ABMD]) 0.047330 0.046897 0.067169 0.099172 206 2020-01-22 00:00:00+00:00 1194 NaN 0.000838
In [17]:
fig = plt.figure(figsize=(10, 10))
ax1 = fig.add_subplot(2,2,1)
sns.regplot(x='growth_1d', y='return_d2', data=joint_mdf, ax=ax1)
ax2 = fig.add_subplot(2,2,2)
sns.regplot(x='growth_1d', y='return_d3', data=joint_mdf, ax=ax2)
ax3 = fig.add_subplot(2,2,3)
sns.regplot(x='growth_1d', y='return_d4', data=joint_mdf, ax=ax3)
ax4 = fig.add_subplot(2,2,4)
sns.regplot(x='growth_1d', y='return_d5', data=joint_mdf, ax=ax4);
In [18]:
fig = plt.figure(figsize=(10, 10))
ax1 = fig.add_subplot(2,2,1)
sns.regplot(x='us_proportion', y='return_d2', data=joint_mdf, ax=ax1)
ax2 = fig.add_subplot(2,2,2)
sns.regplot(x='us_proportion', y='return_d3', data=joint_mdf, ax=ax2)
ax3 = fig.add_subplot(2,2,3)
sns.regplot(x='us_proportion', y='return_d4', data=joint_mdf, ax=ax3)
ax4 = fig.add_subplot(2,2,4)
sns.regplot(x='us_proportion', y='return_d5', data=joint_mdf, ax=ax4);

Conclusion

Again, the same negative correlation with returns can be seen for the us_proportion factor. It could be a potential source of alpha if we can find a way to combine it with other factors.

Weird Stuff

There is an outlier with a 200%+ return. I went and checked: it comes from TT (Trane Technologies PLC?), but the value differs from the stock's actual return on that date.
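
One way to sanity-check the outlier is to pull raw pricing for TT around that date and recompute the return by hand (a sketch using the research environment's get_pricing; the date window here is just a few days around 2020-03-04):

tt_prices = get_pricing('TT', start_date='2020-02-28', end_date='2020-03-06',
                        fields='close_price')
print(tt_prices.pct_change())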

In [19]:
joint_mdf.sort_values(by=['return_d2'], ascending=False).head(5)
Out[19]:
return_d2 return_d3 return_d4 return_d5 sector date cases growth_1d us_proportion
2020-03-04 00:00:00+00:00 Equity(4010 [TT]) 2.234737 -0.176036 -0.200588 -0.236417 310 2020-03-04 00:00:00+00:00 284172 0.029709 0.000588
2020-03-03 00:00:00+00:00 Equity(45799 [KPTI]) 0.707512 0.799481 0.930386 0.911755 206 2020-03-03 00:00:00+00:00 275973 0.030042 0.000478
2020-02-21 00:00:00+00:00 Equity(26286 [STMP]) 0.656463 0.716381 0.781599 0.842682 311 2020-02-21 00:00:00+00:00 195114 0.013079 0.000103
2020-03-11 00:00:00+00:00 Equity(41416 [KOS]) 0.630137 -0.385013 -0.510288 -0.541426 309 2020-03-11 00:00:00+00:00 348912 0.039236 0.003798
2020-03-04 00:00:00+00:00 Equity(38827 [OMER]) 0.552569 0.644891 0.654591 0.542812 206 2020-03-04 00:00:00+00:00 284172 0.029709 0.000588
In [20]:
alphas_view = joint_mdf.copy().loc['2020-01-23':]
alphas_view = alphas_view.drop(['return_d2', 'return_d3',
                                'return_d4', 'return_d5',
                                'date'], axis=1)
alphas_view.head(5)
Out[20]:
sector cases growth_1d us_proportion
2020-01-23 00:00:00+00:00 Equity(2 [ARNC]) 310 1391 0.164992 0.000719
Equity(24 [AAPL]) 311 1391 0.164992 0.000719
Equity(41 [ARCB]) 310 1391 0.164992 0.000719
Equity(52 [ABM]) 310 1391 0.164992 0.000719
Equity(53 [ABMD]) 206 1391 0.164992 0.000719
In [21]:
alphas_view.hist();
In [ ]: