
COVID-19 growth analysis

In [1]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib

Load data

Download the CSV file and upload it to your data directory, then load it with local_csv.
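local_csv is specific to the Quantopian research environment. Outside of it, the same file could be loaded with plain pandas (a minimal sketch, assuming the file sits in the working directory):

import pandas as pd

# Read the CSV and parse the first column as a datetime index,
# matching what the cell below does with local_csv + pd.to_datetime
df = pd.read_csv('covid19_March_13_2020.csv', index_col=0, parse_dates=True)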

In [2]:
df = local_csv('covid19_March_13_2020.csv', index_col=0)
df.index = pd.to_datetime(df.index)
df.head()
Out[2]:
country state type cases
date
2020-01-22 Thailand NaN confirmed 2
2020-01-23 Thailand NaN confirmed 3
2020-01-24 Thailand NaN confirmed 5
2020-01-25 Thailand NaN confirmed 7
2020-01-26 Thailand NaN confirmed 8

Preprocess data

In [3]:
df_confirmed = df.loc[lambda x: x.type == 'confirmed']

# Estimated critical cases
p_crit = .05
df_confirmed = df_confirmed.assign(cases_crit=df_confirmed.cases*p_crit)

# Compute days relative to the date each country crossed 100 confirmed cases
df_confirmed.loc[:, 'days_since_100'] = np.nan
for country in df_confirmed.country.unique():
    df_confirmed.loc[(df_confirmed.country == country), 'days_since_100'] = \
        np.arange(-len(df_confirmed.loc[(df_confirmed.country == country) & (df_confirmed.cases < 100)]), 
                  len(df_confirmed.loc[(df_confirmed.country == country) & (df_confirmed.cases >= 100)]))
    
annotate_kwargs = dict(
    s='Based on COVID Data Repository by Johns Hopkins CSSE ({})\nBy Thomas Wiecki'.format(df_confirmed.index.max().strftime('%B %d, %Y')), 
    xy=(0.05, 0.01), xycoords='figure fraction', fontsize=10)
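
The loop above assigns, per country, negative day counts before the 100-case threshold is reached and 0, 1, 2, ... afterwards. The same result can be sketched with a groupby (assuming, as the loop does, that rows are date-sorted within each country):

def add_days_since_100(g):
    # The number of rows below the threshold determines where the count starts
    below = int((g['cases'] < 100).sum())
    g = g.copy()
    g['days_since_100'] = np.arange(-below, len(g) - below)
    return g

df_confirmed = df_confirmed.groupby('country', group_keys=False).apply(add_days_since_100)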

Select countries of interest

In [4]:
european_countries = ['Italy', 'Germany', 'France (total)', 'Spain', 'United Kingdom (total)', 
                      'Iran']
large_engl_countries = ['US (total)', 'Canada (total)', 'Australia (total)']
asian_countries = ['Singapore', 'Japan', 'Korea, South', 'Hong Kong']
south_american_countries = ['Argentina', 'Brazil', 'Colombia', 'Chile']

country_groups = [european_countries, large_engl_countries, asian_countries, south_american_countries]
line_styles = ['-', ':', '--', '-.']
In [5]:
def plot_countries(df, countries, min_cases=100, ls='-', col='cases'):
    for country in countries:
        df_country = df.loc[(df.country == country) & (df.cases >= min_cases)]
        if len(df_country) == 0:
            continue
        df_country.reset_index()[col].plot(label=country, ls=ls)
        
sns.set_palette(sns.hls_palette(10, l=.45, s=.8))  # 10 distinct hues; colors cycle across country groups
fig, ax = plt.subplots(figsize=(12, 8))

for countries, ls in zip(country_groups, line_styles):
    plot_countries(df_confirmed, countries, ls=ls)

x = np.linspace(0, plt.xlim()[1] - 1)
ax.plot(x, 100 * (1.33) ** x, ls='--', color='k', label='33% daily growth')

ax.set(yscale='log',
       title='Exponential growth of COVID-19 across countries',
       xlabel='Days from first 100 confirmed cases',
       ylabel='Confirmed cases (log scale)')
#ax.get_yaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter())
ax.legend(loc=0)
ax.annotate(**annotate_kwargs)
sns.despine();
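
For reference, the dashed line plots 100 * 1.33**x, i.e. 33% daily growth starting from 100 cases, which corresponds to a doubling time of log(2) / log(1.33) ≈ 2.4 days:

doubling_time = np.log(2) / np.log(1.33)
print('Doubling time at 33% daily growth: {:.2f} days'.format(doubling_time))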
In [6]:
fig, ax = plt.subplots(figsize=(12, 8))

for countries, ls in zip(country_groups, line_styles):
    plot_countries(df_confirmed, countries, ls=ls)

x = np.linspace(0, plt.xlim()[1] - 1)
ax.plot(x, 100 * (1.33) ** x, ls='--', color='k', label='33% daily growth')

ax.set(title='Exponential growth of COVID-19 across countries',
       xlabel='Days from first 100 confirmed cases',
       ylabel='Confirmed cases', ylim=(0, 30000))
ax.legend(loc=0)
ax.annotate(**annotate_kwargs)
sns.despine();
In [7]:
covid19_df = df.groupby(level=0).agg({'cases': 'sum'})
covid19_df['growth_1d'] = (covid19_df - covid19_df.shift(1)) / covid19_df.shift(1)

countries_df = df.groupby([df.index.get_level_values(0), 'country']).agg({'cases': 'sum'})
us_df = countries_df[countries_df.index.get_level_values(1) == 'US']
us_df.index = us_df.index.droplevel(1)

covid19_df['us_proportion'] = us_df['cases'] / covid19_df['cases']
covid19_df = covid19_df.tz_localize('UTC').asfreq('C')
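
growth_1d above is the day-over-day percentage change in worldwide cases; the shift-based expression is equivalent to pandas' pct_change (a sketch producing the same column as above):

# (cases_t - cases_{t-1}) / cases_{t-1}, equivalent to the shift() expression above
covid19_df['growth_1d'] = covid19_df['cases'].pct_change()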
In [8]:
covid19_df.tail(5)
Out[8]:
cases growth_1d us_proportion
date
2020-03-09 00:00:00+00:00 325888 0.023582 0.001878
2020-03-10 00:00:00+00:00 335739 0.030228 0.002964
2020-03-11 00:00:00+00:00 348912 0.039236 0.003798
2020-03-12 00:00:00+00:00 354662 0.016480 0.004836
2020-03-13 00:00:00+00:00 378039 0.065913 0.005920
In [9]:
covid19_df.index
Out[9]:
DatetimeIndex(['2020-01-22', '2020-01-23', '2020-01-24', '2020-01-27',
               '2020-01-28', '2020-01-29', '2020-01-30', '2020-01-31',
               '2020-02-03', '2020-02-04', '2020-02-05', '2020-02-06',
               '2020-02-07', '2020-02-10', '2020-02-11', '2020-02-12',
               '2020-02-13', '2020-02-14', '2020-02-17', '2020-02-18',
               '2020-02-19', '2020-02-20', '2020-02-21', '2020-02-24',
               '2020-02-25', '2020-02-26', '2020-02-27', '2020-02-28',
               '2020-03-02', '2020-03-03', '2020-03-04', '2020-03-05',
               '2020-03-06', '2020-03-09', '2020-03-10', '2020-03-11',
               '2020-03-12', '2020-03-13'],
              dtype='datetime64[ns, UTC]', name='date', freq='C')
In [10]:
# Module Imports
# --------------------
import quantopian.optimize as opt
from quantopian.pipeline import Pipeline
from quantopian.pipeline.factors import CustomFactor, Returns
from quantopian.pipeline.data.builtin import USEquityPricing
from quantopian.pipeline.data.morningstar import Fundamentals

from quantopian.pipeline.filters import QTradableStocksUS

import numpy as np
import pandas as pd
from datetime import datetime
from scipy.stats import gmean

MORNINGSTAR_SECTOR_CODES = {
     -1: 'Misc',
    101: 'Basic Materials',
    102: 'Consumer Cyclical',
    103: 'Financial Services',
    104: 'Real Estate',
    205: 'Consumer Defensive',
    206: 'Healthcare',
    207: 'Utilities',
    308: 'Communication Services',
    309: 'Energy',
    310: 'Industrials',
    311: 'Technology',
}


# Environment Settings
# --------------------
## Production 
universe = QTradableStocksUS()
mask = {'mask': universe}

## Development
# universe = QTradableStocksUS()
# mask = {'mask': universe}


# Global Configuration
# --------------------

# None, 'industry', 'sector'
SCALE_BY = 'sector'
PIPE_NORMALIZE = True
CLIP_OUTLIERS = False
CLIP_THRESHOLD =  0.025

# When true, for each day, get some % of shorts and some % of longs
USE_EXTREMES = False
EXTREMES_BOTTOM = 0.03
EXTREMES_TOP = 0.07
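
These flags are declared but not exercised in this notebook. A sketch of how they would typically be applied to a per-day factor Series (the helper below is hypothetical, not part of the original pipeline):

def postprocess_factor(factor):
    # Hypothetical helper illustrating the configuration flags above
    if CLIP_OUTLIERS:
        # Winsorize at the CLIP_THRESHOLD / (1 - CLIP_THRESHOLD) quantiles
        lower, upper = factor.quantile([CLIP_THRESHOLD, 1 - CLIP_THRESHOLD])
        factor = factor.clip(lower, upper)
    if USE_EXTREMES:
        # Keep only the bottom EXTREMES_BOTTOM fraction (shorts)
        # and the top EXTREMES_TOP fraction (longs) each day
        shorts = factor <= factor.quantile(EXTREMES_BOTTOM)
        longs = factor >= factor.quantile(1 - EXTREMES_TOP)
        factor = factor.where(shorts | longs)
    return factor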
In [11]:
from quantopian.research import run_pipeline

start_date = datetime.strptime('01/22/2020', '%m/%d/%Y')
end_date = datetime.strptime('03/13/2020', '%m/%d/%Y')

def normalize(data):
    """ Normalize long/short positions
    """
    result = pd.Series(data)
    result = result - np.nanmean(data)
        
    denom = result.abs().sum()
    if denom == 0:
        denom = 1
    
    return result / denom
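
# Example (hypothetical): normalize(pd.Series([1., 2., 3.])) demeans to [-1, 0, 1]
# and divides by the absolute sum (2), giving weights [-0.5, 0.0, 0.5], so the
# long and short sides each carry 0.5 of absolute exposure.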


def make_alpha_factors():
    factors = {}
    # Create factors here
    

    # Not an alpha factor, but useful for later in the grouped tear sheet analysis
    factors['sector'] = Fundamentals.morningstar_sector_code.latest
    
    return factors
                                        

def make_pipeline():
    alpha_factors = make_alpha_factors()
    columns = {a: alpha_factors[a] for a in alpha_factors}
    
    columns['return_d2'] = Returns(window_length=2, **mask)
    columns['return_d3'] = Returns(window_length=3, **mask)
    columns['return_d4'] = Returns(window_length=4, **mask)
    columns['return_d5'] = Returns(window_length=5, **mask)
    
    pipe = Pipeline(columns=columns, screen=universe)
    return pipe


pipe = make_pipeline()
mdf = run_pipeline(pipe, start_date, end_date).dropna(how='all')
mdf.head(5)

Pipeline Execution Time: 4.13 Seconds
Out[11]:
return_d2 return_d3 return_d4 return_d5 sector
2020-01-22 00:00:00+00:00 Equity(2 [ARNC]) 0.000171 -0.022337 0.004797 -0.008956 310
Equity(24 [AAPL]) -0.006370 0.004664 0.016501 0.012632 311
Equity(41 [ARCB]) -0.025454 -0.048157 -0.035752 -0.028911 310
Equity(52 [ABM]) -0.007541 -0.022772 0.004325 0.004580 310
Equity(53 [ABMD]) 0.047330 0.046897 0.067169 0.099172 206
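
As a reading aid: Returns(window_length=N) measures the percent change in close price over a trailing window of N bars, so return_d2 is roughly a 1-day return as of each pipeline date. The analogous quantity in plain pandas would be (a sketch, with a hypothetical closes DataFrame of dates x assets):

def trailing_returns(closes, window_length):
    # Percent change from the first to the last close in an N-bar window
    return closes.pct_change(periods=window_length - 1)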

1. Grouped by Sectors

In [13]:
grouped_mdf = mdf.groupby([mdf.index.get_level_values(0), 'sector']) \
              .agg({'return_d2': 'mean',
                    'return_d3': 'mean',
                    'return_d4': 'mean',
                    'return_d5': 'mean',})
gjoint_mdf = pd.merge(grouped_mdf.reset_index(), covid19_df.reset_index(),
                     left_on='level_0', right_on='date', how='inner') \
              .set_index(['level_0'])
gjoint_mdf.index.names = [None]
gjoint_mdf.head(5)
Out[13]:
sector return_d5 return_d4 return_d3 return_d2 date cases growth_1d us_proportion
2020-01-22 00:00:00+00:00 101 -0.006931 -0.014206 -0.014190 -0.009719 2020-01-22 00:00:00+00:00 1194 NaN 0.000838
2020-01-22 00:00:00+00:00 102 0.006954 0.005280 -0.007810 -0.007968 2020-01-22 00:00:00+00:00 1194 NaN 0.000838
2020-01-22 00:00:00+00:00 103 0.004891 0.005975 -0.005693 -0.008490 2020-01-22 00:00:00+00:00 1194 NaN 0.000838
2020-01-22 00:00:00+00:00 104 0.023148 0.014086 0.007540 0.006535 2020-01-22 00:00:00+00:00 1194 NaN 0.000838
2020-01-22 00:00:00+00:00 205 0.002305 0.000335 -0.007134 -0.003571 2020-01-22 00:00:00+00:00 1194 NaN 0.000838
In [14]:
pal = sns.color_palette('Paired', len(MORNINGSTAR_SECTOR_CODES.items()))

def draw_plot(x, y, hue, data, ax, pal, legend=False):
    sector_keys = sorted(MORNINGSTAR_SECTOR_CODES.keys())
    legend_elements = []
    for i in range(len(sector_keys)):
        df = data[data[hue] == sector_keys[i]]
        color = pal[i]
        if df.shape[0] > 0:
            facet = sns.regplot(x=x, y=y, data=df, color=color, ax=ax)
            facet.set(axis_bgcolor='grey')
            if legend:
                legend_elements.append(
                    plt.Line2D([0], [0], marker='o', color='w',
                               label=MORNINGSTAR_SECTOR_CODES[sector_keys[i]],
                               markerfacecolor=color, markersize=10)
                )
    if legend:
        ax.legend(handles=legend_elements, bbox_to_anchor=(-0.15, 1.3),
                  ncol=3, loc='upper center')

fig = plt.figure(figsize=(10, 10))
ax1 = fig.add_subplot(2,2,1)
draw_plot(x='growth_1d', y='return_d2', hue='sector', data=gjoint_mdf, ax=ax1, pal=pal)
ax2 = fig.add_subplot(2,2,2)
draw_plot(x='growth_1d', y='return_d3', hue='sector', data=gjoint_mdf, ax=ax2, pal=pal, legend=True)
ax3 = fig.add_subplot(2,2,3)
draw_plot(x='growth_1d', y='return_d4', hue='sector', data=gjoint_mdf, ax=ax3, pal=pal)
ax4 = fig.add_subplot(2,2,4)
draw_plot(x='growth_1d', y='return_d5', hue='sector', data=gjoint_mdf, ax=ax4, pal=pal)
In [15]:
fig = plt.figure(figsize=(10, 10))
ax1 = fig.add_subplot(2,2,1)
draw_plot(x='us_proportion', y='return_d2', hue='sector', data=gjoint_mdf, ax=ax1, pal=pal)
ax2 = fig.add_subplot(2,2,2)
draw_plot(x='us_proportion', y='return_d3', hue='sector', data=gjoint_mdf, ax=ax2, pal=pal, legend=True)
ax3 = fig.add_subplot(2,2,3)
draw_plot(x='us_proportion', y='return_d4', hue='sector', data=gjoint_mdf, ax=ax3, pal=pal)
ax4 = fig.add_subplot(2,2,4)
draw_plot(x='us_proportion', y='return_d5', hue='sector', data=gjoint_mdf, ax=ax4, pal=pal)

Conclusion:

Interestingly, there is a negative correlation between the US's proportion of worldwide COVID-19 cases and future returns. This correlation was not seen with the growth_1d factor, since the larger growth rates occurred on later dates.
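
One way to put numbers on the regression plots is to look at the correlations directly (a quick sketch over the columns defined above):

return_cols = ['return_d2', 'return_d3', 'return_d4', 'return_d5']
# Pearson correlation of each return horizon with the two COVID-19 factors
print(gjoint_mdf[return_cols + ['growth_1d', 'us_proportion']].corr()
      .loc[return_cols, ['growth_1d', 'us_proportion']])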

2. Ungrouped

In [16]:
joint_mdf = pd.merge(mdf.reset_index(), covid19_df.reset_index(),
                     left_on='level_0', right_on='date', how='inner') \
              .set_index(['level_0', 'level_1'])
joint_mdf.index.names = [None, None]
joint_mdf.head(5)
Out[16]:
return_d2 return_d3 return_d4 return_d5 sector date cases growth_1d us_proportion
2020-01-22 00:00:00+00:00 Equity(2 [ARNC]) 0.000171 -0.022337 0.004797 -0.008956 310 2020-01-22 00:00:00+00:00 1194 NaN 0.000838
Equity(24 [AAPL]) -0.006370 0.004664 0.016501 0.012632 311 2020-01-22 00:00:00+00:00 1194 NaN 0.000838
Equity(41 [ARCB]) -0.025454 -0.048157 -0.035752 -0.028911 310 2020-01-22 00:00:00+00:00 1194 NaN 0.000838
Equity(52 [ABM]) -0.007541 -0.022772 0.004325 0.004580 310 2020-01-22 00:00:00+00:00 1194 NaN 0.000838
Equity(53 [ABMD]) 0.047330 0.046897 0.067169 0.099172 206 2020-01-22 00:00:00+00:00 1194 NaN 0.000838
In [17]:
fig = plt.figure(figsize=(10, 10))
ax1 = fig.add_subplot(2,2,1)
sns.regplot(x='growth_1d', y='return_d2', data=joint_mdf, ax=ax1)
ax2 = fig.add_subplot(2,2,2)
sns.regplot(x='growth_1d', y='return_d3', data=joint_mdf, ax=ax2)
ax3 = fig.add_subplot(2,2,3)
sns.regplot(x='growth_1d', y='return_d4', data=joint_mdf, ax=ax3)
ax4 = fig.add_subplot(2,2,4)
sns.regplot(x='growth_1d', y='return_d5', data=joint_mdf, ax=ax4);
In [18]:
fig = plt.figure(figsize=(10, 10))
ax1 = fig.add_subplot(2,2,1)
sns.regplot(x='us_proportion', y='return_d2', data=joint_mdf, ax=ax1)
ax2 = fig.add_subplot(2,2,2)
sns.regplot(x='us_proportion', y='return_d3', data=joint_mdf, ax=ax2)
ax3 = fig.add_subplot(2,2,3)
sns.regplot(x='us_proportion', y='return_d4', data=joint_mdf, ax=ax3)
ax4 = fig.add_subplot(2,2,4)
sns.regplot(x='us_proportion', y='return_d5', data=joint_mdf, ax=ax4);

Conclusion

Again, the same negative correlation with returns can be seen for the us_proportion factor. It could be a potential source of alpha if we can find a way to combine it with other factors.

Weird Stuff

There is an outlier with a 200%+ return. I went and checked: it comes from TT (Trane Technologies PLC?), but the value differs from the stock's actual return on that date.
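
One way to sanity-check the outlier is to pull raw pricing for TT around that date and recompute the return by hand (a sketch using the research environment's get_pricing; the date window here is just a few days around 2020-03-04):

tt_prices = get_pricing('TT', start_date='2020-02-28', end_date='2020-03-06',
                        fields='close_price')
print(tt_prices.pct_change())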

In [19]:
joint_mdf.sort_values(by=['return_d2'], ascending=False).head(5)
Out[19]:
return_d2 return_d3 return_d4 return_d5 sector date cases growth_1d us_proportion
2020-03-04 00:00:00+00:00 Equity(4010 [TT]) 2.234737 -0.176036 -0.200588 -0.236417 310 2020-03-04 00:00:00+00:00 284172 0.029709 0.000588
2020-03-03 00:00:00+00:00 Equity(45799 [KPTI]) 0.707512 0.799481 0.930386 0.911755 206 2020-03-03 00:00:00+00:00 275973 0.030042 0.000478
2020-02-21 00:00:00+00:00 Equity(26286 [STMP]) 0.656463 0.716381 0.781599 0.842682 311 2020-02-21 00:00:00+00:00 195114 0.013079 0.000103
2020-03-11 00:00:00+00:00 Equity(41416 [KOS]) 0.630137 -0.385013 -0.510288 -0.541426 309 2020-03-11 00:00:00+00:00 348912 0.039236 0.003798
2020-03-04 00:00:00+00:00 Equity(38827 [OMER]) 0.552569 0.644891 0.654591 0.542812 206 2020-03-04 00:00:00+00:00 284172 0.029709 0.000588
In [20]:
alphas_view = joint_mdf.copy().loc['2020-01-23':]
alphas_view = alphas_view.drop(['return_d2', 'return_d3',
                                'return_d4', 'return_d5',
                                'date'], axis=1)
alphas_view.head(5)
Out[20]:
sector cases growth_1d us_proportion
2020-01-23 00:00:00+00:00 Equity(2 [ARNC]) 310 1391 0.164992 0.000719
Equity(24 [AAPL]) 311 1391 0.164992 0.000719
Equity(41 [ARCB]) 310 1391 0.164992 0.000719
Equity(52 [ABM]) 310 1391 0.164992 0.000719
Equity(53 [ABMD]) 206 1391 0.164992 0.000719
In [21]:
alphas_view.hist();
In [ ]: