import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
# Load the COVID-19 snapshot (dated March 13, 2020) with `local_csv`
# (Quantopian research helper); the first CSV column becomes the index.
df = local_csv('covid19_March_13_2020.csv', index_col=0)
# The index holds observation dates as strings; parse to a DatetimeIndex
# so date slicing/resampling below works.
df.index = pd.to_datetime(df.index)
df.head()
# Keep only rows of type 'confirmed' (the dataset also carries other
# row types, e.g. deaths/recovered).
df_confirmed = df.loc[lambda x: x.type == 'confirmed']
# Estimated critical cases (assumed 5% of confirmed).
p_crit = .05
df_confirmed = df_confirmed.assign(cases_crit=df_confirmed.cases*p_crit)
# Compute days relative to when 100 confirmed cases was crossed:
# negative before the threshold, 0.. once at/above it. Assumes rows for
# each country are sorted chronologically — TODO confirm upstream sort.
df_confirmed.loc[:, 'days_since_100'] = np.nan
for country in df_confirmed.country.unique():
    # Hoist the per-country mask; the original rebuilt this boolean
    # Series three times per iteration.
    in_country = df_confirmed.country == country
    n_below = int((in_country & (df_confirmed.cases < 100)).sum())
    n_above = int((in_country & (df_confirmed.cases >= 100)).sum())
    # Array assignment aligns positionally with the masked rows,
    # exactly as the original two-len() arange did.
    df_confirmed.loc[in_country, 'days_since_100'] = np.arange(-n_below, n_above)
# Shared kwargs for the attribution annotation stamped on each figure.
# NOTE(review): `s=` was matplotlib's annotate text kwarg in old
# versions; it was renamed `text` and removed in matplotlib 3.3 —
# confirm the runtime matplotlib version.
annotate_kwargs = dict(
    s='Based on COVID Data Repository by Johns Hopkins CSSE ({})\nBy Thomas Wiecki'.format(df_confirmed.index.max().strftime('%B %d, %Y')),
    xy=(0.05, 0.01), xycoords='figure fraction', fontsize=10)
# Country groupings used only for plotting; "(total)" names are the
# dataset's country-level aggregates of per-province rows.
# NOTE(review): Iran is listed in the "european" group — presumably just
# a plotting-convenience grouping, not a geographic claim.
european_countries = ['Italy', 'Germany', 'France (total)', 'Spain', 'United Kingdom (total)',
                      'Iran']
large_engl_countries = ['US (total)', 'Canada (total)', 'Australia (total)']
asian_countries = ['Singapore', 'Japan', 'Korea, South', 'Hong Kong']
south_american_countries = ['Argentina', 'Brazil', 'Colombia', 'Chile']
# One line style per group so groups are distinguishable on shared axes.
country_groups = [european_countries, large_engl_countries, asian_countries, south_american_countries]
line_styles = ['-', ':', '--', '-.']
def plot_countries(df, countries, min_cases=100, ls='-', col='cases'):
    """Plot `col` for each country onto the *current* matplotlib axes.

    Each country's series is plotted against 0..n-1 (rows since it first
    reached `min_cases`), not calendar dates, via reset_index().

    Parameters
    ----------
    df : pd.DataFrame
        Must have 'country' and 'cases' columns plus `col`.
    countries : iterable of str
        Country names to plot; names with no qualifying rows are skipped.
    min_cases : int
        Minimum case count for a row to be included.
    ls : str
        Matplotlib line style shared by every country in this call.
    col : str
        Column plotted on the y-axis.
    """
    for country in countries:
        df_country = df.loc[(df.country == country) & (df.cases >= min_cases)]
        # `.empty` is the idiomatic pandas emptiness check
        # (original used len(...) == 0).
        if df_country.empty:
            continue
        df_country.reset_index()[col].plot(label=country, ls=ls)
# Figure 1: log-scale trajectories of confirmed cases per country, with a
# 33%/day exponential reference line for comparison.
sns.set_palette(sns.hls_palette(10, l=.45, s=.8))  # 8 countries max
fig, ax = plt.subplots(figsize=(12, 8))
# plot_countries draws on the current axes (ax); one line style per group.
for countries, ls in zip(country_groups, line_styles):
    plot_countries(df_confirmed, countries, ls=ls)
# Reference curve: 100 cases compounding 33% per day from day 0,
# spanning the x-range the country lines produced.
x = np.linspace(0, plt.xlim()[1] - 1)
ax.plot(x, 100 * (1.33) ** x, ls='--', color='k', label='33% daily growth')
ax.set(yscale='log',
       title='Exponential growth of COVID-19 across countries',
       xlabel='Days from first 100 confirmed cases',
       ylabel='Confirmed cases (log scale)')
#ax.get_yaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter())
ax.legend(loc=0)
ax.annotate(**annotate_kwargs)
sns.despine();
# Figure 2: same plot on a linear y-axis, capped at 30,000 cases so the
# early part of the curves is readable.
fig, ax = plt.subplots(figsize=(12, 8))
for countries, ls in zip(country_groups, line_styles):
    plot_countries(df_confirmed, countries, ls=ls)
# Same 33%/day exponential reference curve as the log-scale figure.
x = np.linspace(0, plt.xlim()[1] - 1)
ax.plot(x, 100 * (1.33) ** x, ls='--', color='k', label='33% daily growth')
ax.set(title='Exponential growth of COVID-19 across countries',
       xlabel='Days from first 100 confirmed cases',
       ylabel='Confirmed cases', ylim=(0, 30000))
ax.legend(loc=0)
ax.annotate(**annotate_kwargs)
sns.despine();
# Worldwide daily case totals, summed over all countries and row types.
covid19_df = df.groupby(level=0).agg({'cases': 'sum'})
# Day-over-day growth rate. Operate on the 'cases' Series rather than
# the whole DataFrame: the original computed
# (covid19_df - covid19_df.shift(1)) / covid19_df.shift(1), which builds
# one-column DataFrames and assigns a DataFrame into a column — same
# arithmetic, fragile across pandas versions.
cases = covid19_df['cases']
covid19_df['growth_1d'] = (cases - cases.shift(1)) / cases.shift(1)
# Per-(date, country) totals, then the US slice with the country level
# dropped so it aligns on date with the worldwide frame.
countries_df = df.groupby([df.index.get_level_values(0), 'country']).agg({'cases': 'sum'})
us_df = countries_df[countries_df.index.get_level_values(1) == 'US']
us_df.index = us_df.index.droplevel(1)
# Share of worldwide cases located in the US.
covid19_df['us_proportion'] = us_df['cases'] / covid19_df['cases']
# Localize to UTC and reindex to the custom-business-day frequency ('C')
# so the index lines up with trading days from the pipeline output.
covid19_df = covid19_df.tz_localize('UTC').asfreq('C')
covid19_df.tail(5)
covid19_df.index
# Module Imports
# --------------------
import quantopian.optimize as opt
from quantopian.pipeline import Pipeline
from quantopian.pipeline.factors import CustomFactor, Returns
from quantopian.pipeline.data.builtin import USEquityPricing
from quantopian.pipeline.data.morningstar import Fundamentals
from quantopian.pipeline.filters import QTradableStocksUS
import numpy as np
import pandas as pd
from datetime import datetime
from scipy.stats import gmean
# Morningstar sector code -> human-readable sector name, used for the
# per-sector grouping and plot legends below. -1 marks securities with
# no sector classification.
MORNINGSTAR_SECTOR_CODES = {
    -1: 'Misc',
    101: 'Basic Materials',
    102: 'Consumer Cyclical',
    103: 'Financial Services',
    104: 'Real Estate',
    205: 'Consumer Defensive',
    206: 'Healthcare',
    207: 'Utilities',
    308: 'Communication Services',
    309: 'Energy',
    310: 'Industrials',
    311: 'Technology' ,
}
# Environment Settings
# --------------------
## Production
universe = QTradableStocksUS()
# Kwargs dict splatted into factor constructors (e.g. Returns(**mask)).
mask = {'mask': universe}
## Development
# universe = QTradableStocksUS()
# mask = {'mask': universe}
# Global Configuration
# --------------------
# None, 'industry', 'sector'
SCALE_BY = 'sector'
# NOTE(review): SCALE_BY / PIPE_NORMALIZE / CLIP_* / USE_EXTREMES /
# EXTREMES_* are not referenced anywhere in this chunk — presumably
# consumed by code elsewhere in the notebook; verify before removing.
PIPE_NORMALIZE = True
CLIP_OUTLIERS = False
CLIP_THRESHOLD = 0.025
# When true, for each day, get some % of shorts and some % of longs
USE_EXTREMES = False
EXTREMES_BOTTOM = 0.03
EXTREMES_TOP = 0.07
from quantopian.research import run_pipeline
# Analysis window: first date in the COVID dataset through the snapshot
# date of the CSV loaded above.
start_date = datetime.strptime('01/22/2020', '%m/%d/%Y')
end_date = datetime.strptime('03/13/2020', '%m/%d/%Y')
def normalize(data):
    """Demean *data* and scale so absolute values sum to 1.

    Produces market-neutral long/short weights: positive entries are
    longs, negative entries are shorts, and gross exposure
    (sum of |weights|) is 1.

    Parameters
    ----------
    data : array-like
        Raw factor values; NaNs are ignored when computing the mean.

    Returns
    -------
    pd.Series
        Demeaned, gross-exposure-normalized weights.
    """
    result = pd.Series(data)
    # BUG FIX: the original called bare `nanmean`, which is not defined
    # anywhere in this file (NameError at runtime); it must be np.nanmean.
    result = result - np.nanmean(data)
    denom = result.abs().sum()
    # Guard against an all-equal (already neutral) input vector.
    if denom == 0:
        denom = 1
    return result / denom
def make_alpha_factors():
    """Assemble the dictionary of pipeline factor terms.

    Currently holds no true alpha factors — only the Morningstar sector
    code, which the grouped tear-sheet analysis uses later on.
    """
    # 'sector' is a grouping key rather than an alpha signal.
    return {'sector': Fundamentals.morningstar_sector_code.latest}
def make_pipeline():
    """Build the research Pipeline: factor columns plus forward returns.

    Returns
    -------
    Pipeline
        Screened to `universe`, containing every column from
        make_alpha_factors() plus 2- through 5-day Returns columns
        ('return_d2'..'return_d5') used as forward-return targets.
    """
    alpha_factors = make_alpha_factors()
    # dict() copies the mapping; the original rebuilt it with a
    # redundant identity dict comprehension.
    columns = dict(alpha_factors)
    # N-day total returns, masked to the trading universe.
    for window in range(2, 6):
        columns['return_d{}'.format(window)] = Returns(window_length=window, **mask)
    pipe = Pipeline(columns=columns, screen=universe)
    return pipe
# Run the pipeline over the COVID analysis window; drop rows where every
# column is NaN.
pipe = make_pipeline()
mdf = run_pipeline(pipe, start_date, end_date).dropna(how='all')
mdf.head(5)
# Mean forward returns per (date, sector).
grouped_mdf = mdf.groupby([mdf.index.get_level_values(0), 'sector']) \
    .agg({'return_d2': 'mean',
          'return_d3': 'mean',
          'return_d4': 'mean',
          'return_d5': 'mean',})
# Inner-join sector-level returns with the COVID factors on date.
# 'level_0' is the date level materialized by reset_index(); 'date' is
# the covid19_df index name after its reset_index().
gjoint_mdf = pd.merge(grouped_mdf.reset_index(), covid19_df.reset_index(),
                      left_on='level_0', right_on='date', how='inner') \
    .set_index(['level_0'])
gjoint_mdf.index.names = [None]
gjoint_mdf.head(5)
# One palette color per sector code (12 entries including -1).
pal = sns.color_palette('Paired', len(MORNINGSTAR_SECTOR_CODES.items()))
def draw_plot(x, y, hue, data, ax, pal, legend=False):
    """Draw one regression scatter per sector onto `ax`.

    Parameters
    ----------
    x, y : str
        Column names in `data` for the regplot axes.
    hue : str
        Column holding the Morningstar sector code to split on.
    data : pd.DataFrame
        Joined returns/COVID-factor frame.
    ax : matplotlib Axes
        Target axes.
    pal : sequence of colors
        Indexed in sorted-sector-code order.
    legend : bool
        When True, build a proxy-artist legend mapping colors to
        sector names.
    """
    sector_keys = sorted(MORNINGSTAR_SECTOR_CODES.keys())
    legend_elements = []
    # enumerate instead of range(len(...)); `subset` avoids shadowing the
    # module-level `df` the original reused as a loop-local name.
    for i, sector in enumerate(sector_keys):
        subset = data[data[hue] == sector]
        color = pal[i]
        if subset.shape[0] == 0:
            continue  # guard clause: no securities in this sector
        facet = sns.regplot(x=x, y=y, data=subset, color=color, ax=ax)
        # NOTE(review): `axis_bgcolor` was removed in matplotlib 2.2
        # (renamed `facecolor`); this only works on the old matplotlib of
        # the Quantopian research environment — confirm before porting.
        facet.set(axis_bgcolor='grey')
        if legend:
            # Proxy artist so the legend shows a colored marker per sector.
            legend_elements.append(
                plt.Line2D([0], [0], marker='o', color='w',
                           label=MORNINGSTAR_SECTOR_CODES[sector],
                           markerfacecolor=color, markersize=10)
            )
    if legend:
        ax.legend(handles=legend_elements, bbox_to_anchor=(-0.15, 1.3),
                  ncol=3, loc='upper center')
# 2x2 grid: per-sector regressions of forward returns (d2..d5) on the
# 1-day worldwide case growth factor. Legend only on the top-right panel.
fig = plt.figure(figsize=(10, 10))
ax1 = fig.add_subplot(2,2,1)
draw_plot(x='growth_1d', y='return_d2', hue='sector', data=gjoint_mdf, ax=ax1, pal=pal)
ax2 = fig.add_subplot(2,2,2)
draw_plot(x='growth_1d', y='return_d3', hue='sector', data=gjoint_mdf, ax=ax2, pal=pal, legend=True)
ax3 = fig.add_subplot(2,2,3)
draw_plot(x='growth_1d', y='return_d4', hue='sector', data=gjoint_mdf, ax=ax3, pal=pal)
ax4 = fig.add_subplot(2,2,4)
draw_plot(x='growth_1d', y='return_d5', hue='sector', data=gjoint_mdf, ax=ax4, pal=pal)
# Same grid against the US share-of-worldwide-cases factor.
fig = plt.figure(figsize=(10, 10))
ax1 = fig.add_subplot(2,2,1)
draw_plot(x='us_proportion', y='return_d2', hue='sector', data=gjoint_mdf, ax=ax1, pal=pal)
ax2 = fig.add_subplot(2,2,2)
draw_plot(x='us_proportion', y='return_d3', hue='sector', data=gjoint_mdf, ax=ax2, pal=pal, legend=True)
ax3 = fig.add_subplot(2,2,3)
draw_plot(x='us_proportion', y='return_d4', hue='sector', data=gjoint_mdf, ax=ax3, pal=pal)
ax4 = fig.add_subplot(2,2,4)
draw_plot(x='us_proportion', y='return_d5', hue='sector', data=gjoint_mdf, ax=ax4, pal=pal)
Interestingly, there is a negative correlation between the US's proportion of worldwide COVID-19 cases and future returns. This correlation was not seen with the growth_1d
factor, since the larger growth rates occurred on later dates.
# Per-security (ungrouped) version of the join: every (date, asset) row
# from the pipeline output merged with the COVID factors on date.
# 'level_0'/'level_1' are the date and asset index levels materialized
# by reset_index().
joint_mdf = pd.merge(mdf.reset_index(), covid19_df.reset_index(),
                     left_on='level_0', right_on='date', how='inner') \
    .set_index(['level_0', 'level_1'])
joint_mdf.index.names = [None, None]
joint_mdf.head(5)
# Pooled (all-securities) regressions of forward returns on growth_1d.
fig = plt.figure(figsize=(10, 10))
ax1 = fig.add_subplot(2,2,1)
sns.regplot(x='growth_1d', y='return_d2', data=joint_mdf, ax=ax1)
ax2 = fig.add_subplot(2,2,2)
sns.regplot(x='growth_1d', y='return_d3', data=joint_mdf, ax=ax2)
ax3 = fig.add_subplot(2,2,3)
sns.regplot(x='growth_1d', y='return_d4', data=joint_mdf, ax=ax3)
ax4 = fig.add_subplot(2,2,4)
sns.regplot(x='growth_1d', y='return_d5', data=joint_mdf, ax=ax4);
# Same pooled regressions against us_proportion.
fig = plt.figure(figsize=(10, 10))
ax1 = fig.add_subplot(2,2,1)
sns.regplot(x='us_proportion', y='return_d2', data=joint_mdf, ax=ax1)
ax2 = fig.add_subplot(2,2,2)
sns.regplot(x='us_proportion', y='return_d3', data=joint_mdf, ax=ax2)
ax3 = fig.add_subplot(2,2,3)
sns.regplot(x='us_proportion', y='return_d4', data=joint_mdf, ax=ax3)
ax4 = fig.add_subplot(2,2,4)
sns.regplot(x='us_proportion', y='return_d5', data=joint_mdf, ax=ax4);
Again, the same negative correlation with returns can be seen for the us_proportion
factor. This factor could be a potential alpha if we find a way to combine it with others.
There is an outlier with 200+% returns; I went and checked and found it was TT (Trane Technologies PLC?), but the value differs from the actual returns on that date.
# Inspect the largest return_d2 rows (the 200+% outlier shows up here).
joint_mdf.sort_values(by=['return_d2'], ascending=False).head(5)
# Keep only the candidate alpha columns: drop the forward-return targets
# and the redundant 'date' column. Start one day after the first COVID
# observation (2020-01-22) so growth_1d is defined (its first value is
# NaN from the shift).
alphas_view = joint_mdf.copy().loc['2020-01-23':]
alphas_view = alphas_view.drop(['return_d2', 'return_d3',
                                'return_d4', 'return_d5',
                                'date'], axis=1)
alphas_view.head(5)
# Distribution check of each remaining factor column.
alphas_view.hist();