"Infographics Challenge, Economic Implications of COVID-19"
First submission notebook
Lucas BL
import pandas as pd
import numpy as np
import empyrical as ep
import alphalens as al
from quantopian.pipeline import Pipeline
from quantopian.pipeline.data.factset import GeoRev
from quantopian.pipeline.domain import US_EQUITIES
from quantopian.research import run_pipeline
from quantopian.pipeline.filters import QTradableStocksUS
from quantopian.pipeline.data.factset import RBICSFocus
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns; sns.set(style="whitegrid")
from sklearn.preprocessing import normalize
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.vq import kmeans,vq
from sklearn.cluster import KMeans
import statsmodels
import statsmodels.api as sm
# Load COVID-19 data. We periodically update the file in your research
# folder. You can also download an updated file from
# https://ourworldindata.org/coronavirus-source-data and put
# it into your data directory in research
covid = local_csv('covid19_cases.csv', date_column='date')
covid.head()
Definition of more specific economic areas
# Map selected countries to their region
region_country = {
'United States': 'North America',
'Canada': 'North America',
'Austria': 'East-Europe',
'Bulgaria': 'East-Europe',
'Croatia': 'East-Europe',
'Denmark': 'West-Europe',
'Estonia': 'East-Europe',
'Finland': 'Scandinavian_Countries',
'France': 'West-Europe',
'Germany': 'West-Europe',
'Greece': 'East-Europe',
'Iceland': 'Scandinavian_Countries',
'Hungary': 'East-Europe',
'Italy': 'West-Europe',
'Netherlands': 'West-Europe',
'Poland': 'East-Europe',
'Portugal': 'West-Europe',
'Romania': 'East-Europe',
'Spain': 'West-Europe',
'Serbia': 'East-Europe',
'Switzerland': 'West-Europe',
'Sweden': 'Scandinavian_Countries',
'United Kingdom': 'West-Europe',
'Australia': 'Asia-Pacific',
'Bangladesh': 'Asia-Pacific',
'Bhutan': 'Asia-Pacific',
'China': 'Asia-Pacific',
'Indonesia': 'Asia-Pacific',
'Japan': 'Asia-Pacific',
'Malaysia': 'Asia-Pacific',
'Myanmar': 'Asia-Pacific',
'New Zealand': 'Asia-Pacific',
'Singapore': 'Asia-Pacific',
'South Korea': 'Asia-Pacific',
'Taiwan': 'Asia-Pacific',
'Vietnam': 'Asia-Pacific',
}
covid['region'] = covid['location'].map(region_country)
covid_region = covid.reset_index().groupby(['date', 'region']).sum().reset_index(level='region')
covid_region.head()
df = covid_region.reset_index()
region_totcases = pd.DataFrame()
for name, group in df.groupby(df.iloc[:,1]):
if region_totcases.empty:
region_totcases = group.set_index("date")[["total_cases"]].rename(columns={"total_cases":name})
else:
region_totcases = region_totcases.join(group.set_index("date")[["total_cases"]]).rename(columns={"total_cases":name})
print(region_totcases.head())
region_totcases.pct_change().rolling(1).mean().cumsum().plot(legend=True)
plt.ylabel('# Total cases')
plt.title('COVID-19 growth in different regions')
plt.legend(loc=0)
plt.figure()
sns.clustermap(region_totcases.corr(), vmin=-1, vmax=1, center=0, linewidths=.75, cmap='coolwarm')
In the graph of the total cases above we can see that the number of total cases are increasing in a differnet timespan, reflecting the interconnectiveness of specific areas. Moreover, specific countries disease prevention may influence the delay of observation as well as consumer behavior
def return_dataframe(data, iloc_column, variable):
dataframe = data.reset_index()
new_dataframe = pd.DataFrame()
for name, group in dataframe.groupby(dataframe.iloc[:,iloc_column]):
if new_dataframe.empty:
new_dataframe = group.set_index("date")[[variable]].rename(columns={variable:name})
else:
new_dataframe = new_dataframe.join(group.set_index("date")[[variable]]).rename(columns={variable:name})
return new_dataframe
covid_df = return_dataframe(covid.iloc[:,:], 1, "total_cases")
print('Rows in the dataframe= ', len(covid_df))
# covid_df.tail()
covid_df = covid_df.dropna(thresh=len(covid_df)*0.8, axis=1, how='all').iloc[30:-1,:]
#Drop January since there is a lot of missing data and reporting errors
Second derivative of delta to map the velocity (speed) of the epidemic withinh the area
sns.clustermap(covid_df.pct_change().corr(method='pearson'), vmin=-1, vmax=1, center=0, cmap='coolwarm')
thresh = 10000
covid_region.groupby('region')['total_cases'].plot(legend=True);
plt.axhline(thresh, ls='--', color='0.5', label='Chosen threshold');
plt.ylabel('# confirmed cases'); plt.title('COVID-19 growth in different regions'); plt.legend(loc=0);
# Compute date where threshold was crossed in each region
covid_region_date = covid_region.loc[lambda x: x.total_cases > thresh].reset_index().groupby('region').first()['date'].sort_values()
covid_region_date
# Compute date where threshold was crossed in each countries
covid_location_date = covid.loc[lambda x: x.total_cases > thresh].reset_index().groupby('location').first()['date'].sort_values()
covid_location_date
# Revenue exposure to North America, Asia / Pacific and Europe.
GeoRevNA = GeoRev.slice('NORTH AMERICA')
GeoRevAP = GeoRev.slice('ASIA-PACIFIC')
GeoRevEU = GeoRev.slice('EUROPE')
# Most recent revenue exposure.
rev_exposure_NA = GeoRevNA.est_pct.latest
rev_exposure_AP = GeoRevAP.est_pct.latest
rev_exposure_EU = GeoRevEU.est_pct.latest
# We are not using RBICS sectors for this analysis
# but putting it here in case you want to use it.
sector = RBICSFocus.l1_name.latest
# Add all factors to a pipeline and run it.
pipe = Pipeline(
columns={
'rev_exposure_NA': rev_exposure_NA.rank().zscore(),
'rev_exposure_AP': rev_exposure_AP.rank().zscore(),
'rev_exposure_EU': rev_exposure_EU.rank().zscore(),
'sector': sector,
},
domain=US_EQUITIES,
screen=(QTradableStocksUS() & rev_exposure_NA.notnull() & rev_exposure_AP.notnull() & rev_exposure_EU.notnull()),
)
# Run the pipeline for the most recent date available, georev is only updated yearly
df = run_pipeline(pipe, '2018-01-01', '2018-01-01')
df = df.reset_index(level=0, drop=True) # drop date index as it's just a single day
print(df.head())
df = df.drop('sector', axis='columns')
df.head()
# Get stock returns
prices = get_pricing(
symbols=df.index,
start_date=covid.index[0],
end_date=covid.index[-1],
fields='close_price',
)
# Add datetime index with forward filling to match prices
# This effectively forward-fills the 2018 data
factor = pd.concat({dt: df for dt in prices.index})
factor.head()
# Use alphalens to compute factor returns for each column
factor_returns = {}
for col in factor.columns:
factor_data = al.utils.get_clean_factor_and_forward_returns(
factor[col], prices, periods=[1])
factor_returns[col] = al.performance.factor_returns(factor_data)['1D']
factor_returns = pd.DataFrame(factor_returns)
factor_returns.head()
factor_returns.columns = ['Asia/Pacific factor returns', 'Europe factor returns', 'North America factor returns']
Elbow curve using K-Mean Clsutering
map "relevant" cluster number regarding possible noises in the data (Highest convexity is generally the appropriate number)
#Calculate average annual percentage return and volatilities over a theoretical one year period
delta_df = covid_df.pct_change().rolling(1).mean().mean(axis=1)
delta_df = pd.DataFrame(delta_df).dropna()
delta_df.columns = ['delta_df']
delta_df['Volatility'] = delta_df.rolling(2).std().mean(axis=1)
delta_df = delta_df.pct_change().dropna()
# #format the data as a numpy array to feed into the K-Means algorithm
X = np.asarray([np.asarray(delta_df['delta_df']),np.asarray(delta_df['Volatility'])]).T
distorsions = []
for k in range(2, 40):
k_means = KMeans(n_clusters=k)
k_means.fit(X)
distorsions.append(k_means.inertia_)
fig = plt.figure(figsize=(15, 5))
plt.plot(range(2, 40), distorsions, c='b')
plt.grid(True)
plt.title('Elbow curve')
def hrc_dendrogram(data, thresh=None, ax=None, figsize=None, scale=None):
#transpose the dataframe
x = data.corr(method='spearman')
# Normalize the movements: normalized_movements
normalized_movements = normalize(x)
# Calculate the linkage: mergings
mergings = linkage(normalized_movements, method='ward')
# print(mergings)
global labels
global nbofclusters
labels = fcluster(mergings, thresh, criterion='distance')
labels = pd.DataFrame({'Cluster':labels, 'Tickers':x.index}).set_index('Tickers')
nbofclusters = labels.max()
# Plot the dendrogram
labelList = [i for i in x.index]
# plt.figure(figsize=figsize)
# plt.yscale(scale)
dendrogram(mergings,
truncate_mode='level',
color_threshold=thresh,
leaf_rotation=90.,
leaf_font_size=15.,
labels=labelList,
show_contracted=True, ax=ax)
# plt.suptitle('Hiearchical Clustering Dendrogram', fontsize=15); plt.ylabel('height', fontsize=15);
# plt.show()
hrc_dendrogram(covid_df.pct_change(), thresh=1.6, scale='linear', figsize=(20,3))
It appear that some relationship exist (at least at soem extent) in the velocity of the spread of the COVID19 in some contries, it mostly map the behavior of the epidemy in each country that are influenced by soem factors such as, social-distancing, sanitary control, prevention, etc. There is error rates since the data are only based on total cases reported
# Define the clusters according to the same metrics as above i.e. "distance" for euclidian
# Nb of clusters > arbitrary decision
# Plot Distance (euclidean) Classification classification
model = AgglomerativeClustering(n_clusters=5, affinity='euclidean', linkage='ward')
model.fit(covid_df.corr())
labels = model.labels_
cluster = pd.DataFrame()
# covid_df.mean(axis=1).cumsum().plot(color='b', alpha=0.5, title="Industries classification Hiearchical Clustering (12)")
for i in range(labels.max()+1):
cluster['cluster_{}'.format(i)] = covid_df.T[labels==i].T.mean(axis=1)
cluster['cluster_{}'.format(i)].diff(1).rolling(1).mean().cumsum().plot(alpha=1, legend=True)
plt.show()
Here we can clearly spot China as the Cluster_3 other reflect eurozone and North america zones, further improvements could map those clusters with already defined economic regions
# General Parameters for Tearsheet
fig, axs = plt.subplots(nrows=5, sharex=False, figsize=(20,35));
#Regional plots
region_totcases.pct_change().rolling(1).mean().cumsum().plot(legend=True, ax=axs[0])
plt.legend(loc=0);
axs[0].set(title='Cumulative changes in total cases',
ylabel='Cumulative changes', xlabel='');
# Plot of Factor returns
ep.cum_returns(factor_returns).plot(ax=axs[1])
colors = ['b', 'g', 'r', 'brown', 'gray', 'orange', 'black', 'cyan']
axs[1].set(title='GeoRev-weighted factor returns in response to COVID-19 cases',
ylabel='Cumulative returns',xlabel='');
# Plot of Factor returns volatility
factor_returns.rolling(5).std(axis=1).plot(ax=axs[2])
axs[2].set(title='GeoRev-weighted Weekly Standard deviation of mean factor returns in response to COVID-19',
ylabel='Standard Deviation',xlabel='');
# Plot Factor spread
(factor_returns.rolling(5).mean().max(axis=1) - factor_returns.rolling(5).mean().min(axis=1)).plot(ax=axs[3], label='Spread changes')
for c, (region, date) in zip(colors, covid_location_date.iteritems()):
axs[3].axvline(date, label=region, ls='--', color=c)
axs[3].legend(loc=2);
axs[3].set(title='Weekly spread of factor returns | bars countries crossing {} cases'.format(thresh),
ylabel='Spread (High-Low)',xlabel='');
# fig, axs = plt.subplots(nrows=2, sharex=False)
hrc_dendrogram(covid_df**2, thresh=0.07, ax=axs[4])
axs[4].set(title='Hiearchical Clustering Dendrogram',
ylabel='Height', yscale='linear');
Credit: Thomas Wiecki notebook sample
(First submission) Visualization for mapping related areas. I added a the cumulative changes of reported cases to visualize the stagnation of the epidemy (China).
The dendrograms reflect the degree of correlation between reported cases. The standard deviation of factor returns don't seems to be affected by the reported cases until cases in Europe and US were rising.
However the factor returns seems to hold well even in stress period as they slowly came back to normal.
The spread of factor returns seems to indicate higher spread than normal level, further steps will requires stationarity test of integratino order in larger dataset to explore whether the relationship hold in the long run.
Lucas. BL