
Searching for a signal in CEO change and news sentiment data

In [1]:
from quantopian.pipeline.data.accern import alphaone_free as alphaone
from quantopian.pipeline.data.eventvestor import CEOChangeAnnouncements as cc
from quantopian.pipeline.data.builtin import USEquityPricing
from quantopian.pipeline import Pipeline
from quantopian.research import run_pipeline
import pandas as pd

# Number of days after CEO change that articles are considered.
SINCE_CHANGE = 5

# Number of trading days since article that we will keep track of price data.
SINCE_ARTICLE = 40

Let's set up a pipeline to get some data to work with. This takes a minute to run.

In [2]:
announcement_date = cc.asof_date.latest
pipe = Pipeline(
    columns={
        'sentiment': alphaone.article_sentiment.latest,
        'impact_score': alphaone.impact_score.latest,
        'announcement_date': announcement_date,
        'price': USEquityPricing.close.latest
    },
    screen=announcement_date.notnull()
)
factor = run_pipeline(pipe, start_date='2012-08-26', end_date='2014-12-31')
factor.head(5)
Out[2]:
announcement_date impact_score price sentiment
2012-08-27 00:00:00+00:00 Equity(2 [AA]) 2008-05-08 NaN 8.63 NaN
Equity(24 [AAPL]) 2011-08-24 70.086 663.18 -0.126
Equity(64 [ABX]) 2008-12-23 NaN 37.77 NaN
Equity(66 [AB]) 2008-12-19 NaN 13.59 NaN
Equity(69 [ACAT]) 2010-10-28 NaN 42.93 NaN
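
Many rows carry no recent article, as the NaN sentiment values above suggest. A quick coverage check on the DataFrame we just built (a small sketch, not part of the original analysis):

# Fraction of (date, equity) rows that have a sentiment reading at all.
print(factor['sentiment'].notnull().mean())

# Number of distinct equities with a CEO change on record.
print(len(factor.index.get_level_values(1).unique()))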

Now we'll filter out the data points that aren't close enough to a CEO change.

In [3]:
data = factor

# Constructs a new column with values from the first index, stripped of the time information.
data['asof_date'] = map(lambda x: pd.Timestamp(x.date()), factor.index.get_level_values(0))

# Filters out data points too long after the last CEO change.
# The 1.6 multiplier is a rough conversion of the trading-day windows above
#     into calendar days, with a little slack.
data = data[data['asof_date'] - data['announcement_date'] <=
            pd.Timedelta((SINCE_CHANGE + SINCE_ARTICLE) * 1.6, unit='d')]
data.head(5)
Out[3]:
announcement_date impact_score price sentiment asof_date
2012-08-27 00:00:00+00:00 Equity(754 [BBY]) 2012-08-20 79.143 17.31 -0.714 2012-08-27
Equity(780 [BCS]) 2012-07-03 87.000 11.85 -0.667 2012-08-27
Equity(1331 [CCC]) 2012-06-27 NaN 13.39 NaN 2012-08-27
Equity(1979 [CUB]) 2012-06-25 NaN 48.73 NaN 2012-08-27
Equity(2351 [DUK]) 2012-07-03 80.000 65.48 0.000 2012-08-27
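
Note that the map call above relies on map returning a list, which is Python 2 behavior; in Python 3 it returns an iterator. An equivalent line that avoids map, run in place of it before the filtering step (a sketch that should work in either version):

# Same column, built directly from the DatetimeIndex: drop the time zone and time of day.
data['asof_date'] = factor.index.get_level_values(0).tz_localize(None).normalize()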

Here we make a new DataFrame with a row for each article published fewer than 5 days after a CEO change.

In [4]:
# Filter for a non-null sentiment value and a date within 5 days of the last CEO change.
sentiment = data[pd.notnull(data['sentiment']) &
                 (data['asof_date'] - data['announcement_date'] < pd.Timedelta(SINCE_CHANGE, unit='d'))]
sentiment.head(10)
Out[4]:
announcement_date impact_score price sentiment asof_date
2012-08-28 00:00:00+00:00 Equity(5199 [NAV]) 2012-08-27 87.071 23.331 -0.051 2012-08-28
Equity(9430 [STRL]) 2012-08-27 99.545 9.750 0.000 2012-08-28
2012-08-29 00:00:00+00:00 Equity(5199 [NAV]) 2012-08-27 88.075 22.750 -0.164 2012-08-29
Equity(6769 [SEE]) 2012-08-28 90.250 12.990 0.583 2012-08-29
Equity(9430 [STRL]) 2012-08-27 99.545 9.800 0.000 2012-08-29
Equity(42037 [ZLTQ]) 2012-08-28 100.000 4.910 0.500 2012-08-29
2012-08-30 00:00:00+00:00 Equity(5199 [NAV]) 2012-08-27 85.571 22.500 0.000 2012-08-30
Equity(6769 [SEE]) 2012-08-28 95.789 14.580 0.281 2012-08-30
Equity(9268 [SHLO]) 2012-08-29 100.000 10.420 0.143 2012-08-30
Equity(9430 [STRL]) 2012-08-27 99.545 9.910 0.000 2012-08-30

Now we append columns containing the relative price difference from the day of article publication.

In [5]:
# This function is applied to our sentiment DataFrame right below its definition.
# Given an event (equity and article's date of publication),
#     it returns a Series containing pricing data for the equity following the event.
def get_prices(s):
    
    # Drops all data before the article's publication date, using a slice.
    data_after_article = data.loc[s['asof_date']:]
    
    # Isolates the data for the specific equity.
    # The equity is stored in the series name automatically by the .apply function below.
    equity_data = data_after_article.loc(axis=0)[:, s.name[1]]
    
    # Gets the equity's prices for a specified number of days after the event,
    #     by dropping pricing data that is too old.
    equity_pricing = equity_data['price'].head(SINCE_ARTICLE + 1)
    
    # Returns the equity's relative price differences from the day of the event,
    #     with the indices of the series replaced by numbers to indicate days after the event.
    return ((equity_pricing - s['price']) / s['price']).reset_index(drop=True)

# Concatenates the price data for all stocks in the DataFrame as new columns.
matches = pd.concat([sentiment, sentiment.apply(get_prices, axis=1)], axis=1)
matches.head(5)
Out[5]:
announcement_date impact_score price sentiment asof_date 0 1 2 3 4 ... 31 32 33 34 35 36 37 38 39 40
2012-08-28 00:00:00+00:00 Equity(5199 [NAV]) 2012-08-27 87.071 23.331 -0.051 2012-08-28 0 -0.024902 -0.035618 -0.080194 -0.057906 ... -0.057049 -0.030046 -0.044190 -0.072050 -0.117912 -0.093909 -0.119626 -0.183104 -0.148986 -0.165017
Equity(9430 [STRL]) 2012-08-27 99.545 9.750 0.000 2012-08-28 0 0.005128 0.016410 0.005128 -0.006154 ... 0.008205 0.001026 -0.015385 -0.009231 -0.011282 -0.008205 -0.025641 -0.033846 -0.056410 -0.066667
2012-08-29 00:00:00+00:00 Equity(5199 [NAV]) 2012-08-27 88.075 22.750 -0.164 2012-08-29 0 -0.010989 -0.056703 -0.033846 -0.121758 ... -0.005275 -0.019780 -0.048352 -0.095385 -0.070769 -0.097143 -0.162242 -0.127253 -0.143692 -0.160879
Equity(6769 [SEE]) 2012-08-28 90.250 12.990 0.583 2012-08-29 0 0.122402 0.101848 0.098537 0.087375 ... 0.190916 0.180139 0.192456 0.227868 0.253272 0.270978 0.227868 0.234796 0.210931 0.210931
Equity(9430 [STRL]) 2012-08-27 99.545 9.800 0.000 2012-08-29 0 0.011224 0.000000 -0.011224 0.006122 ... -0.004082 -0.020408 -0.014286 -0.016327 -0.013265 -0.030612 -0.038776 -0.061224 -0.071429 -0.081633

5 rows × 46 columns
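
The trickiest line in get_prices is the .loc(axis=0)[:, equity] slice, which picks out a single equity from the (date, equity) MultiIndex. A minimal illustration with a toy frame (the symbols and prices here are made up):

toy = pd.DataFrame(
    {'price': [1.0, 2.0, 3.0, 4.0]},
    index=pd.MultiIndex.from_product(
        [pd.to_datetime(['2014-01-02', '2014-01-03']), ['AAA', 'BBB']]))

# Selects every date for symbol 'BBB' only, mirroring the equity slice in get_prices.
print(toy.loc(axis=0)[:, 'BBB'])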

In [6]:
print(len(matches))
2984
In [7]:
import scipy.stats as stats
import matplotlib.pyplot as plt
import numpy as np

Here's a plot of the rank correlation between the sentiment rating and the relative price change at each horizon after the article. Over the first 12 trading days, the stock price tends to move in the direction opposite the article's sentiment. A possible strategy would be to go long or short a stock for 12 days after an article, taking the side opposite the sentiment.

In [8]:
scores = np.zeros(SINCE_ARTICLE + 1)
for i in range(1, SINCE_ARTICLE + 1):
    score, pvalue = stats.spearmanr(matches['sentiment'],
                                    matches[i])
    scores[i] = score
    
plt.bar(range(1, SINCE_ARTICLE + 1), scores[1:])
plt.xlabel('Trading days afterward')
plt.xlim((1, SINCE_ARTICLE + 1))
plt.ylabel('Rank correlation between article sentiment and returns');
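
As a rough sanity check of that contrarian idea, we can compute the average 12-day return from always trading against the sign of the sentiment (a sketch only; it ignores transaction costs, position sizing, and overlapping event windows):

# Long when sentiment is negative, short when positive; zero-sentiment events contribute nothing.
contrarian_12d = -np.sign(matches['sentiment']) * matches[12]
print(contrarian_12d.mean())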

Here's the spread of 12-day returns, with equities sorted into baskets by sentiment score. This shows how the signal might be usable in a long-short strategy. It might be wise to exclude sentiment scores under -0.2 to get a smoother spread.

In [9]:
num_baskets = 10
delta = 2. / num_baskets
avgs = np.zeros(num_baskets + 1)
for i in range(num_baskets):
    basket = matches[(matches['sentiment'] >= -1 + i * delta) &
                     (matches['sentiment'] < -1 + (i + 1) * delta)]
    
    avgs[i] = basket[12].mean()
avgs[num_baskets] = matches[matches['sentiment'] == 1][12].mean()

plt.bar(range(num_baskets + 1), avgs)
plt.xticks(range(num_baskets + 1), np.around(np.arange(-1, 1.2, 0.2), decimals=1))
plt.xlabel('Article sentiment')
plt.ylabel('Mean 12-day returns');
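
To act on that suggestion, the cutoff is a one-line filter; -0.2 is just the threshold eyeballed from the plot above, and smooth_matches is a new name introduced here:

# Keep only events whose article sentiment is at least -0.2.
smooth_matches = matches[matches['sentiment'] >= -0.2]
print(len(smooth_matches))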

We can try to incorporate Accern's impact score, but it doesn't make much difference.

In [10]:
multi_signal = matches['sentiment'] * matches['impact_score']
scores = np.zeros(SINCE_ARTICLE + 1)
pvalues = np.zeros(SINCE_ARTICLE + 1)
for i in range(1, SINCE_ARTICLE + 1):
    score, pvalue = stats.spearmanr(multi_signal,
                                    matches[i])
    pvalues[i] = pvalue
    scores[i] = score
    
plt.bar(range(1, SINCE_ARTICLE + 1), scores[1:])
plt.xlabel('Trading days afterward')
plt.xlim((1, SINCE_ARTICLE + 1))
plt.ylabel('Rank correlation between article sentiment times impact score and returns');
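
The loop above also collects the Spearman p-values, though they go unused; a quick look at the first 12 horizons is a reasonable sanity check:

# p-values for the combined signal over trading days 1 through 12.
print(pvalues[1:13])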
In [11]:
num_baskets = 10
delta = 200. / num_baskets
avgs = np.zeros(num_baskets + 1)
for i in range(num_baskets):
    basket = matches[(multi_signal >= -100 + i * delta) &
                     (multi_signal < -100 + (i + 1) * delta)]
    
    avgs[i] = basket[12].mean()
avgs[num_baskets] = matches[multi_signal == 100][12].mean()

plt.bar(range(num_baskets + 1), avgs)
plt.xticks(range(num_baskets + 1), np.arange(-100, 120, 20))
plt.xlabel('Article sentiment times impact score')
plt.ylabel('Mean 12-day returns');