As suggested by Dr. Wiecki, I wanted to make availible an analagous version of the Machine Learning multi output comarison, but for classifer models. Once again, this is based on a great scikit-learn example: http://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html
Aside from expoloring the simple estimators themselves, time was taken for data pre processing and exploring things like: kalman filters, interpolation, outlier removal, feature scaling, feature extraction, and more.
The idea is: We have 390 minutes of close_price data in a day, what if we used the first x minutes of that day to train and the following y minutes to form a binary classifier that informs us if the price went up or down? Let's use ~13 years so we have 3,274 samples(days) to use in a train/test dataset.
-Derek M. Tishler
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#from matplotlib.colors import ListedColormap
import matplotlib.cm as cms
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.decomposition import PCA
from pykalman import KalmanFilter
import talib
# Each day, we will use x minutes as our training sets, and the target values are the following y minutes
mins_to_use_for_training = 195
mins_to_use_for_targets = 195
# Using principal component analysis, perform dimension/feature reduction to....
pca_desired_domain = mins_to_use_for_training/8
# Timeframe of analysis and stock to use
training_data_start_date = datetime(2002,1,2) #Y-M-D
training_data_start_date_end_date = datetime(2015,1,2)
symbol = "SPY"
# How many samples do we want to plot at the end to see how each model did(5-20 is a good figure)
n_faces = 20
hist = get_pricing(symbol, start_date=training_data_start_date, end_date=training_data_start_date_end_date, frequency='minute',fields="close_price")
#hist = hist.fillna(hist.mean())
print "Number of NaNs in history",hist.interpolate().count() - hist.count()
#hist = hist.interpolate()
print "Len of training dataset: ",len(hist)
print str(training_data_start_date) + " / " + str(training_data_start_date_end_date)
print "Do we have nans?: ", np.any(np.isnan(hist.values))
# Plot the history, visually inspect for outliers or anything crazy(tough to do when 'zoomed out')
plt.plot(hist)
plt.tight_layout()
plt.show()
# Get positions of outliers and mask them out, removed from pipeline but left for experimentation
#p = 5
#moving_average = talib.MA(hist.values, p)
#moving_std = talib.STDDEV(hist.values, p)
#moving_average[:p] = moving_average[p]
#moving_std[:p] = moving_std[p]
#
#print "# of outliers removed: ", len(hist[(np.abs(hist-moving_average)>(3.0*moving_std))])
#hist = hist[~(np.abs(hist-moving_average)>(3.0*moving_std))]
#plt.plot(hist)
#plt.tight_layout()
#plt.show()
# Kalman filter to smooth price delta disto, removed from pipeline but left for experimentation
# Construct a Kalman filter
#kf = KalmanFilter(transition_matrices = [1],
# observation_matrices = [1],
# initial_state_mean = hist.values[0],
# initial_state_covariance = 1,
# observation_covariance=1,
# transition_covariance=.1)
#state_means, _ = kf.filter(hist.replace(0.0, np.NaN).fillna(method='bfill').fillna(method='ffill').values)
#state_means = state_means.flatten()
#
#plt.title("Kalman Filter vs Close Price")
#ax1 = plt.subplot(2,1,1)
#plt.plot(hist.values[-390:])
#plt.plot(state_means[-390:])
#
## Now save our filtered data back into the hist
#hist.loc[:] = state_means
#
#ax1 = plt.subplot(2,1,2)
#plt.plot(hist.values[-390:])
#plt.show()"""
# This mess is where I group the history data into daily bins to more easily define out training/target data.
# Also use this time to ensure no NaNs or Missing data(zeros) are left in(need smooth, continuous data)
# parse the histprical data into daily, homogeneous series
DFList = [group[1] for group in hist.interpolate(method='polynomial', order=1).groupby([hist.index.year,hist.index.month,hist.index.day])]
daily_data = np.zeros((len(DFList), 390), dtype=np.float32)
print len(DFList), ' days'
perfect_index_id = 0
# we need a perfect index example for filtering(and correctly placing/filling) history data with gaps
for i in np.arange(len(DFList)):
if len(DFList[i]) == 390:
perfect_index_id = i
print len(DFList[perfect_index_id]), 'minutes in a day'
# Go through each day and check index homogeneity
for i in np.arange(len(DFList)):
# check if the day in ? is today, save that as the test sample for after model analysis
time_temp = DFList[i].index[0]
# Case of day with full data
if np.in1d(DFList[perfect_index_id].index.time, DFList[i].index.time).all() and len(DFList[i]) == 390:
daily_data[i, :] = pd.Series(DFList[i]).replace(0.0, np.NaN).values#)#.fillna(method='bfill')#.replace(0.0, np.nan).fillna(method='bfill')#.fillna(method='bfill').replace(0.0, method='bfill').values
#daily_data[i, :] = pd.Series(daily_data[i]).interpolate(method='polynomial', order=1).values
daily_data[i, :] = pd.Series(daily_data[i]).fillna(method='bfill').values
daily_data[i, :] = pd.Series(daily_data[i]).fillna(method='ffill').values
if np.any(np.isnan(daily_data[i])):
print "NaN1"
else:
# We have missing data, we need to place existing data in correct index, then fill the empty
# Mask existing prices into their correct slot
daily_data[i, np.in1d(DFList[perfect_index_id].index.time, DFList[i].index.time)] = \
DFList[i][np.where(np.in1d(DFList[i].index.time, DFList[perfect_index_id].index.time) == True)[0]]
# back fill the gaps
daily_data[i, :] = pd.Series(daily_data[i]).replace(0.0, np.NaN).values#)#.fillna(method='bfill')#.replace(0.0, np.nan).fillna(method='bfill')#.fillna(method='bfill').replace(0.0, method='bfill').values
#daily_data[i, :] = pd.Series(daily_data[i]).interpolate(method='polynomial', order=1).values
daily_data[i, :] = pd.Series(daily_data[i]).fillna(method='bfill').values
daily_data[i, :] = pd.Series(daily_data[i]).fillna(method='ffill').values
if np.any(np.isnan(daily_data[i])):
print "NaN2"
# This seems silly, but it's really helpful to print some random days and ensure we dont have
# terrible samples when training our model. The amount & quality of the data scrubbing above needs
# to be heavily considered.
size_root = 5
# Plot 25 random days to check out the data
rand_index = np.arange(len(daily_data))
np.random.shuffle(rand_index)
rand_index = rand_index[:size_root*size_root]
fig = plt.figure(figsize=(3. * size_root, 2. * size_root))
for i in np.arange(size_root):
for j in np.arange(size_root):
sub = plt.subplot(size_root, size_root, i*size_root + j + 1, title="Day %d"%rand_index[i*size_root + j])
plt.plot(daily_data[rand_index[i*size_root + j]][:mins_to_use_for_training])
plt.plot(np.arange(mins_to_use_for_training, mins_to_use_for_training+mins_to_use_for_targets), daily_data[rand_index[i*size_root + j]][mins_to_use_for_training:mins_to_use_for_training+mins_to_use_for_targets], 'r-')
plt.tight_layout()
plt.show()
# Check for nans, zeros. One last check as they(NaNs) will break down the prepreccessing transforms.
print np.where(np.isnan(daily_data))[0].shape
print np.where(daily_data==0.0)[0].shape
# Split each day into the desired train/forecast partitions.
X_train = daily_data[:, :mins_to_use_for_training]
# Create binary classifier for days that made or lost money
#y_train = np.array(np.sign(np.sum(np.diff(daily_data[:, mins_to_use_for_training:mins_to_use_for_training+mins_to_use_for_targets], axis=1), axis=1)), dtype=np.int8)
y_train = np.divide(np.sum(np.diff(daily_data[:, mins_to_use_for_training:mins_to_use_for_training+mins_to_use_for_targets], axis=1), axis=1),
np.std(np.diff(daily_data[:, mins_to_use_for_training:mins_to_use_for_training+mins_to_use_for_targets], axis=1)))
temp = np.copy(y_train)
y_train[np.where(temp >= 0.0)] = 1
#y_train[np.where(temp == 0.0)] = 0
y_train[np.where(temp < 0.0)] = -1
y_train = np.array(y_train, dtype=np.int8)
print X_train.shape
print y_train.shape
# Use train file to build model, or use actual test file above(with no solutions though so cant compare)
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
L1,L2 = X_train.shape[0],y_train.shape[0]
orig_X,orig_y = np.copy(X_train),np.copy(y_train)
# Consider the case of using sequential out of sample data as price scaling over time might affect models ability to forecast.
#X_train = orig_X[:L1*0.6, :]
#X_test = orig_X[L1*0.6:, :]
#y_train = orig_y[:L2*0.6, :]
#y_test = orig_y[L2*0.6:, :]
X_train_orig = np.copy(X_train)
X_test_orig = np.copy(X_test)
y_train_orig = np.copy(y_train)
y_test_orig = np.copy(y_test)
# the histogram of the data
fig = plt.figure(figsize=(15,5))
n, bins, patches = plt.hist(y_train, 3, normed=1, facecolor='green', alpha=0.75, rwidth=0.75, align='mid')
#plt.xlabel('Smarts')
plt.ylabel('Probability')
plt.title('Histogram of Training Classifier')
#plt.xticks([-0.67,0.,0.67],['Short','Hold','Long'])
plt.xticks([-0.67,0.67],['Short(-1)','Long(1)'])
plt.grid(True)
plt.show()
# Create minmax scalers using the training data, apply
zmuv_scaler_X_train = StandardScaler().fit(X_train)
X_train = zmuv_scaler_X_train.transform(X_train)
X_test = zmuv_scaler_X_train.transform(X_test)
# Create minmax scalers using the training data, apply
mm_scaler_X_train = MinMaxScaler().fit(X_train)
X_train = mm_scaler_X_train.transform(X_train)
X_test = mm_scaler_X_train.transform(X_test)
# Create PCA scalers using training data, apply
pca = PCA(whiten=True, n_components = pca_desired_domain).fit(X_train)
X_train = pca.transform(X_train)
print X_train.shape
X_test = pca.transform(X_test)
print X_test.shape
# What does our new, transformed data look like?
print X_train.shape
print y_train.shape
print X_test.shape
print y_test.shape
fig = plt.figure(figsize=(15,10))
ax1 = plt.subplot(3,1,1)
ax1.set_title("Daily Close Price - Minutes")
plt.plot(daily_data[-1][:mins_to_use_for_training])
plt.plot(np.arange(mins_to_use_for_training,mins_to_use_for_training+mins_to_use_for_targets), daily_data[-1][mins_to_use_for_training:mins_to_use_for_training+mins_to_use_for_targets], 'r-')
ax2 = plt.subplot(3,2,3)
ax2.set_title("(Train)Training Data Example")
plt.plot(X_train[-1][1:])
ax3 = plt.subplot(3,2,4)
ax3.set_title("(Train)Target Data Example")
plt.plot(y_train[-1], 'rs', markersize=20)
ax4 = plt.subplot(3,2,5)
ax4.set_title("(Test)Training Data Example")
plt.plot(X_test[-1][1:])
ax5 = plt.subplot(3,2,6)
ax5.set_title("(Test)Target Data Example")
plt.plot(y_test[-1], 'rs', markersize=20)
plt.tight_layout()
plt.show()
h = .02 # step size in the mesh
names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree",
"Random Forest", "AdaBoost", "Naive Bayes"]
classifiers = [
KNeighborsClassifier(),
SVC(kernel="linear", C=0.025),
SVC(gamma=0.1, C=1000),
DecisionTreeClassifier(max_depth=10),
RandomForestClassifier(max_depth=10, n_estimators=10, max_features=X_train.shape[1]),
AdaBoostClassifier(),
GaussianNB(),
]
# iterate over classifiers
for name, clf in zip(names, classifiers):
clf.fit(X_train, y_train)
print ' ########## '+name+' ##########'
print(classification_report(y_test, clf.predict(X_test), target_names=['Short','Long'])) # ['Short', 'Stay','Long']
print '-----------------------------------------------------\n\n'
#print "%s\t\t%5.3f"%(name,score)
# The probability of each (sample x class) for the predictions
try:
Z = clf.decision_function(X_test)
except AttributeError:
Z = clf.predict_proba(X_test)
#print Z