Given a Gaussian copula C(u),u∈[0,1]d with correlation matrix S. Denote I⊆{1,…,d} a subset of coordinate indices, and J={1,…,d}∖I its complement. Let U be the random vector posessing joint CDF C, and UI,UJ denote its subvectors with corresponding components.
Denote X=(X1,…,Xd) the random vector obtained from U using inverse to the standard normal CDF Φ−1 as follows:
X=(Φ−1(U1),…,Φ−1(Ud)).
It is well known that X possesses joint normal distribution with zero mean and covariance matrix S.
Now let XI,XJ be subvectors of X corresponding to subsets of indices I,J. What is the distribution of XJ given XI=a, where a∈R|I|?
Denote S∗IJ the submatrix of S with rows from I and columns from J, and similarly S∗II, S∗JJ and S∗JI. If all the numbers in I precede the numbers in J (in general it is not so), then we would have
S=(S∗IIS∗IJS∗JIS∗JJ).
Note that since S is symmetric, we have S∗IJ=S∗JI′.
Now the conditional distribution of XJ given XI=a is joint normal with mean and covariance matrix
μ(a)=E(XJ|X∗I=a)=S∗JIS−1IIa,Z=E((XJ−μ(a))2|X∗I=a)=S∗JJ−S∗JIS∗II−1SIJ.
Denote CDF of the distribution by F∗μ(a),Z, and the corresponding PDF by f∗μ(a),Z.
Given a copula C, a joint CDF for a random vector U with correlation matrix S. Let UI=b, what is the conditional distribution of Uj given UI=b?
Denote X=Φ−1(U), XI=Φ−1(UI), XJ=Φ−1(UJ), where the function Φ−1 is applied component-wise. Let a=Φ−1(b), then we have for the conditional CDF of UJ given UI=b:
P(UJ≤uJ|UI=b)=P(Φ−1(UJ)≤Φ−1(uJ)|Φ−1(UI)=Φ−1(b))=P(XJ≤Φ−1(uJ)|XI=Φ−1(b)).
The latter is joint normal cumulative distribution function with mean S∗JIS∗II−1Φ−1(b) and covariance matrix S∗JJ−S∗JIS∗II−1S∗IJ, possessing PDF f∗μ(Φ−1(b)),Z and CDF F∗μ(Φ−1(b)),Z.
"""
Created on Sun Oct 22 14:16:36 2017
@author: Arcady Novosyolov
"""
import numpy as np
import pandas as pd
import scipy.stats as ss
from sklearn.svm import SVR
# avioding boundary trouble
loc_eps = 0.0000001
# ================================================================
# copula classes
# ================================================================
class GaussianCopulaConditional:
""" """
def __init__(self, corr_matrix=np.eye(2), conditionals=None):
"""
Gaussian covariance matrix and variables variables list;
target variables will be kept in 'targets' list
"""
self.cr = pd.DataFrame(corr_matrix)
cls = self.cr.columns
# by default conditionals contains everything except the last component
if conditionals is None:
conditionals = cls[:-1]
# the rest of components are targets
targets = [s for s in cls if s not in conditionals]
# keep the variables lists
self.targets = targets
self.conditionals = conditionals
# covariance submatrices
sII = np.array(self.cr.loc[conditionals, conditionals])
sIJ = np.array(self.cr.loc[conditionals, targets])
sJI = np.array(self.cr.loc[targets, conditionals])
sJJ = np.array(self.cr.loc[targets, targets])
# conditional mean and covariance
pr = np.linalg.solve(sII, sIJ)
self.cond_cov = sJJ - sJI.dot(pr)
self.mupr = np.linalg.solve(sII, sIJ).T
def make_input(self, u, u_cond):
""" """
if u is None:
u = 0.5 * np.ones((1, len(self.targets)))
if u_cond is None:
u_cond = 0.5 * np.ones((1, len(self.conditionals)))
u[u == 0] = loc_eps
u[u == 1] = 1 - loc_eps
return u, u_cond
def pdf(self, u=None, u_cond=None):
"""
u is 2-dim array n_obs x n_targets
u_cond is 2-dim array 1 x n_conditionals
"""
u, u_cond = self.make_input(u, u_cond)
x_cond = ss.norm.ppf(u_cond)
x = ss.norm.ppf(u)
mn = self.mupr.dot(x_cond.T).T[0]
if len(self.targets) <= 1:
res = ss.norm.pdf(x, mn[0], self.cond_cov[0, 0])
fx = ss.norm.pdf(x)
res = pd.Series((res / fx)[:, 0], index=u[:, 0])
else:
res = ss.multivariate_normal.pdf(x, mean=mn, cov=self.cond_cov)
fx = ss.norm.pdf(x).prod(axis=1)
res = pd.Series(res / fx, index=range(x.shape[0]))
res.name = 'Cond PDF of ' + ', '.join(self.targets)
return res
def cdf(self, u=None, u_cond=None):
"""
u is 2-dim array n_obs x n_targets
u_cond is 2-dim array 1 x n_conditionals
"""
u, u_cond = self.make_input(u, u_cond)
x_cond = ss.norm.ppf(u_cond)
x = ss.norm.ppf(u)
mn = self.mupr.dot(x_cond.T).T[0]
if len(self.targets) <= 1:
# univariate normal
res = ss.norm.cdf(x, mn[0], self.cond_cov[0, 0])
res = pd.Series(res[:, 0], index=u[:, 0])
else:
# mvnormcdf does not accept multiple input points
res = np.zeros(x.shape[0])
for i in range(x.shape[0]):
res[i] = mvnormcdf(x[i, :], mn, self.cond_cov)
res = pd.Series(res, index=range(u.shape[0]))
res.name = 'Cond CDF of ' + ', '.join(self.targets)
return res
class GaussianCopula:
""" """
def __init__(self, corr_matr=pd.DataFrame(np.eye(2))):
"""
initialize copula (as two dimensional indipendent one by default)
"""
self.set_corr(corr_matr)
def set_corr(self, cr):
""" set correlation matrix parameter and compute its inverse """
self.cr = pd.DataFrame(cr)
self.dim = self.cr.shape[0]
cls = self.cr.columns
self.cri = pd.DataFrame(np.linalg.inv(self.cr), columns=cls, index=cls)
def fit(self, data):
"""
fit the copula from data
the data should be 'm times n' Numpy array or Pandas DataFrame,
m = number of samples (observations), n = number of components
"""
cr = pd.DataFrame(data).corr(method='spearman')
self.set_corr(pearson_from_spearman(cr))
def rvs(self, size=100000):
"""
simulate sample for Monte Carlo method
size = sample volume, 100,000 by default
"""
sam = ss.multivariate_normal.rvs(cov=self.cr, size=size)
return pd.DataFrame(ss.norm.cdf(sam), columns=self.cr.columns)
def pdf(self, u):
""" probability density function of the copula """
x = ss.norm.ppf(u)
mat = self.cri - np.eye(self.dim)
pok = -0.5 * x.dot(mat).dot(x)
mult = 1 / np.sqrt(np.linalg.det(self.cri))
return mult * np.exp(pok)
def probability(self, event_func, n_obs=100000):
"""
calculates probability of an event A, which is defined via its
indicator function "event_func"
"""
u = self.rvs(size=n_obs)
return event_func(u).mean()
def conditional_probability(self, event_func, cond_func, n_obs=100000):
"""
calculates conditional probability of an event A, given event B,
which are defined via there indicator functions "event_func" and
"cond_func", respectively
"""
u = self.rvs(size=n_obs)
prob_of_cond = cond_func(u).mean()
prob_of_event_and_cond = (event_func(u) & cond_func(u)).mean()
if prob_of_cond > 0:
res = prob_of_event_and_cond / prob_of_cond
else:
res = 0
return res
def conditional_mean_variance_event(self, y_func, cond_func, n_obs=100000):
"""
calculates conditional mean and variance of a random variable Y = f(U),
given condition A, with function f defined by "y_func", and
condition A defined by its indicator function "cond_func"
"""
u = self.rvs(size=n_obs)
cond_ser = cond_func(u).astype('float')
prob_of_cond = cond_ser.mean()
event_and_cond = (y_func(u) * cond_ser).mean()
event2_and_cond = (y_func(u) ** 2 * cond_ser).mean()
if prob_of_cond > 0:
me = event_and_cond / prob_of_cond
va = event2_and_cond / prob_of_cond - me ** 2
else:
me, va = 0, 0
return me, va
def conditional_mean_variance_vector(self, y_func, x_func, n_obs=5000):
"""
creates an SVR model for computing expected mean of Y given X,
where X, Y are functions of U;
returns the model 'mod';
to predict value of Y in a new point X, call mod.predict(X), as usual
"""
u = self.rvs(size=n_obs)
# expected value portion
y = y_func(u)
x = x_func(u)
mod_ev = SVR()
mod_ev.fit(x, y)
ym = mod_ev.predict(x)
# variance portion
yv = (y - ym) ** 2
mod_va = SVR()
mod_va.fit(x, yv)
return mod_ev, mod_va
# =================================================================
# conversion from Pearson correlations to Spearman correlations
# and vice versa
# =================================================================
def spearman_from_pearson(r):
""" convertion for normal multivariate """
s = 6 / np.pi * np.arcsin(0.5 * r)
return s
def pearson_from_spearman(s):
""" conversion for normal multivariate """
r = 2 * np.sin(s * np.pi / 6)
return r
from scipy.stats import mvn
# =====================================================================
# Gaussian multivariate normal cdf
# =====================================================================
informcode = {0: 'normal completion with ERROR < EPS',
1: '''completion with ERROR > EPS and MAXPTS function values used;
increase MAXPTS to decrease ERROR;''',
2: 'N > 500 or N < 1'}
def mvstdnormcdf(lower, upper, corrcoef, **kwds):
n = len(lower)
lower = np.array(lower)
upper = np.array(upper)
corrcoef = np.array(corrcoef)
correl = np.zeros(int(n*(n-1)/2.0)) #dtype necessary?
if (lower.ndim != 1) or (upper.ndim != 1):
raise ValueError('can handle only 1D bounds')
if len(upper) != n:
raise ValueError('bounds have different lengths')
if n==2 and corrcoef.size==1:
correl = corrcoef
elif corrcoef.ndim == 1 and len(corrcoef) == n*(n-1)/2.0:
correl = corrcoef
elif corrcoef.shape == (n,n):
correl = corrcoef[np.tril_indices(n, -1)]
else:
raise ValueError('corrcoef has incorrect dimension')
if not 'maxpts' in kwds:
if n >2:
kwds['maxpts'] = 10000*n
lowinf = np.isneginf(lower)
uppinf = np.isposinf(upper)
infin = 2.0*np.ones(n)
np.putmask(infin,lowinf,0)# infin.putmask(0,lowinf)
np.putmask(infin,uppinf,1) #infin.putmask(1,uppinf)
#this has to be last
np.putmask(infin,lowinf*uppinf,-1)
error, cdfvalue, inform = sp.stats.mvn.mvndst(lower,upper,infin,correl,**kwds)
if inform:
print('something wrong', informcode[inform], error)
return cdfvalue
def mvnormcdf(upper, mu, cov, lower=None, **kwds):
upper = np.array(upper)
if lower is None:
lower = -np.ones(upper.shape) * np.inf
else:
lower = np.array(lower)
cov = np.array(cov)
stdev = np.sqrt(np.diag(cov)) # standard deviation vector
#do I need to make sure stdev is float and not int?
#is this correct to normalize to corr?
lower = (lower - mu)/stdev
upper = (upper - mu)/stdev
divrow = np.atleast_2d(stdev)
corr = cov/divrow/divrow.T
#v/np.sqrt(np.atleast_2d(np.diag(covv)))/np.sqrt(np.atleast_2d(np.diag(covv))).T
return mvstdnormcdf(lower, upper, corr, **kwds)
data = np.asarray([['factor', 'first', 'second', 'third', 'fourth'],
['first', 1.00, 0.50, -0.50, 0.50],
['second', 0.50, 1.0, 0.25, 0.00],
['third', -0.50, 0.25, 1.0, -0.25],
['fourth', 0.50, 0.00, -0.25, 1.00]])
s = pd.DataFrame(data=data[1:,1:], index=data[1:,0], columns=data[0,1:])
s = s.astype('float')
print(s)
One target variable 'second'.
conditionals = s.columns.delete(1)
print('conditional variables {0}'.format(conditionals))
# create conditionals class
cop = GaussianCopulaConditional(s, conditionals)
# define condition
u_cond = 0.5 * np.ones((len(cop.targets), len(cop.conditionals)))
print(u_cond)
# define single point u
u = np.array([[0.2]])
print(u)
# compute conditional CDF
ccdf = cop.cdf(u, u_cond)
print('--- conditional CDF ---')
print(ccdf)
# compute conditional PDF
cpdf = cop.pdf(u, u_cond)
print('--- conditional PDF ---')
print(cpdf)
# define points u
u = np.array([[0.2], [0.5], [0.8]])
print(u)
# compute conditional CDF
ccdf = cop.cdf(u, u_cond)
print('--- conditional CDF ---')
print(ccdf)
# compute conditional PDF
cpdf = cop.pdf(u, u_cond)
print('--- conditional PDF ---')
print(cpdf)