"""
Model clustered, correlated data.
"""
import sys
import toolshed as ts
from itertools import izip
import copy
import time
import re
import pandas as pd
import numpy as np
import scipy.stats as ss
import patsy
from scipy.stats import norm
from numpy.linalg import cholesky as chol, lstsq
from statsmodels.api import GEE, GLM, MixedLM, RLM, GLS, OLS, GLSAR
from statsmodels.genmod.cov_struct import Exchangeable, Independence
from statsmodels.genmod.families import (Gaussian, Poisson,
NegativeBinomial as NB)
from statsmodels.discrete.discrete_model import NegativeBinomial
from statsmodels.tools.sm_exceptions import PerfectSeparationError
def long_covs(covs, methylation, **kwargs):
covs['id'] = ['id_%i' % i for i in range(len(covs))]
cov_rep = pd.concat((covs for i in range(len(methylation))))
#nr, nc = methylation.shape
cov_rep['CpG'] = np.repeat(['CpG_%i' % i for i in
range(methylation.shape[0])], methylation.shape[1])
#cov_rep['CpG'] = np.repeat(['CpG_%i' % i for i in range(nr)], nc)
cov_rep['methylation'] = np.concatenate(methylation)
for k, arr in kwargs.items():
assert arr.shape == methylation.shape
cov_rep[k] = np.concatenate(arr)
cov_rep.index = np.arange(len(cov_rep), dtype=int)
return cov_rep
def corr(methylations):
if len(methylations) == 0: return np.nan
if len(methylations) == 1: return [[1]]
c = np.abs(ss.spearmanr(methylations.T))
if len(c.shape) == 1:
assert len(methylations) == 2
return np.array([[1, c[0]], [c[0], 1]])
return c[0]
def get_ptc(fit, coef):
if isinstance(fit, list):
result, res = [get_ptc(f, coef) for f in fit], {}
for c in ('p', 't', 'coef'):
res[c] = np.array([r[c] for r in result])
res['covar'] = result[0]['covar']
return res
idx = [i for i, par in enumerate(fit.model.exog_names)
if par.startswith(coef)]
assert len(idx) == 1, ("too many params like", coef)
try:
return {'p': fit.pvalues[idx[0]],
't': fit.tvalues[idx[0]],
'coef': fit.params[idx[0]],
'covar': fit.model.exog_names[idx[0]]}
except (ValueError, np.linalg.linalg.LinAlgError) as e:
return dict(p=np.nan, t=np.nan, coef=np.nan,
covar=fit.model.exog_names[idx[0]])
[docs]def one_cluster(formula, feature, covs, coef, method=OLS,
_pat=re.compile("\+\s*CpG")):
"""used when we have a "cluster" with 1 probe."""
c = covs.copy()
# remove the CpG in the formula
formula = _pat.sub("", formula)
if isinstance(feature, CountFeature):
c['methylation'] = feature.methylated
c['counts'] = feature.counts
c = c[c['counts'] > 0]
try:
return get_ptc(GLM.from_formula(formula, data=c,
exposure=c['counts'],
family=Poisson()).fit(), coef)
except PerfectSeparationError:
return dict(p=np.nan, t=np.nan, coef=np.nan, covar=coef)
else:
c['methylation'] = feature.values
res = method.from_formula(formula, data=c).fit()
return get_ptc(res, coef)
[docs]def nb_cluster(formula, cluster, covs, coef):
"""Model a cluster of correlated features with the negative binomial"""
methylated = np.array([f.methylated for f in cluster])
counts = np.array([f.counts for f in cluster])
try:
res = [NegativeBinomial.from_formula(formula, covs, offset=np.log(count))\
.fit(disp=0) for methylation, count in izip(methylated, counts)]
except:
return dict(t=np.nan, coef=np.nan, covar="NA", p=np.nan, corr=np.nan)
methylations = methylated / counts # for correlation below.
r = get_ptc(res, coef)
nan = np.isnan(r['p'])
r['p'] = zscore_combine(r['p'][~nan], corr(methylations[~nan]))
r['t'], r['coef'] = np.mean(r['t']), np.mean(r['coef'])
return r
[docs]def gee_cluster(formula, cluster, covs, coef, cov_struct=Exchangeable(),
family=Gaussian()):
"""An example of a `model_fn`; any function with a similar signature
can be used.
Parameters
----------
formula : str
R (patsy) style formula. Must contain 'methylation': e.g.:
methylation ~ age + gender + race
cluster : list of Features
cluster of features from clustering or a region.
most functions will create a methylation matrix with:
>> meth = np.array([f.values for f in features])
covs : pandas.DataFrame
Contains covariates from `formula`
coef: str
coefficient of interest, e.g. 'age'
cov_struct: object
one of the covariance structures provided by statsmodels.
Likely either Exchangeable() or Independence()
family: object
one of the familyies provided by statsmodels. If Guassian(),
then methylation is assumed to be count-based (clusters of
CountFeatures.
Returns
-------
result : dict
dict with values (keys) of at least p-value ('p'), coefficient
estimate ('coef') and any other information desired.
"""
if isinstance(family, Gaussian):
cov_rep = long_covs(covs, np.array([f.values for f in cluster]))
res = GEE.from_formula(formula, groups=cov_rep['id'], data=cov_rep,
cov_struct=cov_struct, family=family).fit()
elif isinstance(family, (NB, Poisson)):
cov_rep = long_covs(covs, np.array([f.methylated for f in cluster]),
counts = np.array([f.counts for f in cluster]))
res = GEE.from_formula(formula, groups=cov_rep['id'], data=cov_rep,
cov_struct=cov_struct, family=family,
offset=np.log(cov_rep['counts'])).fit()
else:
raise Exception("Only gaussian and poisson are supported")
return get_ptc(res, coef)
# see: https://gist.github.com/josef-pkt/89585d0b084739a4ed1c
[docs]def ols_cluster_robust(formula, cluster, covs, coef):
"""Model clusters with cluster-robust OLS, same signature as
:func:`~gee_cluster`"""
cov_rep = long_covs(covs, np.array([f.values for f in cluster]))
res = OLS.from_formula(formula, data=cov_rep).fit(cov_type='cluster',
cov_kwds=dict(groups=cov_rep['id']))
return get_ptc(res, coef)
def glsar_cluster(formula, cluster, covs, coef, rho=6):
cov_rep = long_covs(covs, np.array([f.values for f in cluster]))
# group by id, then sort by CpG so that AR is to adjacent CpGs
cov_rep.sort(['id', 'CpG'], inplace=True)
res = GLSAR.from_formula(formula, data=cov_rep, rho=rho).iterative_fit(maxiter=5)
return get_ptc(res, coef)
[docs]def mixed_model_cluster(formula, cluster, covs, coef):
"""Model clusters with a mixed-model, same signature as
:func:`~gee_cluster`"""
cov_rep = long_covs(covs, np.array([f.values for f in cluster]))
# TODO: remove this once newer version of statsmodels is out.
# speeds convergence by using fixed estimates from OLS
params = OLS.from_formula(formula, data=cov_rep).fit().params
res = MixedLM.from_formula(formula, groups='id',
data=cov_rep).fit(start_params=dict(fe=params), reml=False,
method='bfgs')
return get_ptc(res, coef)
def zscore_combine(pvals, sigma, mid=np.mean):
if np.all(np.isnan(sigma)): return np.nan
pvals[pvals == 1] = 1.0 - 9e-16
z, L = mid(norm.isf(pvals)), len(pvals)
sz = 1.0 / L * np.sqrt(L + 2 * np.tril(sigma, k=-1).sum())
return norm.sf(z / sz)
def _combine_cluster(formula, methylations, covs, coef, method,
_corr=True, method_kwargs=None):
"""function called by z-score to get pvalues"""
if method_kwargs is None: method_kwargs = {}
assert not 'methylation' in covs
res = [method.from_formula(formula, covs, **method_kwargs).fit()
for methylation in methylations]
idx = [i for i, par in enumerate(res[0].model.exog_names)
if par.startswith(coef)][0]
pvals = np.array([r.pvalues[idx] for r in res], dtype=np.float64)
pvals[pvals == 1] = 1.0 - 9e-16
res = dict(t=np.array([r.tvalues[idx] for r in res]),
coef=np.array([r.params[idx] for r in res]),
covar=res[0].model.exog_names[idx],
p=pvals[~np.isnan(pvals)])
# save some time for bumphunting where we don't need
# the correlation.
if _corr:
res['corr'] = corr(methylations[~np.isnan(pvals)])
return res
def beta_count_cluster(formula, cluster, covs, coef, mid=np.mean):
from .betareg import Beta
methylations = np.array([f.values for f in cluster])
counts = np.array([f.counts for f in cluster])
res = []
for count, methylation in zip(counts, methylations):
Z = np.column_stack((np.ones_like(count), count))
Z = Z[Z[:, 1] > 0]
try:
res.append(get_ptc(Beta.from_formula(formula, covs, Z=Z).fit(),
coef))
except PerfectSeparationError:
res.append(dict(t=np.nan, coef=np.nan, p=np.nan, covar=coef))
pvals = np.array([r['p'] for r in res])
cor = corr(methylations[~np.isnan(pvals)])
r = {}
r['p'] = zscore_combine(pvals, cor, mid=mid)
r['t'], r['coef'] = (np.mean([rs['t'] for rs in res]),
np.mean([rs['coef'] for rs in res]))
return r
[docs]def zscore_cluster(formula, cluster, covs, coef, method=OLS, method_kwargs=None, mid=np.mean):
"""Model clusters by fitting model at each site and then
combining using the z-score method. Same signature as
:func:`~gee_cluster`"""
methylations = np.array([f.values for f in cluster])
r = _combine_cluster(formula, methylations, covs, coef, method=method,
method_kwargs=method_kwargs)
r['p'] = zscore_combine(r['p'], r.pop('corr'), mid=mid)
r['t'], r['coef'] = r['t'].mean(), r['coef'].mean()
return r
# function for comparing with bump_cluster
# takes the return value of _combine_cluster and returns a single numeric value
def coef_sum(c, cutoff=0.01):
coefs = c['coef']
return sum(min(0, c + cutoff) if c < 0 else max(0, c - cutoff) for c in coefs)
# function for comparing with bump_cluster
def t_sum(c, cutoff=2):
coefs = c['t']
return sum(min(0, c + cutoff) if c < 0 else max(0, c - cutoff) for c in coefs)
# function for comparing with bump_cluster
def coef_t_prod(coefs):
return np.median([coefs['t'][i] * coefs['coef'][i]
for i in range(len(coefs['coef']))])
def _cluster_coefs(formula, y, covs, coef):
"""Fast coefficient calculation for bumping."""
X = patsy.dmatrix(formula.split("~")[-1], covs, return_type="dataframe")
idx = [i for i, c in enumerate(X.columns) if c.startswith(coef)]
assert len(idx) == 1
x, resids, rank, s = lstsq(X, y.T)
coefs = x[idx[0]]
return coefs
[docs]def bump_cluster(formula, cluster, covs, coef, nsims=20000,
value_fn=coef_sum, method=OLS):
"""Model clusters by fitting model at each site and then comparing some
metric to the same metric from models fit to simulated data.
Uses sequential Monte-carlo to stop once we know the simulated p-value is
high (since we are always interested in low p-values).
Same signature as :func:`~gee_cluster`
"""
methylations = np.array([f.values for f in cluster])
orig = _combine_cluster(formula, methylations, covs, coef, method=method)
obs_coef = value_fn(orig)
reduced_residuals, reduced_fitted = [], []
# get the reduced residuals and models so we can shuffle
for i, methylation in enumerate(methylations):
y, X = patsy.dmatrices(formula, covs, return_type='dataframe')
idxs = [par for par in X.columns if par.startswith(coef)]
assert len(idxs) == 1, ('too many coefficents like', coef)
X.pop(idxs[0])
fitr = method(y, X).fit()
reduced_residuals.append(np.array(fitr.resid))
reduced_fitted.append(np.array(fitr.fittedvalues))
ngt, idxs = 0, np.arange(len(methylations[0]))
for isim in range(nsims):
np.random.shuffle(idxs)
fakem = np.array([rf + rr[idxs] for rf, rr in izip(reduced_fitted,
reduced_residuals)])
assert fakem.shape == methylations.shape
coefs = _cluster_coefs(formula, fakem, covs, coef)
ccut = value_fn(dict(coef=coefs))
#sim = _combine_cluster(formula, fakem, covs, coef, robust=robust,
# _corr=False)
#ccut = value_fn(sim)
ngt += abs(ccut) > abs(obs_coef)
# sequential monte-carlo.
if ngt > 6: break
orig.pop('corr')
# extra 1 in denom for 0-index
orig['n_sim'] = isim + 1
# extra 1 so we can't get 0 p-value
orig['p'] = (1.0 + ngt) / (1.0 + orig['n_sim'])
orig['coef'], orig['t'] = orig['coef'].mean(), orig['t'].mean()
return orig
[docs]def wrapper(model_fn, formula, cluster, clin_df, coef, kwargs=None):
"""wrap the user-defined functions to return everything we expect and
to call just OLS when there is a single probe."""
if kwargs is None: kwargs = {}
t = time.time()
if len(cluster) > 1:
r = model_fn(formula,
cluster,
clin_df,
coef, **kwargs)
else:
r = one_cluster(formula, cluster[0], clin_df, coef,
method=kwargs.get('method', OLS))
r['time'] = time.time() - t
r['chrom'] = cluster[0].chrom
r['start'] = cluster[0].position - 1
r['end'] = cluster[-1].position
r['n_sites'] = len(cluster)
r['sites'] = [c.spos for c in cluster]
r['var'] = coef
r['cluster'] = cluster
return r
[docs]def model_clusters(clust_iter, clin_df, formula, coef, model_fn=gee_cluster,
pool=None,
transform=None,
n_cpu=None,
**kwargs):
"""For each cluster in an iterable, evaluate the chosen model and
yield a dictionary of information
Parameters
----------
clust_iter : iterable
iterable of clusters
clin_df : pandas.DataFrame
Contains covariates from `formula`
formula : str
R (patsy) style formula. Must contain 'methylation': e.g.:
methylation ~ age + gender + race
coef : str
The coefficient of interest in the model, e.g. 'age'
model_fn : fn
A function with signature
fn(formula, methylation, covs, coef, kwargs)
that returns a dictionary with at least p-value and coef
transform: fn
A function that modifies the data before modeling.
n_cpu : int
kwargs: dict
arguments sent to `model_fn`
"""
try:
clin_df.pop('methylation')
except KeyError:
pass
if transform:
tf = lambda cluster: cluster_transform(cluster, transform)
for r in ts.pmap(wrapper, ((model_fn, formula,
tf(cluster) if transform else cluster,
clin_df, coef,
kwargs) for cluster in clust_iter), n_cpu,
p=pool):
yield r
def cluster_transform(cluster, method):
cluster = copy.deepcopy(cluster)
for f in cluster:
f.values = method(f.values)
return cluster
[docs]class Feature(object):
"""
A feature object that can and likely should be used by all programs that
call `crystal`. Takes a chromosome, a position and a list of float
values that are the methylation measurements (should be logit transformed).
Attributes
----------
chrom: str
position: int
values: list
spos: str
string position (chr1:12354)
rho_min : float
minimum spearman's R to be considered correlated
ovalues : list
other values potentially used by modeling functions
"""
__slots__ = "chrom position values spos rho_min ovalues".split()
def __init__(self, chrom, pos, values, ovalues=None, rho_min=0.5):
self.spos = "%s:%i" % (chrom, pos)
self.chrom, self.position, self.values = chrom, pos, np.asarray(values)
self.rho_min = rho_min
self.ovalues = ovalues
[docs] def distance(self, other):
"""Distance between this feature and another."""
if self.chrom != other.chrom: return sys.maxint
return self.position - other.position
def __repr__(self):
return "{cls}({spos})".format(spos=self.spos,
cls=self.__class__.__name__)
def __cmp__(self, other):
return cmp(self.chrom, other.chrom) or cmp(self.position,
other.position)
[docs]class CountFeature(Feature):
"""Feature Class that supports count data."""
def __init__(self, chrom, pos, methylated, counts, rho_min=0.5):
Feature.__init__(self, chrom, pos, methylated, counts, rho_min=rho_min)
self.counts = counts
# use ratios for correlation.
self.values = methylated / counts.astype(float)
self.methylated = methylated