Source code for crystal.crystal

"""
Model clustered, correlated data.
"""

import sys
import toolshed as ts
from itertools import izip
import copy
import time
import re

import pandas as pd
import numpy as np
import scipy.stats as ss
import patsy

from scipy.stats import norm
from numpy.linalg import cholesky as chol, lstsq

from statsmodels.api import GEE, GLM, MixedLM, RLM, GLS, OLS, GLSAR
from statsmodels.genmod.cov_struct import Exchangeable, Independence
from statsmodels.genmod.families import (Gaussian, Poisson,
        NegativeBinomial as NB)
from statsmodels.discrete.discrete_model import NegativeBinomial
from statsmodels.tools.sm_exceptions import PerfectSeparationError

def long_covs(covs, methylation, **kwargs):
    covs['id'] = ['id_%i' % i for i in range(len(covs))]
    cov_rep = pd.concat((covs for i in range(len(methylation))))
    #nr, nc = methylation.shape
    cov_rep['CpG'] = np.repeat(['CpG_%i' % i for i in
        range(methylation.shape[0])], methylation.shape[1])

    #cov_rep['CpG'] = np.repeat(['CpG_%i' % i for i in range(nr)], nc)
    cov_rep['methylation'] = np.concatenate(methylation)
    for k, arr in kwargs.items():
        assert arr.shape == methylation.shape
        cov_rep[k] = np.concatenate(arr)
    cov_rep.index = np.arange(len(cov_rep), dtype=int)
    return cov_rep

def corr(methylations):
    if len(methylations) == 0: return np.nan
    if len(methylations) == 1: return [[1]]
    c = np.abs(ss.spearmanr(methylations.T))
    if len(c.shape) == 1:
        assert len(methylations) == 2
        return np.array([[1, c[0]], [c[0], 1]])
    return c[0]

def get_ptc(fit, coef):
    if isinstance(fit, list):
        result, res = [get_ptc(f, coef) for f in fit], {}
        for c in ('p', 't', 'coef'):
            res[c] = np.array([r[c] for r in result])
        res['covar'] = result[0]['covar']
        return res

    idx = [i for i, par in enumerate(fit.model.exog_names)
                        if par.startswith(coef)]
    assert len(idx) == 1, ("too many params like", coef)
    try:
        return {'p': fit.pvalues[idx[0]],
            't': fit.tvalues[idx[0]],
            'coef': fit.params[idx[0]],
            'covar': fit.model.exog_names[idx[0]]}
    except (ValueError, np.linalg.linalg.LinAlgError) as e:
        return dict(p=np.nan, t=np.nan, coef=np.nan,
                covar=fit.model.exog_names[idx[0]])


[docs]def one_cluster(formula, feature, covs, coef, method=OLS,
                _pat=re.compile("\+\s*CpG")):
    """used when we have a "cluster" with 1 probe."""
    c = covs.copy()
    # remove the CpG in the formula
    formula = _pat.sub("", formula)
    if isinstance(feature, CountFeature):
        c['methylation'] = feature.methylated
        c['counts'] = feature.counts
        c = c[c['counts'] > 0]
        try:
            return get_ptc(GLM.from_formula(formula, data=c,
                                        exposure=c['counts'],
                                        family=Poisson()).fit(), coef)
        except PerfectSeparationError:
            return dict(p=np.nan, t=np.nan, coef=np.nan, covar=coef)
    else:
        c['methylation'] = feature.values
        res = method.from_formula(formula, data=c).fit()
        return get_ptc(res, coef)


[docs]def nb_cluster(formula, cluster, covs, coef):
    """Model a cluster of correlated features with the negative binomial"""
    methylated = np.array([f.methylated for f in cluster])
    counts = np.array([f.counts for f in cluster])
    try:
        res = [NegativeBinomial.from_formula(formula, covs, offset=np.log(count))\
               .fit(disp=0) for methylation, count in izip(methylated, counts)]
    except:
        return dict(t=np.nan, coef=np.nan, covar="NA", p=np.nan, corr=np.nan)

    methylations = methylated / counts # for correlation below.

    r = get_ptc(res, coef)
    nan = np.isnan(r['p'])
    r['p'] = zscore_combine(r['p'][~nan], corr(methylations[~nan]))
    r['t'], r['coef'] = np.mean(r['t']), np.mean(r['coef'])
    return r

[docs]def gee_cluster(formula, cluster, covs, coef, cov_struct=Exchangeable(),
        family=Gaussian()):
    """An example of a `model_fn`; any function with a similar signature
    can be used.

    Parameters
    ----------

    formula : str
        R (patsy) style formula. Must contain 'methylation': e.g.:
        methylation ~ age + gender + race

    cluster : list of Features
        cluster of features from clustering or a region.
        most functions will create a methylation matrix with:
        >> meth = np.array([f.values for f in features])

    covs : pandas.DataFrame
        Contains covariates from `formula`

    coef: str
        coefficient of interest, e.g. 'age'

    cov_struct: object
        one of the covariance structures provided by statsmodels.
        Likely either Exchangeable() or Independence()

    family: object
        one of the familyies provided by statsmodels. If Guassian(),
        then methylation is assumed to be count-based (clusters of
        CountFeatures.

    Returns
    -------

    result : dict
        dict with values (keys) of at least p-value ('p'), coefficient
        estimate ('coef') and any other information desired.
    """
    if isinstance(family, Gaussian):
        cov_rep = long_covs(covs, np.array([f.values for f in cluster]))
        res = GEE.from_formula(formula, groups=cov_rep['id'], data=cov_rep,
            cov_struct=cov_struct, family=family).fit()
    elif isinstance(family, (NB, Poisson)):
        cov_rep = long_covs(covs, np.array([f.methylated for f in cluster]),
                counts = np.array([f.counts for f in cluster]))
        res = GEE.from_formula(formula, groups=cov_rep['id'], data=cov_rep,
                               cov_struct=cov_struct, family=family,
                               offset=np.log(cov_rep['counts'])).fit()
    else:
        raise Exception("Only gaussian and poisson are supported")

    return get_ptc(res, coef)

# see: https://gist.github.com/josef-pkt/89585d0b084739a4ed1c
[docs]def ols_cluster_robust(formula, cluster, covs, coef):
    """Model clusters with cluster-robust OLS, same signature as
    :func:`~gee_cluster`"""
    cov_rep = long_covs(covs, np.array([f.values for f in cluster]))
    res = OLS.from_formula(formula, data=cov_rep).fit(cov_type='cluster',
            cov_kwds=dict(groups=cov_rep['id']))
    return get_ptc(res, coef)

def glsar_cluster(formula, cluster, covs, coef, rho=6):
    cov_rep = long_covs(covs, np.array([f.values for f in cluster]))
    # group by id, then sort by CpG so that AR is to adjacent CpGs
    cov_rep.sort(['id', 'CpG'], inplace=True)
    res = GLSAR.from_formula(formula, data=cov_rep, rho=rho).iterative_fit(maxiter=5)
    return get_ptc(res, coef)

[docs]def mixed_model_cluster(formula, cluster, covs, coef):
    """Model clusters with a mixed-model, same signature as
    :func:`~gee_cluster`"""
    cov_rep = long_covs(covs, np.array([f.values for f in cluster]))
    # TODO: remove this once newer version of statsmodels is out.
    # speeds convergence by using fixed estimates from OLS
    params = OLS.from_formula(formula, data=cov_rep).fit().params

    res = MixedLM.from_formula(formula, groups='id',
            data=cov_rep).fit(start_params=dict(fe=params), reml=False,
                    method='bfgs')

    return get_ptc(res, coef)

def zscore_combine(pvals, sigma, mid=np.mean):
    if np.all(np.isnan(sigma)): return np.nan
    pvals[pvals == 1] = 1.0 - 9e-16
    z, L = mid(norm.isf(pvals)), len(pvals)
    sz = 1.0 / L * np.sqrt(L + 2 * np.tril(sigma, k=-1).sum())
    return norm.sf(z / sz)

def _combine_cluster(formula, methylations, covs, coef, method,
        _corr=True, method_kwargs=None):
    """function called by z-score to get pvalues"""
    if method_kwargs is None: method_kwargs = {}
    assert not 'methylation' in covs
    res = [method.from_formula(formula, covs, **method_kwargs).fit()
            for methylation in methylations]
    idx = [i for i, par in enumerate(res[0].model.exog_names)
                   if par.startswith(coef)][0]
    pvals = np.array([r.pvalues[idx] for r in res], dtype=np.float64)
    pvals[pvals == 1] = 1.0 - 9e-16
    res = dict(t=np.array([r.tvalues[idx] for r in res]),
                coef=np.array([r.params[idx] for r in res]),
                covar=res[0].model.exog_names[idx],
                p=pvals[~np.isnan(pvals)])
    # save some time for bumphunting where we don't need
    # the correlation.
    if _corr:
        res['corr'] = corr(methylations[~np.isnan(pvals)])
    return res

def beta_count_cluster(formula, cluster, covs, coef, mid=np.mean):
    from .betareg import Beta
    methylations = np.array([f.values for f in cluster])
    counts = np.array([f.counts for f in cluster])
    res = []
    for count, methylation in zip(counts, methylations):
        Z = np.column_stack((np.ones_like(count), count))
        Z = Z[Z[:, 1] > 0]
        try:
            res.append(get_ptc(Beta.from_formula(formula, covs, Z=Z).fit(),
                coef))
        except PerfectSeparationError:
            res.append(dict(t=np.nan, coef=np.nan, p=np.nan, covar=coef))
    pvals = np.array([r['p'] for r in res])
    cor = corr(methylations[~np.isnan(pvals)])
    r = {}
    r['p'] = zscore_combine(pvals, cor, mid=mid)
    r['t'], r['coef'] = (np.mean([rs['t'] for rs in res]),
                         np.mean([rs['coef'] for rs in res]))
    return r


[docs]def zscore_cluster(formula, cluster, covs, coef, method=OLS, method_kwargs=None, mid=np.mean):
    """Model clusters by fitting model at each site and then
    combining using the z-score method. Same signature as
    :func:`~gee_cluster`"""
    methylations = np.array([f.values for f in cluster])
    r = _combine_cluster(formula, methylations, covs, coef, method=method,
            method_kwargs=method_kwargs)
    r['p'] = zscore_combine(r['p'], r.pop('corr'), mid=mid)
    r['t'], r['coef'] = r['t'].mean(), r['coef'].mean()
    return r

# function for comparing with bump_cluster
# takes the return value of _combine_cluster and returns a single numeric value
def coef_sum(c, cutoff=0.01):
    coefs = c['coef']
    return sum(min(0, c + cutoff) if c < 0 else max(0, c - cutoff) for c in coefs)

# function for comparing with bump_cluster
def t_sum(c, cutoff=2):
    coefs = c['t']
    return sum(min(0, c + cutoff) if c < 0 else max(0, c - cutoff) for c in coefs)

# function for comparing with bump_cluster
def coef_t_prod(coefs):
    return np.median([coefs['t'][i] * coefs['coef'][i]
                        for i in range(len(coefs['coef']))])


def _cluster_coefs(formula, y, covs, coef):
    """Fast coefficient calculation for bumping."""
    X = patsy.dmatrix(formula.split("~")[-1], covs, return_type="dataframe")
    idx = [i for i, c in enumerate(X.columns) if c.startswith(coef)]
    assert len(idx) == 1
    x, resids, rank, s = lstsq(X, y.T)
    coefs = x[idx[0]]
    return coefs

[docs]def bump_cluster(formula, cluster, covs, coef, nsims=20000,
        value_fn=coef_sum, method=OLS):
    """Model clusters by fitting model at each site and then comparing some
    metric to the same metric from models fit to simulated data.
    Uses sequential Monte-carlo to stop once we know the simulated p-value is
    high (since we are always interested in low p-values).

    Same signature as :func:`~gee_cluster`
    """
    methylations = np.array([f.values for f in cluster])
    orig = _combine_cluster(formula, methylations, covs, coef, method=method)
    obs_coef = value_fn(orig)

    reduced_residuals, reduced_fitted = [], []

    # get the reduced residuals and models so we can shuffle
    for i, methylation in enumerate(methylations):
        y, X = patsy.dmatrices(formula, covs, return_type='dataframe')
        idxs = [par for par in X.columns if par.startswith(coef)]
        assert len(idxs) == 1, ('too many coefficents like', coef)
        X.pop(idxs[0])
        fitr = method(y, X).fit()

        reduced_residuals.append(np.array(fitr.resid))
        reduced_fitted.append(np.array(fitr.fittedvalues))

    ngt, idxs = 0, np.arange(len(methylations[0]))

    for isim in range(nsims):
        np.random.shuffle(idxs)

        fakem = np.array([rf + rr[idxs] for rf, rr in izip(reduced_fitted,
            reduced_residuals)])
        assert fakem.shape == methylations.shape

        coefs = _cluster_coefs(formula, fakem, covs, coef)
        ccut = value_fn(dict(coef=coefs))

        #sim = _combine_cluster(formula, fakem, covs, coef, robust=robust,
        #        _corr=False)
        #ccut = value_fn(sim)

        ngt += abs(ccut) > abs(obs_coef)
        # sequential monte-carlo.
        if ngt > 6: break

    orig.pop('corr')
    # extra 1 in denom for 0-index
    orig['n_sim'] = isim + 1
    # extra 1 so we can't get 0 p-value
    orig['p'] = (1.0 + ngt) / (1.0 + orig['n_sim'])
    orig['coef'], orig['t'] = orig['coef'].mean(), orig['t'].mean()
    return orig

[docs]def wrapper(model_fn, formula, cluster, clin_df, coef, kwargs=None):
    """wrap the user-defined functions to return everything we expect and
    to call just OLS when there is a single probe."""
    if kwargs is None: kwargs = {}
    t = time.time()
    if len(cluster) > 1:
        r = model_fn(formula,
                cluster,
                clin_df,
                coef, **kwargs)
    else:
        r = one_cluster(formula, cluster[0], clin_df, coef,
                method=kwargs.get('method', OLS))
    r['time'] = time.time() - t
    r['chrom'] = cluster[0].chrom
    r['start'] = cluster[0].position - 1
    r['end'] = cluster[-1].position
    r['n_sites'] = len(cluster)
    r['sites'] = [c.spos for c in cluster]
    r['var'] = coef
    r['cluster'] = cluster
    return r

[docs]def model_clusters(clust_iter, clin_df, formula, coef, model_fn=gee_cluster,
        pool=None,
        transform=None,
        n_cpu=None,
        **kwargs):
    """For each cluster in an iterable, evaluate the chosen model and
    yield a dictionary of information

    Parameters
    ----------

    clust_iter : iterable
        iterable of clusters

    clin_df : pandas.DataFrame
        Contains covariates from `formula`

    formula : str
        R (patsy) style formula. Must contain 'methylation': e.g.:
        methylation ~ age + gender + race

    coef : str
        The coefficient of interest in the model, e.g. 'age'

    model_fn : fn
        A function with signature
        fn(formula, methylation, covs, coef, kwargs)
        that returns a dictionary with at least p-value and coef

    transform: fn
        A function that modifies the data before modeling.

    n_cpu : int

    kwargs: dict
        arguments sent to `model_fn`
    """
    try:
        clin_df.pop('methylation')
    except KeyError:
        pass
    if transform:
        tf = lambda cluster: cluster_transform(cluster, transform)

    for r in ts.pmap(wrapper, ((model_fn, formula,
                                tf(cluster) if transform else cluster,
                                clin_df, coef,
                                kwargs) for cluster in clust_iter), n_cpu,
                                p=pool):
        yield r

def cluster_transform(cluster, method):
    cluster = copy.deepcopy(cluster)
    for f in cluster:
        f.values = method(f.values)
    return cluster

[docs]class Feature(object):

    """
    A feature object that can and likely should be used by all programs that
    call `crystal`. Takes a chromosome, a position and a list of float
    values that are the methylation measurements (should be logit transformed).

    Attributes
    ----------

    chrom: str

    position: int

    values: list

    spos: str
        string position (chr1:12354)

    rho_min : float
        minimum spearman's R to be considered correlated

    ovalues : list
        other values potentially used by modeling functions
    """

    __slots__ = "chrom position values spos rho_min ovalues".split()

    def __init__(self, chrom, pos, values, ovalues=None, rho_min=0.5):
        self.spos = "%s:%i" % (chrom, pos)
        self.chrom, self.position, self.values = chrom, pos, np.asarray(values)
        self.rho_min = rho_min
        self.ovalues = ovalues

[docs]    def distance(self, other):
        """Distance between this feature and another."""
        if self.chrom != other.chrom: return sys.maxint
        return self.position - other.position

[docs]    def is_correlated(self, other):
        """Return boolean indicating  correlation with other."""
        rho, p = ss.spearmanr(self.values, other.values)
        return rho > self.rho_min

    def __repr__(self):
        return "{cls}({spos})".format(spos=self.spos,
                                        cls=self.__class__.__name__)

    def __cmp__(self, other):
        return cmp(self.chrom, other.chrom) or cmp(self.position,
                                                   other.position)

[docs]class CountFeature(Feature):

    """Feature Class that supports count data."""

    def __init__(self, chrom, pos, methylated, counts, rho_min=0.5):
        Feature.__init__(self, chrom, pos, methylated, counts, rho_min=rho_min)
        self.counts = counts
        # use ratios for correlation.
        self.values = methylated / counts.astype(float)
        self.methylated = methylated