Source code for cdr.util

from __future__ import print_function
import sys
import os
import re
import math
import pickle
import numpy as np
import pandas as pd
from scipy import linalg, special

file_re = re.compile('output_([^_]+)_f(\d+)_([^_]+).csv')


def stderr(s):
    sys.stderr.write(s)
    sys.stderr.flush()


def extract_cdr_prediction_data(dirpath, metric='mse'):
    twostep_ix = 0
    filetype_ix = 1
    response_ix = 3
    filenum_ix = 5
    partition_ix = 6

    parser = re.compile(
        '(LM_2STEP_)?'
        '(CDRpreds|squared_error|losses_mse|mse_losses|loglik|preds|preds_table|obs|output)'
        '(_(.+?))?(_f([0-9]+))?_([^_]*).(csv|txt)'
    )
    out = {}
    # Might be an ensemble, so check
    basename = os.path.basename(dirpath)
    parentdir = os.path.normpath(os.path.dirname(dirpath))
    modeldirs = []
    use_crossval = False
    folds = set()
    ensembleids = set()
    modeldir2fold = {}
    modeldir2ensembleid = {}
    for x in os.listdir(parentdir):
        if os.path.isdir(os.path.join(parentdir, x)):
            ensemble = re.match('\.m\d+', x[len(basename):])
            cv = re.match('\.CV[^.~]+~([^.~]+)(\.m(\d+))?', x[len(basename):])
            if x == basename or \
                    (x.startswith(basename) and (ensemble or cv)):
                x_full = os.path.join(parentdir, x)

                if cv:
                    fold = cv.group(1)
                    modeldir2fold[x] = fold
                    folds.add(fold)

                    ensembleid = cv.group(3)
                    modeldir2ensembleid[x] = ensembleid
                    ensembleids.add(ensembleid)

                    use_crossval = True

                modeldirs.append(x_full)
    folds = sorted(list(folds))
    ensembleids = sorted(list(ensembleids))
    if len(modeldirs) > 1: # Ensemble
        try:
            modeldirs.remove(os.path.normpath(dirpath))
        except ValueError:
            pass
    for modeldir in modeldirs:
        model_basename = os.path.basename(modeldir)
        for path in os.listdir(modeldir):
            parsed = parser.match(path)
            if parsed:
                parsed = parsed.groups()
                if parsed[twostep_ix]:
                    predtype = '2step'
                else:
                    predtype = 'direct'
                filetype = parsed[filetype_ix]
                if filetype in ['CDRpreds', 'preds_table', 'output']:
                    filetype = 'table'
                elif filetype in ['squared_error', 'losses_mse', 'mse_losses', 'obs', 'preds']:
                    filetype = 'mse'
                response = parsed[response_ix]
                if response is None:
                    response = 'y'
                filenum = parsed[filenum_ix]
                if filenum is None:
                    filenum = 0
                else:
                    filenum = int(filenum)
                partition = parsed[partition_ix]

                data_path = os.path.join(modeldir, path)

                if filetype == 'table':
                    a = pd.read_csv(
                        data_path,
                        sep=' ',
                        skipinitialspace=True
                    )
                    if metric == 'mse':
                        a = (a['CDRobs'] - a['CDRpreds']) ** 2
                    elif metric == 'corr':
                        a = a[['CDRobs', 'CDRpreds']]
                    elif metric == 'loglik':
                        a = a['CDRloglik']
                    else:
                        raise ValueError('Unrecognized metric: %s' % metric)
                else:
                    a = pd.read_csv(
                        data_path,
                        sep=' ',
                        header=None,
                        skipinitialspace=True
                    )

                if response not in out:
                    out[response] = {}
                if filenum not in out[response]:
                    out[response][filenum] = {}
                if partition not in out[response][filenum]:
                    out[response][filenum][partition] = {}
                if predtype not in out[response][filenum][partition]:
                    out[response][filenum][partition][predtype] = {}
                if model_basename in out[response][filenum][partition][predtype]:
                    raise ValueError('Found redundant results file: %s.' % data_path)
                else:
                    out[response][filenum][partition][predtype][model_basename] = a

    # Concatenate across crossval folds if needed
    if use_crossval:
        _out = {}
        for response in out:
            if response not in _out:
                _out[response] = {}
            for filenum in out[response]:
                if filenum not in _out[response]:
                    _out[response][filenum] = {}
                for partition in out[response][filenum]:
                    if partition not in _out[response][filenum]:
                        _out[response][filenum][partition] = {}
                    for predtype in out[response][filenum][partition]:
                        if predtype not in _out[response][filenum][partition]:
                            _out[response][filenum][partition][predtype] = {}
                        ensembles = {}
                        for model_basename in out[response][filenum][partition][predtype]:
                            ensembleid = modeldir2ensembleid[model_basename]
                            if ensembleid not in ensembles:
                                ensembles[ensembleid] = {}
                            fold = modeldir2fold[model_basename]
                            if fold not in ensembles[ensembleid]:
                                ensembles[ensembleid][fold] = out[response][filenum][partition][predtype][model_basename]
                            else:
                                raise ValueError('Found redundant results file: %s.' % model_basename)
                        for ensembleid in ensembleids:
                            outname = basename
                            if ensembleid is not None:
                                outname += '.m%s' % ensembleid
                            ensemble = []
                            for fold in folds:
                                ensemble.append(ensembles[ensembleid][fold])
                            ensemble = pd.concat(ensemble, axis=0).reset_index(drop=True)
                            _out[response][filenum][partition][predtype][outname] = ensemble

        out = _out

    return out


[docs] def names2ix(names, l, dtype=np.int32): """ Generate 1D numpy array of indices in **l** corresponding to names in **names** :param names: ``list`` of ``str``; names to look up in **l** :param l: ``list`` of ``str``; list of names from which to extract indices :param dtype: ``numpy`` dtype object; return dtype :return: ``numpy`` array; indices of **names** in **l** """ if type(names) is not list: names = [names] ix = [] for n in names: ix.append(l.index(n)) return np.array(ix, dtype=dtype)
[docs] def mse(true, preds): """ Compute mean squared error (MSE). :param true: True values :param preds: Predicted values :return: ``float``; MSE """ return ((true-preds)**2).mean()
[docs] def mae(true, preds): """ Compute mean absolute error (MAE). :param true: True values :param preds: Predicted values :return: ``float``; MAE """ return (true-preds).abs().mean()
[docs] def percent_variance_explained(true, preds): """ Compute percent variance explained. :param true: True values :param preds: Predicted values :return: ``float``; percent variance explained """ num = mse(true, preds) denom = np.std(true) ** 2 return max(0., (1 - num / denom) * 100)
[docs] def get_random_permutation(n): """ Draw a random permutation of integers 0 to **n**. Used to shuffle arrays of length **n**. For example, a permutation and its inverse can be generated by calling ``p, p_inv = get_random_permutation(n)``. To randomly shuffle an **n**-dimensional vector ``x``, call ``x[p]``. To un-shuffle ``x`` after it has already been shuffled, call ``x[p_inv]``. :param n: maximum value :return: 2-tuple of ``numpy`` arrays; the permutation and its inverse """ p = np.random.permutation(np.arange(n)) p_inv = np.zeros_like(p) p_inv[p] = np.arange(n) return p, p_inv
[docs] def sn(string): """ Compute a valid scope name version of a string. :param string: ``str``; input string :return: ``str``; transformed string """ return re.sub('[^A-Za-z0-9_.\\-/]', '.', string)
[docs] def reg_name(string): """ Standardize a variable name for regularization :param string: ``str``; input string :return: ``str``; transformed string """ name = string.split(':')[0] name = name.replace('/', '_') return name
[docs] def pca(X, n_dim=None, dtype=np.float32): """ Perform principal components analysis on a data table. :param X: ``numpy`` or ``pandas`` array; the input data :param n_dim: ``int`` or ``None``; maximum number of principal components. If ``None``, all components are retained. :param dtype: ``numpy`` dtype; return dtype :return: 5-tuple of ``numpy`` arrays; transformed data, eigenvectors, eigenvalues, input means, and input standard deviations """ X = np.array(X, dtype=dtype) assert len(X.shape) == 2, 'Wrong dimensionality for PCA (X must be rank 2).' means = X.mean(0, keepdims=True) sds = X.std(0, keepdims=True) X -= means X /= sds C = np.cov(X, rowvar=False) eigenval, eigenvec = linalg.eigh(C) sorted_id = np.argsort(eigenval)[::-1] eigenval = eigenval[sorted_id] eigenvec = eigenvec[:,sorted_id] if n_dim is not None and n_dim < eigenvec.shape[1]: eigenvec = eigenvec[:,:n_dim] Xpc = np.dot(X, eigenvec) return Xpc, eigenvec, eigenval, means, sds
def logsumexp(a): return np.exp(a - special.logsumexp(a))
[docs] def nested(model_name_1, model_name_2): """ Check whether two CDR models are nested with 1 degree of freedom :param model_name_1: ``str``; name of first model :param model_name_2: ``str``; name of second model :return: ``bool``; ``True`` if models are nested with 1 degree of freedom, ``False`` otherwise """ split = (model_name_1.split('!'), model_name_2.split('!')) m_base = [x[0] for x in split] m_ablated = [set(x[1:]) for x in split] a = 0 if len(m_ablated[0]) < len(m_ablated[1]) else 1 b = 1 - a return m_base[a] == m_base[b] and len(m_ablated[b] - m_ablated[a]) == 1 and len(m_ablated[a] - m_ablated[b]) == 0
[docs] def filter_names(names, filters): """ Return elements of **names** permitted by **filters**, preserving order in which filters were matched. Filters can be ordinary strings, regular expression objects, or string representations of regular expressions. For a regex filter to be considered a match, the expression must entirely match the name. :param names: ``list`` of ``str``; pool of names to filter. :param filters: ``list`` of ``{str, SRE_Pattern}``; filters to apply in order :return: ``list`` of ``str``; names in **names** that pass at least one filter """ filters_regex = [re.compile(f if f.endswith('$') else f + '$') for f in filters] out = [] for i in range(len(filters)): filter = filters[i] filter_regex = filters_regex[i] for name in names: if name not in out: if name == filter: out.append(name) elif filter_regex.match(name): out.append(name) return out
[docs] def filter_models(names, filters=None, cdr_only=False): """ Return models contained in **names** that are permitted by **filters**, preserving order in which filters were matched. Filters can be ordinary strings, regular expression objects, or string representations of regular expressions. For a regex filter to be considered a match, the expression must entirely match the name. If ``filters`` is zero-length, returns **names**. :param names: ``list`` of ``str``; pool of model names to filter. :param filters: ``list`` of ``{str, SRE_Pattern}`` or ``None``; filters to apply in order. If ``None``, no additional filters. :param cdr_only: ``bool``; if ``True``, only returns CDR models. If ``False``, returns all models admitted by **filters**. :return: ``list`` of ``str``; names in **names** that pass at least one filter, or all of **names** if no filters are applied. """ if cdr_only: names = [name for name in names if not (name.startswith('LM') or name.startswith('GAM'))] if filters is None: filters = [] if len(filters) > 0: out = filter_names(names, filters) else: out = names return out
def get_partition_list(partition): if not isinstance(partition, list): partition = partition.strip().split() if ':' in partition[0]: partition = partition[0].split(':') else: if len(partition) == 1: partition = partition[0].split('-') if len(partition) == 1: partition = partition[0].split('+') return partition def paths_from_partition_cliarg(partition, config): partition = get_partition_list(partition) X_paths = [] y_paths = [] X_map = { 'train': config.X_train, 'dev': config.X_dev, 'test': config.X_test } Y_map = { 'train': config.Y_train, 'dev': config.Y_dev, 'test': config.Y_test } for p in partition: _X_paths = X_map[p] _y_paths = Y_map[p] assert _X_paths is not None, 'No data path provided for partition %s. If you did not intend to require ' + \ 'dev evaluation, set ``eval_freq`` to ``0`` in your model config.' % p assert _y_paths is not None, 'No data path provided for partition %s. If you did not intend to require ' + \ 'dev evaluation, set ``eval_freq`` to ``0`` in your model config.' % p for _X_path in _X_paths: if _X_path not in X_paths: X_paths.append(_X_path) for _y_path in _y_paths: if _y_path not in y_paths: y_paths.append(_y_path) return X_paths, y_paths def get_irf_name(x, irf_name_map): k = None for y in sorted(list(irf_name_map.keys())): if y == x: k = y elif y in x: if k is None or len(y) > len(k): k = y if k is None: out = x else: out = irf_name_map[k] return out def get_numerical_sd(sd, in_dim=1, out_dim=1): in_dim = float(int(in_dim)) out_dim = float(int(out_dim)) if isinstance(sd, str): if sd.lower().startswith('xavier') or sd.lower().startswith('glorot'): factor = sd[6:] if factor: factor = int(factor) else: factor = 1 out = math.sqrt(2. / (in_dim + out_dim)) elif sd.lower().startswith('he'): factor = sd[2:] if factor: factor = int(factor) else: factor = 1 out = math.sqrt(2. / in_dim) else: raise ValueError('Unrecognized variance initializer: %s' % sd) else: factor = 1 out = sd out *= factor return out
[docs] def load_cdr(dir_path, suffix=''): """ Convenience method for reconstructing a saved CDR object. First loads in metadata from ``m.obj``, then uses that metadata to construct the computation graph. Then, if saved weights are found, these are loaded into the graph. :param dir_path: Path to directory containing the CDR checkpoint files. :param suffix: ``str``; file suffix. :return: The loaded CDR instance. """ with open(dir_path + '/m%s.obj' % suffix, 'rb') as f: m = pickle.load(f) m.build(outdir=dir_path, verbose=False) m.load(outdir=dir_path, suffix=suffix) return m
def flatten_dict(d, keys=None): if keys is None: keys = [] out = [] for k in sorted(d.keys()): v = d[k] if isinstance(v, dict): out += flatten_dict(v, keys + [k]) else: out.append((tuple(keys + [k]), v)) return out def pretty_print_time(s): days, s = divmod(s, 24*60*60) hours, s = divmod(s, 60*60) minutes, s = divmod(s, 60) seconds = s vals = [] for i, val in enumerate((days, hours, minutes, seconds)): if i == 0: suffix = 'd' elif i == 1: suffix = 'h' elif i == 2: suffix = 'm' else: suffix = 's' if len(vals): vals.append(('%02d%s' % (val, suffix))) elif val > 0: vals.append('%d%s' % (val, suffix)) if len(vals): out = '-'.join(vals) else: out = '0' return out