# Source code for calfcv.calfcv

"""
The Calf and CalfCV classifiers.

Calf implements a Coarse Approximation Linear Function. [1]
CalfCV optimizes weight selection through cross validation.

References
========================
[1] Jeffries, C.D., Ford, J.R., Tilson, J.L. et al.
A greedy regression algorithm with coarse weights offers novel advantages.
Sci Rep 12, 5440 (2022). https://doi.org/10.1038/s41598-022-09415-2

"""
import time

import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import minmax_scale
from sklearn.utils.multiclass import unique_labels
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted


def predict(X, w):
    """ Compute the linear score of every sample

    Each feature column of X is multiplied by its weight and the
    products are summed across the columns of each row.

    Arguments:
        X : array-like, shape (n_samples, n_features)
            The input features and samples
        w : array-like of weights, broadcast against the columns of X

    Returns:
        y_pred : array of shape (n_samples,), the weighted row sums

    """
    weighted = np.multiply(X, w)
    return np.sum(weighted, axis=1)


def hv_candidate(X, y, w, c):
    """ Score the vetted weights extended by one candidate weight.

    Arguments:
        X : array-like, shape (n_samples, n_features)
            The training input features and samples.  It must have exactly
            one more column than there are vetted weights.
        y : ground truth vector
        w : list of vetted weights
        c : scalar candidate weight for the last column of X

    Returns:
        auc : the ROC AUC of the predictions made with w + [c], or 0 when
            roc_auc_score raises ValueError

    """
    assert X.shape[1] - 1 == len(w), "X or w have the wrong shape"
    scores = predict(X, w + [c])
    try:
        return roc_auc_score(y_true=y, y_score=scores)
    except ValueError:
        # the AUC could not be computed; rank this candidate last
        return 0


def hv_max(X, y, w, c):
    """Pick the candidate weight that gives the highest auc when added to w

    Arguments:
        X : array-like, shape (n_samples, n_features)
            The training input features and samples
        y : ground truth vector
        w : vetted weights
        c : list of candidate weights

    Returns:
        (auc, v) : the best auc and the grid point v that produced it.
            Ties on auc are resolved in favor of the larger v because the
            (auc, v) tuples are compared as a whole.

    """
    scored = []
    for v in c:
        scored.append((hv_candidate(X, y, w, v), v))

    # descending order puts the winning (auc, v) pair first
    scored.sort(reverse=True)
    return scored[0]


# noinspection PyAttributeOutsideInit,PyUnresolvedReferences
def fit_hv(X, y, grid):
    """ Greedily choose one grid weight per feature, left to right

    Features are visited in column order.  For each one, the best
    candidate weight from grid is found given the weights already chosen.
    The weight is kept only if its auc is strictly greater than every auc
    recorded so far; otherwise the feature gets weight 0.

    Arguments:
        X : array-like, shape (n_samples, n_features)
            The training input features and samples.
        y : ground truth vector
        grid : a list or array of candidate weights

    Returns:
        auc, w : the list of the best auc found at each feature (recorded
            even when the feature is rejected), and the list of weights

    """
    weights = []
    auc_trace = []
    for n_cols in range(1, X.shape[1] + 1):
        # granular approximation over the first n_cols features
        best_auc, best_w = hv_max(X[:, 0:n_cols], y, weights, grid)

        # a feature that does not raise the auc is skipped with weight 0
        skip = auc_trace and best_auc <= max(auc_trace)
        weights.append(0 if skip else best_w)
        auc_trace.append(best_auc)
    return auc_trace, weights


# noinspection PyAttributeOutsideInit
class Calf(ClassifierMixin, BaseEstimator):
    """ Coarse approximation linear function

    Calf fits a linear model with coefficients w = (w1, ..., wp)
    to maximize the AUC of the targets predicted by the linear function.

    Parameters
    ----------
    grid : the search grid.  Default is (-1, 1).

    verbose : 0 is silent.  1-3 are increasingly verbose

    Attributes
    ----------
    coef_ : array of shape (n_features, )
        Estimated coefficients for the linear fit problem.  Only
        one target should be passed, and this is a 1D array of length
        n_features.

    auc_ : array of shape (n_features, )
        The best auc found at each feature.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

    classes_ : list
        The unique class labels

    fit_time_ : float
        The number of seconds to fit X to y

    Notes
    -----
    The feature matrix must be centered at 0.  This can be accomplished with
    sklearn.preprocessing.StandardScaler, or similar.  No intercept is calculated.

    Decision scores are min-max scaled over the samples passed to
    decision_function, so the score of a sample depends on the other
    samples in the same call.

    Examples
    --------
    The example exercises Calf through CalfCV, which standardizes the
    features before fitting Calf.

    >>> import numpy
    >>> from calfcv import CalfCV
    >>> from sklearn.datasets import make_classification as mc
    >>> X, y = mc(n_features=2, n_redundant=0, n_informative=2, n_clusters_per_class=1, random_state=42)
    >>> numpy.round(X[0:3, :], 2)
    array([[ 1.23, -0.76],
           [ 0.7 , -1.38],
           [ 2.55,  2.5 ]])

    >>> y[0:3]
    array([0, 0, 1])

    >>> cls = CalfCV().fit(X, y)
    >>> cls.score(X, y)
    0.7

    >>> cls.best_coef_
    [1, 1]

    >>> numpy.round(cls.best_score_, 2)
    0.82

    >>> cls.fit_time_ > 0
    True

    >>> cls.predict(numpy.array([[3, 5]]))
    array([0])

    >>> cls.predict_proba(numpy.array([[3, 5]]))
    array([[1., 0.]])

    """

    def __init__(self, grid=(-1, 1), verbose=0):
        """ Initialize Calf"""
        # a single integer weight is wrapped so that the grid is iterable
        self.grid = [grid] if isinstance(grid, int) else grid
        # anything other than an integer 0-3 falls back to silent
        self.verbose = verbose if isinstance(verbose, int) and verbose in [0, 1, 2, 3] else 0

    def fit(self, X, y):
        """ fit Calf to X and y

        Arguments:
            X : array-like, shape (n_samples, n_features)
                The training input features and samples
            y : ground truth vector

        Returns:
            self

        Raises:
            ValueError : if y is None

        """
        if y is None:
            raise ValueError('requires y to be passed, but the target y is None')

        X, y = check_X_y(X, y)
        self.n_features_in_ = X.shape[1]
        self.classes_ = unique_labels(y)
        self.X_ = X
        self.y_ = y

        # fit and time the fit
        start = time.time()
        self.auc_, self.w_ = fit_hv(X, y, grid=self.grid)
        self.fit_time_ = time.time() - start
        self.coef_ = self.w_

        # The fitted flag must be set before the verbose report, because
        # score() calls predict(), which checks for is_fitted_.
        self.is_fitted_ = True

        if self.verbose > 0:
            print()
            print('=======================================')
            print('Coefficients ', self.coef_)
            print('Max AUC', max(self.auc_))
            print('Objective score', self.score(X, y))
            print('Fit time', self.fit_time_)

        return self

    def decision_function(self, X):
        """ Identify confidence scores for the samples

        The linear scores are rescaled with minmax_scale to the
        range (-1, 1) over the samples in X.

        Arguments:
            X : array-like, shape (n_samples, n_features)
                The input features and samples

        Returns:
            the decision vector (n_samples)

        """
        check_is_fitted(self, ['is_fitted_', 'X_', 'y_'])
        X = self._validate_data(X, accept_sparse="csr", reset=False)
        scores = np.array(
            minmax_scale(
                predict(X, self.w_),
                feature_range=(-1, 1)
            )
        )
        return scores

    def predict(self, X):
        """ Predict class labels for each sample

        Arguments:
            X : array-like, shape (n_samples, n_features)
                The input features and samples

        Returns:
            the class prediction of X (n_samples)

        """
        check_is_fitted(self, ['is_fitted_', 'X_', 'y_'])
        X = check_array(X)

        if len(self.classes_) < 2:
            # Only one class was seen during fit, so every sample of X gets
            # that label.  The result has one entry per row of X.
            return np.full(X.shape[0], self.classes_[0])

        # convert the scores to [0, 1] class indices; a score of 0 maps to 0
        y_index = np.heaviside(self.decision_function(X), 0).astype(int)
        # get the class labels
        return np.array([self.classes_[i] for i in y_index])

    def predict_proba(self, X):
        """ Identify the probability of each sample class label

        The decision scores are rescaled with minmax_scale to the range
        (0, 1) and used as the probability of the second class.

        Arguments:
            X : array-like, shape (n_samples, n_features)
                The input features and samples

        Returns:
            the class probabilities of X, shape (n_samples, 2)

        """
        check_is_fitted(self, ['is_fitted_', 'X_', 'y_'])
        X = check_array(X)
        y_proba = np.array(
            minmax_scale(
                self.decision_function(X),
                feature_range=(0, 1)
            )
        )
        class_prob = np.column_stack((1 - y_proba, y_proba))
        return class_prob

    def _more_tags(self):
        return {
            'poor_score': True,
            'non_deterministic': True,
            'binary_only': True
        }
# noinspection PyAttributeOutsideInit, PyUnresolvedReferences
class CalfCV(ClassifierMixin, BaseEstimator):
    """ Coarse approximation linear function with cross validation

    CalfCV fits a linear model with coefficients w = (w1, ..., wp)
    to maximize the AUC of the targets predicted by the linear function.
    The features are standardized and a Calf classifier is fit inside a
    cross-validated grid search scored by ROC AUC.

    Parameters
    ----------
    grid : the search grid.  Default is (-1, 1).

    verbose : 0 is silent.  1-3 are increasingly verbose

    Attributes
    ----------
    best_coef_ : array of shape (n_features, )
        Estimated coefficients for the linear fit problem.  Only
        one target should be passed, and this is a 1D array of length
        n_features.

    best_score_ : float
        The best auc score over the cross validation

    best_auc_ : array of shape (n_features, )
        The cumulative auc by feature.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

    classes_ : list
        The unique class labels

    fit_time_ : float
        The number of seconds to fit X to y

    Notes
    -----
    Only one processor is used due to a bug caused by "Python's multiprocessing that
    does fork without exec". See, https://scikit-learn.org/stable/faq.html#id27

    Examples
    --------
    >>> import numpy
    >>> from calfcv import CalfCV
    >>> from sklearn.datasets import make_classification as mc
    >>> X, y = mc(n_features=2, n_redundant=0, n_informative=2, n_clusters_per_class=1, random_state=42)
    >>> numpy.round(X[0:3, :], 2)
    array([[ 1.23, -0.76],
           [ 0.7 , -1.38],
           [ 2.55,  2.5 ]])

    >>> y[0:3]
    array([0, 0, 1])

    >>> cls = CalfCV().fit(X, y)
    >>> cls.score(X, y)
    0.7

    >>> numpy.round(cls.best_score_, 2)
    0.82

    >>> numpy.round(cls.best_auc_, 2)
    array([0.53, 0.8 ])

    >>> cls.best_coef_
    [1, 1]

    >>> numpy.round(cls.best_score_, 2)
    0.82

    >>> cls.fit_time_ > 0
    True

    >>> cls.predict(numpy.array([[3, 5]]))
    array([0])

    >>> cls.predict_proba(numpy.array([[3, 5]]))
    array([[1., 0.]])

    """

    def __init__(self, grid=(-1, 1), verbose=0):
        """ Initialize CalfCV"""
        # a single integer weight is wrapped so that the grid is iterable
        if isinstance(grid, int):
            grid = [grid]
        self.grid = grid

        # anything other than an integer 0-3 falls back to silent
        verbose_ok = isinstance(verbose, int) and verbose in [0, 1, 2, 3]
        self.verbose = verbose if verbose_ok else 0

    def fit(self, X, y):
        """ fit X and y

        Arguments:
            X : array-like, shape (n_samples, n_features)
                The training input features and samples.
            y : ground truth vector

        Returns:
            self

        """
        X, y = check_X_y(X, y)
        self.X_ = X
        self.y_ = y
        self.n_features_in_ = X.shape[1]
        self.classes_ = unique_labels(y)

        # standardize the features, then fit Calf on the scaled data
        pipeline = Pipeline(
            steps=[
                ('scaler', StandardScaler()),
                ('classifier', Calf())
            ]
        )
        # the whole grid is passed as the single candidate value of the
        # classifier's grid parameter
        search_space = {'classifier__grid': [self.grid]}
        self.model_ = GridSearchCV(
            estimator=pipeline,
            param_grid=search_space,
            scoring="roc_auc",
            verbose=self.verbose
        )

        # fit and time
        t0 = time.time()
        self.model_.fit(X, y)
        self.fit_time_ = time.time() - t0
        self.is_fitted_ = True

        # "best_score_: Mean cross-validated score of the best_estimator"
        # "https://stackoverflow.com/a/50233868/12865125"
        self.best_score_ = self.model_.best_score_
        best_classifier = self.model_.best_estimator_['classifier']
        self.best_coef_ = best_classifier.coef_
        self.best_auc_ = best_classifier.auc_

        if self.verbose > 0:
            print()
            print('=======================================')
            print('Objective best score', self.best_score_)
            print('Best coef_ ', self.best_coef_)
            print('Objective best params', self.model_.best_params_)

        return self

    def decision_function(self, X):
        """ Identify confidence scores for the samples

        Delegates to the fitted grid search model.

        Arguments:
            X : array-like, shape (n_samples, n_features)
                The input features and samples

        Returns:
            the decision vector (n_samples)

        """
        check_is_fitted(self, ['is_fitted_', 'model_'])
        fitted_model = self.model_
        return fitted_model.decision_function(X)

    def predict(self, X):
        """ Predict class labels for each sample

        Delegates to the fitted grid search model.

        Arguments:
            X : array-like, shape (n_samples, n_features)
                The input features and samples

        Returns:
            the class prediction of X (n_samples)

        """
        check_is_fitted(self, ['is_fitted_', 'model_'])
        fitted_model = self.model_
        return fitted_model.predict(X)

    def predict_proba(self, X):
        """ Identify the probability of each sample class label

        Delegates to the fitted grid search model.

        Arguments:
            X : array-like, shape (n_samples, n_features)
                The input features and samples

        Returns:
            the class probabilities of X (n_samples)

        """
        check_is_fitted(self, ['is_fitted_', 'model_'])
        fitted_model = self.model_
        return fitted_model.predict_proba(X)

    def _more_tags(self):
        tags = {
            'poor_score': True,
            'non_deterministic': True,
            'binary_only': True
        }
        return tags