Small pathological example¶
This example shows that the AUC reported by the R version of Calf may be meaningless when the data are collinear.
[41]:
# Author: Rolf Carlson, Carlson Research LLC, <hrolfrc@gmail.com>
# License: 3-clause BSD
Here is a description of the problem:
[42]:
# For five subjects, suppose the status (control = 0, case = 1) is
#
# 0
# 0
# 0
# 1
# 1
#
# Suppose for five markers the observed values (column = a marker) are
#
# 0.3801 0.2484 -0.1280 -0.5741 1.0631
# -0.9703 -0.5551 -0.3680 1.1324 -1.0930
# 0.5148 -0.9927 0.2833 1.0068 0.5449
# 1.1880 1.5985 -1.2621 -0.5094 0.5316
# -1.1126 -0.2992 1.4748 -1.0558 -1.0467
#
# What weight vector provides us with AUC = 1.0?
#
# Running the problem through Calf gives the following result:
#
# Marker Weight
# F4 1
# F2 -1
# F5 1
#
# AUC: 1.0
# Final p-value: 0.05582771541247467
[43]:
import matplotlib.pyplot as plt
from scipy.stats import spearmanr
from scipy.cluster import hierarchy
from scipy.spatial.distance import squareform
from sklearn.inspection import permutation_importance
from sklearn.linear_model import Lasso
import numpy as np
import re
import pprint
# Seed the generator so that any draws made from `rng` are reproducible under
# Restart & Run All; 42 matches the random_state passed to permutation_importance.
rng = np.random.default_rng(42)
import seaborn as sns
sns.set_theme(style="darkgrid")
Create the problem¶
[44]:
# Marker measurements as text: one row per subject, one column per marker.
X_str = """0.3801 0.2484 -0.1280 -0.5741 1.0631
-0.9703 -0.5551 -0.3680 1.1324 -1.0930
0.5148 -0.9927 0.2833 1.0068 0.5449
1.1880 1.5985 -1.2621 -0.5094 0.5316
-1.1126 -0.2992 1.4748 -1.0558 -1.0467
"""

# Column labels for the five markers.
feature_names = ['F1', 'F2', 'F3', 'F4', 'F5']

# Collapse every run of whitespace (newlines included) into a single space so
# the text becomes one flat, space-separated list of numbers.
X_str = re.compile(r"\s+").sub(" ", X_str)

# Parse the flat text into float64 values and fold them into a 5 x 5 matrix.
X = np.fromstring(X_str, dtype='float64', sep=' ').reshape((5, 5))

# Control (0) / case (1) status for the five subjects, kept as a column vector.
y = np.array([0, 0, 0, 1, 1]).reshape(-1, 1)
[45]:
# Display the parsed marker matrix (reshaped to 5 x 5 when the problem was created).
X
[45]:
array([[ 0.3801, 0.2484, -0.128 , -0.5741, 1.0631],
[-0.9703, -0.5551, -0.368 , 1.1324, -1.093 ],
[ 0.5148, -0.9927, 0.2833, 1.0068, 0.5449],
[ 1.188 , 1.5985, -1.2621, -0.5094, 0.5316],
[-1.1126, -0.2992, 1.4748, -1.0558, -1.0467]])
[46]:
# Display the control (0) / case (1) labels, stored as a 5 x 1 column vector.
y
[46]:
array([[0],
[0],
[0],
[1],
[1]])
Generalization failure¶
Lasso with its default settings does not learn enough to predict the data, even the data it was trained on.
[47]:
# Fit a Lasso model with default parameters on the full five-subject problem.
clf = Lasso()
clf.fit(X, y)

# Score the model on the same data it was fitted on.
# NOTE(review): Lasso is a regressor, so `score` is the coefficient of
# determination (R^2), not a classification accuracy, despite the printed label.
train_score = clf.score(X, y)
print("Accuracy on the data: {:.2f}".format(train_score))
Accuracy on the data: 0.00
None of the features is identified as important: every permutation importance is zero.
[48]:
# Permutation importance of each marker for the fitted model, 10 repeats per feature.
result = permutation_importance(
    clf, X, y, n_repeats=10, random_state=42, n_jobs=-1
)

selected_features = []
perm_sorted_idx = []

# Walk the features from the largest mean importance down to the smallest.
ranking = result.importances_mean.argsort()[::-1]
for i in ranking:
    # Importance is taken as mean - 2 * stdev rather than the mean alone; the
    # original author notes this produces a better prediction.
    lower_bound = result.importances_mean[i] - 2 * result.importances_std[i]
    if not (lower_bound > 0.003):
        continue

    # Record the feature both by name and by column index, then report it.
    selected_features.append(feature_names[i])
    perm_sorted_idx.append(i)
    print(f"{feature_names[i]: <8}"
          f"{result.importances_mean[i]: .3f}"
          f" +/- {result.importances_std[i]: .3f}")

# Dump the raw importances, their means and their standard deviations.
pprint.pprint(result)
{'importances': array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]),
'importances_mean': array([0., 0., 0., 0., 0.]),
'importances_std': array([0., 0., 0., 0., 0.])}