Since this classifier does not use any image features but only the label counts, we use image filenames to build the training data. As with the other classifiers, donor 4 is not included in the train/test sets.
import numpy as np
from glob import glob
from os.path import basename
from functools import reduce
from collections import Counter
from sklearn import metrics
from IPython.display import display, Markdown

all_data = {}
image_path = './images/sample_images/processed/augmented/donor_{}/*/*.png'

for d in [1, 2, 3, 5, 6]:
    names = [basename(n) for n in glob(image_path.format(d))]
    # 0 is negative and 1 is positive
    labels = [0 if 'noact' in n else 1 for n in names]
    x = np.array(names)
    y = np.array(labels)
    all_data[d] = {'x': x, 'y': y}
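As a quick sanity check (a minimal sketch that only assumes the `all_data` dictionary built above), we can print how many negative and positive labels each donor contributes:

# Sanity check: label counts per donor, including augmented copies
for d in [1, 2, 3, 5, 6]:
    label_count = Counter(all_data[d]['y'])
    print("donor_{}: {} negative, {} positive".format(d, label_count[0],
                                                      label_count[1]))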
Even though this is a trivial model, we implement it with scikit-learn's estimator API. This makes it easier to compute performance statistics and to compare it with other models.
class FrequencyClassifier():
    """
    A trivial frequency classifier exposing an sklearn-like interface.
    """

    def __init__(self, pos_freq):
        self.pos_freq = pos_freq

    def predict_proba(self, x):
        """
        Use the positive sample frequency in the training set as the
        positive probability for every element in x.

        Args:
            x(np.array): the feature vector you want to predict on

        Return:
            np.array(m, 2): each row is [probability of the negative label,
                probability of the positive label]
        """
        probs = [1 - self.pos_freq, self.pos_freq]
        return np.vstack([probs for i in range(x.shape[0])])

    def predict(self, x):
        """
        Use the majority class in the training set as the prediction for
        every element in x.

        Args:
            x(np.array): the feature vector you want to predict on

        Return:
            np.array(m): predicted labels (1 if the positive class is the
                majority in the training set, otherwise 0)
        """
        return np.array([1 if self.pos_freq >= 0.5 else 0 for i in
                         range(x.shape[0])])
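As a small usage sketch (the positive frequency of 0.3 and the filenames below are made up for illustration), note that the classifier ignores the feature values entirely:

# Toy usage of FrequencyClassifier with a hypothetical positive frequency
toy_model = FrequencyClassifier(0.3)
toy_x = np.array(['a.png', 'b.png', 'c.png'])
print(toy_model.predict_proba(toy_x))  # every row is [0.7, 0.3]
print(toy_model.predict(toy_x))        # 0.3 < 0.5, so every prediction is 0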
Then, we use this classifier to train 5 models for 5 test donors. For example, for test donor 1, the classifier counts the positive frequency in donors 2, 3, 5, 6.
# Mapping from test donor to its trained model
trained_models = {}

for d in [1, 2, 3, 5, 6]:
    # Concatenate the y labels of the other donors to form the training set
    # for the current test donor d
    train_donors = [i for i in [1, 2, 3, 5, 6] if i != d]
    train_y = np.hstack([all_data[t]['y'] for t in train_donors])

    # Count the positive samples in these training labels
    count = Counter(train_y)
    pos_freq = count[1] / len(train_y)
    print("The positive frequency in donors {} is {:.4f}.".format(train_donors,
                                                                  pos_freq))

    # Create a frequency classifier model for this test donor d
    trained_models[d] = FrequencyClassifier(pos_freq)
The positive frequency in donors [2, 3, 5, 6] is 0.4966.
The positive frequency in donors [1, 3, 5, 6] is 0.3243.
The positive frequency in donors [1, 2, 5, 6] is 0.4314.
The positive frequency in donors [1, 2, 3, 6] is 0.3278.
The positive frequency in donors [1, 2, 3, 5] is 0.3774.
Some of the positive frequencies differ from the whole-dataset frequency classifier reported in the paper; this is due to rounding during subsampling.
For each test donor, we will run its trained model and compute performance statistics.
def get_score(model, x_test, y_test, pos=1):
    """
    Run the trained `model` on `x_test` and compare the predictions with
    `y_test`. Return a collection of classification performance metrics.

    Args:
        model: a trained sklearn model
        x_test(np.array(m, n)): 2D feature array of the test set, m elements
            and each element has n features
        y_test(np.array(m)): 1D label array of the test set, m entries
        pos: the encoding of the positive label in `y_test`

    Return:
        A dictionary containing the metric information and predictions:
            metric scores: ['acc': accuracy, 'precision', 'recall',
                            'ap': average precision,
                            'aroc': area under the ROC curve,
                            'pr': PR curve points,
                            'roc': ROC curve points]
            predictions: ['y_true': the ground-truth labels,
                          'y_score': predicted probabilities]
    """
    y_predict_prob = model.predict_proba(x_test)
    y_predict = model.predict(x_test)

    # Sklearn requires the probability list to be 1D
    y_predict_prob = [x[pos] for x in y_predict_prob]

    y_test_fixed = np.array(y_test)
    if pos == 0:
        # Flip the array so 1 represents the positive class
        y_test_fixed = 1 - np.array(y_test)

    # Compute the PR-curve points
    precisions, recalls, thresholds = metrics.precision_recall_curve(
        y_test_fixed,
        y_predict_prob,
        pos_label=pos
    )

    # Compute the ROC-curve points
    fprs, tprs, roc_thresholds = metrics.roc_curve(y_test_fixed, y_predict_prob,
                                                   pos_label=pos)

    return ({'acc': metrics.accuracy_score(y_test_fixed, y_predict),
             'precision': metrics.precision_score(y_test_fixed, y_predict,
                                                  pos_label=pos),
             'recall': metrics.recall_score(y_test_fixed, y_predict,
                                            pos_label=pos),
             'ap': metrics.average_precision_score(y_test_fixed,
                                                   y_predict_prob),
             'aroc': metrics.roc_auc_score(y_test_fixed,
                                           y_predict_prob),
             'pr': [precisions.tolist(), recalls.tolist(),
                    thresholds.tolist()],
             'roc': [fprs.tolist(), tprs.tolist(), roc_thresholds.tolist()],
             'y_true': y_test,
             'y_score': y_predict_prob})
def make_table(metric_dict, count_dict):
    """
    Convert the model performance metric dictionary into a Markdown table.

    Args:
        metric_dict(dict): a dictionary encoding model performance statistics
            and prediction information for all test donors
        count_dict(dict): a dictionary encoding the count of activated and
            quiescent samples for each test donor

    Return:
        string: a Markdown syntax table
    """
    # Define the header and the row template
    table_str = ""
    line = "|{}|{:.2f}%|{:.2f}%|{:.2f}%|{:.2f}%|{:.2f}%|{}|{}|\n"
    table_str += ("|Test Donor|Accuracy|Precision|Recall|Average Precision|" +
                  "AUC|Num of Activated|Num of Quiescent|\n")
    table_str += "|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|\n"

    for d in [1, 2, 3, 5, 6]:
        result = metric_dict[d]
        table_str += (line.format("donor_{}".format(d),
                                  result['acc'] * 100,
                                  result['precision'] * 100,
                                  result['recall'] * 100,
                                  result['ap'] * 100,
                                  result['aroc'] * 100,
                                  count_dict[d]['activated'],
                                  count_dict[d]['quiescent']))
    return table_str
model_performance = {}

for d in [1, 2, 3, 5, 6]:
    # Collect the performance metrics for each test donor d
    cur_y_test = all_data[d]['y']
    cur_x_test = all_data[d]['x']
    model_performance[d] = get_score(trained_models[d],
                                     cur_x_test,
                                     cur_y_test)

# Save the model performance, so we can compare all models in the
# transfer_learning notebook
np.savez('./resource/frequency_model_performance.npz',
         model_performance=model_performance)
/Users/JayWong/miniconda3/envs/cellimage/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. 'precision', 'predicted', average, warn_for)
# Count the labels for each donor
count_dict = {}

for d in [1, 2, 3, 5, 6]:
    # Do not count augmented images
    act_count = len(glob("./images/sample_images/processed/" +
                         "augmented/donor_{}/activated/*.png".format(d))) // 6
    qui_count = len(glob("./images/sample_images/processed/" +
                         "augmented/donor_{}/quiescent/*.png".format(d))) // 6
    count_dict[d] = {
        'activated': act_count,
        'quiescent': qui_count
    }
# Create a table summary
display(Markdown(make_table(model_performance, count_dict)))
|Test Donor|Accuracy|Precision|Recall|Average Precision|AUC|Num of Activated|Num of Quiescent|
|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|
|donor_1|88.11%|0.00%|0.00%|11.89%|50.00%|22|163|
|donor_2|18.75%|0.00%|0.00%|81.25%|50.00%|65|15|
|donor_3|73.41%|0.00%|0.00%|26.59%|50.00%|46|127|
|donor_5|27.17%|0.00%|0.00%|72.83%|50.00%|67|25|
|donor_6|56.86%|0.00%|0.00%|43.14%|50.00%|44|58|
Since the prediction is constant (every sample is predicted negative here), there are no predicted positives: precision is ill-defined and set to 0 (hence the warning above), and recall is 0. There are also some interesting relationships among accuracy, average precision, and AUC. When `pos_freq` is greater than or equal to $0.5$, $\text{AP} = \text{ACC}$; when `pos_freq` is less than $0.5$, $\text{AP} = 1 - \text{ACC}$. You can read this notebook to learn more.
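The relationship follows because a constant score makes average precision collapse to the positive prevalence of the test donor (and AUC to $0.5$), while a constant prediction makes accuracy equal the prevalence of the predicted class. A minimal check with synthetic labels that mirror donor 1's counts from the table above (22 activated, 163 quiescent):

# With a constant score, AP equals the positive prevalence and AUC is 0.5
# (synthetic labels mirroring donor 1's counts; illustrative only)
toy_y = np.array([1] * 22 + [0] * 163)
toy_scores = np.full(len(toy_y), 0.3243)  # any constant value works
print(metrics.average_precision_score(toy_y, toy_scores))  # ~0.1189 = 22 / 185
print(metrics.roc_auc_score(toy_y, toy_scores))            # 0.5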
In this notebook, we demonstrate the general workflow of training and testing classifiers for different test donors. We also set up our baseline model using a trivial frequency classifier.