Identify autophagy-positive cells¶

The goal of this notebook is to train a machine learning model to distininguish between autophagy-positive and autophagy-negative cells based on pre-calculated image features.

We have defined our classes as follows:

Class 0 : unstimulated WT cells
Class 1 : 14h Torin-1 stimulated WT cells

After training and evaluating our model, we want to compare cells without a functional EI24 gene (EI24KO cells) to WT cells.

import lamindb as ln
import matplotlib.pyplot as plt
import scportrait
import seaborn as sns
import numpy as np

from anndata import concat
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, roc_curve, auc

ln.track()

def _get_cells(dataframe):
    """Extract and concatenate single-cell data for cells specified in the input dataframe."""
    sc_data = None
    for uid in dataframe.dataset.unique():
        _selected_cells = dataframe[dataframe.dataset == uid].copy()
        _sc_data = scportrait.io.read_h5sc(
            ln.Artifact.using("scportrait/examples").get(uid).cache()
        )
        _sc_data = _sc_data[
            _sc_data.obs.scportrait_cell_id.isin(
                _selected_cells.scportrait_cell_id.values
            )
        ].copy()
        _sc_data.obs["score"] = _selected_cells.prob_class1.values

        if sc_data is None:
            sc_data = _sc_data
        else:
            sc_data = concat([sc_data, _sc_data], uns_merge="first", index_unique="-")
            sc_data.obs.reset_index(inplace=True, drop=True)
            sc_data.obs.index = sc_data.obs.index.values.astype(str)
    return sc_data

# Define parameters for our RandomForest Classifier
ln.Param(name="random_state", dtype="int").save()
ln.Param(name="n_estimators", dtype="int").save()
ln.Param(name="max_depth", dtype="int").save()
ln.Param(name="min_samples_split", dtype="int").save()
ln.Param(name="min_samples_leaf", dtype="int").save()
ln.Param(name="max_features", dtype="str").save()
ln.Param(name="criterion", dtype="str").save()
ln.Param(name="bootstrap", dtype="bool").save()

# Define parameter values
rfc_params = {
    "random_state": 42,
    "n_estimators": 100,
    "max_depth": 10,
    "min_samples_split": 2,
    "min_samples_leaf": 1,
    "max_features": "sqrt",
    "criterion": "gini",
    "bootstrap": True,
}

ln.track(params=rfc_params)

Show code cell output Hide code cell output

! record with similar name exists! did you mean to load it?

	uid	name	dtype	is_type	unit	description	array_rank	array_size	array_shape	proxy_dtype	synonyms	_expect_many	_curation	space_id	type_id	run_id	created_at	created_by_id	_aux	_branch_code
id
18	Gee7gaw7PHha	min_samples_split	int	None	None	None	0	0	None	None	None	True	None	1	None	135	2025-05-13 20:10:37.484000+00:00	1	{'af': {'0': None, '1': True, '2': False}}	1

/opt/hostedtoolcache/Python/3.12.10/x64/lib/python3.12/site-packages/lamindb/core/_context.py:429: FutureWarning: Use features instead of params, params will be removed in the future.
  run.params.add_values(params)

→ loaded Transform('YtCZO4zwFFR90000'), re-started Run('KFEyGyn4...') at 2025-05-13 20:10:37 UTC
→ params: random_state=42, n_estimators=100, max_depth=10, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, criterion=gini, bootstrap=True

→ notebook imports: anndata==0.11.4 lamindb==1.5.2 matplotlib==3.10.3 numpy==2.0.2 scikit-learn==1.6.1 scportrait==1.3.5 seaborn==0.13.2

• recommendation: to identify the notebook across renames, pass the uid: ln.track("YtCZO4zwFFR9", params={...})

study = ln.ULabel.using("scportrait/examples").get(name="autophagy imaging")

sc_datasets = (
    ln.Artifact.using("scportrait/examples")
    .filter(ulabels=study, is_latest=True)
    .filter(ulabels__name="scportrait single-cell images")
)
featurized_datasets = (
    ln.Artifact.using("scportrait/examples")
    .filter(ulabels=study, is_latest=True)
    .filter(description="featurized single-cell images")
)

class_lookup = {0: "untreated", 1: "14h Torin-1"}
label_lookup = {v: k for k, v in class_lookup.items()}

First, lets look at some example images from our two classes. As we can see the cells look very distinct to one another. Hopefully our ML model will be able to seperate them as well.

# get example images for positive and negative autophagy
num_rows, num_cols = 4, 4
n_cells = num_rows * num_cols
channel_of_interest = 4  # LC3B channel a key autophagosome marker

autophagy_positive_example = scportrait.io.read_h5sc(
    sc_datasets.filter(ulabels__name="WT")
    .filter(ulabels__name="14h Torin-1")[0]
    .cache()
)
autophagy_negative_example = scportrait.io.read_h5sc(
    sc_datasets.filter(ulabels__name="WT").filter(ulabels__name="untreated")[0].cache()
)

# create a figure with two panels for negative and postive examples
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 7))
scportrait.pl.cell_grid_single_channel(
    autophagy_negative_example,
    select_channel=channel_of_interest,
    ax=axes[0],
    title="Autophagy negative cells (LC3B distribution)",
    show_fig=False,
)
scportrait.pl.cell_grid_single_channel(
    autophagy_positive_example,
    select_channel=channel_of_interest,
    ax=axes[1],
    title="Autophagy postive cells (LC3B distribution)",
    show_fig=False,
)

Train ML model¶

# load data
wt_cells_afs = (
    featurized_datasets.filter(ulabels__name="WT", is_latest=True).distinct().one()
)
features_wt = wt_cells_afs.load()

ko_cells_afs = (
    featurized_datasets.filter(ulabels__name="EI24KO", is_latest=True).distinct().one()
)
features_ko = ko_cells_afs.load()

# split data into training and testing
data_train, data_test = train_test_split(features_wt, test_size=0.4, random_state=42)

# prepare data for model training by removing columns we don't want to train on
_data_train = data_train.drop(columns=["dataset", "scportrait_cell_id"])
_data_train = _data_train.drop(
    columns=[col for col in data_train.columns if "mCherry" in col]
)  # subset to only include features from our channel of interest

_data_test = data_test.drop(columns=["dataset", "scportrait_cell_id"])
_data_test = _data_test.drop(
    columns=[col for col in data_test.columns if "mCherry" in col]
)  # subset to only include features from our channel of interest

# Separate features and target
X_train = _data_train.drop("class", axis=1)
y_train = _data_train["class"]

X_test = _data_test.drop("class", axis=1)
y_test = _data_test["class"]

# Train model
clf = RandomForestClassifier(**rfc_params)
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)
y_scores = clf.predict_proba(X_test)

data_test["predicted_class"] = y_pred
data_test["prob_class1"] = y_scores[:, 1]

# compute confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm_normalized = cm.astype("float") / cm.sum(axis=1)[:, np.newaxis]
labels = np.unique(y_test)
labels = [class_lookup[label] for label in labels]

# Compute ROC curve
fpr, tpr, _ = roc_curve(y_test, y_scores[:, 1])
roc_auc = auc(fpr, tpr)

# Plot the confusion matrix and ROC Curve
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
sns.heatmap(
    cm_normalized,
    annot=True,
    fmt=".2f",
    cmap="Blues",
    xticklabels=labels,
    yticklabels=labels,
    ax=axes[1],
    cbar=False,
)
axes[1].set_xlabel("Predicted Label")
axes[1].set_ylabel("True Label")
axes[1].set_title("Confusion Matrix [% of true label]")

axes[0].plot(fpr, tpr, color="blue", lw=2, label=f"ROC curve (AUC = {roc_auc:.2f})")
axes[0].plot([0, 1], [0, 1], color="gray", linestyle="--")  # Diagonal line
axes[0].set_xlim([0.0, 1.0])
axes[0].set_ylim([0.0, 1.0])
axes[0].set_xlabel("False Positive Rate")
axes[0].set_ylabel("True Positive Rate")
axes[0].set_title("ROC Curve")
axes[0].legend(loc="lower right", frameon=False)
fig.tight_layout()

_images/96cabaedbdef08c425de630c0261894a753302655e6cd21c62f1475a515e201d.png

Considering that our model was only trained on very few input images, it performs well with an AUC of 0.77.

However, since the AUC is < 1, our model is still appears to make some mistakes. We can further investigate this through a confusion matrix. Our classifier is performing well at recognizing autophagy negative cells, correctly identifying 92% of the examples in our dataset. Unfortunately, the classifier is not yet very good at correctly identifying autophagy-postive cells. Less than 50% of these examples are classified correctly.

Let’s visualize some of the cells from each class to see if we can gain some insights into what our model is doing.

# visualize some example cells with classification scores from the test dataset
num_rows, num_cols = 3, 3
n_cells = num_rows * num_cols
channel_of_interest = 4  # LC3B channel a key autophagosome marker

# annotate dataset with TP, TN, FN, FP
data_test["FP"] = (data_test["class"] == 0) & (data_test["predicted_class"] == 1)
data_test["FN"] = (data_test["class"] == 1) & (data_test["predicted_class"] == 0)
data_test["TP"] = (data_test["class"] == 1) & (data_test["predicted_class"] == 1)
data_test["TN"] = (data_test["class"] == 0) & (data_test["predicted_class"] == 0)

# get example cells for each class
cells_TP = (
    data_test[data_test.TP].sort_values("prob_class1", ascending=False).head(n_cells)
)
cells_TN = (
    data_test[data_test.TN].sort_values("prob_class1", ascending=False).tail(n_cells)
)
cells_FN = (
    data_test[data_test.FN].sort_values("prob_class1", ascending=False).tail(n_cells)
)
cells_FP = (
    data_test[data_test.FP].sort_values("prob_class1", ascending=False).head(n_cells)
)

cell_sets = {
    "TP": _get_cells(cells_TP),
    "TN": _get_cells(cells_TN),
    "FN": _get_cells(cells_FN),
    "FP": _get_cells(cells_FP),
}

# make the plot
n_panel_rows = 2
n_panel_cols = 2
fig, axes = plt.subplots(nrows=n_panel_rows, ncols=n_panel_cols, figsize=(13, 13))

for j in range(n_panel_rows):
    for i in range(n_panel_cols):
        ax = axes[j, i]
        title, cells = cell_sets.popitem()
        scportrait.pl.cell_grid_single_channel(
            cells,
            cell_ids=cells.obs.scportrait_cell_id,
            cell_labels=cells.obs.score.round(2).values,
            select_channel=channel_of_interest,
            ax=ax,
            title=f"{title} with Prob(stimulated)",
            show_fig=False,
        )

Show code cell output Hide code cell output

→ returning existing schema with same hash: Schema(uid='TRnRtjbekihtt1qs', name='single-cell image dataset schema obs', n=1, is_type=False, itype='Feature', hash='qpw3p8_e1D7Rc2eqqY07uw', minimal_set=True, ordered_set=False, maximal_set=False, space_id=1, created_by_id=1, run_id=3, created_at=2025-05-13 20:05:45 UTC)

→ mapped records: Artifact(uid='89C8kQyV4Kjzj4SB0000'), ULabel(uid='9hHptuyb'), ULabel(uid='A2945i5P'), ULabel(uid='KH1brNUW'), ULabel(uid='1KPobNqK'), ULabel(uid='QrU6fxsG'), ULabel(uid='xhpmj7p7'), ULabel(uid='xHqZKcIG'), ULabel(uid='JWE2jNdk'), ULabel(uid='PKiCEP1h'), ULabel(uid='HRRTqARL'), ULabel(uid='e82fx2wm')

→ returning existing schema with same hash: Schema(uid='TRnRtjbekihtt1qs', name='single-cell image dataset schema obs', n=1, is_type=False, itype='Feature', hash='qpw3p8_e1D7Rc2eqqY07uw', minimal_set=True, ordered_set=False, maximal_set=False, space_id=1, created_by_id=1, run_id=3, created_at=2025-05-13 20:05:45 UTC)

→ mapped records: Artifact(uid='1GKwxrAp7XJmAqpt0000'), ULabel(uid='9hHptuyb'), ULabel(uid='A2945i5P'), ULabel(uid='GadMJgvN'), ULabel(uid='CrR7fgIZ'), ULabel(uid='QrU6fxsG'), ULabel(uid='xhpmj7p7'), ULabel(uid='xHqZKcIG'), ULabel(uid='JWE2jNdk'), ULabel(uid='PKiCEP1h'), ULabel(uid='HRRTqARL'), ULabel(uid='e82fx2wm')

→ returning existing schema with same hash: Schema(uid='TRnRtjbekihtt1qs', name='single-cell image dataset schema obs', n=1, is_type=False, itype='Feature', hash='qpw3p8_e1D7Rc2eqqY07uw', minimal_set=True, ordered_set=False, maximal_set=False, space_id=1, created_by_id=1, run_id=3, created_at=2025-05-13 20:05:45 UTC)

→ mapped records: Artifact(uid='zGFV103h7KW1AbmE0000'), ULabel(uid='9hHptuyb'), ULabel(uid='A2945i5P'), ULabel(uid='KH1brNUW'), ULabel(uid='CrR7fgIZ'), ULabel(uid='QrU6fxsG'), ULabel(uid='xhpmj7p7'), ULabel(uid='xHqZKcIG'), ULabel(uid='JWE2jNdk'), ULabel(uid='PKiCEP1h'), ULabel(uid='HRRTqARL'), ULabel(uid='e82fx2wm')

→ returning existing schema with same hash: Schema(uid='TRnRtjbekihtt1qs', name='single-cell image dataset schema obs', n=1, is_type=False, itype='Feature', hash='qpw3p8_e1D7Rc2eqqY07uw', minimal_set=True, ordered_set=False, maximal_set=False, space_id=1, created_by_id=1, run_id=3, created_at=2025-05-13 20:05:45 UTC)

→ mapped records: Artifact(uid='iuuMnf7xC4wYmkv80000'), ULabel(uid='9hHptuyb'), ULabel(uid='EgihgpJC'), ULabel(uid='KH1brNUW'), ULabel(uid='CrR7fgIZ'), ULabel(uid='QrU6fxsG'), ULabel(uid='xhpmj7p7'), ULabel(uid='xHqZKcIG'), ULabel(uid='JWE2jNdk'), ULabel(uid='PKiCEP1h'), ULabel(uid='HRRTqARL'), ULabel(uid='e82fx2wm')

→ returning existing schema with same hash: Schema(uid='TRnRtjbekihtt1qs', name='single-cell image dataset schema obs', n=1, is_type=False, itype='Feature', hash='qpw3p8_e1D7Rc2eqqY07uw', minimal_set=True, ordered_set=False, maximal_set=False, space_id=1, created_by_id=1, run_id=3, created_at=2025-05-13 20:05:45 UTC)

→ mapped records: Artifact(uid='uJ9W0phl9z0QhFOY0000'), ULabel(uid='9hHptuyb'), ULabel(uid='EgihgpJC'), ULabel(uid='GadMJgvN'), ULabel(uid='1KPobNqK'), ULabel(uid='QrU6fxsG'), ULabel(uid='xhpmj7p7'), ULabel(uid='xHqZKcIG'), ULabel(uid='JWE2jNdk'), ULabel(uid='PKiCEP1h'), ULabel(uid='HRRTqARL'), ULabel(uid='e82fx2wm')

→ returning existing schema with same hash: Schema(uid='TRnRtjbekihtt1qs', name='single-cell image dataset schema obs', n=1, is_type=False, itype='Feature', hash='qpw3p8_e1D7Rc2eqqY07uw', minimal_set=True, ordered_set=False, maximal_set=False, space_id=1, created_by_id=1, run_id=3, created_at=2025-05-13 20:05:45 UTC)

→ mapped records: Artifact(uid='uTNKe0UmY5IOowhC0000'), ULabel(uid='9hHptuyb'), ULabel(uid='EgihgpJC'), ULabel(uid='GadMJgvN'), ULabel(uid='CrR7fgIZ'), ULabel(uid='QrU6fxsG'), ULabel(uid='xhpmj7p7'), ULabel(uid='xHqZKcIG'), ULabel(uid='JWE2jNdk'), ULabel(uid='PKiCEP1h'), ULabel(uid='HRRTqARL'), ULabel(uid='e82fx2wm')

→ returning existing schema with same hash: Schema(uid='TRnRtjbekihtt1qs', name='single-cell image dataset schema obs', n=1, is_type=False, itype='Feature', hash='qpw3p8_e1D7Rc2eqqY07uw', minimal_set=True, ordered_set=False, maximal_set=False, space_id=1, created_by_id=1, run_id=3, created_at=2025-05-13 20:05:45 UTC)

→ mapped records: Artifact(uid='9m0dxLtxu35ludr70000'), ULabel(uid='9hHptuyb'), ULabel(uid='EgihgpJC'), ULabel(uid='KH1brNUW'), ULabel(uid='1KPobNqK'), ULabel(uid='QrU6fxsG'), ULabel(uid='xhpmj7p7'), ULabel(uid='xHqZKcIG'), ULabel(uid='JWE2jNdk'), ULabel(uid='PKiCEP1h'), ULabel(uid='HRRTqARL'), ULabel(uid='e82fx2wm')

→ returning existing schema with same hash: Schema(uid='TRnRtjbekihtt1qs', name='single-cell image dataset schema obs', n=1, is_type=False, itype='Feature', hash='qpw3p8_e1D7Rc2eqqY07uw', minimal_set=True, ordered_set=False, maximal_set=False, space_id=1, created_by_id=1, run_id=3, created_at=2025-05-13 20:05:45 UTC)

→ mapped records: Artifact(uid='zGFV103h7KW1AbmE0000'), ULabel(uid='9hHptuyb'), ULabel(uid='A2945i5P'), ULabel(uid='KH1brNUW'), ULabel(uid='CrR7fgIZ'), ULabel(uid='QrU6fxsG'), ULabel(uid='xhpmj7p7'), ULabel(uid='xHqZKcIG'), ULabel(uid='JWE2jNdk'), ULabel(uid='PKiCEP1h'), ULabel(uid='HRRTqARL'), ULabel(uid='e82fx2wm')

→ returning existing schema with same hash: Schema(uid='TRnRtjbekihtt1qs', name='single-cell image dataset schema obs', n=1, is_type=False, itype='Feature', hash='qpw3p8_e1D7Rc2eqqY07uw', minimal_set=True, ordered_set=False, maximal_set=False, space_id=1, created_by_id=1, run_id=3, created_at=2025-05-13 20:05:45 UTC)

→ mapped records: Artifact(uid='1GKwxrAp7XJmAqpt0000'), ULabel(uid='9hHptuyb'), ULabel(uid='A2945i5P'), ULabel(uid='GadMJgvN'), ULabel(uid='CrR7fgIZ'), ULabel(uid='QrU6fxsG'), ULabel(uid='xhpmj7p7'), ULabel(uid='xHqZKcIG'), ULabel(uid='JWE2jNdk'), ULabel(uid='PKiCEP1h'), ULabel(uid='HRRTqARL'), ULabel(uid='e82fx2wm')

→ returning existing schema with same hash: Schema(uid='TRnRtjbekihtt1qs', name='single-cell image dataset schema obs', n=1, is_type=False, itype='Feature', hash='qpw3p8_e1D7Rc2eqqY07uw', minimal_set=True, ordered_set=False, maximal_set=False, space_id=1, created_by_id=1, run_id=3, created_at=2025-05-13 20:05:45 UTC)

→ mapped records: Artifact(uid='89C8kQyV4Kjzj4SB0000'), ULabel(uid='9hHptuyb'), ULabel(uid='A2945i5P'), ULabel(uid='KH1brNUW'), ULabel(uid='1KPobNqK'), ULabel(uid='QrU6fxsG'), ULabel(uid='xhpmj7p7'), ULabel(uid='xHqZKcIG'), ULabel(uid='JWE2jNdk'), ULabel(uid='PKiCEP1h'), ULabel(uid='HRRTqARL'), ULabel(uid='e82fx2wm')

→ returning existing schema with same hash: Schema(uid='TRnRtjbekihtt1qs', name='single-cell image dataset schema obs', n=1, is_type=False, itype='Feature', hash='qpw3p8_e1D7Rc2eqqY07uw', minimal_set=True, ordered_set=False, maximal_set=False, space_id=1, created_by_id=1, run_id=3, created_at=2025-05-13 20:05:45 UTC)

→ mapped records: Artifact(uid='p8J4ly0vv0QjuPEe0000'), ULabel(uid='9hHptuyb'), ULabel(uid='A2945i5P'), ULabel(uid='GadMJgvN'), ULabel(uid='1KPobNqK'), ULabel(uid='QrU6fxsG'), ULabel(uid='xhpmj7p7'), ULabel(uid='xHqZKcIG'), ULabel(uid='JWE2jNdk'), ULabel(uid='PKiCEP1h'), ULabel(uid='HRRTqARL'), ULabel(uid='e82fx2wm')

→ returning existing schema with same hash: Schema(uid='TRnRtjbekihtt1qs', name='single-cell image dataset schema obs', n=1, is_type=False, itype='Feature', hash='qpw3p8_e1D7Rc2eqqY07uw', minimal_set=True, ordered_set=False, maximal_set=False, space_id=1, created_by_id=1, run_id=3, created_at=2025-05-13 20:05:45 UTC)

→ mapped records: Artifact(uid='uTNKe0UmY5IOowhC0000'), ULabel(uid='9hHptuyb'), ULabel(uid='EgihgpJC'), ULabel(uid='GadMJgvN'), ULabel(uid='CrR7fgIZ'), ULabel(uid='QrU6fxsG'), ULabel(uid='xhpmj7p7'), ULabel(uid='xHqZKcIG'), ULabel(uid='JWE2jNdk'), ULabel(uid='PKiCEP1h'), ULabel(uid='HRRTqARL'), ULabel(uid='e82fx2wm')

→ returning existing schema with same hash: Schema(uid='TRnRtjbekihtt1qs', name='single-cell image dataset schema obs', n=1, is_type=False, itype='Feature', hash='qpw3p8_e1D7Rc2eqqY07uw', minimal_set=True, ordered_set=False, maximal_set=False, space_id=1, created_by_id=1, run_id=3, created_at=2025-05-13 20:05:45 UTC)

→ mapped records: Artifact(uid='iuuMnf7xC4wYmkv80000'), ULabel(uid='9hHptuyb'), ULabel(uid='EgihgpJC'), ULabel(uid='KH1brNUW'), ULabel(uid='CrR7fgIZ'), ULabel(uid='QrU6fxsG'), ULabel(uid='xhpmj7p7'), ULabel(uid='xHqZKcIG'), ULabel(uid='JWE2jNdk'), ULabel(uid='PKiCEP1h'), ULabel(uid='HRRTqARL'), ULabel(uid='e82fx2wm')

→ returning existing schema with same hash: Schema(uid='TRnRtjbekihtt1qs', name='single-cell image dataset schema obs', n=1, is_type=False, itype='Feature', hash='qpw3p8_e1D7Rc2eqqY07uw', minimal_set=True, ordered_set=False, maximal_set=False, space_id=1, created_by_id=1, run_id=3, created_at=2025-05-13 20:05:45 UTC)

→ mapped records: Artifact(uid='uJ9W0phl9z0QhFOY0000'), ULabel(uid='9hHptuyb'), ULabel(uid='EgihgpJC'), ULabel(uid='GadMJgvN'), ULabel(uid='1KPobNqK'), ULabel(uid='QrU6fxsG'), ULabel(uid='xhpmj7p7'), ULabel(uid='xHqZKcIG'), ULabel(uid='JWE2jNdk'), ULabel(uid='PKiCEP1h'), ULabel(uid='HRRTqARL'), ULabel(uid='e82fx2wm')

→ returning existing schema with same hash: Schema(uid='TRnRtjbekihtt1qs', name='single-cell image dataset schema obs', n=1, is_type=False, itype='Feature', hash='qpw3p8_e1D7Rc2eqqY07uw', minimal_set=True, ordered_set=False, maximal_set=False, space_id=1, created_by_id=1, run_id=3, created_at=2025-05-13 20:05:45 UTC)

→ mapped records: Artifact(uid='9m0dxLtxu35ludr70000'), ULabel(uid='9hHptuyb'), ULabel(uid='EgihgpJC'), ULabel(uid='KH1brNUW'), ULabel(uid='1KPobNqK'), ULabel(uid='QrU6fxsG'), ULabel(uid='xhpmj7p7'), ULabel(uid='xHqZKcIG'), ULabel(uid='JWE2jNdk'), ULabel(uid='PKiCEP1h'), ULabel(uid='HRRTqARL'), ULabel(uid='e82fx2wm')

_images/fcc467ce4035e188acb9ec11e400b40659bea38c048cf696246d633f4bca87f2.png

Looking at the FPs (i.e. cells that look stimulated despite not having been stimulated) we can see two different types of cells:

Very small cells that do not show any visible autophagosomes
Larger cells that clearly show some autophagosomes

The type 1 cells look like they are the result of our model making mistakes, but the type 2 cells look much more similar to TP cells than TN cells. Based on literature, we know that cells can under go spontaneous autophagy even in the absence of Torin-1, for example, as a result of nutrient scarcity. Since our class labelling is just based on cells having not been treated with Torin-1, we would be annotating these cells incorrectly. In these cases our model does not make a mistake, but has in fact uncovered mislabelled examples in the dataset.

Looking at the FNs (i.e. cells that the model identifies as having been unstimulated despite having been stimulated) the cells appear homogenous and look more comparable to the TP Population. So in this case it seems our model is making a mistake.

Before applying this model in a biological context, we would therefore probably need to invest some more time and effort into improving it. We could for example:

Train our model on additional input data to make it more robust
Perform a pre-screening of our training data to ensure we remove any incorrectly labelled cells
Improve the image features we are training our model on to better reflect the biology we are interested in

If you are interested in finding out more you can check out the original paper where the authors trained a deep learning model to classify these cells with a much higher accuracy.

Investigate the EI24 KO cells¶

Now lets take a look at the EI24-deficient cells.

data_ko = features_ko.drop(columns=["dataset", "scportrait_cell_id"])
data_ko = data_ko.drop(columns=[x for x in data_ko.columns if "mCherry" in x])
X_ko = data_ko.drop("class", axis=1)
y_true = data_ko["class"]
predictions_ko = clf.predict(X_ko)

# compute confusion matrix
cm = confusion_matrix(y_true, predictions_ko)
cm_normalized = cm.astype("float") / cm.sum(axis=1)[:, np.newaxis]
labels = np.unique(y_test)
labels = [class_lookup[label] for label in labels]

# Plot the confusion matrix and ROC Curve
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(4, 2))
sns.heatmap(
    cm_normalized,
    annot=True,
    fmt=".2f",
    cmap="Blues",
    xticklabels=labels,
    yticklabels=labels,
    ax=axes,
    cbar=False,
)
axes.set_xlabel("Predicted Label")
axes.set_ylabel("True Label")
axes.set_title("Confusion Matrix [% of true label]")
fig.tight_layout()

_images/4178bce6a21775425f5ed7a6a65971ce2d64e2a1810f7154d17ac6e1688c46af.png

Interestingly, our model classifies a high percentage of stimulated EI24-KO cells as being unstimulated. Lets take a look at the images again.

# compare WT And EI24KO cells
num_rows, num_cols = 4, 4
n_cells = num_rows * num_cols
channel_of_interest = 4  # LC3B channel a key autophagosome marker

EI24_KO_stimulated = scportrait.io.read_h5sc(
    sc_datasets.filter(ulabels__name="EI24KO")
    .filter(ulabels__name="14h Torin-1")[0]
    .cache()
)
EI24_KO_unstimulated = scportrait.io.read_h5sc(
    sc_datasets.filter(ulabels__name="EI24KO")
    .filter(ulabels__name="untreated")[0]
    .cache()
)

# create a figure with two panels for negative and postive examples
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 12))
scportrait.pl.cell_grid_single_channel(
    autophagy_negative_example,
    select_channel=channel_of_interest,
    ax=axes[0, 0],
    title="WT autophagy negative cells",
    show_fig=False,
)
scportrait.pl.cell_grid_single_channel(
    autophagy_positive_example,
    select_channel=channel_of_interest,
    ax=axes[0, 1],
    title="WT autophagy postive cells",
    show_fig=False,
)
scportrait.pl.cell_grid_single_channel(
    EI24_KO_unstimulated,
    select_channel=channel_of_interest,
    ax=axes[1, 0],
    title="EI24KO unstimulated cells",
    show_fig=False,
)
scportrait.pl.cell_grid_single_channel(
    EI24_KO_stimulated,
    select_channel=channel_of_interest,
    ax=axes[1, 1],
    title="EI24KO stimulated cells",
    show_fig=False,
)

→ returning existing schema with same hash: Schema(uid='TRnRtjbekihtt1qs', name='single-cell image dataset schema obs', n=1, is_type=False, itype='Feature', hash='qpw3p8_e1D7Rc2eqqY07uw', minimal_set=True, ordered_set=False, maximal_set=False, space_id=1, created_by_id=1, run_id=3, created_at=2025-05-13 20:05:45 UTC)

→ mapped records: Artifact(uid='XaTalaUNv7d3QwXc0000'), ULabel(uid='Aj8KGwbh'), ULabel(uid='A2945i5P'), ULabel(uid='joRCMMWX'), ULabel(uid='CrR7fgIZ'), ULabel(uid='QrU6fxsG'), ULabel(uid='xhpmj7p7'), ULabel(uid='xHqZKcIG'), ULabel(uid='JWE2jNdk'), ULabel(uid='PKiCEP1h'), ULabel(uid='HRRTqARL'), ULabel(uid='e82fx2wm')

→ returning existing schema with same hash: Schema(uid='TRnRtjbekihtt1qs', name='single-cell image dataset schema obs', n=1, is_type=False, itype='Feature', hash='qpw3p8_e1D7Rc2eqqY07uw', minimal_set=True, ordered_set=False, maximal_set=False, space_id=1, created_by_id=1, run_id=3, created_at=2025-05-13 20:05:45 UTC)

→ mapped records: Artifact(uid='9KvNUZng67uxy4G90000'), ULabel(uid='Aj8KGwbh'), ULabel(uid='EgihgpJC'), ULabel(uid='joRCMMWX'), ULabel(uid='1KPobNqK'), ULabel(uid='QrU6fxsG'), ULabel(uid='xhpmj7p7'), ULabel(uid='xHqZKcIG'), ULabel(uid='JWE2jNdk'), ULabel(uid='PKiCEP1h'), ULabel(uid='HRRTqARL'), ULabel(uid='e82fx2wm')

_images/614a11e2fc92955b1396454ae8c3b40d52abdf5214d8d56b61e83e007eb77ee6.png

The EI24 KO cells have fewer LC3 puncta, and seem to show a defect in the formation of autophagosomes. Even when stimulated, EI24 KO cells look comparable to unstimated cells. In this instance, out model appears to correctly identify the biological effect of a deficiency in the EI24 gene.

Trace cells back to see them in their original context¶

As image analysis advances, obtaining the full context of a small section of the original image is often essential.

# select a random cell from the WT dataset
cell = features_wt.sample(1, random_state=42)
dataset = cell["dataset"].values[0]
cell_id = cell["scportrait_cell_id"].values[0]

# get SpatialData object and single-cell image dataset
sdata = (
    ln.Artifact.using("scportrait/examples")
    .get(
        key=ln.Artifact.using("scportrait/examples")
        .get(dataset)
        .key.replace("single_cell_data.h5ad", "spatialdata.zarr")
    )
    .load()
)
single_cell_images = scportrait.io.read_h5sc(ln.Artifact.get(dataset).cache())

# lookup location of cell of interest
x, y = sdata["centers_seg_all_nucleus"].compute().loc[cell_id, :]

# plot single-cell images and spatial context
fig, axs = plt.subplots(1, 2, figsize=(12, 3.5))
scportrait.pl.cell_grid_multi_channel(
    single_cell_images, cell_ids=cell_id, ax=axs[0], show_fig=False
)
_sdata = scportrait.tl.sdata.pp.get_bounding_box_sdata(
    sdata, center_x=x, center_y=y, max_width=100
)
scportrait.pl.plot_segmentation_mask(
    _sdata,
    masks=["seg_all_nucleus", "seg_all_cytosol"],
    ax=axs[1],
    show_fig=False,
)
axs[1].scatter(
    x,
    y,
    color="red",
    s=200,
)
fig.tight_layout()

→ transferred records: Artifact(uid='sF2ax2t7OJXkSId40003')

_images/6f12a67d305254a678629603582d5e279e07dc7b5bd272ff94fc7117610968d5.png

ln.finish()