Enforce pre-defined validation constraints

In a previous guide, you defined validation constraints ad-hoc when initializing Curator objects.

Often, you want to enforce a pre-defined set of validation constraints, as the CELLxGENE curator does (see Curate AnnData based on the CELLxGENE schema).

This guide shows how to subclass Curator to enforce pre-defined constraints.

Define a custom curator

Consider the example of electronic health records (EHR). We want to ensure that

  1. every record has the fields disease, phenotype, developmental_stage, and age

  2. values for disease, phenotype, and developmental_stage map to specific versions of pre-defined ontologies

The following implementation achieves the goal by subclassing DataFrameCurator.

EHR Curator
import bionty as bt
import pandas as pd
from lamindb.core import DataFrameCurator, Record, logger
from lamindb.core.types import UPathStr, FieldAttr

__version__ = "0.1.0"


class EHRCurator(DataFrameCurator):
    """Custom curation flow for electronic health record data.

    On construction, the columns ``disease``, ``developmental_stage`` and
    ``phenotype`` are created if absent and their missing values are filled
    with defaults. The DataFrame is then registered with ``DataFrameCurator``
    so that these three columns are curated against pinned ontology sources.

    ``validate`` additionally requires an ``age`` column to be present.
    """

    def __init__(self, data: pd.DataFrame | UPathStr):
        """Fill default values into ``data`` and set up the curator.

        Args:
            data: The records to curate. The DataFrame is modified in place
                (default columns/values are written into it) and is kept by
                reference, so later changes to it are seen by ``validate``.

        NOTE(review): the annotation also admits a path, but the code below
        accesses ``.columns`` directly, so only a DataFrame works here.
        """
        # Curate these columns against the specified fields
        DEFAULT_CATEGORICALS = {
            "disease": bt.Disease.name,
            "phenotype": bt.Phenotype.name,
            "developmental_stage": bt.DevelopmentalStage.name,
        }

        # If columns or values are missing, we substitute with these defaults.
        # The keys must match the column names used in DEFAULT_CATEGORICALS;
        # otherwise a stray column is created and the real one is left unfilled.
        DEFAULT_VALUES = {
            "disease": "normal",
            "developmental_stage": "unknown",
            "phenotype": "unknown",
        }

        # Validate values onto the following ontology versions
        DEFAULT_SOURCES = {
            "disease": bt.Source.get(
                entity="bionty.Disease", name="mondo", version="2023-04-04"
            ),
            "developmental_stage": bt.Source.get(
                entity="bionty.DevelopmentalStage", name="hsapdv", version="2020-03-10"
            ),
            "phenotype": bt.Source.get(
                entity="bionty.Phenotype",
                name="hp",
                version="2023-06-17",
                organism="human",
            ),
        }

        self.data = data

        for col, default in DEFAULT_VALUES.items():
            if col not in self.data.columns:
                # Column absent: create it, filled entirely with the default
                self.data[col] = default
            else:
                # Column present: only replace missing entries
                self.data[col] = self.data[col].fillna(default)

        super().__init__(
            df=self.data,
            categoricals=DEFAULT_CATEGORICALS,
            sources=DEFAULT_SOURCES,
            organism="human",
        )

    def validate(self, organism: str | None = None) -> bool:
        """Validates the internal EHR standard.

        Args:
            organism: Passed through unchanged to ``DataFrameCurator.validate``.

        Returns:
            ``False`` if any required column is missing (an error is logged);
            otherwise the result of ``DataFrameCurator.validate``.
        """
        required_columns = {"disease", "phenotype", "developmental_stage", "age"}
        # Sorted so that the logged message is deterministic
        missing_columns = sorted(required_columns - set(self.data.columns))
        if missing_columns:
            logger.error(
                f"Columns {', '.join(map(repr, missing_columns))} are missing but required."
            )
            return False

        return super().validate(organism)

Use the custom curator

!lamin init --storage ./subclass-curator --schema bionty
 connected lamindb: testuser1/subclass-curator
import lamindb as ln
import bionty as bt
import pandas as pd
from ehrcurator import EHRCurator

ln.track("2XEr2IA4n1w40000")
 connected lamindb: testuser1/subclass-curator
 created Transform('2XEr2IA4'), started new Run('ovhZkwRJ') at 2024-12-20 15:04:47 UTC
 notebook imports: bionty==0.53.2 ehrcurator lamindb==0.77.3 pandas==2.2.3
# Example records: every mandatory column is present except 'age',
# which is wrongly named 'patient_age'.
data = dict(
    disease=[
        "Alzheimer disease",
        "diabetes mellitus",
        "breast cancer",
        "Hypertension",
        "asthma",
    ],
    phenotype=[
        "Mental deterioration",
        "Hyperglycemia",
        "Tumor growth",
        "Increased blood pressure",
        "Airway inflammation",
    ],
    developmental_stage=["Adult", "Adult", "Adult", "Adult", "Child"],
    patient_age=[70, 55, 60, 65, 12],
)
df = pd.DataFrame(data)
df
Hide code cell output
disease phenotype developmental_stage patient_age
0 Alzheimer disease Mental deterioration Adult 70
1 diabetes mellitus Hyperglycemia Adult 55
2 breast cancer Tumor growth Adult 60
3 Hypertension Increased blood pressure Adult 65
4 asthma Airway inflammation Child 12
# Wrap the example DataFrame in the custom curator and run validation.
# The DataFrame has 'patient_age' instead of the required 'age' column,
# so the required-column check in EHRCurator.validate reports it as missing.
ehrcurator = EHRCurator(df)
ehrcurator.validate()
Hide code cell output
 added 3 records with Feature.name for "columns": 'disease', 'phenotype', 'developmental_stage'
✗ Columns 'age' are missing but required.
False
# Rename the wrongly named 'patient_age' column to the required 'age'.
# Assigning to df.columns changes df itself, and the curator keeps a
# reference to that same DataFrame, so re-running validate sees the new name.
df.columns = df.columns.str.replace("patient_age", "age")
ehrcurator.validate()
Hide code cell output
 saving validated records of 'disease'
 added 4 records from public with Disease.name for "disease": 'Alzheimer disease', 'diabetes mellitus', 'asthma', 'breast cancer'
 saving validated records of 'phenotype'
 added 3 records from public with Phenotype.name for "phenotype": 'Increased blood pressure', 'Mental deterioration', 'Hyperglycemia'
 mapping "disease" on Disease.name
!   1 term is not validated: 'Hypertension'
    → fix typos, remove non-existent values, or save terms via .add_new_from("disease")
 mapping "phenotype" on Phenotype.name
!   2 terms are not validated: 'Tumor growth', 'Airway inflammation'
    → fix typos, remove non-existent values, or save terms via .add_new_from("phenotype")
 mapping "developmental_stage" on DevelopmentalStage.name
!   2 terms are not validated: 'Adult', 'Child'
    → fix typos, remove non-existent values, or save terms via .add_new_from("developmental_stage")
False
# Build lookup objects over the public ontologies to find canonical term names
disease_lo = bt.Disease.public().lookup()
phenotype_lo = bt.Phenotype.public().lookup()
developmental_stage_lo = bt.DevelopmentalStage.public().lookup()

# Per column: non-validated value -> canonical ontology term name.
# NOTE(review): "Adult" is mapped to the adolescent stage term here;
# confirm this is the intended ontology term.
replacements = {
    "disease": {
        "Hypertension": disease_lo.hypertensive_disorder.name,
    },
    "phenotype": {
        "Tumor growth": phenotype_lo.neoplasm.name,
        "Airway inflammation": phenotype_lo.bronchitis.name,
    },
    "developmental_stage": {
        "Adult": developmental_stage_lo.adolescent_stage.name,
        "Child": developmental_stage_lo.child_stage.name,
    },
}

# Assign column by column so the DataFrame held by the curator is updated
for column, mapping in replacements.items():
    df[column] = df[column].replace(mapping)

ehrcurator.validate()
Hide code cell output
 saving validated records of 'disease'
 added 1 record from public with Disease.name for "disease": 'hypertensive disorder'
 saving validated records of 'phenotype'
 added 2 records from public with Phenotype.name for "phenotype": 'Bronchitis', 'Neoplasm'
 saving validated records of 'developmental_stage'
 added 2 records from public with DevelopmentalStage.name for "developmental_stage": 'adolescent stage', 'child stage'
 "disease" is validated against Disease.name
 "phenotype" is validated against Phenotype.name
 "developmental_stage" is validated against DevelopmentalStage.name
True
Hide code cell content
!rm -rf subclass-curator
!lamin delete --force subclass-curator
 deleting instance testuser1/subclass-curator