EHR

In this guide, we’ll take a DataFrame storing example EHR data, curate it, and save it as an annotated .parquet file. The curation ensures that:

  1. the dataframe has columns disease, phenotype, developmental_stage, and age

  2. if columns or values are missing, we standardize the dataframe with default values

  3. any values that are present map against specific versions of pre-defined ontologies

# Install lamindb with the bionty extra, then initialize a test instance
# stored under ./test-ehrschema with the bionty module enabled.
# pip install 'lamindb[bionty]'
!lamin init --storage ./test-ehrschema --modules bionty
Hide code cell output
 initialized lamindb: testuser1/test-ehrschema
import lamindb as ln
import bionty as bt
import pandas as pd

# Start tracking this notebook: the uid identifies its Transform record, and
# a new Run is started for this execution.
ln.track("2XEr2IA4n1w4")
Hide code cell output
 connected lamindb: testuser1/test-ehrschema
 created Transform('2XEr2IA4n1w40000', key='ehr.ipynb'), started new Run('bIcNzcZaF8ERISaV') at 2025-10-16 11:31:21 UTC
 notebook imports: bionty==1.8.1 lamindb==1.13.0 pandas==2.3.3

Define a schema

Let us first define the ontology versions we want to use.

# Pin the ontology sources the schema will validate against. For disease and
# developmental stage, take the source that is currently in use.
disease_source = bt.Source.get(
    entity="bionty.Disease",
    name="mondo",
    currently_used=True,
)
developmental_stage_source = bt.Source.get(
    entity="bionty.DevelopmentalStage",
    name="hsapdv",
    currently_used=True,
)

# For phenotype, switch the active source from pato to hp: unset pato first,
# then mark hp as currently used and persist the change.
pato_sources = bt.Source.filter(entity="bionty.Phenotype", name="pato")
pato_sources.update(currently_used=False)
phenotype_source = bt.Source.get(entity="bionty.Phenotype", name="hp")
phenotype_source.currently_used = True
phenotype_source.save()
Source(uid='48fBFLmn', entity='bionty.Phenotype', organism='human', name='hp', version='2024-04-26', in_db=False, currently_used=True, description='Human Phenotype Ontology', url='https://github.com/obophenotype/human-phenotype-ontology/releases/download/v2024-04-26/hp.owl', source_website='https://hpo.jax.org', branch_id=1, space_id=1, created_by_id=1, created_at=2025-10-16 11:31:18 UTC, is_locked=False)

Let us now create a schema by defining the features that it measures. The ontology versions are captured via their uid.

# Create and save the features one by one, in the order they should appear
# in the schema.
age_feature = ln.Feature(name="age", dtype=int).save()

# The categorical features are non-nullable, carry a default value, and only
# accept terms from the ontology source pinned via its uid.
disease_feature = ln.Feature(
    name="disease",
    dtype=bt.Disease,
    default_value="normal",
    nullable=False,
    cat_filters={"source__uid": disease_source.uid},
).save()
developmental_stage_feature = ln.Feature(
    name="developmental_stage",
    dtype=bt.DevelopmentalStage,
    default_value="unknown",
    nullable=False,
    cat_filters={"source__uid": developmental_stage_source.uid},
).save()
phenotype_feature = ln.Feature(
    name="phenotype",
    dtype=bt.Phenotype,
    default_value="unknown",
    nullable=False,
    cat_filters={"source__uid": phenotype_source.uid},
).save()

schema = ln.Schema(
    name="My EHR schema",
    features=[
        age_feature,
        disease_feature,
        developmental_stage_feature,
        phenotype_feature,
    ],
).save()

# Show the features that belong to the schema as a dataframe.
schema.features.to_dataframe()
Hide code cell output
uid name dtype is_type unit description array_rank array_size array_shape proxy_dtype synonyms _expect_many _curation space_id type_id is_locked run_id created_at created_by_id _aux branch_id
id
1 IILGxMGtTpdO age int None None None 0 0 None None None None None 1 None False 1 2025-10-16 11:31:22.512000+00:00 1 {'af': {'0': None, '1': True, '2': False}} 1
2 HH0PGi76WE3j disease cat[bionty.Disease[source__uid='IGIkseWQ']] None None None 0 0 None None None None None 1 None False 1 2025-10-16 11:31:22.517000+00:00 1 {'af': {'0': 'normal', '1': False, '2': False}} 1
3 fLZufPeAMKFR developmental_stage cat[bionty.DevelopmentalStage[source__uid='1Gb... None None None 0 0 None None None None None 1 None False 1 2025-10-16 11:31:22.522000+00:00 1 {'af': {'0': 'unknown', '1': False, '2': False}} 1
4 mOJ52sah1zuo phenotype cat[bionty.Phenotype[source__uid='48fBFLmn']] None None None 0 0 None None None None None 1 None False 1 2025-10-16 11:31:22.526000+00:00 1 {'af': {'0': 'unknown', '1': False, '2': False}} 1

Curate an example dataset

Create an example DataFrame that has all required columns, except that one of them is misnamed (patient_age instead of age) and one disease value is missing.

# Raw example values, one entry per patient. The third disease entry is
# deliberately missing, and the age column is deliberately misnamed below.
disease_values = [
    "Alzheimer disease",
    "diabetes mellitus",
    pd.NA,
    "Hypertension",
    "asthma",
]
phenotype_values = [
    "Mental deterioration",
    "Hyperglycemia",
    "Tumor growth",
    "Increased blood pressure",
    "Airway inflammation",
]
stage_values = ["Adult", "Adult", "Adult", "Adult", "Child"]
age_values = [70, 55, 60, 65, 12]

# The ontology-backed columns are stored as categoricals; age stays numeric.
dataset = {
    "disease": pd.Categorical(disease_values),
    "phenotype": pd.Categorical(phenotype_values),
    "developmental_stage": pd.Categorical(stage_values),
    "patient_age": age_values,
}
df = pd.DataFrame(dataset)
df
Hide code cell output
disease phenotype developmental_stage patient_age
0 Alzheimer disease Mental deterioration Adult 70
1 diabetes mellitus Hyperglycemia Adult 55
2 NaN Tumor growth Adult 60
3 Hypertension Increased blood pressure Adult 65
4 asthma Airway inflammation Child 12

Let’s validate it.

# Validation is expected to fail here: the schema requires an 'age' column,
# which the dataframe does not have yet.
curator = ln.curators.DataFrameCurator(df, schema)
try:
    curator.validate()
except ln.errors.ValidationError as err:
    report = str(err)
    assert "column 'age' not in dataframe" in report
    print(report)
Hide code cell output
{
    "SCHEMA": {
        "COLUMN_NOT_IN_DATAFRAME": [
            {
                "schema": null,
                "column": null,
                "check": "column_in_dataframe",
                "error": "column 'age' not in dataframe. Columns in dataframe: ['disease', 'phenotype', 'developmental_stage', 'patient_age']"
            }
        ],
        "SERIES_CONTAINS_NULLS": [
            {
                "schema": null,
                "column": "disease",
                "check": "not_nullable",
                "error": "non-nullable series 'disease' contains null values:2    NaNName: disease, dtype: categoryCategories (4, object): ['Alzheimer disease', 'Hypertension', 'asthma', 'diabetes mellitus']"
            }
        ]
    }
}

Rename the patient_age column to age.

# Rename in place so that the curator, which was created from this same
# DataFrame object, sees the corrected column. An exact-match rename leaves
# alone any other column whose name merely contains "patient_age".
df.rename(columns={"patient_age": "age"}, inplace=True)
try:
    curator.validate()
except ln.errors.ValidationError as e:
    # With the column renamed, the missing disease value is the error left.
    assert "non-nullable series 'disease' contains null values" in str(e)
    print(e)
Hide code cell output
{
    "SCHEMA": {
        "SERIES_CONTAINS_NULLS": [
            {
                "schema": null,
                "column": "disease",
                "check": "not_nullable",
                "error": "non-nullable series 'disease' contains null values:2    NaNName: disease, dtype: categoryCategories (4, object): ['Alzheimer disease', 'Hypertension', 'asthma', 'diabetes mellitus']"
            }
        ]
    }
}

Standardize the dataframe so that the missing value gets populated with the default value.

# Fill missing values with the defaults declared on the schema's features,
# then validate again and print whatever is still unresolved.
curator.standardize()
try:
    curator.validate()
except ln.errors.ValidationError as err:
    print(err)
! 2 terms not validated in feature 'disease': 'normal', 'Hypertension'
    → fix typos, remove non-existent values, or save terms via: curator.cat.add_new_from('disease')
! 2 terms not validated in feature 'developmental_stage': 'Adult', 'Child'
    → fix typos, remove non-existent values, or save terms via: curator.cat.add_new_from('developmental_stage')
! 2 terms not validated in feature 'phenotype': 'Tumor growth', 'Airway inflammation'
    → fix typos, remove non-existent values, or save terms via: curator.cat.add_new_from('phenotype')
2 terms not validated in feature 'phenotype': 'Tumor growth', 'Airway inflammation'
    → fix typos, remove non-existent values, or save terms via: curator.cat.add_new_from('phenotype')

Add the ‘normal’ term to the disease registry.

# Register "normal" as a disease record so that the default value used for
# the disease feature can be validated.
normal_disease = bt.Disease(name="normal", description="Healthy condition")
normal_disease.save()
Disease(uid='7kTPatVd', name='normal', description='Healthy condition', branch_id=1, space_id=1, created_by_id=1, run_id=1, created_at=2025-10-16 11:31:26 UTC, is_locked=False)

Curate the remaining mismatches manually.

# Build lookup objects over the public ontologies so that terms can be
# referenced by attribute instead of typing their names by hand.
diseases = bt.Disease.public().lookup()
phenotypes = bt.Phenotype.public().lookup()
developmental_stages = bt.DevelopmentalStage.public().lookup()

# Replace the free-text categories that failed validation with the names of
# the matching ontology terms. Categories are renamed on the same dataframe
# that the curator was created from.
df["disease"] = df["disease"].cat.rename_categories(
    {"Hypertension": diseases.hypertensive_disorder.name}
)
df["phenotype"] = df["phenotype"].cat.rename_categories(
    {
        "Tumor growth": phenotypes.neoplasm.name,
        "Airway inflammation": phenotypes.bronchitis.name,
    }
)
df["developmental_stage"] = df["developmental_stage"].cat.rename_categories(
    {
        # NOTE(review): "Adult" is mapped to the adolescent stage term, which
        # looks like a mismatch — confirm whether an adult stage term was
        # intended here.
        "Adult": developmental_stages.adolescent_stage.name,
        "Child": developmental_stages.child_stage.name,
    }
)

# Re-run validation after the manual fixes; this call is not wrapped in
# try/except, so any remaining validation error will surface here.
curator.validate()
Hide code cell content
# Clean up: remove the local storage directory and delete the test instance.
!rm -rf test-ehrschema
!lamin delete --force test-ehrschema
 deleting instance testuser1/test-ehrschema