##### Arc Virtual Cell Atlas: scRNA-seq

The Arc Virtual Cell Atlas hosts one of the biggest collections of
scRNA-seq datasets.

Lamin mirrors the dataset for simplified access here:
laminlabs/arc-virtual-cell-atlas.

If you use the data academically, please cite the original
publications, Youngblut *et al.* (2025) and Zhang *et al.* (2025).

If you'd like to transfer data into your own LaminDB instance, see the
transfer guide.

 # pip install 'lamindb[gcp,bionty,pertdb]'
 !lamin init --modules bionty,pertdb --storage ./test-arc-virtual-cell-atlas

 import lamindb as ln
 import pyarrow.compute as pc
 import anndata as ad

Create the central query object for this instance:

 db = ln.DB("laminlabs/arc-virtual-cell-atlas")

#### Tahoe-100M

 project_tahoe = db.Project.get(name="Tahoe-100M")
 project_tahoe

 # one collection in this project
 project_tahoe.collections.to_dataframe()

Every individual dataset in the atlas is an ".h5ad" file that is
registered as an artifact in LaminDB.

Artifact level metadata are registered and can be explored as follows:

 # get the collection: https://lamin.ai/laminlabs/arc-virtual-cell-atlas/collection/BpavRL4ntRTzWEE5
 collection_tahoe = db.Collection.get(key="tahoe100")
 # 14 artifacts in this collection, each corresponding to one plate
 artifacts_tahoe = collection_tahoe.artifacts.distinct()
 artifacts_tahoe.to_dataframe()

50 cell lines.

 artifacts_tahoe.to_list("cell_lines__name")[:5]

380 compounds.

 artifacts_tahoe.to_list("compounds__name")[:5]

1,138 perturbations.

 artifacts_tahoe.to_list("compound_perturbations__name")[:5]

 # check the curated metadata of the first artifact
 artifact1 = artifacts_tahoe[0]
 artifact1.describe()

16 obs metadata features.

 artifact1.features.slots["obs"].members.to_dataframe()

###### Query artifacts of interest based on metadata

Since all metadata are registered in the SQL database, we can explore
the datasets without accessing them.

Let's find which datasets contain A549 cells perturbed with Piroxicam.

 # Lookup objects give attribute-style access to registry records;
 # cell lines are looked up by their ontology ID.
 cell_lines = db.bionty.CellLine.lookup("ontology_id")
 drugs = db.pertdb.Compound.lookup()

 # Restrict the Tahoe artifacts to those annotated with the CVCL_0023
 # cell line (A549) and the compound Piroxicam.
 artifacts_a549_piroxicam = artifacts_tahoe.filter(
     cell_lines=cell_lines.cvcl_0023,
     compounds=drugs.piroxicam,
 )
 artifacts_a549_piroxicam.to_dataframe()

You can download an ".h5ad" into your local cache:

 artifact1.cache()

Or stream it:

 artifact1.open()

###### Open the obs metadata parquet file as a PyArrow Dataset

Open the obs metadata file (2.29 GB) as a PyArrow "Dataset".

 # The one artifact in the Tahoe project whose key ends with
 # "obs_metadata.parquet"; .one() expects exactly one match.
 obs_metadata = db.Artifact.filter(
     key__endswith="obs_metadata.parquet",
     projects=project_tahoe,
 ).one()
 obs_metadata

 # Open the parquet artifact and inspect its column schema.
 obs_metadata_ds = obs_metadata.open()
 obs_metadata_ds.schema

Find the A549 cells that are perturbed with Piroxicam:

 # Row-level predicate: the cell line is A549 and the drug is Piroxicam.
 is_a549 = pc.field("cell_name") == cell_lines.cvcl_0023.name
 is_piroxicam = pc.field("drug") == drugs.piroxicam.name
 filter_expr = is_a549 & is_piroxicam
 # The filter is passed to the scanner; the resulting table is converted to pandas.
 obs_metadata_df = obs_metadata_ds.scanner(filter=filter_expr).to_table().to_pandas()
 # Number of matching cells per plate.
 obs_metadata_df.value_counts("plate")

Retrieve the corresponding cells from h5ad files.

 # Collect the barcodes of the selected cells per plate, so that each
 # artifact can be subset to its own cells. The selection lives in
 # `obs_metadata_df` (the filtered obs metadata computed in the previous cell).
 plate_cells = obs_metadata_df.groupby("plate")["BARCODE_SUB_LIB_ID"].apply(list)

 adatas = []
 for artifact in artifacts_a549_piroxicam:
     # the plate annotation of the artifact selects the matching barcode list
     plate = artifact.features.get_values()["plate"]
     idxs = plate_cells.get(plate)
     print(f"Loading {len(idxs)} cells from plate {plate}")
     with artifact.open() as store:
         adata = store[idxs].to_memory()  # can also subset genes here
         adatas.append(adata)

#### scBaseCount

 project_scbasecount = db.Project.get(name="scBaseCount")
 project_scbasecount

This project has 105 collections (21 organisms x 5 count features):

 project_scbasecount.collections.to_dataframe()

###### Query artifacts of interest based on metadata

Often you might not want to access all the h5ads in a collection, but
rather filter them by metadata:

 # Lookup objects for the registries used in the query below.
 organisms = db.bionty.Organism.lookup()
 tissues = db.bionty.Tissue.lookup()
 efos = db.bionty.ExperimentalFactor.lookup()
 feature_counts = db.ULabel.filter(type__name="STARsolo count features").lookup()

 # scBaseCount h5ad artifacts annotated as human, brain, single-cell,
 # with the `genefull_ex50pas` count feature.
 h5ads_brain = db.Artifact.filter(
     suffix=".h5ad",
     projects=project_scbasecount,
     organisms=organisms.human,
     ulabels=feature_counts.genefull_ex50pas,
     tissues=tissues.brain,
     experimental_factors=efos.single_cell,
 ).distinct()

 h5ads_brain.to_dataframe()

###### Load the h5ad files with obs metadata

Load the h5ads as a single AnnData:

 adata_concat = h5ads_brain[:5].load()
 adata_concat

Open the sample metadata:

 # The sample-metadata parquet artifact for human / `genefull_ex50pas`
 # in the scBaseCount project; .one() expects exactly one match.
 sample_meta = db.Artifact.filter(
     key__endswith="sample_metadata.parquet",
     projects=project_scbasecount,
     organisms=organisms.human,
     ulabels=feature_counts.genefull_ex50pas,
 ).one()
 sample_meta

 # Open the parquet artifact and inspect its column schema.
 sample_meta_dataset = sample_meta.open()
 sample_meta_dataset.schema

Fetch corresponding sample metadata:

 # Keep only the sample-metadata rows whose SRX accession occurs in the
 # loaded cells (accessions are cast to str before matching).
 srx_accessions = adata_concat.obs["SRX_accession"].astype(str)
 filter_expr = pc.field("srx_accession").isin(srx_accessions)
 df = sample_meta_dataset.scanner(filter=filter_expr).to_table().to_pandas()

Add the sample metadata to the AnnData:

 # A column-on-column DataFrame.merge ignores the left index, which would
 # replace the cell barcodes in obs_names with a fresh 0..n-1 range. A left
 # join validated as many-to-one keeps exactly one row per cell in the
 # original order (cells without sample metadata get NaN instead of being
 # dropped), so the barcodes can be restored before assigning back.
 obs_with_sample_meta = adata_concat.obs.merge(
     df,
     how="left",
     left_on="SRX_accession",
     right_on="srx_accession",
     validate="many_to_one",
 )
 obs_with_sample_meta.index = adata_concat.obs_names
 adata_concat.obs = obs_with_sample_meta
 adata_concat

 adata_concat.obs.head()