Arc Virtual Cell Atlas: scRNA-seq ¶

The Arc Virtual Cell Atlas hosts one of the biggest collections of scRNA-seq datasets.

Lamin mirrors the datasets for simplified access here: laminlabs/arc-virtual-cell-atlas.

If you use the data academically, please cite the original publications, Youngblut et al. (2025) and Zhang et al. (2025).

If you’d like to transfer data into your own LaminDB instance, see the transfer guide.

# pip install 'lamindb[gcp,bionty,pertdb]'
!lamin init --modules bionty,pertdb --storage ./test-arc-virtual-cell-atlas

import lamindb as ln
import pyarrow.compute as pc
import anndata as ad

Create the central query object for this instance:

db = ln.DB("laminlabs/arc-virtual-cell-atlas")

Tahoe-100M¶

project_tahoe = db.Project.get(name="Tahoe-100M")
project_tahoe

# one collection in this project
project_tahoe.collections.to_dataframe()

Show code cell output Hide code cell output

	uid	key	description	hash	reference	reference_type	version_tag	is_latest	is_locked	created_at	branch_id	created_on_id	space_id	created_by_id	run_id	meta_artifact_id
id
1	BpavRL4ntRTzWEE50000	tahoe100	None	GCLk4ZgQxgWspjmEUk3gIg	None	None	2025-02-25	True	False	2025-02-26 13:51:22.787537+00:00	1	1	1	1	3	None

Every individual dataset in the atlas is an .h5ad file that is registered as an artifact in LaminDB.

Artifact level metadata are registered and can be explored as follows:

# get the collection: https://lamin.ai/laminlabs/arc-virtual-cell-atlas/collection/BpavRL4ntRTzWEE5
collection_tahoe = db.Collection.get(key="tahoe100")
# 14 artifacts in this collection, each corresponding to one plate
artifacts_tahoe = collection_tahoe.artifacts.distinct()
artifacts_tahoe.to_dataframe()

Show code cell output Hide code cell output

	uid	key	description	suffix	kind	otype	size	hash	n_files	n_observations	...	is_latest	is_locked	created_at	branch_id	created_on_id	space_id	storage_id	run_id	schema_id	created_by_id
id
1375	BDttiuV3Te8VB0dU0000	2025-02-25/h5ad/plate9_filt_Vevo_Tahoe100M_WSe...	None	.h5ad	dataset	AnnData	18791302576	4kHbVbmreg6akW6ZgsjxaA	None	5866669	...	True	False	2025-02-25 23:22:22.759201+00:00	1	1	1	2	1	3	1
1374	czC19UpUEszVH2bU0000	2025-02-25/h5ad/plate8_filt_Vevo_Tahoe100M_WSe...	None	.h5ad	dataset	AnnData	30390935958	ilAzEPIh4FlDeTFaJ1dILw	None	8880979	...	True	False	2025-02-25 23:22:22.387666+00:00	1	1	1	2	1	3	1
1373	DC5cacdJr1VoEXnl0000	2025-02-25/h5ad/plate7_filt_Vevo_Tahoe100M_WSe...	None	.h5ad	dataset	AnnData	16514746341	NOS4MY6eYYPOnAB8ViyWYg	None	5692117	...	True	False	2025-02-25 23:22:22.009157+00:00	1	1	1	2	1	3	1
1372	aAHQ3zbD7n1asyYr0000	2025-02-25/h5ad/plate6_filt_Vevo_Tahoe100M_WSe...	None	.h5ad	dataset	AnnData	28934897078	NYvQEqVClziHm0ozWhOw1w	None	7545393	...	True	False	2025-02-25 23:22:21.629962+00:00	1	1	1	2	1	3	1
1371	EZATJLC4jE7pmwo40000	2025-02-25/h5ad/plate5_filt_Vevo_Tahoe100M_WSe...	None	.h5ad	dataset	AnnData	19763140865	VMBKFzOI5cj7UC1UDENP4A	None	6419498	...	True	False	2025-02-25 23:22:21.255154+00:00	1	1	1	2	1	3	1
1370	tKTeff0ugWqAm4P70000	2025-02-25/h5ad/plate4_filt_Vevo_Tahoe100M_WSe...	None	.h5ad	dataset	AnnData	23292672278	BkBXznbSovNWXtzPFITPcQ	None	7004356	...	True	False	2025-02-25 23:22:20.879928+00:00	1	1	1	2	1	3	1
1369	XVSrkq9pyF1OBLgG0000	2025-02-25/h5ad/plate3_filt_Vevo_Tahoe100M_WSe...	None	.h5ad	dataset	AnnData	13173722269	Jnrt7DaSUCGn8D8LS2itaw	None	4705402	...	True	False	2025-02-25 23:22:20.497965+00:00	1	1	1	2	1	3	1
1368	ZFeVfd0ugAHeWCxm0000	2025-02-25/h5ad/plate2_filt_Vevo_Tahoe100M_WSe...	None	.h5ad	dataset	AnnData	29037152127	usxviuqGbuw0RYnECCVCWw	None	8064658	...	True	False	2025-02-25 23:22:20.113956+00:00	1	1	1	2	1	3	1
1367	aJIqo7bNyJAs9z0r0000	2025-02-25/h5ad/plate1_filt_Vevo_Tahoe100M_WSe...	None	.h5ad	dataset	AnnData	19070623904	9iCNcouMqfNS3HA/2GUWOA	None	5481420	...	True	False	2025-02-25 23:22:19.737995+00:00	1	1	1	2	1	3	1
1366	vn5cUJCHbjpPPsZx0000	2025-02-25/h5ad/plate14_filt_Vevo_Tahoe100M_WS...	None	.h5ad	dataset	AnnData	22427932564	FrnStRehP16siRGG35ou+g	None	6518806	...	True	False	2025-02-25 23:22:19.357999+00:00	1	1	1	2	1	3	1
1365	9L9HZ55HqUL0aqaR0000	2025-02-25/h5ad/plate13_filt_Vevo_Tahoe100M_WS...	None	.h5ad	dataset	AnnData	28071589885	RKOiaay+CHvv+Ukk/N+28A	None	8501658	...	True	False	2025-02-25 23:22:18.977981+00:00	1	1	1	2	1	3	1
1364	S2h2rPLCaUhZAM9u0000	2025-02-25/h5ad/plate12_filt_Vevo_Tahoe100M_WS...	None	.h5ad	dataset	AnnData	37495736876	VjAkWVFGVpzAMi9Innusuw	None	10487057	...	True	False	2025-02-25 23:22:18.600910+00:00	1	1	1	2	1	3	1
1363	omn7JStfJMzy8m6O0000	2025-02-25/h5ad/plate11_filt_Vevo_Tahoe100M_WS...	None	.h5ad	dataset	AnnData	23230802756	N2mzoYlMLEl6PdecaYyDvw	None	7435869	...	True	False	2025-02-25 23:22:18.229629+00:00	1	1	1	2	1	3	1
1362	56uA9lPPmJ4zLUcr0000	2025-02-25/h5ad/plate10_filt_Vevo_Tahoe100M_WS...	None	.h5ad	dataset	AnnData	26536400717	j1FXsX7hs7u+eBqnWnmNHw	None	8044908	...	True	False	2025-02-25 23:22:17.849980+00:00	1	1	1	2	1	3	1

14 rows × 21 columns

50 cell lines.

artifacts_tahoe.to_list("cell_lines__name")[:5]

380 compounds.

artifacts_tahoe.to_list("compounds__name")[:5]

1,138 perturbations.

artifacts_tahoe.to_list("compound_perturbations__name")[:5]

# check the curated metadata of the first artifact
artifact1 = artifacts_tahoe[0]
artifact1.describe()

Show code cell output Hide code cell output

Artifact: 2025-02-25/h5ad/plate10_filt_Vevo_Tahoe100M_WServicesFrom_ParseGigalab.h5ad (0000)
├── uid: 56uA9lPPmJ4zLUcr0000            run: 0xj4zui (register-tahoe100.ipynb)
│   kind: dataset                        otype: AnnData                        
│   hash: j1FXsX7hs7u+eBqnWnmNHw         size: 24.7 GB                         
│   branch: main                         space: all                            
│   created_at: 2025-02-25 23:22:17 UTC  created_by: sunnyosun                 
│   n_observations: 8044908                                                    
├── storage/path: gs://arc-ctc-tahoe100/2025-02-25/h5ad/plate10_filt_Vevo_Tahoe100M_WServicesFrom_ParseGigalab.h5ad
├── Dataset features
│   ├── var (62710 bionty.Gene.stabl…                                                                              
│   │   ANKIB1                         float                                                                       
│   │   C1orf112                       float                                                                       
│   │   CFH                            float                                                                       
│   │   CFTR                           float                                                                       
│   │   CYP51A1                        float                                                                       
│   │   DPM1                           float                                                                       
│   │   ENPP4                          float                                                                       
│   │   FGR                            float                                                                       
│   │   FUCA2                          float                                                                       
│   │   GCLC                           float                                                                       
│   │   KRIT1                          float                                                                       
│   │   LAS1L                          float                                                                       
│   │   NFYA                           float                                                                       
│   │   NIPAL3                         float                                                                       
│   │   RAD52                          float                                                                       
│   │   SCYL3                          float                                                                       
│   │   SEMA3F                         float                                                                       
│   │   STPG1                          float                                                                       
│   │   TNMD                           float                                                                       
│   │   TSPAN6                         float                                                                       
│   └── obs (16)                                                                                                   
│       BARCODE                        str                                                                         
│       G2M_score                      float                                                                       
│       S_score                        float                                                                       
│       cell_line                      bionty.CellLine.description                                                 
│       cell_name                      bionty.CellLine                      A-172, A-427, A498, A549, AN3 CA, AsPC…
│       drug                           pertdb.Compound                      5-Azacytidine, 5-Fluorouracil, Abirate…
│       drugname_drugconc              pertdb.CompoundPerturbation          [('5-Azacytidine', 0.05, 'uM')], [('5-…
│       gene_count                     int                                                                         
│       mread_count                    int                                                                         
│       pass_filter                    ULabel[yMABN5Dr]                     full, minimal                          
│       pcnt_mito                      float                                                                       
│       phase                          ULabel[kTzOKZ54]                     G1, G2M, S                             
│       plate                          ULabel[SjVCuE2Q]                     plate10                                
│       sample                         pertdb.Biosample                                                            
│       sublibrary                     str                                                                         
│       tscp_count                     int                                                                         
└── Labels
    └── .ulabels                       ULabel                               plate10, G1, G2M, S, full, minimal     
        .projects                      Project                              Tahoe-100M                             
        .references                    Reference                            Tahoe-100M: A Giga-Scale Single-Cell P…
        .organisms                     bionty.Organism                      human                                  
        .cell_lines                    bionty.CellLine                      NCI-H1573, NCI-H460, hTERT-HPNE, SW48,…
        .compounds                     pertdb.Compound                      Bestatin (hydrochloride), Ataluren, Ca…
        .compound_perturbations        pertdb.CompoundPerturbation          [('Bestatin (hydrochloride)', 0.05, 'u…

16 obs metadata features.

artifact1.features.slots["obs"].members.to_dataframe()

Show code cell output Hide code cell output

	uid	name	_dtype_str	unit	description	array_rank	array_size	array_shape	synonyms	default_value	...	coerce	is_locked	is_type	created_at	branch_id	created_on_id	space_id	created_by_id	run_id	type_id
id
19	gQE1h3fIBiSf	sample	cat[pertdb.Biosample]	None	Unique treatment identifier, distinguishes rep...	0	0	None	None	None	...	False	False	False	2025-02-26 10:59:36.743558+00:00	1	1	1	1	3	None
5	IjSP1lCY3Hyw	gene_count	int	None	Number of genes with at least one count	0	0	None	None	None	...	False	False	False	2025-02-25 22:30:30.668750+00:00	1	1	1	1	3	None
6	LHUmmYKjIGPl	tscp_count	int	None	Number of transcripts, aka UMI count	0	0	None	None	None	...	False	False	False	2025-02-25 22:30:31.236532+00:00	1	1	1	1	3	None
7	PZDiL36nJSFv	mread_count	int	None	Number of reads per cell	0	0	None	None	None	...	False	False	False	2025-02-25 22:30:31.810331+00:00	1	1	1	1	3	None
18	fLwdFKBUhBY9	drugname_drugconc	cat[pertdb.CompoundPerturbation]	None	Drug name, concentration, and concentration unit	0	0	None	None	None	...	False	False	False	2025-02-25 23:04:17.541812+00:00	1	1	1	1	3	None
17	Q0cj2JR5Juwn	drug	cat[pertdb.Compound]	None	Drug name, parsed out from the drugname_drugco...	0	0	None	None	None	...	False	False	False	2025-02-25 23:02:05.717794+00:00	1	1	1	1	3	None
4	vshELphl73qp	cell_line	cat[bionty.CellLine.description]	None	Cell line information (if applicable)	0	0	None	None	None	...	False	False	False	2025-02-25 22:27:22.393997+00:00	1	1	1	1	3	None
15	3X4d0QEUuprp	sublibrary	str	None	Sublibrary ID (related to library prep and seq...	0	0	None	None	None	...	False	False	False	2025-02-25 22:35:14.673178+00:00	1	1	1	1	3	None
16	dQELv2sIVnJX	BARCODE	str	None	Barcode ID	0	0	None	None	None	...	False	False	False	2025-02-25 22:35:15.627971+00:00	1	1	1	1	3	None
8	X640W5tBUPOQ	pcnt_mito	float	None	Percentage of mitochondrial reads	0	0	None	None	None	...	False	False	False	2025-02-25 22:31:21.581885+00:00	1	1	1	1	3	None
9	bujDkB4Nd1S5	S_score	float	None	Inferred S phase score	0	0	None	None	None	...	False	False	False	2025-02-25 22:31:22.144135+00:00	1	1	1	1	3	None
10	CF0O0e0WZxFz	G2M_score	float	None	Inferred G2M score	0	0	None	None	None	...	False	False	False	2025-02-25 22:31:22.708895+00:00	1	1	1	1	3	None
2	QboQ1Q1Yxsjn	phase	cat[ULabel[kTzOKZ54]]	None	Inferred cell cycle phase	0	0	None	None	None	...	False	False	False	2025-02-25 22:21:56.935262+00:00	1	1	1	1	3	None
3	PVpyJhciLdCQ	pass_filter	cat[ULabel[yMABN5Dr]]	None	"Full" filters are more stringent on gene_coun...	0	0	None	None	None	...	False	False	False	2025-02-25 22:25:30.918235+00:00	1	1	1	1	3	None
11	KPT70T8xJLIt	cell_name	cat[bionty.CellLine]	None	Commonly-used cell name (related to the cell_l...	0	0	None	None	None	...	False	False	False	2025-02-25 22:32:56.082195+00:00	1	1	1	1	3	None
1	YRSYWdIiesqL	plate	cat[ULabel[SjVCuE2Q]]	None	Plate identifier	0	0	None	None	None	...	False	False	False	2025-02-25 22:03:51.786985+00:00	1	1	1	1	3	None

16 rows × 21 columns

Query artifacts of interest based on metadata¶

Since all metadata are registered in the SQL database, we can explore the datasets without accessing them.

Let’s find which datasets contain A549 cells perturbed with Piroxicam.

# lookup objects give you pythonic access to the values
# cell lines are keyed by ontology ID here, so A549 is accessed as `cvcl_0023`
cell_lines = db.bionty.CellLine.lookup("ontology_id")
drugs = db.pertdb.Compound.lookup()

# keep only the Tahoe artifacts annotated with both the A549 cell line
# and the compound Piroxicam
artifacts_a549_piroxicam = artifacts_tahoe.filter(
    cell_lines=cell_lines.cvcl_0023, compounds=drugs.piroxicam
)
artifacts_a549_piroxicam.to_dataframe()

Show code cell output Hide code cell output

	uid	key	description	suffix	kind	otype	size	hash	n_files	n_observations	...	is_latest	is_locked	created_at	branch_id	created_on_id	space_id	storage_id	run_id	schema_id	created_by_id
id
1364	S2h2rPLCaUhZAM9u0000	2025-02-25/h5ad/plate12_filt_Vevo_Tahoe100M_WS...	None	.h5ad	dataset	AnnData	37495736876	VjAkWVFGVpzAMi9Innusuw	None	10487057	...	True	False	2025-02-25 23:22:18.600910+00:00	1	1	1	2	1	3	1
1363	omn7JStfJMzy8m6O0000	2025-02-25/h5ad/plate11_filt_Vevo_Tahoe100M_WS...	None	.h5ad	dataset	AnnData	23230802756	N2mzoYlMLEl6PdecaYyDvw	None	7435869	...	True	False	2025-02-25 23:22:18.229629+00:00	1	1	1	2	1	3	1
1362	56uA9lPPmJ4zLUcr0000	2025-02-25/h5ad/plate10_filt_Vevo_Tahoe100M_WS...	None	.h5ad	dataset	AnnData	26536400717	j1FXsX7hs7u+eBqnWnmNHw	None	8044908	...	True	False	2025-02-25 23:22:17.849980+00:00	1	1	1	2	1	3	1

3 rows × 21 columns

You can download an .h5ad into your local cache:

artifact1.cache()

Or stream it:

artifact1.open()

Open the obs metadata parquet file as a PyArrow Dataset¶

Open the obs metadata file (2.29 GB) as a PyArrow Dataset.

obs_metadata = db.Artifact.filter(
    key__endswith="obs_metadata.parquet", projects=project_tahoe
).one()
obs_metadata

obs_metadata_ds = obs_metadata.open()
obs_metadata_ds.schema

Find the A549 cells that are perturbed with Piroxicam:

# build a PyArrow filter expression on the `cell_name` and `drug` columns,
# using the names of the records looked up above
filter_expr = (pc.field("cell_name") == cell_lines.cvcl_0023.name) & (
    pc.field("drug") == drugs.piroxicam.name
)
# apply the filter while scanning and convert only the matching rows to pandas
obs_metadata_df = obs_metadata_ds.scanner(filter=filter_expr).to_table().to_pandas()
# number of matching rows per plate
obs_metadata_df.value_counts("plate")

Retrieve the corresponding cells from h5ad files.

# group the identifiers of the matching cells by the plate they belong to;
# `obs_metadata_df` is the filtered obs metadata computed in the previous step
plate_cells = obs_metadata_df.groupby("plate")["BARCODE_SUB_LIB_ID"].apply(list)

adatas = []
for artifact in artifacts_a549_piroxicam:
    plate = artifact.features.get_values()["plate"]
    idxs = plate_cells.get(plate)
    if idxs is None:
        # `.get` returns None when no matching cell was found for this plate
        print(f"No matching cells on plate {plate}, skipping")
        continue
    print(f"Loading {len(idxs)} cells from plate {plate}")
    # open the h5ad without loading it fully, then load only the selected cells
    with artifact.open() as store:
        adata = store[idxs].to_memory()  # can also subset genes here
        adatas.append(adata)

scBaseCount¶

project_scbasecount = db.Project.get(name="scBaseCount")
project_scbasecount

This project has 105 collections (21 organisms x 5 count features):

project_scbasecount.collections.to_dataframe()

Show code cell output Hide code cell output

! truncated query result to limit=100 Collection objects

	uid	key	description	hash	reference	reference_type	version_tag	is_latest	is_locked	created_at	branch_id	created_on_id	space_id	created_by_id	run_id	meta_artifact_id
id
107	wwvSKTeDmTri9Ppf0000	scBaseCount/Velocyto/Mus_musculus	None	j3BeJyLuclN11yQpqHJj6Q	None	None	2025-02-25	True	False	2025-03-03 11:09:45.776463+00:00	1	1	1	1	10	None
106	wdVaulVvESgAWwtf0000	scBaseCount/GeneFull_ExonOverIntron/Mus_musculus	None	Yr9AxC-eL10vVMuigJOlrg	None	None	2025-02-25	True	False	2025-03-03 11:09:34.372387+00:00	1	1	1	1	10	None
105	83gTx3oxX5S4SxQ30000	scBaseCount/GeneFull_Ex50pAS/Mus_musculus	None	x-Tm3VldcW71n3mYE2KknQ	None	None	2025-02-25	True	False	2025-03-03 11:09:22.891607+00:00	1	1	1	1	10	None
104	zLwr9k0TkiRt6ymZ0000	scBaseCount/GeneFull/Mus_musculus	None	i30e5gnKklC8UBqSS0aVSA	None	None	2025-02-25	True	False	2025-03-03 11:09:11.674645+00:00	1	1	1	1	10	None
103	wQQNz6vrQeKuro540000	scBaseCount/Gene/Mus_musculus	None	QeF9x4hTGYLw8MzFvLBCoQ	None	None	2025-02-25	True	False	2025-03-03 11:09:00.351899+00:00	1	1	1	1	10	None
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
12	Aioyo5zHXzPUkSuT0000	scBaseCount/Velocyto/Bos_taurus	None	HkQe255ahyg8xbV35eRs4Q	None	None	2025-02-25	True	False	2025-03-03 11:00:28.668980+00:00	1	1	1	1	10	None
11	gJrcdOm2sG7JUINS0000	scBaseCount/GeneFull_ExonOverIntron/Bos_taurus	None	BFGras5oupzn4iVCjSjZ0A	None	None	2025-02-25	True	False	2025-03-03 11:00:23.782698+00:00	1	1	1	1	10	None
10	gY3xsMES4idjZb320000	scBaseCount/GeneFull_Ex50pAS/Bos_taurus	None	7E9sWxY48KZlzq0K9vT-rw	None	None	2025-02-25	True	False	2025-03-03 11:00:18.903653+00:00	1	1	1	1	10	None
9	owfF1Bfuq660eiDp0000	scBaseCount/GeneFull/Bos_taurus	None	ionjx_HD9P6K9u5dJKgR3w	None	None	2025-02-25	True	False	2025-03-03 11:00:14.013350+00:00	1	1	1	1	10	None
8	ttGkPgXxLDO4sSXF0000	scBaseCount/Gene/Bos_taurus	None	jn1Nhcdt0lpB1I3hQ4SgFw	None	None	2025-02-25	True	False	2025-03-03 11:00:09.130314+00:00	1	1	1	1	10	None

100 rows × 16 columns

Query artifacts of interest based on metadata¶

Often you might not want to access all the h5ads in a collection, but rather filter them by metadata:

# lookup objects for the registries used in the filter below
organisms = db.bionty.Organism.lookup()
tissues = db.bionty.Tissue.lookup()
efos = db.bionty.ExperimentalFactor.lookup()
# only the labels of type "STARsolo count features"
feature_counts = db.ULabel.filter(type__name="STARsolo count features").lookup()

# .h5ad artifacts of the scBaseCount project annotated with human, brain,
# single-cell, and the GeneFull_Ex50pAS count feature
h5ads_brain = db.Artifact.filter(
    suffix=".h5ad",
    projects=project_scbasecount,
    organisms=organisms.human,
    ulabels=feature_counts.genefull_ex50pas,
    tissues=tissues.brain,
    experimental_factors=efos.single_cell,
).distinct()

h5ads_brain.to_dataframe()

Show code cell output Hide code cell output

! truncated query result to limit=100 Artifact objects

	uid	key	description	suffix	kind	otype	size	hash	n_files	n_observations	...	is_latest	is_locked	created_at	branch_id	created_on_id	space_id	storage_id	run_id	schema_id	created_by_id
id
118412	AgGg94YvPyD2J3xN0000	2025-02-25/h5ad/GeneFull_Ex50pAS/Homo_sapiens/...	None	.h5ad	dataset	AnnData	103189773	XPcRAQJEzXp/aSQQLRYukw	None	21458	...	True	False	2025-02-28 16:46:25.771217+00:00	1	1	1	3	10	55	1
118411	XGZc3o3CXLr7Ai6r0000	2025-02-25/h5ad/GeneFull_Ex50pAS/Homo_sapiens/...	None	.h5ad	dataset	AnnData	127392244	gu1uqPVfX1hKdsedK7DUNA	None	16021	...	True	False	2025-02-28 16:46:25.771217+00:00	1	1	1	3	10	55	1
118410	G526AmojhrmEfGSB0000	2025-02-25/h5ad/GeneFull_Ex50pAS/Homo_sapiens/...	None	.h5ad	dataset	AnnData	125246141	B0R9drWNPWlPQg2eDVdvlQ	None	14134	...	True	False	2025-02-28 16:46:25.771217+00:00	1	1	1	3	10	55	1
118409	v3PUXjo6pfzyGVMB0000	2025-02-25/h5ad/GeneFull_Ex50pAS/Homo_sapiens/...	None	.h5ad	dataset	AnnData	98071338	UokquYv9f7gt0atu4vK4jA	None	11673	...	True	False	2025-02-28 16:46:25.771217+00:00	1	1	1	3	10	55	1
118408	DU6TONwI4j3hplTT0000	2025-02-25/h5ad/GeneFull_Ex50pAS/Homo_sapiens/...	None	.h5ad	dataset	AnnData	135310504	kNPAbA/aEut/sJrYctH01Q	None	14599	...	True	False	2025-02-28 16:46:25.771217+00:00	1	1	1	3	10	55	1
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
117410	HaPkPp9YL0pQXxcz0000	2025-02-25/h5ad/GeneFull_Ex50pAS/Homo_sapiens/...	None	.h5ad	dataset	AnnData	90957844	y8uiWV30cvuXfOqm+eTo1A	None	11191	...	True	False	2025-02-28 16:46:25.771217+00:00	1	1	1	3	10	55	1
117407	8LTArb9cSkapKA2y0000	2025-02-25/h5ad/GeneFull_Ex50pAS/Homo_sapiens/...	None	.h5ad	dataset	AnnData	39121738	JFi/i0Ih6DipQYS8vP2SrA	None	3780	...	True	False	2025-02-28 16:46:25.771217+00:00	1	1	1	3	10	55	1
117406	zOtkyAOMeXf0OHkO0000	2025-02-25/h5ad/GeneFull_Ex50pAS/Homo_sapiens/...	None	.h5ad	dataset	AnnData	31518175	+NiApiUkfwXyrDDcxVR96g	None	3920	...	True	False	2025-02-28 16:46:25.771217+00:00	1	1	1	3	10	55	1
117405	DqIvWjdS75AHdVwJ0000	2025-02-25/h5ad/GeneFull_Ex50pAS/Homo_sapiens/...	None	.h5ad	dataset	AnnData	31705623	lW4HW9XX96UpY5o4DJYJ+g	None	3368	...	True	False	2025-02-28 16:46:25.771217+00:00	1	1	1	3	10	55	1
117404	yFSbcNYC2GKy1Aae0000	2025-02-25/h5ad/GeneFull_Ex50pAS/Homo_sapiens/...	None	.h5ad	dataset	AnnData	40797146	8gyle0Hc/2D2n0DGFryWJg	None	5030	...	True	False	2025-02-28 16:46:25.771217+00:00	1	1	1	3	10	55	1

100 rows × 21 columns

Load the h5ad files with obs metadata¶

Load the first 5 h5ads as a single AnnData:

adata_concat = h5ads_brain[:5].load()
adata_concat

Open the sample metadata:

sample_meta = db.Artifact.filter(
    key__endswith="sample_metadata.parquet",
    projects=project_scbasecount,
    organisms=organisms.human,
    ulabels=feature_counts.genefull_ex50pas,
).one()
sample_meta

sample_meta_dataset = sample_meta.open()
sample_meta_dataset.schema

Fetch corresponding sample metadata:

# select the sample metadata rows whose accession occurs in the loaded cells;
# note the casing: the parquet column is `srx_accession`, the obs column is
# `SRX_accession`
filter_expr = pc.field("srx_accession").isin(
    adata_concat.obs["SRX_accession"].astype(str)
)
df = sample_meta_dataset.scanner(filter=filter_expr).to_table().to_pandas()

Add the sample metadata to the AnnData:

# join per-cell obs with per-sample metadata on the accession columns;
# both key columns are kept and the merged frame has a fresh integer index
# (visible in the `head()` output below)
adata_concat.obs = adata_concat.obs.merge(
    df, left_on="SRX_accession", right_on="srx_accession"
)
adata_concat

adata_concat.obs.head()

Show code cell output Hide code cell output

	gene_count	umi_count	SRX_accession	artifact_uid	entrez_id	srx_accession	file_path	obs_count	lib_prep	tech_10x	cell_prep	organism	tissue	disease	perturbation	cell_line	czi_collection_id	czi_collection_name
0	5939	14141.0	ERX10019090	pvHcKyHxnSeXUa2F0000	26040861	ERX10019090	gs://arc-scbasecount/2025-02-25/h5ad/GeneFull_...	17403	10x_Genomics	3_prime_gex	single_cell	Homo sapiens	brain	Parkinson's Disease	Rotenone, 0.125 µM	WTSIi018-B-1	None	None
1	6331	18138.0	ERX10019090	pvHcKyHxnSeXUa2F0000	26040861	ERX10019090	gs://arc-scbasecount/2025-02-25/h5ad/GeneFull_...	17403	10x_Genomics	3_prime_gex	single_cell	Homo sapiens	brain	Parkinson's Disease	Rotenone, 0.125 µM	WTSIi018-B-1	None	None
2	5447	16033.0	ERX10019090	pvHcKyHxnSeXUa2F0000	26040861	ERX10019090	gs://arc-scbasecount/2025-02-25/h5ad/GeneFull_...	17403	10x_Genomics	3_prime_gex	single_cell	Homo sapiens	brain	Parkinson's Disease	Rotenone, 0.125 µM	WTSIi018-B-1	None	None
3	2307	4154.0	ERX10019090	pvHcKyHxnSeXUa2F0000	26040861	ERX10019090	gs://arc-scbasecount/2025-02-25/h5ad/GeneFull_...	17403	10x_Genomics	3_prime_gex	single_cell	Homo sapiens	brain	Parkinson's Disease	Rotenone, 0.125 µM	WTSIi018-B-1	None	None
4	965	1183.0	ERX10019090	pvHcKyHxnSeXUa2F0000	26040861	ERX10019090	gs://arc-scbasecount/2025-02-25/h5ad/GeneFull_...	17403	10x_Genomics	3_prime_gex	single_cell	Homo sapiens	brain	Parkinson's Disease	Rotenone, 0.125 µM	WTSIi018-B-1	None	None