Source code for cellarium.ml.data.fileio

# Copyright Contributors to the Cellarium project.
# SPDX-License-Identifier: BSD-3-Clause

import re
import shutil
import tempfile
import urllib.request

from anndata import AnnData, read_h5ad
from google.cloud.storage import Client

# Scheme prefixes (including the trailing colon) that mark a filename as a remote URL.
url_schemes = ("http:", "https:", "ftp:")


def read_h5ad_gcs(filename: str, storage_client: Client | None = None) -> AnnData:
    r"""
    Read ``.h5ad``-formatted hdf5 file from the Google Cloud Storage.

    Example::

        >>> adata = read_h5ad_gcs("gs://dsp-cellarium-cas-public/test-data/test_0.h5ad")

    Args:
        filename:
            Path to the data file in Cloud Storage. Must start with the ``gs:``
            protocol name and contain both a bucket name and a blob name,
            e.g. ``gs://bucket/path/to/file.h5ad``.
        storage_client:
            Storage client used to look up the bucket. If ``None``, a new
            ``Client()`` is constructed with default arguments.

    Returns:
        The object returned by ``read_h5ad`` for the opened blob.

    Raises:
        ValueError:
            If ``filename`` does not start with ``gs:``, or if the bucket name
            or the blob name is missing.
    """
    if not filename.startswith("gs:"):
        raise ValueError("The filename must start with 'gs:' protocol name.")

    # parse bucket and blob names from the filename
    path = re.sub(r"^gs://?", "", filename)
    bucket_name, _, blob_name = path.partition("/")
    # Fail early with an explicit message instead of an opaque tuple-unpacking
    # error (no "/" present) or a request against an empty bucket/blob name.
    if not bucket_name or not blob_name:
        raise ValueError(f"Expected a filename of the form 'gs://<bucket>/<blob>', got {filename!r}.")

    if storage_client is None:
        storage_client = Client()

    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(blob_name)

    # The blob is read while the file handle is still open.
    with blob.open("rb") as f:
        return read_h5ad(f)
def read_h5ad_url(filename: str) -> AnnData:
    r"""
    Read ``.h5ad``-formatted hdf5 file from the URL.

    Example::

        >>> adata = read_h5ad_url(
        ...     "https://storage.googleapis.com/dsp-cellarium-cas-public/test-data/test_0.h5ad"
        ... )

    Args:
        filename:
            URL of the data file. Must start with one of the prefixes listed
            in ``url_schemes``.

    Returns:
        The object returned by ``read_h5ad`` for the downloaded content.

    Raises:
        ValueError: If ``filename`` does not start with a recognized URL scheme.
    """
    is_url = filename.startswith(tuple(url_schemes))
    if not is_url:
        raise ValueError("The filename must start with 'http:', 'https:', or 'ftp:' protocol name.")

    # Copy the response into an anonymous temporary file and read it while that
    # file is still open; both handles are closed when the block exits.
    with urllib.request.urlopen(filename) as remote, tempfile.TemporaryFile() as local_copy:
        shutil.copyfileobj(remote, local_copy)
        return read_h5ad(local_copy)
def read_h5ad_local(filename: str) -> AnnData:
    r"""
    Read ``.h5ad``-formatted hdf5 file from the local disk.

    Args:
        filename:
            Path to the local data file, prefixed with the ``file:`` protocol
            name. A leading ``file:/`` or ``file://`` is removed before the
            remainder is handed to ``read_h5ad``.

    Returns:
        The object returned by ``read_h5ad`` for the stripped path.

    Raises:
        ValueError: If ``filename`` does not start with ``file:``.
    """
    has_file_scheme = filename.startswith("file:")
    if not has_file_scheme:
        raise ValueError("The filename must start with 'file:' protocol name.")

    # drop the scheme prefix, keeping everything after it as the path
    local_path = re.sub(r"^file://?", "", filename)
    return read_h5ad(local_path)
def read_h5ad_file(filename: str, **kwargs) -> AnnData:
    r"""
    Read ``.h5ad``-formatted hdf5 file from a filename.

    The reader is chosen from the filename prefix, checked in this order:
    ``gs:`` (Google Cloud Storage), ``file:`` (local disk), one of the
    prefixes in ``url_schemes`` (remote URL). Any other filename is passed
    to ``read_h5ad`` unchanged.

    Args:
        filename:
            Path to the data file.
        **kwargs:
            Extra keyword arguments. They are forwarded only to
            ``read_h5ad_gcs`` (for ``gs:`` filenames) and are otherwise unused.

    Returns:
        The object returned by the selected reader.
    """
    if filename.startswith("gs:"):
        adata = read_h5ad_gcs(filename, **kwargs)
    elif filename.startswith("file:"):
        adata = read_h5ad_local(filename)
    elif filename.startswith(tuple(url_schemes)):
        adata = read_h5ad_url(filename)
    else:
        # no recognized scheme: hand the filename to read_h5ad as-is
        adata = read_h5ad(filename)
    return adata