Source code for cellarium.ml.preprocessing.highly_variable_genes

# Copyright Contributors to the Cellarium project.
# SPDX-License-Identifier: BSD-3-Clause

import logging
import warnings

import numpy as np
import pandas as pd
import torch


[docs] def get_highly_variable_genes( gene_names: list, mean: torch.Tensor, var: torch.Tensor, n_top_genes: int | None = None, min_disp: float | None = 0.5, max_disp: float | None = np.inf, min_mean: float | None = 0.0125, max_mean: float | None = 3, n_bins: int = 20, ) -> pd.DataFrame: r""" Get Highly variably genes. This is a replication of Highly Variable Genes from Scanpy with a ``Seurat`` flavor. **References:** 1. `Highly Variable Genes from Scanpy <https://scanpy.readthedocs.io/en/stable/generated/scanpy.pp.highly_variable_genes.html>`_. Args: gene_names: Ensembl gene ids. mean: Gene expression means. var: Gene expression vars. n_top_genes: Number of highly-variable genes to keep. min_disp: If ``n_top_genes`` unequals None, this and all other cutoffs for the means and the normalized dispersions are ignored. max_disp: If ``n_top_genes`` unequals None, this and all other cutoffs for the means and the normalized dispersions are ignored. min_mean: If ``n_top_genes`` unequals None, this and all other cutoffs for the means and the normalized dispersions are ignored. max_mean: If ``n_top_genes`` unequals None, this and all other cutoffs for the means and the normalized dispersions are ignored. n_bins: Number of bins for binning the mean gene expression. Normalization is done with respect to each bin. If just a single gene falls into a bin, the normalized dispersion is artificially set to 1. You’ll be informed about this """ # compute the dispersion if not (len(gene_names) == len(mean) == len(var)): raise ValueError("Sizes of `gene_names`, `mean`, and `var` should be the same") mean[mean == 0] = 1e-12 # set entries equal to zero to small value dispersion = var / mean # logarithmized mean as in Seurat dispersion[dispersion == 0] = np.nan dispersion = torch.log(dispersion) mean = torch.log1p(mean) # All the following quantities are "per-gene" here df = pd.DataFrame() df["means"] = mean df["dispersions"] = dispersion df["mean_bin"] = pd.cut(df["means"], bins=n_bins) disp_grouped = df.groupby("mean_bin", observed=False)["dispersions"] disp_mean_bin = disp_grouped.mean() disp_std_bin = disp_grouped.std(ddof=1) df.index = gene_names # retrieve those genes that have nan std, these are the ones where # only a single gene fell in the bin and implicitly set them to have # a normalized disperion of 1 one_gene_per_bin = disp_std_bin.isnull() gen_indices = np.where(one_gene_per_bin[df["mean_bin"].values])[0].tolist() if len(gen_indices) > 0: logging.debug( f"Gene indices {gen_indices} fell into a single bin: their " "normalized dispersion was set to 1.\n " "Decreasing `n_bins` will likely avoid this effect." ) # Circumvent pandas 0.23 bug. Both sides of the assignment have dtype==float32, # but there’s still a dtype error without “.value”. disp_std_bin[one_gene_per_bin.values] = disp_mean_bin[one_gene_per_bin.values].values disp_mean_bin[one_gene_per_bin.values] = 0 # actually do the normalization df["dispersions_norm"] = ( df["dispersions"].values - disp_mean_bin[df["mean_bin"].values].values # use values here as index differs ) / disp_std_bin[df["mean_bin"].values].values dispersion_norm = df["dispersions_norm"].values if n_top_genes is not None: dispersion_norm = dispersion_norm[~np.isnan(dispersion_norm)] dispersion_norm[::-1].sort() # interestingly, np.argpartition is slightly slower if n_top_genes > len(gene_names): logging.info("`n_top_genes` > `adata.n_var`, returning all genes.") n_top_genes = len(gene_names) if n_top_genes > len(dispersion_norm): warnings.warn( "`n_top_genes` > number of normalized dispersions, returning all genes with normalized dispersions.", UserWarning, ) n_top_genes = len(dispersion_norm) disp_cut_off = dispersion_norm[n_top_genes - 1] gene_subset = np.nan_to_num(df["dispersions_norm"].values) >= disp_cut_off logging.debug(f"the {n_top_genes} top genes correspond to a " f"normalized dispersion cutoff of {disp_cut_off}") else: dispersion_norm[np.isnan(dispersion_norm)] = 0 # similar to Seurat gene_subset = np.logical_and.reduce( ( mean.numpy() > min_mean, mean.numpy() < max_mean, dispersion_norm > min_disp, dispersion_norm < max_disp, ) ) df["highly_variable"] = gene_subset return df