Source code for cia.external

import numpy as np
import pandas as pd
import scanpy as sc

def celltypist_majority_vote(data, classification_obs, groups_obs=None, min_prop=0, unassigned_label='Unassigned'):
    """
    Extend the most represented predicted label to every cell of each group.

    Wraps the Celltypist majority-voting approach (DOI: 10.1126/science.abl5197).
    For each classification column, the label with the highest count inside
    each cell group is assigned to all cells of that group. If no reference
    grouping is given, an over-clustering is first computed with the Leiden
    algorithm.

    Parameters
    ----------
    data : anndata.AnnData
        Object holding the cells. Only ``data.obs`` and ``data.n_obs`` are
        accessed directly here; when ``groups_obs`` is None the object is also
        passed to ``sc.tl.leiden``.
    classification_obs : str or list of str
        Name(s) of the ``data.obs`` column(s) storing the per-cell predicted
        labels.
    groups_obs : str, optional
        Name of the ``data.obs`` column storing the reference groups. If None,
        Leiden clustering is run with ``resolution = 5 + 5 * (n_obs // 20000)``
        and stored in ``data.obs['leiden_<resolution>']``.
    min_prop : float, optional
        Minimum fraction of a group's cells that the winning label must reach.
        Groups whose winning label falls below it receive ``unassigned_label``.
        Default is 0.
    unassigned_label : str, optional
        Label given to groups where no label reaches ``min_prop``.
        Default is 'Unassigned'.

    Returns
    -------
    None
        ``data`` is modified in place: for every classification column a
        categorical column ``'<classification>_majority_voting'`` is written
        to ``data.obs``.

    Notes
    -----
    Progress messages are written with ``print``.
    """
    if groups_obs is None:
        # The resolution grows in steps of 5 for every 20,000 observations.
        resolution = 5 + 5 * (data.n_obs // 20000)
        print(
            'Reference annotation not selected. \n'
            f'Computing over-clustering with Leiden algorithm (resolution={resolution}) ...'
        )
        sc.tl.leiden(data, resolution=resolution, key_added=f'leiden_{resolution}')
        groups_obs = f'leiden_{resolution}'
        print(f'Dataset has been divided into {len(data.obs[groups_obs].cat.categories)} groups according to transcriptional similarities.')
        print(f'Over-clustering result saved in AnnData.obs["{groups_obs}"].')
    else:
        print(f'AnnData.obs["{groups_obs}"] selected as reference annotation.')

    print('Extending the more represented cell type label to each cell group...\n')
    groups = data.obs[groups_obs]

    # Accept a single column name as well as a list of names.
    if isinstance(classification_obs, str):
        classification_obs = [classification_obs]

    for classification in classification_obs:
        # Rows: predicted labels, columns: groups, values: cell counts.
        votes = pd.crosstab(data.obs[classification], groups)
        majority = votes.idxmax()
        freqs = votes.max() / votes.sum()
        assigned = freqs >= min_prop

        # With a categorical classification column the winning labels may come
        # back as a categorical Series, which rejects a replacement value that
        # is not one of its categories. Register the unassigned label first,
        # but only when it is actually needed, so other cases are unchanged.
        if (
            isinstance(majority.dtype, pd.CategoricalDtype)
            and not assigned.all()
            and unassigned_label not in majority.cat.categories
        ):
            majority = majority.cat.add_categories([unassigned_label])

        # Groups whose winning label is below the threshold become unassigned.
        majority_labels = majority.where(assigned, unassigned_label)
        data.obs[f'{classification}_majority_voting'] = groups.map(majority_labels).astype('category')
        print(f'New classification labels have been stored in AnnData.obs["{classification}_majority_voting"].')