# Source code for cia.external

import numpy as np
import pandas as pd
import scanpy as sc


# [docs]  (link marker left over from the rendered documentation page)
def celltypist_majority_vote(data, classification_obs, groups_obs=None, min_prop=0, unassigned_label='Unassigned'):
    """
    Extend the most represented predicted cell type of each cell group to all its cells.

    A function that wraps Celltypist majority voting (DOI: 10.1126/science.abl5197).
    For every group of cells, the predicted label with the highest count inside the
    group is assigned to all the cells of that group.

    If no reference cell groups are provided, an over-clustering step is performed using the Leiden algorithm.

    Parameters
    ----------
    data : anndata.AnnData
        An AnnData object containing the cell data and, optionally, previous clustering results.
        Only ``data.obs`` and ``data.n_obs`` are accessed directly by this function.
    classification_obs : str or list of str
        The AnnData.obs column(s) where the cell type predictions (labels) are stored.
    groups_obs : str, optional
        The AnnData.obs column where the reference group labels are stored. If None, an over-clustering with the
        Leiden algorithm is performed, with a resolution of ``5 + 5 * (data.n_obs // 20000)``, and the result
        is stored in ``AnnData.obs['leiden_<resolution>']``.
    min_prop : float, optional
        The minimum proportion of cells required to assign a majority vote label to a group. If the largest
        cell type in a group doesn't reach this proportion, the group is labeled with `unassigned_label`.
        Default is 0 (the most represented label is always assigned).
    unassigned_label : str, optional
        The label to assign to cell groups where no cell type reaches the minimum proportion. Default is 'Unassigned'.

    Returns
    -------
    None
        Results are written in place: for each entry of `classification_obs` a categorical column named
        ``'<classification>_majority_voting'`` is added to (or overwritten in) ``data.obs``.

    Raises
    ------
    KeyError
        If any of the requested `classification_obs` columns, or `groups_obs`, is not present in ``data.obs``.
        The check is done before clustering and before any column is written.

    Notes
    -----
    The function automatically adjusts the resolution for the Leiden algorithm based on the number of observations in the data.
    The over-clustering is delegated to ``scanpy.tl.leiden``, which expects a neighborhood graph to be
    already available in `data` (see ``scanpy.pp.neighbors``).
    When two labels are equally represented in a group, the one appearing first in the contingency table wins.
    Progress messages are printed to standard output.
    """
    # Normalise to a list up front so every requested column can be checked before
    # the (potentially slow) clustering runs and before anything is written to obs.
    if isinstance(classification_obs, str):
        classification_obs = [classification_obs]
    else:
        classification_obs = list(classification_obs)

    missing = [column for column in classification_obs if column not in data.obs.columns]
    if groups_obs is not None and groups_obs not in data.obs.columns:
        missing.append(groups_obs)
    if missing:
        raise KeyError(f'Column(s) not found in AnnData.obs: {missing}')

    # Determine resolution for Leiden clustering based on data size if groups_obs is not provided
    if groups_obs is None:
        resolution = 5 + 5 * (data.n_obs // 20000)  # Increasing resolution in steps based on data size
        print(f'Reference annotation not selected. Computing over-clustering with Leiden algorithm (resolution={resolution}) ...')
        sc.tl.leiden(data, resolution=resolution, key_added=f'leiden_{resolution}')
        groups_obs = f'leiden_{resolution}'
        print(f'Dataset has been divided into {len(data.obs[groups_obs].cat.categories)} groups according to transcriptional similarities.')
        print(f'Over-clustering result saved in AnnData.obs["{groups_obs}"].')
    else:
        print(f'AnnData.obs["{groups_obs}"] selected as reference annotation.')

    print('Extending the more represented cell type label to each cell group...\n')
    groups = data.obs[groups_obs]

    for classification in classification_obs:
        # Contingency table: rows are predicted labels, columns are cell groups.
        votes = pd.crosstab(data.obs[classification], groups)

        # With categorical predictions the winners can come back as a categorical
        # Series, and inserting `unassigned_label` (not one of its categories) would
        # raise a TypeError. Plain objects accept any replacement label.
        majority = votes.idxmax().astype(object)
        freqs = votes.max() / votes.sum()

        # Apply minimum proportion threshold to assign labels
        majority_labels = majority.where(freqs >= min_prop, unassigned_label)
        data.obs[f'{classification}_majority_voting'] = groups.map(majority_labels).astype('category')

        print(f'New classification labels have been stored in AnnData.obs["{classification}_majority_voting"].')