# Source code for cia.report

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import itertools 
import scipy
import os


def group_composition(data, classification_obs, ref_obs, columns_order=None, cmap='Reds', save=None):
    """
    Plot a heatmap of the percentages of cells classified with a given method
    (method of interest) inside the cell groups defined by a different one
    (reference method).

    Parameters
    ----------
    data : anndata.AnnData
        An AnnData object containing the cell classification data. Only
        ``data.obs`` is read.
    classification_obs : str
        The AnnData.obs column where the labels assigned by the method of
        interest are stored.
    ref_obs : str
        The AnnData.obs column where the labels assigned by the reference
        method are stored.
    columns_order : sequence of str, optional
        Order of the columns in the heatmap. Any sequence is accepted (list,
        tuple, numpy array, pandas Index). ``None`` or an empty sequence keeps
        the order produced by the cross-tabulation.
    cmap : str or matplotlib.colors.Colormap, optional
        The colormap for the heatmap. Defaults to 'Reds'.
    save : str, optional
        A filename to save the heatmap. If provided, the heatmap is saved in
        the 'figures' directory with 'CIA_' prefix.

    Returns
    -------
    matplotlib.axes.Axes or None
        The heatmap Axes if `save` is not provided. Otherwise the plot is
        saved to a file, the figure is closed and None is returned.

    Examples
    --------
    >>> group_composition(adata, 'method_labels', 'reference_labels')
    """
    # Rows are reference groups, columns are labels of the method of interest.
    counts = pd.crosstab(data.obs[ref_obs], data.obs[classification_obs])

    # Row-normalise so that every reference group sums to 100 %.
    df = (counts.div(counts.sum(axis=1), axis=0) * 100).round(2)

    # An explicit None/length check is used because the truth value of a
    # numpy array or pandas Index is ambiguous and would raise.
    if columns_order is not None and len(columns_order) > 0:
        df = df.reindex(columns=list(columns_order))

    heatmap = sns.heatmap(df, cmap=cmap, annot=True)

    if save:
        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs('figures', exist_ok=True)
        fig = heatmap.get_figure()
        fig.savefig(f"figures/CIA_{save}")
        plt.close(fig)  # Close the figure to prevent display in notebook environments
        return None

    return heatmap
def grouped_distributions(data, columns_obs, ref_obs, cmap='Reds', scale_medians=None, save=None):
    """
    Plot a heatmap of median values for selected columns in AnnData.obs across
    cell groups and run statistical tests on the underlying distributions.

    Two checks are performed and their outcome is printed:

    * Wilcoxon signed-rank test (two-sided): within each cell group, the
      column with the highest median is compared with every other column.
    * Mann-Whitney U test (two-sided): for each column, the cell group with
      the highest median is compared with every other cell group.

    A warning is printed for every comparison whose p-value is >= 0.01.

    Parameters
    ----------
    data : anndata.AnnData
        An AnnData object containing the cell data. Only ``data.obs`` is read.
    columns_obs : list of str
        Column names in AnnData.obs where the values of interest are stored.
    ref_obs : str
        Column name in AnnData.obs where the cell group labels are stored.
        The column must be categorical.
    cmap : str or matplotlib.colors.Colormap, optional
        Colormap for the heatmap. Defaults to 'Reds'.
    scale_medians : str, optional
        How to scale the median values in the heatmap. Options: 'row-wise',
        'column-wise', or None. Any other value leaves the medians unscaled.
    save : str, optional
        Filename to save the heatmap. If provided, saves the heatmap in
        'figures' directory with 'CIA_' prefix.

    Returns
    -------
    None or matplotlib.axes.Axes
        If `save` is provided, the heatmap is saved and None is returned.
        Otherwise, returns the heatmap Axes.
    """
    # Import the submodule explicitly: a bare `import scipy` is not guaranteed
    # to expose `scipy.stats` on every SciPy version.
    from scipy import stats

    obs = data.obs
    groups = obs[ref_obs].cat.categories

    grouped_df = obs.groupby(ref_obs, observed=False)[columns_obs].median()
    if scale_medians == 'row-wise':
        grouped_df = grouped_df.div(grouped_df.sum(axis=1), axis=0)
    elif scale_medians == 'column-wise':
        grouped_df = grouped_df / grouped_df.sum(axis=0).to_numpy()

    print('Performing Wilcoxon test on each cell group ...')
    subsets = {}
    count = 0
    for group in groups:
        subset = obs.loc[obs[ref_obs] == group, columns_obs]
        subsets[group] = subset

        # Only comparisons led by the highest-median column are ever reported,
        # so only those are tested.
        medians = subset.median()
        top_column = medians.index[medians.values.argmax()]
        top_values = subset[top_column]
        if top_values.to_numpy().sum() == 0:
            continue

        for other in columns_obs:
            if other == top_column:
                continue
            other_values = subset[other]
            if other_values.to_numpy().sum() == 0:
                continue
            p_value = stats.wilcoxon(top_values, other_values, alternative='two-sided')[1]
            if p_value >= 0.01:
                count += 1
                # f-string so that non-string group labels do not raise.
                print(f'WARNING in cell group {group}: {top_column} values are not '
                      f'significantly different from {other} values.')
    if count == 0:
        print('For each cell group there is a distribution significantly higher than the others (p<0.01)')

    print('')
    print('Performing Mann-Whitney U test on each selected AnnData.obs column ...')
    count = 0
    for column in columns_obs:
        group_medians = pd.Series([subsets[g][column].median() for g in groups], index=groups)
        top_group = group_medians.index[group_medians.argmax()]
        top_values = subsets[top_group][column]

        for other in groups:
            if other == top_group:
                continue
            p_value = stats.mannwhitneyu(top_values, subsets[other][column], alternative='two-sided')[1]
            if p_value >= 0.01:
                count += 1
                print(f'WARNING in {column} distribution: values in {top_group} group are not '
                      f'significantly different from values in {other} group')
                print('(p= ' + str(p_value) + ')')
    if count == 0:
        print('For each distribution, there is only a cell group in which values are higher '
              'with respect to all the other groups (p<0.01)')

    heatmap = sns.heatmap(grouped_df, cmap=cmap, annot=True)
    if save is not None:
        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs('figures', exist_ok=True)
        heatmap.get_figure().savefig("figures/CIA_" + save)
        return None
    return heatmap
def compute_classification_metrics(data, classification_obs, ref_obs, unassigned_label=''):
    """
    Compute the main classification metrics by comparing the labels assigned
    by one or more methods of interest with the labels of a reference method.

    Cells labelled as `unassigned_label` by a method of interest are excluded
    from that method's metrics. The percentage of unassigned cells is reported
    per method when at least one method has unassigned cells.

    True/false positives and negatives are counted one-vs-rest for every
    reference label present among the retained cells and then summed over the
    labels before the ratios are taken.

    Parameters
    ----------
    data : anndata.AnnData
        An AnnData object containing the cell data. Only ``data.obs`` is read.
    classification_obs : str or list of str
        The AnnData.obs column(s) where the labels assigned by the methods of
        interest are stored. A single column name is treated as a one-element
        list.
    ref_obs : str
        The AnnData.obs column where the labels assigned by the reference
        method are stored.
    unassigned_label : str, optional
        The label used to mark unassigned cells in the classification columns.
        Cells with this label are excluded from the metrics calculation.
        Defaults to an empty string.

    Returns
    -------
    report : pandas.DataFrame
        One row per classification method with the overall sensitivity (SE),
        specificity (SP), precision (PR), accuracy (ACC), F1-score (F1) and,
        if any cell is unassigned, the percentage of unassigned cells (%UN).
        A ratio whose denominator is zero (e.g. every cell is unassigned) is
        reported as NaN.

    Example
    -------
    >>> import scanpy as sc
    >>> adata = sc.read_h5ad('your_data_file.h5ad')  # Load your AnnData file
    >>> compute_classification_metrics(adata, ['method1', 'method2'], 'reference',
    ...                                unassigned_label='Unassigned')
    """
    def _ratio(numerator, denominator):
        # An undefined ratio is reported as NaN instead of raising.
        return numerator / denominator if denominator else float('nan')

    if isinstance(classification_obs, str):
        # Wrap, do not split: list('abc') would yield ['a', 'b', 'c'].
        classification_obs = [classification_obs]

    obs = data.obs
    total_cells = len(obs)
    columns = ['SE', 'SP', 'PR', 'ACC', 'F1', '%UN']

    report = {}
    for method in classification_obs:
        unassigned_count = int((obs[method] == unassigned_label).sum())
        pct_unassigned = round((unassigned_count / total_cells) * 100, 2) if total_cells else 0.0

        assigned = obs[method] != unassigned_label
        predicted = obs.loc[assigned, method]
        # Only reference labels still present after filtering contribute;
        # an empty label would add every retained cell to the TN count.
        reference = obs.loc[assigned, ref_obs].astype('category').cat.remove_unused_categories()

        tp = tn = fp = fn = 0
        for label in reference.cat.categories:
            pred_eq = predicted == label
            pred_ne = predicted != label
            ref_eq = reference == label
            ref_ne = reference != label
            tp += int((pred_eq & ref_eq).sum())
            tn += int((pred_ne & ref_ne).sum())
            fp += int((pred_eq & ref_ne).sum())
            fn += int((pred_ne & ref_eq).sum())

        report[method] = [
            _ratio(tp, tp + fn),                      # SE
            _ratio(tn, tn + fp),                      # SP
            _ratio(tp, tp + fp),                      # PR
            _ratio(tn + tp, tn + tp + fn + fp),       # ACC
            _ratio(2 * tp, 2 * tp + fn + fp),         # F1
            pct_unassigned,                           # %UN
        ]

    report = pd.DataFrame(list(report.values()), index=list(report.keys()), columns=columns)
    # The %UN column is only informative when some cell is unassigned.
    if report['%UN'].sum() == 0:
        report = report.drop(columns='%UN')
    return report
def grouped_classification_metrics(data, classification_obs, ref_obs, unassigned_label=''):
    """
    Compute per-group classification metrics for one method of interest.

    Every category of the reference column is treated as the positive class
    in turn (one-vs-rest) and the labels of the method of interest are scored
    against it. The percentage of cells of each group carrying
    `unassigned_label` is computed as well.

    Parameters
    ----------
    data : anndata.AnnData
        An AnnData object containing the cell data. Only ``data.obs`` is read.
    classification_obs : str
        The AnnData.obs column where the labels assigned by the method of
        interest are stored.
    ref_obs : str
        The AnnData.obs column where the labels assigned by the reference
        method are stored. The column must be categorical.
    unassigned_label : str, optional
        The label marking unassigned cells in `classification_obs`. Defaults
        to an empty string.

    Returns
    -------
    report : pandas.DataFrame
        One row per reference group with sensitivity (SE), specificity (SP),
        precision (PR), accuracy (ACC), F1-score (F1) and, when it is not zero
        for every group, the percentage of unassigned cells (%UN). A ratio
        with a zero denominator is reported as 0.

    Example
    -------
    >>> import scanpy as sc
    >>> adata = sc.read_h5ad('your_data_file.h5ad')  # Load your AnnData file
    >>> metrics_report = grouped_classification_metrics(adata, 'predicted_labels', 'actual_labels')
    """
    predicted = data.obs[classification_obs]
    reference = data.obs[ref_obs]
    groups = reference.cat.categories
    unlabelled = predicted == unassigned_label

    def _ratio(numerator, denominator):
        # Zero denominators yield 0 rather than a division error.
        return numerator / denominator if denominator else 0

    def _group_row(label):
        inside = reference == label
        outside = ~inside
        hit = predicted == label
        miss = predicted != label

        tp = np.sum(hit & inside)
        tn = np.sum(miss & outside)
        fp = np.sum(hit & outside)
        fn = np.sum(miss & inside)

        size = np.sum(inside)
        unassigned = np.sum(unlabelled & inside)

        return [
            _ratio(tp, tp + fn),                    # SE
            _ratio(tn, tn + fp),                    # SP
            _ratio(tp, tp + fp),                    # PR
            _ratio(tp + tn, tp + tn + fp + fn),     # ACC
            _ratio(2 * tp, 2 * tp + fp + fn),       # F1
            _ratio(unassigned, size) * 100,         # %UN
        ]

    report_df = pd.DataFrame(
        [_group_row(label) for label in groups],
        columns=['SE', 'SP', 'PR', 'ACC', 'F1', '%UN'],
        index=groups,
    )

    # Keep %UN only when at least one group has unassigned cells.
    if report_df['%UN'].sum() == 0:
        report_df = report_df.drop(columns='%UN')
    return report_df
def plot_group_composition(df, ref_col, comp_col, plot_type='percentage', palette='Set3', show_legend=True):
    """
    Plot the composition of each reference group as a horizontal stacked bar
    plot, either as raw counts or as percentages.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame containing the data to be plotted.
    ref_col : str
        The name of the column representing the reference grouping variable.
    comp_col : str
        The name of the column representing the grouping to be compared.
    plot_type : str
        'percentage' plots row percentages; any other value plots the raw
        counts. Defaults to 'percentage'.
    palette : str or list
        The color palette to use. Defaults to 'Set3'.
    show_legend : bool
        Whether to display the legend on the plot. Defaults to True.

    Returns
    -------
    matplotlib.axes.Axes
        The Axes the bars were drawn on.

    Raises
    ------
    ValueError
        If `ref_col` or `comp_col` is not a column of `df`.
    """
    # Check if specified columns exist in the DataFrame
    if not {ref_col, comp_col}.issubset(df.columns):
        raise ValueError("Specified columns are not in the DataFrame")

    # Contingency table of counts: one row per reference group, one column
    # per compared group.
    contingency_table = pd.crosstab(df[ref_col], df[comp_col], dropna=False)
    all_groups = list(contingency_table.columns)

    if plot_type == 'percentage':
        contingency_table = contingency_table.div(contingency_table.sum(axis=1), axis=0) * 100

    colors = sns.color_palette(palette, n_colors=len(all_groups))

    fig, ax = plt.subplots(figsize=(10, 6))

    # Running left edge of the stacked bars. It is a separate array that is
    # rebound on every step: accumulating in place into the array returned by
    # `.values` would write into the table's own buffer, which is read-only
    # under pandas Copy-on-Write.
    left = np.zeros(len(contingency_table.index), dtype=float)
    for color, group in zip(colors, all_groups):
        values = contingency_table[group].to_numpy(dtype=float)
        ax.barh(contingency_table.index, values, left=left, label=group, color=color)
        left = left + values

    ax.set_xlabel('Percentage' if plot_type == 'percentage' else 'Count')
    ax.set_ylabel(ref_col)
    ax.set_title('Group Composition by ' + ref_col)

    if show_legend:
        ax.legend(title=comp_col, bbox_to_anchor=(1.05, 1), loc='upper left')

    plt.show()
    # Returned so that callers can further customise or save the plot, as the
    # documented contract states.
    return ax