Source code for cia.utils

import numpy as np
import pandas as pd
import itertools 
import scipy
# added for read_gmt_from_url
import requests # to be installed with pip or defined in the requirements
import io

def _fetch_url_gmt(url: str, timeout: float = 30.0) -> dict:
    """Fetches a GMT file from a URL and parses it into a dictionary.

    Each line of a GMT file is tab-separated: the first field is the gene set
    name, the second a description (ignored here) and the remaining fields
    the genes of the set. Lines with fewer than three fields are skipped.
    If a gene set name occurs more than once, the last occurrence wins.

    Parameters
    ----------
    url : str
        The URL of the GMT file.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests`` may wait indefinitely on a stalled connection.
        Default is 30.

    Returns
    -------
    dict
        A dictionary where keys are gene set names and values are lists of associated genes.

    Raises
    ------
    ValueError
        If the server answers with a status code other than 200.
    requests.exceptions.RequestException
        If the request itself fails (e.g. connection error or timeout).
    """
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        raise ValueError(f"Failed to fetch GMT file. HTTP Status Code: {response.status_code}")

    gene_sets = {}
    file_content = io.StringIO(response.text)

    for line in file_content:
        data = line.strip().split("\t")
        if len(data) < 3:
            continue  # Skip lines that don't have enough data
        gene_set_name = data[0]  # First column = set name
        genes = data[2:]  # Skip the second column (description) and take the genes
        gene_sets[gene_set_name] = genes

    return gene_sets


def signatures_similarity(signatures_dict, show='J'):
    """
    Computes the similarity between gene signatures.

    Parameters
    ----------
    signatures_dict : dict
        A dictionary having as keys the signature names and as values the
        lists of gene names (gene signatures). Duplicated genes within a
        signature are counted once.
    show : str, optional
        Specifies the metric for showing similarities: 'J' for Jaccard index
        or '%' for percentages of intersection. Default is 'J'.

    Returns
    -------
    similarity : pandas.DataFrame
        A DataFrame containing the similarity of each pair of signatures,
        with signatures as both rows and columns. With 'J' the matrix is
        symmetric. With '%' the value at (row, column) is the percentage of
        the *row* signature's genes that are also in the *column* signature,
        rounded to two decimals, so the matrix is generally not symmetric.
        Comparisons whose denominator is empty are reported as 0.

    Raises
    ------
    ValueError
        If 'show' is different from 'J' or '%'.

    Example
    -------
    >>> signatures = {
    ...     'signature1': ['gene1', 'gene2', 'gene3'],
    ...     'signature2': ['gene2', 'gene3', 'gene4'],
    ...     'signature3': ['gene1', 'gene5']
    ... }
    >>> similarity = signatures_similarity(signatures, show='J')
    >>> print(similarity)
    """
    if show not in ['J', '%']:
        raise ValueError('show must be "J" or "%".')

    signature_names = list(signatures_dict.keys())
    # Build every gene set once instead of re-deriving it for each pair.
    gene_sets = [set(signatures_dict[name]) for name in signature_names]
    n = len(signature_names)
    similarity_matrix = np.zeros((n, n))

    for i, genes_i in enumerate(gene_sets):
        for j, genes_j in enumerate(gene_sets):
            shared = len(genes_i & genes_j)
            if show == 'J':
                denominator = len(genes_i | genes_j)
                value = shared / denominator if denominator else 0.0
            else:
                denominator = len(genes_i)
                value = round(100 * shared / denominator, 2) if denominator else 0.0
            # Only (i, j) is written: the percentage metric is directional,
            # so mirroring the value into (j, i) would corrupt the matrix.
            similarity_matrix[i, j] = value

    return pd.DataFrame(similarity_matrix, index=signature_names, columns=signature_names)
def filter_degs(data, groupby, uns_key='rank_genes_groups', direction='up',
                logFC=0, scores=None, perc=0, mean=0):
    """
    Filters differentially expressed genes (DEGs) obtained with
    scanpy.tl.rank_genes_groups based on given thresholds.

    For every category of ``data.obs[groupby]`` the genes stored under
    ``data.uns[uns_key]`` are ordered by decreasing log fold change and kept
    only if they satisfy all thresholds. With ``direction='up'`` each
    threshold is a lower bound (value >= threshold); with
    ``direction='down'`` each threshold is an upper bound
    (value <= threshold). Expression statistics are computed on ``data.raw``
    restricted to the cells of the group.

    Parameters
    ----------
    data : anndata.AnnData
        An AnnData object containing the analysis results.
    groupby : str
        Column in AnnData.obs containing cell group labels.
    uns_key : str
        Key in AnnData.uns where differential expression analysis results are stored.
    direction : str
        Specifies if filtering for upregulated ('up') or downregulated ('down') genes.
    logFC : float
        Log fold change threshold to filter genes.
    scores : float, optional
        Z score threshold to filter genes. Not applied when None.
    perc : float
        Threshold on the percentage of the group's cells expressing the gene
        (raw value > 0).
    mean : float
        Threshold on the gene's mean expression within the group.

    Returns
    -------
    signatures_dict : dict
        Dictionary with cell group names as keys and lists of filtered gene names as values.

    Raises
    ------
    ValueError
        If 'direction' is not 'up' or 'down'.

    Example
    -------
    >>> import scanpy as sc
    >>> adata = sc.datasets.pbmc68k_reduced()
    >>> sc.tl.rank_genes_groups(adata, 'louvain', method='t-test')
    >>> filtered_genes = filter_degs(adata, 'louvain', direction='up', logFC=1, perc=10, mean=0.1)
    >>> print(filtered_genes['0'])  # Show filtered genes for the first group
    """
    # Imported explicitly: a bare ``import scipy`` does not guarantee that the
    # ``scipy.sparse`` submodule is loaded on every SciPy version.
    from scipy.sparse import issparse

    # Validate before touching the data so a bad argument fails immediately,
    # even when there are no groups to iterate over.
    if direction == 'up':
        passes = np.greater_equal
    elif direction == 'down':
        passes = np.less_equal
    else:
        raise ValueError('direction must be "up" or "down".')

    results = data.uns[uns_key]
    signatures_dict = {}
    for group in data.obs[groupby].cat.categories:
        in_group = data.obs[groupby].isin([group])
        n_cells = in_group.sum()

        # Positions of the group's genes ordered by decreasing log fold change.
        order = pd.DataFrame(results['logfoldchanges'][group]).sort_values(by=0, ascending=False).index
        degs = results['names'][group][order]

        # Raw expression of the ranked genes in the cells of this group;
        # subset once and reuse for both statistics.
        expression = data.raw[in_group][:, degs.tolist()].X
        if issparse(expression):
            expressed = expression.todense() > 0
        else:
            expressed = expression > 0
        pct_expressing = np.array(expressed).sum(axis=0) / n_cells * 100
        mean_expression = np.ravel(expression.mean(0))

        filters = [
            passes(pct_expressing, perc),
            passes(mean_expression, mean),
            passes(results['logfoldchanges'][group][order], logFC),
        ]
        if scores is not None:
            filters.append(passes(results['scores'][group][order], scores))

        # A gene is kept only if it satisfies every active threshold.
        keep = np.bitwise_and.reduce(filters)
        signatures_dict[group] = degs[keep].tolist()

    return signatures_dict
def save_gmt(signatures_dict, file):
    """
    Write a dictionary of signatures to a GMT file formatted for the
    signature_score and signature_based_classification functions.

    One tab-separated line is written per signature: the signature name,
    the name again (in the description column), then its genes.

    Parameters
    ----------
    signatures_dict : dict
        A dictionary having as keys the signature names and as values the
        gene signatures (lists of gene names).
    file : str
        Path of the GMT file to write. It is opened in text write mode, so
        an existing file is overwritten.
    """
    def _gmt_rows():
        # The genes are joined separately so that an empty signature still
        # yields the trailing tab after the description column.
        for name, genes in signatures_dict.items():
            yield '\t'.join((name, name, '\t'.join(genes))) + '\n'

    with open(file, 'w') as handle:
        handle.writelines(_gmt_rows())