Source code for cellmaps_hierarchyeval.perturb

import logging
import math
import pandas as pd
from scipy import stats
import cellmaps_utils.music_utils as music_utils
from ndex2 import constants
from scipy.stats import ranksums

from cellmaps_hierarchyeval.exceptions import CellmapshierarchyevalError

logger = logging.getLogger(__name__)



[docs]
class PerturbSeqAnalysis(object):
    """
    Contains utilities to compare Perturbation data
    against hierarchy passed in via constructor
    """

    def __init__(self, hierarchy, hierarchy_parent=None):
        """
        Constructor

        :param hierarchy:
        :type hierarchy: :py:class:`~ndex2.cx2.CX2Network`
        :param hierarchy_parent:
        :type hierarchy_parent: :py:class:`~ndex2.cx2.CX2Network`
        """
        self._hierarchy = hierarchy
        self._hierarchy_parent = hierarchy_parent


[docs]
    def get_heatmap_for_given_hierarchy_system(self, hier_system_node_id,
                                               perturbseq_df, num_perturb_seq=25):
        """
        Builds a heat map table for one system of the hierarchy.

        The genes of the system **hier_system_node_id** are read from its
        ``CD_MemberList`` attribute (space delimited) and only those that
        appear in the index of **perturbseq_df** are kept as rows. The columns
        kept are the **num_perturb_seq** columns of **perturbseq_df** with the
        highest variance; note the variance is computed over every row of
        **perturbseq_df**, not only the rows of the system. Finally
        :py:func:`scipy.stats.zscore` is applied to each row of the
        resulting table.

        :param hier_system_node_id: node id of system to analyze
        :type hier_system_node_id: int
        :param perturbseq_df: Perturb-seq data indexed by gene
        :type perturbseq_df: :py:class:`pandas.DataFrame`
        :param num_perturb_seq: Number of most variable columns to keep
        :type num_perturb_seq: int
        :return: heat map table
        :rtype: :py:class:`pandas.DataFrame`
        """
        system_node = self._hierarchy.get_node(hier_system_node_id)
        member_list = system_node[constants.ASPECT_VALUES]['CD_MemberList']

        # keep, in member list order, the system genes that have a row
        # in the Perturb-seq table
        perturb_row_labels = perturbseq_df.index.values
        rows_to_keep = []
        for gene in member_list.split(' '):
            if gene in perturb_row_labels:
                rows_to_keep.append(gene)

        # rank columns from highest to lowest variance and keep the top ones
        ranked_variance = perturbseq_df.var().sort_values(ascending=False)
        cols_to_keep = ranked_variance.head(num_perturb_seq).index.values

        heatmap = perturbseq_df.loc[rows_to_keep, cols_to_keep]

        # axis=1 hands each row to zscore
        return heatmap.apply(stats.zscore, axis=1)



[docs]
    def get_root_gene_pair_similarities(self):
        """
        Builds a gene by gene table for the genes listed in the root node of
        the hierarchy. Every cell starts at 1. For each node that is the
        target of an edge whose source is the root node, the cells for all
        pairs of genes in that node's ``CD_MemberList`` are set to 0.

        The root node is the node whose ``HCX::isRoot`` attribute is truthy;
        if more than one node qualifies, the last one visited is used.

        :raises CellmapshierarchyevalError: If no node is flagged as root
        :return: A DataFrame with root genes as both rows and columns, holding
                 0 for pairs sharing a community directly under the root
                 and 1 otherwise.
        :rtype: :py:class:`pandas.DataFrame`
        """
        root_id = None
        root_genes = []
        for node_id, node_obj in self._hierarchy.get_nodes().items():
            if not node_obj[constants.ASPECT_VALUES]['HCX::isRoot']:
                continue
            root_id = node_id
            root_genes = node_obj[constants.ASPECT_VALUES]['CD_MemberList'].split(' ')

        if root_id is None:
            raise CellmapshierarchyevalError('No root node detected!')

        # targets of every edge that leaves the root node
        child_ids = [edge[constants.EDGE_TARGET]
                     for edge in self._hierarchy.get_edges().values()
                     if edge[constants.EDGE_SOURCE] == root_id]

        # start with 1 everywhere, then zero out pairs within each child
        similarity = pd.DataFrame(1, index=root_genes, columns=root_genes)
        for child_id in child_ids:
            child_node = self._hierarchy.get_node(child_id)
            child_genes = child_node[constants.ASPECT_VALUES]['CD_MemberList'].split(' ')
            similarity.loc[child_genes, child_genes] = 0
        return similarity



[docs]
    @staticmethod
    def get_root_overlapping_pair_similarities(root_pairs, perturbseq_df):
        """
        Get similarity scores from **perturbseq_df** that match genes attached to the root
        node of the hierarchy

        The overlapping genes are the labels found in the index of both
        **root_pairs** and **perturbseq_df**. They are kept in the order they
        first appear in **root_pairs** so the row/column order of the returned
        tables is the same on every run.

        :param root_pairs: A DataFrame representing similarity scores between all genes in the root node,
                            where genes within the same community connected to the root have a score of 0,
                            indicating direct relation, and all other pairs have a score of 1,
                            indicating no direct relation.
        :type root_pairs: :py:class:`pandas.DataFrame`
        :param perturbseq_df: Perturb-seq data indexed by gene
        :type perturbseq_df: :py:class:`pandas.DataFrame`
        :return: A tuple containing:
             - The result of :py:func:`cellmaps_utils.music_utils.cosine_similarity_scaled`
                on the rows of **perturbseq_df** for the overlapping genes.
             - **root_pairs** filtered to rows and columns of the overlapping genes.
        :rtype: tuple
        """
        perturb_genes = set(perturbseq_df.index.values)

        # Iterating a set of strings yields an order that can change from one
        # interpreter run to the next, so walk root_pairs' index instead.
        # dict.fromkeys drops repeated labels while preserving first-seen order.
        overlap_genes = list(dict.fromkeys(gene for gene in root_pairs.index.values
                                           if gene in perturb_genes))

        overlap_functional_data = perturbseq_df.loc[overlap_genes]
        functional_data_similarity = music_utils.cosine_similarity_scaled(overlap_functional_data)
        overlap_root_pairs = root_pairs.loc[overlap_genes, overlap_genes]

        return functional_data_similarity, overlap_root_pairs



[docs]
    @staticmethod
    def get_root_functional_data_similarity(functional_data_similarity, overlap_root_pairs):
        """
        Collects functional similarity scores for gene pairs that are not in the same community.

        Cells of **functional_data_similarity** whose matching cell in **overlap_root_pairs** is not
        greater than 0 are masked out, the remaining matrix is passed to
        :py:func:`cellmaps_utils.music_utils.upper_tri_values` and every NaN in the result is discarded.

        :param functional_data_similarity: A DataFrame of scaled cosine similarity scores for overlapping genes in
                                            communities direct to root and Perturb-seq data.
        :type functional_data_similarity: :py:class:`pandas.DataFrame`
        :param overlap_root_pairs: A DataFrame of root-associated similarity scores, filtered to only include
                                    overlapping genes. A score of 0 indicates a direct relation (same community)
                                    and scores greater than 0 indicate no direct relation
        :type overlap_root_pairs: :py:class:`pandas.DataFrame`
        :return: A list of non-NaN similarity scores for gene pairs that are not directly related.
        :rtype: list
        """
        not_same_community = overlap_root_pairs > 0

        # indexing with a boolean DataFrame leaves NaN wherever the mask is False
        masked_similarity = functional_data_similarity[not_same_community]

        scores = []
        for value in music_utils.upper_tri_values(masked_similarity):
            if math.isnan(value):
                continue
            scores.append(value)
        return scores



[docs]
    def get_cluster_similarity(self, functional_data_similarity, hier_system_node_id):
        """
        Gets similarity scores among the genes of one system (cluster) of the hierarchy.

        Genes are read from the ``CD_MemberList`` attribute (space delimited) of node
        **hier_system_node_id**. Those present in the index of **functional_data_similarity**
        are used to select a square sub-matrix, which is handed to
        :py:func:`cellmaps_utils.music_utils.upper_tri_values`.

        :param functional_data_similarity: A DataFrame of scaled cosine similarity scores for overlapping genes in
                                            communities direct to root and Perturb-seq data.
        :type functional_data_similarity: :py:class:`pandas.DataFrame`
        :param hier_system_node_id: The identifier for a specific node within a hierarchy.
        :type hier_system_node_id: int
        :return: An array of similarity scores from the upper triangle portion of the matrix for the specified cluster.
        :rtype: :py:func:`numpy.array`
        """
        system_node = self._hierarchy.get_node(hier_system_node_id)
        member_list = system_node[constants.ASPECT_VALUES]['CD_MemberList']

        # keep, in member list order, genes that have a row in the similarity matrix
        known_genes = functional_data_similarity.index.values
        shared_genes = []
        for gene in member_list.split(' '):
            if gene in known_genes:
                shared_genes.append(gene)

        # same labels on both axes gives the cluster's square sub-matrix
        cluster_matrix = functional_data_similarity.loc[shared_genes, shared_genes]
        return music_utils.upper_tri_values(cluster_matrix)



[docs]
    @staticmethod
    def compare_cluster_root_similarities(cluster_functional_data_similarity, root_functional_data_similarity):
        """
        Performs a rank-sum test to compare the distribution of functional data similarity scores
        between a specific cluster and gene pairs in root. This test helps determine if the similarity scores
        in the cluster are statistically significantly greater than those in the root.

        :param cluster_functional_data_similarity: An array of similarity scores within a specific cluster.
        :type cluster_functional_data_similarity: numpy.array
        :param root_functional_data_similarity: A list of non-NaN similarity scores for gene pairs not directly related
                                                in the root.
        :type root_functional_data_similarity: list
        :return: A tuple containing the test statistic and the p-value of the rank-sum test.
        :rtype: (float, float)
        """
        statistic, p_value = ranksums(cluster_functional_data_similarity, root_functional_data_similarity,
                                      alternative='greater')

        return statistic, p_value