Source code for pybel_tools.analysis.rcr

# -*- coding: utf-8 -*-

"""An implementation of Reverse Causal Reasoning (RCR) described by [Catlett2013]_.

.. [Catlett2013] Catlett, N. L., *et al* (2013). `Reverse causal reasoning: applying qualitative causal knowledge to
                 the interpretation of high-throughput data <https://doi.org/10.1186/1471-2105-14-340>`_.
                 BMC Bioinformatics, 14(1), 340.
"""

import pandas
import scipy.stats
from collections import defaultdict
from scipy.special import binom

from pybel.constants import CAUSAL_INCREASE_RELATIONS, CAUSAL_DECREASE_RELATIONS, RELATION

__all__ = [
    'run_rcr',
]


def point_probability(k, n, l, p=0.5):
    return binom(n - l, k) * p ** k * (1 - p) ** (n - k - l)


def concordance(k, n, m, l, p=0.5):
    return sum(point_probability(j, n, l, p) for j in range(k, min(n - 1, m)))


[docs]def run_rcr(graph, tag='dgxp'):
    """Run the reverse causal reasoning algorithm on a graph.

    Steps:

    1. Get all downstream controlled things into map (that have at least 4 downstream things)
    2. calculate population of all things that are downstream controlled

    .. note:: Assumes all nodes have been pre-tagged with data

    :param pybel.BELGraph graph:
    :param str tag: The key for the nodes' data dictionaries that corresponds to the integer value for its differential
                    expression.
    """

    # Step 1: Calculate the hypothesis subnetworks (just simple star graphs)
    hypotheses = defaultdict(set)
    increases = defaultdict(set)
    decreases = defaultdict(set)

    for u, v, d in graph.edges(data=True):
        hypotheses[u].add(v)

        if d[RELATION] in CAUSAL_INCREASE_RELATIONS:
            increases[u].add(v)

        elif d[RELATION] in CAUSAL_DECREASE_RELATIONS:
            decreases[u].add(v)

    # Step 2: Calculate the matching of the data points to the causal relationships

    #: A dictionary from {tuple controller node: int count of correctly matching observations}
    correct = defaultdict(int)
    #: A dictionary from {tuple controller node: int count of incorrectly matching observations}
    contra = defaultdict(int)
    #: A dictionary from {tuple controller node: int count of ambiguous observations}
    ambiguous = defaultdict(int)
    #: A dictionary from {tuple controller node: int count of missing obvservations}
    missing = defaultdict(int)

    for controller, downstream_nodes in hypotheses.items():
        if len(downstream_nodes) < 4:
            continue  # need enough data to make reasonable calculations!

        for node in downstream_nodes:

            if node in increases[controller] and node in decreases[controller]:
                ambiguous[controller] += 1

            elif node in increases[controller]:
                if graph.node[node][tag] == 1:
                    correct[controller] += 1
                elif graph.node[node][tag] == -1:
                    contra[controller] += 1

            elif node in decreases[controller]:
                if graph.node[node][tag] == 1:
                    contra[controller] += 1
                elif graph.node[node][tag] == -1:
                    correct[controller] += 1

            else:
                missing[controller] += 1

    # Step 3: Keep only controller nodes who have 4 or more downstream nodes
    controllers = {
        controller
        for controller, downstream_nodes in hypotheses.items()
        if 4 <= len(downstream_nodes)
    }

    # Step 4: Calculate concordance scores
    concordance_scores = {
        controller: scipy.stats.beta(0.5, correct[controller], contra[controller])
        for controller in controllers
    }

    # Step 5: Calculate richness scores
    # TODO

    # Calculate the population as the union of all downstream nodes for all controllers
    population = {
        node
        for controller in controllers
        for node in hypotheses[controller]
    }
    population_size = len(population)

    # Step 6: Export

    return pandas.DataFrame({
        'contra': contra,
        'correct': correct,
        'concordance': concordance_scores
    })