# Source code for pybel.io.spia

# -*- coding: utf-8 -*-

"""An exporter for signaling pathway impact analysis (SPIA) described by [Tarca2009]_.

.. [Tarca2009] Tarca, A. L., *et al* (2009). `A novel signaling pathway impact analysis
               <https://doi.org/10.1093/bioinformatics/btn577>`_. Bioinformatics, 25(1), 75–82.

.. seealso:: https://bioconductor.org/packages/release/bioc/html/SPIA.html
"""

import itertools as itt
import os
from collections import OrderedDict
from typing import Dict, Mapping, Set

import pandas as pd

from ..constants import (
    ASSOCIATION,
    CAUSAL_DECREASE_RELATIONS,
    CAUSAL_INCREASE_RELATIONS,
    RELATION,
)
from ..dsl import CentralDogma, Gene, ListAbundance, ProteinModification, Rna
from ..language import pmod_mappings
from ..struct import BELGraph
from ..typing import EdgeData

__all__ = [
    "to_spia_dfs",
    "to_spia_excel",
    "to_spia_tsvs",
]

# Type alias: a mapping from a relation name to the adjacency matrix (a pandas
# DataFrame) holding the edges of that relation.
SPIADataFrames = Mapping[str, pd.DataFrame]

# Relation names for which ``build_spia_matrices`` creates one adjacency matrix
# each. The list order is the insertion order of the resulting dictionary and
# therefore the order in which sheets/files are written by the exporters.
# The strings are used verbatim as dictionary keys elsewhere in this module
# (including the spelling "ubiquination"), so they must not be "corrected" here
# without updating every lookup.
# NOTE(review): presumably mirrors the relation names of the SPIA R package - confirm.
KEGG_RELATIONS = [
    "activation",
    "compound",
    "binding/association",
    "expression",
    "inhibition",
    "activation_phosphorylation",
    "phosphorylation",
    "inhibition_phosphorylation",
    "inhibition_dephosphorylation",
    "dissociation",
    "dephosphorylation",
    "activation_dephosphorylation",
    "state change",
    "activation_indirect effect",
    "inhibition_ubiquination",
    "ubiquination",
    "expression_indirect effect",
    "inhibition_indirect effect",
    "repression",
    "dissociation_phosphorylation",
    "indirect effect_phosphorylation",
    "activation_binding/association",
    "indirect effect",
    "activation_compound",
    "activation_ubiquination",
]


def to_spia_excel(graph: BELGraph, path: str) -> None:
    """Write the BEL graph as an SPIA-formatted excel sheet at the given path.

    :param graph: The BEL graph to export
    :param path: Location of the Excel workbook to write
    """
    spia_matrices = to_spia_dfs(graph)
    spia_matrices_to_excel(spia_matrices, path)


def to_spia_tsvs(graph: BELGraph, directory: str) -> None:
    """Write the BEL graph as a set of SPIA-formatted TSV files in a given directory.

    :param graph: The BEL graph to export
    :param directory: Directory that receives one file per relation
    """
    spia_matrices = to_spia_dfs(graph)
    spia_matrices_to_tsvs(spia_matrices, directory)


def to_spia_dfs(graph: BELGraph) -> SPIADataFrames:
    """Build the SPIA adjacency matrices for a BEL graph.

    One matrix is created per relation (see :func:`build_spia_matrices`), indexed by
    the names returned from :func:`get_matrix_index`. Every edge of the graph is then
    expanded into (subject, object) pairs of :class:`CentralDogma` entities and each
    pair is recorded with :func:`update_spia_matrices`.

    :param graph: The BEL graph to convert
    :return: A dictionary from relation name to its adjacency matrix
    """
    index_nodes = get_matrix_index(graph)
    spia_matrices = build_spia_matrices(index_nodes)

    for u, v, edge_data in graph.edges(data=True):
        # A list abundance stands for each of its central dogma members, so an edge
        # between two nodes becomes the cross product of both sides' participants.
        # Edges where either side contributes nothing yield no pairs and are skipped.
        participants = itt.product(_get_spia_participants(u), _get_spia_participants(v))
        for source, target in participants:
            update_spia_matrices(spia_matrices, source, target, edge_data)

    return spia_matrices


def _get_spia_participants(node) -> list:
    """Return the central dogma entities that a node contributes to an edge.

    :param node: A node of the BEL graph
    :return: ``[node]`` for a :class:`CentralDogma`, the :class:`CentralDogma` members
     (in their original order) for a :class:`ListAbundance`, and an empty list otherwise
    """
    if isinstance(node, CentralDogma):
        return [node]
    if isinstance(node, ListAbundance):
        return [member for member in node.members if isinstance(member, CentralDogma)]
    return []


def get_matrix_index(graph: BELGraph) -> Set[str]:
    """Collect the names of all central dogma nodes in the graph that use the HGNC namespace.

    The namespace comparison is case-insensitive.

    :param graph: The BEL graph whose nodes are inspected
    :return: The set of names usable as the row/column index of the SPIA matrices
    """
    # TODO: Using HGNC Symbols for now
    hgnc_names = set()
    for node in graph:
        if not isinstance(node, CentralDogma):
            continue
        if node.namespace.upper() == "HGNC":
            hgnc_names.add(node.name)
    return hgnc_names


def build_spia_matrices(nodes: Set[str]) -> Dict[str, pd.DataFrame]:
    """Create one zero-filled square adjacency matrix per entry of ``KEGG_RELATIONS``.

    :param nodes: A set of HGNC gene symbols
    :return: An ordered dictionary from relation name to a data frame whose rows and
     columns are both the sorted gene symbols
    """
    labels = sorted(nodes)

    # Insertion follows KEGG_RELATIONS so the sheets of the excel keep the given order
    return OrderedDict(
        (relation, pd.DataFrame(0, index=labels, columns=labels))
        for relation in KEGG_RELATIONS
    )


# Accepted modification names: the short code itself plus the name of every entry
# listed under the "xrefs" of that code in ``pmod_mappings``.
UB_NAMES = {"Ub"}.union(xref.name for xref in pmod_mappings["Ub"]["xrefs"])
PH_NAMES = {"Ph"}.union(xref.name for xref in pmod_mappings["Ph"]["xrefs"])


def _set_spia_entry(
    spia_matrices: Dict[str, pd.DataFrame],
    relation_name: str,
    source_name: str,
    target_name: str,
) -> None:
    """Set the cell (row ``target_name``, column ``source_name``) of a relation matrix to 1.

    :raises KeyError: if either name is not part of the matrix
    """
    matrix = spia_matrices[relation_name]
    # ``.at`` would silently enlarge the frame for an unknown label, which would leave
    # the adjacency matrix non-square, so unknown names are rejected explicitly.
    if source_name not in matrix.columns:
        raise KeyError(source_name)
    if target_name not in matrix.index:
        raise KeyError(target_name)
    # A single indexing operation writes into the frame itself; the chained form
    # ``matrix[source_name][target_name] = 1`` may write to a temporary copy instead.
    matrix.at[target_name, source_name] = 1


def update_spia_matrices(
    spia_matrices: Dict[str, pd.DataFrame],
    u: CentralDogma,
    v: CentralDogma,
    edge_data: EdgeData,
) -> None:
    """Record the edge from ``u`` to ``v`` in the matrix that matches its relation.

    Nothing is recorded unless both nodes use the HGNC namespace (case-insensitive).
    The cell that is set to 1 lies in the column of ``u`` and the row of ``v``.

    * Causal increase: for each protein modification of ``v``, ubiquitination goes to
      ``activation_ubiquination`` and phosphorylation to ``activation_phosphorylation``;
      without protein modifications, a gene or RNA object goes to ``expression`` and
      anything else to ``activation``.
    * Causal decrease: the same scheme with ``inhibition_ubiquination``,
      ``inhibition_phosphorylation``, ``repression`` and ``inhibition``.
    * Association: ``binding/association``.
    * Any other relation is ignored.

    If ``v`` carries protein modifications but none of them is a ubiquitination or a
    phosphorylation, nothing is recorded.

    :param spia_matrices: Dictionary from relation name to adjacency matrix, modified in place
    :param u: The subject of the edge
    :param v: The object of the edge
    :param edge_data: The data dictionary of the edge; its relation entry is read
    :raises KeyError: if the name of ``u`` or ``v`` is not in the selected matrix
    """
    if u.namespace.lower() != "hgnc" or v.namespace.lower() != "hgnc":
        return

    u_name = u.name
    v_name = v.name
    relation = edge_data[RELATION]

    if relation in CAUSAL_INCREASE_RELATIONS:
        ubiquitination_key = "activation_ubiquination"
        phosphorylation_key = "activation_phosphorylation"
        transcript_key = "expression"
        default_key = "activation"
    elif relation in CAUSAL_DECREASE_RELATIONS:
        ubiquitination_key = "inhibition_ubiquination"
        phosphorylation_key = "inhibition_phosphorylation"
        transcript_key = "repression"
        default_key = "inhibition"
    elif relation == ASSOCIATION:
        # The key must match the spelling in KEGG_RELATIONS, which uses a slash
        _set_spia_entry(spia_matrices, "binding/association", u_name, v_name)
        return
    else:
        return

    protein_modifications = [
        variant
        for variant in (v.variants or ())
        if isinstance(variant, ProteinModification)
    ]

    if protein_modifications:
        # The object is a modified protein: file the edge under each recognized modification
        for protein_modification in protein_modifications:
            modification_name = protein_modification.entity.name
            if modification_name in UB_NAMES:
                _set_spia_entry(spia_matrices, ubiquitination_key, u_name, v_name)
            elif modification_name in PH_NAMES:
                _set_spia_entry(spia_matrices, phosphorylation_key, u_name, v_name)
    elif isinstance(v, (Gene, Rna)):
        # Acting on a gene or RNA is a change of expression
        _set_spia_entry(spia_matrices, transcript_key, u_name, v_name)
    else:
        _set_spia_entry(spia_matrices, default_key, u_name, v_name)


def spia_matrices_to_excel(spia_matrices: SPIADataFrames, path: str) -> None:
    """Export a SPIA data dictionary into an Excel sheet at the given path.

    Each relation is written to its own sheet, without the row index. Because Excel
    does not allow ``/`` in a sheet name, every ``/`` in a relation name is replaced
    by ``_`` (``binding/association`` becomes ``binding_association``).

    :param spia_matrices: Dictionary from relation name to adjacency matrix
    :param path: Location of the Excel workbook to write

    .. note::

        The R import should add the values:

        - ["nodes"] from the columns
        - ["title"] from the name of the file
        - ["NumberOfReactions"] set to "0"
    """
    # The context manager saves and closes the workbook, also when writing a sheet fails
    with pd.ExcelWriter(path, engine="xlsxwriter") as writer:
        for relation, df in spia_matrices.items():
            sheet_name = relation.replace("/", "_")
            df.to_excel(writer, sheet_name=sheet_name, index=False)


def spia_matrices_to_tsvs(spia_matrices: SPIADataFrames, directory: str) -> None:
    """Export a SPIA data dictionary into a directory as several TSV documents.

    One tab-separated file named ``<relation>.tsv`` is written per relation, including
    the row index. Every ``/`` in a relation name is replaced by ``_`` in the file name
    so that the file is created inside ``directory`` rather than in a subdirectory.

    :param spia_matrices: Dictionary from relation name to adjacency matrix
    :param directory: Directory that receives the files; created if it does not exist
    """
    os.makedirs(directory, exist_ok=True)
    for relation, df in spia_matrices.items():
        file_name = "{relation}.tsv".format(relation=relation.replace("/", "_"))
        # ``to_csv`` separates with commas unless told otherwise
        df.to_csv(os.path.join(directory, file_name), sep="\t", index=True)