Source code for pybel_tools.analysis.spia

# -*- coding: utf-8 -*-

"""An exporter for signaling pathway impact analysis (SPIA) described by [Tarca2009]_.

.. [Tarca2009] Tarca, A. L., *et al* (2009). `A novel signaling pathway impact analysis
               <https://doi.org/10.1093/bioinformatics/btn577>`_. Bioinformatics, 25(1), 75–82.

To run this module on an arbitrary BEL graph, use the command ``python -m pybel_tools.analysis.spia``.

.. seealso:: https://bioconductor.org/packages/release/bioc/html/SPIA.html
"""

import os
import sys
from collections import OrderedDict
from itertools import product
from typing import Dict, Mapping, Set

import click
import pandas as pd
from pybel import BELGraph
from pybel.cli import graph_pickle_argument
from pybel.constants import (
    ASSOCIATION, CAUSAL_DECREASE_RELATIONS, CAUSAL_INCREASE_RELATIONS, IDENTIFIER, NAME, RELATION,
)
from pybel.dsl import CentralDogma, Gene, ListAbundance, ProteinModification, Rna
from pybel.typing import EdgeData

__all__ = [
    'bel_to_spia_matrices',
    'spia_matrices_to_excel',
    'spia_matrices_to_tsvs',
]

KEGG_RELATIONS = [
    "activation",
    "compound",
    "binding_association",
    "expression",
    "inhibition",
    "activation_phosphorylation",
    "phosphorylation",
    "inhibition_phosphorylation",
    "inhibition_dephosphorylation",
    "dissociation",
    "dephosphorylation",
    "activation_dephosphorylation",
    "state change",
    "activation_indirect effect",
    "inhibition_ubiquination",
    "ubiquination",
    "expression_indirect effect",
    "inhibition_indirect effect",
    "repression",
    "dissociation_phosphorylation",
    "indirect effect_phosphorylation",
    "activation_binding_association",
    "indirect effect",
    "activation_compound",
    "activation_ubiquination"
]


[docs]def bel_to_spia_matrices(graph: BELGraph) -> Mapping[str, pd.DataFrame]:
    """Create an excel sheet ready to be used in SPIA software.

    :param graph: BELGraph
    :return: dictionary with matrices
    """
    index_nodes = get_matrix_index(graph)
    spia_matrices = build_spia_matrices(index_nodes)

    for u, v, edge_data in graph.edges(data=True):
        # Both nodes are CentralDogma abundances
        if isinstance(u, CentralDogma) and isinstance(v, CentralDogma):
            # Update matrix dict
            update_spia_matrices(spia_matrices, u, v, edge_data)

        # Subject is CentralDogmaAbundance and node is ListAbundance
        elif isinstance(u, CentralDogma) and isinstance(v, ListAbundance):
            # Add a relationship from subject to each of the members in the object
            for node in v.members:
                # Skip if the member is not in CentralDogma
                if not isinstance(node, CentralDogma):
                    continue

                update_spia_matrices(spia_matrices, u, node, edge_data)

        # Subject is ListAbundance and node is CentralDogmaAbundance
        elif isinstance(u, ListAbundance) and isinstance(v, CentralDogma):
            # Add a relationship from each of the members of the subject to the object
            for node in u.members:
                # Skip if the member is not in CentralDogma
                if not isinstance(node, CentralDogma):
                    continue

                update_spia_matrices(spia_matrices, node, v, edge_data)

        # Both nodes are ListAbundance
        elif isinstance(u, ListAbundance) and isinstance(v, ListAbundance):
            for sub_member, obj_member in product(u.members, v.members):
                # Update matrix if both are CentralDogma
                if isinstance(sub_member, CentralDogma) and isinstance(obj_member, CentralDogma):
                    update_spia_matrices(spia_matrices, sub_member, obj_member, edge_data)

        # else Not valid edge

    return spia_matrices


def get_matrix_index(graph: BELGraph) -> Set[str]:
    """Return set of HGNC names from Proteins/Rnas/Genes/miRNA, nodes that can be used by SPIA."""
    # TODO: Using HGNC Symbols for now
    return {
        node.name
        for node in graph
        if isinstance(node, CentralDogma) and node.namespace.upper() == 'HGNC'
    }


def build_spia_matrices(nodes: Set[str]) -> Dict[str, pd.DataFrame]:
    """Build an adjacency matrix for each KEGG relationship and return in a dictionary.

    :param nodes: A set of HGNC gene symbols
    :return: Dictionary of adjacency matrix for each relationship
    """
    nodes = list(sorted(nodes))

    # Create sheets of the excel in the given order
    matrices = OrderedDict()
    for relation in KEGG_RELATIONS:
        matrices[relation] = pd.DataFrame(0, index=nodes, columns=nodes)

    return matrices


def update_spia_matrices(spia_matrices: Dict[str, pd.DataFrame],
                         u: CentralDogma,
                         v: CentralDogma,
                         edge_data: EdgeData,
                         ) -> None:
    """Populate the adjacency matrix."""
    if u.namespace.upper() != 'HGNC' or v.namespace.upper() != 'HGNC':
        return

    u_name = u.name
    v_name = v.name
    relation = edge_data[RELATION]

    if relation in CAUSAL_INCREASE_RELATIONS:
        # If it has pmod check which one and add it to the corresponding matrix
        if v.variants and any(isinstance(variant, ProteinModification) for variant in v.variants):
            for variant in v.variants:
                if not isinstance(variant, ProteinModification):
                    continue
                if variant[IDENTIFIER][NAME] == "Ub":
                    spia_matrices["activation_ubiquination"][u_name][v_name] = 1
                elif variant[IDENTIFIER][NAME] == "Ph":
                    spia_matrices["activation_phosphorylation"][u_name][v_name] = 1
        elif isinstance(v, (Gene, Rna)):  # Normal increase, add activation
            spia_matrices['expression'][u_name][v_name] = 1
        else:
            spia_matrices['activation'][u_name][v_name] = 1

    elif relation in CAUSAL_DECREASE_RELATIONS:
        # If it has pmod check which one and add it to the corresponding matrix
        if v.variants and any(isinstance(variant, ProteinModification) for variant in v.variants):
            for variant in v.variants:
                if not isinstance(variant, ProteinModification):
                    continue
                if variant[IDENTIFIER][NAME] == "Ub":
                    spia_matrices['inhibition_ubiquination'][u_name][v_name] = 1
                elif variant[IDENTIFIER][NAME] == "Ph":
                    spia_matrices["inhibition_phosphorylation"][u_name][v_name] = 1
        elif isinstance(v, (Gene, Rna)):  # Normal decrease, check which matrix
            spia_matrices["repression"][u_name][v_name] = 1
        else:
            spia_matrices["inhibition"][u_name][v_name] = 1

    elif relation == ASSOCIATION:
        spia_matrices["binding_association"][u_name][v_name] = 1


[docs]def spia_matrices_to_excel(spia_matrices: Mapping[str, pd.DataFrame], path: str) -> None:
    """Export a SPIA data dictionary into an Excel sheet at the given path.

    .. note::

        # The R import should add the values:
        # ["nodes"] from the columns
        # ["title"] from the name of the file
        # ["NumberOfReactions"] set to "0"
    """
    writer = pd.ExcelWriter(path, engine='xlsxwriter')

    for relation, df in spia_matrices.items():
        df.to_excel(writer, sheet_name=relation, index=False)

    # Save excel
    writer.save()


[docs]def spia_matrices_to_tsvs(spia_matrices: Mapping[str, pd.DataFrame], directory: str) -> None:
    """Export a SPIA data dictionary into a directory as several TSV documents."""
    os.makedirs(directory, exist_ok=True)
    for relation, df in spia_matrices.items():
        df.to_csv(os.path.join(directory, f'{relation}.tsv'), index=True)


@click.command()
@graph_pickle_argument
@click.option('--xlsx', type=click.Path(file_okay=True, dir_okay=False))
@click.option('--tsvs', type=click.Path(file_okay=False, dir_okay=True))
def main(graph: BELGraph, xlsx: str, tsvs: str):
    """Export the graph to a SPIA Excel sheet."""
    if not xlsx and not tsvs:
        click.secho('Specify at least one option --xlsx or --tsvs', fg='red')
        sys.exit(1)

    spia_matrices = bel_to_spia_matrices(graph)

    if xlsx:
        spia_matrices_to_excel(spia_matrices, xlsx)

    if tsvs:
        spia_matrices_to_tsvs(spia_matrices, tsvs)


if __name__ == '__main__':
    main()