Source code for pybel.io.spia

# -*- coding: utf-8 -*-

"""An exporter for signaling pathway impact analysis (SPIA) described by [Tarca2009]_.

.. [Tarca2009] Tarca, A. L., *et al* (2009). `A novel signaling pathway impact analysis
               <https://doi.org/10.1093/bioinformatics/btn577>`_. Bioinformatics, 25(1), 75–82.

.. seealso:: https://bioconductor.org/packages/release/bioc/html/SPIA.html
"""

import itertools as itt
import os
from collections import OrderedDict
from typing import Dict, Mapping, Set

import pandas as pd

from ..constants import (
    ASSOCIATION,
    CAUSAL_DECREASE_RELATIONS,
    CAUSAL_INCREASE_RELATIONS,
    RELATION,
)
from ..dsl import CentralDogma, Gene, ListAbundance, ProteinModification, Rna
from ..language import pmod_mappings
from ..struct import BELGraph
from ..typing import EdgeData

# Public API of this module: the three exporters defined below.
__all__ = [
    "to_spia_dfs",
    "to_spia_excel",
    "to_spia_tsvs",
]

# Mapping from a relation name to the adjacency matrix (data frame) built for that relation.
SPIADataFrames = Mapping[str, pd.DataFrame]

# Relation names for which ``build_spia_matrices`` creates one adjacency matrix each.
# The matrices are created (and therefore exported) in the order listed here.
# NOTE(review): presumably this mirrors the relation vocabulary of the SPIA R package - confirm
# against the SPIA documentation before renaming or reordering entries.
KEGG_RELATIONS = [
    "activation",
    "compound",
    "binding/association",
    "expression",
    "inhibition",
    "activation_phosphorylation",
    "phosphorylation",
    "inhibition_phosphorylation",
    "inhibition_dephosphorylation",
    "dissociation",
    "dephosphorylation",
    "activation_dephosphorylation",
    "state change",
    "activation_indirect effect",
    "inhibition_ubiquination",
    "ubiquination",
    "expression_indirect effect",
    "inhibition_indirect effect",
    "repression",
    "dissociation_phosphorylation",
    "indirect effect_phosphorylation",
    "activation_binding/association",
    "indirect effect",
    "activation_compound",
    "activation_ubiquination",
]


def to_spia_excel(graph: BELGraph, path: str) -> None:
    """Export the SPIA adjacency matrices of a BEL graph to an Excel workbook.

    :param graph: A BEL graph
    :param path: Location of the Excel file to write
    """
    spia_matrices_to_excel(to_spia_dfs(graph), path)
def to_spia_tsvs(graph: BELGraph, directory: str) -> None:
    """Export the SPIA adjacency matrices of a BEL graph as TSV files in a directory.

    :param graph: A BEL graph
    :param directory: Directory that receives one file per relation
    """
    spia_matrices_to_tsvs(to_spia_dfs(graph), directory)
def _spia_participants(node) -> list:
    """Return the CentralDogma entities an edge endpoint contributes to the SPIA matrices."""
    if isinstance(node, CentralDogma):
        return [node]
    if isinstance(node, ListAbundance):
        # Only the members that are themselves CentralDogma entities take part
        return [member for member in node.members if isinstance(member, CentralDogma)]
    # Any other kind of node cannot be represented
    return []


def to_spia_dfs(graph: BELGraph) -> SPIADataFrames:
    """Build the SPIA adjacency matrices for a BEL graph.

    Every edge is expanded into the pairs of CentralDogma entities it connects: a
    CentralDogma endpoint stands for itself, while a ListAbundance endpoint stands
    for each of its CentralDogma members. Each resulting pair is recorded with
    :func:`update_spia_matrices`. Edges with any other kind of endpoint are ignored.

    :param graph: BELGraph
    :return: dictionary with matrices
    """
    matrices = build_spia_matrices(get_matrix_index(graph))

    for source, target, data in graph.edges(data=True):
        pairs = itt.product(_spia_participants(source), _spia_participants(target))
        for sub_member, obj_member in pairs:
            update_spia_matrices(matrices, sub_member, obj_member, data)

    return matrices
def get_matrix_index(graph: BELGraph) -> Set[str]:
    """Return the names of the nodes in the graph that can be used by SPIA.

    A node qualifies when it is a :class:`CentralDogma` instance whose namespace,
    compared case-insensitively, is ``HGNC``.

    :param graph: A BEL graph
    :return: The set of names of the qualifying nodes
    """
    # TODO: Using HGNC Symbols for now
    return {node.name for node in graph if isinstance(node, CentralDogma) and node.namespace.upper() == "HGNC"}


def build_spia_matrices(nodes: Set[str]) -> Dict[str, pd.DataFrame]:
    """Build an adjacency matrix for each KEGG relationship and return in a dictionary.

    :param nodes: A set of HGNC gene symbols
    :return: Dictionary of adjacency matrix for each relationship
    """
    # Sort once so that all matrices share the same, deterministic row/column order
    labels = sorted(nodes)

    # Insertion order follows KEGG_RELATIONS, which fixes the order of the exported sheets
    matrices = OrderedDict()
    for relation in KEGG_RELATIONS:
        matrices[relation] = pd.DataFrame(0, index=labels, columns=labels)

    return matrices


UB_NAMES = {"Ub"} | {e.name for e in pmod_mappings["Ub"]["xrefs"]}
PH_NAMES = {"Ph"} | {e.name for e in pmod_mappings["Ph"]["xrefs"]}


def _mark_edge(matrix: pd.DataFrame, source: str, target: str) -> None:
    """Set the cell for the edge ``source -> target`` to 1 (column = source, row = target).

    :raises KeyError: if either name is not a label of the matrix
    """
    if source not in matrix.columns or target not in matrix.index:
        # Without this check ``.at`` would silently enlarge the matrix
        raise KeyError("node not in SPIA matrix: {} -> {}".format(source, target))
    # One-step label assignment; chained ``matrix[source][target] = 1`` is not
    # guaranteed to write through to the frame (pandas Copy-on-Write)
    matrix.at[target, source] = 1


def _mark_causal_edge(
    spia_matrices: Dict[str, pd.DataFrame],
    u: CentralDogma,
    v: CentralDogma,
    ubiquitination_key: str,
    phosphorylation_key: str,
    transcript_key: str,
    default_key: str,
) -> None:
    """Record a causal edge in the matrix that matches the kind of the object node."""
    u_name = u.name
    v_name = v.name
    variants = v.variants

    if variants and any(isinstance(variant, ProteinModification) for variant in variants):
        # The object carries protein modifications: one entry per recognized modification
        for variant in variants:
            if not isinstance(variant, ProteinModification):
                continue
            modification = variant.entity.name
            if modification in UB_NAMES:
                _mark_edge(spia_matrices[ubiquitination_key], u_name, v_name)
            elif modification in PH_NAMES:
                _mark_edge(spia_matrices[phosphorylation_key], u_name, v_name)
            # Any other modification has no dedicated matrix and is not recorded
    elif isinstance(v, (Gene, Rna)):
        # The object is a gene or transcript: the edge regulates its expression
        _mark_edge(spia_matrices[transcript_key], u_name, v_name)
    else:
        # Plain increase/decrease of the object
        _mark_edge(spia_matrices[default_key], u_name, v_name)


def update_spia_matrices(
    spia_matrices: Dict[str, pd.DataFrame],
    u: CentralDogma,
    v: CentralDogma,
    edge_data: EdgeData,
) -> None:
    """Populate the adjacency matrix.

    The edge is ignored unless both nodes are in the HGNC namespace (compared
    case-insensitively). Causal increases, causal decreases and associations are
    recorded; every other relation is ignored. In each matrix the column is the
    subject ``u`` and the row is the object ``v``.

    :param spia_matrices: Adjacency matrices keyed by relation name, modified in place
    :param u: The subject of the edge
    :param v: The object of the edge
    :param edge_data: The data dictionary of the edge; its relation is read
    :raises KeyError: if the name of ``u`` or ``v`` is not a label of the matrices
    """
    if u.namespace.lower() != "hgnc" or v.namespace.lower() != "hgnc":
        return

    relation = edge_data[RELATION]

    if relation in CAUSAL_INCREASE_RELATIONS:
        _mark_causal_edge(
            spia_matrices,
            u,
            v,
            ubiquitination_key="activation_ubiquination",
            phosphorylation_key="activation_phosphorylation",
            transcript_key="expression",
            default_key="activation",
        )
    elif relation in CAUSAL_DECREASE_RELATIONS:
        _mark_causal_edge(
            spia_matrices,
            u,
            v,
            ubiquitination_key="inhibition_ubiquination",
            phosphorylation_key="inhibition_phosphorylation",
            transcript_key="repression",
            default_key="inhibition",
        )
    elif relation == ASSOCIATION:
        # The key must match the spelling used in KEGG_RELATIONS
        _mark_edge(spia_matrices["binding/association"], u.name, v.name)


def _export_name(relation: str) -> str:
    """Return the relation name with ``/`` replaced by ``_``.

    A slash is not allowed in an Excel sheet name and would be read as a directory
    separator in a file name.
    """
    return relation.replace("/", "_")


def spia_matrices_to_excel(spia_matrices: SPIADataFrames, path: str) -> None:
    """Export a SPIA data dictionary into an Excel sheet at the given path.

    One sheet is written per relation, without the row labels. A ``/`` in a relation
    name is written as ``_`` in the sheet name.

    .. note::

        # The R import should add the values:
        # ["nodes"] from the columns
        # ["title"] from the name of the file
        # ["NumberOfReactions"] set to "0"

    :param spia_matrices: Adjacency matrices keyed by relation name
    :param path: Location of the Excel file to write
    """
    # The context manager saves and closes the workbook, also when an export fails
    with pd.ExcelWriter(path, engine="xlsxwriter") as writer:
        for relation, df in spia_matrices.items():
            df.to_excel(writer, sheet_name=_export_name(relation), index=False)


def spia_matrices_to_tsvs(spia_matrices: SPIADataFrames, directory: str) -> None:
    """Export a SPIA data dictionary into a directory as several TSV documents.

    The directory is created if needed. One tab-separated file named
    ``<relation>.tsv`` is written per relation, including the row labels. A ``/`` in
    a relation name is written as ``_`` in the file name.

    :param spia_matrices: Adjacency matrices keyed by relation name
    :param directory: Directory that receives the files
    """
    os.makedirs(directory, exist_ok=True)
    for relation, df in spia_matrices.items():
        file_name = "{relation}.tsv".format(relation=_export_name(relation))
        df.to_csv(os.path.join(directory, file_name), sep="\t", index=True)