# -*- coding: utf-8 -*-
"""Convert a BEL graph to HiPathia inputs.
Input
-----
SIF File
~~~~~~~~
- Text file with three columns separated by tabs.
- Each row represents an interaction in the pathway. First column is the source
node, third column the target node, and the second is the type of relation
between them.
- Only activation and inhibition interactions are allowed.
- The name of the nodes in this file will be stored as the IDs of the nodes.
- The nodes IDs should have the following structure: N (dash) pathway ID (dash)
node ID.
- HiPathia distinguishes between two types of nodes: simple and complex.
Simple nodes:
- Simple nodes may include many genes, but only one is needed to perform the
function of the node. This could correspond to a protein family of enzymes
that all have the same function - only one of them needs to be present for
the action to take place. Simple nodes are defined within
- Node IDs from simple nodes do not include any space, i.e. N-hsa04370-11.
Complex nodes:
- Complex nodes include different simple nodes and represent protein complexes.
Each simple node within the complex represents one protein in the complex.
This node requires the presence of all their simple nodes to perform its
function.
- Node IDs from complex nodes are the juxtaposition of the included simple node
IDs, separated by spaces, i.e. N-hsa04370-10 26.
ATT File
~~~~~~~~
Text file with twelve (12) columns separated by tabs. Each row represents a node (either simple or complex).
The columns included are:
1. ``ID``: Node ID as explained above.
2. ``label``: Name to be shown in the picture of the pathway, given in HGNC nomenclature. Generally, the gene name of the first
included EntrezID gene is used as label. For complex nodes, we juxtapose the gene names of the first genes of
each simple node included (see genesList column below).
3. ``X``: The X-coordinate of the position of the node in the pathway.
4. ``Y``: The Y-coordinate of the position of the node in the pathway.
5. ``color``: The default color of the node.
6. ``shape``: The shape of the node. "rectangle" should be used for genes and "circle" for metabolites.
7. ``type``: The type of the node, either "gene" for genes or "compound" for metabolites. For complex nodes, the
type of each of their included simple nodes is juxtaposed separated by commas, i.e. gene,gene.
8. ``label.cex``: Amount by which plotting label should be scaled relative to the default.
9. ``label.color``: Default color of the node.
10. ``width``: Default width of the node.
11. ``height``: Default height of the node.
12. ``genesList``: List of genes included in each node, with EntrezID:
- Simple nodes: EntrezIDs of the genes included, separated by commas (",") and no spaces, i.e. 56848,8877 for
node N-hsa04370-11.
- Complex nodes: GenesList of the simple nodes included, separated by a slash ("/") and no spaces, and in the
same order as in the node ID. For example, node N-hsa04370-10 26 includes two simple nodes: 10 and 26. Its
genesList column is 5335,5336,/,9047, meaning that the genes included in node 10 are 5335 and 5336, and the
gene included in node 26 is 9047.
"""
import logging
import os
from collections import defaultdict
from itertools import groupby
from operator import itemgetter
from typing import List, Optional, Set, Tuple, Union
import networkx as nx
import pandas as pd
from ..constants import (
CAUSAL_INCREASE_RELATIONS,
CAUSAL_POLAR_RELATIONS,
CITATION_TYPE_OTHER,
IS_A,
RELATION,
)
from ..dsl import ComplexAbundance, Protein, hgnc
from ..struct import BELGraph
# Names exported by a star-import of this module.
__all__ = [
    "from_hipathia_paths",
    "from_hipathia_dfs",
    "to_hipathia",
    "to_hipathia_dfs",
]
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# The three ATT columns that ``from_hipathia_dfs`` reads from the ATT dataframe.
ATT_COLS = ["ID", "label", "genesList"]
def from_hipathia_paths(name: str, att_path: str, sif_path: str) -> BELGraph:
    """Get a BEL graph from HiPathia files.

    :param name: The pathway identifier that appears in the node IDs (``N-{name}-...``)
    :param att_path: The path to the tab-separated ATT file, which has a header row
    :param sif_path: The path to the tab-separated SIF file, which has no header row
    :return: The BEL graph built by :func:`from_hipathia_dfs` from the two tables
    """
    # Node IDs, labels, and gene lists are identifiers and must be read as text.
    # Without this, a column that happens to contain only digits (e.g., an ATT
    # file in which every node has exactly one gene) is inferred as an integer
    # column and the string splitting done downstream fails.
    att_df = pd.read_csv(att_path, sep="\t", dtype=dict.fromkeys(ATT_COLS, str))
    sif_df = pd.read_csv(
        sif_path,
        sep="\t",
        header=None,
        names=["source", "relation", "target"],
        dtype=str,
    )
    return from_hipathia_dfs(name=name, att_df=att_df, sif_df=sif_df)
def group_delimited_list(entries: List[str], sep: str = "/") -> List[List[str]]:
    """Split a flat list into the runs of entries found between separator entries.

    Entries equal to ``sep`` are dropped and never produce an empty group, so
    leading, trailing, and repeated separators are ignored.
    """
    groups = []
    current = []
    for entry in entries:
        if entry == sep:
            # A separator closes the running group, if there is one
            if current:
                groups.append(current)
                current = []
        else:
            current.append(entry)
    if current:
        groups.append(current)
    return groups
def _p(identifier: str):
    """Build a protein in the ``ncbigene`` namespace with the given identifier (no name is set)."""
    reference = {"namespace": "ncbigene", "identifier": identifier}
    return Protein(**reference)
def _f(identifier: str):
    """Build a protein in the ``hipathia.family`` namespace with the given identifier (no name is set)."""
    reference = {"namespace": "hipathia.family", "identifier": identifier}
    return Protein(**reference)
def from_hipathia_dfs(name: str, att_df: pd.DataFrame, sif_df: pd.DataFrame) -> BELGraph:
    """Get a BEL graph from HiPathia dataframes.

    Neither of the input dataframes is modified.

    :param name: The pathway identifier. Every node ID must start with ``N-{name}-``.
    :param att_df: The ATT dataframe. Only the ``ID``, ``label``, and ``genesList`` columns are read.
    :param sif_df: The SIF dataframe. It must have exactly three columns, in the order source, relation,
     target, of which the first and last are named ``source`` and ``target``.
    :return: A BEL graph with one protein per single-gene node, one ``hipathia.family`` protein (with
     ``isA`` edges from its member genes) per multi-gene node, one complex per complex node, and an
     increases/decreases edge per activation/inhibition row of the SIF dataframe.
    :raises ValueError: If a node ID does not start with ``N-{name}-`` or a relation is neither
     ``activation`` nor ``inhibition``.
    """
    prefix = "N-{name}-".format(name=name)

    def _split_name(s):
        """Get the simple node identifiers in a node ID, in the order in which they are written."""
        # The prefix is stripped positionally, so it has to be at the very beginning
        if not s.startswith(prefix):
            raise ValueError("wrong name for pathway")
        return tuple(s[len(prefix):].split(" "))

    def _clean_name(s):
        """Get the order-independent lookup key for a node ID."""
        return tuple(sorted(_split_name(s)))

    # Parse into local series instead of overwriting the columns of the caller's dataframe.
    # The components are kept in their written order here because the labels and
    # the gene lists of a complex node are given in that same order.
    component_tuples = att_df["ID"].map(_split_name)
    label_lists = att_df["label"].str.split(" ")
    gene_lists = att_df["genesList"].str.split(",").map(group_delimited_list)

    simple_node_to_dsl = {}
    family_node_to_dsl = {}
    complex_node_to_dsl = {}

    graph = BELGraph(name=name)

    def _get_simple_dsl(component, label, entrez_ids):
        """Make and register the DSL for a simple node, which is a protein or a protein family."""
        if len(entrez_ids) == 1:  # just a protein
            simple_dsl = _p(identifier=entrez_ids[0])
            simple_node_to_dsl[component] = simple_dsl
            return simple_dsl
        # a protein family, whose members are connected to it in the graph
        family_dsl = _f(identifier=label)
        for entrez_id in entrez_ids:
            graph.add_is_a(_p(identifier=entrez_id), family_dsl)
        family_node_to_dsl[component] = family_dsl
        return family_dsl

    for components, component_label_lists, component_gene_lists in zip(component_tuples, label_lists, gene_lists):
        if not components:
            raise ValueError("missing components in row")

        if len(components) == 1:  # This is a simple node, representing a protein or protein family
            _get_simple_dsl(components[0], component_label_lists[0], component_gene_lists[0])
        else:  # This is a complex node, representing a protein complex of simple nodes
            component_dsls = [
                _get_simple_dsl(component, label, entrez_ids)
                for component, label, entrez_ids in zip(components, component_label_lists, component_gene_lists)
            ]
            component_dsl = ComplexAbundance(component_dsls)
            graph.add_node_from_data(component_dsl)
            # Only the lookup key is sorted, so it matches what _clean_name makes for the SIF rows
            complex_node_to_dsl[tuple(sorted(components))] = component_dsl

    # Remap all of the dictionaries so every node is keyed by a tuple of its components
    key_to_dsl = dict(complex_node_to_dsl)
    for component, dsl in simple_node_to_dsl.items():
        key_to_dsl[(component,)] = dsl
    for component, dsl in family_node_to_dsl.items():
        key_to_dsl[(component,)] = dsl

    sif_df = sif_df.copy()  # the columns are overwritten below, so work on a private copy
    sif_df["source"] = sif_df["source"].map(_clean_name).map(key_to_dsl.get)
    sif_df["target"] = sif_df["target"].map(_clean_name).map(key_to_dsl.get)

    for source, relation, target in sif_df.values:
        if relation == "activation":
            graph.add_increases(source, target, citation=(CITATION_TYPE_OTHER, "HiPathia"), evidence="")
        elif relation == "inhibition":
            graph.add_decreases(source, target, citation=(CITATION_TYPE_OTHER, "HiPathia"), evidence="")
        else:
            raise ValueError("unknown relation: {relation}".format(relation=relation))

    return graph
def to_hipathia(
    graph: BELGraph,
    directory: str,
    draw: bool = True,
) -> None:
    """Export HiPathia artifacts for the graph.

    Writes ``{graph.name}.att`` and ``{graph.name}.sif`` as tab-separated files in the directory.
    Nothing is written (and a warning is logged) if the graph can not be converted.

    :param graph: A BEL graph
    :param directory: The directory in which the files are written
    :param draw: If true, the directory is also given to :func:`to_hipathia_dfs` as the place
     to output a drawing of the pathway
    """
    att_df, sif_df = to_hipathia_dfs(graph, draw_directory=directory if draw else None)
    if att_df is None or sif_df is None:
        logger.warning("can not convert graph %s", graph.name)
        return
    att_path = os.path.join(directory, "{}.att".format(graph.name))
    sif_path = os.path.join(directory, "{}.sif".format(graph.name))
    att_df.to_csv(att_path, sep="\t", index=False)
    # A SIF file has no header row: every row is an interaction. Writing the
    # dataframe's column labels would add a bogus first interaction.
    sif_df.to_csv(sif_path, sep="\t", index=False, header=False)
def _is_node_family(graph: BELGraph, node: Protein) -> Optional[Set[Protein]]:
    """Collect the nodes that have an ``isA`` edge pointing to the given protein node.

    :return: The (possibly empty) set of children, or none if any child is not a protein
    """
    children = {
        source
        for source, _, edge_data in graph.in_edges(node, data=True)
        if edge_data[RELATION] == IS_A
    }
    if any(not isinstance(child, Protein) for child in children):
        logger.warning("not all children of {} are proteins: {}".format(node, children))
        return None
    return children
def to_hipathia_dfs(
    graph: BELGraph,
    draw_directory: Optional[str] = None,
) -> Union[Tuple[None, None], Tuple[pd.DataFrame, pd.DataFrame]]:
    """Get the ATT and SIF dataframes.

    :param graph: A BEL graph
    :param draw_directory: The directory in which a drawing should be output. If given and
     matplotlib can be imported, ``{graph.name}.png`` is saved there.
    :return: A pair of the ATT dataframe and the SIF dataframe, or a pair of nones if the
     layout of the exported nodes is empty (i.e., there is nothing to export).

    1. Identify nodes:
        1. Identify all proteins
        2. Identify all protein families
        3. Identify all complexes with just a protein or a protein family in them
    2. Identify interactions between any of those things that are causal
    3. Profit!
    """
    proteins = set()
    families = defaultdict(set)
    complexes = set()

    for node in sorted(graph, key=str):
        if isinstance(node, Protein):
            children = _is_node_family(graph, node)
            if children:
                families[node] = children
            else:
                proteins.add(node)
        elif isinstance(node, ComplexAbundance) and all(isinstance(m, Protein) for m in node.members):
            complexes.add(node)

    families = {node: sorted(values, key=str) for node, values in sorted(families.items(), key=itemgetter(0))}

    nodes = sorted(proteins.union(families).union(complexes), key=str)

    # Collect the causal edges going out of the identified nodes, in a reproducible order
    new_nodes = set()
    edges = []
    for u, v, _, d in sorted(
        graph.out_edges(nodes, keys=True, data=True),
        key=lambda t: (str(t[0]), str(t[1]), t[2]),
    ):
        relation = d[RELATION]
        if relation not in CAUSAL_POLAR_RELATIONS:
            continue
        new_nodes.add(u)
        new_nodes.add(v)
        edges.append(
            (
                u,
                "activation" if relation in CAUSAL_INCREASE_RELATIONS else "inhibition",
                v,
            )
        )

    att = {}  # from node ID to its labels and gene lists
    dsl_to_k = {}  # from DSL node to node ID
    i = 0  # running counter from which the simple node identifiers are taken
    for node in sorted(new_nodes, key=str):
        if node in families:
            i += 1
            k = (i,)
            children = families[node]
            child_identifiers = [child.identifier for child in children]
            if not all(child_identifiers):
                logger.warning("not all children were grounded: %s", child_identifiers)
                continue
            labels, genes_lists = [node.name], [child_identifiers]
        elif isinstance(node, Protein):
            if not node.identifier or not node.name:
                logger.warning("node was not grounded: %s", node)
                continue
            i += 1
            k = (i,)
            labels, genes_lists = [node.name], [[node.identifier]]
        elif isinstance(node, ComplexAbundance):
            # Get the genes of all members before anything else. The complex
            # is skipped as a whole if any member is not grounded, since it would
            # otherwise be exported with fewer gene lists than labels.
            genes_lists = []
            for member in node.members:
                if member in families:
                    member_identifiers = [child.identifier for child in families[member]]
                    if not all(member_identifiers):
                        logger.warning("not all children were grounded: %s", member_identifiers)
                        genes_lists = None
                        break
                elif member.identifier:
                    member_identifiers = [member.identifier]
                else:
                    logger.warning("member was not grounded: %s", member)
                    genes_lists = None
                    break
                genes_lists.append(member_identifiers)
            if genes_lists is None:
                continue

            k, labels = [], []
            for member in node.members:
                i += 1
                k.append(i)
                labels.append(member.name)
            k = tuple(k)
        else:
            logger.debug("skipping node {}".format(node))
            continue

        k = "N-{}-{}".format(graph.name, " ".join(map(str, k)))
        att[k] = labels, genes_lists
        dsl_to_k[node] = k

    # Only keep the edges for which both ends got a node ID
    edges = [
        (dsl_to_k[source], relation, dsl_to_k[target])
        for source, relation, target in edges
        if source in dsl_to_k and target in dsl_to_k
    ]
    sif_df = pd.DataFrame(edges)

    # Lay out the exported nodes to get the coordinates for the ATT dataframe
    composite_graph = nx.Graph([(k_source, k_target) for k_source, _, k_target in edges])
    try:
        from networkx.drawing.nx_agraph import pygraphviz_layout

        pos = pygraphviz_layout(composite_graph, prog="neato", args="-Gstart=5")
    except ImportError:
        logger.warning("could not import pygraphviz. Falling back to force directed")
        pos = nx.fruchterman_reingold_layout(composite_graph, seed=5)

    if not pos:
        return None, None

    nx_labels = {}  # from k to label

    # The coordinates are shifted so the smallest ones become zero
    min_x = min(x for x, y in pos.values())
    min_y = min(y for x, y in pos.values())

    att_rows = []
    for k, (labels, genes_lists) in sorted(att.items()):
        if k not in pos:
            logger.warning("node not in graph: %s", k)
            continue
        nx_labels[k] = label = " ".join(labels)
        types = ",".join(["gene"] * len(labels))
        gene_list = ",/,".join(",".join(gene_list) for gene_list in genes_lists)
        x, y = pos[k]
        att_rows.append(
            (
                k,  # 1. ID
                label,  # 2. label
                int(100 * (x - min_x)),  # 3. X
                int(100 * (y - min_y)),  # 4. Y
                "white",  # 5. color
                "rectangle",  # 6. shape
                types,  # 7. type
                0.5,  # 8. label.cex
                "black",  # 9. label.color
                46,  # 10. width
                17,  # 11. height
                gene_list,  # 12. gene list
            )
        )

    att_df = pd.DataFrame(
        att_rows,
        columns=[
            "ID",
            "label",
            "X",
            "Y",
            "color",
            "shape",
            "type",
            "label.cex",
            "label.color",
            "width",
            "height",
            "genesList",
        ],
    )

    if draw_directory is not None:
        try:
            import matplotlib.pyplot as plt
        except ImportError:
            logger.warning("could not draw graph because matplotlib is not installed")
        else:
            figure = plt.figure(figsize=(20, 20))
            try:
                nx.draw_networkx(composite_graph, pos, labels=nx_labels)
                plt.axis("off")
                plt.savefig(os.path.join(draw_directory, "{}.png".format(graph.name)))
            finally:
                # pyplot keeps every figure alive until it is closed explicitly
                plt.close(figure)

    return att_df, sif_df
def make_hsa047370() -> BELGraph:
    """Build the example ``hsa04370`` BEL graph that mirrors the example data from Marina."""
    graph = BELGraph(name="hsa04370")

    # Gene symbols of the simple nodes, keyed by their node number
    symbols = {
        1: "CDC42",
        9: "KDR",
        11: "SPHK2",
        14: "PTGS2",
        15: "PLA2G4B",
        16: "HSPB1",
        17: "MAPKAPK3",
        18: "PPP3CA",
        19: "AKT3",
        20: "PIK3R5",
        21: "NFATC2",
        22: "PRKCA",
        23: "PTK2",
        24: "MAPK14",
        25: "PXN",
        27: "SRC",
        28: "SHC2",
        29: "VEGFA",
        32: "MAPK1",
        33: "MAP2K1",
        34: "RAF1",
        35: "HRAS",
        36: "NOS3",
        37: "CASP9",
        38: "BAD",
        39: "RAC1",
    }
    nodes = {number: hgnc(name=symbol) for number, symbol in symbols.items()}
    # Node 10 is the only complex node
    nodes[10] = ComplexAbundance([hgnc(name="PLCG1"), hgnc(name="SH2D2A")])

    # (source, sign, target) where "+" is a direct increase and "-" a direct
    # decrease. The edges are added to the graph in exactly this order.
    interactions = [
        (1, "+", 24),
        (9, "+", 28),
        (9, "+", 23),
        (9, "+", 25),
        (9, "+", 20),
        (9, "+", 27),
        (9, "+", 10),
        (11, "+", 35),
        (17, "+", 16),
        (18, "+", 21),
        (19, "+", 36),
        (19, "-", 37),
        (19, "-", 38),
        (20, "+", 39),
        (20, "+", 19),
        (21, "+", 14),
        (22, "+", 34),
        (22, "+", 11),
        (24, "+", 17),
        (27, "+", 20),
        (29, "+", 9),
        (32, "+", 15),
        (33, "+", 32),
        (34, "+", 33),
        (35, "+", 34),
        (10, "+", 18),
        (10, "+", 22),
        (10, "+", 15),
        (10, "+", 36),
    ]
    for source, sign, target in interactions:
        if sign == "+":
            graph.add_directly_increases(nodes[source], nodes[target], citation="", evidence="")
        else:
            graph.add_directly_decreases(nodes[source], nodes[target], citation="", evidence="")

    return graph