Source code for pybel.io.jgif

# -*- coding: utf-8 -*-

"""Conversion functions for BEL graphs with JGIF JSON.

The JSON Graph Interchange Format (JGIF) is `specified <http://jsongraphformat.info/>`_ similarly to the Node-Link
JSON. Interchange with this format provides compatibility with other software and repositories, such as the
`Causal Biological Network Database <http://causalbionet.com/>`_.
"""

import gzip
import json
import logging
import re
from collections import defaultdict
from operator import methodcaller
from typing import Any, Mapping, Optional, TextIO, Union

import requests
from networkx.utils import open_file
from pyparsing import ParseException

from .. import constants as pc
from ..constants import (
    ANNOTATIONS,
    CITATION,
    EVIDENCE,
    GRAPH_ANNOTATION_URL,
    GRAPH_NAMESPACE_URL,
    METADATA_AUTHORS,
    METADATA_CONTACT,
    METADATA_INSERT_KEYS,
    METADATA_LICENSES,
    RELATION,
    UNQUALIFIED_EDGES,
)
from ..exceptions import NakedNameWarning, UndefinedNamespaceWarning
from ..parser import BELParser
from ..struct import BELGraph
from ..version import get_version

__all__ = [
    "from_cbn_jgif",
    "from_cbn_jgif_file",
    "from_jgif",
    "from_jgif_file",
    "from_jgif_gz",
    "from_jgif_jsons",
    "to_jgif",
    "to_jgif_file",
    "to_jgif_gz",
    "to_jgif_jsons",
    "post_jgif",
]

logger = logging.getLogger(__name__)

# Maps the lower-cased keys found in a CBN ``experiment_context`` to the
# annotation names that :func:`map_cbn` writes into the normalized context.
annotation_map = {
    "tissue": "Tissue",
    "disease": "Disease",
    "species_common_name": "Species",
    "cell": "Cell",
}

# Maps lower-cased species common names to the identifier string that
# :func:`map_cbn` stores under "Species".
# NOTE(review): these look like NCBI Taxonomy identifiers - confirm.
species_map = {
    "human": "9606",
    "rat": "10116",
    "mouse": "10090",
}

# Evidence text that :func:`from_jgif` treats like missing evidence: an evidence
# whose stripped ``summary_text`` equals this string is skipped.
placeholder_evidence = (
    "This Network edge has no supporting evidence.  Please add real evidence to this edge prior to deleting."
)

# Key of the per-evidence dictionary holding the annotations (experimental context).
EXPERIMENT_CONTEXT = "experiment_context"


def _normalize_cbn_context(context):
    """Return a cleaned copy of one CBN ``experiment_context`` mapping.

    :param dict context: The raw experiment context of a single evidence
    :return: A new dictionary with normalized keys and values
    :rtype: dict
    """
    new_context = {}

    for key, value in context.items():
        # Only strings can be stripped; other JSON values (numbers, lists, ...) are kept as they are
        if isinstance(value, str):
            value = value.strip()

        if not value:
            logger.debug("key %s without value", key)
            continue

        key = key.strip().lower()

        if key == "species_common_name":
            species = species_map.get(str(value).lower())
            if species is None:
                # An unknown species should not abort the whole import, so the given name is kept
                logger.warning("unknown species common name: %s", value)
                species = value
            new_context["Species"] = species
        elif key in annotation_map:
            new_context[annotation_map[key]] = value
        else:
            new_context[key] = value

    return new_context


def map_cbn(d):
    """Pre-process the JSON from the CBN by normalizing each evidence's experiment context.

    For every edge that has ``metadata`` containing ``evidences``, each evidence that has an
    ``experiment_context`` gets that context replaced by a cleaned copy in which:

    - entries whose value is empty (or only whitespace) are dropped
    - keys are stripped and lower-cased
    - ``species_common_name`` is stored under ``Species``, using the value looked up in ``species_map``
      (a name missing from ``species_map`` is logged and kept as given)
    - the other keys listed in ``annotation_map`` are renamed accordingly
    - all remaining keys are kept under their lower-cased name

    The input dictionary is modified in place and also returned.

    :param dict d: Raw JGIF from the CBN
    :return: Preprocessed JGIF
    :rtype: dict
    """
    for edge in d["graph"]["edges"]:
        metadata = edge.get("metadata") or {}

        for evidence in metadata.get("evidences") or ():
            if EXPERIMENT_CONTEXT not in evidence:
                continue

            # ``evidence`` is the very object stored in ``d``, so assigning here updates ``d`` itself
            evidence[EXPERIMENT_CONTEXT] = _normalize_cbn_context(evidence[EXPERIMENT_CONTEXT])

    return d


# Namespace keyword -> URL of the ``.belns`` file defining it.
# :func:`from_cbn_jgif` stores this mapping in the JGIF graph under ``GRAPH_NAMESPACE_URL``.
NAMESPACE_URLS = {
    "TAX": "https://arty.scai.fraunhofer.de/artifactory/bel/namespace/ncbi-taxonomy/ncbi-taxonomy-20200322.belns",
    "HGNC": "https://arty.scai.fraunhofer.de/artifactory/bel/namespace/hgnc-human-genes/hgnc-human-genes-20150601.belns",
    "GOBP": "https://arty.scai.fraunhofer.de/artifactory/bel/namespace/go-biological-process/go-biological-process-20150601.belns",
    "SFAM": "https://arty.scai.fraunhofer.de/artifactory/bel/namespace/selventa-protein-families/selventa-protein-families-20150601.belns",
    "GOCC": "https://arty.scai.fraunhofer.de/artifactory/bel/namespace/go-cellular-component/go-cellular-component-20170511.belns",
    "MESHPP": "https://arty.scai.fraunhofer.de/artifactory/bel/namespace/mesh-processes/mesh-processes-20150601.belns",
    "MGI": "https://arty.scai.fraunhofer.de/artifactory/bel/namespace/mgi-mouse-genes/mgi-mouse-genes-20150601.belns",
    "RGD": "https://arty.scai.fraunhofer.de/artifactory/bel/namespace/rgd-rat-genes/rgd-rat-genes-20150601.belns",
    "CHEBI": "https://arty.scai.fraunhofer.de/artifactory/bel/namespace/chebi/chebi-20150601.belns",
    "SCHEM": "https://arty.scai.fraunhofer.de/artifactory/bel/namespace/selventa-legacy-chemicals/selventa-legacy-chemicals-20150601.belns",
    "EGID": "https://arty.scai.fraunhofer.de/artifactory/bel/namespace/entrez-gene-ids/entrez-gene-ids-20150601.belns",
    "MESHD": "https://arty.scai.fraunhofer.de/artifactory/bel/namespace/mesh-diseases/mesh-diseases-20150601.belns",
    "SDIS": "https://arty.scai.fraunhofer.de/artifactory/bel/namespace/selventa-legacy-diseases/selventa-legacy-diseases-20150601.belns",
    "SCOMP": "https://arty.scai.fraunhofer.de/artifactory/bel/namespace/selventa-named-complexes/selventa-named-complexes-20150601.belns",
    "MESHC": "https://arty.scai.fraunhofer.de/artifactory/bel/namespace/mesh-chemicals/mesh-chemicals-20170511.belns",
    "GOBPID": "https://arty.scai.fraunhofer.de/artifactory/bel/namespace/go-biological-process-ids/go-biological-process-ids-20150601.belns",
    "GOCCID": "https://arty.scai.fraunhofer.de/artifactory/bel/namespace/go-cellular-component-ids/go-cellular-component-ids-20150601.belns",
    "MESHCS": "https://arty.scai.fraunhofer.de/artifactory/bel/namespace/mesh-cell-structures/mesh-cell-structures-20150601.belns",
}

# Annotation keyword -> URL of the ``.belanno`` file defining it. The keys are the
# annotation names that :func:`map_cbn` produces via ``annotation_map``.
# :func:`from_cbn_jgif` stores this mapping in the JGIF graph under ``GRAPH_ANNOTATION_URL``.
ANNOTATION_URLS = {
    "Cell": "https://arty.scai.fraunhofer.de/artifactory/bel/annotation/cell-line/cell-line-20150601.belanno",
    "Disease": "https://arty.scai.fraunhofer.de/artifactory/bel/annotation/disease/disease-20150601.belanno",
    "Species": "https://arty.scai.fraunhofer.de/artifactory/bel/annotation/species-taxonomy-id/species-taxonomy-id-20170511.belanno",
    "Tissue": "https://arty.scai.fraunhofer.de/artifactory/bel/annotation/mesh-anatomy/mesh-anatomy-20150601.belanno",
}

# Every namespace keyword of NAMESPACE_URLS, plus "GO" and "MESH", mapped to a regular
# expression that matches any string. :func:`from_jgif` hands this to ``BELParser`` as
# ``namespace_to_pattern``, so names in these namespaces are accepted without validation.
NAMESPACE_TO_PATTERN = {
    namespace: re.compile(r".*") for namespace in (set(NAMESPACE_URLS) | {"GO", "MESH"})  # don't validate anything
}


@open_file(0, mode="r")
def from_cbn_jgif_file(path: Union[str, TextIO]) -> BELGraph:
    """Load a BEL graph from a file holding the CBN flavor of JGIF.

    The JSON document is read from ``path`` and converted with :func:`from_cbn_jgif`.

    :param path: A path or file-like
    """
    cbn_jgif_dict = json.load(path)
    return from_cbn_jgif(cbn_jgif_dict)
def from_cbn_jgif(graph_jgif_dict):
    """Build a BEL graph from CBN JGIF.

    Map the JGIF used by the Causal Biological Network Database to standard namespace and
    annotations, then builds a BEL graph using :func:`pybel.from_jgif`.

    The given dictionary is modified in place: its evidences are normalized by :func:`map_cbn`,
    the namespace and annotation URL mappings are attached to its ``graph`` entry, and the
    author, license and contact metadata are set (a missing ``metadata`` section is created).

    :param dict graph_jgif_dict: The JSON object representing the graph in JGIF format
    :rtype: BELGraph

    Example:

    .. code-block:: python

        import requests
        from pybel import from_cbn_jgif

        apoptosis_url = 'http://causalbionet.com/Networks/GetJSONGraphFile?networkId=810385422'
        graph_jgif_dict = requests.get(apoptosis_url).json()
        graph = from_cbn_jgif(graph_jgif_dict)

    .. warning::

        Handling the annotations is not yet supported, since the CBN documents do not refer to
        the resources used to create them. This may be added in the future, but the annotations
        must be stripped from the graph before uploading to the network store using
        :func:`pybel.struct.mutation.strip_annotations`.
    """
    graph_jgif_dict = map_cbn(graph_jgif_dict)

    root = graph_jgif_dict["graph"]
    root[GRAPH_NAMESPACE_URL] = NAMESPACE_URLS
    root[GRAPH_ANNOTATION_URL] = ANNOTATION_URLS

    # NOTE(review): the whitespace of this text is reproduced as it appears in the reviewed
    # source; the replace() turns its newline into a tab so the license stays on one line.
    license_text = (
        " Please cite: - www.causalbionet.com - https://bionet.sbvimprover.com"
        " as well as any relevant publications."
        " The sbv IMPROVER project, the website and the Symposia are part of a collaborative project"
        " designed to enable scientists to learn about and contribute to the development of a new crowd"
        " sourcing method for verification of scientific data and results."
        " The current challenges, website and biological network models were developed and are maintained"
        " as part of a collaboration among Selventa, OrangeBus and ADS."
        " The project is led and funded by Philip Morris International."
        " For more information on the focus of Philip Morris International’s research,"
        " please visit www.pmi.com. \n"
    ).replace("\n", "\t")

    # setdefault: a CBN document without a "metadata" section must not raise KeyError here
    root.setdefault("metadata", {}).update(
        {
            METADATA_AUTHORS: "Causal Biological Networks Database",
            METADATA_LICENSES: license_text,
            METADATA_CONTACT: "CausalBiologicalNetworks.RD@pmi.com",
        }
    )

    return from_jgif(graph_jgif_dict)
def from_jgif(graph_jgif_dict, parser_kwargs: Optional[Mapping[str, Any]] = None):  # noqa:C901
    """Build a BEL graph from a JGIF JSON object.

    The graph name and document metadata are copied from the JGIF ``graph`` entry, every node
    label is parsed as a BEL term, and every edge label is parsed as a BEL statement once per
    usable evidence. An evidence is usable when it has a citation with ``type`` and ``id`` and a
    non-empty ``summary_text`` that is not the placeholder text.

    :param dict graph_jgif_dict: The JSON object representing the graph in JGIF format
    :param parser_kwargs: Optional keyword arguments forwarded to :class:`BELParser`. They are
        applied on top of the default ``namespace_to_pattern=NAMESPACE_TO_PATTERN``.
    :rtype: BELGraph
    """
    graph = BELGraph()

    root = graph_jgif_dict["graph"]

    if "label" in root:
        graph.name = root["label"]

    if "metadata" in root:
        metadata = root["metadata"]

        for key in METADATA_INSERT_KEYS:
            if key in metadata:
                graph.document[key] = metadata[key]

    for k in (GRAPH_ANNOTATION_URL, GRAPH_NAMESPACE_URL):
        if k in root:
            graph.graph[k] = root[k]

    # parser_kwargs was previously accepted but ignored; caller-supplied options now take effect
    parser_options = {"namespace_to_pattern": NAMESPACE_TO_PATTERN}
    if parser_kwargs:
        parser_options.update(parser_kwargs)

    parser = BELParser(graph, **parser_options)
    parser.bel_term.addParseAction(parser.handle_term)

    for node in root["nodes"]:
        node_label = node.get("label")

        if node_label is None:
            logger.warning("node missing label: %s", node)
            continue

        try:
            parser.bel_term.parseString(node_label)
        except NakedNameWarning as e:
            logger.info("Naked name: %s", e)
        except UndefinedNamespaceWarning as e:
            logger.info("Undefined namespace: %s", e)
        except ParseException:
            logger.info("Parse exception for %s", node_label)

    for i, edge in enumerate(root["edges"]):
        relation = edge.get("relation")
        if relation is None:
            logger.warning("no relation for edge: %s", edge)

        if relation in {"actsIn", "translocates"}:
            continue  # don't need legacy BEL format

        edge_metadata = edge.get("metadata")
        if edge_metadata is None:
            logger.warning("no metadata for edge: %s", edge)
            continue

        bel_statement = edge.get("label")
        if bel_statement is None:
            # Nothing to parse; going on would only make the parser fail once per evidence
            logger.debug("No BEL statement for edge %s", edge)
            continue

        if relation in UNQUALIFIED_EDGES:
            continue  # FIXME? unqualified edges are not added to the graph

        evidences = edge_metadata.get("evidences")
        if not evidences:  # is none or is empty list
            logger.debug("No evidence for edge %s", edge)
            continue

        for evidence in evidences:
            citation = evidence.get("citation")

            if not citation:
                continue

            if "type" not in citation or "id" not in citation:
                continue

            # A missing summary is treated like an empty one instead of raising KeyError
            summary_text = (evidence.get("summary_text") or "").strip()

            if not summary_text or summary_text == placeholder_evidence:
                continue

            parser.control_parser.clear()
            citation_namespace = citation["type"].lower().strip()
            citation_namespace = pc.CITATION_NORMALIZER.get(citation_namespace, citation_namespace)
            parser.control_parser.citation_db = citation_namespace
            parser.control_parser.citation_db_id = citation["id"].strip()
            parser.control_parser.evidence = summary_text

            # The experiment context is optional, so only attach annotations when it is present
            experiment_context = evidence.get(EXPERIMENT_CONTEXT)
            if experiment_context is not None:
                annotations = parser.graph._clean_annotations(experiment_context)
                parser.control_parser.annotations.update(annotations)

            try:
                parser.parseString(bel_statement, line_number=i)
            except Exception as e:
                # Deliberately best-effort: one bad statement must not abort the whole import
                logger.warning("JGIF relation parse error: %s for %s", e, bel_statement)

    return graph
@open_file(0, mode="r")
def from_jgif_file(path: Union[str, TextIO]) -> BELGraph:
    """Load a BEL graph from a file holding JGIF JSON.

    The JSON document is read from ``path`` and converted with :func:`from_jgif`.

    :param path: A path or file-like
    """
    graph_jgif_dict = json.load(path)
    return from_jgif(graph_jgif_dict)
def from_jgif_gz(path: str) -> BELGraph:
    """Load a BEL graph from a gzip-compressed JGIF JSON file.

    :param path: The path to the gzip file, which is opened in text mode
    """
    with gzip.open(path, mode="rt") as gz_file:
        graph_jgif_dict = json.load(gz_file)
        return from_jgif(graph_jgif_dict)
def from_jgif_jsons(graph_json_str: str) -> BELGraph:
    """Parse a JGIF JSON string and build a BEL graph from it.

    :param graph_json_str: A string containing a JGIF JSON document
    """
    graph_jgif_dict = json.loads(graph_json_str)
    return from_jgif(graph_jgif_dict)
def to_jgif(graph):
    """Build a JGIF dictionary from a BEL graph.

    Nodes are listed in the order of their BEL representation. For each pair of connected
    nodes, one JGIF edge is emitted per relation, and all the evidences (BEL statement, edge
    key, and, when available, annotations, evidence text and citation) of the parallel BEL
    edges with that relation are collected in its ``metadata``.

    :param pybel.BELGraph graph: A BEL graph
    :return: A JGIF dictionary
    :rtype: dict

    .. warning::

        Untested! This format is not general purpose, and therefore time has not been heavily
        invested in it. If you want to use Cytoscape.js, we suggest using :func:`pybel.to_cx`
        instead.

    The example below shows how to output a BEL graph as a JGIF dictionary.

    .. code-block:: python

        import pybel
        from pybel.examples import sialic_acid_graph

        graph_jgif_json = pybel.to_jgif(sialic_acid_graph)

    If you want to write the graph directly to a file as JGIF, see :func:`to_jgif_file`.
    """
    # Cache of the BEL statement for each (source, target, relation) triple
    u_v_r_bel = {}

    nodes_entry = [
        {
            "id": node.md5,
            "label": node.as_bel(),
            "bel_function_type": node.function,
        }
        for node in sorted(graph, key=methodcaller("as_bel"))
    ]

    edges_entry = []

    # Walk the adjacency so each (u, v) pair is visited exactly once. Iterating graph.edges()
    # repeats the pair for every parallel edge, which emitted the same JGIF edges several times.
    for u in graph:
        for v, keyed_data in graph[u].items():
            relation_evidences = defaultdict(list)

            for key, data in keyed_data.items():
                relation = data[RELATION]

                if (u, v, relation) not in u_v_r_bel:
                    u_v_r_bel[u, v, relation] = graph.edge_to_bel(u, v, edge_data=data)

                evidence_dict = {
                    "bel_statement": u_v_r_bel[u, v, relation],
                    "key": key,
                }

                if ANNOTATIONS in data:
                    evidence_dict["experiment_context"] = data[ANNOTATIONS]

                if EVIDENCE in data:
                    evidence_dict["summary_text"] = data[EVIDENCE]

                if CITATION in data:
                    evidence_dict["citation"] = data[CITATION]

                relation_evidences[relation].append(evidence_dict)

            for relation, evidences in relation_evidences.items():
                edges_entry.append(
                    {
                        "source": u.md5,
                        "target": v.md5,
                        "relation": relation,
                        "label": u_v_r_bel[u, v, relation],
                        "metadata": {
                            "evidences": evidences,
                        },
                    }
                )

    return {
        "graph": {
            "metadata": dict(
                origin=dict(name="pybel", version=get_version()),
                **graph.document,
            ),
            "nodes": nodes_entry,
            "edges": edges_entry,
        },
    }
@open_file(1, mode="w")
def to_jgif_file(graph: BELGraph, file: Union[str, TextIO], **kwargs) -> None:
    """Serialize a BEL graph as JGIF JSON into a file.

    :param graph: A BEL graph
    :param file: A writable file or file-like
    :param kwargs: Extra keyword arguments passed on to :func:`json.dump`

    Writing to a file that is already open:

    .. code-block:: python

        from pybel.examples import sialic_acid_graph
        from pybel import to_jgif_file

        with open('graph.bel.jgif.json', 'w') as file:
            to_jgif_file(sialic_acid_graph, file)

    Writing to a file given by its path:

    .. code-block:: python

        from pybel.examples import sialic_acid_graph
        from pybel import to_jgif_file

        to_jgif_file(sialic_acid_graph, 'graph.bel.jgif.json')

    For big graphs, consider the gzipped variant :func:`to_jgif_gz`.
    """
    jgif_dict = to_jgif(graph)
    json.dump(jgif_dict, file, ensure_ascii=False, **kwargs)
def to_jgif_gz(graph, path: str, **kwargs) -> None:
    """Serialize a BEL graph as JGIF JSON into a gzip-compressed file.

    :param graph: A BEL graph
    :param path: The path of the gzip file, which is opened in text mode for writing
    :param kwargs: Extra keyword arguments passed on to :func:`json.dump`
    """
    with gzip.open(path, mode="wt") as gz_file:
        jgif_dict = to_jgif(graph)
        json.dump(jgif_dict, gz_file, ensure_ascii=False, **kwargs)
def to_jgif_jsons(graph: BELGraph, **kwargs) -> str:
    """Serialize a BEL graph to a JGIF JSON string.

    :param graph: A BEL graph
    :param kwargs: Extra keyword arguments passed on to :func:`json.dumps`
    :return: The JGIF JSON document, with non-ASCII characters left unescaped
    """
    jgif_dict = to_jgif(graph)
    return json.dumps(jgif_dict, ensure_ascii=False, **kwargs)
def post_jgif(graph: BELGraph, url: str, **kwargs) -> requests.Response:
    """Send a BEL graph as a JGIF JSON body in a POST request.

    :param graph: A BEL graph
    :param url: The URL to post to
    :param kwargs: Extra keyword arguments passed on to :func:`requests.post`
    :return: The response to the request
    """
    jgif_payload = to_jgif(graph)
    return requests.post(url, json=jgif_payload, **kwargs)