# Source code for pybel.io.graphdati

# -*- coding: utf-8 -*-

"""Conversion functions for BEL graphs with GraphDati.

Note that these are not exact I/O - you can't currently use them as a round trip because
the input functions expect the GraphDati format that's output by BioDati.
"""

import gzip
import json
import logging
from collections import defaultdict
from typing import Any, Iterable, List, Mapping, Optional, TextIO, Tuple, Union

import pyparsing
from networkx.utils import open_file
from tqdm.autonotebook import tqdm

from .jgif import NAMESPACE_TO_PATTERN
from ..canonicalize import edge_to_tuple
from ..constants import (
    ANNOTATIONS,
    CITATION,
    CITATION_TYPE_PUBMED,
    CITATION_TYPE_URL,
    EVIDENCE,
    IDENTIFIER,
    NAMESPACE,
    RELATION,
    UNQUALIFIED_EDGES,
)
from ..parser import BELParser
from ..struct import BELGraph
from ..typing import EdgeData

# Names exported by ``from pybel.io.graphdati import *``.
__all__ = [
    "to_graphdati",
    "from_graphdati",
    "to_graphdati_file",
    "from_graphdati_file",
    "to_graphdati_gz",
    "from_graphdati_gz",
    "to_graphdati_jsons",
    "from_graphdati_jsons",
    "to_graphdati_jsonl",
    "to_graphdati_jsonl_gz",
]

logger = logging.getLogger(__name__)

# Type alias for a single exported nanopub document: a string-keyed mapping
# whose values are themselves string-keyed mappings.
NanopubMapping = Mapping[str, Mapping[str, Any]]

# Value written to the ``schema_uri`` field of every exported nanopub.
SCHEMA_URI = "https://github.com/belbio/schemas/blob/master/schemas/nanopub_bel-1.0.0.yaml"
# Maps the prefix of an incoming citation (the text before the colon) to the
# citation type constant used on import. Prefixes missing from this table are
# rejected by :func:`_parse_biodati_citation`.
GRAPHDATI_PUBLICATION_TYPES = {
    "PMID": CITATION_TYPE_PUBMED,
    "http": CITATION_TYPE_URL,
    "https": CITATION_TYPE_URL,
}


@open_file(1, mode="w")
def to_graphdati_file(graph: BELGraph, path: Union[str, TextIO], use_identifiers: bool = True, **kwargs) -> None:
    """Write this graph as GraphDati JSON to a file.

    :param graph: A BEL graph
    :param path: A path or file-like. The ``open_file`` decorator is applied to this
        argument with mode ``"w"``.
    :param use_identifiers: Passed through to :func:`to_graphdati`
    :param kwargs: Extra keyword arguments passed through to :func:`json.dump`
    """
    nanopubs = to_graphdati(graph, use_identifiers=use_identifiers)
    # ensure_ascii=False keeps non-ASCII characters as-is instead of \uXXXX escapes
    json.dump(nanopubs, path, ensure_ascii=False, **kwargs)


@open_file(0, mode="r")
def from_graphdati_file(path: Union[str, TextIO]) -> BELGraph:
    """Load a file containing GraphDati JSON.

    The ``open_file`` decorator is applied to ``path`` with mode ``"r"`` so that a
    string path is accepted as well as an already opened file, as the signature
    promises. Without it, :func:`json.load` would be handed the string itself.

    :param path: A path or file-like
    :return: The BEL graph built by :func:`from_graphdati` from the decoded JSON
    """
    return from_graphdati(json.load(path))


def to_graphdati_gz(graph: BELGraph, path: str, **kwargs) -> None:
    """Write a graph as GraphDati JSON to a gzip file.

    :param graph: A BEL graph
    :param path: Path of the gzip file to write
    :param kwargs: Extra keyword arguments passed through to :func:`to_graphdati_file`
    """
    # The JSON is dumped with ensure_ascii=False, so non-ASCII characters reach the
    # text layer unescaped. Pin UTF-8 rather than relying on the locale's encoding.
    with gzip.open(path, "wt", encoding="utf-8") as file:
        to_graphdati_file(graph, file, **kwargs)


def from_graphdati_gz(path: str) -> BELGraph:
    """Read a graph as GraphDati JSON from a gzip file.

    :param path: Path of the gzip file to read
    :return: The BEL graph built by :func:`from_graphdati` from the decoded JSON
    """
    # Decode as UTF-8 explicitly instead of depending on the locale's default encoding.
    with gzip.open(path, "rt", encoding="utf-8") as file:
        return from_graphdati(json.load(file))


def to_graphdati_jsons(graph: BELGraph, *, use_identifiers: bool = True, **kwargs) -> str:
    """Dump this graph as a GraphDati JSON object to a string.

    :param graph: A BEL graph
    :param use_identifiers: Passed through to :func:`to_graphdati`
    :param kwargs: Extra keyword arguments passed through to :func:`json.dumps`
    :return: The JSON text, with non-ASCII characters left unescaped
    """
    nanopubs = to_graphdati(graph, use_identifiers=use_identifiers)
    return json.dumps(nanopubs, ensure_ascii=False, **kwargs)


def from_graphdati_jsons(s: str) -> BELGraph:
    """Load a graph from a GraphDati JSON string.

    :param s: A string containing GraphDati JSON
    :return: The BEL graph built by :func:`from_graphdati` from the decoded JSON
    """
    return from_graphdati(json.loads(s))


@open_file(1, mode="w")
def to_graphdati_jsonl(
    graph: BELGraph,
    file: Union[str, TextIO],
    use_identifiers: bool = True,
    use_tqdm: bool = True,
) -> None:
    """Write this graph as a GraphDati JSON lines file.

    Each nanopub is serialized with :func:`json.dumps` and printed on its own line.

    :param graph: A BEL graph
    :param file: A path or file-like. The ``open_file`` decorator is applied to this
        argument with mode ``"w"``.
    :param use_identifiers: Passed through to the nanopub generator
    :param use_tqdm: Show a progress bar while generating nanopubs
    """
    for nanopub in _iter_graphdati(graph, use_identifiers=use_identifiers, use_tqdm=use_tqdm):
        print(json.dumps(nanopub), file=file)


def to_graphdati_jsonl_gz(graph: BELGraph, path: str, **kwargs) -> None:
    """Write a graph as GraphDati JSONL to a gzip file.

    :param graph: A BEL graph
    :param path: Path of the gzip file to write
    :param kwargs: Extra keyword arguments passed through to :func:`to_graphdati_jsonl`
    """
    with gzip.open(path, "wt") as file:
        to_graphdati_jsonl(graph, file, **kwargs)


def to_graphdati(
    graph,
    *,
    use_identifiers: bool = True,
    skip_unqualified: bool = True,
    use_tqdm: bool = False,
    metadata_extras: Optional[Mapping[str, Any]] = None,
) -> List[NanopubMapping]:
    """Export a GraphDati list using the nanopub.

    :param graph: A BEL graph
    :param use_identifiers: use OBO-style identifiers
    :param skip_unqualified: If true, edges whose relation is one of the unqualified
        edge types are left out of the output. Defaults to true.
    :param use_tqdm: Show a progress bar while generating nanopubs
    :param metadata_extras: Extra information to pass into the metadata part of nanopubs
    :return: A list with one nanopub dictionary per exported edge
    """
    return list(
        _iter_graphdati(
            graph,
            use_identifiers=use_identifiers,
            skip_unqualified=skip_unqualified,
            metadata_extras=metadata_extras,
            use_tqdm=use_tqdm,
        )
    )


def _iter_graphdati(
    graph,
    *,
    skip_unqualified: bool = True,
    use_identifiers: bool = True,
    use_tqdm: bool = False,
    metadata_extras: Optional[Mapping[str, Any]] = None,
) -> Iterable[NanopubMapping]:
    """Lazily yield one nanopub for each exported edge of the graph.

    :param graph: A BEL graph
    :param skip_unqualified: If true, edges whose relation is in ``UNQUALIFIED_EDGES`` are not yielded
    :param use_identifiers: Passed through to the nanopub builder
    :param use_tqdm: Wrap the edge iteration in a progress bar
    :param metadata_extras: Extra entries merged into each nanopub's metadata
    """
    edges = graph.edges(keys=True, data=True)
    if use_tqdm:
        edges = tqdm(edges, total=graph.number_of_edges(), desc="iterating as nanopubs")
    for source, target, key, data in edges:
        # The relation is only looked up when skipping was requested
        if not skip_unqualified or data[RELATION] not in UNQUALIFIED_EDGES:
            yield _make_nanopub(graph, source, target, key, data, use_identifiers, metadata_extras=metadata_extras)


def _make_nanopub(graph: BELGraph, u, v, k, d, use_identifiers, metadata_extras=None) -> NanopubMapping:
    """Assemble the nanopub document for the edge ``(u, v, k)`` with data ``d``.

    The edge key ``k`` is used to build the nanopub's ``id``.
    """
    body = {
        "schema_uri": SCHEMA_URI,
        "type": {"name": "BEL", "version": "2.1.0"},
        "annotations": _get_annotations(d),
        "citation": _get_citation(d),
        "assertions": _get_assertions(u, v, d, use_identifiers),
        "evidence": _get_evidence(d),
        "metadata": _get_metadata(graph, d, extras=metadata_extras),
        "id": "pybel_{}".format(k),
    }
    return {"nanopub": body}


def _get_assertions(u, v, d, use_identifiers):
    """Return a one-element list holding the subject/relation/object dictionary for an edge."""
    field_names = ("subject", "relation", "object")
    field_values = edge_to_tuple(u, v, d, use_identifiers=use_identifiers)
    assertion = dict(zip(field_names, field_values))
    return [assertion]


def _get_evidence(d):
    """Return the edge's evidence entry, or a placeholder when the edge has none."""
    placeholder = "Not Available"
    evidence = d.get(EVIDENCE, placeholder)
    return evidence


def _get_citation(d):
    """Build the nanopub ``citation`` entry from the edge data.

    Edges without a citation get a placeholder ``reference``; otherwise the citation's
    namespace and identifier are reported under ``database``.
    """
    citation = d.get(CITATION)
    if citation is None:
        return {"reference": "Not Available"}
    return {
        "database": {
            "name": citation[NAMESPACE],
            "id": citation[IDENTIFIER],
        },
    }


def _get_metadata(graph: BELGraph, _, extras=None):
    rv = dict(
        gd_creator=graph.authors,
        version=graph.version,
    )  # TODO later
    if extras is not None:
        rv.update(extras)
    return rv


def _get_annotations(d: EdgeData) -> List[Mapping[str, str]]:
    """Flatten the edge's annotations into a list of GraphDati annotation dictionaries.

    A dictionary of values yields one entry per key of that dictionary; any other value
    yields a single entry whose ``id`` is ``str(value)``.
    """

    def _entry(label, identifier):
        return {
            "type": "Evidence",
            "label": label,
            "id": str(identifier),
        }

    entries = []
    for label, values in d.get(ANNOTATIONS, {}).items():
        if not isinstance(values, dict):
            # NOTE(review): non-dict collections (e.g. lists or sets) are stringified
            # as a whole here - confirm this is intended.
            entries.append(_entry(label, values))
            continue
        entries.extend(_entry(label, value) for value in values)
    return entries


def from_graphdati(j, use_tqdm: bool = True) -> BELGraph:
    """Convert data from the "normal" network format.

    Each edge's ``label`` is parsed as a BEL statement once for every entry in the edge's
    ``metadata.nanopub_data`` list, so that every nanopub contributes its own citation
    and annotations.

    .. warning:: BioDati crashes when requesting the ``full`` network format, so this isn't yet explicitly supported

    :param j: A dictionary with a ``graph`` entry that contains ``metadata`` and ``edges``
    :param use_tqdm: Show a progress bar while iterating over the edges
    :return: A BEL graph populated by parsing the edges' BEL statements
    :raises KeyError: if a required entry (e.g., ``graph``, ``metadata``, or ``edges``) is missing
    """
    root = j["graph"]
    graph = BELGraph(
        name=root.get("label"),
        version=root["metadata"].get("gd_rev"),
        authors=root["metadata"].get("gd_creator"),
        description=root.get("gd_description"),
    )
    # Just in case you want to find it again
    graph.graph["biodati_network_id"] = root["metadata"]["id"]

    parser = BELParser(
        graph=graph,
        namespace_to_pattern=NAMESPACE_TO_PATTERN,  # To be updated manually depending on what William is up to
    )

    it = root["edges"]
    if use_tqdm:
        it = tqdm(it, desc="iterating edges")

    for i, edge in enumerate(it):
        relation = edge.get("relation")
        if relation is None:
            # Only a warning: the edge is still processed below if it has a label
            logger.warning("no relation for edge: %s", edge)

        if relation in {"actsIn", "translocates"}:
            continue  # don't need legacy BEL format

        bel_statement = edge.get("label")  # this is actually the BEL statement
        if bel_statement is None:
            logger.debug("No BEL statement for edge %s", edge)
            continue

        # Fill up that sweet, sweet metadata
        metadata_entries = edge["metadata"]["nanopub_data"]
        for metadata in metadata_entries:
            # Reset the parser's control state so nothing leaks over from the previous nanopub
            parser.control_parser.clear()

            citation = metadata["citation_id"]  # as CURIE
            citation_db, citation_id = _parse_biodati_citation(citation)
            if citation_db is None:
                # Citation could not be interpreted (already logged), so skip this nanopub
                continue
            parser.control_parser.citation_db = citation_db
            parser.control_parser.citation_db_id = citation_id

            # FIXME where is the evidence/support/summary text?
            parser.control_parser.evidence = "No evidence available from BioDati"

            nanopub_id = metadata["nanopub_id"]
            parser.control_parser.annotations["biodati_nanopub_id"] = [nanopub_id]

            annotations = metadata["annotations"]
            parser.control_parser.annotations.update(_parse_biodati_annotations(annotations))

            # Finally, parse the BEL statement (once to go with each set of metadata)
            # TODO change parser to give back pre-compiled info so this doesn't need to be repeated
            try:
                parser.parseString(bel_statement, line_number=i)
            except pyparsing.ParseException as e:
                logger.warning("parse error for %s: %s", bel_statement, e)

    return graph


def _parse_biodati_citation(citation: str) -> Union[Tuple[None, None], Tuple[str, str]]:
    """Split a citation of the form ``prefix:identifier`` into a citation type and identifier.

    Only the first colon separates the prefix from the identifier, so identifiers that
    contain further colons (for example, a URL with a port number) are kept intact
    rather than being rejected.

    :param citation: The citation string, e.g., ``PMID:12345``
    :return: A pair of the citation type (looked up in ``GRAPHDATI_PUBLICATION_TYPES``) and the
        identifier, or ``(None, None)`` if there is no colon or the prefix is not recognized
    """
    prefix, separator, citation_id = citation.partition(":")
    if not separator:
        logger.warning("structured citation not available for %s", citation)
        return None, None

    try:
        citation_db = GRAPHDATI_PUBLICATION_TYPES[prefix]
    except KeyError:
        logger.warning("invalid citation structure: %s", citation)
        return None, None

    return citation_db, citation_id


def _parse_biodati_annotations(annotations: List[Mapping[str, str]]) -> Mapping[str, Mapping[str, bool]]:
    rv = defaultdict(set)
    for annotation in annotations:
        annotation_curie = annotation["id"]
        annotation_prefix, annotation_id = annotation_curie.split(":", 1)
        rv[annotation_prefix].add(annotation_id)
    return dict(rv)