# Source code for pybel.io.hetionet.hetionet

# -*- coding: utf-8 -*-

"""Importer for Hetionet JSON."""

import bz2
import json
import logging
import os
from typing import Any, Mapping, Set, Tuple, Union
from urllib.request import urlretrieve

from tqdm import tqdm

from .constants import (
    ACTIVATES_ACTIONS, BINDS_ACTIONS, COMPOUND, DSL_MAP, GENE, HETIONET_PUBMED, INHIBITS_ACTIONS, PHARMACOLOGICAL_CLASS,
    QUALIFIED_MAPPING, REGULATES_ACTIONS, UNQUALIFIED_MAPPING,
)
from ...config import CACHE_DIRECTORY
from ...dsl import Abundance, Protein
from ...struct import BELGraph

# Names exported by a star-import of this module
__all__ = [
    'get_hetionet',
    'from_hetionet_json',
    'from_hetionet_gz',
    'from_hetionet_file',
]

logger = logging.getLogger(__name__)

# Location of the bz2-compressed Hetionet v1.0 JSON file on GitHub
JSON_BZ2_URL = 'https://github.com/hetio/hetionet/raw/master/hetnet/json/hetionet-v1.0.json.bz2'
# Where the downloaded file is kept inside the cache directory
PATH = os.path.join(CACHE_DIRECTORY, 'hetionet-v1.0.json.bz2')


def get_hetionet() -> BELGraph:
    """Get Hetionet from GitHub, cache, and convert to BEL.

    The compressed JSON is downloaded to ``PATH`` the first time this is called and is
    re-used from there afterwards. The download is written to a temporary sibling file
    that is only moved into place once the transfer has finished, so an interrupted
    download cannot leave a truncated file that later calls would mistake for a valid cache.

    :return: The BEL graph produced by :func:`from_hetionet_gz` from the cached file
    """
    if not os.path.exists(PATH):
        logger.warning('downloading hetionet from %s to %s', JSON_BZ2_URL, PATH)
        partial_path = PATH + '.part'
        try:
            urlretrieve(JSON_BZ2_URL, partial_path)  # noqa: S310
            os.replace(partial_path, PATH)
        finally:
            # The partial file is only still present if the download or the move failed
            if os.path.exists(partial_path):
                os.remove(partial_path)
    return from_hetionet_gz(PATH)


def from_hetionet_gz(path: str) -> BELGraph:
    """Get Hetionet from its BZ2-compressed JSON file.

    Despite the ``_gz`` suffix in the function name, the file is opened with :mod:`bz2`,
    not :mod:`gzip`.

    :param path: The path to the compressed Hetionet JSON file
    :return: The BEL graph produced by :func:`from_hetionet_file`
    """
    logger.info('opening %s', path)
    with bz2.open(path) as file:
        return from_hetionet_file(file)


def from_hetionet_file(file) -> BELGraph:
    """Get Hetionet from a JSON file.

    :param file: An open file-like object that :func:`json.load` can read the Hetionet JSON from
    :return: The BEL graph produced by :func:`from_hetionet_json`
    """
    logger.info('parsing json from %s', file)
    hetionet_dict = json.load(file)
    logger.info('converting hetionet dict to BEL')
    return from_hetionet_json(hetionet_dict)


def from_hetionet_json(
    hetionet_dict: Mapping[str, Any],
    use_tqdm: bool = True,
) -> BELGraph:
    """Convert a Hetionet dictionary to a BEL graph.

    :param hetionet_dict: The parsed Hetionet JSON. Its ``nodes`` entry is iterated as dictionaries with
     ``kind``, ``identifier``, and ``name`` keys, and its ``edges`` entry is iterated as edge dictionaries.
    :param use_tqdm: If true, wrap the edges in a progress bar and send per-edge messages through it
     instead of through the module logger.
    :return: The BEL graph that the edges were added to
    """
    graph = BELGraph(  # FIXME what metadata is appropriate?
        name='Hetionet',
        version='1.0',
        authors='Daniel Himmelstein',
    )
    # FIXME add namespaces
    # graph.namespace_pattern.update({})

    # Edges refer to nodes by (kind, identifier) only, so index the names for lookup
    kind_identifier_to_name = {
        (x['kind'], x['identifier']): x['name']
        for x in hetionet_dict['nodes']
    }

    edges = hetionet_dict['edges']

    if use_tqdm:
        edges = tqdm(edges, desc='Converting Hetionet', unit_scale=True)
        # Write messages through the progress bar so they do not garble its output
        it_logger = edges.write
    else:
        it_logger = logger.info

    for edge in edges:
        _add_edge(graph, edge, kind_identifier_to_name, it_logger)

    return graph


def _get_node(edge, key, kind_identifier_to_name) -> Union[Tuple[None, None, None, None], Tuple[str, str, str, str]]:
    """Get the type, namespace, identifier, and name of one end of a Hetionet edge.

    :param edge: A Hetionet edge dictionary
    :param key: The key in the edge under which the (kind, identifier) pair is stored,
     e.g. ``'source_id'`` or ``'target_id'``
    :param kind_identifier_to_name: A mapping from (kind, identifier) pairs to node names
    :return: A 4-tuple of the node's kind, namespace, identifier (as a string), and name. All four
     entries are ``None`` if the node's kind has no entry in ``DSL_MAP``.
    """
    node_type, node_identifier = edge[key]
    namespace = DSL_MAP.get(node_type)
    if namespace is None:
        return None, None, None, None
    # Look the name up before stringifying, since the index is keyed on the raw identifier
    node_name = kind_identifier_to_name[node_type, node_identifier]
    node_identifier = str(node_identifier)

    prefix_length = len(namespace)
    # Only treat the namespace as a redundant prefix when it is followed by a separator character.
    # Otherwise an identifier that merely begins with the same letters would lose its leading
    # characters plus one more.
    if (
        node_identifier.lower().startswith(namespace)
        and len(node_identifier) > prefix_length
        and not node_identifier[prefix_length].isalnum()
    ):
        node_identifier = node_identifier[1 + prefix_length:]  # remove redundant prefix and separator

    return node_type, namespace, node_identifier, node_name


def _add_edge(  # noqa: C901
    graph,
    edge,
    kind_identifier_to_name,
    it_logger,
) -> Union[None, str, Set[str]]:
    """Add a single Hetionet edge to the BEL graph.

    The edge is matched, in order, against ``QUALIFIED_MAPPING``, ``UNQUALIFIED_MAPPING``, and then
    two metaedges that are handled explicitly (compound-binds-gene and pharmacological
    class-includes-compound). Edges that match none of these are reported through ``it_logger``.

    :param graph: The BEL graph to add to
    :param edge: A Hetionet edge dictionary with ``source_id``, ``target_id``, ``kind``, and ``data`` keys.
     It is not modified.
    :param kind_identifier_to_name: A mapping from (kind, identifier) pairs to node names
    :param it_logger: A callable taking a single message string, used to report unhandled content
    :return: ``None`` if either node's kind is unsupported or no mapping matched; a set of the values
     returned by the graph-adding calls for qualified edges and compound-binds-gene edges; otherwise
     the value returned by the single graph-adding call.
    """
    source_type, source_ns, source_identifier, source_name = _get_node(edge, 'source_id', kind_identifier_to_name)
    target_type, target_ns, target_identifier, target_name = _get_node(edge, 'target_id', kind_identifier_to_name)
    if source_type is None or target_type is None:
        return None

    kind = edge['kind']

    # Work on a shallow copy: the keys consumed below are removed from ``data``. Removing them from
    # the caller's dictionary would strip the sources and PubMed identifiers out of the input, so
    # converting the same Hetionet dictionary a second time would silently lose them.
    data = dict(edge['data'])
    data.pop('unbiased', None)

    annotations = {}
    if 'source' in data:
        source = data.pop('source')
        annotations['source'] = {source: True}
    elif 'sources' in data:
        annotations['source'] = {
            source: True
            for source in data.pop('sources')
        }
    # Edges with neither a ``source`` nor a ``sources`` entry simply get no source annotation

    if 'pubmed_ids' in data:
        citations = list(data.pop('pubmed_ids'))
    else:
        citations = [HETIONET_PUBMED]

    # Every remaining scalar entry becomes an annotation of its own
    for k, v in data.items():
        if k in {'actions', 'urls', 'subtypes'}:
            continue  # handled explicitly later
        if not isinstance(v, (str, int, bool, float)):
            it_logger('Unhandled: {source_identifier}-{kind}-{target_identifier} {k}: {v}'.format(
                source_identifier=source_identifier, kind=kind, target_identifier=target_identifier,
                k=k, v=v,
            ))
            continue
        annotations[k] = {v: True}

    # Qualified edges are added once per citation
    for _h_type, h_dsl, _r, _t_type, t_dsl, f in QUALIFIED_MAPPING:
        if source_type != _h_type or kind != _r or target_type != _t_type:
            continue
        rv = set()
        for citation in citations:
            key = f(
                graph,
                h_dsl(namespace=source_ns, identifier=source_identifier, name=source_name),
                t_dsl(namespace=target_ns, identifier=target_identifier, name=target_name),
                citation=citation, evidence='', annotations=annotations,
            )
            rv.add(key)
        return rv

    # Unqualified edges carry no citation, evidence, or annotations
    for _h_type, h_dsl, _r, _t_type, t_dsl, f in UNQUALIFIED_MAPPING:
        if source_type == _h_type and kind == _r and target_type == _t_type:
            return f(
                graph,
                h_dsl(namespace=source_ns, identifier=source_identifier, name=source_name),
                t_dsl(namespace=target_ns, identifier=target_identifier, name=target_name),
            )

    def _check(_source_type: str, _kind: str, _target_type: str) -> bool:
        """Check the metaedge."""
        return kind == _kind and source_type == _source_type and target_type == _target_type

    if _check(COMPOUND, 'binds', GENE):
        drug = Abundance(namespace='drugbank', name=source_name, identifier=source_identifier)
        protein = Protein(namespace='ncbigene', name=target_name, identifier=target_identifier)

        # One BEL edge is added per listed action; the action decides which relation is used
        rv = set()
        for action in data.get('actions', []):
            action = action.lower()
            if action in ACTIVATES_ACTIONS:
                key = graph.add_directly_activates(
                    drug, protein, citation=HETIONET_PUBMED, evidence='', annotations=annotations,
                )
            elif action in INHIBITS_ACTIONS:
                key = graph.add_directly_inhibits(
                    drug, protein, citation=HETIONET_PUBMED, evidence='', annotations=annotations,
                )
            elif action in REGULATES_ACTIONS:
                key = graph.add_regulates(drug, protein, citation=HETIONET_PUBMED, evidence='', annotations=annotations)
            elif action in BINDS_ACTIONS:
                key = graph.add_binds(drug, protein, citation=HETIONET_PUBMED, evidence='', annotations=annotations)
            else:
                # Unknown actions fall back to a plain binding edge, and are reported
                key = graph.add_binds(drug, protein, citation=HETIONET_PUBMED, evidence='', annotations=annotations)
                it_logger('Unhandled action for {source_identifier}-{kind}-{target_identifier}: {action}'.format(
                    source_identifier=source_identifier, kind=kind, target_identifier=target_identifier, action=action,
                ))
            rv.add(key)
        return rv

    if _check(PHARMACOLOGICAL_CLASS, 'includes', COMPOUND):
        # The Hetionet edge points from the class to the compound, the BEL edge the other way round
        return graph.add_is_a(
            Abundance(namespace='drugbank', name=target_name, identifier=target_identifier),
            Abundance(namespace='drugcentral', name=source_name, identifier=source_identifier),
        )

    it_logger('missed: {edge}'.format(edge=edge))
    return None