Source code for pybel.io.line_utils

# -*- coding: utf-8 -*-

"""This module contains helper functions for reading BEL scripts."""

import logging
import os
import re
import time
from typing import Any, Iterable, List, Mapping, Optional, Tuple

from bel_resources import ResourceError, split_file_to_annotations_and_definitions
from pyparsing import ParseException
from sqlalchemy.exc import OperationalError
from tqdm.autonotebook import tqdm

from ..constants import INVERSE_DOCUMENT_KEYS, REQUIRED_METADATA
from ..exceptions import (
    BELParserWarning,
    BELSyntaxError,
    InconsistentDefinitionError,
    MalformedMetadataException,
    MissingMetadataException,
    PlaceholderAminoAcidWarning,
    VersionFormatWarning,
)
from ..manager import Manager
from ..parser import BELParser, MetadataParser
from ..struct.graph import BELGraph

# Public API of this module: only the top-level entry point is exported.
__all__ = [
    "parse_lines",
]

# Module logger: used below for timing and summary messages of each parsing phase.
logger = logging.getLogger(__name__)
# Logger named "pybel.parser": used below to report line-level failures
# in parse_definitions() and parse_statements().
parser_logger = logging.getLogger("pybel.parser")

# Matches the leading keywords "SET DOCUMENT", "DEFINE NAMESPACE", or "DEFINE ANNOTATION"
# (with any run of whitespace between the two words).
# NOTE(review): not referenced in the visible part of this module - confirm external users before removing.
METADATA_LINE_RE = re.compile(r"(SET\s+DOCUMENT|DEFINE\s+NAMESPACE|DEFINE\s+ANNOTATION)")
# Log line layout: "<line number>:<position> <exception class name> <message or line>".
LOG_FMT = "%d:%d %s %s"
# Same layout as LOG_FMT with a leading "<file name>:"; used by _log_parse_exception()
# when the graph has a path.
LOG_FMT_PATH = "%s:%d:%d %s %s"


def parse_lines(
    graph: BELGraph,
    lines: Iterable[str],
    manager: Optional[Manager] = None,
    disallow_nested: bool = False,
    citation_clearing: bool = True,
    use_tqdm: bool = False,
    tqdm_kwargs: Optional[Mapping[str, Any]] = None,
    no_identifier_validation: bool = False,
    disallow_unqualified_translocations: bool = False,
    allow_redefinition: bool = False,
    allow_definition_failures: bool = False,
    allow_naked_names: bool = False,
    required_annotations: Optional[List[str]] = None,
    upgrade_urls: bool = False,
) -> None:
    """Parse an iterable of lines into this graph.

    Splits the lines into document, definition, and statement sections, then
    delegates to :func:`parse_document`, :func:`parse_definitions`, and
    :func:`parse_statements` in that order. The graph is modified in place.

    :param graph: A BEL graph
    :param lines: An iterable over lines of BEL script
    :param manager: A PyBEL database manager. If none is given, a default
     :class:`Manager` is instantiated.
    :param disallow_nested: If true, turns on nested statement failures
    :param citation_clearing: Should :code:`SET Citation` statements clear evidence and all annotations?
     Delegated to :class:`pybel.parser.ControlParser`
    :param use_tqdm: Use :mod:`tqdm` to show a progress bar?
    :param tqdm_kwargs: Keywords to pass to ``tqdm``
    :param disallow_unqualified_translocations: If true, disallow translocations without TO and FROM clauses.
     Forwarded unchanged to :class:`BELParser`.
    :param required_annotations: Annotations that are required for all statements
    :param upgrade_urls: Automatically upgrade old namespace URLs. Defaults to false.

    .. warning::

        These options allow concessions for parsing BEL that is either **WRONG** or **UNSCIENTIFIC**. Use them at
        risk to reproducibility and validity of your results.

    :param no_identifier_validation: If true, turns off namespace validation
    :param allow_naked_names: If true, turns off naked namespace failures
    :param allow_redefinition: If true, doesn't fail on second definition of same name or annotation
    :param allow_definition_failures: If true, allows parsing to continue if a terminology file download/parse fails
    """
    docs, definitions, statements = split_file_to_annotations_and_definitions(lines)

    if manager is None:
        manager = Manager()

    # A single metadata parser is shared by the document and definitions phases so
    # that the terminologies it collects can be handed to the BEL parser below.
    metadata_parser = MetadataParser(
        manager,
        allow_redefinition=allow_redefinition,
        skip_validation=no_identifier_validation,
        upgrade_urls=upgrade_urls,
    )

    parse_document(
        graph,
        docs,
        metadata_parser,
    )

    parse_definitions(
        graph,
        definitions,
        metadata_parser,
        allow_failures=allow_definition_failures,
        use_tqdm=use_tqdm,
        tqdm_kwargs=tqdm_kwargs,
    )

    bel_parser = BELParser(
        graph=graph,
        # terminologies
        namespace_to_term_to_encoding=metadata_parser.namespace_to_term_to_encoding,
        namespace_to_pattern=metadata_parser.namespace_to_pattern,
        annotation_to_term=metadata_parser.annotation_to_term,
        annotation_to_pattern=metadata_parser.annotation_to_pattern,
        annotation_to_local=metadata_parser.annotation_to_local,
        # language settings
        disallow_nested=disallow_nested,
        citation_clearing=citation_clearing,
        skip_validation=no_identifier_validation,
        allow_naked_names=allow_naked_names,
        disallow_unqualified_translocations=disallow_unqualified_translocations,
        required_annotations=required_annotations,
    )

    parse_statements(
        graph,
        statements,
        bel_parser,
        use_tqdm=use_tqdm,
        tqdm_kwargs=tqdm_kwargs,
    )

    logger.info(
        "Network has %d nodes and %d edges",
        graph.number_of_nodes(),
        graph.number_of_edges(),
    )
def parse_document(
    graph: BELGraph,
    enumerated_lines: Iterable[Tuple[int, str]],
    metadata_parser: MetadataParser,
) -> None:
    """Parse the lines in the document section of a BEL script.

    A :class:`VersionFormatWarning` is logged and stored on the graph, after which
    parsing continues. Any other exception aborts parsing. For every entry of
    ``REQUIRED_METADATA`` that is absent after parsing, a
    :class:`MissingMetadataException` is put at the front of ``graph.warnings``.
    Finally the collected metadata is merged into ``graph.document``.

    :param graph: A BEL graph
    :param enumerated_lines: An enumerated iterable over the lines in the document section of a BEL script
    :param metadata_parser: A metadata parser
    :raises: MalformedMetadataException if a line fails with anything other than a version format warning
    """
    parse_document_start_time = time.time()

    for line_number, line in enumerated_lines:
        try:
            metadata_parser.parseString(line, line_number=line_number)
        except VersionFormatWarning as exc:
            _log_parse_exception(graph, exc)
            graph.add_warning(exc)
        except Exception as e:
            exc = MalformedMetadataException(line_number, line, 0)
            _log_parse_exception(graph, exc)
            raise exc from e

    for required in REQUIRED_METADATA:
        required_metadatum = metadata_parser.document_metadata.get(required)
        if required_metadatum is not None:
            continue

        required_metadatum_key = INVERSE_DOCUMENT_KEYS[required]
        # This has to be insert since it needs to go on the front!
        exc = MissingMetadataException.make(required_metadatum_key)
        graph.warnings.insert(0, (None, exc, {}))
        _log_parse_exception(graph, exc)

    graph.document.update(metadata_parser.document_metadata)

    logger.info(
        "Finished parsing document section in %.02f seconds",
        time.time() - parse_document_start_time,
    )


def parse_definitions(
    graph: BELGraph,
    enumerated_lines: Iterable[Tuple[int, str]],
    metadata_parser: MetadataParser,
    allow_failures: bool = False,
    use_tqdm: bool = False,
    tqdm_kwargs: Optional[Mapping[str, Any]] = None,
) -> None:
    """Parse the lines in the definitions section of a BEL script.

    After all lines are parsed, the namespace and annotation URLs, patterns, and
    local lists gathered by the metadata parser are copied onto the graph, and
    ``metadata_parser.ensure_resources()`` is called.

    :param graph: A BEL graph
    :param enumerated_lines: An enumerated iterable over the lines in the definitions section of a BEL script
    :param metadata_parser: A metadata parser
    :param allow_failures: If true, allows parser to continue past strange failures. Each tolerated
     failure is logged as a warning on the ``pybel.parser`` logger.
    :param use_tqdm: Use :mod:`tqdm` to show a progress bar? Requires reading all lines to memory.
    :param tqdm_kwargs: Keywords to pass to ``tqdm``
    :raises: pybel.parser.parse_exceptions.InconsistentDefinitionError
    :raises: pybel.resources.exc.ResourceError
    :raises: sqlalchemy.exc.OperationalError
    :raises: MalformedMetadataException on any other failure, unless ``allow_failures`` is true
    """
    parse_definitions_start_time = time.time()

    if use_tqdm:
        _tqdm_kwargs = dict(desc="Definitions", leave=False)
        if tqdm_kwargs:
            _tqdm_kwargs.update(tqdm_kwargs)
        # The lines are materialized so that tqdm can report a total.
        enumerated_lines = tqdm(list(enumerated_lines), **_tqdm_kwargs)

    for line_number, line in enumerated_lines:
        try:
            metadata_parser.parseString(line, line_number=line_number)
        except (InconsistentDefinitionError, ResourceError) as e:
            parser_logger.exception(LOG_FMT, line_number, 0, e.__class__.__name__, line)
            raise
        except OperationalError:
            parser_logger.warning(
                "Need to upgrade database. See http://pybel.readthedocs.io/en/latest/installation.html#upgrading",
            )
            raise
        except Exception as e:
            if not allow_failures:
                exc = MalformedMetadataException(line_number, line, 0)
                _log_parse_exception(graph, exc)
                raise exc from e
            # The failure is tolerated on request, but leave a trace of the skipped line
            # instead of dropping it silently.
            parser_logger.warning(LOG_FMT, line_number, 0, e.__class__.__name__, line)

    graph.namespace_url.update(metadata_parser.namespace_url_dict)
    graph.namespace_pattern.update(
        {keyword: pattern.pattern for keyword, pattern in metadata_parser.namespace_to_pattern.items()}
    )

    graph.annotation_url.update(metadata_parser.annotation_url_dict)
    graph.annotation_pattern.update(
        {keyword: pattern.pattern for keyword, pattern in metadata_parser.annotation_to_pattern.items()}
    )
    graph.annotation_list.update(metadata_parser.annotation_to_local)

    logger.info(
        "Finished parsing definitions section in %.02f seconds",
        time.time() - parse_definitions_start_time,
    )

    metadata_parser.ensure_resources()
    logger.info("Finished ensuring namespaces in cache")


def parse_statements(
    graph: BELGraph,
    enumerated_lines: Iterable[Tuple[int, str]],
    bel_parser: BELParser,
    use_tqdm: bool = True,
    tqdm_kwargs: Optional[Mapping[str, Any]] = None,
) -> None:
    """Parse a list of statements from a BEL Script.

    Syntax errors and parser warnings do not abort parsing: each is logged and
    stored on the graph together with the parser's current annotations. Any other
    exception is logged and re-raised.

    :param graph: A BEL graph
    :param enumerated_lines: An enumerated iterable over the lines in the statements section of a BEL script
    :param bel_parser: A BEL parser
    :param use_tqdm: Use :mod:`tqdm` to show a progress bar? Requires reading whole file to memory.
    :param tqdm_kwargs: Keywords to pass to ``tqdm``
    """
    parse_statements_start_time = time.time()

    if use_tqdm:
        # Copy so that the caller's mapping is never mutated by the defaults below.
        tqdm_kwargs = {} if tqdm_kwargs is None else dict(tqdm_kwargs)
        tqdm_kwargs.setdefault("desc", "Statements")
        tqdm_kwargs.setdefault("leave", False)
        enumerated_lines = tqdm(list(enumerated_lines), **tqdm_kwargs)

    for line_number, line in enumerated_lines:
        try:
            bel_parser.parseString(line, line_number=line_number)
        except ParseException as e:
            # Wrap the pyparsing error so it carries the line number and text.
            exc = BELSyntaxError(line_number, line, e.loc)
            _log_parse_exception(graph, exc)
            graph.add_warning(exc, bel_parser.get_annotations())
        except PlaceholderAminoAcidWarning as exc:
            exc.line_number = line_number
            _log_parse_exception(graph, exc)
            graph.add_warning(exc, bel_parser.get_annotations())
        except BELParserWarning as exc:
            _log_parse_exception(graph, exc)
            graph.add_warning(exc, bel_parser.get_annotations())
        except Exception:
            parser_logger.exception(LOG_FMT, line_number, 0, "General Failure", line)
            raise

    logger.info(
        "Parsed statements section in %.02f seconds with %d warnings",
        time.time() - parse_statements_start_time,
        len(graph.warnings),
    )


def _log_parse_exception(graph: BELGraph, exc: BELParserWarning) -> None:
    """Write a one-line report of a parser warning through :func:`tqdm.write`.

    The report holds the line number, position, exception class name, and message.
    It is prefixed with the base name of ``graph.path`` when the graph has one.

    :param graph: The BEL graph being parsed
    :param exc: The warning to report; its ``line_number`` and ``position`` attributes are read
    """
    if graph.path:
        s = LOG_FMT_PATH % (
            os.path.basename(graph.path),
            exc.line_number,
            exc.position,
            exc.__class__.__name__,
            exc,
        )
    else:
        s = LOG_FMT % (exc.line_number, exc.position, exc.__class__.__name__, exc)
    tqdm.write(s)