# -*- coding: utf-8 -*-
"""This module contains helper functions for reading BEL scripts."""
import logging
import os
import re
import time
from typing import Any, Iterable, List, Mapping, Optional, Tuple
from bel_resources import ResourceError, split_file_to_annotations_and_definitions
from pyparsing import ParseException
from sqlalchemy.exc import OperationalError
from tqdm.autonotebook import tqdm
from ..constants import INVERSE_DOCUMENT_KEYS, REQUIRED_METADATA
from ..exceptions import (
BELParserWarning,
BELSyntaxError,
InconsistentDefinitionError,
MalformedMetadataException,
MissingMetadataException,
PlaceholderAminoAcidWarning,
VersionFormatWarning,
)
from ..manager import Manager
from ..parser import BELParser, MetadataParser
from ..struct.graph import BELGraph
# Public API of this module: a star-import exposes only ``parse_lines``.
__all__ = [
"parse_lines",
]
# Logger named after this module; used for section timing and summary messages.
logger = logging.getLogger(__name__)
# Logger registered under the fixed name "pybel.parser"; the parsing functions
# below use it to report failures on individual lines.
parser_logger = logging.getLogger("pybel.parser")
# Matches the opening keywords of a metadata line: ``SET DOCUMENT``,
# ``DEFINE NAMESPACE``, or ``DEFINE ANNOTATION`` (any run of whitespace between
# the two words). NOTE(review): not referenced in the visible part of this
# module - presumably used by callers elsewhere; confirm before removing.
METADATA_LINE_RE = re.compile(r"(SET\s+DOCUMENT|DEFINE\s+NAMESPACE|DEFINE\s+ANNOTATION)")
# Message template: "<line number>:<position> <label> <text>".
LOG_FMT = "%d:%d %s %s"
# Same layout as LOG_FMT with a leading "<file name>:" component.
LOG_FMT_PATH = "%s:%d:%d %s %s"
def parse_lines(
    graph: BELGraph,
    lines: Iterable[str],
    manager: Optional[Manager] = None,
    disallow_nested: bool = False,
    citation_clearing: bool = True,
    use_tqdm: bool = False,
    tqdm_kwargs: Optional[Mapping[str, Any]] = None,
    no_identifier_validation: bool = False,
    disallow_unqualified_translocations: bool = False,
    allow_redefinition: bool = False,
    allow_definition_failures: bool = False,
    allow_naked_names: bool = False,
    required_annotations: Optional[List[str]] = None,
    upgrade_urls: bool = False,
) -> None:
    """Parse an iterable of lines into this graph.

    The lines are split into document, definition, and statement sections, which are then handled in that order
    by :func:`parse_document`, :func:`parse_definitions`, and :func:`parse_statements`. The graph is modified
    in place; nothing is returned.

    :param graph: A BEL graph
    :param lines: An iterable over lines of BEL script
    :param manager: A PyBEL database manager. If none is given, a new :class:`Manager` is instantiated.
    :param disallow_nested: If true, turns on nested statement failures
    :param citation_clearing: Should :code:`SET Citation` statements clear evidence and all annotations?
                              Delegated to :class:`pybel.parser.ControlParser`
    :param use_tqdm: Use :mod:`tqdm` to show a progress bar?
    :param tqdm_kwargs: Keywords to pass to ``tqdm``
    :param disallow_unqualified_translocations: If true, disallow translocations without TO and FROM clauses.
    :param required_annotations: Annotations that are required for all statements
    :param upgrade_urls: Automatically upgrade old namespace URLs. Defaults to false.

    .. warning::

        These options allow concessions for parsing BEL that is either **WRONG** or **UNSCIENTIFIC**. Use them at
        risk to reproducibility and validity of your results.

    :param no_identifier_validation: If true, turns off namespace validation
    :param allow_naked_names: If true, turns off naked namespace failures
    :param allow_redefinition: If true, doesn't fail on second definition of same name or annotation
    :param allow_definition_failures: If true, allows parsing to continue if a terminology file download/parse fails
    """
    docs, definitions, statements = split_file_to_annotations_and_definitions(lines)

    if manager is None:
        manager = Manager()

    # A single metadata parser handles both the document and the definitions sections, so the terminologies
    # it collects can be handed to the statement parser afterwards.
    metadata_parser = MetadataParser(
        manager,
        allow_redefinition=allow_redefinition,
        skip_validation=no_identifier_validation,
        upgrade_urls=upgrade_urls,
    )

    parse_document(
        graph,
        docs,
        metadata_parser,
    )

    parse_definitions(
        graph,
        definitions,
        metadata_parser,
        allow_failures=allow_definition_failures,
        use_tqdm=use_tqdm,
        tqdm_kwargs=tqdm_kwargs,
    )

    bel_parser = BELParser(
        graph=graph,
        # terminologies
        namespace_to_term_to_encoding=metadata_parser.namespace_to_term_to_encoding,
        namespace_to_pattern=metadata_parser.namespace_to_pattern,
        annotation_to_term=metadata_parser.annotation_to_term,
        annotation_to_pattern=metadata_parser.annotation_to_pattern,
        annotation_to_local=metadata_parser.annotation_to_local,
        # language settings
        disallow_nested=disallow_nested,
        citation_clearing=citation_clearing,
        skip_validation=no_identifier_validation,
        allow_naked_names=allow_naked_names,
        disallow_unqualified_translocations=disallow_unqualified_translocations,
        required_annotations=required_annotations,
    )

    parse_statements(
        graph,
        statements,
        bel_parser,
        use_tqdm=use_tqdm,
        tqdm_kwargs=tqdm_kwargs,
    )

    logger.info(
        "Network has %d nodes and %d edges",
        graph.number_of_nodes(),
        graph.number_of_edges(),
    )
def parse_document(
    graph: BELGraph,
    enumerated_lines: Iterable[Tuple[int, str]],
    metadata_parser: MetadataParser,
) -> None:
    """Parse the lines in the document section of a BEL script.

    :param graph: A BEL graph; it receives the warnings and the parsed document metadata
    :param enumerated_lines: An iterable of (line number, line) pairs from the document section
    :param metadata_parser: A metadata parser
    :raises MalformedMetadataException: if a line fails with anything other than a version format warning
    """
    started_at = time.time()

    for number, text in enumerated_lines:
        try:
            metadata_parser.parseString(text, line_number=number)
        except VersionFormatWarning as version_warning:
            # A badly formatted version is recorded on the graph, but parsing goes on.
            _log_parse_exception(graph, version_warning)
            graph.add_warning(version_warning)
        except Exception as cause:
            malformed = MalformedMetadataException(number, text, 0)
            _log_parse_exception(graph, malformed)
            raise malformed from cause

    # Collect the required metadata entries that are still unset after reading the whole section.
    missing = [
        required
        for required in REQUIRED_METADATA
        if metadata_parser.document_metadata.get(required) is None
    ]
    for required in missing:
        missing_warning = MissingMetadataException.make(INVERSE_DOCUMENT_KEYS[required])
        # Inserted at index 0 (not appended) so that it sits at the front of the warning list.
        graph.warnings.insert(0, (None, missing_warning, {}))
        _log_parse_exception(graph, missing_warning)

    graph.document.update(metadata_parser.document_metadata)

    elapsed = time.time() - started_at
    logger.info("Finished parsing document section in %.02f seconds", elapsed)
def parse_definitions(
    graph: BELGraph,
    enumerated_lines: Iterable[Tuple[int, str]],
    metadata_parser: MetadataParser,
    allow_failures: bool = False,
    use_tqdm: bool = False,
    tqdm_kwargs: Optional[Mapping[str, Any]] = None,
) -> None:
    """Parse the lines in the definitions section of a BEL script.

    After all lines are parsed, the namespace/annotation URLs, patterns, and local annotation lists collected
    by the metadata parser are copied onto the graph and ``metadata_parser.ensure_resources()`` is called.

    :param graph: A BEL graph
    :param enumerated_lines: An enumerated iterable over the lines in the definitions section of a BEL script
    :param metadata_parser: A metadata parser
    :param allow_failures: If true, allows parser to continue past strange failures. Each skipped line is
        reported as a warning on the ``pybel.parser`` logger.
    :param use_tqdm: Use :mod:`tqdm` to show a progress bar? Requires reading the whole section to memory.
    :param tqdm_kwargs: Keywords to pass to ``tqdm``
    :raises InconsistentDefinitionError: re-raised from the metadata parser, regardless of ``allow_failures``
    :raises ResourceError: re-raised from the metadata parser, regardless of ``allow_failures``
    :raises sqlalchemy.exc.OperationalError: re-raised from the metadata parser, regardless of ``allow_failures``
    :raises MalformedMetadataException: on any other failure, unless ``allow_failures`` is true
    """
    parse_definitions_start_time = time.time()

    if use_tqdm:
        _tqdm_kwargs = dict(desc="Definitions", leave=False)
        if tqdm_kwargs:
            _tqdm_kwargs.update(tqdm_kwargs)
        # The lines are materialized so tqdm can show a total.
        enumerated_lines = tqdm(list(enumerated_lines), **_tqdm_kwargs)

    for line_number, line in enumerated_lines:
        try:
            metadata_parser.parseString(line, line_number=line_number)
        except (InconsistentDefinitionError, ResourceError) as e:
            parser_logger.exception(LOG_FMT, line_number, 0, e.__class__.__name__, line)
            raise
        except OperationalError:
            parser_logger.warning(
                "Need to upgrade database. See http://pybel.readthedocs.io/en/latest/installation.html#upgrading",
            )
            raise
        except Exception as e:
            if allow_failures:
                # Best-effort mode: keep going, but leave a trace instead of swallowing the failure silently.
                parser_logger.warning(
                    "Ignoring failed definition (allow_failures is set): " + LOG_FMT,
                    line_number,
                    0,
                    e.__class__.__name__,
                    line,
                )
                continue
            exc = MalformedMetadataException(line_number, line, 0)
            _log_parse_exception(graph, exc)
            raise exc from e

    graph.namespace_url.update(metadata_parser.namespace_url_dict)
    # Only the pattern source strings are stored on the graph, not the compiled pattern objects.
    graph.namespace_pattern.update(
        {keyword: pattern.pattern for keyword, pattern in metadata_parser.namespace_to_pattern.items()}
    )

    graph.annotation_url.update(metadata_parser.annotation_url_dict)
    graph.annotation_pattern.update(
        {keyword: pattern.pattern for keyword, pattern in metadata_parser.annotation_to_pattern.items()}
    )
    graph.annotation_list.update(metadata_parser.annotation_to_local)

    logger.info(
        "Finished parsing definitions section in %.02f seconds",
        time.time() - parse_definitions_start_time,
    )

    metadata_parser.ensure_resources()
    logger.info("Finished ensuring namespaces in cache")
def parse_statements(
    graph: BELGraph,
    enumerated_lines: Iterable[Tuple[int, str]],
    bel_parser: BELParser,
    use_tqdm: bool = True,
    tqdm_kwargs: Optional[Mapping[str, Any]] = None,
) -> None:
    """Parse a list of statements from a BEL Script.

    Syntax errors and parser warnings are logged and stored on the graph, then parsing moves on to the next
    line. Any other exception is logged and re-raised.

    :param graph: A BEL graph
    :param enumerated_lines: An enumerated iterable over the lines in the statements section of a BEL script
    :param bel_parser: A BEL parser
    :param use_tqdm: Use :mod:`tqdm` to show a progress bar? Requires reading whole file to memory.
    :param tqdm_kwargs: Keywords to pass to ``tqdm``
    """

    def _record(warning: BELParserWarning) -> None:
        """Log the warning, then store it on the graph with the parser's current annotations."""
        _log_parse_exception(graph, warning)
        graph.add_warning(warning, bel_parser.get_annotations())

    started_at = time.time()

    if use_tqdm:
        # Caller-supplied keywords take precedence over the two defaults.
        progress_options = {"desc": "Statements", "leave": False, **(tqdm_kwargs or {})}
        enumerated_lines = tqdm(list(enumerated_lines), **progress_options)

    for number, text in enumerated_lines:
        try:
            bel_parser.parseString(text, line_number=number)
        except ParseException as parse_error:
            _record(BELSyntaxError(number, text, parse_error.loc))
        except PlaceholderAminoAcidWarning as placeholder_warning:
            # The line number is assigned here before the warning is recorded.
            placeholder_warning.line_number = number
            _record(placeholder_warning)
        except BELParserWarning as parser_warning:
            _record(parser_warning)
        except Exception:
            parser_logger.exception(LOG_FMT, number, 0, "General Failure", text)
            raise

    elapsed = time.time() - started_at
    logger.info(
        "Parsed statements section in %.02f seconds with %d warnings",
        elapsed,
        len(graph.warnings),
    )
def _log_parse_exception(graph: BELGraph, exc: BELParserWarning):
    """Write a one-line description of a parser warning through ``tqdm.write``.

    The message is prefixed with the base name of ``graph.path`` when that path is set.

    :param graph: The BEL graph being parsed; only its ``path`` attribute is read
    :param exc: The warning to report; its ``line_number`` and ``position`` attributes are read
    """
    if graph.path:
        template = LOG_FMT_PATH
        prefix = (os.path.basename(graph.path),)
    else:
        template = LOG_FMT
        prefix = ()

    message = template % (
        *prefix,
        exc.line_number,
        exc.position,
        exc.__class__.__name__,
        exc,
    )
    # tqdm.write is used rather than print/logging; presumably so an active progress bar is not garbled.
    tqdm.write(message)