Source code for alogos._grammar.parsing.parsing_with_lark

import lark as _lark

from ... import exceptions as _exceptions
from ..._utilities import argument_processing as _ap
from ..._utilities.operating_system import NEWLINE as _NEWLINE
from .. import data_structures as _data_structures


# Shortcuts
_NT = _data_structures.NonterminalSymbol
_T = _data_structures.TerminalSymbol
_BACKSLASH = "\\"  # Caution: r'\' is not a valid string literal


[docs]def parse_string(grammar, string, parser, get_multiple_trees, max_num_trees=None): """Parse a string with Lark and construct a derivation tree of this package. The derivation tree is built with the data structure defined in this package, not with the parse tree object provided by Lark. The reason is that the derivation tree is subsequently used for various tasks such as different traversals, extracting strings or derivations and visualization. Therefore it desirable to have it decoupled from the parser library and that it comes with methods for the mentioned tasks. References ---------- - https://github.com/lark-parser/lark - https://github.com/lark-parser/lark/blob/master/lark/exceptions.py """ # Argument processing string = _ap.str_arg("string", string) parser = _ap.str_arg("parser", parser, vals=["earley", "lalr"]) get_multiple_trees = _ap.bool_arg( "get_multiple_trees", get_multiple_trees, default=False ) max_num_trees = _ap.int_arg( "max_num_trees", max_num_trees, default=1_000_000_000_000 ) if get_multiple_trees and parser != "earley": _exceptions.raise_lark_parser_mult_error() # Caching: Create the Lark parser once and reuse it in subsequent calls parser_id = ( parser if parser == "lalr" else "{}{}".format(parser, int(get_multiple_trees)) ) lark_parser, nt_map_fwd, nt_map_rev = grammar._lookup_or_calc( "lark", parser_id, _calc_lark_parser_and_nt_maps, grammar, parser, get_multiple_trees, ) # Parsing try: tree_s = lark_parser.parse(string) except Exception as excp: _exceptions._raise_parser_string_error(excp) # Tree conversion if get_multiple_trees: tree_s = _ambig_lark_tree_to_dts( grammar, tree_s, nt_map_fwd, nt_map_rev, max_num_trees ) else: tree_s = _lark_tree_to_dt(grammar, tree_s, nt_map_rev) return tree_s
# Result conversion def _lark_tree_to_dt(grammar, lark_tree, nt_map_rev): """Convert a parse tree of lark-parser into a derivation tree of this package.""" def get_label(lark_node): if isinstance(lark_node, _lark.Tree): return _NT(nt_map_rev[lark_node.data].text) return _T(lark_node.value) # Generate the derivation tree dt = _data_structures.DerivationTree(grammar) dt.root_node = _data_structures.Node(grammar.start_symbol) # Traverse the Lark tree via dfs and generate the go derivation tree node by node lark_stack = [lark_tree] ori_stack = [dt.root_node] while lark_stack: lark_node = lark_stack.pop() ori_node = ori_stack.pop() if isinstance(lark_node, _lark.Tree): lark_child_nodes = lark_node.children child_symbols = [get_label(child) for child in lark_child_nodes] if not child_symbols: # Empty string is not a node in the Lark tree and hence needs special treatment empty_string = _T("") child_symbols = [empty_string] dt._expand(ori_node, child_symbols) continue ori_child_nodes = dt._expand(ori_node, child_symbols) lark_stack.extend(lark_child_nodes) ori_stack.extend(ori_child_nodes) return dt def _ambig_lark_tree_to_dts(grammar, lark_tree, nt_map_fwd, nt_map_rev, max_num_trees): """Convert an ambiguous parse tree of lark-parser into a list of derivation trees.""" # Get all trees from the forest provided by Lark stack = [lark_tree] lark_start_symbol = nt_map_fwd[grammar.start_symbol] valid_trees = [] while stack: candidate_tree = stack.pop() node_label = candidate_tree.data # "_ambig" in a node means that multiple trees are present in the child nodes if node_label == "_ambig": new_candidate_trees = candidate_tree.children stack.extend(new_candidate_trees) elif node_label == lark_start_symbol: valid_trees.append(candidate_tree) # Restrict number of trees if len(valid_trees) >= max_num_trees: break else: _exceptions._raise_parser_node_error(node_label) # Convert Lark trees to derivation trees of this package dts = [_lark_tree_to_dt(grammar, tree, nt_map_rev) for tree in valid_trees] return dts # Cached calculations def _calc_lark_parser_and_nt_maps(grammar, parser, get_multiple_trees): """Create all Lark objects in a single function. This means that whenever a parser is generated, the same nonterminal map and grammar are generated again. This redundant calculations are accepted in order to improve lookup speed. """ # Nonterminal maps nt_map_fwd, nt_map_rev = grammar._lookup_or_calc( "lark", "nt_maps", _calc_nonterminal_maps, grammar ) # Lark grammar lark_grammar = grammar._lookup_or_calc( "lark", "grammar", _calc_lark_grammar, grammar, nt_map_fwd ) # Lark parser: different parsers are created from the same nonterminal maps and Lark grammar lark_parser = _calc_lark_parser( grammar, parser, nt_map_fwd, lark_grammar, get_multiple_trees ) return lark_parser, nt_map_fwd, nt_map_rev def _calc_nonterminal_maps(grammar): """Create a mapping between nonterminals of the original grammar and simplified symbols.""" nt_map_fwd = { nt: "nt{}".format(i) for i, nt in enumerate(grammar.nonterminal_symbols) } nt_map_rev = {val: key for key, val in nt_map_fwd.items()} return nt_map_fwd, nt_map_rev def _calc_lark_grammar(grammar, nt_map_fwd): """Convert the grammar object into an EBNF-like text that Lark is guaranteed to recognize. References ---------- - https://lark-parser.readthedocs.io/en/latest/grammar """ parts = [] for lhs, rhs_multiple in grammar.production_rules.items(): # Add a new lhs parts.append(_NEWLINE) used_text = nt_map_fwd[lhs] parts.append(used_text + ": ") # Add one or several rhs to this new lhs. If several, they are separted by | as "or" symbol for rhs in rhs_multiple: for symbol in rhs: if isinstance(symbol, _NT): parts.append(nt_map_fwd[symbol] + " ") else: parts.append(_to_lark_terminal(symbol.text) + " ") parts.append("| ") parts.pop() # Remove last separator symbol parts.pop(0) # Remove first newline lark_grammar = "".join(parts) return lark_grammar def _calc_lark_parser(grammar, parser, nt_map_fwd, lark_grammar, get_multiple_trees): """Create a lark parser with or without ambiguity detection. References ---------- - https://lark-parser.readthedocs.io/en/latest/classes/ """ # Argument processing specific_options = dict() if parser == "earley" and get_multiple_trees: specific_options = dict(ambiguity="explicit") # Create the parser try: lark_start_symbol = nt_map_fwd[grammar.start_symbol] lark_parser = _lark.Lark( grammar=lark_grammar, parser=parser, start=lark_start_symbol, keep_all_tokens=True, **specific_options, ) except Exception as excp: _exceptions._raise_parser_creation_error(excp) return lark_parser def _to_lark_terminal(symbol): """Prevent problems with certain characters inside the text of a terminal. References ---------- - https://stackoverflow.com/questions/647769/why-cant-pythons-raw-string-literals-end-with-a-single-backslash """ if symbol: # A non-even number of backslashes at the end of a symbol would escape the last quote if symbol.endswith(_BACKSLASH): num_trailing_backslashes = len(symbol) - len(symbol.rstrip(_BACKSLASH)) uneven_num_trailing_backslashes = num_trailing_backslashes % 2 == 1 if uneven_num_trailing_backslashes: symbol = symbol + _BACKSLASH # A double quote symbol would end the terminal too early and lead to downstream problems if '"' in symbol: symbol = symbol.replace('"', r"\"") # For some reason a newline symbol is not allowed and needs to be replaced with its # escape sequence representation if "\n" in symbol: symbol = symbol.replace("\n", r"\n") # Enclose the safe symbol in double quotes for Lark lark_terminal_symbol = '"{}"'.format(symbol) else: lark_terminal_symbol = "" return lark_terminal_symbol