import re as _re
from ... import exceptions as _exceptions
from ... import warnings as _warnings
from ..._utilities import argument_processing as _ap
from ..._utilities.operating_system import NEWLINE as _NEWLINE
from .. import data_structures as _data_structures
# Shared regex patterns
# - Letter followed by arbitrary sequence of letters, digits, _ and -
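#   e.g. "expr", "A1" and "my-rule" match as complete symbols, "1a" does not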
_STANDALONE_NONTERMINAL_PATTERN = r"[a-zA-Z][-_0-9a-zA-Z]*"
# - Sequence of non-whitespace symbols
_STANDALONE_TERMINAL_PATTERN = r"[^\s]+"
# - Shortest possible (non-greedy) sequence of characters, up to the first end mark
#   (newlines are allowed within enclosed terminals, but not within enclosed nonterminals)
_ENCLOSED_NONTERMINAL_PATTERN = r".+?"
_ENCLOSED_TERMINAL_PATTERN = r"(?:.|{newline})*?".format(newline=_NEWLINE)
# - Longest possible sequence of whitespace characters, including none (space, tab, newline, etc.)
_MAX_OPT_WHITESPACE = r"\s*"
# Suffix for naming rules derived in automatic EBNF to BNF conversion
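# ("§" rarely appears in user-defined grammar symbols, so derived rule names are unlikely to clash)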
_SHARED_SUFFIX = "_ยง"
# Reading BNF and EBNF
def read_bnf(grammar, bnf_text, verbose, *symbols):
"""Read grammar from text in BNF notation."""
# Argument processing
bnf_text = _ap.str_arg("bnf_text", bnf_text)
verbose = _ap.bool_arg("verbose", verbose)
(
def_symbol,
rule_separator_symbol,
start_nt,
end_nt,
start_t1,
end_t1,
start_t2,
end_t2,
) = _check_symbols_validity(*symbols)
# Conversion of user-provided symbols into escaped strings for secure use in regex patterns
esc_def_symbol = _re.escape(def_symbol)
esc_start_nt = _re.escape(start_nt)
esc_end_nt = _re.escape(end_nt)
esc_start_t1 = _re.escape(start_t1)
esc_end_t1 = _re.escape(end_t1)
esc_start_t2 = _re.escape(start_t2)
esc_end_t2 = _re.escape(end_t2)
# Print arguments
if verbose:
def represent(text):
return "Empty string" if text == "" else text
heading = "Used symbols"
print(heading)
print("-" * len(heading))
print()
print("defining_symbol:", represent(def_symbol))
print("rule_separator_symbol:", represent(rule_separator_symbol))
print("start_nonterminal_symbol:", represent(start_nt))
print("end_nonterminal_symbol:", represent(end_nt))
print("start_terminal_symbol:", represent(start_t1))
print("end_terminal_symbol:", represent(end_t1))
print("start_terminal_symbol2:", represent(start_t2))
print("end_terminal_symbol2:", represent(end_t2))
# Parsing step 1: Split full text into list of lhs (=nonterminal) and rhs (=other symbols)
# 1.a) Regex pattern preparation
if start_nt and end_nt:
pattern_nt = r"{}{}{}".format(
esc_start_nt, _ENCLOSED_NONTERMINAL_PATTERN, esc_end_nt
)
else:
pattern_nt = _STANDALONE_NONTERMINAL_PATTERN
pattern_lhs_rhs = r"{newline}{ws}({nonterminal}){ws}{defsymbol}".format(
newline=_NEWLINE,
ws=_MAX_OPT_WHITESPACE,
nonterminal=pattern_nt,
defsymbol=esc_def_symbol,
)
re_lhs_rhs = _re.compile(pattern_lhs_rhs)
bnf_text = _NEWLINE + bnf_text # newline to guarantee recognition of the first lhs
# 1.b) Parsing
    # The capturing group makes re.split return [prefix, lhs1, rhs1, lhs2, rhs2, ...]
    txt_list_lhs_rhs = re_lhs_rhs.split(bnf_text)
    txt_list_lhs_rhs = txt_list_lhs_rhs[1:]  # drop the text before the first lhs
# Parsing step 2: Finding the single nonterminal symbol in lhs and arbitrary symbols in rhs
# 2.a) Helper functions
def create_enclosed_symbol_pattern(start, mid, end):
if start and end:
return r"{}{}{}".format(start, mid, end)
return None
def crop_symbol(txt_sym, txt_start, txt_end):
idx_start = len(txt_start)
idx_end = len(txt_sym) - len(txt_end)
return txt_sym[idx_start:idx_end]
def text_is_a_nonterminal(text):
return re_nt.fullmatch(text)
def text_is_a_terminal(text):
return re_t.fullmatch(text)
    def process_a_nonterminal(txt_nt, crop=True):
        if crop:
            txt_nt = crop_symbol(txt_nt, start_nt, end_nt)
sym_nt = _data_structures.NonterminalSymbol(txt_nt)
return sym_nt
def process_a_terminal(txt_t_enclosed):
if txt_t_enclosed.startswith(start_t1) and txt_t_enclosed.endswith(end_t1):
txt_t = crop_symbol(txt_t_enclosed, start_t1, end_t1)
else:
txt_t = crop_symbol(txt_t_enclosed, start_t2, end_t2)
sym_t = _data_structures.TerminalSymbol(txt_t)
_add_terminal_to_grammar(grammar, sym_t)
return sym_t
# 2.b) Regex pattern preparation
# Nonterminal symbols
pattern_nt = create_enclosed_symbol_pattern(
esc_start_nt, _ENCLOSED_NONTERMINAL_PATTERN, esc_end_nt
)
if pattern_nt:
re_nt = _re.compile("({})".format(pattern_nt))
else:
re_nt = None
# Terminal symbols
pattern_t1 = create_enclosed_symbol_pattern(
esc_start_t1, _ENCLOSED_TERMINAL_PATTERN, esc_end_t1
)
pattern_t2 = create_enclosed_symbol_pattern(
esc_start_t2, _ENCLOSED_TERMINAL_PATTERN, esc_end_t2
)
if pattern_t1 and pattern_t2:
re_t = _re.compile("({})".format("|".join([pattern_t1, pattern_t2])))
elif pattern_t1:
re_t = _re.compile("({})".format(pattern_t1))
else:
re_t = None
# Right-hand sides
pattern_nonempty = [
pat for pat in [pattern_nt, pattern_t1, pattern_t2] if pat is not None
]
pattern_rhs_splitter = "({})".format("|".join(pattern_nonempty))
re_rhs_splitter = _re.compile(pattern_rhs_splitter)
# 2.c) Parsing
i = 0
while i < len(txt_list_lhs_rhs):
# Get next left-hand side (lhs) and multiple right-hand sides (rhs) belonging to it
txt_lhs = txt_list_lhs_rhs[i]
txt_rhs = txt_list_lhs_rhs[i + 1]
i += 2
if verbose:
print()
heading = "Production list {}".format(i // 2)
print(heading)
print("-" * len(heading))
print()
# Parse lhs: Get the single nonterminal
sym_lhs_nt = process_a_nonterminal(txt_lhs, crop=True)
_add_nonterminal_to_grammar(grammar, sym_lhs_nt)
if grammar.start_symbol is None:
grammar.start_symbol = sym_lhs_nt
if verbose:
print("Left-hand side")
print(" Text: {}".format(txt_lhs))
print(" Interpretation: {}".format(repr(sym_lhs_nt)))
print()
print("Right-hand sides:")
print(" Text: {}".format(txt_rhs))
print(" Interpretation:")
# Parse rhs: Get several productions, each consisting of arbitrary symbols
sym_list_rhs = []
for sym_or_seq in re_rhs_splitter.split(txt_rhs):
# Mode 1: Nonterminals and terminals are enclosed by indicator symbols
if re_nt and re_t:
if text_is_a_nonterminal(sym_or_seq):
sym_nt = process_a_nonterminal(sym_or_seq)
sym_list_rhs.append(sym_nt)
elif text_is_a_terminal(sym_or_seq):
sym_t = process_a_terminal(sym_or_seq)
sym_list_rhs.append(sym_t)
else:
txt_list_rhs = sym_or_seq.split(rule_separator_symbol)
if len(txt_list_rhs) > 1:
# Add a rule
_add_rule_to_grammar(grammar, sym_lhs_nt, sym_list_rhs, verbose)
sym_list_rhs = []
# Mode 2: Only nonterminals are enclosed by indicator symbols
elif re_nt:
if text_is_a_nonterminal(sym_or_seq):
sym_nt = process_a_nonterminal(sym_or_seq)
sym_list_rhs.append(sym_nt)
else:
txt_list_rhs = sym_or_seq.split(rule_separator_symbol)
for cnt, txt_rhs in enumerate(txt_list_rhs):
if cnt > 0:
# Add a rule
_add_rule_to_grammar(
grammar, sym_lhs_nt, sym_list_rhs, verbose
)
sym_list_rhs = []
for txt_t in txt_rhs.split():
sym_t = process_a_terminal(txt_t)
sym_list_rhs.append(sym_t)
# Mode 3: Only terminals are enclosed by indicator symbols
elif re_t:
if text_is_a_terminal(sym_or_seq):
sym_t = process_a_terminal(sym_or_seq)
sym_list_rhs.append(sym_t)
else:
txt_list_rhs = sym_or_seq.split(rule_separator_symbol)
for cnt, txt_rhs in enumerate(txt_list_rhs):
if cnt > 0:
# Add a rule
_add_rule_to_grammar(
grammar, sym_lhs_nt, sym_list_rhs, verbose
)
sym_list_rhs = []
for txt_nt in txt_rhs.split():
sym_nt = process_a_nonterminal(txt_nt)
sym_list_rhs.append(sym_nt)
# Add the last rule
_add_rule_to_grammar(grammar, sym_lhs_nt, sym_list_rhs, verbose)
# Check if the resulting grammar is valid
_check_grammar_validity(grammar)
def read_ebnf(grammar, ebnf_text, verbose, *symbols):
"""Read grammar from text in EBNF notation.
References
----------
- https://en.wikipedia.org/wiki/Wirth_syntax_notation
- http://homepage.divms.uiowa.edu/~jones/compiler/gtools/
    - Conversion of EBNF to BNF notation:
      - https://stackoverflow.com/questions/2466484/converting-ebnf-to-bnf
      - https://stackoverflow.com/questions/2842809/lexers-vs-parsers
        useful notes in the answer stating that "EBNF really doesn't add much
        to the power of grammars"
      - https://condor.depaul.edu/ichu/csc447/notes/wk3/BNF.pdf
        useful notes in the summary on the last page
      - https://stackoverflow.com/questions/20175248/ebnf-is-this-an-ll1-grammar
"""
# Argument processing
ebnf_text = _ap.str_arg("ebnf_text", ebnf_text)
verbose = _ap.bool_arg("verbose", verbose)
(
def_symbol,
rule_separator_symbol,
start_nt,
end_nt,
start_t1,
end_t1,
start_t2,
end_t2,
) = _check_symbols_validity(*symbols)
# Fixed symbols
start_group_symbol = "("
end_group_symbol = ")"
start_option_symbol = "["
end_option_symbol = "]"
start_repeat_symbol = "{"
end_repeat_symbol = "}"
quantifier_0_to_1_symbol = "?"
quantifier_1_to_n_symbol = "+"
quantifier_0_to_n_symbol = "*"
# Conversion of symbols into escaped strings for secure use in regex patterns
esc_defining_symbol = _re.escape(def_symbol)
esc_rule_separator_symbol = _re.escape(rule_separator_symbol)
esc_start_nt = _re.escape(start_nt)
esc_end_nt = _re.escape(end_nt)
esc_start_t1 = _re.escape(start_t1)
esc_end_t1 = _re.escape(end_t1)
esc_start_t2 = _re.escape(start_t2)
esc_end_t2 = _re.escape(end_t2)
esc_start_group_symbol = _re.escape(start_group_symbol)
esc_end_group_symbol = _re.escape(end_group_symbol)
esc_start_option_symbol = _re.escape(start_option_symbol)
esc_end_option_symbol = _re.escape(end_option_symbol)
esc_start_repeat_symbol = _re.escape(start_repeat_symbol)
esc_end_repeat_symbol = _re.escape(end_repeat_symbol)
esc_quantifier_0_to_1_symbol = _re.escape(quantifier_0_to_1_symbol)
esc_quantifier_1_to_n_symbol = _re.escape(quantifier_1_to_n_symbol)
esc_quantifier_0_to_n_symbol = _re.escape(quantifier_0_to_n_symbol)
# Print arguments and fixed symbols
if verbose:
def represent(text):
return "Empty string" if text == "" else text
heading = "Used symbols"
print(heading)
print("-" * len(heading))
print()
print("defining_symbol:", represent(def_symbol))
print("rule_separator_symbol:", represent(rule_separator_symbol))
print("start_nonterminal_symbol:", represent(start_nt))
print("end_nonterminal_symbol:", represent(end_nt))
print("start_terminal_symbol:", represent(start_t1))
print("end_terminal_symbol:", represent(end_t1))
print("start_terminal_symbol2:", represent(start_t2))
print("end_terminal_symbol2:", represent(end_t2))
print()
print("start_group_symbol:", represent(start_group_symbol))
print("end_group_symbol:", represent(end_group_symbol))
print("start_option_symbol:", represent(start_option_symbol))
print("end_option_symbol:", represent(end_option_symbol))
print("start_repeat_symbol:", represent(start_repeat_symbol))
print("end_repeat_symbol:", represent(end_repeat_symbol))
print("quantifier_0_to_1_symbol:", represent(quantifier_0_to_1_symbol))
print("quantifier_1_to_n_symbol:", represent(quantifier_1_to_n_symbol))
print("quantifier_0_to_n_symbol:", represent(quantifier_0_to_n_symbol))
# Parsing step 1: Split full text into list of lhs (=nonterminal) and rhs (=other symbols)
# 1.a) Regex pattern preparation
if start_nt and end_nt:
pattern_nt = r"{}{}{}".format(
esc_start_nt, _ENCLOSED_NONTERMINAL_PATTERN, esc_end_nt
)
else:
pattern_nt = _STANDALONE_NONTERMINAL_PATTERN
pattern_lhs_rhs = r"{newline}{ws}({nonterminal}){ws}{defsymbol}".format(
newline=_NEWLINE,
ws=_MAX_OPT_WHITESPACE,
nonterminal=pattern_nt,
defsymbol=esc_defining_symbol,
)
re_lhs_rhs = _re.compile(pattern_lhs_rhs)
ebnf_text = (
_NEWLINE + ebnf_text
) # newline to guarantee recognition of the first lhs
# 1.b) Parsing
txt_list_lhs_rhs = re_lhs_rhs.split(ebnf_text)
    txt_list_lhs_rhs = txt_list_lhs_rhs[1:]  # drop the text before the first lhs
# Parsing step 2: Finding the single nonterminal symbol in lhs and arbitrary symbols in rhs
# 2.a) Helper functions
def crop_symbol(txt_sym, txt_start, txt_end):
idx_start = len(txt_start)
idx_end = len(txt_sym) - len(txt_end)
return txt_sym[idx_start:idx_end]
def process_a_nonterminal(txt_nt, crop=False):
if crop:
txt_nt = crop_symbol(txt_nt, start_nt, end_nt)
sym_nt = _data_structures.NonterminalSymbol(txt_nt)
return sym_nt
def process_a_terminal(txt_t_enclosed):
if txt_t_enclosed.startswith(start_t1) and txt_t_enclosed.endswith(end_t1):
txt_t = crop_symbol(txt_t_enclosed, start_t1, end_t1)
else:
txt_t = crop_symbol(txt_t_enclosed, start_t2, end_t2)
sym_t = _data_structures.TerminalSymbol(txt_t)
        # _add_terminal_to_grammar(grammar, sym_t)  # deferred to process_a_rule, so empty-string terminals are added at the right point
return sym_t
def text_is_a_nonterminal(text):
return re_nt.fullmatch(text)
def text_is_a_terminal(text):
return re_t.fullmatch(text)
def process_a_nt_special_mix(rhs_text):
seq = [
process_a_nonterminal(text, crop=True)
if text_is_a_nonterminal(text)
else text
for text in re_rhs_splitter.findall(rhs_text)
]
return seq
def find_an_innermost_bracket_pair(txt_list):
found_bracket = None
last_opening_symbol = None
last_opening_symbol_idx = None
for idx, text in enumerate(txt_list):
if isinstance(text, str):
if text in [
start_group_symbol,
start_option_symbol,
start_repeat_symbol,
]:
last_opening_symbol = text
last_opening_symbol_idx = idx
elif (
text == end_group_symbol
and last_opening_symbol == start_group_symbol
):
found_bracket = ("group", last_opening_symbol_idx, idx)
break
elif (
text == end_option_symbol
and last_opening_symbol == start_option_symbol
):
found_bracket = ("option", last_opening_symbol_idx, idx)
break
elif (
text == end_repeat_symbol
and last_opening_symbol == start_repeat_symbol
):
found_bracket = ("repeat", last_opening_symbol_idx, idx)
break
return found_bracket
def find_a_quantifier(txt_list):
found_quantifier = None
for idx, text in enumerate(txt_list):
if idx == 0:
                continue  # a quantifier in the first position has no preceding item and is ignored
if isinstance(text, str):
if text == quantifier_0_to_1_symbol:
found_quantifier = ("0 to 1", idx)
break
if text == quantifier_1_to_n_symbol:
found_quantifier = ("1 to n", idx)
break
if text == quantifier_0_to_n_symbol:
found_quantifier = ("0 to n", idx)
break
return found_quantifier
def create_nt_with_derived_name(text, suffix):
cnt = 0
while True:
new_text = "{text}{suffix}{count}".format(
text=text, suffix=suffix, count=cnt
)
sym_nt = process_a_nonterminal(new_text)
if sym_nt not in seen_nonterminals:
seen_nonterminals.add(sym_nt)
break
cnt += 1
return sym_nt
def cut_out_exclusive(seq, start_idx, end_idx):
left = seq[:start_idx]
inner = seq[start_idx + 1 : end_idx]
right = seq[end_idx + 1 :]
return left, inner, right
def cut_out_left_inclusive(seq, start_idx, end_idx):
left = seq[:start_idx]
inner = seq[start_idx:end_idx]
right = seq[end_idx + 1 :]
return left, inner, right
def remove_brackets(rule):
"""Remove {}, () and [] from a EBNF right-hand side by creating new helper rules.
References
----------
- https://stackoverflow.com/questions/2466484/converting-ebnf-to-bnf
Caution: {} is imprecise because {a|b|c} means (a|b|c)* and not a*|b|c
"""
sym_lhs_nt, sym_list_rhs = rule
helper_rules = []
while True:
# Find the first pair of brackets that have no other brackets within them
bracket_hit = find_an_innermost_bracket_pair(sym_list_rhs)
if bracket_hit is None:
break
bracket_type, start_idx, end_idx = bracket_hit
# Handle {} by reducing the expression {something} to the expression (something)*
if bracket_type == "repeat":
sym_list_rhs[start_idx] = "("
sym_list_rhs[end_idx] = ")"
                sym_list_rhs.insert(end_idx + 1, "*")
continue
# Handle the other two types of brackets
sym_nt_new = create_nt_with_derived_name(
sym_lhs_nt.text, suffix=_SHARED_SUFFIX
)
left, sym_list_inner, right = cut_out_exclusive(
sym_list_rhs, start_idx, end_idx
)
sym_list_rhs = left + [sym_nt_new] + right
if bracket_type == "group":
# Handle ()
helper_rule = (sym_nt_new, sym_list_inner)
elif bracket_type == "option":
# Handle []
empty = process_a_terminal("")
helper_rule = (sym_nt_new, sym_list_inner + ["|"] + [empty])
helper_rules.append(helper_rule)
modified_rule = (sym_lhs_nt, sym_list_rhs)
new_rules = [modified_rule] + helper_rules
return new_rules
def remove_quantifiers(rule):
"""Remove *, + and ? from a EBNF right-hand side by creating new helper rules.
References
----------
- https://stackoverflow.com/questions/2466484/converting-ebnf-to-bnf
"""
sym_lhs_nt, list_rhs = rule
helper_rules = []
while True:
# Find the first quantifier
quantifier_hit = find_a_quantifier(list_rhs)
if quantifier_hit is None:
break
quantifier_type, idx = quantifier_hit
# Handle all three types of quantifiers
sym_nt_new = create_nt_with_derived_name(
sym_lhs_nt.text, suffix=_SHARED_SUFFIX
)
left, inner, right = cut_out_left_inclusive(list_rhs, idx - 1, idx)
list_rhs = left + [sym_nt_new] + right
if quantifier_type == "0 to 1":
sym_t_empty = process_a_terminal("")
helper_rule = (sym_nt_new, inner + ["|"] + [sym_t_empty])
elif quantifier_type == "0 to n":
sym_t_empty = process_a_terminal("")
helper_rule = (sym_nt_new, [sym_nt_new] + inner + ["|"] + [sym_t_empty])
elif quantifier_type == "1 to n":
helper_rule = (sym_nt_new, inner + ["|"] + [sym_nt_new] + inner)
helper_rules.append(helper_rule)
modified_rule = (sym_lhs_nt, list_rhs)
new_rules = [modified_rule] + helper_rules
return new_rules
    def remove_alternation_and_nonsense(rule):
        """Split the rhs at alternation symbols and drop any leftover special-symbol text."""
        sym_lhs_nt, list_rhs = rule
        part = []
        split_rules = []
        for item in list_rhs:
            if isinstance(item, str):
                if item == rule_separator_symbol:
                    rule = (sym_lhs_nt, part)
                    split_rules.append(rule)
                    part = []
            else:
                part.append(item)
        rule = (sym_lhs_nt, part)
        split_rules.append(rule)
        return split_rules
def process_a_rule(rule):
# Treat all occurrences of brackets: {}, () and []
intermediary_rules1 = remove_brackets(rule)
# Treat all occurrences of quantifiers: *, + and ?
intermediary_rules2 = []
for intermediary_rule in intermediary_rules1:
new_rules = remove_quantifiers(intermediary_rule)
intermediary_rules2.extend(new_rules)
# Treat all occurrences of alternation symbols: |
final_rules = []
for intermediary_rule in intermediary_rules2:
new_rules = remove_alternation_and_nonsense(intermediary_rule)
final_rules.extend(new_rules)
# Add all final rules to the grammar
for lhs, rhs in final_rules:
# Add nonterminals to grammar
_add_nonterminal_to_grammar(grammar, lhs)
# Add terminals to grammar here, so that all empty strings arrive at correct position
for sym in rhs:
if isinstance(sym, _data_structures.TerminalSymbol):
_add_terminal_to_grammar(grammar, sym)
# Add rule
_add_rule_to_grammar(grammar, lhs, rhs, verbose)
# 2.b) Regex pattern preparation
# Nonterminal symbols
re_nt = _re.compile(pattern_nt)
# Terminal symbols
use_t1 = start_t1 and end_t1
use_t2 = start_t2 and end_t2
pattern_t1 = "{}{}{}".format(esc_start_t1, _ENCLOSED_TERMINAL_PATTERN, esc_end_t1)
pattern_t2 = "{}{}{}".format(esc_start_t2, _ENCLOSED_TERMINAL_PATTERN, esc_end_t2)
if use_t1 and use_t2:
re_t = _re.compile("({}|{})".format(pattern_t1, pattern_t2))
elif use_t1:
re_t = _re.compile("({})".format(pattern_t1))
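    # Note: this EBNF parser assumes that terminals are enclosed by delimiter
    # symbols; re_t remains undefined otherwise, so unenclosed terminals are
    # not supported here.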
# Special symbols
special_symbols = [
esc_rule_separator_symbol,
esc_start_group_symbol,
esc_end_group_symbol,
esc_start_option_symbol,
esc_end_option_symbol,
esc_start_repeat_symbol,
esc_end_repeat_symbol,
esc_quantifier_0_to_1_symbol,
esc_quantifier_1_to_n_symbol,
esc_quantifier_0_to_n_symbol,
]
    pattern_special_symbols = "|".join(special_symbols)
# Right-hand sides
pattern_rhs_splitter = "({}|{})".format(pattern_special_symbols, pattern_nt)
re_rhs_splitter = _re.compile(pattern_rhs_splitter)
# 2.c) Parsing
seen_nonterminals = set()
i = 0
while i < len(txt_list_lhs_rhs):
# Get next left-hand side (lhs) and multiple right-hand sides (rhs) belonging to it
txt_lhs = txt_list_lhs_rhs[i]
txt_rhs = txt_list_lhs_rhs[i + 1]
i += 2
if verbose:
print()
heading = "Production list {}".format(i // 2)
print(heading)
print("-" * len(heading))
print()
# Parse lhs: Get the single nonterminal
sym_lhs_nt = process_a_nonterminal(txt_lhs, crop=True)
if grammar.start_symbol is None:
grammar.start_symbol = sym_lhs_nt
if verbose:
print("Left-hand side")
print(" Text: {}".format(txt_lhs))
print(" Interpretation: {}".format(repr(sym_lhs_nt)))
print()
print("Right-hand sides:")
print(" Text: {}".format(txt_rhs))
print(" Interpretation:")
# Parse rhs: Get several productions, each consisting of arbitrary symbols
t_vs_others_seq = [
process_a_terminal(text) if text_is_a_terminal(text) else text
for text in re_t.split(txt_rhs)
]
t_nt_vs_special_sym_seq = []
for sym_or_seq in t_vs_others_seq:
if isinstance(sym_or_seq, _data_structures.TerminalSymbol):
t_nt_vs_special_sym_seq.append(sym_or_seq)
else:
sym_list = process_a_nt_special_mix(sym_or_seq)
t_nt_vs_special_sym_seq.extend(sym_list)
# Process the rule consisting of lhs and rhs
# = remove brackets, quantifiers, alternation and add the rule to the grammar
rule = (sym_lhs_nt, t_nt_vs_special_sym_seq)
process_a_rule(rule)
# Delete duplicate rules that come from processing brackets and quantifiers
try:
_delete_duplicate_rules(grammar, only_helper_rules=True)
except Exception:
        # It may fail for some grammars; the subsequent validity check then reports the details
pass
# Check if the resulting grammar is valid
_check_grammar_validity(grammar)
# Writing BNF and EBNF
def write_bnf(grammar, rules_on_separate_lines, *symbols):
"""Write grammar as text in BNF notation."""
# Argument processing
rules_on_separate_lines = _ap.bool_arg(
"rules_on_separate_lines", rules_on_separate_lines
)
(
def_symbol,
rule_separator_symbol,
start_nt,
end_nt,
start_t1,
end_t1,
start_t2,
end_t2,
) = _check_symbols_validity(*symbols)
use_nt = start_nt and end_nt
use_t1 = start_t1 and end_t1
use_t2 = start_t2 and end_t2
# Regex pattern preparation
re_nt_standalone = _re.compile(_STANDALONE_NONTERMINAL_PATTERN)
re_t_standalone = _re.compile(_STANDALONE_TERMINAL_PATTERN)
# Helper functions
def contains_nt_indicator(text):
return start_nt in text or end_nt in text
def contains_t1_indicator(text):
return start_t1 in text or end_t1 in text
def contains_t2_indicator(text):
return start_t2 in text or end_t2 in text
def invalid_unenclosed_nonterminal(text):
return not re_nt_standalone.fullmatch(text)
def invalid_unenclosed_terminal(text):
return not re_t_standalone.fullmatch(text)
def process_nonterminal(text):
if use_nt:
if contains_nt_indicator(text):
_exceptions.raise_write_nonterminal_error(text)
start = start_nt
end = end_nt
else:
if invalid_unenclosed_nonterminal(text):
_exceptions.raise_write_nonterminal_error(text)
start = ""
end = ""
text_with_start_and_end = "{}{}{}".format(start, text, end)
return text_with_start_and_end
def process_terminal(text):
if use_t1 and use_t2:
if contains_t1_indicator(text) and contains_t2_indicator(text):
_exceptions.raise_write_terminal_error(text)
if contains_t1_indicator(text):
start = start_t2
end = end_t2
else:
start = start_t1
end = end_t1
elif use_t1:
if contains_t1_indicator(text):
_exceptions.raise_write_terminal_error(text)
start = start_t1
end = end_t1
else:
if invalid_unenclosed_terminal(text):
_exceptions.raise_write_terminal_error(text)
start = ""
end = ""
text_with_start_and_end = "{}{}{}".format(start, text, end)
return text_with_start_and_end
# Write text
seq = []
for sym_lhs_nt, list_rhs in grammar.production_rules.items():
# Left-hand side nonterminal symbol
txt_lhs_nt = process_nonterminal(sym_lhs_nt.text)
txt_lhs_nt_def = "{} {} ".format(txt_lhs_nt, def_symbol)
seq.append(txt_lhs_nt_def)
# Right-hand side sequence of symbols
for sym_list_rhs in list_rhs:
for sym in sym_list_rhs:
if isinstance(sym, _data_structures.NonterminalSymbol):
txt_nt = process_nonterminal(sym.text)
seq.append(txt_nt)
seq.append(" ")
else:
txt_t = process_terminal(sym.text)
seq.append(txt_t)
seq.append(" ")
if rules_on_separate_lines:
indentation = " " * (len(txt_lhs_nt) + len(def_symbol))
seq.append(_NEWLINE)
seq.append(indentation)
seq.append("{} ".format(rule_separator_symbol))
    if rules_on_separate_lines:
        seq.pop()  # remove the last rule separator
        seq.pop()  # remove the last indentation
    seq.pop()  # remove the last newline (or rule separator if rules share one line)
    seq.pop()  # remove the last whitespace
seq.append(_NEWLINE)
return "".join(seq)
def write_ebnf(grammar, rules_on_separate_lines, *symbols):
"""Write grammar as text in EBNF notation."""
    # Currently reduced to plain BNF, i.e. without the shorthand notations that EBNF constructs would allow
return write_bnf(grammar, rules_on_separate_lines, *symbols)
# Helper functions
def _check_symbols_validity(
defining_symbol,
rule_separator_symbol,
start_nonterminal_symbol,
end_nonterminal_symbol,
start_terminal_symbol,
end_terminal_symbol,
start_terminal_symbol2,
end_terminal_symbol2,
):
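    # Example (illustrative): ("::=", "|", "<", ">", '"', '"', "'", "'") is a
    # consistent symbol set, with nonterminals in <...> and terminals in
    # double or single quotes.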
# Type checking
def_symbol = _ap.str_arg("defining_symbol", defining_symbol)
rule_separator_symbol = _ap.str_arg("rule_separator_symbol", rule_separator_symbol)
start_nt = _ap.str_arg("start_nonterminal_symbol", start_nonterminal_symbol)
end_nt = _ap.str_arg("end_nonterminal_symbol", end_nonterminal_symbol)
start_t1 = _ap.str_arg("start_terminal_symbol", start_terminal_symbol)
end_t1 = _ap.str_arg("end_terminal_symbol", end_terminal_symbol)
start_t2 = _ap.str_arg("start_terminal_symbol2", start_terminal_symbol2)
end_t2 = _ap.str_arg("end_terminal_symbol2", end_terminal_symbol2)
# Consistent: If one symbol is provided, the partner symbol needs to be provided too
if _ap.logical_xor(start_nt, end_nt):
_exceptions.raise_delimiter_symbol_error_1()
if _ap.logical_xor(start_t1, end_t1):
_exceptions.raise_surrounding_symbol_error_2()
if _ap.logical_xor(start_t2, end_t2):
_exceptions.raise_surrounding_symbol_error_3()
# Ordered: If terminal_symbols1 are not provided, but terminal_symbols2 are, assign 2 to 1
if start_t1 == "" and start_t2 != "":
start_t1 = start_t2
start_t2 = ""
if end_t1 == "" and end_t2 != "":
end_t1 = end_t2
end_t2 = ""
# Sufficient: Either nonterminals, or terminals, or both need to be enclosed by delimiters
if not start_nt and not start_t1:
_exceptions.raise_surrounding_symbol_error_4()
# Unique: Nonterminal and terminal indicator symbols need to be different
ind_nt = set([start_nt, end_nt])
ind_t = set([start_t1, end_t1, start_t2, end_t2])
overlap = ind_nt.intersection(ind_t)
if "" in overlap:
overlap.remove("")
if overlap:
listing = ", ".join(repr(x) for x in overlap)
_exceptions.raise_surrounding_symbol_error_5(listing)
return (
def_symbol,
rule_separator_symbol,
start_nt,
end_nt,
start_t1,
end_t1,
start_t2,
end_t2,
)
def _check_grammar_validity(grammar):
"""Check criteria that a context-free grammar needs to fulfill.
Criteria that raise a GrammarError if not fulfilled:
- The sets of nonterminals, terminals and productions need to be non-empty.
    - Every nonterminal needs a production rule where it appears on the left-hand side,
      otherwise it cannot be rewritten.
    - Every nonterminal needs a production rule that is non-recursive
      (also considering co-recursion), otherwise once the symbol appears
      in a derivation there is no way to reach an end.
    Criteria that only emit warnings if not fulfilled (because algorithms can work anyway):
    - The sets of nonterminals and terminals should be disjoint.
    - Every production rule should be unique.
References
----------
- https://en.wikipedia.org/wiki/Context-free_grammar#Proper_CFGs
"""
# Raise an error if the grammar contains an empty set
if not grammar.production_rules:
_exceptions.raise_empty_productions_error()
if not grammar.nonterminal_symbols:
_exceptions.raise_empty_nonterminals_error()
if not grammar.terminal_symbols:
_exceptions.raise_empty_terminals_error()
# Emit a warning if nonterminals and terminals are not disjoint sets of symbols
s1 = set(sym.text for sym in grammar.nonterminal_symbols)
s2 = set(sym.text for sym in grammar.terminal_symbols)
intersection = s1.intersection(s2)
if intersection:
_warnings._warn_symbol_set_overlap(intersection)
# Raise an error if a nonterminal (on rhs) has no corresponding production rule (on lhs)
lhs_nonterminals = grammar.production_rules.keys()
rhs_nonterminals = set()
for rhs_multiple in grammar.production_rules.values():
for rhs in rhs_multiple:
for sym in rhs:
if isinstance(sym, _data_structures.NonterminalSymbol):
rhs_nonterminals.add(sym)
    missing_nonterminals = [
        "<{}>".format(nt) for nt in rhs_nonterminals if nt not in lhs_nonterminals
    ]
    if missing_nonterminals:
        _exceptions.raise_missing_nonterminals_error(missing_nonterminals)
# Emit a warning if some production rules are not unique
repeated_productions = dict()
for lhs, rhs_vals in grammar.production_rules.items():
seen = set()
for rhs in rhs_vals:
rhs_text = " ".join(
"<{}>".format(sym.text)
if isinstance(sym, _data_structures.NonterminalSymbol)
else sym.text
for sym in rhs
)
if rhs_text in seen:
text = " <{}> ::= {}".format(str(lhs), rhs_text)
repeated_productions[text] = None
seen.add(rhs_text)
if repeated_productions:
_warnings._warn_repeated_productions(repeated_productions)
def _delete_duplicate_rules(grammar, only_helper_rules=False):
"""Remove duplicate rules, i.e. when different lhs share exactly the same rhs."""
# Delete lhs which have exactly the same rhs as another lhs
# Example:
# S -> A B
# A -> 1 | 4
# B -> 1 | 4 ... delete it and replace occurrences of B in the rhs of every other lhs
known_rhs = dict()
delete_list = []
replacement_map = dict()
# Make sure the start symbol is considered first
lhs_list = list(grammar.production_rules.keys())
lhs_list.remove(grammar.start_symbol)
lhs_list = [grammar.start_symbol] + lhs_list
rule_list = [(lhs, grammar.production_rules[lhs]) for lhs in lhs_list]
for lhs, rhs in rule_list:
        # Optionally skip rules whose lhs does not carry the suffix of automatically derived helper rules
        if only_helper_rules and _SHARED_SUFFIX not in str(lhs):
continue
# All lhs (except start symbol) may be deleted if their rhs is the same as in another lhs
rhs_hashable = str(rhs)
if rhs_hashable not in known_rhs:
known_rhs[rhs_hashable] = lhs
else:
known_lhs = known_rhs[rhs_hashable]
replacement_map[lhs] = known_lhs
delete_list.append(lhs)
# Replace lhs symbols that will be deleted
for lhs, rhs_multiple in grammar.production_rules.items():
for rhs_idx, rhs in enumerate(rhs_multiple):
for symb_idx, symb in enumerate(rhs):
if symb in replacement_map:
new_symb = replacement_map[symb]
grammar.production_rules[lhs][rhs_idx][symb_idx] = new_symb
# Delete lhs symbols and their productions
for lhs in delete_list:
_remove_nonterminal_from_grammar(grammar, lhs)
_remove_rule_from_grammar(grammar, lhs)
# Helper functions, which would have to change if other data structures were used for symbols or rules
def _add_terminal_to_grammar(grammar, symbol):
grammar.terminal_symbols.add(symbol)
def _add_nonterminal_to_grammar(grammar, symbol):
grammar.nonterminal_symbols.add(symbol)
def _remove_nonterminal_from_grammar(grammar, symbol):
grammar.nonterminal_symbols.remove(symbol)
def _add_rule_to_grammar(grammar, sym_lhs_nt, sym_list_rhs, verbose):
# Special case: Empty rhs is interpreted as empty terminal symbol
if len(sym_list_rhs) == 0:
sym_t_empty = _data_structures.TerminalSymbol("")
_add_terminal_to_grammar(grammar, sym_t_empty)
sym_list_rhs = [sym_t_empty]
# Print recognized rule
if verbose:
print(" {} -> {}".format(repr(sym_lhs_nt), repr(sym_list_rhs)))
# Add rule
if sym_lhs_nt in grammar.production_rules:
grammar.production_rules[sym_lhs_nt].append(sym_list_rhs)
else:
grammar.production_rules[sym_lhs_nt] = [sym_list_rhs]
def _remove_rule_from_grammar(grammar, sym_lhs_nt):
del grammar.production_rules[sym_lhs_nt]