Source code for alogos._grammar.normalization.chomsky

import itertools as _itertools

from .. import data_structures as _data_structures
from . import _shared


[docs]def is_cnf(grammar):
    """Check if the grammar is in Chomsky Normal Form (CNF)."""

    def is_in_form1(rhs):
        return len(rhs) == 1 and isinstance(rhs[0], _data_structures.TerminalSymbol)

    def is_in_form2(rhs):
        return len(rhs) == 2 and all(
            isinstance(sym, _data_structures.NonterminalSymbol) for sym in rhs
        )

    for rhs_multiple in grammar.production_rules.values():
        for rhs in rhs_multiple:
            if not is_in_form1(rhs) and not is_in_form2(rhs):
                return False
    return True


[docs]def to_cnf(grammar):
    """Convert the grammar G into Chomsky Normal Form (CNF).

    Notes
    -----
    CNF is defined unambiguously, but there are different algorithms to
    transform a grammar into a form that meets those criteria. These
    algorithms have varying time complexities and also lead to different
    growth of the grammar. Here a version is implemented where the new
    grammar's size is bounded by O(|G|^2) instead of O(2^2^|G|),
    adhering mainly to the presentation in a textbook by Elaine Rich.

    References
    ----------
    - Websites

        - https://en.wikipedia.org/wiki/Chomsky_normal_form
        - https://www.tutorialspoint.com/automata_theory/chomsky_normal_form.htm
        - https://www.geeksforgeeks.org/converting-context-free-grammar-chomsky-normal-form

    - Papers

        - Chomsky - On Certain Formal Properties of Grammars (1959)

            - An implication on pp. 149-150: "Theorem 5 asserts in
              particular that all type 2 languages can be generated by
              grammars which yield only trees with no more than two
              branches from each node."

        - Lange, Leiß - To CNF or not to CNF? An efficient yet
          presentable version of the CYK algorithm (2009)

            - pp. 5-6: "the order in which the single transformation
              steps are carried out should be: BIN→DEL→UNIT. This will
              yield a grammar of size O(|G|^2). Nevertheless, many
              textbooks choose a different order, namely DEL→UNIT→BIN
              which yields agrammar of size O(2^2^|G|)."

            - Table 3 on p. 7 gives an overview of textbooks that show
              how to transform a grammar into CNF. It indicates whether
              the transformation results in a grammar with size bound
              by O(|G|^2) or O(2^2^|G|). Based on this table, the
              textbook by Elaine Rich was chosen here as reference. The
              textbook by Hopcroft and Ullman is also mentioned because
              it is very well known and provides a detailed discussion
              of CNF, yet not a transformation with O(|G|^2) guarantee.

    - Books

        - Rich - Automata, Computability and Complexity (2007):
          pp. 171-175

            - p. 171: "There exists a straightforward four-step
              algorithm that converts a grammar [...] into a new
              grammar [...] in Chomsky normal form and
              L(G_C) = L(G) - {ε}"

                1. removeEps: removing from G all ε-rules
                2. removeUnits: removing from G all unit productions
                3. removeMixed: removing from G all rules whose
                   right-hand sides have length greater than 1 and
                   include a terminal
                4. removeLong: removing from G all rules whose
                   right-hand sides have length greater than 2

            - p. 175: "We will run removeLong as step 1 rather than as
              step 4. [...] With this change, removeEps runs in linear
              time. [...] So, if we change converttoChomsky so that it
              does step 4 first, its time complexity is O(n^2) and the
              size of the grammar that it produces is also O(n^2)."

        - Hopcroft, Ullman - Automata theory, language and
          computation (2004)

            - Preliminary simplications: pp. 261-272

                - Eliminate ε-productions
                - Eliminate unit productions
                - Eliminate useless symbols

            - Transformation to Chomsky Normal Form pp. 272-275

                - Arrange that all bodies of length 2 or more consist
                  only of variables
                - Break bodies of length 3 or more into a cascade of
                  production

    """
    # Copy the given grammar
    gr = grammar.copy()

    # Transformation
    gr = _extra_preprocessing(gr)
    gr = _remove_rules_longer_than_2(gr)  # BIN
    gr = _remove_eps_productions(gr)  # DEL
    gr = _remove_unit_productions(gr)  # UNIT
    gr = _remove_mixed_productions(gr)
    return gr


def _extra_preprocessing(grammar):
    """Prepare grammar to meet assumptions of some CNF transformations.

    Textbooks seems to have some unstated assumptions about the
    structure of the right hand sides of the grammar, which is
    established here by extra preparation:
    - The empty string ε seems to be present only in an isolated
      fashion, never aside of other strings or symbols, i.e. collapsed
      with its neighboring context whenever possible.
      Example: A -> B C | D ε | ε E ε F | ε becomes
      A -> B C | D | E F | ε
    - Nonterminals seem to never have just an ε production,
      i.e. at least one nonempty rhs. This has an implication for a
      transformation where they are found to be "nullable", namely that
      they can be "null" (=ε) but also "non-null"
      (=anything else than ε) and hence they should be absent in one
      rule variant but also present in another. This is only the case
      if there is not only A -> ε but also A -> x in the grammar,
      which is not guaranteed for a general CFG but can be ensured by
      preprocessing.
      Example: A -> x | ε remains as it is but A -> ε requires A to be
      removed from the grammar before CNF transformation

    """
    # 1) Remove nonterminals that can only be derived to an empty string
    # or list of empty strings and replace their appearances on rhs
    # with a ε terminal.
    # Examples for which this is the case (A would be removed and
    # replaced in the grammar):
    #   A -> ε
    #   A -> ε | ε
    #   A -> ε ε
    #   A -> ε ε | ε ε ε | ε ε ε ε ε
    # Reason: Without this step later removal of eps productions could
    # fail in different cases.
    def contains_only_eps(rhs):
        """Check if a rhs contains only ε symbols."""
        return len(rhs) > 0 and all(_shared.is_empty_terminal(sym) for sym in rhs)

    def remove_and_replace_nonterminal(grammar, nt_to_remove):
        new_rules = dict()
        for lhs, rhs_multiple in grammar.production_rules.items():
            # Remove it from lhs
            if lhs == nt_to_remove:
                continue
            new_rules[lhs] = []
            # Replace it on rhs with ε (=a newly created terminal
            # with string '')
            for rhs in rhs_multiple:
                new_rhs = [
                    sym if sym != nt_to_remove else _shared.create_empty_terminal()
                    for sym in rhs
                ]
                new_rules[lhs].append(new_rhs)
        grammar.production_rules = new_rules
        return grammar

    while True:
        newly_found = set()
        for lhs, rhs_multiple in grammar.production_rules.items():
            if len(rhs_multiple) > 0 and all(
                contains_only_eps(rhs) for rhs in rhs_multiple
            ):
                newly_found.add(lhs)
        if newly_found:
            for nt in newly_found:
                grammar = remove_and_replace_nonterminal(grammar, nt)
        else:
            break

    # 2) Remove all ε terminals that have other symbols besides them.
    # In other words, collapse empty strings with their neighbor
    # strings.
    # Examples of transformations:
    #   A -> B ε C ε  becomes  A -> B C
    #   A -> ε B      becomes  A -> B
    #   A -> ε ε      becomes  A -> ε
    #   A -> ε        remains as it is
    # Reason: Without this step later checks for nullable symbols could
    # fail in different cases.
    new_rules = dict()
    for lhs, rhs_multiple in grammar.production_rules.items():
        new_rules[lhs] = []
        for rhs in rhs_multiple:
            # keep only non-ε
            new_rhs = [sym for sym in rhs if not _shared.is_empty_terminal(sym)]
            # if it was a series of ε, keep one ε
            if len(new_rhs) == 0 and len(rhs) > 0:
                new_rhs = [rhs[0]]
            new_rules[lhs].append(new_rhs)
    grammar.production_rules = new_rules

    # Repair step updates all grammar properties to fit to the new
    # production rules
    grammar = _shared.update_grammar_parts(grammar)
    return grammar


def _remove_eps_productions(grammar):
    """Remove ε productions from grammar without changing its language.

    Caution: The grammar G is modified, but its language L(G) shall
    remain the same. There is one exception: If the original grammar G
    can produce the empty string ε, the new grammar G1 will not be
    able to do so. In other words, if ε is part of the language L(G),
    it will no longer be part of the language L(G1). In set notation,
    this fact can be captured as L(G1) = L(G) - {ε}

    References
    ----------
    - Rich - Automata, Computability and Complexity (2007): pp. 162-163
    - Hopcroft, Ullman - Automata theory, language and
      computation (2004): pp. 265-268

    """
    # First step strictly follows Rich (p. 162, step 2)
    nullable_nonterminals = _find_nullable_nonterminals(grammar)

    # Second step follows a mix of Rich (p. 162, step 3) and
    # Hopcroft (p. 266, present or absent)
    grammar = _process_nullable_nonterminals(grammar, nullable_nonterminals)

    # Third step strictly follows Rich (p. 162, step 4)
    grammar = _remove_remaining_eps(grammar)

    # Repair step updates all grammar properties to fit to the
    # new production rules
    grammar = _shared.update_grammar_parts(grammar)
    return grammar


def _find_nullable_nonterminals(grammar):
    """Find nullable nonterminals to prepare for removing ε productions.

    References
    ----------
    - Rich - Automata, Computability and Complexity (2007):
      p. 162, step 2 of removeEps

    """
    # "2. Find the set N of nullable variables"
    nullable_nonterminals = set()

    # Condition (1): A -> ε
    def condition1(rhs):
        return len(rhs) == 1 and _shared.is_empty_terminal(rhs[0])

    # Condition (2): A -> B C D  where all rhs symbols are already
    # marked as nullable
    def condition2(rhs, nullable_nonterminals):
        return all(sym in nullable_nonterminals for sym in rhs)

    # "2.1. Set N to the set of variables that satisfy (1)"
    for lhs, rhs_multiple in grammar.production_rules.items():
        for rhs in rhs_multiple:
            if condition1(rhs):
                nullable_nonterminals.add(lhs)
                break

    # "2.2. Until an entire pass is made without adding anything to
    # N do: Evaluate all other variables with respect to (2)."
    while True:
        found_new_nullable = False
        for lhs, rhs_multiple in grammar.production_rules.items():
            for rhs in rhs_multiple:
                if condition2(rhs, nullable_nonterminals):
                    if lhs not in nullable_nonterminals:
                        nullable_nonterminals.add(lhs)
                        found_new_nullable = True
        if not found_new_nullable:
            break
    return nullable_nonterminals


def _process_nullable_nonterminals(grammar, nullable_nonterminals):
    """Add rules to cover all combinations of nullable nonterminals.

    References
    ----------
    - Rich - Automata, Computability and Complexity (2007): pp. 162,
      step 3 of removeEps
    - Hopcroft, Ullman - Automata theory, language and
      computation (2004): p. 266

    """
    # For each nullable nonterminal, let it be present or absent
    # wherever it appears on a rhs by adding new rules (all
    # combinatorial possibilities that arise from multiple nullable nt)
    def get_all_rhs_variants(rhs, nullable_nonterminals):
        # Find symbols in rhs that are nullable by remembering their
        # positions
        nullable_positions = [
            pos for pos, sym in enumerate(rhs) if sym in nullable_nonterminals
        ]
        map_pos = {pos_in_rhs: i for i, pos_in_rhs in enumerate(nullable_positions)}

        # Create combinations
        num_all_symbols = len(rhs)
        num_nullable_symbols = len(nullable_positions)
        combinations = list(
            _itertools.product([False, True], repeat=num_nullable_symbols)
        )
        if num_nullable_symbols == num_all_symbols:
            # If all symbols are nullable, remove a combination where
            # each symbol is set to absent
            combinations = combinations[1:]

        # Insert symbols according to combinations
        all_new_rhs = []
        for combination in combinations:
            new_rhs = [
                sym
                for pos, sym in enumerate(rhs)
                if pos not in nullable_positions  # include non-nullable symbol always
                or combination[map_pos[pos]]
            ]  # include nullable symbol sometimes
            all_new_rhs.append(new_rhs)
        return all_new_rhs

    new_rules = dict()
    for lhs, rhs_multiple in grammar.production_rules.items():
        new_rules[lhs] = []
        for rhs in rhs_multiple:
            some_new_rhs = get_all_rhs_variants(rhs, nullable_nonterminals)
            new_rules[lhs].extend(some_new_rhs)
    grammar.production_rules = new_rules

    # For each lhs, remove duplicate rhs
    # (that came from the construction or were present before)
    new_rules = dict()
    for lhs, rhs_multiple in grammar.production_rules.items():
        new_rules[lhs] = []
        known_rhs = set()
        for rhs in rhs_multiple:
            # nonterminals and terminals are distinguishable as str
            rhs_hashable = str(rhs)
            if rhs_hashable not in known_rhs:
                new_rules[lhs].append(rhs)
                known_rhs.add(rhs_hashable)
    grammar.production_rules = new_rules
    return grammar


def _remove_remaining_eps(grammar):
    """Remove remaining productions of form A -> ε.

    Additional step: If A -> ε is removed and A has no other rhs,
    then A has to be removed from the entire grammar. If by that removal
    another nonterminal has no rhs left, that has to be removed too,
    iteratively until no removal is required anymore.

    References
    ----------
    - Rich - Automata, Computability and Complexity (2007):
      p. 162, step 4 of removeEps

    """
    # Remove A -> ε
    def is_eps(rhs):
        return len(rhs) == 1 and _shared.is_empty_terminal(rhs[0])

    new_rules = dict()
    for lhs, rhs_multiple in grammar.production_rules.items():
        new_rules[lhs] = []
        for rhs in rhs_multiple:
            if not is_eps(rhs):
                new_rules[lhs].append(rhs)
    grammar.production_rules = new_rules
    return grammar


def _remove_unit_productions(grammar):
    """Remove unit productions from the grammar.

    Definition from Rich on p. 171: "A unit production is a rule whose
    right-hand side consists of a single nonterminal symbol."

    References
    ----------
    - Rich - Automata, Computability and Complexity (2007): p. 171
    - Hopcroft, Ullman - Automata theory, language and
      computation (2004): pp. 268-271

    """

    def to_hashable_rule(lhs, rhs):
        return "{}>{}".format(lhs, rhs)

    def is_new_unit_production(known_up, lhs, rhs):
        is_up = len(rhs) == 1 and isinstance(rhs[0], _data_structures.NonterminalSymbol)
        if is_up:
            up_hashable = to_hashable_rule(lhs, rhs)
            if up_hashable not in known_up:
                known_up.add(up_hashable)
                return True
        return False

    def remove_unit_production(known_up, grammar, lhs, rhs_to_remove):
        # 2.2. Remove [X → Y] from G
        grammar.production_rules[lhs] = [
            rhs for rhs in grammar.production_rules[lhs] if rhs != rhs_to_remove
        ]
        # 2.3. [...] For every rule Y → β [...] Add to G the rule X → β
        # unless that is a rule that has already been removed once
        for rhs in grammar.production_rules[rhs_to_remove[0]]:
            if len(rhs) == 1 and to_hashable_rule(lhs, rhs) in known_up:
                continue
            if rhs not in grammar.production_rules[lhs]:
                # add X → β (if not present already)
                grammar.production_rules[lhs].append(rhs)

    known_up = set()
    while True:
        # Search for unit productions
        new_up = []
        for lhs, rhs_multiple in grammar.production_rules.items():
            for rhs in rhs_multiple:
                if is_new_unit_production(known_up, lhs, rhs):
                    new_up.append((lhs, rhs))
                    break
        # If new unit productions were found, remove them.
        # Otherwise stop.
        if new_up:
            for lhs, rhs in new_up:
                remove_unit_production(known_up, grammar, lhs, rhs)
        else:
            break

    # Repair step updates all grammar properties to fit to the new
    # production rules
    grammar = _shared.update_grammar_parts(grammar)
    return grammar


def _remove_mixed_productions(grammar):
    """Remove mixed productions from the grammar.

    References
    ----------
    - Rich - Automata, Computability and Complexity (2007): p. 172

    """
    # 2. Create a new nonterminal T_a for each terminal a in Σ
    def create_new_nonterminal(grammar, terminal):
        i = 0
        while True:
            text = "T_{}".format(terminal.text)
            if i > 0:
                text += "_{}".format(i)
            symbol = _data_structures.NonterminalSymbol(text)
            if symbol not in grammar.nonterminal_symbols:
                grammar.nonterminal_symbols.add(symbol)
                break
            i += 1
        return symbol

    new_nt_map = {
        sym: create_new_nonterminal(grammar, sym) for sym in grammar.terminal_symbols
    }

    # 3. Modify each rule in G whose right-hand side has length greater
    # than 1 and that contains a terminal symbol by substituting T_a for
    # each occurrence of the terminal a
    new_rules = dict()
    for lhs, rhs_multiple in grammar.production_rules.items():
        new_rules[lhs] = []
        for rhs in rhs_multiple:
            if len(rhs) > 1:
                rhs_new = [
                    new_nt_map[sym]
                    if isinstance(sym, _data_structures.TerminalSymbol)
                    else sym
                    for sym in rhs
                ]
            else:
                rhs_new = rhs
            new_rules[lhs].append(rhs_new)
    grammar.production_rules = new_rules

    # 4. Add to G, for each T_a, the rule T_a → a
    for terminal, nonterminal in new_nt_map.items():
        grammar.production_rules[nonterminal] = [[terminal]]

    # Repair step updates all grammar properties to fit to the
    # new production rules
    grammar = _shared.update_grammar_parts(grammar)
    return grammar


def _remove_rules_longer_than_2(grammar):
    """Remove long rules from the grammar.

    References
    ----------
    - Rich - Automata, Computability and Complexity (2007): p. 173
    - Hopcroft, Ullman - Automata theory, language and
      computation (2004): p. 273

    """
    # 2. For each rule [...] n > 2, create new nonterminals
    def create_and_add_new_rules(new_rules, lhs, rhs):
        ns = rhs
        ms = [
            _shared.create_new_nonterminal(grammar, prefix="M_")
            for _ in range(len(ns) - 2)
        ]
        # Start: A → N_1 M_2
        last_nt = lhs
        # Mid: M_2 → N_2 M_3, M_3 → N_3 M_4, ...
        for n, m in zip(ns, ms):
            rhs = [n, m]
            if last_nt in new_rules:
                new_rules[last_nt].append(rhs)
            else:
                new_rules[last_nt] = [rhs]
            last_nt = m
        # End: M_n-1 → N_n-1 N_n
        rhs = [ns[-2], ns[-1]]
        new_rules[last_nt] = [rhs]

    new_rules = dict()
    for lhs, rhs_multiple in grammar.production_rules.items():
        new_rules[lhs] = []
        for rhs in rhs_multiple:
            if len(rhs) > 2:
                create_and_add_new_rules(new_rules, lhs, rhs)
            else:
                new_rules[lhs].append(rhs)
    grammar.production_rules = new_rules

    # Repair step updates all grammar properties to fit to the
    # new production rules
    grammar = _shared.update_grammar_parts(grammar)
    return grammar