Source code for secure_inner_join.phonem

"""
Implementation of phonem

Based on:
Wilde, Georg ; Meyer, Carsten: Doppelgänger gesucht - Ein Programm fur
kontext-sensitive phonetische Textumwandlung. In: ct Magazin fur
Computer & Technik 25 (1988)

Inspired by Talisman's implementation:
https://raw.githubusercontent.com/Yomguithereal/talisman/4c8ac7fd9/src/phonetics/german/phonem.js
"""

from __future__ import annotations

import itertools

SUBSTITUTIONS = {
    "SC": "C",
    "SZ": "C",
    "CZ": "C",
    "TZ": "C",
    "TS": "C",
    "KS": "X",
    "PF": "V",
    "PH": "V",
    "QU": "KW",
    "UE": "Y",
    "AE": "E",
    "OE": "Ö",
    "EI": "AY",
    "EY": "AY",
    "EU": "OY",
    "AU": "A§",
    "OU": "§",
}

TRANSLATIONS: dict[str, int | str | None] = {
    "Z": "C",
    "K": "C",
    "G": "C",
    "Q": "C",
    "Ç": "C",
    "Ñ": "N",
    "ß": "S",
    "F": "V",
    "W": "V",
    "P": "B",
    "T": "D",
    "Á": "A",
    "À": "A",
    "Â": "A",
    "Ã": "A",
    "Å": "A",
    "Ä": "E",
    "Æ": "E",
    "É": "E",
    "È": "E",
    "Ê": "E",
    "Ë": "E",
    "I": "Y",
    "J": "Y",
    "Ì": "Y",
    "Í": "Y",
    "Î": "Y",
    "Ï": "Y",
    "Ü": "Y",
    "Ý": "Y",
    "§": "U",
    "Ú": "U",
    "Ù": "U",
    "Û": "U",
    "Ô": "O",
    "Ò": "O",
    "Ó": "O",
    "Õ": "O",
    "Ø": "Ö",
}

ACCEPTABLE_LETTERS = "ABCDLMNORSUVWXYÖ"
WHITELIST_LETTERS = set("ABCDEFGHIJKLMNOPQRSTUVWXYZ -")
WHITELIST_LETTERS.update(TRANSLATIONS.keys() | SUBSTITUTIONS.keys())
WHITELIST_LETTERS.discard(
    "§"
)  # Do not allow this non-printable character in original name



[docs]
def phonem_encode(string: str) -> str:
    """
    Implements the phonem phonetic algorithm suitable for Germanic names.

    Based on:
    Wilde, Georg ; Meyer, Carsten: Doppelgänger gesucht - Ein Programm fur
    kontext-sensitive phonetische Textumwandlung. In: ct Magazin fur
    Computer & Technik 25 (1988)

    Inspired by Talisman's implementation:
    https://raw.githubusercontent.com/Yomguithereal/talisman/4c8ac7fd9/src/phonetics/german/phonem.js

    Spaces and dashes (' ', '-'), are considered as word/name separators.

    :param string: string to encode
    :return: encoded string
    """
    # Strip all but acceptable letters and separators
    string = "".join(filter(lambda _: _ in WHITELIST_LETTERS, string.upper()))

    # Substitute
    for match, replacement in SUBSTITUTIONS.items():
        string = string.replace(match, replacement)

    # Translate
    string = string.translate(str.maketrans(TRANSLATIONS))

    # Squeeze and filter unacceptable letters
    output = "".join(
        char for char, _ in itertools.groupby(string) if char in ACCEPTABLE_LETTERS
    )

    return output