Spaces:

AustinWagner
/

turkic-transliteration-demo

Sleeping

App Files Community

wagner-austin committed on May 14

Commit

1a0742b

1 Parent(s): 68c6a96

Updated requirements to point at the Python package for turkic-transliterate, and removed the corresponding files from this repository.

Browse files

Files changed (12) hide show

requirements.txt +2 -0
turkic_translit/__init__.py +0 -9
turkic_translit/cli.py +0 -109
turkic_translit/core.py +0 -38
turkic_translit/logging_config.py +0 -58
turkic_translit/patches.py +0 -77
turkic_translit/rules/ar_lat.rules +0 -15
turkic_translit/rules/kk_ipa.rules +0 -47
turkic_translit/rules/kk_lat2023.rules +0 -49
turkic_translit/rules/ky_ipa.rules +0 -43
turkic_translit/rules/ky_lat2023.rules +0 -40
turkic_translit/sanity.py +0 -25

requirements.txt CHANGED Viewed

@@ -1,3 +1,5 @@
 # Core dependencies
 epitran>=1.0,<1.27
 fasttext-wheel==0.9.2

+turkic-transliterate>=0.1.0
 # Core dependencies
 epitran>=1.0,<1.27
 fasttext-wheel==0.9.2

turkic_translit/__init__.py DELETED Viewed

@@ -1,9 +0,0 @@
-from importlib.metadata import version
-# Set up logging first before any other operations
-from .logging_config import setup as _log_setup; _log_setup()
-# Import patches next to ensure they're applied before other imports
-from . import patches
-from .core import to_latin, to_ipa
-__all__ = ["to_latin", "to_ipa"]
-__version__ = "0.1.0"

turkic_translit/cli.py DELETED Viewed

@@ -1,109 +0,0 @@
-import sys, argparse, pathlib, time, os, logging
-from .core import to_latin, to_ipa
-from .logging_config import setup as _log_setup
-# Initialize logger
-log = logging.getLogger(__name__)
-def main() -> None:
-    ap = argparse.ArgumentParser(description="Turkic transliteration")
-    ap.add_argument("--lang", required=True, choices=["kk", "ky"])
-    ap.add_argument("--ipa", action="store_true", help="produce IPA")
-    ap.add_argument("--arabic", action="store_true", help="also transliterate Arabic script")
-    ap.add_argument("--in", dest="inp", default="-")
-    ap.add_argument("--out_latin", default="-")
-    ap.add_argument("--out_ipa")
-    ap.add_argument("--benchmark", action="store_true")
-    ap.add_argument("--log-level", choices=["debug", "info", "warning", "error", "critical"],
-                    default="info",
-                    help="Set logging level (default: info)")
-    args = ap.parse_args()
-    # Always set log level from args at the start (first runtime line)
-    os.environ["TURKIC_LOG_LEVEL"] = args.log_level.upper()
-    _log_setup()
-    outputs = ["latin"]
-    if args.ipa:
-        outputs.append("ipa")
-    # Use Rich markup for output modes (magenta)
-    outputs_markup = ", ".join(f"[magenta]{o}[/]" for o in outputs)
-    log.info(
-        f"Starting transliteration: lang={args.lang}, input={args.inp}, outputs={outputs_markup}, "
-        f"out_latin={args.out_latin}, out_ipa={args.out_ipa}, arabic={args.arabic}, benchmark={args.benchmark}"
-    )
-    # Use UTF-8-sig for Windows to include BOM for proper encoding support
-    encoding = "utf-8-sig" if sys.platform == "win32" else "utf-8"
-    try:
-        fin  = sys.stdin  if args.inp  == "-" else open(args.inp, encoding=encoding)
-        fo_l = sys.stdout if args.out_latin == "-" else open(args.out_latin, "w", encoding=encoding)
-        fo_i = None
-        if args.ipa:
-            if not args.out_ipa:
-                ap.error("--ipa requires --out_ipa")
-            fo_i = open(args.out_ipa, "w", encoding=encoding)
-    except UnicodeDecodeError as e:
-        sys.stderr.write(f"Encoding error: {e}\n")
-        sys.stderr.write("If you're on Windows, make sure your input file is properly encoded in UTF-8.\n")
-        sys.exit(1)
-    start = time.time()
-    n = 0
-    # Try to use tqdm for a progress bar if available and if we're in a TTY
-    use_progress_bar = False
-    pbar = None
-    # Check if we should use a progress bar (stderr is a TTY and input is not stdin)
-    is_tty_output = sys.stderr.isatty()
-    is_file_input = args.inp != "-"
-    if is_tty_output and is_file_input:
-        try:
-            from tqdm import tqdm
-            # Count the number of lines in the input file for the progress bar
-            total_lines = sum(1 for _ in fin)
-            fin.seek(0)  # Reset file pointer
-            pbar = tqdm(total=total_lines, unit="lines")
-            use_progress_bar = True
-            log.debug("Using tqdm progress bar for %d lines", total_lines)
-        except ImportError:
-            log.debug("tqdm not available, falling back to basic processing")
-    # Process lines
-    for line in fin:
-        lat = to_latin(line.rstrip("\n"), args.lang, args.arabic)
-        fo_l.write(lat + "\n")
-        if fo_i:
-            fo_i.write(to_ipa(line.rstrip("\n"), args.lang) + "\n")
-        n += 1
-        if use_progress_bar and pbar:
-            pbar.update(1)
-    log.info(f"Finished writing {n} lines to {args.out_latin if args.out_latin != '-' else 'stdout'}" + (f" and {args.out_ipa}" if args.ipa else ""))
-    # Close progress bar if used
-    if use_progress_bar and pbar:
-        pbar.close()
-    elapsed = time.time() - start
-    # Always log processing statistics, but at different levels based on benchmark flag
-    if args.benchmark:
-        log.info("Processed %d lines in %.2fs (%.0f lines/s)", n, elapsed, n/elapsed if elapsed > 0 else 0)
-    else:
-        log.debug("Processed %d lines in %.2fs (%.0f lines/s)", n, elapsed, n/elapsed if elapsed > 0 else 0)
-    log.info("Transliteration complete.")
-    # Clean up file handles
-    if fin is not sys.stdin:
-        fin.close()
-    if fo_l is not sys.stdout:
-        fo_l.close()
-    if fo_i:
-        fo_i.close()
-# This is the entry point when the module is run directly
-if __name__ == "__main__":
-    main()

turkic_translit/core.py DELETED Viewed

@@ -1,38 +0,0 @@
-"""Public API for Latin and IPA transliteration."""
-try:
-    import icu        # noqa: F401
-except ImportError as e:    # PyICU wheel is still missing
-    raise RuntimeError(
-        "PyICU missing. On Windows run:\n"
-        "  python scripts/get_pyicu_wheel.py\n"
-        "or manually install a wheel from "
-        "https://github.com/cgohlke/pyicu-build/releases ."
-    ) from e
-from functools import lru_cache
-import unicodedata as ud
-from pathlib import Path
-_RULE_DIR = Path(__file__).with_suffix("").parent / "rules"
-@lru_cache
-def _icu_trans(name: str) -> icu.Transliterator:
-    txt = (_RULE_DIR / name).read_text(encoding="utf8")
-    return icu.Transliterator.createFromRules(name, txt, 0)
-def to_latin(text: str, lang: str, include_arabic: bool = False) -> str:
-    if lang not in ("kk", "ky"):
-        raise ValueError("lang must be 'kk' or 'ky'")
-    rule = f"{lang}_lat2023.rules"
-    trans = _icu_trans(rule)
-    if include_arabic:
-        ar = _icu_trans("ar_lat.rules")
-        text = ar.transliterate(text)
-    out = trans.transliterate(text)
-    return ud.normalize("NFC", out)
-def to_ipa(text: str, lang: str) -> str:
-    trans = _icu_trans(f"{lang}_ipa.rules")
-    return ud.normalize("NFC", trans.transliterate(text))

turkic_translit/logging_config.py DELETED Viewed

@@ -1,58 +0,0 @@
-"""
-Centralized logging configuration module.
-Uses Rich for colorized output if available with fallback to standard library.
-"""
-import logging
-import os
-import sys
-from functools import lru_cache
-# Get log level from environment or default to INFO
-LOG_LEVEL = os.environ.get("TURKIC_LOG_LEVEL", "INFO").upper()
-@lru_cache(maxsize=1)
-def setup():
-    """
-    Set up logging with Rich if available, with fallback to stdlib logging.
-    Uses TURKIC_LOG_LEVEL environment variable or defaults to INFO.
-    Uses @lru_cache to ensure this is only run once.
-    """
-    root_logger = logging.getLogger()
-    # Clear any existing handlers
-    for handler in root_logger.handlers[:]:
-        root_logger.removeHandler(handler)
-    # Set the log level based on environment variable
-    log_level = getattr(logging, LOG_LEVEL, logging.INFO)
-    root_logger.setLevel(log_level)
-    # Try to use Rich for pretty, colorized output
-    try:
-        from rich.logging import RichHandler
-        # Configure Rich handler with appropriate settings
-        handler = RichHandler(
-            rich_tracebacks=True,
-            markup=True,
-            show_time=False,
-            show_path=False,
-        )
-        formatter = logging.Formatter("%(message)s")
-    except ImportError:
-        # Fall back to standard logging if Rich is not available
-        handler = logging.StreamHandler(sys.stderr)
-        formatter = logging.Formatter(
-            "%(levelname)s: %(message)s"
-        )
-    # Configure and add the handler
-    handler.setFormatter(formatter)
-    root_logger.addHandler(handler)
-    logger = logging.getLogger("turkic_translit")
-    logger.debug(f"Logging initialized at level {LOG_LEVEL}")
-    return logger

turkic_translit/patches.py DELETED Viewed

@@ -1,77 +0,0 @@
-"""
-Patches for third-party libraries to fix encoding issues on Windows.
-This module is imported automatically at startup.
-"""
-import os
-import sys
-import functools
-import logging
-from .logging_config import setup; setup()
-log = logging.getLogger(__name__)
-_PATCH_DONE = False
-_PATCHED_FILES = set()
-def _fix_broken_ssl_cert_env():
-    """
-    If the user (often Conda on Windows) left SSL_CERT_FILE pointing at a
-    non-existent bundle, httpx ⇢ gradio will crash on import.  When the file
-    is missing we delete the env-var so Python falls back to the system
-    certificates.
-    """
-    import os, pathlib, logging
-    log = logging.getLogger(__name__)
-    bundle = os.environ.get("SSL_CERT_FILE")
-    if bundle and not pathlib.Path(bundle).exists():
-        log.warning(
-            "SSL_CERT_FILE=%s does not exist – removing the variable so "
-            "httpx can create a default context", bundle)
-        del os.environ["SSL_CERT_FILE"]
-def apply_patches():
-    """Apply all necessary patches for third-party libraries."""
-    global _PATCH_DONE
-    _fix_broken_ssl_cert_env()  # ← new line: ensure SSL_CERT_FILE is valid before any third-party import
-    # Skip if patches have already been applied
-    if _PATCH_DONE:
-        log.debug("Patches already applied, skipping")
-        return
-    _PATCH_DONE = True
-    # Fix panphon encoding issues on Windows
-    if sys.platform == 'win32':
-        try:
-            import panphon.featuretable
-            import io
-            import csv
-            # Save the original open function
-            original_open = open
-            # Monkey patch the built-in open function when used by panphon
-            def patched_open_for_panphon(file, mode='r', *args, **kwargs):
-                # Add explicit UTF-8 encoding for CSV files opened by panphon
-                if 'panphon' in sys.modules and mode == 'r' and isinstance(file, str) and file.endswith('.csv'):
-                    if 'encoding' not in kwargs:
-                        kwargs['encoding'] = 'utf-8'
-                        # Only log the first time per unique file
-                        if file not in _PATCHED_FILES:
-                            log.debug(f"Applied UTF-8 encoding patch for {file}")
-                            _PATCHED_FILES.add(file)
-                return original_open(file, mode, *args, **kwargs)
-            # Set the environment variable for good measure
-            os.environ['PYTHONUTF8'] = '1'
-            # Apply the patch
-            import builtins
-            builtins.open = patched_open_for_panphon
-            log.info("Applied panphon UTF-8 patch for Windows")
-            # We've already applied the patch above
-        except ImportError:
-            log.warning("Could not patch panphon (not installed)")
-# Apply patches when module is imported
-apply_patches()
-log.debug("Patches module initialized")

turkic_translit/rules/ar_lat.rules DELETED Viewed

@@ -1,15 +0,0 @@
-# consonants
-ب > b ;  پ > p ;  ت > t ;  ج > j ;  چ > ch ;
-ح > h ;  خ > x ;  د > d ;  ر > r ;  ز > z ;  س > s ;
-ش > sh ; ص > s ;  ط > t ;  غ > gh ;
-ف > f ;  ق > q ;  ك > k ;  گ > g ;  ل > l ;  م > m ;  ن > n ;
-ه > h ;  ھ > h ;  ژ > zh ;  ڭ > ng ;  ۋ > w ;
-# vowels  (hamza carrier ئ can be dropped or mapped to ')
-ا > a ;   ە > e ;   ۆ > ö ;   و > o ;
-ۇ > u ;   ۈ > ü ;   ى > i ;   ې > ë ;
-# glottals
-ء > ' ;   ع > ' ;   ئ > ;
-:: NFC ;

turkic_translit/rules/kk_ipa.rules DELETED Viewed

@@ -1,47 +0,0 @@
-# Kazakh → IPA transliteration rules (kk_ipa.rules)
-# One line per Cyrillic letter.  Right-hand side is plain IPA (no slashes).  NFC-normalised.
-А > ɑ ;  а > ɑ ;
-Ә > æ ;  ә > æ ;
-Б > b ;  б > b ;
-В > v ;  в > v ;
-Г > ɡ ;  г > ɡ ;
-Ғ > ʁ ;  ғ > ʁ ;
-Д > d ;  д > d ;
-Е > e ;  е > e ;
-Ё > jo ; ё > jo ;
-Ж > ʒ ;  ж > ʒ ;
-З > z ;  з > z ;
-И > i ;  и > i ;
-Й > j ;  й > j ;
-К > k ;  к > k ;
-Қ > q ;  қ > q ;
-Л > l ;  л > l ;
-М > m ;  м > m ;
-Н > n ;  н > n ;
-Ң > ŋ ;  ң > ŋ ;
-О > o ;  о > o ;
-Ө > ø ;  ө > ø ;
-П > p ;  п > p ;
-Р > r ;  р > r ;
-С > s ;  с > s ;
-Т > t ;  т > t ;
-У > u ;  у > u ;
-Ұ > ʊ ;  ұ > ʊ ;
-Ү > y ;  ү > y ;
-Ф > f ;  ф > f ;
-Х > x ;  х > x ;
-Һ > h ;  һ > h ;
-Ц > ts ; ц > ts ;
-Ч > t͡ʃ ;  ч > t͡ʃ ;
-Ш > ʃ ;  ш > ʃ ;
-Щ > ɕː ; щ > ɕː ;
-Ъ > ʔ ;  ъ > ʔ ;
-Ы > ɯ ;  ы > ɯ ;
-І > ɪ ;  і > ɪ ;
-Ь > ;   ь > ;
-Э > e ;  э > e ;
-Ю > ju ; ю > ju ;
-Я > ja ; я > ja ;
-:: NFC ;

turkic_translit/rules/kk_lat2023.rules DELETED Viewed

@@ -1,49 +0,0 @@
-# Official Kazakh Latin alphabet (April 2021)
-# https://en.wikipedia.org/wiki/Kazakh_alphabets
-А > A ;  а > a ;
-Ә > Ä ;  ә > ä ;
-Б > B ;  б > b ;
-В > V ;  в > v ;
-Г > G ;  г > g ;
-Ғ > Ğ ;  ғ > ğ ;
-Д > D ;  д > d ;
-Е > E ;  е > e ;
-Ж > J ;  ж > j ;
-З > Z ;  з > z ;
-И > İ ;  и > i ;   # dotted İ/i
-Й > İ ;  й > i ;   # official merging per standard (ambiguity known)
-І > I ;  і > ı ;   # corrected: dotless lowercase ı
-К > K ;  к > k ;
-Қ > Q ;  қ > q ;
-Л > L ;  л > l ;
-М > M ;  м > m ;
-Н > N ;  н > n ;
-Ң > Ñ ;  ң > ñ ;
-О > O ;  о > o ;
-Ө > Ö ;  ө > ö ;
-П > P ;  п > p ;
-Р > R ;  р > r ;
-С > S ;  с > s ;
-Т > T ;  т > t ;
-У > U ;  у > u ;
-Ұ > Ū ;  ұ > ū ;
-Ү > Ü ;  ү > ü ;
-Ф > F ;  ф > f ;
-Х > H ;  х > h ;
-Һ > H ;  һ > h ;
-# Russian loan letters (clearly marked, NOT official Kazakh letters)
-Ё > Yo ;  ё > yo ;
-Э > Ė  ;  э > ė ;
-Ц > Ts ;  ц > ts ;
-Ч > Ch ;  ч > ch ;
-Ш > Ş  ;  ш > ş ;
-Щ > Şç ; щ > şç ;
-Ы > Y  ;  ы > y ;
-Ю > Yu ; ю > yu ;
-Я > Ya ; я > ya ;
-Ъ > ;   ъ > ;     # dropped entirely
-Ь > ;   ь > ;     # dropped entirely
-:: NFC ;

turkic_translit/rules/ky_ipa.rules DELETED Viewed

@@ -1,43 +0,0 @@
-# Kyrgyz → IPA transliteration rules (ky_ipa.rules)
-# One line per Cyrillic letter; NFC‐normalised; IPA given without slashes.
-А > a ;  а > a ;
-Б > b ;  б > b ;
-В > v ;  в > v ;
-Г > ɡ ;  г > ɡ ;
-Ғ > ʁ ;  ғ > ʁ ;
-Д > d ;  д > d ;
-Е > e ;  е > e ;
-Ё > jo ; ё > jo ;
-Ж > d͡ʒ ;  ж > d͡ʒ ;
-З > z ;  з > z ;
-И > i ;  и > i ;
-Й > j ;  й > j ;
-К > k ;  к > k ;
-Қ > q ;  қ > q ;
-Л > l ;  л > l ;
-М > m ;  м > m ;
-Н > n ;  н > n ;
-Ң > ŋ ;  ң > ŋ ;
-О > o ;  о > o ;
-Ө > ø ;  ө > ø ;
-П > p ;  п > p ;
-Р > r ;  р > r ;
-С > s ;  с > s ;
-Т > t ;  т > t ;
-У > u ;  у > u ;
-Ү > y ;  ү > y ;
-Ф > f ;  ф > f ;
-Х > x ;  х > x ;
-Ц > ts ; ц > ts ;
-Ч > t͡ʃ ;  ч > t͡ʃ ;
-Ш > ʃ ;  ш > ʃ ;
-Щ > ɕː ; щ > ɕː ;
-Ы > ɯ ;  ы > ɯ ;
-Э > ɛ ;  э > ɛ ;
-Ю > ju ; ю > ju ;
-Я > ja ; я > ja ;
-Ъ > ʔ ;  ъ > ʔ ;
-Ь > ;   ь > ;
-:: NFC ;

turkic_translit/rules/ky_lat2023.rules DELETED Viewed

@@ -1,40 +0,0 @@
-# Kyrgyz → Modern Practical Latin Transliteration (NFC)
-# One line per pair; use pre‑composed Unicode code‑points; digraphs are atomic tokens.
-А > A ;  а > a ;
-Б > B ;  б > b ;
-В > V ;  в > v ;
-Г > G ;  г > g ;
-Д > D ;  д > d ;
-Е > E ;  е > e ;
-Ё > Yo ; ё > yo ;
-Ж > J ;  ж > j ;
-З > Z ;  з > z ;
-И > İ ;  и > i ;   # dotted I
-Й > Ý ;  й > ý ;   # /j/ glide
-К > K ;  к > k ;
-Л > L ;  л > l ;
-М > M ;  м > m ;
-Н > N ;  н > n ;
-Ң > Ñ ;  ң > ñ ;
-О > O ;  о > o ;
-Ө > Ö ;  ө > ö ;
-П > P ;  п > p ;
-Р > R ;  р > r ;
-С > S ;  с > s ;
-Т > T ;  т > t ;
-У > U ;  у > u ;
-Ү > Ü ;  ү > ü ;
-Ф > F ;  ф > f ;
-Х > H ;  х > h ;
-Ц > Ts ; ц > ts ;  # digraph
-Ч > Ç ;  ч > ç ;
-Ш > Ş ;  ш > ş ;
-Щ > Şç ; щ > şç ;  # digraph
-Ы > Y ;  ы > y ;
-Э > É ;  э > é ;
-Ю > Yu ; ю > yu ;   # digraph
-Я > Ya ; я > ya ;   # digraph
-Ъ > ʼ ;  ъ > ʼ ;    # modifier apostrophe U+02BC
-Ь > ʼ ;  ь > ʼ ;
-:: NFC ;

turkic_translit/sanity.py DELETED Viewed

@@ -1,25 +0,0 @@
-"""Helper functions for Levenshtein and byte checks."""
-from rapidfuzz.distance import Levenshtein
-import io
-import os, re, unicodedata as ud
-def median_lev(file_lat:str, file_ipa:str, sample:int=5000) -> float:
-    from statistics import median
-    m=[]
-    with io.open(file_lat, encoding="utf8") as f1, io.open(file_ipa, encoding="utf8") as f2:
-        for i,(l,i_) in enumerate(zip(f1,f2)):
-            if i==sample: break
-            m.append(Levenshtein.normalized_distance(l.strip(), i_.strip()))
-    return median(m)
-def bytes_per_char(filename:str)->float:
-    import os, io
-    b = os.path.getsize(filename)
-    with io.open(filename, encoding="utf8") as f:
-        chars = sum(len(line) for line in f)
-    return b / chars
-def is_nfc(filename:str)->bool:
-    import unicodedata, io
-    with io.open(filename, encoding="utf8") as f:
-        return all(unicodedata.is_normalized("NFC", line) for line in f)