wagner-austin
committed on
Commit
·
1a0742b
1
Parent(s):
68c6a96
Updated requirements to point at the turkic-transliterate Python package and removed the files here.
Browse files- requirements.txt +2 -0
- turkic_translit/__init__.py +0 -9
- turkic_translit/cli.py +0 -109
- turkic_translit/core.py +0 -38
- turkic_translit/logging_config.py +0 -58
- turkic_translit/patches.py +0 -77
- turkic_translit/rules/ar_lat.rules +0 -15
- turkic_translit/rules/kk_ipa.rules +0 -47
- turkic_translit/rules/kk_lat2023.rules +0 -49
- turkic_translit/rules/ky_ipa.rules +0 -43
- turkic_translit/rules/ky_lat2023.rules +0 -40
- turkic_translit/sanity.py +0 -25
requirements.txt
CHANGED
@@ -1,3 +1,5 @@
|
|
|
|
|
|
1 |
# Core dependencies
|
2 |
epitran>=1.0,<1.27
|
3 |
fasttext-wheel==0.9.2
|
|
|
1 |
+
turkic-transliterate>=0.1.0
|
2 |
+
|
3 |
# Core dependencies
|
4 |
epitran>=1.0,<1.27
|
5 |
fasttext-wheel==0.9.2
|
turkic_translit/__init__.py
DELETED
@@ -1,9 +0,0 @@
|
|
1 |
-
from importlib.metadata import version
|
2 |
-
# Set up logging first before any other operations
|
3 |
-
from .logging_config import setup as _log_setup; _log_setup()
|
4 |
-
# Import patches next to ensure they're applied before other imports
|
5 |
-
from . import patches
|
6 |
-
from .core import to_latin, to_ipa
|
7 |
-
|
8 |
-
__all__ = ["to_latin", "to_ipa"]
|
9 |
-
__version__ = "0.1.0"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
turkic_translit/cli.py
DELETED
@@ -1,109 +0,0 @@
|
|
1 |
-
import sys, argparse, pathlib, time, os, logging
|
2 |
-
from .core import to_latin, to_ipa
|
3 |
-
from .logging_config import setup as _log_setup
|
4 |
-
|
5 |
-
# Initialize logger
|
6 |
-
log = logging.getLogger(__name__)
|
7 |
-
|
8 |
-
def main() -> None:
    """Run the transliteration command-line interface.

    Reads text line by line from ``--in`` (``-`` means stdin), writes the
    Latin transliteration of every line to ``--out_latin`` (``-`` means
    stdout) and, when ``--ipa`` is given, the IPA transcription of every
    line to ``--out_ipa``.

    Exits through ``argparse`` when ``--ipa`` is used without ``--out_ipa``,
    and with status 1 when the input cannot be decoded with the expected
    encoding.
    """
    # Function-scope import so the block does not rely on a new file-level import.
    from contextlib import ExitStack

    ap = argparse.ArgumentParser(description="Turkic transliteration")
    ap.add_argument("--lang", required=True, choices=["kk", "ky"])
    ap.add_argument("--ipa", action="store_true", help="produce IPA")
    ap.add_argument("--arabic", action="store_true", help="also transliterate Arabic script")
    ap.add_argument("--in", dest="inp", default="-")
    ap.add_argument("--out_latin", default="-")
    ap.add_argument("--out_ipa")
    ap.add_argument("--benchmark", action="store_true")
    ap.add_argument("--log-level", choices=["debug", "info", "warning", "error", "critical"],
                    default="info",
                    help="Set logging level (default: info)")
    args = ap.parse_args()

    # Reject inconsistent options before any file is opened, so a bad
    # invocation never truncates an output file or leaves handles open.
    if args.ipa and not args.out_ipa:
        ap.error("--ipa requires --out_ipa")

    # Publish the requested level through the environment, then (re)configure logging.
    os.environ["TURKIC_LOG_LEVEL"] = args.log_level.upper()
    _log_setup()

    outputs = ["latin"]
    if args.ipa:
        outputs.append("ipa")
    # Use Rich markup for output modes (magenta)
    outputs_markup = ", ".join(f"[magenta]{o}[/]" for o in outputs)
    log.info(
        f"Starting transliteration: lang={args.lang}, input={args.inp}, outputs={outputs_markup}, "
        f"out_latin={args.out_latin}, out_ipa={args.out_ipa}, arabic={args.arabic}, benchmark={args.benchmark}"
    )

    # Use UTF-8-sig for Windows to include BOM for proper encoding support
    encoding = "utf-8-sig" if sys.platform == "win32" else "utf-8"

    start = time.time()
    n = 0

    try:
        # ExitStack closes every file we opened (and the progress bar) even
        # when transliteration raises; stdin/stdout are never registered, so
        # they are left open.
        with ExitStack() as stack:
            if args.inp == "-":
                fin = sys.stdin
            else:
                fin = stack.enter_context(open(args.inp, encoding=encoding))
            if args.out_latin == "-":
                fo_l = sys.stdout
            else:
                fo_l = stack.enter_context(open(args.out_latin, "w", encoding=encoding))
            fo_i = None
            if args.ipa:
                fo_i = stack.enter_context(open(args.out_ipa, "w", encoding=encoding))

            # A progress bar is only useful when stderr is a terminal, and
            # only possible when the input is a real file we can read twice.
            pbar = None
            if sys.stderr.isatty() and args.inp != "-":
                try:
                    from tqdm import tqdm
                except ImportError:
                    log.debug("tqdm not available, falling back to basic processing")
                else:
                    # Count the lines first, then rewind for the real pass.
                    total_lines = sum(1 for _ in fin)
                    fin.seek(0)
                    pbar = tqdm(total=total_lines, unit="lines")
                    stack.callback(pbar.close)
                    log.debug("Using tqdm progress bar for %d lines", total_lines)

            for line in fin:
                text = line.rstrip("\n")
                fo_l.write(to_latin(text, args.lang, args.arabic) + "\n")
                if fo_i is not None:
                    fo_i.write(to_ipa(text, args.lang) + "\n")
                n += 1
                if pbar is not None:
                    pbar.update(1)

            destination = args.out_latin if args.out_latin != "-" else "stdout"
            ipa_suffix = f" and {args.out_ipa}" if args.ipa else ""
            log.info(f"Finished writing {n} lines to {destination}" + ipa_suffix)
    except UnicodeDecodeError as e:
        # Decoding happens while reading, not in open(), so the handler has
        # to wrap the whole read/process phase.
        sys.stderr.write(f"Encoding error: {e}\n")
        sys.stderr.write("If you're on Windows, make sure your input file is properly encoded in UTF-8.\n")
        sys.exit(1)

    elapsed = time.time() - start
    rate = n / elapsed if elapsed > 0 else 0
    # Always log processing statistics, but at different levels based on benchmark flag
    if args.benchmark:
        log.info("Processed %d lines in %.2fs (%.0f lines/s)", n, elapsed, rate)
    else:
        log.debug("Processed %d lines in %.2fs (%.0f lines/s)", n, elapsed, rate)
    log.info("Transliteration complete.")
|
106 |
-
|
107 |
-
# This is the entry point when the module is run directly
|
108 |
-
if __name__ == "__main__":
|
109 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
turkic_translit/core.py
DELETED
@@ -1,38 +0,0 @@
|
|
1 |
-
"""Public API for Latin and IPA transliteration."""
|
2 |
-
try:
|
3 |
-
import icu # noqa: F401
|
4 |
-
except ImportError as e: # PyICU wheel is still missing
|
5 |
-
raise RuntimeError(
|
6 |
-
"PyICU missing. On Windows run:\n"
|
7 |
-
" python scripts/get_pyicu_wheel.py\n"
|
8 |
-
"or manually install a wheel from "
|
9 |
-
"https://github.com/cgohlke/pyicu-build/releases ."
|
10 |
-
) from e
|
11 |
-
|
12 |
-
from functools import lru_cache
|
13 |
-
import unicodedata as ud
|
14 |
-
from pathlib import Path
|
15 |
-
|
16 |
-
|
17 |
-
_RULE_DIR = Path(__file__).with_suffix("").parent / "rules"
|
18 |
-
|
19 |
-
@lru_cache
def _icu_trans(name: str) -> icu.Transliterator:
    """Return the ICU transliterator built from the rule file *name*.

    *name* is a file name inside the package's rule directory. Results are
    memoised by ``lru_cache``, so each rule file is read and compiled at
    most once per distinct name.
    """
    rule_path = _RULE_DIR / name
    rules = rule_path.read_text(encoding="utf8")
    # NOTE(review): the trailing 0 is presumably ICU's "forward" direction
    # constant; confirm against the PyICU documentation.
    return icu.Transliterator.createFromRules(name, rules, 0)
|
23 |
-
|
24 |
-
def to_latin(text: str, lang: str, include_arabic: bool = False) -> str:
    """Transliterate *text* to Latin script and return it NFC-normalised.

    Args:
        text: Input text.
        lang: ``"kk"`` or ``"ky"``; selects the ``<lang>_lat2023.rules`` file.
        include_arabic: When true, the ``ar_lat.rules`` transliterator is
            applied to *text* before the language-specific rules.

    Raises:
        ValueError: If *lang* is neither ``"kk"`` nor ``"ky"``.
    """
    if lang != "kk" and lang != "ky":
        raise ValueError("lang must be 'kk' or 'ky'")

    latin_rules = _icu_trans(f"{lang}_lat2023.rules")
    if include_arabic:
        # Arabic-script letters are converted first; the language-specific
        # rules then run over the result.
        text = _icu_trans("ar_lat.rules").transliterate(text)
    return ud.normalize("NFC", latin_rules.transliterate(text))
|
34 |
-
|
35 |
-
|
36 |
-
def to_ipa(text: str, lang: str) -> str:
    """Transcribe *text* to IPA and return it NFC-normalised.

    Args:
        text: Input text.
        lang: ``"kk"`` or ``"ky"``; selects the ``<lang>_ipa.rules`` file.

    Raises:
        ValueError: If *lang* is neither ``"kk"`` nor ``"ky"``.
    """
    # Validate up front, as to_latin does, so an unsupported code is
    # reported clearly instead of being interpolated into a file name.
    if lang not in ("kk", "ky"):
        raise ValueError("lang must be 'kk' or 'ky'")
    trans = _icu_trans(f"{lang}_ipa.rules")
    return ud.normalize("NFC", trans.transliterate(text))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
turkic_translit/logging_config.py
DELETED
@@ -1,58 +0,0 @@
|
|
1 |
-
"""
|
2 |
-
Centralized logging configuration module.
|
3 |
-
Uses Rich for colorized output if available with fallback to standard library.
|
4 |
-
"""
|
5 |
-
import logging
|
6 |
-
import os
|
7 |
-
import sys
|
8 |
-
from functools import lru_cache
|
9 |
-
|
10 |
-
# Get log level from environment or default to INFO
|
11 |
-
LOG_LEVEL = os.environ.get("TURKIC_LOG_LEVEL", "INFO").upper()
|
12 |
-
|
13 |
-
def setup():
    """Configure root logging and return the ``turkic_translit`` logger.

    The level is read from the ``TURKIC_LOG_LEVEL`` environment variable on
    every call (defaulting to INFO), so a caller may set the variable and
    call ``setup()`` again to change the level at runtime.

    The handler is installed only once: the first call clears the root
    logger's existing handlers and installs a Rich handler when ``rich`` is
    importable, otherwise a plain stderr handler. Later calls only update
    the level.
    """
    root_logger = logging.getLogger()

    # Read the environment at call time rather than relying on a value
    # captured when the module was imported.
    level_name = os.environ.get("TURKIC_LOG_LEVEL", "INFO").upper()
    log_level = getattr(logging, level_name, logging.INFO)
    if not isinstance(log_level, int):
        # The name matched some non-level attribute of the logging module.
        log_level = logging.INFO
    root_logger.setLevel(log_level)

    # Our handler carries a marker attribute so repeated calls do not stack
    # duplicate handlers or wipe handlers added after the first call.
    already_installed = any(
        getattr(existing, "_turkic_translit_handler", False)
        for existing in root_logger.handlers
    )
    if not already_installed:
        # Clear any existing handlers
        for existing in root_logger.handlers[:]:
            root_logger.removeHandler(existing)

        # Try to use Rich for pretty, colorized output
        try:
            from rich.logging import RichHandler

            handler = RichHandler(
                rich_tracebacks=True,
                markup=True,
                show_time=False,
                show_path=False,
            )
            formatter = logging.Formatter("%(message)s")
        except ImportError:
            # Fall back to standard logging if Rich is not available
            handler = logging.StreamHandler(sys.stderr)
            formatter = logging.Formatter("%(levelname)s: %(message)s")

        handler.setFormatter(formatter)
        handler._turkic_translit_handler = True
        root_logger.addHandler(handler)

    logger = logging.getLogger("turkic_translit")
    logger.debug("Logging initialized at level %s", level_name)

    return logger
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
turkic_translit/patches.py
DELETED
@@ -1,77 +0,0 @@
|
|
1 |
-
"""
|
2 |
-
Patches for third-party libraries to fix encoding issues on Windows.
|
3 |
-
This module is imported automatically at startup.
|
4 |
-
"""
|
5 |
-
import os
|
6 |
-
import sys
|
7 |
-
import functools
|
8 |
-
import logging
|
9 |
-
from .logging_config import setup; setup()
|
10 |
-
|
11 |
-
log = logging.getLogger(__name__)
|
12 |
-
_PATCH_DONE = False
|
13 |
-
_PATCHED_FILES = set()
|
14 |
-
|
15 |
-
def _fix_broken_ssl_cert_env():
    """Drop ``SSL_CERT_FILE`` from the environment when it names a missing file.

    If the user (often Conda on Windows) left SSL_CERT_FILE pointing at a
    non-existent bundle, httpx ⇢ gradio will crash on import. Removing the
    variable lets Python fall back to the system certificates. A variable
    that is unset, empty, or points at an existing path is left untouched.
    """
    import logging
    import os
    import pathlib

    cert_path = os.environ.get("SSL_CERT_FILE")
    if not cert_path:
        return
    if pathlib.Path(cert_path).exists():
        return

    logging.getLogger(__name__).warning(
        "SSL_CERT_FILE=%s does not exist – removing the variable so "
        "httpx can create a default context", cert_path)
    del os.environ["SSL_CERT_FILE"]
|
30 |
-
|
31 |
-
def apply_patches():
    """Apply all necessary patches for third-party libraries.

    The SSL_CERT_FILE check runs on every call. Everything else runs once
    per process, guarded by the module-level ``_PATCH_DONE`` flag. On
    Windows, when ``panphon`` is importable, ``builtins.open`` is replaced
    by a wrapper that adds ``encoding='utf-8'`` to read-mode opens of
    ``.csv`` string paths that do not already pass an ``encoding`` keyword.
    """
    global _PATCH_DONE

    # Ensure SSL_CERT_FILE is valid before any third-party import.
    _fix_broken_ssl_cert_env()

    if _PATCH_DONE:
        log.debug("Patches already applied, skipping")
        return
    _PATCH_DONE = True

    # The panphon encoding issue only affects Windows.
    if sys.platform != 'win32':
        return

    try:
        import panphon.featuretable
    except ImportError:
        log.warning("Could not patch panphon (not installed)")
        return

    import builtins

    # Keep a reference to the real open() before it is replaced.
    original_open = open

    def patched_open_for_panphon(file, mode='r', *args, **kwargs):
        # Only plain-string .csv paths opened with mode 'r' are touched,
        # and only once panphon has been imported.
        is_panphon_csv = (
            'panphon' in sys.modules
            and mode == 'r'
            and isinstance(file, str)
            and file.endswith('.csv')
        )
        if is_panphon_csv and 'encoding' not in kwargs:
            kwargs['encoding'] = 'utf-8'
            # Only log the first time per unique file
            if file not in _PATCHED_FILES:
                log.debug(f"Applied UTF-8 encoding patch for {file}")
                _PATCHED_FILES.add(file)
        return original_open(file, mode, *args, **kwargs)

    # Set the environment variable for good measure
    os.environ['PYTHONUTF8'] = '1'

    builtins.open = patched_open_for_panphon
    log.info("Applied panphon UTF-8 patch for Windows")
|
74 |
-
|
75 |
-
# Apply patches when module is imported
|
76 |
-
apply_patches()
|
77 |
-
log.debug("Patches module initialized")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
turkic_translit/rules/ar_lat.rules
DELETED
@@ -1,15 +0,0 @@
|
|
1 |
-
# consonants
|
2 |
-
ب > b ; پ > p ; ت > t ; ج > j ; چ > ch ;
|
3 |
-
ح > h ; خ > x ; د > d ; ر > r ; ز > z ; س > s ;
|
4 |
-
ش > sh ; ص > s ; ط > t ; غ > gh ;
|
5 |
-
ف > f ; ق > q ; ك > k ; گ > g ; ل > l ; م > m ; ن > n ;
|
6 |
-
ه > h ; ھ > h ; ژ > zh ; ڭ > ng ; ۋ > w ;
|
7 |
-
|
8 |
-
# vowels (hamza carrier ئ can be dropped or mapped to ')
|
9 |
-
ا > a ; ە > e ; ۆ > ö ; و > o ;
|
10 |
-
ۇ > u ; ۈ > ü ; ى > i ; ې > ë ;
|
11 |
-
|
12 |
-
# glottals
|
13 |
-
ء > ' ; ع > ' ; ئ > ;
|
14 |
-
|
15 |
-
:: NFC ;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
turkic_translit/rules/kk_ipa.rules
DELETED
@@ -1,47 +0,0 @@
|
|
1 |
-
# Kazakh → IPA transliteration rules (kk_ipa.rules)
|
2 |
-
# One line per Cyrillic letter. Right-hand side is plain IPA (no slashes). NFC-normalised.
|
3 |
-
|
4 |
-
А > ɑ ; а > ɑ ;
|
5 |
-
Ә > æ ; ә > æ ;
|
6 |
-
Б > b ; б > b ;
|
7 |
-
В > v ; в > v ;
|
8 |
-
Г > ɡ ; г > ɡ ;
|
9 |
-
Ғ > ʁ ; ғ > ʁ ;
|
10 |
-
Д > d ; д > d ;
|
11 |
-
Е > e ; е > e ;
|
12 |
-
Ё > jo ; ё > jo ;
|
13 |
-
Ж > ʒ ; ж > ʒ ;
|
14 |
-
З > z ; з > z ;
|
15 |
-
И > i ; и > i ;
|
16 |
-
Й > j ; й > j ;
|
17 |
-
К > k ; к > k ;
|
18 |
-
Қ > q ; қ > q ;
|
19 |
-
Л > l ; л > l ;
|
20 |
-
М > m ; м > m ;
|
21 |
-
Н > n ; н > n ;
|
22 |
-
Ң > ŋ ; ң > ŋ ;
|
23 |
-
О > o ; о > o ;
|
24 |
-
Ө > ø ; ө > ø ;
|
25 |
-
П > p ; п > p ;
|
26 |
-
Р > r ; р > r ;
|
27 |
-
С > s ; с > s ;
|
28 |
-
Т > t ; т > t ;
|
29 |
-
У > u ; у > u ;
|
30 |
-
Ұ > ʊ ; ұ > ʊ ;
|
31 |
-
Ү > y ; ү > y ;
|
32 |
-
Ф > f ; ф > f ;
|
33 |
-
Х > x ; х > x ;
|
34 |
-
Һ > h ; һ > h ;
|
35 |
-
Ц > ts ; ц > ts ;
|
36 |
-
Ч > t͡ʃ ; ч > t͡ʃ ;
|
37 |
-
Ш > ʃ ; ш > ʃ ;
|
38 |
-
Щ > ɕː ; щ > ɕː ;
|
39 |
-
Ъ > ʔ ; ъ > ʔ ;
|
40 |
-
Ы > ɯ ; ы > ɯ ;
|
41 |
-
І > ɪ ; і > ɪ ;
|
42 |
-
Ь > ; ь > ;
|
43 |
-
Э > e ; э > e ;
|
44 |
-
Ю > ju ; ю > ju ;
|
45 |
-
Я > ja ; я > ja ;
|
46 |
-
|
47 |
-
:: NFC ;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
turkic_translit/rules/kk_lat2023.rules
DELETED
@@ -1,49 +0,0 @@
|
|
1 |
-
# Official Kazakh Latin alphabet (April 2021)
|
2 |
-
# https://en.wikipedia.org/wiki/Kazakh_alphabets
|
3 |
-
|
4 |
-
А > A ; а > a ;
|
5 |
-
Ә > Ä ; ә > ä ;
|
6 |
-
Б > B ; б > b ;
|
7 |
-
В > V ; в > v ;
|
8 |
-
Г > G ; г > g ;
|
9 |
-
Ғ > Ğ ; ғ > ğ ;
|
10 |
-
Д > D ; д > d ;
|
11 |
-
Е > E ; е > e ;
|
12 |
-
Ж > J ; ж > j ;
|
13 |
-
З > Z ; з > z ;
|
14 |
-
И > İ ; и > i ; # dotted İ/i
|
15 |
-
Й > İ ; й > i ; # official merging per standard (ambiguity known)
|
16 |
-
І > I ; і > ı ; # corrected: dotless lowercase ı
|
17 |
-
К > K ; к > k ;
|
18 |
-
Қ > Q ; қ > q ;
|
19 |
-
Л > L ; л > l ;
|
20 |
-
М > M ; м > m ;
|
21 |
-
Н > N ; н > n ;
|
22 |
-
Ң > Ñ ; ң > ñ ;
|
23 |
-
О > O ; о > o ;
|
24 |
-
Ө > Ö ; ө > ö ;
|
25 |
-
П > P ; п > p ;
|
26 |
-
Р > R ; р > r ;
|
27 |
-
С > S ; с > s ;
|
28 |
-
Т > T ; т > t ;
|
29 |
-
У > U ; у > u ;
|
30 |
-
Ұ > Ū ; ұ > ū ;
|
31 |
-
Ү > Ü ; ү > ü ;
|
32 |
-
Ф > F ; ф > f ;
|
33 |
-
Х > H ; х > h ;
|
34 |
-
Һ > H ; һ > h ;
|
35 |
-
|
36 |
-
# Russian loan letters (clearly marked, NOT official Kazakh letters)
|
37 |
-
Ё > Yo ; ё > yo ;
|
38 |
-
Э > Ė ; э > ė ;
|
39 |
-
Ц > Ts ; ц > ts ;
|
40 |
-
Ч > Ch ; ч > ch ;
|
41 |
-
Ш > Ş ; ш > ş ;
|
42 |
-
Щ > Şç ; щ > şç ;
|
43 |
-
Ы > Y ; ы > y ;
|
44 |
-
Ю > Yu ; ю > yu ;
|
45 |
-
Я > Ya ; я > ya ;
|
46 |
-
Ъ > ; ъ > ; # dropped entirely
|
47 |
-
Ь > ; ь > ; # dropped entirely
|
48 |
-
|
49 |
-
:: NFC ;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
turkic_translit/rules/ky_ipa.rules
DELETED
@@ -1,43 +0,0 @@
|
|
1 |
-
# Kyrgyz → IPA transliteration rules (ky_ipa.rules)
|
2 |
-
# One line per Cyrillic letter; NFC‐normalised; IPA given without slashes.
|
3 |
-
|
4 |
-
А > a ; а > a ;
|
5 |
-
Б > b ; б > b ;
|
6 |
-
В > v ; в > v ;
|
7 |
-
Г > ɡ ; г > ɡ ;
|
8 |
-
Ғ > ʁ ; ғ > ʁ ;
|
9 |
-
Д > d ; д > d ;
|
10 |
-
Е > e ; е > e ;
|
11 |
-
Ё > jo ; ё > jo ;
|
12 |
-
Ж > d͡ʒ ; ж > d͡ʒ ;
|
13 |
-
З > z ; з > z ;
|
14 |
-
И > i ; и > i ;
|
15 |
-
Й > j ; й > j ;
|
16 |
-
К > k ; к > k ;
|
17 |
-
Қ > q ; қ > q ;
|
18 |
-
Л > l ; л > l ;
|
19 |
-
М > m ; м > m ;
|
20 |
-
Н > n ; н > n ;
|
21 |
-
Ң > ŋ ; ң > ŋ ;
|
22 |
-
О > o ; о > o ;
|
23 |
-
Ө > ø ; ө > ø ;
|
24 |
-
П > p ; п > p ;
|
25 |
-
Р > r ; р > r ;
|
26 |
-
С > s ; с > s ;
|
27 |
-
Т > t ; т > t ;
|
28 |
-
У > u ; у > u ;
|
29 |
-
Ү > y ; ү > y ;
|
30 |
-
Ф > f ; ф > f ;
|
31 |
-
Х > x ; х > x ;
|
32 |
-
Ц > ts ; ц > ts ;
|
33 |
-
Ч > t͡ʃ ; ч > t͡ʃ ;
|
34 |
-
Ш > ʃ ; ш > ʃ ;
|
35 |
-
Щ > ɕː ; щ > ɕː ;
|
36 |
-
Ы > ɯ ; ы > ɯ ;
|
37 |
-
Э > ɛ ; э > ɛ ;
|
38 |
-
Ю > ju ; ю > ju ;
|
39 |
-
Я > ja ; я > ja ;
|
40 |
-
Ъ > ʔ ; ъ > ʔ ;
|
41 |
-
Ь > ; ь > ;
|
42 |
-
|
43 |
-
:: NFC ;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
turkic_translit/rules/ky_lat2023.rules
DELETED
@@ -1,40 +0,0 @@
|
|
1 |
-
# Kyrgyz → Modern Practical Latin Transliteration (NFC)
|
2 |
-
# One line per pair; use pre‑composed Unicode code‑points; digraphs are atomic tokens.
|
3 |
-
А > A ; а > a ;
|
4 |
-
Б > B ; б > b ;
|
5 |
-
В > V ; в > v ;
|
6 |
-
Г > G ; г > g ;
|
7 |
-
Д > D ; д > d ;
|
8 |
-
Е > E ; е > e ;
|
9 |
-
Ё > Yo ; ё > yo ;
|
10 |
-
Ж > J ; ж > j ;
|
11 |
-
З > Z ; з > z ;
|
12 |
-
И > İ ; и > i ; # dotted I
|
13 |
-
Й > Ý ; й > ý ; # /j/ glide
|
14 |
-
К > K ; к > k ;
|
15 |
-
Л > L ; л > l ;
|
16 |
-
М > M ; м > m ;
|
17 |
-
Н > N ; н > n ;
|
18 |
-
Ң > Ñ ; ң > ñ ;
|
19 |
-
О > O ; о > o ;
|
20 |
-
Ө > Ö ; ө > ö ;
|
21 |
-
П > P ; п > p ;
|
22 |
-
Р > R ; р > r ;
|
23 |
-
С > S ; с > s ;
|
24 |
-
Т > T ; т > t ;
|
25 |
-
У > U ; у > u ;
|
26 |
-
Ү > Ü ; ү > ü ;
|
27 |
-
Ф > F ; ф > f ;
|
28 |
-
Х > H ; х > h ;
|
29 |
-
Ц > Ts ; ц > ts ; # digraph
|
30 |
-
Ч > Ç ; ч > ç ;
|
31 |
-
Ш > Ş ; ш > ş ;
|
32 |
-
Щ > Şç ; щ > şç ; # digraph
|
33 |
-
Ы > Y ; ы > y ;
|
34 |
-
Э > É ; э > é ;
|
35 |
-
Ю > Yu ; ю > yu ; # digraph
|
36 |
-
Я > Ya ; я > ya ; # digraph
|
37 |
-
Ъ > ʼ ; ъ > ʼ ; # modifier apostrophe U+02BC
|
38 |
-
Ь > ʼ ; ь > ʼ ;
|
39 |
-
|
40 |
-
:: NFC ;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
turkic_translit/sanity.py
DELETED
@@ -1,25 +0,0 @@
|
|
1 |
-
"""Helper functions for Levenshtein and byte checks."""
|
2 |
-
from rapidfuzz.distance import Levenshtein
|
3 |
-
import io
|
4 |
-
import os, re, unicodedata as ud
|
5 |
-
|
6 |
-
def median_lev(file_lat:str, file_ipa:str, sample:int=5000) -> float:
|
7 |
-
from statistics import median
|
8 |
-
m=[]
|
9 |
-
with io.open(file_lat, encoding="utf8") as f1, io.open(file_ipa, encoding="utf8") as f2:
|
10 |
-
for i,(l,i_) in enumerate(zip(f1,f2)):
|
11 |
-
if i==sample: break
|
12 |
-
m.append(Levenshtein.normalized_distance(l.strip(), i_.strip()))
|
13 |
-
return median(m)
|
14 |
-
|
15 |
-
def bytes_per_char(filename:str)->float:
    """Return the file's size in bytes divided by its character count.

    The size comes from the filesystem; characters are counted after
    decoding the file as UTF-8 in text mode, line endings included.
    Returns ``0.0`` for a file with no characters instead of raising
    ``ZeroDivisionError``.
    """
    import io
    import os

    size_in_bytes = os.path.getsize(filename)
    with io.open(filename, encoding="utf8") as f:
        chars = sum(len(line) for line in f)
    if chars == 0:
        # Empty file: there is no meaningful ratio.
        return 0.0
    return size_in_bytes / chars
|
21 |
-
|
22 |
-
def is_nfc(filename:str)->bool:
    """Return True when every line of the UTF-8 file *filename* is NFC-normalised.

    An empty file counts as normalised. Reading stops at the first line
    that is not in NFC form.
    """
    import io
    import unicodedata

    with io.open(filename, encoding="utf8") as handle:
        for line in handle:
            if not unicodedata.is_normalized("NFC", line):
                return False
    return True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|