wagner-austin committed on
Commit
1a0742b
·
1 Parent(s): 68c6a96

Updated requirements to point at the turkic-transliterate Python package, and removed the package files that were vendored here.

Browse files
requirements.txt CHANGED
@@ -1,3 +1,5 @@
 
 
1
  # Core dependencies
2
  epitran>=1.0,<1.27
3
  fasttext-wheel==0.9.2
 
1
+ turkic-transliterate>=0.1.0
2
+
3
  # Core dependencies
4
  epitran>=1.0,<1.27
5
  fasttext-wheel==0.9.2
turkic_translit/__init__.py DELETED
@@ -1,9 +0,0 @@
1
- from importlib.metadata import version
2
- # Set up logging first before any other operations
3
- from .logging_config import setup as _log_setup; _log_setup()
4
- # Import patches next to ensure they're applied before other imports
5
- from . import patches
6
- from .core import to_latin, to_ipa
7
-
8
- __all__ = ["to_latin", "to_ipa"]
9
- __version__ = "0.1.0"
 
 
 
 
 
 
 
 
 
 
turkic_translit/cli.py DELETED
@@ -1,109 +0,0 @@
1
- import sys, argparse, pathlib, time, os, logging
2
- from .core import to_latin, to_ipa
3
- from .logging_config import setup as _log_setup
4
-
5
- # Initialize logger
6
- log = logging.getLogger(__name__)
7
-
8
def main() -> None:
    """Command-line entry point: transliterate text line by line.

    Reads lines from ``--in`` (``-`` means stdin), writes the Latin
    transliteration to ``--out_latin`` (``-`` means stdout) and, when
    ``--ipa`` is given, the IPA transliteration to ``--out_ipa``.

    Exits with status 2 (via ``argparse``) on invalid flag combinations and
    with status 1 when the input cannot be decoded.
    """
    # Function-scope import so this entry point does not depend on a
    # module-level import being added elsewhere.
    from contextlib import ExitStack

    ap = argparse.ArgumentParser(description="Turkic transliteration")
    ap.add_argument("--lang", required=True, choices=["kk", "ky"])
    ap.add_argument("--ipa", action="store_true", help="produce IPA")
    ap.add_argument("--arabic", action="store_true", help="also transliterate Arabic script")
    ap.add_argument("--in", dest="inp", default="-")
    ap.add_argument("--out_latin", default="-")
    ap.add_argument("--out_ipa")
    ap.add_argument("--benchmark", action="store_true")
    ap.add_argument("--log-level", choices=["debug", "info", "warning", "error", "critical"],
                    default="info",
                    help="Set logging level (default: info)")
    args = ap.parse_args()

    # Validate flag combinations BEFORE opening anything, so a usage error
    # cannot leave a freshly truncated output file behind.
    if args.ipa and not args.out_ipa:
        ap.error("--ipa requires --out_ipa")

    # Publish the requested level through the environment, then (re)run the
    # logging setup.
    os.environ["TURKIC_LOG_LEVEL"] = args.log_level.upper()
    _log_setup()

    outputs = ["latin"]
    if args.ipa:
        outputs.append("ipa")
    # Use Rich markup for output modes (magenta)
    outputs_markup = ", ".join(f"[magenta]{o}[/]" for o in outputs)
    log.info(
        f"Starting transliteration: lang={args.lang}, input={args.inp}, outputs={outputs_markup}, "
        f"out_latin={args.out_latin}, out_ipa={args.out_ipa}, arabic={args.arabic}, benchmark={args.benchmark}"
    )

    # Use UTF-8-sig on Windows so files are read/written with a BOM.
    encoding = "utf-8-sig" if sys.platform == "win32" else "utf-8"

    start = time.time()
    n = 0

    try:
        # ExitStack closes every file we opened (and the progress bar) even
        # if transliteration raises part-way through; stdin/stdout are never
        # registered, so they are left open.
        with ExitStack() as stack:
            if args.inp == "-":
                fin = sys.stdin
            else:
                fin = stack.enter_context(open(args.inp, encoding=encoding))
            if args.out_latin == "-":
                fo_l = sys.stdout
            else:
                fo_l = stack.enter_context(open(args.out_latin, "w", encoding=encoding))
            fo_i = None
            if args.ipa:
                fo_i = stack.enter_context(open(args.out_ipa, "w", encoding=encoding))

            # Progress bar only when stderr is a TTY and the input is a real
            # file (it must be read twice: once to count, once to process).
            pbar = None
            if sys.stderr.isatty() and args.inp != "-":
                try:
                    from tqdm import tqdm
                except ImportError:
                    log.debug("tqdm not available, falling back to basic processing")
                else:
                    total_lines = sum(1 for _ in fin)
                    fin.seek(0)  # Reset file pointer after counting
                    pbar = tqdm(total=total_lines, unit="lines")
                    stack.callback(pbar.close)
                    log.debug("Using tqdm progress bar for %d lines", total_lines)

            for line in fin:
                text = line.rstrip("\n")
                fo_l.write(to_latin(text, args.lang, args.arabic) + "\n")
                if fo_i is not None:
                    fo_i.write(to_ipa(text, args.lang) + "\n")
                n += 1
                if pbar is not None:
                    pbar.update(1)
    except UnicodeDecodeError as e:
        # Decoding happens while reading lines (not in open()), so the
        # handler has to wrap the whole read loop.
        sys.stderr.write(f"Encoding error: {e}\n")
        sys.stderr.write("If you're on Windows, make sure your input file is properly encoded in UTF-8.\n")
        sys.exit(1)

    log.info(f"Finished writing {n} lines to {args.out_latin if args.out_latin != '-' else 'stdout'}" + (f" and {args.out_ipa}" if args.ipa else ""))

    elapsed = time.time() - start
    rate = n / elapsed if elapsed > 0 else 0
    # Always log processing statistics; --benchmark promotes them to INFO.
    report = log.info if args.benchmark else log.debug
    report("Processed %d lines in %.2fs (%.0f lines/s)", n, elapsed, rate)
    log.info("Transliteration complete.")
106
-
107
- # This is the entry point when the module is run directly
108
- if __name__ == "__main__":
109
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
turkic_translit/core.py DELETED
@@ -1,38 +0,0 @@
1
- """Public API for Latin and IPA transliteration."""
2
- try:
3
- import icu # noqa: F401
4
- except ImportError as e: # PyICU wheel is still missing
5
- raise RuntimeError(
6
- "PyICU missing. On Windows run:\n"
7
- " python scripts/get_pyicu_wheel.py\n"
8
- "or manually install a wheel from "
9
- "https://github.com/cgohlke/pyicu-build/releases ."
10
- ) from e
11
-
12
- from functools import lru_cache
13
- import unicodedata as ud
14
- from pathlib import Path
15
-
16
-
17
- _RULE_DIR = Path(__file__).with_suffix("").parent / "rules"
18
-
19
@lru_cache
def _icu_trans(name: str) -> icu.Transliterator:
    """Build (and memoise) an ICU transliterator from a rules file.

    ``name`` is the file name of a rules file inside ``_RULE_DIR``; it is
    also used as the transliterator's ID. Results are cached per ``name``.
    """
    rules_path = _RULE_DIR / name
    rules_text = rules_path.read_text(encoding="utf8")
    # Third argument is the transliteration direction; 0 is presumably
    # "forward" (ICU's UTRANS_FORWARD) -- confirm against the PyICU docs.
    return icu.Transliterator.createFromRules(name, rules_text, 0)
23
-
24
def to_latin(text: str, lang: str, include_arabic: bool = False) -> str:
    """Transliterate ``text`` to Latin script and return it NFC-normalised.

    Args:
        text: Input text.
        lang: Language code, ``"kk"`` or ``"ky"``; selects the
            ``<lang>_lat2023.rules`` rule file.
        include_arabic: When true, the ``ar_lat.rules`` transliterator is
            applied to ``text`` first, before the language rules.

    Raises:
        ValueError: If ``lang`` is not ``"kk"`` or ``"ky"``.
    """
    if lang != "kk" and lang != "ky":
        raise ValueError("lang must be 'kk' or 'ky'")

    # Load the language transliterator first (same load order as before),
    # then optionally pre-process Arabic-script characters.
    lang_trans = _icu_trans(f"{lang}_lat2023.rules")
    source = _icu_trans("ar_lat.rules").transliterate(text) if include_arabic else text
    return ud.normalize("NFC", lang_trans.transliterate(source))
34
-
35
-
36
def to_ipa(text: str, lang: str) -> str:
    """Transliterate ``text`` to IPA and return it NFC-normalised.

    Args:
        text: Input text.
        lang: Language code, ``"kk"`` or ``"ky"``; selects the
            ``<lang>_ipa.rules`` rule file.

    Raises:
        ValueError: If ``lang`` is not ``"kk"`` or ``"ky"``.
    """
    # Validate up front, mirroring to_latin: otherwise an arbitrary string
    # is interpolated into a file name and surfaces as a confusing
    # file-system error (or reads an unintended path).
    if lang not in ("kk", "ky"):
        raise ValueError("lang must be 'kk' or 'ky'")
    trans = _icu_trans(f"{lang}_ipa.rules")
    return ud.normalize("NFC", trans.transliterate(text))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
turkic_translit/logging_config.py DELETED
@@ -1,58 +0,0 @@
1
- """
2
- Centralized logging configuration module.
3
- Uses Rich for colorized output if available with fallback to standard library.
4
- """
5
- import logging
6
- import os
7
- import sys
8
- from functools import lru_cache
9
-
10
- # Get log level from environment or default to INFO
11
- LOG_LEVEL = os.environ.get("TURKIC_LOG_LEVEL", "INFO").upper()
12
-
13
@lru_cache(maxsize=1)
def _configure(level_name: str) -> logging.Logger:
    """Install a single root handler at ``level_name``; cached per level.

    The cache means repeated calls with the same level are no-ops, while a
    different level triggers a full reconfiguration.
    """
    root_logger = logging.getLogger()

    # Clear any existing handlers so we never stack duplicates.
    for handler in root_logger.handlers[:]:
        root_logger.removeHandler(handler)

    # Unknown names (or names that resolve to a non-level attribute of the
    # logging module) fall back to INFO.
    log_level = getattr(logging, level_name, None)
    if not isinstance(log_level, int):
        log_level = logging.INFO
    root_logger.setLevel(log_level)

    # Try to use Rich for pretty, colorized output
    try:
        from rich.logging import RichHandler
    except ImportError:
        # Fall back to standard logging if Rich is not available
        handler = logging.StreamHandler(sys.stderr)
        formatter = logging.Formatter("%(levelname)s: %(message)s")
    else:
        handler = RichHandler(
            rich_tracebacks=True,
            markup=True,
            show_time=False,
            show_path=False,
        )
        formatter = logging.Formatter("%(message)s")

    handler.setFormatter(formatter)
    root_logger.addHandler(handler)

    logger = logging.getLogger("turkic_translit")
    logger.debug("Logging initialized at level %s", level_name)
    return logger


def setup():
    """
    Set up logging with Rich if available, with fallback to stdlib logging.

    The level comes from the TURKIC_LOG_LEVEL environment variable, read at
    call time (default INFO). Reading it here rather than at import time is
    what lets a caller set the variable and call ``setup()`` again to change
    the level; calling again with an unchanged level does nothing.

    Returns the ``turkic_translit`` logger.
    """
    return _configure(os.environ.get("TURKIC_LOG_LEVEL", "INFO").upper())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
turkic_translit/patches.py DELETED
@@ -1,77 +0,0 @@
1
- """
2
- Patches for third-party libraries to fix encoding issues on Windows.
3
- This module is imported automatically at startup.
4
- """
5
- import os
6
- import sys
7
- import functools
8
- import logging
9
- from .logging_config import setup; setup()
10
-
11
- log = logging.getLogger(__name__)
12
- _PATCH_DONE = False
13
- _PATCHED_FILES = set()
14
-
15
def _fix_broken_ssl_cert_env():
    """
    Drop SSL_CERT_FILE from the environment when it names a missing file.

    Per the original author's note, a stale SSL_CERT_FILE (often left by
    Conda on Windows) makes httpx / gradio crash on import; removing the
    variable lets Python fall back to the system certificates.
    """
    import os, pathlib, logging
    logger = logging.getLogger(__name__)
    cert_path = os.environ.get("SSL_CERT_FILE")
    # Nothing to do when the variable is unset/empty or the bundle exists.
    if not cert_path:
        return
    if pathlib.Path(cert_path).exists():
        return
    logger.warning(
        "SSL_CERT_FILE=%s does not exist – removing the variable so "
        "httpx can create a default context", cert_path)
    del os.environ["SSL_CERT_FILE"]
30
-
31
def apply_patches():
    """Apply all necessary patches for third-party libraries.

    The SSL_CERT_FILE check runs on every call. The panphon patch is applied
    at most once per process (guarded by ``_PATCH_DONE``) and only on
    Windows: it replaces ``builtins.open`` with a wrapper that defaults
    text-mode reads of ``.csv`` files to UTF-8 when the caller did not
    choose an encoding.
    """
    global _PATCH_DONE
    # Ensure SSL_CERT_FILE is valid before any third-party import.
    _fix_broken_ssl_cert_env()
    # Skip if patches have already been applied
    if _PATCH_DONE:
        log.debug("Patches already applied, skipping")
        return

    _PATCH_DONE = True
    # The panphon encoding problem only exists on Windows.
    if sys.platform != 'win32':
        return

    try:
        import panphon.featuretable  # noqa: F401
    except ImportError:
        log.warning("Could not patch panphon (not installed)")
        return

    import builtins

    # Save the original open function
    original_open = builtins.open

    def patched_open_for_panphon(file, mode='r', *args, **kwargs):
        # open(file, mode, buffering, encoding, ...): a second extra
        # positional argument is the encoding, so injecting encoding= on top
        # of it would raise "got multiple values for argument 'encoding'".
        encoding_given = 'encoding' in kwargs or len(args) >= 2
        if (
            not encoding_given
            and 'panphon' in sys.modules
            and mode in ('r', 'rt')
            and isinstance(file, (str, os.PathLike))
        ):
            name = os.fspath(file)
            if isinstance(name, str) and name.endswith('.csv'):
                kwargs['encoding'] = 'utf-8'
                # Only log the first time per unique file
                if name not in _PATCHED_FILES:
                    log.debug(f"Applied UTF-8 encoding patch for {name}")
                    _PATCHED_FILES.add(name)
        return original_open(file, mode, *args, **kwargs)

    # NOTE(review): PYTHONUTF8 is read at interpreter start-up, so this
    # presumably only influences child processes, not the current one.
    os.environ['PYTHONUTF8'] = '1'

    # Apply the patch
    builtins.open = patched_open_for_panphon
    log.info("Applied panphon UTF-8 patch for Windows")
74
-
75
- # Apply patches when module is imported
76
- apply_patches()
77
- log.debug("Patches module initialized")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
turkic_translit/rules/ar_lat.rules DELETED
@@ -1,15 +0,0 @@
1
- # consonants
2
- ب > b ; پ > p ; ت > t ; ج > j ; چ > ch ;
3
- ح > h ; خ > x ; د > d ; ر > r ; ز > z ; س > s ;
4
- ش > sh ; ص > s ; ط > t ; غ > gh ;
5
- ف > f ; ق > q ; ك > k ; گ > g ; ل > l ; م > m ; ن > n ;
6
- ه > h ; ھ > h ; ژ > zh ; ڭ > ng ; ۋ > w ;
7
-
8
- # vowels (hamza carrier ئ can be dropped or mapped to ')
9
- ا > a ; ە > e ; ۆ > ö ; و > o ;
10
- ۇ > u ; ۈ > ü ; ى > i ; ې > ë ;
11
-
12
- # glottals
13
- ء > ' ; ع > ' ; ئ > ;
14
-
15
- :: NFC ;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
turkic_translit/rules/kk_ipa.rules DELETED
@@ -1,47 +0,0 @@
1
- # Kazakh → IPA transliteration rules (kk_ipa.rules)
2
- # One line per Cyrillic letter. Right-hand side is plain IPA (no slashes). NFC-normalised.
3
-
4
- А > ɑ ; а > ɑ ;
5
- Ә > æ ; ә > æ ;
6
- Б > b ; б > b ;
7
- В > v ; в > v ;
8
- Г > ɡ ; г > ɡ ;
9
- Ғ > ʁ ; ғ > ʁ ;
10
- Д > d ; д > d ;
11
- Е > e ; е > e ;
12
- Ё > jo ; ё > jo ;
13
- Ж > ʒ ; ж > ʒ ;
14
- З > z ; з > z ;
15
- И > i ; и > i ;
16
- Й > j ; й > j ;
17
- К > k ; к > k ;
18
- Қ > q ; қ > q ;
19
- Л > l ; л > l ;
20
- М > m ; м > m ;
21
- Н > n ; н > n ;
22
- Ң > ŋ ; ң > ŋ ;
23
- О > o ; о > o ;
24
- Ө > ø ; ө > ø ;
25
- П > p ; п > p ;
26
- Р > r ; р > r ;
27
- С > s ; с > s ;
28
- Т > t ; т > t ;
29
- У > u ; у > u ;
30
- Ұ > ʊ ; ұ > ʊ ;
31
- Ү > y ; ү > y ;
32
- Ф > f ; ф > f ;
33
- Х > x ; х > x ;
34
- Һ > h ; һ > h ;
35
- Ц > ts ; ц > ts ;
36
- Ч > t͡ʃ ; ч > t͡ʃ ;
37
- Ш > ʃ ; ш > ʃ ;
38
- Щ > ɕː ; щ > ɕː ;
39
- Ъ > ʔ ; ъ > ʔ ;
40
- Ы > ɯ ; ы > ɯ ;
41
- І > ɪ ; і > ɪ ;
42
- Ь > ; ь > ;
43
- Э > e ; э > e ;
44
- Ю > ju ; ю > ju ;
45
- Я > ja ; я > ja ;
46
-
47
- :: NFC ;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
turkic_translit/rules/kk_lat2023.rules DELETED
@@ -1,49 +0,0 @@
1
- # Official Kazakh Latin alphabet (April 2021)
2
- # https://en.wikipedia.org/wiki/Kazakh_alphabets
3
-
4
- А > A ; а > a ;
5
- Ә > Ä ; ә > ä ;
6
- Б > B ; б > b ;
7
- В > V ; в > v ;
8
- Г > G ; г > g ;
9
- Ғ > Ğ ; ғ > ğ ;
10
- Д > D ; д > d ;
11
- Е > E ; е > e ;
12
- Ж > J ; ж > j ;
13
- З > Z ; з > z ;
14
- И > İ ; и > i ; # dotted İ/i
15
- Й > İ ; й > i ; # official merging per standard (ambiguity known)
16
- І > I ; і > ı ; # corrected: dotless lowercase ı
17
- К > K ; к > k ;
18
- Қ > Q ; қ > q ;
19
- Л > L ; л > l ;
20
- М > M ; м > m ;
21
- Н > N ; н > n ;
22
- Ң > Ñ ; ң > ñ ;
23
- О > O ; о > o ;
24
- Ө > Ö ; ө > ö ;
25
- П > P ; п > p ;
26
- Р > R ; р > r ;
27
- С > S ; с > s ;
28
- Т > T ; т > t ;
29
- У > U ; у > u ;
30
- Ұ > Ū ; ұ > ū ;
31
- Ү > Ü ; ү > ü ;
32
- Ф > F ; ф > f ;
33
- Х > H ; х > h ;
34
- Һ > H ; һ > h ;
35
-
36
- # Russian loan letters (clearly marked, NOT official Kazakh letters)
37
- Ё > Yo ; ё > yo ;
38
- Э > Ė ; э > ė ;
39
- Ц > Ts ; ц > ts ;
40
- Ч > Ch ; ч > ch ;
41
- Ш > Ş ; ш > ş ;
42
- Щ > Şç ; щ > şç ;
43
- Ы > Y ; ы > y ;
44
- Ю > Yu ; ю > yu ;
45
- Я > Ya ; я > ya ;
46
- Ъ > ; ъ > ; # dropped entirely
47
- Ь > ; ь > ; # dropped entirely
48
-
49
- :: NFC ;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
turkic_translit/rules/ky_ipa.rules DELETED
@@ -1,43 +0,0 @@
1
- # Kyrgyz → IPA transliteration rules (ky_ipa.rules)
2
- # One line per Cyrillic letter; NFC‐normalised; IPA given without slashes.
3
-
4
- А > a ; а > a ;
5
- Б > b ; б > b ;
6
- В > v ; в > v ;
7
- Г > ɡ ; г > ɡ ;
8
- Ғ > ʁ ; ғ > ʁ ;
9
- Д > d ; д > d ;
10
- Е > e ; е > e ;
11
- Ё > jo ; ё > jo ;
12
- Ж > d͡ʒ ; ж > d͡ʒ ;
13
- З > z ; з > z ;
14
- И > i ; и > i ;
15
- Й > j ; й > j ;
16
- К > k ; к > k ;
17
- Қ > q ; қ > q ;
18
- Л > l ; л > l ;
19
- М > m ; м > m ;
20
- Н > n ; н > n ;
21
- Ң > ŋ ; ң > ŋ ;
22
- О > o ; о > o ;
23
- Ө > ø ; ө > ø ;
24
- П > p ; п > p ;
25
- Р > r ; р > r ;
26
- С > s ; с > s ;
27
- Т > t ; т > t ;
28
- У > u ; у > u ;
29
- Ү > y ; ү > y ;
30
- Ф > f ; ф > f ;
31
- Х > x ; х > x ;
32
- Ц > ts ; ц > ts ;
33
- Ч > t͡ʃ ; ч > t͡ʃ ;
34
- Ш > ʃ ; ш > ʃ ;
35
- Щ > ɕː ; щ > ɕː ;
36
- Ы > ɯ ; ы > ɯ ;
37
- Э > ɛ ; э > ɛ ;
38
- Ю > ju ; ю > ju ;
39
- Я > ja ; я > ja ;
40
- Ъ > ʔ ; ъ > ʔ ;
41
- Ь > ; ь > ;
42
-
43
- :: NFC ;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
turkic_translit/rules/ky_lat2023.rules DELETED
@@ -1,40 +0,0 @@
1
- # Kyrgyz → Modern Practical Latin Transliteration (NFC)
2
- # One line per pair; use pre‑composed Unicode code‑points; digraphs are atomic tokens.
3
- А > A ; а > a ;
4
- Б > B ; б > b ;
5
- В > V ; в > v ;
6
- Г > G ; г > g ;
7
- Д > D ; д > d ;
8
- Е > E ; е > e ;
9
- Ё > Yo ; ё > yo ;
10
- Ж > J ; ж > j ;
11
- З > Z ; з > z ;
12
- И > İ ; и > i ; # dotted I
13
- Й > Ý ; й > ý ; # /j/ glide
14
- К > K ; к > k ;
15
- Л > L ; л > l ;
16
- М > M ; м > m ;
17
- Н > N ; н > n ;
18
- Ң > Ñ ; ң > ñ ;
19
- О > O ; о > o ;
20
- Ө > Ö ; ө > ö ;
21
- П > P ; п > p ;
22
- Р > R ; р > r ;
23
- С > S ; с > s ;
24
- Т > T ; т > t ;
25
- У > U ; у > u ;
26
- Ү > Ü ; ү > ü ;
27
- Ф > F ; ф > f ;
28
- Х > H ; х > h ;
29
- Ц > Ts ; ц > ts ; # digraph
30
- Ч > Ç ; ч > ç ;
31
- Ш > Ş ; ш > ş ;
32
- Щ > Şç ; щ > şç ; # digraph
33
- Ы > Y ; ы > y ;
34
- Э > É ; э > é ;
35
- Ю > Yu ; ю > yu ; # digraph
36
- Я > Ya ; я > ya ; # digraph
37
- Ъ > ʼ ; ъ > ʼ ; # modifier apostrophe U+02BC
38
- Ь > ʼ ; ь > ʼ ;
39
-
40
- :: NFC ;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
turkic_translit/sanity.py DELETED
@@ -1,25 +0,0 @@
1
- """Helper functions for Levenshtein and byte checks."""
2
- from rapidfuzz.distance import Levenshtein
3
- import io
4
- import os, re, unicodedata as ud
5
-
6
def median_lev(file_lat:str, file_ipa:str, sample:int=5000) -> float:
    """Median normalised Levenshtein distance between paired lines.

    Lines of ``file_lat`` and ``file_ipa`` are paired in order (stopping at
    the shorter file), stripped, and compared; only the first ``sample``
    pairs are used. Returns the median of those distances.
    """
    from statistics import median
    distances = []
    taken = 0
    with open(file_lat, encoding="utf8") as lat_fh, open(file_ipa, encoding="utf8") as ipa_fh:
        for lat_line, ipa_line in zip(lat_fh, ipa_fh):
            # Stop once the requested number of pairs has been collected.
            if taken == sample:
                break
            distances.append(
                Levenshtein.normalized_distance(lat_line.strip(), ipa_line.strip())
            )
            taken += 1
    return median(distances)
14
-
15
def bytes_per_char(filename:str)->float:
    """Return the average number of UTF-8 bytes per character in a file.

    The file size on disk is divided by the number of decoded characters.
    Newlines are read untranslated (``newline=""``) so a CRLF counts as two
    characters, matching its two bytes. An empty file yields ``0.0`` instead
    of raising ``ZeroDivisionError``.
    """
    import os
    size_bytes = os.path.getsize(filename)
    with open(filename, encoding="utf8", newline="") as f:
        chars = sum(len(line) for line in f)
    if not chars:
        return 0.0
    return size_bytes / chars
21
-
22
def is_nfc(filename:str)->bool:
    """Return True when every line of the UTF-8 file is NFC-normalised.

    An empty file counts as normalised.
    """
    import unicodedata
    with open(filename, encoding="utf8") as handle:
        for row in handle:
            # Bail out on the first line that is not in NFC form.
            if not unicodedata.is_normalized("NFC", row):
                return False
    return True