wagner-austin
committed on
Commit
·
5e627b0
1
Parent(s):
f2744fd
Enhance documentation with detailed README and code comments
Browse files
- .gitattributes +0 -35
- .gitignore +0 -6
- README.md +22 -0
- app.py +17 -112
- env.yml +0 -27
- pyproject.toml +0 -44
- requirements.txt +3 -1
.gitattributes
DELETED
@@ -1,35 +0,0 @@
|
|
1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.gitignore
DELETED
@@ -1,6 +0,0 @@
|
|
1 |
-
__pycache__/
|
2 |
-
*.egg-info/
|
3 |
-
*.whl
|
4 |
-
.venv/
|
5 |
-
.pytest_cache/
|
6 |
-
temp.txt
|
|
|
|
|
|
|
|
|
|
|
|
|
|
README.md
CHANGED
@@ -11,4 +11,26 @@ license: apache-2.0
|
|
11 |
short_description: Transliteration of Kazakh & Kyrgyz into Latin and IPA
|
12 |
---
|
13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
11 |
short_description: Transliteration of Kazakh & Kyrgyz into Latin and IPA
|
12 |
---
|
13 |
|
14 |
+
# Turkic Transliteration Demo
|
15 |
+
|
16 |
+
## Overview
|
17 |
+
This Hugging Face Space demonstrates transliteration capabilities for Turkic languages, specifically Kazakh and Kyrgyz. The demo converts text between Cyrillic script, Latin script, and International Phonetic Alphabet (IPA) notation.
|
18 |
+
|
19 |
+
## Features
|
20 |
+
- **Bidirectional Transliteration**: Convert between Cyrillic and Latin scripts
|
21 |
+
- **IPA Transcription**: Generate accurate phonetic transcriptions using IPA
|
22 |
+
- **Language Selection**: Support for Kazakh and Kyrgyz languages
|
23 |
+
- **Interactive Interface**: User-friendly web interface powered by Gradio
|
24 |
+
|
25 |
+
## Usage
|
26 |
+
1. Select your source and target scripts
|
27 |
+
2. Choose the language (Kazakh or Kyrgyz)
|
28 |
+
3. Enter or paste your text in the input field
|
29 |
+
4. View the transliterated/transcribed output instantly
|
30 |
+
|
31 |
+
## Technical Details
|
32 |
+
This demo is powered by the `turkic-transliterate` Python package, which provides specialized, high-accuracy transliteration tools for Turkic languages, suitable for both linguistic research and practical applications.
|
33 |
+
|
34 |
+
---
|
35 |
+
|
36 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
CHANGED
@@ -1,116 +1,21 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
"""
|
4 |
-
import gradio as gr
|
5 |
-
from turkic_translit.core import to_latin, to_ipa
|
6 |
-
import unicodedata as ud
|
7 |
|
|
|
|
|
8 |
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
else:
|
17 |
-
result = to_ipa(text, lang)
|
18 |
-
format_label = "IPA"
|
19 |
-
result = ud.normalize("NFC", result)
|
20 |
-
stats_md = (f"**Bytes** — Cyrillic : {len(text.encode('utf8'))}, "
|
21 |
-
f"{format_label} : {len(result.encode('utf8'))}")
|
22 |
-
return result, stats_md
|
23 |
-
except Exception as e:
|
24 |
-
raise gr.Error(str(e))
|
25 |
|
26 |
-
#
|
27 |
-
|
28 |
-
|
29 |
-
if mode == "IPA":
|
30 |
-
# Forced off + greyed-out
|
31 |
-
return gr.update(value=False, interactive=False)
|
32 |
-
# Latin: keep whatever the user last chose, stay interactive
|
33 |
-
return gr.update(interactive=True)
|
34 |
-
|
35 |
-
# Create the Gradio interface
|
36 |
-
with gr.Blocks(title="Turkic Transliteration Demo") as demo:
|
37 |
-
gr.Markdown("# Turkic Transliteration Demo")
|
38 |
-
gr.Markdown("Enter Cyrillic text for Kazakh (kk) or Kyrgyz (ky) and see the Latin transliteration")
|
39 |
-
|
40 |
-
with gr.Row():
|
41 |
-
with gr.Column():
|
42 |
-
input_text = gr.Textbox(
|
43 |
-
label="Input Text (Cyrillic)",
|
44 |
-
placeholder="Enter Kazakh or Kyrgyz text in Cyrillic script...",
|
45 |
-
lines=5
|
46 |
-
)
|
47 |
-
lang = gr.Radio(
|
48 |
-
["kk", "ky"],
|
49 |
-
label="Language",
|
50 |
-
info="kk = Kazakh, ky = Kyrgyz",
|
51 |
-
value="kk"
|
52 |
-
)
|
53 |
-
output_format = gr.Radio(
|
54 |
-
["Latin", "IPA"],
|
55 |
-
label="Output Format",
|
56 |
-
info="Latin = Standard Latin alphabet, IPA = International Phonetic Alphabet",
|
57 |
-
value="Latin"
|
58 |
-
)
|
59 |
-
include_arabic = gr.Checkbox(False, label="Also transliterate Arabic script (Latin mode only)")
|
60 |
-
|
61 |
-
with gr.Column():
|
62 |
-
output_text = gr.Textbox(
|
63 |
-
label="Transliteration Output",
|
64 |
-
lines=5,
|
65 |
-
interactive=False
|
66 |
-
)
|
67 |
-
stats = gr.Markdown(value="")
|
68 |
-
|
69 |
-
# Example inputs
|
70 |
-
examples = [
|
71 |
-
["Қазақ тілі - Түркі тілдерінің бірі.", "kk", "Latin"],
|
72 |
-
["Қазақ тілі - Түркі тілдерінің бірі.", "kk", "IPA"],
|
73 |
-
["Кыргыз тили - Түрк тилдеринин бири.", "ky", "Latin"],
|
74 |
-
["Кыргыз тили - Түрк тилдеринин бири.", "ky", "IPA"]
|
75 |
-
]
|
76 |
-
gr.Markdown("👉 **Tip:** Click an example below to instantly fill in the fields and see the transliteration!")
|
77 |
-
gr.Examples(examples, [input_text, lang, output_format])
|
78 |
-
|
79 |
-
# Live preview: update output while the user types
|
80 |
-
input_text.input(
|
81 |
-
fn=transliterate,
|
82 |
-
inputs=[input_text, lang, include_arabic, output_format],
|
83 |
-
outputs=[output_text, stats]
|
84 |
-
)
|
85 |
-
|
86 |
-
# Also fire when the textbox's value finishes changing (e.g., clicking an example)
|
87 |
-
input_text.change(
|
88 |
-
fn=transliterate,
|
89 |
-
inputs=[input_text, lang, include_arabic, output_format],
|
90 |
-
outputs=[output_text, stats]
|
91 |
-
)
|
92 |
-
|
93 |
-
lang.change(
|
94 |
-
fn=transliterate,
|
95 |
-
inputs=[input_text, lang, include_arabic, output_format],
|
96 |
-
outputs=[output_text, stats]
|
97 |
-
)
|
98 |
-
|
99 |
-
include_arabic.change(
|
100 |
-
fn=transliterate,
|
101 |
-
inputs=[input_text, lang, include_arabic, output_format],
|
102 |
-
outputs=[output_text, stats]
|
103 |
-
)
|
104 |
-
|
105 |
-
output_format.change(
|
106 |
-
fn=transliterate,
|
107 |
-
inputs=[input_text, lang, include_arabic, output_format],
|
108 |
-
outputs=[output_text, stats]
|
109 |
-
)
|
110 |
-
# Sync Arabic checkbox with mode
|
111 |
-
output_format.change(_sync_arabic_checkbox, output_format, include_arabic)
|
112 |
-
|
113 |
-
# Launch the app
|
114 |
-
if __name__ == "__main__":
|
115 |
-
demo.queue().launch()
|
116 |
|
|
|
|
|
|
|
|
1 |
+
# Turkic Transliteration Demo for Hugging Face Spaces
|
2 |
+
# This application demonstrates transliteration between Cyrillic, Latin, and IPA for Turkic languages
|
|
|
|
|
|
|
|
|
3 |
|
4 |
+
# Import the web demo UI builder from the turkic_translit package
|
5 |
+
from turkic_translit.web_demo import build_ui
|
6 |
|
7 |
+
# Create the Gradio interface with default settings
|
8 |
+
# The build_ui function configures a Gradio Interface with:
|
9 |
+
# - Input fields for text entry
|
10 |
+
# - Language selection (Kazakh/Kyrgyz)
|
11 |
+
# - Script selection (Cyrillic/Latin/IPA)
|
12 |
+
# - Real-time transliteration preview
|
13 |
+
demo = build_ui()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
+
# Enable queuing for better performance with multiple users
|
16 |
+
# This prevents the server from being overwhelmed by concurrent requests
|
17 |
+
demo.queue()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
|
19 |
+
# Launch the web application
|
20 |
+
# In Hugging Face Spaces, this will make the app available to users
|
21 |
+
demo.launch()
|
env.yml
DELETED
@@ -1,27 +0,0 @@
|
|
1 |
-
name: turkic
|
2 |
-
channels:
|
3 |
-
- conda-forge
|
4 |
-
- defaults
|
5 |
-
dependencies:
|
6 |
-
# ---------- Core binaries ----------
|
7 |
-
- python =3.11 # pick 3.10 or 3.11; 3.12 wheels still new
|
8 |
-
- icu =73.* # matches PyICU 2.15 wheel
|
9 |
-
- sentencepiece >=0.2
|
10 |
-
- rapidfuzz >=3.5
|
11 |
-
- numpy <2 # required by fasttext-wheel
|
12 |
-
# ---------- Build / test ----------
|
13 |
-
- pip
|
14 |
-
- pytest >=8.0
|
15 |
-
- packaging >=23.0 # used in tests
|
16 |
-
# ---------- pip-only wheels ----------
|
17 |
-
- pip:
|
18 |
-
# PyICU is handled differently for Windows vs. other platforms
|
19 |
-
# On Windows, we'll use the script/get_pyicu_wheel.py helper
|
20 |
-
- pyicu==2.15.2 ; platform_system != "Windows"
|
21 |
-
# language ID & vectors
|
22 |
-
- fasttext-wheel==0.9.2
|
23 |
-
# phoneme & feature extraction
|
24 |
-
- epitran==1.26.0
|
25 |
-
- git+https://github.com/dmort27/panphon.git@master#egg=panphon
|
26 |
-
# editable install of your package
|
27 |
-
- -e .
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
pyproject.toml
DELETED
@@ -1,44 +0,0 @@
|
|
1 |
-
[build-system]
|
2 |
-
requires = ["setuptools>=68", "wheel"]
|
3 |
-
build-backend = "setuptools.build_meta"
|
4 |
-
|
5 |
-
|
6 |
-
[project]
|
7 |
-
name = "turkic_transliterate"
|
8 |
-
version = "0.1.0"
|
9 |
-
description = "Deterministic Latin and IPA transliteration for Kazakh, Kyrgyz, plus tokenizer/glue scripts."
|
10 |
-
authors = [ {name="Austin Wagner", email="[email protected]"} ]
|
11 |
-
requires-python = ">=3.9"
|
12 |
-
|
13 |
-
dependencies = [
|
14 |
-
# Core dependencies (alphabetized)
|
15 |
-
"epitran>=1.0,<1.27", # 1.26.0 is the latest on PyPI
|
16 |
-
# Universal fasttext-wheel for all platforms and Python 3.10-3.13
|
17 |
-
"fasttext-wheel==0.9.2",
|
18 |
-
"numpy<2",
|
19 |
-
"packaging>=23.0", # Used in tests/test_fasttext.py
|
20 |
-
"panphon>=0.20,<0.22", # 0.21.2 is the newest published wheel
|
21 |
-
# Source build for non-Windows platforms
|
22 |
-
"PyICU>=2.15 ; sys_platform != 'win32'",
|
23 |
-
# Windows wheels for PyICU
|
24 |
-
"PyICU @ https://github.com/cgohlke/pyicu-build/releases/download/v2.15/pyicu-2.15-cp310-cp310-win_amd64.whl ; sys_platform == 'win32' and python_version == '3.10'",
|
25 |
-
"PyICU @ https://github.com/cgohlke/pyicu-build/releases/download/v2.15/pyicu-2.15-cp311-cp311-win_amd64.whl ; sys_platform == 'win32' and python_version == '3.11'",
|
26 |
-
"PyICU @ https://github.com/cgohlke/pyicu-build/releases/download/v2.15/pyicu-2.15-cp312-cp312-win_amd64.whl ; sys_platform == 'win32' and python_version == '3.12'",
|
27 |
-
"PyICU @ https://github.com/cgohlke/pyicu-build/releases/download/v2.15/pyicu-2.15-cp313-cp313-win_amd64.whl ; sys_platform == 'win32' and python_version == '3.13'",
|
28 |
-
"pytest>=8.0", # Test runner
|
29 |
-
"rapidfuzz>=3.5",
|
30 |
-
"rich>=13.7", # Color-aware logging and console output
|
31 |
-
"sentencepiece>=0.2.0"
|
32 |
-
]
|
33 |
-
|
34 |
-
[project.optional-dependencies]
|
35 |
-
# winlid dependency kept for backward compatibility but empty since fasttext-wheel is now in main dependencies
|
36 |
-
winlid = []
|
37 |
-
|
38 |
-
# Development tools
|
39 |
-
dev = ["black", "ruff"]
|
40 |
-
# User interface dependencies
|
41 |
-
ui = ["gradio"]
|
42 |
-
|
43 |
-
[project.scripts]
|
44 |
-
turkic-translit = "turkic_translit.cli:main"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requirements.txt
CHANGED
@@ -1,4 +1,6 @@
|
|
1 |
-
turkic-transliterate>=0.1.
|
|
|
|
|
2 |
|
3 |
# Core dependencies
|
4 |
epitran>=1.0,<1.27
|
|
|
1 |
+
turkic-transliterate[ui]>=0.1.2
|
2 |
+
|
3 |
+
#turkic-transliterate[ui] @ git+https://github.com/wagner-austin/turkic_transliteration.git@main
|
4 |
|
5 |
# Core dependencies
|
6 |
epitran>=1.0,<1.27
|