wagner-austin committed on
Commit
5e627b0
·
1 Parent(s): f2744fd

Enhance documentation with detailed README and code comments

Browse files
Files changed (7) hide show
  1. .gitattributes +0 -35
  2. .gitignore +0 -6
  3. README.md +22 -0
  4. app.py +17 -112
  5. env.yml +0 -27
  6. pyproject.toml +0 -44
  7. requirements.txt +3 -1
.gitattributes DELETED
@@ -1,35 +0,0 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.gitignore DELETED
@@ -1,6 +0,0 @@
1
- __pycache__/
2
- *.egg-info/
3
- *.whl
4
- .venv/
5
- .pytest_cache/
6
- temp.txt
 
 
 
 
 
 
 
README.md CHANGED
@@ -11,4 +11,26 @@ license: apache-2.0
11
  short_description: Transliteration of Kazakh & Kyrgyz into Latin and IPA
12
  ---
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
11
  short_description: Transliteration of Kazakh & Kyrgyz into Latin and IPA
12
  ---
13
 
14
+ # Turkic Transliteration Demo
15
+
16
+ ## Overview
17
+ This Hugging Face Space demonstrates transliteration capabilities for Turkic languages, specifically Kazakh and Kyrgyz. The demo converts text between Cyrillic script, Latin script, and International Phonetic Alphabet (IPA) notation.
18
+
19
+ ## Features
20
+ - **Bidirectional Transliteration**: Convert between Cyrillic and Latin scripts
21
+ - **IPA Transcription**: Generate accurate phonetic transcriptions using IPA
22
+ - **Language Selection**: Support for Kazakh and Kyrgyz languages
23
+ - **Interactive Interface**: User-friendly web interface powered by Gradio
24
+
25
+ ## Usage
26
+ 1. Select your source and target scripts
27
+ 2. Choose the language (Kazakh or Kyrgyz)
28
+ 3. Enter or paste your text in the input field
29
+ 4. View the transliterated/transcribed output instantly
30
+
31
+ ## Technical Details
32
+ This demo is powered by the `turkic-transliterate` Python package, which provides specialized transliteration tools for Turkic languages with high accuracy for linguistic research and practical applications.
33
+
34
+ ---
35
+
36
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,116 +1,21 @@
1
- """
2
- A simple web interface to demonstrate Turkic transliteration.
3
- """
4
- import gradio as gr
5
- from turkic_translit.core import to_latin, to_ipa
6
- import unicodedata as ud
7
 
 
 
8
 
9
- def transliterate(text, lang, include_arabic, output_format):
10
- if not text:
11
- return "", ""
12
- try:
13
- if output_format == "Latin":
14
- result = to_latin(text, lang, include_arabic=include_arabic)
15
- format_label = "Latin"
16
- else:
17
- result = to_ipa(text, lang)
18
- format_label = "IPA"
19
- result = ud.normalize("NFC", result)
20
- stats_md = (f"**Bytes** — Cyrillic : {len(text.encode('utf8'))}, "
21
- f"{format_label} : {len(result.encode('utf8'))}")
22
- return result, stats_md
23
- except Exception as e:
24
- raise gr.Error(str(e))
25
 
26
- # Function to reset & (de)activate the Arabic checkbox when mode changes
27
- def _sync_arabic_checkbox(mode: str):
28
- """Uncheck and disable in IPA; leave state untouched in Latin."""
29
- if mode == "IPA":
30
- # Forced off + greyed-out
31
- return gr.update(value=False, interactive=False)
32
- # Latin: keep whatever the user last chose, stay interactive
33
- return gr.update(interactive=True)
34
-
35
- # Create the Gradio interface
36
- with gr.Blocks(title="Turkic Transliteration Demo") as demo:
37
- gr.Markdown("# Turkic Transliteration Demo")
38
- gr.Markdown("Enter Cyrillic text for Kazakh (kk) or Kyrgyz (ky) and see the Latin transliteration")
39
-
40
- with gr.Row():
41
- with gr.Column():
42
- input_text = gr.Textbox(
43
- label="Input Text (Cyrillic)",
44
- placeholder="Enter Kazakh or Kyrgyz text in Cyrillic script...",
45
- lines=5
46
- )
47
- lang = gr.Radio(
48
- ["kk", "ky"],
49
- label="Language",
50
- info="kk = Kazakh, ky = Kyrgyz",
51
- value="kk"
52
- )
53
- output_format = gr.Radio(
54
- ["Latin", "IPA"],
55
- label="Output Format",
56
- info="Latin = Standard Latin alphabet, IPA = International Phonetic Alphabet",
57
- value="Latin"
58
- )
59
- include_arabic = gr.Checkbox(False, label="Also transliterate Arabic script (Latin mode only)")
60
-
61
- with gr.Column():
62
- output_text = gr.Textbox(
63
- label="Transliteration Output",
64
- lines=5,
65
- interactive=False
66
- )
67
- stats = gr.Markdown(value="")
68
-
69
- # Example inputs
70
- examples = [
71
- ["Қазақ тілі - Түркі тілдерінің бірі.", "kk", "Latin"],
72
- ["Қазақ тілі - Түркі тілдерінің бірі.", "kk", "IPA"],
73
- ["Кыргыз тили - Түрк тилдеринин бири.", "ky", "Latin"],
74
- ["Кыргыз тили - Түрк тилдеринин бири.", "ky", "IPA"]
75
- ]
76
- gr.Markdown("👉 **Tip:** Click an example below to instantly fill in the fields and see the transliteration!")
77
- gr.Examples(examples, [input_text, lang, output_format])
78
-
79
- # Live preview: update output while the user types
80
- input_text.input(
81
- fn=transliterate,
82
- inputs=[input_text, lang, include_arabic, output_format],
83
- outputs=[output_text, stats]
84
- )
85
-
86
- # Also fire when the textbox's value finishes changing (e.g., clicking an example)
87
- input_text.change(
88
- fn=transliterate,
89
- inputs=[input_text, lang, include_arabic, output_format],
90
- outputs=[output_text, stats]
91
- )
92
-
93
- lang.change(
94
- fn=transliterate,
95
- inputs=[input_text, lang, include_arabic, output_format],
96
- outputs=[output_text, stats]
97
- )
98
-
99
- include_arabic.change(
100
- fn=transliterate,
101
- inputs=[input_text, lang, include_arabic, output_format],
102
- outputs=[output_text, stats]
103
- )
104
-
105
- output_format.change(
106
- fn=transliterate,
107
- inputs=[input_text, lang, include_arabic, output_format],
108
- outputs=[output_text, stats]
109
- )
110
- # Sync Arabic checkbox with mode
111
- output_format.change(_sync_arabic_checkbox, output_format, include_arabic)
112
-
113
- # Launch the app
114
- if __name__ == "__main__":
115
- demo.queue().launch()
116
 
 
 
 
 
1
+ # Turkic Transliteration Demo for Hugging Face Spaces
2
+ # This application demonstrates transliteration between Cyrillic, Latin, and IPA for Turkic languages
 
 
 
 
3
 
4
+ # Import the web demo UI builder from the turkic_translit package
5
+ from turkic_translit.web_demo import build_ui
6
 
7
+ # Create the Gradio interface with default settings
8
+ # The build_ui function configures a Gradio Interface with:
9
+ # - Input fields for text entry
10
+ # - Language selection (Kazakh/Kyrgyz)
11
+ # - Script selection (Cyrillic/Latin/IPA)
12
+ # - Real-time transliteration preview
13
+ demo = build_ui()
 
 
 
 
 
 
 
 
 
14
 
15
+ # Enable queuing for better performance with multiple users
16
+ # This prevents the server from being overwhelmed by concurrent requests
17
+ demo.queue()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
+ # Launch the web application
20
+ # In Hugging Face Spaces, this will make the app available to users
21
+ demo.launch()
env.yml DELETED
@@ -1,27 +0,0 @@
1
- name: turkic
2
- channels:
3
- - conda-forge
4
- - defaults
5
- dependencies:
6
- # ---------- Core binaries ----------
7
- - python =3.11 # pick 3.10 or 3.11; 3.12 wheels still new
8
- - icu =73.* # matches PyICU 2.15 wheel
9
- - sentencepiece >=0.2
10
- - rapidfuzz >=3.5
11
- - numpy <2 # required by fasttext-wheel
12
- # ---------- Build / test ----------
13
- - pip
14
- - pytest >=8.0
15
- - packaging >=23.0 # used in tests
16
- # ---------- pip-only wheels ----------
17
- - pip:
18
- # PyICU is handled differently for Windows vs. other platforms
19
- # On Windows, we'll use the script/get_pyicu_wheel.py helper
20
- - pyicu==2.15.2 ; platform_system != "Windows"
21
- # language ID & vectors
22
- - fasttext-wheel==0.9.2
23
- # phoneme & feature extraction
24
- - epitran==1.26.0
25
- - git+https://github.com/dmort27/panphon.git@master#egg=panphon
26
- # editable install of your package
27
- - -e .
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pyproject.toml DELETED
@@ -1,44 +0,0 @@
1
- [build-system]
2
- requires = ["setuptools>=68", "wheel"]
3
- build-backend = "setuptools.build_meta"
4
-
5
-
6
- [project]
7
- name = "turkic_transliterate"
8
- version = "0.1.0"
9
- description = "Deterministic Latin and IPA transliteration for Kazakh, Kyrgyz, plus tokenizer/glue scripts."
10
- authors = [ {name="Austin Wagner", email="[email protected]"} ]
11
- requires-python = ">=3.9"
12
-
13
- dependencies = [
14
- # Core dependencies (alphabetized)
15
- "epitran>=1.0,<1.27", # 1.26.0 is the latest on PyPI
16
- # Universal fasttext-wheel for all platforms and Python 3.10-3.13
17
- "fasttext-wheel==0.9.2",
18
- "numpy<2",
19
- "packaging>=23.0", # Used in tests/test_fasttext.py
20
- "panphon>=0.20,<0.22", # 0.21.2 is the newest published wheel
21
- # Source build for non-Windows platforms
22
- "PyICU>=2.15 ; sys_platform != 'win32'",
23
- # Windows wheels for PyICU
24
- "PyICU @ https://github.com/cgohlke/pyicu-build/releases/download/v2.15/pyicu-2.15-cp310-cp310-win_amd64.whl ; sys_platform == 'win32' and python_version == '3.10'",
25
- "PyICU @ https://github.com/cgohlke/pyicu-build/releases/download/v2.15/pyicu-2.15-cp311-cp311-win_amd64.whl ; sys_platform == 'win32' and python_version == '3.11'",
26
- "PyICU @ https://github.com/cgohlke/pyicu-build/releases/download/v2.15/pyicu-2.15-cp312-cp312-win_amd64.whl ; sys_platform == 'win32' and python_version == '3.12'",
27
- "PyICU @ https://github.com/cgohlke/pyicu-build/releases/download/v2.15/pyicu-2.15-cp313-cp313-win_amd64.whl ; sys_platform == 'win32' and python_version == '3.13'",
28
- "pytest>=8.0", # Test runner
29
- "rapidfuzz>=3.5",
30
- "rich>=13.7", # Color-aware logging and console output
31
- "sentencepiece>=0.2.0"
32
- ]
33
-
34
- [project.optional-dependencies]
35
- # winlid dependency kept for backward compatibility but empty since fasttext-wheel is now in main dependencies
36
- winlid = []
37
-
38
- # Development tools
39
- dev = ["black", "ruff"]
40
- # User interface dependencies
41
- ui = ["gradio"]
42
-
43
- [project.scripts]
44
- turkic-translit = "turkic_translit.cli:main"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -1,4 +1,6 @@
1
- turkic-transliterate>=0.1.1
 
 
2
 
3
  # Core dependencies
4
  epitran>=1.0,<1.27
 
1
+ turkic-transliterate[ui]>=0.1.2
2
+
3
+ #turkic-transliterate[ui] @ git+https://github.com/wagner-austin/turkic_transliteration.git@main
4
 
5
  # Core dependencies
6
  epitran>=1.0,<1.27