Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -15,12 +15,28 @@ LANG_CODES = {
|
|
15 |
"Sanskrit": "sa"
|
16 |
}
|
17 |
|
|
|
18 |
LANG_PRIMERS = {
|
19 |
-
"English": (
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
}
|
25 |
|
26 |
# Script detection regexes
|
@@ -77,17 +93,17 @@ def transcribe(audio, language_choice):
|
|
77 |
lang_code = LANG_CODES[language_choice]
|
78 |
primer_weak, primer_strong = LANG_PRIMERS[language_choice]
|
79 |
|
80 |
-
# Pass 1: loose mode to get context
|
81 |
loose_text = transcribe_once(
|
82 |
audio_path=audio,
|
83 |
lang_code=lang_code,
|
84 |
-
initial_prompt=
|
85 |
beam_size=8,
|
86 |
temperature=0.4,
|
87 |
condition_on_previous_text=True
|
88 |
)
|
89 |
|
90 |
-
# Pass 2: strict mode with context
|
91 |
strict_prompt = f"{primer_strong}\nContext: {loose_text}"
|
92 |
strict_text = transcribe_once(
|
93 |
audio_path=audio,
|
@@ -98,7 +114,7 @@ def transcribe(audio, language_choice):
|
|
98 |
condition_on_previous_text=False
|
99 |
)
|
100 |
|
101 |
-
# Check script match — if wrong, retry without context, only
|
102 |
if not is_script(strict_text, language_choice):
|
103 |
strict_text = transcribe_once(
|
104 |
audio_path=audio,
|
@@ -109,7 +125,7 @@ def transcribe(audio, language_choice):
|
|
109 |
condition_on_previous_text=False
|
110 |
)
|
111 |
|
112 |
-
#
|
113 |
if is_script(strict_text, language_choice):
|
114 |
hk_translit = transliterate_to_hk(strict_text, language_choice)
|
115 |
else:
|
|
|
15 |
"Sanskrit": "sa"
|
16 |
}
|
17 |
|
18 |
+
# Stronger primers in native script with example sentences
|
19 |
LANG_PRIMERS = {
|
20 |
+
"English": (
|
21 |
+
"The transcript should be in English only.",
|
22 |
+
"Write only in English without translation. Example: This is an English sentence."
|
23 |
+
),
|
24 |
+
"Tamil": (
|
25 |
+
"நகல் தமிழ் எழுத்துக்களில் மட்டும் இருக்க வேண்டும்.",
|
26 |
+
"தமிழ் எழுத்துக்களில் மட்டும் எழுதவும், மொழிபெயர்ப்பு செய்யக்கூடாது. உதாரணம்: இது ஒரு தமிழ் வாக்கியம்."
|
27 |
+
),
|
28 |
+
"Malayalam": (
|
29 |
+
"ട്രാൻസ്ക്രിപ്റ്റ് മലയാള ലിപിയിൽ ആയിരിക്കണം.",
|
30 |
+
"മലയാള ലിപിയിൽ മാത്രം എഴുതുക, വിവർത്തനം ചെയ്യരുത്. ഉദാഹരണം: ഇതൊരു മലയാള വാക്യമാണ്. എനിക്ക് മലയാളം അറിയാം."
|
31 |
+
),
|
32 |
+
"Hindi": (
|
33 |
+
"प्रतिलिपि केवल देवनागरी लिपि में होनी चाहिए।",
|
34 |
+
"केवल देवनागरी लिपि में लिखें, अनुवाद न करें। उदाहरण: यह एक हिंदी वाक्य है।"
|
35 |
+
),
|
36 |
+
"Sanskrit": (
|
37 |
+
"प्रतिलिपि केवल देवनागरी लिपि में होनी चाहिए।",
|
38 |
+
"केवल देवनागरी लिपि में लिखें, अनुवाद न करें। उदाहरण: अहं संस्कृतं जानामि।"
|
39 |
+
)
|
40 |
}
|
41 |
|
42 |
# Script detection regexes
|
|
|
93 |
lang_code = LANG_CODES[language_choice]
|
94 |
primer_weak, primer_strong = LANG_PRIMERS[language_choice]
|
95 |
|
96 |
+
# Pass 1: loose mode to get context (optional weak primer for bias)
|
97 |
loose_text = transcribe_once(
|
98 |
audio_path=audio,
|
99 |
lang_code=lang_code,
|
100 |
+
initial_prompt=primer_weak, # <-- Weak primer used here too
|
101 |
beam_size=8,
|
102 |
temperature=0.4,
|
103 |
condition_on_previous_text=True
|
104 |
)
|
105 |
|
106 |
+
# Pass 2: strict mode with strong primer + context
|
107 |
strict_prompt = f"{primer_strong}\nContext: {loose_text}"
|
108 |
strict_text = transcribe_once(
|
109 |
audio_path=audio,
|
|
|
114 |
condition_on_previous_text=False
|
115 |
)
|
116 |
|
117 |
+
# Check script match — if wrong, retry without context, only strong primer
|
118 |
if not is_script(strict_text, language_choice):
|
119 |
strict_text = transcribe_once(
|
120 |
audio_path=audio,
|
|
|
125 |
condition_on_previous_text=False
|
126 |
)
|
127 |
|
128 |
+
# Transliteration
|
129 |
if is_script(strict_text, language_choice):
|
130 |
hk_translit = transliterate_to_hk(strict_text, language_choice)
|
131 |
else:
|