sudhanm committed on
Commit
5618139
·
verified ·
1 Parent(s): 2798eed

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -10
app.py CHANGED
@@ -15,12 +15,28 @@ LANG_CODES = {
15
  "Sanskrit": "sa"
16
  }
17
 
 
18
  LANG_PRIMERS = {
19
- "English": ("", ""),
20
- "Tamil": ("The transcript should be in Tamil script.", "Write only in Tamil script without translation."),
21
- "Malayalam": ("The transcript should be in Malayalam script.", "Write only in Malayalam script without translation."),
22
- "Hindi": ("The transcript should be in Devanagari script.", "Write only in Devanagari script without translation."),
23
- "Sanskrit": ("The transcript should be in Devanagari script.", "Write only in Devanagari script without translation.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  }
25
 
26
  # Script detection regexes
@@ -77,17 +93,17 @@ def transcribe(audio, language_choice):
77
  lang_code = LANG_CODES[language_choice]
78
  primer_weak, primer_strong = LANG_PRIMERS[language_choice]
79
 
80
- # Pass 1: loose mode to get context
81
  loose_text = transcribe_once(
82
  audio_path=audio,
83
  lang_code=lang_code,
84
- initial_prompt="",
85
  beam_size=8,
86
  temperature=0.4,
87
  condition_on_previous_text=True
88
  )
89
 
90
- # Pass 2: strict mode with context
91
  strict_prompt = f"{primer_strong}\nContext: {loose_text}"
92
  strict_text = transcribe_once(
93
  audio_path=audio,
@@ -98,7 +114,7 @@ def transcribe(audio, language_choice):
98
  condition_on_previous_text=False
99
  )
100
 
101
- # Check script match — if wrong, retry without context, only strict primer
102
  if not is_script(strict_text, language_choice):
103
  strict_text = transcribe_once(
104
  audio_path=audio,
@@ -109,7 +125,7 @@ def transcribe(audio, language_choice):
109
  condition_on_previous_text=False
110
  )
111
 
112
- # Final check before transliteration
113
  if is_script(strict_text, language_choice):
114
  hk_translit = transliterate_to_hk(strict_text, language_choice)
115
  else:
 
15
  "Sanskrit": "sa"
16
  }
17
 
18
+ # Stronger primers in native script with example sentences
19
  LANG_PRIMERS = {
20
+ "English": (
21
+ "The transcript should be in English only.",
22
+ "Write only in English without translation. Example: This is an English sentence."
23
+ ),
24
+ "Tamil": (
25
+ "நகல் தமிழ் எழுத்துக்களில் மட்டும் இருக்க வேண்டும்.",
26
+ "தமிழ் எழுத்துக்களில் மட்டும் எழுதவும், மொழிபெயர்ப்பு செய்யக்கூடாது. உதாரணம்: இது ஒரு தமிழ் வாக்கியம்."
27
+ ),
28
+ "Malayalam": (
29
+ "ട്രാൻസ്ക്രിപ്റ്റ് മലയാള ലിപിയിൽ ആയിരിക്കണം.",
30
+ "മലയാള ലിപിയിൽ മാത്രം എഴുതുക, വിവർത്തനം ചെയ്യരുത്. ഉദാഹരണം: ഇതൊരു മലയാള വാക്യമാണ്. എനിക്ക് മലയാളം അറിയാം."
31
+ ),
32
+ "Hindi": (
33
+ "प्रतिलिपि केवल देवनागरी लिपि में होनी चाहिए।",
34
+ "केवल देवनागरी लिपि में लिखें, अनुवाद न करें। उदाहरण: यह एक हिंदी वाक्य है।"
35
+ ),
36
+ "Sanskrit": (
37
+ "प्रतिलिपि केवल देवनागरी लिपि में होनी चाहिए।",
38
+ "केवल देवनागरी लिपि में लिखें, अनुवाद न करें। उदाहरण: अहं संस्कृतं जानामि।"
39
+ )
40
  }
41
 
42
  # Script detection regexes
 
93
  lang_code = LANG_CODES[language_choice]
94
  primer_weak, primer_strong = LANG_PRIMERS[language_choice]
95
 
96
+ # Pass 1: loose mode to get context (optional weak primer for bias)
97
  loose_text = transcribe_once(
98
  audio_path=audio,
99
  lang_code=lang_code,
100
+ initial_prompt=primer_weak, # <-- Weak primer used here too
101
  beam_size=8,
102
  temperature=0.4,
103
  condition_on_previous_text=True
104
  )
105
 
106
+ # Pass 2: strict mode with strong primer + context
107
  strict_prompt = f"{primer_strong}\nContext: {loose_text}"
108
  strict_text = transcribe_once(
109
  audio_path=audio,
 
114
  condition_on_previous_text=False
115
  )
116
 
117
+ # Check script match — if wrong, retry without context, only strong primer
118
  if not is_script(strict_text, language_choice):
119
  strict_text = transcribe_once(
120
  audio_path=audio,
 
125
  condition_on_previous_text=False
126
  )
127
 
128
+ # Transliteration
129
  if is_script(strict_text, language_choice):
130
  hk_translit = transliterate_to_hk(strict_text, language_choice)
131
  else: