Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -145,16 +145,28 @@ def transliterate_with_qwen(text, source_lang):
|
|
145 |
|
146 |
model, tokenizer = load_qwen_model()
|
147 |
if model is None or tokenizer is None:
|
148 |
-
return text #
|
149 |
|
150 |
try:
|
151 |
-
# Create prompts
|
152 |
if source_lang == "Tamil":
|
153 |
-
system_prompt = "Convert Tamil
|
154 |
-
user_prompt = f"Tamil
|
|
|
|
|
|
|
|
|
|
|
|
|
155 |
else: # Malayalam
|
156 |
-
system_prompt = "Convert Malayalam
|
157 |
-
user_prompt = f"Malayalam
|
|
|
|
|
|
|
|
|
|
|
|
|
158 |
|
159 |
# Format for Qwen
|
160 |
messages = [
|
@@ -166,27 +178,100 @@ def transliterate_with_qwen(text, source_lang):
|
|
166 |
inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
|
167 |
inputs = inputs.to(DEVICE)
|
168 |
|
169 |
-
# Generate
|
170 |
with torch.no_grad():
|
171 |
outputs = model.generate(
|
172 |
**inputs,
|
173 |
-
max_new_tokens=
|
174 |
-
temperature=0.
|
175 |
do_sample=True,
|
176 |
-
pad_token_id=tokenizer.eos_token_id
|
|
|
|
|
177 |
)
|
178 |
|
179 |
# Extract response
|
180 |
full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
|
181 |
response = full_response[len(prompt):].strip()
|
182 |
|
183 |
-
# Clean response
|
184 |
-
|
185 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
186 |
|
187 |
except Exception as e:
|
188 |
print(f"Qwen transliteration error: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
189 |
return text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
190 |
|
191 |
# ---------------- SPEECH RECOGNITION ---------------- #
|
192 |
|
|
|
145 |
|
146 |
model, tokenizer = load_qwen_model()
|
147 |
if model is None or tokenizer is None:
|
148 |
+
return get_simple_transliteration(text, source_lang) # Simple fallback
|
149 |
|
150 |
try:
|
151 |
+
# Create better prompts with examples
|
152 |
if source_lang == "Tamil":
|
153 |
+
system_prompt = "You are a Tamil transliteration expert. Convert Tamil script to English letters (Thanglish) like how Tamil people type on phones."
|
154 |
+
user_prompt = f"""Convert this Tamil text to Thanglish using English letters:
|
155 |
+
|
156 |
+
Tamil: நான் தமிழ் படிக்கிறேன்
|
157 |
+
Thanglish: naan tamil padikkiren
|
158 |
+
|
159 |
+
Tamil: {text}
|
160 |
+
Thanglish:"""
|
161 |
else: # Malayalam
|
162 |
+
system_prompt = "You are a Malayalam transliteration expert. Convert Malayalam script to English letters (Manglish) like how Malayalam people type on phones."
|
163 |
+
user_prompt = f"""Convert this Malayalam text to Manglish using English letters:
|
164 |
+
|
165 |
+
Malayalam: ഞാൻ മലയാളം പഠിക്കുന്നു
|
166 |
+
Manglish: njan malayalam padikkunnu
|
167 |
+
|
168 |
+
Malayalam: {text}
|
169 |
+
Manglish:"""
|
170 |
|
171 |
# Format for Qwen
|
172 |
messages = [
|
|
|
178 |
inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
|
179 |
inputs = inputs.to(DEVICE)
|
180 |
|
181 |
+
# Generate with better parameters
|
182 |
with torch.no_grad():
|
183 |
outputs = model.generate(
|
184 |
**inputs,
|
185 |
+
max_new_tokens=100,
|
186 |
+
temperature=0.3,
|
187 |
do_sample=True,
|
188 |
+
pad_token_id=tokenizer.eos_token_id,
|
189 |
+
eos_token_id=tokenizer.eos_token_id,
|
190 |
+
repetition_penalty=1.2
|
191 |
)
|
192 |
|
193 |
# Extract response
|
194 |
full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
|
195 |
response = full_response[len(prompt):].strip()
|
196 |
|
197 |
+
# Clean response - remove any remaining script characters
|
198 |
+
import re
|
199 |
+
response = response.split('\n')[0].strip() # Take first line
|
200 |
+
response = re.sub(r'[^\x00-\x7F]+', '', response) # Remove non-ASCII (script chars)
|
201 |
+
response = response.strip()
|
202 |
+
|
203 |
+
# Validate response (should not contain original script)
|
204 |
+
if source_lang == "Malayalam" and any(char in response for char in "അആഇഈഉഊഋഎഏഐഒഓഔകഖഗഘങചഛജഝഞടഠഡഢണതഥദധനപഫബഭമയരലവശഷസഹളഴറ"):
|
205 |
+
return get_simple_transliteration(text, source_lang)
|
206 |
+
elif source_lang == "Tamil" and any(char in response for char in "அஆஇஈஉஊஎஏஐஒஓஔகஙசஞடணதநபமயரலவழளற"):
|
207 |
+
return get_simple_transliteration(text, source_lang)
|
208 |
+
|
209 |
+
return response if response else get_simple_transliteration(text, source_lang)
|
210 |
|
211 |
except Exception as e:
|
212 |
print(f"Qwen transliteration error: {e}")
|
213 |
+
return get_simple_transliteration(text, source_lang)
|
214 |
+
|
215 |
+
# Trailing punctuation that is split off a token before the dictionary lookup
# and re-attached to the transliterated word afterwards.
_TRAILING_PUNCTUATION = '.,!?'

# Hand-written romanizations for a handful of common words, keyed by the
# language name passed in as ``lang_choice``.
_SIMPLE_WORD_MAPS = {
    "Malayalam": {
        "കേരളം": "kerala",
        "എന്റെ": "ente",
        "സ്വന്തം": "swantham",
        "നാടാണ്": "naadaan",
        "എനിക്ക്": "enikku",
        "മലയാളം": "malayalam",
        "വളരെ": "valare",
        "ഇഷ്ടമാണ്": "ishtamaan",
        "ഞാൻ": "njan",
        "പുസ്തകം": "pusthakam",
        "വായിക്കുന്നു": "vaayikkunnu",
    },
    "Tamil": {
        "அன்னை": "annai",
        "தமிழ்": "tamil",
        "எங்கள்": "engal",
        "தாய்மொழி": "thaaimozhi",
        "நான்": "naan",
        "இன்று": "indru",
        "நல்ல": "nalla",
        "வானிலை": "vaanilai",
    },
}


def get_simple_transliteration(text, lang_choice):
    """Transliterate ``text`` word by word without using the Qwen model.

    Each whitespace-separated token has trailing ``.,!?`` split off, is looked
    up in a small per-language dictionary of common words, and falls back to
    ``basic_phonetic_convert`` when it is not in the dictionary. The trailing
    punctuation is re-attached to the result.

    Args:
        text: Source text in Malayalam or Tamil script.
        lang_choice: ``"Malayalam"`` or ``"Tamil"``. Any other value makes the
            function return ``text`` unchanged.

    Returns:
        The transliterated tokens joined by single spaces (so runs of
        whitespace in the input are collapsed), or ``text`` itself for an
        unsupported ``lang_choice``.
    """
    word_map = _SIMPLE_WORD_MAPS.get(lang_choice)
    if word_map is None:
        return text

    result_words = []
    for word in text.split():
        # Remove trailing punctuation for the lookup, remembering it so it can
        # be put back after conversion.
        clean_word = word.rstrip(_TRAILING_PUNCTUATION)
        punct = word[len(clean_word):]

        if not clean_word:
            # The token is punctuation only. Keep it verbatim instead of
            # sending an empty string to the phonetic fallback, which would
            # turn "!" into "unknown!".
            result_words.append(word)
        elif clean_word in word_map:
            result_words.append(word_map[clean_word] + punct)
        else:
            # Unknown word: use the minimal phonetic fallback.
            result_words.append(basic_phonetic_convert(clean_word, lang_choice) + punct)

    return ' '.join(result_words)
|
261 |
+
|
262 |
+
def basic_phonetic_convert(word, lang_choice):
    """Return a crude ASCII approximation of ``word`` for unknown words.

    This is a minimal fallback, not a real transliterator:

    * Malayalam: the characters 'ം', 'ൺ' and 'ൻ' are replaced by ``m``, ``n``
      and ``n`` respectively, then every remaining non-ASCII character is
      dropped.
    * Tamil: every non-ASCII character is dropped.
    * Any other ``lang_choice``: ``word`` is returned unchanged.

    Args:
        word: A single token, normally already stripped of punctuation.
        lang_choice: ``"Malayalam"`` or ``"Tamil"``.

    Returns:
        The ASCII remainder of ``word``; the placeholder ``"unknown"`` when a
        non-empty Malayalam/Tamil word has no ASCII remainder; or ``word``
        itself when it is empty or the language is not supported.
    """
    import re

    if not word:
        # Nothing to convert: do not invent an "unknown" placeholder for an
        # empty token.
        return word

    if lang_choice == "Malayalam":
        # Replace some common Malayalam characters with approximate sounds
        # before the remaining script characters are removed.
        result = word.replace('ം', 'm').replace('ൺ', 'n').replace('ൻ', 'n')
    elif lang_choice == "Tamil":
        result = word
    else:
        return word

    # Remove whatever script (non-ASCII) characters are left.
    result = re.sub(r'[^\x00-\x7F]+', '', result)
    return result if result else "unknown"
|
275 |
|
276 |
# ---------------- SPEECH RECOGNITION ---------------- #
|
277 |
|