sudhanm committed on
Commit
9c2f50b
·
verified ·
1 Parent(s): be6893d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +98 -13
app.py CHANGED
@@ -145,16 +145,28 @@ def transliterate_with_qwen(text, source_lang):
145
 
146
  model, tokenizer = load_qwen_model()
147
  if model is None or tokenizer is None:
148
- return text # Return original if model fails
149
 
150
  try:
151
- # Create prompts
152
  if source_lang == "Tamil":
153
- system_prompt = "Convert Tamil text to natural Thanglish (how Tamil people type on phones). Only output the romanized text."
154
- user_prompt = f"Tamil: {text}\nThanglish:"
 
 
 
 
 
 
155
  else: # Malayalam
156
- system_prompt = "Convert Malayalam text to natural Manglish (how Malayalam people type on phones). Only output the romanized text."
157
- user_prompt = f"Malayalam: {text}\nManglish:"
 
 
 
 
 
 
158
 
159
  # Format for Qwen
160
  messages = [
@@ -166,27 +178,100 @@ def transliterate_with_qwen(text, source_lang):
166
  inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
167
  inputs = inputs.to(DEVICE)
168
 
169
- # Generate
170
  with torch.no_grad():
171
  outputs = model.generate(
172
  **inputs,
173
- max_new_tokens=50,
174
- temperature=0.1,
175
  do_sample=True,
176
- pad_token_id=tokenizer.eos_token_id
 
 
177
  )
178
 
179
  # Extract response
180
  full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
181
  response = full_response[len(prompt):].strip()
182
 
183
- # Clean response
184
- response = response.split('\n')[0].strip() # Take first line only
185
- return response if response else text
 
 
 
 
 
 
 
 
 
 
186
 
187
  except Exception as e:
188
  print(f"Qwen transliteration error: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
189
  return text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
190
 
191
  # ---------------- SPEECH RECOGNITION ---------------- #
192
 
 
145
 
146
  model, tokenizer = load_qwen_model()
147
  if model is None or tokenizer is None:
148
+ return get_simple_transliteration(text, source_lang) # Simple fallback
149
 
150
  try:
151
+ # Create better prompts with examples
152
  if source_lang == "Tamil":
153
+ system_prompt = "You are a Tamil transliteration expert. Convert Tamil script to English letters (Thanglish) like how Tamil people type on phones."
154
+ user_prompt = f"""Convert this Tamil text to Thanglish using English letters:
155
+
156
+ Tamil: நான் தமிழ் படிக்கிறேன்
157
+ Thanglish: naan tamil padikkiren
158
+
159
+ Tamil: {text}
160
+ Thanglish:"""
161
  else: # Malayalam
162
+ system_prompt = "You are a Malayalam transliteration expert. Convert Malayalam script to English letters (Manglish) like how Malayalam people type on phones."
163
+ user_prompt = f"""Convert this Malayalam text to Manglish using English letters:
164
+
165
+ Malayalam: ഞാൻ മലയാളം പഠിക്കുന്നു
166
+ Manglish: njan malayalam padikkunnu
167
+
168
+ Malayalam: {text}
169
+ Manglish:"""
170
 
171
  # Format for Qwen
172
  messages = [
 
178
  inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
179
  inputs = inputs.to(DEVICE)
180
 
181
+ # Generate with better parameters
182
  with torch.no_grad():
183
  outputs = model.generate(
184
  **inputs,
185
+ max_new_tokens=100,
186
+ temperature=0.3,
187
  do_sample=True,
188
+ pad_token_id=tokenizer.eos_token_id,
189
+ eos_token_id=tokenizer.eos_token_id,
190
+ repetition_penalty=1.2
191
  )
192
 
193
  # Extract response
194
  full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
195
  response = full_response[len(prompt):].strip()
196
 
197
+ # Clean response - remove any remaining script characters
198
+ import re
199
+ response = response.split('\n')[0].strip() # Take first line
200
+ response = re.sub(r'[^\x00-\x7F]+', '', response) # Remove non-ASCII (script chars)
201
+ response = response.strip()
202
+
203
+ # Validate response (should not contain original script)
204
+ if source_lang == "Malayalam" and any(char in response for char in "അആഇഈഉഊഋഎഏഐഒഓഔകഖഗഘങചഛജഝഞടഠഡഢണതഥദധനപഫബഭമയരലവശഷസഹളഴറ"):
205
+ return get_simple_transliteration(text, source_lang)
206
+ elif source_lang == "Tamil" and any(char in response for char in "அஆஇஈஉஊஎஏஐஒஓஔகஙசஞடணதநபமயரலவழளற"):
207
+ return get_simple_transliteration(text, source_lang)
208
+
209
+ return response if response else get_simple_transliteration(text, source_lang)
210
 
211
  except Exception as e:
212
  print(f"Qwen transliteration error: {e}")
213
+ return get_simple_transliteration(text, source_lang)
214
+
215
# Word-level romanization tables used when the Qwen model is unavailable or
# returns unusable output.  Built once at import time instead of on every call.
_FALLBACK_WORD_MAPS = {
    "Malayalam": {
        "കേരളം": "kerala",
        "എന്റെ": "ente",
        "സ്വന്തം": "swantham",
        "നാടാണ്": "naadaan",
        "എനിക്ക്": "enikku",
        "മലയാളം": "malayalam",
        "വളരെ": "valare",
        "ഇഷ്ടമാണ്": "ishtamaan",
        "ഞാൻ": "njan",
        "പുസ്തകം": "pusthakam",
        "വായിക്കുന്നു": "vaayikkunnu",
    },
    "Tamil": {
        "அன்னை": "annai",
        "தமிழ்": "tamil",
        "எங்கள்": "engal",
        "தாய்மொழி": "thaaimozhi",
        "நான்": "naan",
        "இன்று": "indru",
        "நல்ல": "nalla",
        "வானிலை": "vaanilai",
    },
}

# Punctuation peeled off both edges of a token before the table lookup and
# re-attached afterwards, so quoted or parenthesised words still match.
_EDGE_PUNCTUATION = ".,!?;:\"'()[]{}"


def get_simple_transliteration(text, lang_choice):
    """Romanize *text* word by word without using the Qwen model.

    Each whitespace-separated token is split into leading punctuation, a core
    word and trailing punctuation.  The core word is looked up in a small
    table of common words for ``lang_choice``; words that are not in the table
    are passed to ``basic_phonetic_convert``.  The punctuation is re-attached
    unchanged and the tokens are joined with single spaces.

    Args:
        text: The Tamil or Malayalam text to romanize.
        lang_choice: ``"Tamil"`` or ``"Malayalam"``.

    Returns:
        The romanized text.  ``text`` is returned unchanged when it is empty
        or ``None``, or when ``lang_choice`` is any other value.
    """
    word_map = _FALLBACK_WORD_MAPS.get(lang_choice)
    # This function is the last-resort fallback (also called from an error
    # handler), so it must not raise on a missing/empty input.
    if word_map is None or not text:
        return text

    result_words = []
    for token in text.split():
        without_lead = token.lstrip(_EDGE_PUNCTUATION)
        lead = token[:len(token) - len(without_lead)]
        core = without_lead.rstrip(_EDGE_PUNCTUATION)
        trail = without_lead[len(core):]

        if not core:
            # Token is punctuation only: keep it verbatim rather than
            # sending an empty word to the phonetic converter.
            result_words.append(token)
        elif core in word_map:
            result_words.append(lead + word_map[core] + trail)
        else:
            # For unknown words, try basic phonetic conversion
            result_words.append(
                lead + basic_phonetic_convert(core, lang_choice) + trail
            )

    return ' '.join(result_words)
261
+
262
# Character-level romanization tables for the phonetic fallback.
#   consonants  - letters that carry an inherent "a" unless a vowel sign or
#                 virama follows
#   vowel_signs - dependent vowel marks that replace the inherent "a"
#   standalone  - independent vowels and other marks emitted as-is
#   virama      - the mark that suppresses the inherent "a"
# The spellings are informal approximations (long e/o are written "e"/"o").
_PHONETIC_TABLES = {
    "Tamil": {
        "consonants": {
            "க": "k", "ங": "ng", "ச": "s", "ஜ": "j", "ஞ": "nj",
            "ட": "t", "ண": "n", "த": "th", "ந": "n", "ன": "n",
            "ப": "p", "ம": "m", "ய": "y", "ர": "r", "ற": "r",
            "ல": "l", "ள": "l", "ழ": "zh", "வ": "v",
            "ஷ": "sh", "ஸ": "s", "ஹ": "h",
        },
        "vowel_signs": {
            "\u0bbe": "aa", "\u0bbf": "i", "\u0bc0": "ee",
            "\u0bc1": "u", "\u0bc2": "oo",
            "\u0bc6": "e", "\u0bc7": "e", "\u0bc8": "ai",
            "\u0bca": "o", "\u0bcb": "o", "\u0bcc": "au",
            "\u0bd7": "au",  # AU length mark seen on its own
        },
        "standalone": {
            "அ": "a", "ஆ": "aa", "இ": "i", "ஈ": "ee", "உ": "u", "ஊ": "oo",
            "எ": "e", "ஏ": "e", "ஐ": "ai", "ஒ": "o", "ஓ": "o", "ஔ": "au",
            "\u0b82": "m",  # anusvara
        },
        "virama": "\u0bcd",
    },
    "Malayalam": {
        "consonants": {
            "ക": "k", "ഖ": "kh", "ഗ": "g", "ഘ": "gh", "ങ": "ng",
            "ച": "ch", "ഛ": "chh", "ജ": "j", "ഝ": "jh", "ഞ": "nj",
            "ട": "t", "ഠ": "th", "ഡ": "d", "ഢ": "dh", "ണ": "n",
            "ത": "th", "ഥ": "th", "ദ": "d", "ധ": "dh", "ന": "n",
            "പ": "p", "ഫ": "ph", "ബ": "b", "ഭ": "bh", "മ": "m",
            "യ": "y", "ര": "r", "ല": "l", "വ": "v",
            "ശ": "sh", "ഷ": "sh", "സ": "s", "ഹ": "h",
            "ള": "l", "ഴ": "zh", "റ": "r",
        },
        "vowel_signs": {
            "\u0d3e": "aa", "\u0d3f": "i", "\u0d40": "ee",
            "\u0d41": "u", "\u0d42": "oo", "\u0d43": "ru",
            "\u0d46": "e", "\u0d47": "e", "\u0d48": "ai",
            "\u0d4a": "o", "\u0d4b": "o", "\u0d4c": "au",
            "\u0d57": "au",  # AU length mark, used as the modern "au" sign
        },
        "standalone": {
            "അ": "a", "ആ": "aa", "ഇ": "i", "ഈ": "ee", "ഉ": "u", "ഊ": "oo",
            "ഋ": "ru", "എ": "e", "ഏ": "e", "ഐ": "ai",
            "ഒ": "o", "ഓ": "o", "ഔ": "au",
            "\u0d02": "m",  # anusvara
            "\u0d03": "h",  # visarga
            # Chillu letters: pure consonants with no inherent vowel.
            "\u0d7a": "n", "\u0d7b": "n", "\u0d7c": "r",
            "\u0d7d": "l", "\u0d7e": "l", "\u0d7f": "k",
        },
        "virama": "\u0d4d",
    },
}


def basic_phonetic_convert(word, lang_choice):
    """Approximate a Tamil or Malayalam word in ASCII letters.

    The word is romanized one character at a time: a consonant contributes its
    letters plus an inherent "a", unless it is followed by a vowel sign (which
    replaces the "a") or a virama (which removes it).  ASCII characters pass
    through unchanged and any other character without a mapping is dropped, so
    the result is always ASCII.

    Args:
        word: A single word, normally with surrounding punctuation removed.
        lang_choice: ``"Tamil"`` or ``"Malayalam"``.

    Returns:
        The romanized word, ``"unknown"`` if nothing could be produced (for
        example an empty word or one made only of unmapped characters), or
        ``word`` unchanged when ``lang_choice`` is any other value.
    """
    import unicodedata  # function-scope import, as elsewhere in this file

    table = _PHONETIC_TABLES.get(lang_choice)
    if table is None:
        return word

    consonants = table["consonants"]
    vowel_signs = table["vowel_signs"]
    standalone = table["standalone"]
    virama = table["virama"]

    pieces = []
    inherent_pending = False  # last consonant still owes its inherent "a"
    # NFC merges two-part vowel signs (e.g. E + AA -> O) into the single code
    # points the tables are keyed on.
    for ch in unicodedata.normalize("NFC", word):
        if ch in consonants:
            if inherent_pending:
                pieces.append("a")
            pieces.append(consonants[ch])
            inherent_pending = True
        elif ch in vowel_signs:
            pieces.append(vowel_signs[ch])
            inherent_pending = False
        elif ch == virama:
            inherent_pending = False
        else:
            if inherent_pending:
                pieces.append("a")
                inherent_pending = False
            if ch in standalone:
                pieces.append(standalone[ch])
            elif ord(ch) < 128:
                pieces.append(ch)
            # Any other character has no mapping and is dropped.
    if inherent_pending:
        pieces.append("a")

    result = "".join(pieces)
    return result if result else "unknown"
275
 
276
  # ---------------- SPEECH RECOGNITION ---------------- #
277