Salam-AI / app.py
AlphaWice's picture
Update app.py
a0cbc8f verified
raw
history blame
15 kB
import gradio as gr
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
import re
# Module-level cache for the lazily loaded models. Every slot starts out as
# None and is filled in by load_models() the first time it is needed.
atlas_pipe = transliteration_tokenizer = transliteration_model = None
def load_models():
    """Load the Atlas-Chat pipeline and the transliteration model on first use.

    The loaded objects are stored in the module-level globals ``atlas_pipe``,
    ``transliteration_tokenizer`` and ``transliteration_model``; a slot that is
    already populated is not loaded again.

    Returns:
        The tuple ``(atlas_pipe, transliteration_tokenizer,
        transliteration_model)``.
    """
    global atlas_pipe, transliteration_tokenizer, transliteration_model

    # Chat model: text-generation pipeline, on GPU when one is available.
    if atlas_pipe is None:
        print("🏔️ Loading Atlas-Chat-2B model...")
        atlas_pipe = pipeline(
            "text-generation",
            model="MBZUAI-Paris/Atlas-Chat-2B",
            model_kwargs={"torch_dtype": torch.bfloat16},
            device="cuda" if torch.cuda.is_available() else "cpu",
        )
        print("✅ Atlas-Chat model loaded!")

    # Arabizi -> Arabic seq2seq model. Tokenizer and model are reloaded as a
    # pair if either one is missing.
    if transliteration_tokenizer is None or transliteration_model is None:
        print("🔄 Loading Transliteration model...")
        transliteration_tokenizer = AutoTokenizer.from_pretrained(
            "atlasia/Transliteration-Moroccan-Darija"
        )
        transliteration_model = AutoModelForSeq2SeqLM.from_pretrained(
            "atlasia/Transliteration-Moroccan-Darija"
        )
        print("✅ Transliteration model loaded!")

    return atlas_pipe, transliteration_tokenizer, transliteration_model
# Any character from the Arabic Unicode blocks rules Arabizi out.
_ARABIC_SCRIPT_RE = re.compile(
    r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]'
)

# Digits used as letters (3 = ع, 7 = ح, 9 = ق, ...). They only count when they
# are attached to a letter ("3lach", "n9der"); a free-standing number such as
# the "3" in "I have 3 apples" is an ordinary number.
_ARABIZI_DIGIT_RE = re.compile(
    r'(?<=[^\W\d_])[2356789]|[2356789](?=[^\W\d_])'
)

# Common Darija words and phrases in Latin script.
_ARABIZI_WORDS = (
    'wach', 'wash', 'ach', 'achno', 'chno', 'shno', 'shkoun', 'chkoun',
    'kif', 'kifash', 'ki', 'kayf', 'kien', 'kima',
    'feen', 'fin', 'fen', 'fain', 'mnin',
    'imta', 'meta', 'waqt', 'mata', 'emta',
    'hna', 'ahna', 'ana', 'nta', 'nti', 'ntuma', 'ntouma',
    'howa', 'hiya', 'huma', 'houma', 'hoa', 'hia',
    'had', 'hadchi', 'hada', 'hadi', 'hadou', 'hadouk',
    'bghit', 'bghiti', 'bgha', 'bghina', 'bghitiou',
    'galt', 'galti', 'gal', 'galet', 'galou',
    'rah', 'raha', 'rahi', 'rahom', 'rahin',
    'kan', 'kanu', 'kana', 'kanet', 'kano',
    'ghadi', 'ghad', 'gha', 'ghadia', 'ghadiyin',
    'daba', 'dak', 'dakchi', 'dik', 'dok',
    'bzf', 'bzzaf', 'bezzaf', 'bzaaaaf',
    'chway', 'chwiya', 'shwiya', 'chwia',
    'khoya', 'khuya', 'akhi', 'kho',
    'khti', 'khtiya', 'ukhti', 'kht',
    'mama', 'baba', 'lwaldin', 'lwalidin',
    'salam', 'salamu aleikum', 'slm',
    'yallah', 'yalla', 'hya', 'aji',
    'mabghitsh', 'mabghach', 'makansh', 'machi',
    'walakin', 'walaken', 'ama', 'mais',
    'kayn', 'makaynsh', 'chi', 'tayi',
)

# Whole-word match only: as plain substrings, short entries such as "ach",
# "chi" or "ana" also hit English words ("teach", "machine", "banana").
_ARABIZI_WORD_RE = re.compile(
    r'\b(?:'
    + '|'.join(
        re.escape(word)
        for word in sorted(_ARABIZI_WORDS, key=len, reverse=True)
    )
    + r')\b'
)


def detect_arabizi(text):
    """Heuristically decide whether *text* is Arabizi.

    Arabizi is Moroccan Darija written in Latin script, with digits standing
    in for some Arabic letters.

    The text is classified as Arabizi when it contains no Arabic-script
    character and at least one of the following holds:

    * it has both a letter-attached Arabizi digit and a known Arabizi word;
    * it has a letter-attached Arabizi digit and more than 60% of its
      characters are letters;
    * it has a known Arabizi word and more than 70% of its characters are
      letters.

    Known words are matched as whole words, case-insensitively.

    Args:
        text: The user input. ``None``, empty strings and inputs shorter
            than two non-blank characters are never Arabizi.

    Returns:
        ``True`` if the text looks like Arabizi, ``False`` otherwise.
    """
    if not text or len(text.strip()) < 2:
        return False

    # Arabic script present: the message is Arabic, not Arabizi.
    if _ARABIC_SCRIPT_RE.search(text):
        return False

    text_lower = text.lower()
    has_arabizi_numbers = _ARABIZI_DIGIT_RE.search(text_lower) is not None
    has_arabizi_words = _ARABIZI_WORD_RE.search(text_lower) is not None

    if has_arabizi_numbers and has_arabizi_words:
        return True

    # A single indicator is only trusted when the text is mostly letters.
    alpha_count = sum(1 for char in text if char.isalpha())
    if has_arabizi_numbers and alpha_count > len(text) * 0.6:
        return True
    if has_arabizi_words and alpha_count > len(text) * 0.7:
        return True
    return False
def arabizi_to_arabic_ai(arabizi_text):
    """Transliterate Arabizi text to Arabic script with the seq2seq model.

    This is a best-effort conversion: it never raises. If the models cannot
    be loaded, generation fails, or the model produces an empty string, the
    original text is returned unchanged.

    Args:
        arabizi_text: Darija written in Latin script.

    Returns:
        The Arabic-script transliteration with surrounding whitespace
        stripped, or ``arabizi_text`` itself when there is nothing to convert
        or the conversion failed.
    """
    # Nothing to transliterate: skip the model call entirely.
    if not arabizi_text or not arabizi_text.strip():
        return arabizi_text

    try:
        _, tokenizer, model = load_models()

        # Tokenize the input; anything beyond 512 tokens is truncated.
        input_tokens = tokenizer(
            arabizi_text,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        )

        # Inference only, so no gradients are needed.
        with torch.no_grad():
            output_tokens = model.generate(
                **input_tokens,
                max_length=512,
                num_beams=4,
                early_stopping=True,
                no_repeat_ngram_size=2,
            )

        arabic_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
        # An empty transliteration would send an empty prompt downstream, so
        # fall back to the untouched input instead.
        return arabic_text.strip() or arabizi_text
    except Exception as e:
        # Deliberate catch-all: the chat should keep working on the raw input.
        print(f"❌ Error in Arabizi→Arabic conversion: {e}")
        return arabizi_text
# Whole-word mappings (Arabic -> Arabizi), applied before the per-character
# fallback. Keys that appeared twice in the previous version keep the value
# that was effective at runtime (the later one).
_ARABIC_WORD_MAP = {
    # Pronouns and question words
    'أنا': 'ana', 'نتا': 'nta', 'نتي': 'nti', 'هوا': 'howa',
    'هيا': 'hya',  # was also listed as 'hiya'; the later entry won
    'حنا': 'hna', 'أحنا': 'ahna', 'نتوما': 'ntuma', 'هوما': 'huma',
    'شكون': 'shkoun', 'أشنو': 'achno', 'شنو': 'chno', 'واش': 'wach',
    'كيفاش': 'kifash', 'كيف': 'kif', 'فين': 'feen', 'منين': 'mnin',
    'إمتا': 'imta', 'متا': 'meta', 'علاش': '3lach', 'أش': 'ach',
    # Frequent verbs and particles
    'بغيت': 'bghit', 'بغيتي': 'bghiti', 'بغا': 'bgha', 'بغينا': 'bghina',
    'كان': 'kan', 'كانا': 'kana', 'كانت': 'kanet', 'كانو': 'kanu',
    'قلت': 'galt', 'قلتي': 'galti', 'قال': 'gal', 'قالت': 'galet',
    'راح': 'rah', 'راها': 'raha', 'راهي': 'rahi', 'راهم': 'rahom',
    'غادي': 'ghadi', 'غاد': 'ghad', 'غا': 'gha',
    'هاد': 'had', 'هادا': 'hada', 'هادي': 'hadi', 'هادشي': 'hadchi',
    'داك': 'dak', 'ديك': 'dik', 'داكشي': 'dakchi',
    'بزاف': 'bzzaf', 'شوياة': 'chwiya', 'كولشي': 'kolchi',
    'ماشي': 'machi', 'مابغيتش': 'mabghitsh',
    'ماكاينش': 'makaynsh',  # was also listed as 'makainch'; the later entry won
    'دابا': 'daba', 'توا': 'tawa', 'غدا': 'ghda',
    'ماما': 'mama', 'بابا': 'baba', 'خويا': 'khoya', 'ختي': 'khti',
    'سلام': 'salam', 'يالاه': 'yallah',
    'المغرب': 'lmaghrib', 'مغرب': 'maghrib',
    'طاجين': 'tajine', 'أتاي': 'atay', 'خوبز': 'khobz',
    'كاين': 'kayn', 'شي': 'chi',
    'زوين': 'zwin', 'زوينا': 'zwina', 'مزيان': 'mzyan', 'مزيانا': 'mzyana',
    # Verb conjugations
    'درت': 'dert', 'درتي': 'derti', 'دار': 'dar', 'درات': 'derat',
    'مشيت': 'mchit', 'مشيتي': 'mchiti', 'مشا': 'mcha', 'مشات': 'mchat',
    'جيت': 'jit', 'جيتي': 'jiti', 'جا': 'ja', 'جات': 'jat',
    'شفت': 'cheft', 'شفتي': 'chefti', 'شاف': 'chaf', 'شافت': 'chafat',
    'سمعت': 'sme3t', 'سمعتي': 'sme3ti', 'سمع': 'sma3', 'سمعات': 'sma3at',
    'أكلت': 'klit', 'أكلتي': 'kliti', 'كلا': 'kla', 'كلات': 'klat',
    'شربت': 'chrebt', 'شربتي': 'chrebti', 'شرب': 'chreb', 'شربات': 'chrebat',
    'نعست': 'ne3st', 'نعستي': 'ne3sti', 'نعس': 'ne3s', 'نعسات': 'ne3sat',
    'خرجت': 'khrjt', 'خرجتي': 'khrjti', 'خرج': 'khrj', 'خرجات': 'khrjat',
    'دخلت': 'dkhlt', 'دخلتي': 'dkhlti', 'دخل': 'dkhl', 'دخلات': 'dkhlat',
    'قريت': 'qrit', 'قريتي': 'qriti', 'قرا': 'qra', 'قرات': 'qrat',
    'كتبت': 'ktebt', 'كتبتي': 'ktebti', 'كتب': 'kteb', 'كتبات': 'ktebat',
    'لعبت': 'l3ebt', 'لعبتي': 'l3ebti', 'لعب': 'l3eb', 'لعبات': 'l3ebat',
    'خدمت': 'khdmt', 'خدمتي': 'khdmti', 'خدم': 'khdm', 'خدمات': 'khdmat',
    'صليت': 'sllit', 'صليتي': 'slliti', 'صلا': 'slla', 'صلات': 'sllat',
    'طبخت': '6bkht', 'طبختي': '6bkhti', 'طبخ': '6bekh', 'طبخات': '6bekhat',
    # Numbers
    'واحد': 'wa7ed', 'جوج': 'joj', 'تلاتا': 'tlata', 'ربعا': 'reb3a',
    'خمسا': 'khamsa', 'ستا': 'setta', 'سبعا': 'seb3a', 'تمنيا': 'tmnya',
    'تسعا': 'tes3a', 'عشرا': '3echra', 'حداش': '7dach', 'طناش': '6nach',
    # Time
    'نهار': 'nhar', 'ليلا': 'lila', 'صباح': 'sba7', 'عشيا': '3echiya',
    'أمس': 'ems', 'البارح': 'lbare7', 'بعد غدا': 'b3d ghda',
    # Places and transport
    'بيت': 'bit', 'شارع': 'char3', 'مدينا': 'mdina',
    'كرهوبا': 'karhouba', 'طوموبيل': 'tomobil', 'قطار': 'q6ar', 'باص': 'bas',
    # Food, health, money, work and school
    'ماكلا': 'makla', 'شراب': 'chrab', 'لما': 'lma', 'عطش': '36ch',
    'جوع': 'jo3', 'شبعان': 'cheb3an', 'عيان': '3yyan', 'صحيح': 's7i7',
    'مريض': 'mrid', 'دكتور': 'doktor', 'سبيطار': 'sbitar', 'دوا': 'dwa',
    'فلوس': 'flous', 'درهم': 'derhem', 'ريال': 'riyal', 'اليورو': 'lyoro',
    'خدما': 'khedma', 'معلم': 'mo3alim', 'طالب': 'talib', 'أستاذ': 'ostaz',
    'كتاب': 'ktab', 'قلم': 'qalam', 'كاغط': 'kaghet', 'طاولا': 'tabla',
}

# One alternation over every known word, longest first so that the multi-word
# entry is tried before its parts. \b keeps matches to whole words.
_ARABIC_WORD_RE = re.compile(
    r'\b(?:'
    + '|'.join(
        re.escape(word)
        for word in sorted(_ARABIC_WORD_MAP, key=len, reverse=True)
    )
    + r')\b'
)

# Per-character fallback (Arabic -> Arabizi) for everything the word map does
# not cover. Combining marks and full-width punctuation are written as
# escapes so they stay visible and unambiguous in the source.
_ARABIC_CHAR_TABLE = str.maketrans({
    'ا': 'a', 'ب': 'b', 'ت': 't', 'ث': 'th', 'ج': 'j', 'ح': '7',
    'خ': 'kh', 'د': 'd', 'ذ': 'dh', 'ر': 'r', 'ز': 'z', 'س': 's',
    'ش': 'sh', 'ص': 's', 'ض': 'd', 'ط': '6', 'ظ': 'z', 'ع': '3',
    'غ': 'gh', 'ف': 'f', 'ق': '9', 'ك': 'k', 'ل': 'l', 'م': 'm',
    'ن': 'n', 'ه': 'h', 'و': 'w', 'ي': 'y', 'ء': '2',
    'آ': 'aa', 'أ': 'a', 'إ': 'i', 'ة': 'a', 'ى': 'a',
    '؟': '?', '،': ',', '؛': ';',
    '\uFF1A': ':',   # full-width colon
    '\uFF01': '!',   # full-width exclamation mark
    '\u064E': 'a',   # fatha
    '\u064F': 'o',   # damma
    '\u0650': 'i',   # kasra
    '\u064B': 'an',  # fathatan
    '\u064C': 'on',  # dammatan
    '\u064D': 'in',  # kasratan
})


def arabic_to_arabizi(arabic_text):
    """Convert Arabic-script Darija to Arabizi with static lookup tables.

    Rule-based fallback for the reverse direction of the transliteration
    model. Known whole words are replaced first using a curated word list;
    every remaining Arabic letter, diacritic or punctuation mark is then
    mapped character by character. Characters with no mapping (Latin text,
    digits, unmapped symbols) pass through unchanged.

    Args:
        arabic_text: Text to convert. Falsy values (``None``, ``""``) are
            returned as they are.

    Returns:
        The Arabizi rendering with leading and trailing whitespace stripped.
    """
    if not arabic_text:
        return arabic_text

    # Step 1: whole words, in a single pass over the text.
    result = _ARABIC_WORD_RE.sub(
        lambda match: _ARABIC_WORD_MAP[match.group(0)], arabic_text
    )

    # Step 2: per-character fallback for whatever is still in Arabic script.
    return result.translate(_ARABIC_CHAR_TABLE).strip()
def chat_with_atlas(message, history):
    """Answer one chat message, mirroring the script the user wrote in.

    Pipeline:
      1. Detect whether the message is Arabizi.
      2. If so, transliterate it to Arabic script with the AI model.
      3. Send the (possibly converted) message to the Atlas-Chat pipeline.
      4. If the input was Arabizi, convert the answer back to Arabizi with
         the rule-based mapper; otherwise return the answer as generated.

    Args:
        message: The user's message. ``None``, empty or whitespace-only
            input gets a fixed bilingual greeting.
        history: Previous turns as supplied by the chat UI. Not used: each
            message is answered on its own.

    Returns:
        The reply text. On failure an error message is returned instead of
        raising, in Arabizi when the input was Arabizi and in Arabic/English
        otherwise.
    """
    if not message or not message.strip():
        return "ahlan wa sahlan! kifash n9der n3awnek? / مرحبا! كيفاش نقدر نعاونك؟"

    # Determined once and reused by the error handler, so the handler does
    # not have to run detection again.
    is_arabizi_input = False
    try:
        # Detect first: it is cheap, and the error message language is then
        # correct even when model loading is what fails.
        is_arabizi_input = detect_arabizi(message)

        atlas_model, _, _ = load_models()

        print("\n" + "=" * 60)
        print("🔍 DEBUG LOG - FULL CONVERSION PIPELINE")
        print("=" * 60)
        print(f"📥 ORIGINAL INPUT: '{message}'")
        print(f"🤖 ARABIZI DETECTED: {is_arabizi_input}")

        # Prepare the model input.
        if is_arabizi_input:
            print("\n🔄 STEP 1: Converting Arabizi to Arabic...")
            model_input = arabizi_to_arabic_ai(message)
            print(f"✅ ARABIC CONVERSION: '{model_input}'")
        else:
            # Arabic or English input goes to the model untouched.
            print("\n➡️ NO CONVERSION NEEDED - Using original input")
            model_input = message

        print("\n🤖 STEP 2: Sending to Atlas-Chat model...")
        print(f"📤 MODEL INPUT: '{model_input}'")

        messages = [{"role": "user", "content": model_input}]
        outputs = atlas_model(
            messages,
            max_new_tokens=256,
            temperature=0.1,
            do_sample=True,
            pad_token_id=atlas_model.tokenizer.eos_token_id,
        )

        # The pipeline returns the whole conversation; the last entry is the
        # newly generated assistant turn.
        response = outputs[0]["generated_text"][-1]["content"].strip()
        print(f"✅ MODEL RESPONSE (Arabic): '{response}'")

        if is_arabizi_input:
            print("\n🔄 STEP 3: Converting response back to Arabizi...")
            arabizi_response = arabic_to_arabizi(response)
            print(f"✅ FINAL ARABIZI RESPONSE: '{arabizi_response}'")
            print("=" * 60)
            print("🎯 FINAL OUTPUT TO USER:", arabizi_response)
            print("=" * 60 + "\n")
            return arabizi_response

        print("\n➡️ NO BACK-CONVERSION NEEDED")
        print("=" * 60)
        print("🎯 FINAL OUTPUT TO USER:", response)
        print("=" * 60 + "\n")
        return response

    except Exception as e:
        # Top-level boundary of the chat callback: report the problem to the
        # user instead of crashing the UI.
        print(f"\n❌ ERROR OCCURRED: {str(e)}")
        print("=" * 60 + "\n")
        if is_arabizi_input:
            return f"sorry, kan chi mochkil: {str(e)}. 3awd jar'b!"
        return f"عذراً، واجهت خطأ: {str(e)}. جرب مرة أخرى! / Sorry, error occurred: {str(e)}. Try again!"
# Build the Gradio chat UI around chat_with_atlas. The description is rendered
# as Markdown, so sections are separated by blank lines.
demo = gr.ChatInterface(
    fn=chat_with_atlas,
    title="🏔️ Atlas-Chat: Advanced Moroccan Arabic AI",
    description="""
**مرحبا بك في أطلس شات المطور!** Welcome to Advanced Atlas-Chat! 🇲🇦

**🧠 AI-Powered Language Detection & Conversion:**

- **Arabic Script (العربية)** → AI responds in Arabic
- **Arabizi (3arabi bi 7oruf latin)** → AI-powered conversion → Arabizi response
- **English** → AI responds in English

**⚡ Professional Arabizi Conversion**

- Uses specialized AI model trained on Moroccan Darija
- Perfect understanding of context: "kayn chi" → "كاين شي"
- Handles complex phrases accurately

**جرب هذه الأسئلة / Try these questions:**
""",
    # Sample prompts covering all three input styles: Arabic script,
    # Arabizi and English.
    examples=[
        "شكون لي صنعك؟",
        "shkoun li sna3ek?",
        "اشنو هو الطاجين؟",
        "achno howa tajine?",
        "شنو كيتسمى المنتخب المغربي؟",
        "chno kaytsma lmontakhab lmaghribi?",
        "What is Morocco famous for?",
        "كيفاش نقدر نتعلم الدارجة؟",
        "kifash n9der nt3elem darija?",
        "wach kayn atay f lmaghrib?",
        "3lach lmaghrib zwien bzzaf?",
        "kifash nsali tajine?",
        "chno homa l2aklat lmaghribiya?",
        "kayn chi restaurants zwinin f casa?",
        "mr7ba! kif dayr?",
    ],
    # Do not run every example through the models at startup.
    cache_examples=False,
)

# Start the web server only when the file is run as a script.
if __name__ == "__main__":
    demo.launch()