|
import gradio as gr |
|
import torch |
|
from transformers import pipeline |
|
import re |
|
import os |
|
from huggingface_hub import login |
|
from gradio_client import Client |
|
|
|
|
|
# Log in to the Hugging Face Hub only when a token is provided through the
# environment; no login is attempted otherwise.
_hf_token = os.environ.get("HF_TOKEN")
if _hf_token is not None:
    login(token=_hf_token)

# Module-level singletons, populated on first use by load_models().
atlas_pipe = transliteration_client = None
|
|
|
def load_models():
    """Return the shared ``(atlas_pipe, transliteration_client)`` pair.

    Both objects are kept in module-level globals and built only while the
    corresponding global is still ``None``:

    * ``atlas_pipe`` is a ``text-generation`` pipeline for
      ``MBZUAI-Paris/Atlas-Chat-2B``, placed on CUDA when available and on
      the CPU otherwise.
    * ``transliteration_client`` is a ``gradio_client.Client`` for the helper
      Space. A connection failure is logged and leaves the global ``None``,
      so the connection is attempted again on the next call.

    Returns:
        tuple: ``(atlas_pipe, transliteration_client)``; the second item may
        be ``None`` when the helper Space could not be reached.
    """
    global atlas_pipe, transliteration_client

    # --- text-generation model -------------------------------------------
    if atlas_pipe is None:
        print("🏔️ Loading Atlas-Chat-2B model...")
        target_device = "cuda" if torch.cuda.is_available() else "cpu"
        atlas_pipe = pipeline(
            "text-generation",
            model="MBZUAI-Paris/Atlas-Chat-2B",
            model_kwargs={"torch_dtype": torch.bfloat16},
            device=target_device,
        )
        print("✅ Atlas-Chat model loaded!")

    # --- Arabizi -> Arabic helper Space ----------------------------------
    if transliteration_client is None:
        helper_space = "YOUR-USERNAME/arabizi-transliteration-helper"
        try:
            print("🔗 Connecting to transliteration service...")
            transliteration_client = Client(helper_space)
            print("✅ Transliteration client connected!")
        except Exception as connect_error:
            print(f"❌ Failed to connect to transliteration service: {connect_error}")
            transliteration_client = None

    return atlas_pipe, transliteration_client
|
|
|
# Any code point from the Arabic Unicode blocks means the text is already
# written in Arabic script.
_ARABIC_SCRIPT_RE = re.compile(
    r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]'
)

# A digit standing in for an Arabic letter is glued to Latin letters
# ("3lach", "n9der", "mr7ba", "3'adi"); a free-standing number ("3 apples")
# is not an Arabizi signal.
_ARABIZI_DIGIT_RE = re.compile(r"[a-z]'?[2356789]|[2356789]'?[a-z]")

# Frequent Darija words in Latin spelling.
_ARABIZI_WORDS = (
    'wach', 'wash', 'ach', 'achno', 'chno', 'shno', 'shkoun', 'chkoun',
    'kif', 'kifash', 'ki', 'kayf', 'kien', 'kima',
    'feen', 'fin', 'fen', 'fain', 'mnin',
    'imta', 'meta', 'waqt', 'mata', 'emta',
    'hna', 'ahna', 'ana', 'nta', 'nti', 'ntuma', 'ntouma',
    'howa', 'hiya', 'huma', 'houma', 'hoa', 'hia',
    'had', 'hadchi', 'hada', 'hadi', 'hadou', 'hadouk',
    'bghit', 'bghiti', 'bgha', 'bghina', 'bghitiou',
    'galt', 'galti', 'gal', 'galet', 'galou',
    'rah', 'raha', 'rahi', 'rahom', 'rahin',
    'kan', 'kanu', 'kana', 'kanet', 'kano',
    'ghadi', 'ghad', 'gha', 'ghadia', 'ghadiyin',
    'daba', 'dak', 'dakchi', 'dik', 'dok',
    'bzf', 'bzzaf', 'bezzaf', 'bzaaaaf',
    'chway', 'chwiya', 'shwiya', 'chwia',
    'khoya', 'khuya', 'akhi', 'kho',
    'khti', 'khtiya', 'ukhti', 'kht',
    'mama', 'baba', 'lwaldin', 'lwalidin',
    'salam', 'salamu aleikum', 'slm',
    'yallah', 'yalla', 'hya', 'aji',
    'mabghitsh', 'mabghach', 'makansh', 'machi',
    'walakin', 'walaken', 'ama', 'mais',
    'kayn', 'makaynsh', 'chi', 'tayi',
    # Spellings that used to be caught only because a shorter entry happened
    # to be a substring of them; listed explicitly now that matching is
    # whole-word.
    'kifach', 'kayna', 'kaynin',
)

# Whole-word alternation: a raw substring scan flags English words such as
# "machine" (contains "machi") or "teacher" (contains "ach").
_ARABIZI_WORD_RE = re.compile(
    r'\b(?:' + '|'.join(re.escape(word) for word in _ARABIZI_WORDS) + r')\b'
)


def detect_arabizi(text):
    """Heuristically decide whether *text* is Arabizi.

    Arabizi is Arabic (here: Moroccan Darija) typed with Latin letters and
    digits that stand in for Arabic sounds. The decision is made as follows:

    1. Empty input, or fewer than two non-blank characters -> ``False``.
    2. Any Arabic-script character -> ``False``.
    3. Otherwise two signals are computed on the lower-cased text:
       *digit* - one of ``2 3 5 6 7 8 9`` directly attached to a Latin
       letter; *word* - a whole-word hit from the Darija keyword list.
       The result is ``True`` when both signals fire, when the digit signal
       fires and letters make up more than 60 % of the text, or when the
       word signal fires and letters make up more than 70 % of the text.

    NOTE: a few keywords ("had", "mais", "mama", ...) are also ordinary
    English/French words, so short sentences built around them can still be
    misclassified.

    Args:
        text: The raw user message.

    Returns:
        bool: ``True`` when the message looks like Arabizi.
    """
    if not text or len(text.strip()) < 2:
        return False

    if _ARABIC_SCRIPT_RE.search(text):
        return False

    lowered = text.lower()
    has_arabizi_numbers = _ARABIZI_DIGIT_RE.search(lowered) is not None
    has_arabizi_words = _ARABIZI_WORD_RE.search(lowered) is not None

    if has_arabizi_numbers and has_arabizi_words:
        return True

    # Share of alphabetic characters, measured against the full length
    # (whitespace and punctuation included).
    letter_count = sum(1 for ch in text if ch.isalpha())
    if has_arabizi_numbers and letter_count > len(text) * 0.6:
        return True
    if has_arabizi_words and letter_count > len(text) * 0.7:
        return True

    return False
|
|
|
def arabizi_to_arabic_client(arabizi_text):
    """Transliterate Arabizi to Arabic script through the helper Space.

    The call is best-effort: whenever the helper cannot provide a usable
    answer the original *arabizi_text* is returned unchanged so the chat can
    continue. That covers a missing client, an exception while loading or
    calling the service, a reply that is not a string, a reply starting with
    ``"Error:"``, and a reply that is empty or only whitespace.

    Args:
        arabizi_text: The user message in Arabizi.

    Returns:
        str: The stripped Arabic transliteration, or *arabizi_text* when no
        usable transliteration was obtained.
    """
    try:
        _, client = load_models()

        if client is None:
            print("❌ Transliteration client not available, using fallback")
            return arabizi_text

        result = client.predict(arabizi_text, api_name="/predict")
    except Exception as e:
        # Network / Space failures must not break the conversation.
        print(f"❌ Error calling transliteration service: {e}")
        return arabizi_text

    # Check the type explicitly instead of relying on an AttributeError from
    # ``.strip()`` being swallowed by a broad handler.
    if not isinstance(result, str):
        print(
            "❌ Error calling transliteration service: "
            f"unexpected result type {type(result).__name__}"
        )
        return arabizi_text

    if result.startswith("Error:"):
        print(f"❌ Transliteration service error: {result}")
        return arabizi_text

    # A blank reply would otherwise be forwarded to the model as an empty
    # prompt; fall back to the original text instead.
    cleaned = result.strip()
    return cleaned or arabizi_text
|
|
|
# Whole-word Darija -> Arabizi spellings; these take precedence over the
# per-character fallback below.
# NOTE(review): the previous literal listed 'هيا' twice ('hiya', then 'hya')
# and 'ماكاينش' twice ('makainch', then 'makaynsh'). Only the later value was
# ever used at runtime, so that value is kept here - confirm it is the
# intended spelling.
_ARABIC_WORD_MAP = {
    'أنا': 'ana', 'نتا': 'nta', 'نتي': 'nti', 'هوا': 'howa', 'هيا': 'hya',
    'حنا': 'hna', 'أحنا': 'ahna', 'نتوما': 'ntuma', 'هوما': 'huma',
    'شكون': 'shkoun', 'أشنو': 'achno', 'شنو': 'chno', 'واش': 'wach',
    'كيفاش': 'kifash', 'كيف': 'kif', 'فين': 'feen', 'منين': 'mnin',
    'إمتا': 'imta', 'متا': 'meta', 'علاش': '3lach', 'أش': 'ach',
    'بغيت': 'bghit', 'بغيتي': 'bghiti', 'بغا': 'bgha', 'بغينا': 'bghina',
    'كان': 'kan', 'كانا': 'kana', 'كانت': 'kanet', 'كانو': 'kanu',
    'قلت': 'galt', 'قلتي': 'galti', 'قال': 'gal', 'قالت': 'galet',
    'راح': 'rah', 'راها': 'raha', 'راهي': 'rahi', 'راهم': 'rahom',
    'غادي': 'ghadi', 'غاد': 'ghad', 'غا': 'gha',
    'هاد': 'had', 'هادا': 'hada', 'هادي': 'hadi', 'هادشي': 'hadchi',
    'داك': 'dak', 'ديك': 'dik', 'داكشي': 'dakchi',
    'بزاف': 'bzzaf', 'شوياة': 'chwiya', 'كولشي': 'kolchi',
    'ماشي': 'machi', 'مابغيتش': 'mabghitsh', 'ماكاينش': 'makaynsh',
    'دابا': 'daba', 'توا': 'tawa', 'غدا': 'ghda',
    'ماما': 'mama', 'بابا': 'baba', 'خويا': 'khoya', 'ختي': 'khti',
    'سلام': 'salam', 'يالاه': 'yallah',
    'المغرب': 'lmaghrib', 'مغرب': 'maghrib',
    'طاجين': 'tajine', 'أتاي': 'atay', 'خوبز': 'khobz',
    'كاين': 'kayn', 'شي': 'chi',
    'زوين': 'zwin', 'زوينا': 'zwina', 'مزيان': 'mzyan', 'مزيانا': 'mzyana',
    'كاينين': 'kaynin', 'مطعم': 'ma63am', 'مطاعم': 'ma6a3im',
    'مشهور': 'mashhur', 'مشهورين': 'mashhurin', 'وسط': 'wost',
    'المدينة': 'lmdina', 'مدينة': 'mdina', 'إيطالي': 'italiy',
    'ياباني': 'yabani', 'مغربي': 'maghribi', 'فرنسي': 'fransi',
    'أمريكي': 'amriki', 'صيني': 'sini', 'هندي': 'hindi',
    'لحم': 'la7m', 'دجاج': 'djaj', 'حوت': '7ut', 'خضرة': 'khodra',
    'فواكه': 'fawakeh', 'جبن': 'jben', 'زبدة': 'zebda', 'حليب': '7lib',
    'قهوة': 'qahwa', 'شاي': 'atay', 'ماء': 'ma', 'عصير': '3asir',
    'خبز': 'khobz', 'رز': 'roz', 'مكرونة': 'makarona', 'بطاطا': 'batata',
    'طماطم': 'toma6im', 'بصل': 'basal', 'ثوم': 'tum', 'فلفل': 'felfel',
    'ملح': 'mel7', 'سكر': 'sokkar', 'زيت': 'zit', 'خل': 'khall',
}

# Per-character fallback for everything the word map does not cover.
# Hamza-carrying letters and combining marks are written as escapes so each
# key is unambiguously a single code point.
_ARABIC_CHAR_TABLE = str.maketrans({
    'ا': 'a', 'ب': 'b', 'ت': 't', 'ث': 'th', 'ج': 'j', 'ح': '7',
    'خ': 'kh', 'د': 'd', 'ذ': 'dh', 'ر': 'r', 'ز': 'z', 'س': 's',
    'ش': 'sh', 'ص': 's', 'ض': 'd', 'ط': '6', 'ظ': 'z', 'ع': '3',
    'غ': 'gh', 'ف': 'f', 'ق': '9', 'ك': 'k', 'ل': 'l', 'م': 'm',
    'ن': 'n', 'ه': 'h', 'و': 'w', 'ي': 'y',
    '\u0621': '2',   # hamza
    '\u0622': 'aa',  # alef with madda
    '\u0623': 'a',   # alef with hamza above
    '\u0625': 'i',   # alef with hamza below
    '\u0629': 'a',   # teh marbuta
    '\u0649': 'a',   # alef maksura
    '\u061F': '?',   # Arabic question mark
    '\u060C': ',',   # Arabic comma
    '\u061B': ';',   # Arabic semicolon
    '\u064E': 'a',   # fatha
    '\u064F': 'o',   # damma
    '\u0650': 'i',   # kasra
    '\u064B': 'an',  # fathatan
    '\u064C': 'on',  # dammatan
    '\u064D': 'in',  # kasratan
    # Previously unmapped, which left Arabic code points in the output:
    '\u0624': '2',   # waw with hamza
    '\u0626': '2',   # yeh with hamza
    '\u06AF': 'g',   # gaf
    '\u06A4': 'v',   # veh
    '\u067E': 'p',   # peh
    '\u0651': '',    # shadda (gemination mark) - dropped
    '\u0652': '',    # sukun (no-vowel mark) - dropped
    '\u0640': '',    # tatweel (typographic elongation) - dropped
})

_WORD_TOKEN_RE = re.compile(r'\w+')


def _arabizi_for_token(match):
    """Return the Arabizi spelling for a whole-word hit, else the token itself."""
    token = match.group()
    return _ARABIC_WORD_MAP.get(token, token)


def arabic_to_arabizi(arabic_text):
    """Convert Arabic-script text to Arabizi using hard-coded mappings.

    The conversion runs in two passes:

    1. Every maximal run of word characters is looked up in the whole-word
       table. Every key consists solely of word characters, so this is
       equivalent to replacing each key between ``\\b`` boundaries.
    2. Remaining Arabic letters, punctuation and diacritics are mapped
       character by character. Non-Arabic text passes through untouched.

    Args:
        arabic_text: Text to convert; falsy values are returned as-is.

    Returns:
        str: The converted text with surrounding whitespace stripped.
    """
    if not arabic_text:
        return arabic_text

    with_known_words = _WORD_TOKEN_RE.sub(_arabizi_for_token, arabic_text)
    return with_known_words.translate(_ARABIC_CHAR_TABLE).strip()
|
|
|
def chat_with_atlas(message, history):
    """Answer *message* with Atlas-Chat, mirroring Arabizi when it is used.

    A blank message gets a fixed bilingual greeting. Otherwise the message is
    classified with ``detect_arabizi``; Arabizi input is transliterated to
    Arabic through the helper Space before generation, and the model's reply
    is converted back to Arabizi afterwards. Any other input is passed to the
    model as-is and the reply is returned unchanged. Progress is printed to
    stdout as a debug log.

    Args:
        message: The latest user message.
        history: Earlier turns supplied by the chat UI; not used here, each
            request is sent to the model as a single user turn.

    Returns:
        str: The reply, or a user-facing error message when anything in the
        pipeline raises.
    """
    if not message.strip():
        return "ahlan wa sahlan! kifash n9der n3awnek? / مرحبا! كيفاش نقدر نعاونك؟"

    rule = "=" * 50

    def clipped(text):
        # Log at most the first 100 characters and flag the truncation.
        tail = '...' if len(text) > 100 else ''
        return f"{text[:100]}{tail}"

    try:
        generator, _ = load_models()
        came_in_as_arabizi = detect_arabizi(message)

        print("\n" + rule)
        print("🔍 ATLAS-CHAT DEBUG LOG")
        print(rule)
        print(f"📥 INPUT: '{message}'")
        print(f"🔍 ARABIZI: {came_in_as_arabizi}")

        # The model is prompted in Arabic script, so Arabizi goes through the
        # helper Space first.
        prompt = message
        if came_in_as_arabizi:
            print("🔄 Converting Arabizi→Arabic via Helper Space...")
            prompt = arabizi_to_arabic_client(message)
            print(f"✅ ARABIC: '{prompt}'")
        else:
            print("➡️ No conversion needed")

        print("🤖 Sending to Atlas-Chat...")

        conversation = [{"role": "user", "content": prompt}]
        generated = generator(
            conversation,
            max_new_tokens=256,
            temperature=0.1,
            do_sample=True,
            pad_token_id=generator.tokenizer.eos_token_id,
        )

        # The last entry of the returned chat transcript is the model's turn.
        last_turn = generated[0]["generated_text"][-1]
        reply = last_turn["content"].strip()
        print(f"✅ RESPONSE: '{clipped(reply)}'")

        if not came_in_as_arabizi:
            print(rule + "\n")
            return reply

        print("🔄 Converting Arabic→Arabizi...")
        arabizi_reply = arabic_to_arabizi(reply)
        print(f"✅ FINAL: '{clipped(arabizi_reply)}'")
        print(rule + "\n")
        return arabizi_reply

    except Exception as e:
        reason = str(e)
        print(f"\n❌ ERROR: {reason}")
        print(rule + "\n")

        # Report the failure in the register the user wrote in.
        if detect_arabizi(message):
            return f"sorry, kan chi mochkil: {reason}. 3awd jar'b!"
        return f"عذراً، واجهت خطأ: {reason}. جرب مرة أخرى! / Sorry, error occurred: {reason}. Try again!"
|
|
|
|
|
# Markdown rendered under the chat title.
_DESCRIPTION = """
**مرحبا بك في أطلس شات!** Welcome to Atlas-Chat! 🇲🇦

**🚀 Powered by Hugging Face Inference API:**
- **Arabic Script (العربية)** → Direct conversation
- **Arabizi (3arabi bi 7oruf latin)** → API conversion → Arabizi response
- **English** → Direct conversation

**⚡ Features:**
- Professional AI Arabizi conversion via API
- No local model conflicts
- Fast and reliable responses
- Comprehensive language detection

**جرب هذه الأسئلة / Try these questions:**
"""

# Clickable sample prompts covering Arabic script, Arabizi and English.
_EXAMPLE_PROMPTS = [
    "شكون لي صنعك؟",
    "shkoun li sna3ek?",
    "اشنو هو الطاجين؟",
    "achno howa tajine?",
    "شنو كيتسمى المنتخب المغربي؟",
    "chno kaytsma lmontakhab lmaghribi?",
    "What is Morocco famous for?",
    "كيفاش نقدر نتعلم الدارجة؟",
    "kifash n9der nt3elem darija?",
    "wach kayn atay f lmaghrib?",
    "3lach lmaghrib zwien bzzaf?",
    "kifash nsali tajine?",
    "chno homa l2aklat lmaghribiya?",
    "kayn chi restaurants zwinin f casa?",
    "mr7ba! kif dayr?",
]

# Chat UI wired to chat_with_atlas; example outputs are not pre-computed.
demo = gr.ChatInterface(
    fn=chat_with_atlas,
    title="🏔️ Atlas-Chat: AI-Powered Moroccan Arabic Assistant",
    description=_DESCRIPTION,
    examples=_EXAMPLE_PROMPTS,
    cache_examples=False,
)


if __name__ == "__main__":
    demo.launch()