|
import gradio as gr |
|
import torch |
|
from transformers import pipeline |
|
import re |
|
|
|
|
|
# Process-wide cache for the text-generation pipeline; filled on first use by
# load_model() so importing this module does not load any weights.
pipe = None


def load_model():
    """Return the shared Atlas-Chat pipeline, creating it on the first call.

    The pipeline is stored in the module-level ``pipe`` variable, so later
    calls return the same object without loading the model again.

    Returns:
        The ``transformers`` text-generation pipeline for
        ``MBZUAI-Paris/Atlas-Chat-2B``.
    """
    global pipe

    # Fast path: the model has already been loaded.
    if pipe is not None:
        return pipe

    print("🏔️ Loading Atlas-Chat-2B model...")
    # Use the GPU when one is visible to torch, otherwise fall back to CPU.
    target_device = "cuda" if torch.cuda.is_available() else "cpu"
    pipe = pipeline(
        "text-generation",
        model="MBZUAI-Paris/Atlas-Chat-2B",
        model_kwargs={"torch_dtype": torch.bfloat16},
        device=target_device,
    )
    print("✅ Model loaded successfully!")
    return pipe
|
|
|
# Any character from the Arabic Unicode blocks (base, supplement, extended-A,
# presentation forms A and B).
_ARABIC_SCRIPT_RE = re.compile(
    r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]'
)

# Runs of ASCII letters/digits in lower-cased text; everything else separates
# tokens.
_ARABIZI_TOKEN_RE = re.compile(r'[a-z0-9]+')

# Digits that stand in for Arabic letters in Arabizi spelling.
_ARABIZI_DIGITS = frozenset('2379')

# Common Darija words in Latin transliteration, matched as whole tokens.
_ARABIZI_WORDS = frozenset({
    'wach', 'wash', 'ach', 'achno', 'chno', 'shno',
    'kif', 'kifash', 'ki', 'kayf',
    'feen', 'fin', 'fen',
    'imta', 'meta', 'waqt',
    '3la', '3ala', 'ala',
    'hna', '7na', 'ahna',
    'nta', 'nti', 'ntuma',
    'howa', 'hiya', 'huma',
    'ma3', 'maa3', 'maak', 'maaki',
    'had', 'hadchi', 'hada', 'hadi',
    'bghit', 'bghiti', 'bgha',
    'galt', 'galti', 'gal',
    'rah', 'raha', 'rahi',
    'kan', 'kanu', 'kana',
    'ghadi', 'ghad', 'gha',
    'daba', 'dak', 'dakchi',
    'bzf', 'bzzaf', 'bezzaf',
    'chway', 'chwiya', 'shwiya',
    'khoya', 'khuya', 'akhi',
    'khti', 'khtiya', 'ukhti',
    'allah', 'llah', 'rabi',
    'inchallah',
    'hamdulillah', 'alhamdulillah',
    'salam',
    'baraka', 'barakallahu',
    'yallah', 'yalla', 'hya',
})

# Multi-word expressions, matched against the space-joined token sequence.
_ARABIZI_PHRASES = ('insha allah', 'salamu aleikum')


def detect_arabizi(text):
    """Heuristically decide whether *text* is Darija written in Arabizi.

    Arabizi is Arabic written with Latin letters, using digits such as
    ``3``, ``7`` and ``9`` for sounds that have no Latin letter.

    Two kinds of evidence are collected:

    * distinct Arabizi digits that occur *inside a word* (``sna3ek``,
      ``n9der``); free-standing numbers such as ``3`` or ``2024`` are ignored
      so that ordinary English with numbers is not misclassified;
    * known Darija words and phrases, matched as whole tokens so that English
      words which merely contain a pattern ("teach", "find", "arabic") do
      not count.

    The text is classified as Arabizi when there are at least two hits of
    either kind, or a single hit in text that is predominantly ASCII letters.
    This remains a heuristic: a few list entries are also English words
    (e.g. "had"), and prefixed Darija forms that are not in the list are not
    recognised.

    Args:
        text: The user input; may be ``None`` or empty.

    Returns:
        ``True`` if the text looks like Arabizi, ``False`` otherwise
        (including empty/very short input and any input that contains
        Arabic-script characters).
    """
    if not text or len(text.strip()) < 2:
        return False

    # Anything containing Arabic script is handled as Arabic, not Arabizi.
    if _ARABIC_SCRIPT_RE.search(text):
        return False

    lowered = text.lower()
    tokens = _ARABIZI_TOKEN_RE.findall(lowered)

    # Distinct Arabizi digits that appear in a token which also has letters.
    embedded_digits = set()
    for token in tokens:
        if not token.isdigit():
            embedded_digits.update(_ARABIZI_DIGITS.intersection(token))
    number_count = len(embedded_digits)

    # Distinct known words, plus known phrases on word boundaries.
    word_count = len(_ARABIZI_WORDS.intersection(tokens))
    padded = f" {' '.join(tokens)} "
    word_count += sum(
        1 for phrase in _ARABIZI_PHRASES if f" {phrase} " in padded
    )

    # Share of ASCII letters among all non-whitespace characters.
    compact = "".join(lowered.split())
    total_chars = len(compact)
    latin_letters = sum(1 for ch in compact if ch.isascii() and ch.isalpha())

    # Strong evidence: several independent indicators.
    if number_count >= 2 or word_count >= 2:
        return True

    # Weaker evidence: one indicator in mostly-Latin text.
    if number_count >= 1 and latin_letters > total_chars * 0.7:
        return True
    return word_count >= 1 and latin_letters > total_chars * 0.8
|
|
|
def determine_response_language(user_input):
    """Pick the language/script the reply should be written in.

    The Arabizi check runs first; otherwise the presence of any Arabic-script
    character selects Arabic, and everything else is treated as English.

    Args:
        user_input: The raw message typed by the user.

    Returns:
        One of ``'arabizi'``, ``'arabic'`` or ``'english'``.
    """
    if detect_arabizi(user_input):
        return 'arabizi'

    # Look for at least one character from the Arabic Unicode blocks.
    arabic_chars = r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]'
    has_arabic_script = re.search(arabic_chars, user_input) is not None
    return 'arabic' if has_arabic_script else 'english'
|
|
|
def create_system_prompt(response_lang):
    """Build the system prompt matching the requested reply language.

    Args:
        response_lang: ``'arabizi'`` or ``'arabic'``; any other value selects
            the English prompt.

    Returns:
        The instruction text to give the model.
    """
    # Arabizi gets the most detailed prompt: it spells out the transliteration
    # rules and forbids Arabic script in the reply.
    if response_lang == 'arabizi':
        arabizi_prompt = """You are Atlas-Chat, an AI assistant specialized in Moroccan Arabic (Darija).

CRITICAL INSTRUCTION: The user has written in Arabizi (Latin script), so you MUST respond ONLY in Arabizi using Latin letters and numbers.

ARABIZI RULES YOU MUST FOLLOW:
- Use ONLY Latin letters (a-z) and numbers for Arabic sounds
- Use these number substitutions: 3=ع, 7=ح, 9=ق, 2=ء, 5=خ, 6=ط, 8=غ
- Write naturally in Moroccan Darija but with Latin script
- Examples: "ana" (أنا), "hna" (حنا), "3la" (على), "7na" (حنا), "wach" (واش)
- Do NOT use any Arabic script characters
- Do NOT switch to English unless the user specifically asks for translation

Respond naturally in Arabizi about Moroccan culture, language, and general topics."""
        return arabizi_prompt

    if response_lang == 'arabic':
        arabic_prompt = """You are Atlas-Chat, an AI assistant specialized in Moroccan Arabic (Darija). Respond in Arabic script (Darija) as this is what the user is using. Be helpful and culturally aware about Morocco and its traditions."""
        return arabic_prompt

    # Default: every other value is answered in English.
    english_prompt = """You are Atlas-Chat, an AI assistant specialized in Moroccan Arabic (Darija) but also fluent in English. The user has written in English, so respond in English while being knowledgeable about Moroccan culture and language."""
    return english_prompt
|
|
|
def chat_with_atlas(message, history):
    """Generate a reply from Atlas-Chat in the language the user wrote in.

    The message is classified as Arabizi, Arabic or English, a matching
    system prompt is built, and the model is asked for a single reply.

    The instructions are always folded into the user turn rather than sent
    as a separate ``"system"`` message: Gemma-style chat templates reject the
    system role, which would make every non-Arabizi request fail.

    Args:
        message: The text the user just sent; ``None``/blank input yields a
            greeting that asks for a message.
        history: Previous turns supplied by the chat UI. Not used; each
            message is answered independently.

    Returns:
        The reply text, or a bilingual error message if generation raised.
    """
    if not message or not message.strip():
        return "مرحبا! أهلا وسهلا. Please enter a message! / Ahlan wa sahlan!"

    try:
        model = load_model()
        response_lang = determine_response_language(message)
        system_prompt = create_system_prompt(response_lang)

        if response_lang == 'arabizi':
            # Repeat the key constraint after the message to reinforce it.
            user_turn = (
                f"System: {system_prompt}\n\n"
                f"User message (in Arabizi): {message}\n\n"
                "Remember: Respond ONLY in Arabizi (Latin letters + numbers). "
                "Do not use Arabic script."
            )
        else:
            user_turn = f"System: {system_prompt}\n\nUser message: {message}"

        messages = [{"role": "user", "content": user_turn}]

        outputs = model(
            messages,
            max_new_tokens=256,
            temperature=0.1,
            do_sample=True,
            pad_token_id=model.tokenizer.eos_token_id,
        )

        # The pipeline returns the whole conversation; the last entry is the
        # newly generated assistant turn.
        response = outputs[0]["generated_text"][-1]["content"].strip()

        if response_lang == 'arabizi':
            # If the model slipped into Arabic script anyway, replace the
            # reply with a canned Arabizi answer.
            arabic_pattern = r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]'
            if re.search(arabic_pattern, response):
                response = "ana Atlas-Chat, kay3jebni n7der m3ak! chno bghiti t3ref 3la lmaghrib? (I'm Atlas-Chat, I'd love to chat with you! What do you want to know about Morocco?)"

        return response

    except Exception as e:
        # UI boundary: report the failure in the chat instead of crashing.
        return f"عذراً، واجهت خطأ: {str(e)}. جرب مرة أخرى! / Sorry, error occurred. Try again!"
|
|
|
|
|
# Markdown shown under the title of the chat page.
_DESCRIPTION = """
**مرحبا بك في أطلس شات!** Welcome to Atlas-Chat! 🇲🇦

I'm an AI assistant specialized in **Moroccan Arabic (Darija)** with smart language detection:

- **Arabic Script (العربية)** → I respond in Arabic
- **Arabizi (3arabi bi 7oruf latin)** → I respond in Arabizi
- **English** → I respond in English

**جرب هذه الأسئلة / Try these questions:**
"""

# Clickable sample prompts covering Arabic script, Arabizi and English.
_EXAMPLE_PROMPTS = [
    "شكون لي صنعك؟",
    "shkoun li sna3ek?",
    "اشنو هو الطاجين؟",
    "achno howa tajine?",
    "شنو كيتسمى المنتخب المغربي؟",
    "chno kaytsma lmontakhab lmaghribi?",
    "What is Morocco famous for?",
    "Tell me about Casablanca",
    "كيفاش نقدر نتعلم الدارجة؟",
    "kifash n9der nt3elem darija?",
]

# Chat UI wired to chat_with_atlas; example outputs are not pre-computed
# (cache_examples=False).
demo = gr.ChatInterface(
    fn=chat_with_atlas,
    title="🏔️ Atlas-Chat: Moroccan Arabic AI Assistant",
    description=_DESCRIPTION,
    examples=_EXAMPLE_PROMPTS,
    cache_examples=False,
)


# Start the web server only when the file is run as a script.
if __name__ == "__main__":
    demo.launch()