# NOTE: file-viewer residue, not part of the program — file size: 8,938 bytes; per-line commit hashes seen: 4f4e064, f536a8a, b97cadb, 0615894; line-number gutter: 1-232.
import gradio as gr
import torch
from transformers import pipeline
import re
# Module-wide cache for the text-generation pipeline; populated on first use.
pipe = None


def load_model():
    """Return the shared Atlas-Chat pipeline, building it on the first call.

    The pipeline is stored in the module-level ``pipe`` variable, so later
    calls return the cached object without loading the model again.
    """
    global pipe

    # Fast path: the pipeline already exists.
    if pipe is not None:
        return pipe

    print("🏔️ Loading Atlas-Chat-2B model...")
    target_device = "cuda" if torch.cuda.is_available() else "cpu"
    pipe = pipeline(
        "text-generation",
        model="MBZUAI-Paris/Atlas-Chat-2B",
        model_kwargs={"torch_dtype": torch.bfloat16},
        device=target_device,
    )
    print("✅ Model loaded successfully!")
    return pipe
# Any character from the Arabic Unicode blocks (base, supplement, extended-A,
# presentation forms A and B).
_ARABIC_SCRIPT_RE = re.compile(
    r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]'
)

# A token is a maximal run of ASCII letters/digits in the lower-cased text.
_TOKEN_RE = re.compile(r'[a-z0-9]+')

# Digits commonly used in Arabizi as stand-ins for Arabic letters.
_ARABIZI_DIGITS = frozenset('2379')

# Single-word Darija markers, matched against whole tokens only.
_ARABIZI_WORDS = frozenset({
    'wach', 'wash', 'ach', 'achno', 'chno', 'shno',   # What
    'kif', 'kifash', 'ki', 'kayf',                    # How
    'feen', 'fin', 'fen',                             # Where
    'imta', 'meta', 'waqt',                           # When
    '3la', '3ala', 'ala',                             # On/about
    'hna', '7na', 'ahna',                             # We/us
    'nta', 'nti', 'ntuma',                            # You
    'howa', 'hiya', 'huma',                           # He/she/they
    'ma3', 'maa3', 'maak', 'maaki',                   # With
    'had', 'hadchi', 'hada', 'hadi',                  # This
    'bghit', 'bghiti', 'bgha',                        # Want
    'galt', 'galti', 'gal',                           # Said
    'rah', 'raha', 'rahi',                            # Going
    'kan', 'kanu', 'kana',                            # Was/were
    'ghadi', 'ghad', 'gha',                           # Will/going to
    'daba', 'dak', 'dakchi',                          # Now/that
    'bzf', 'bzzaf', 'bezzaf',                         # A lot
    'chway', 'chwiya', 'shwiya',                      # A little
    'khoya', 'khuya', 'akhi',                         # Brother
    'khti', 'khtiya', 'ukhti',                        # Sister
    'allah', 'llah', 'rabi',                          # God
    'inchallah',                                      # God willing
    'hamdulillah', 'alhamdulillah',                   # Praise God
    'salam',                                          # Peace
    'baraka', 'barakallahu',                          # Blessing
    'yallah', 'yalla', 'hya',                         # Come on/let's go
})

# Multi-word markers, matched against the space-joined token sequence.
_ARABIZI_PHRASES = ('insha allah', 'salamu aleikum')


def detect_arabizi(text):
    """Return True if *text* looks like Arabizi (Darija in Latin script).

    The check is a heuristic built from three signals:

    * the number of distinct Arabizi digits (2, 3, 7, 9) that occur inside a
      token that also contains letters (e.g. ``sna3ek``); free-standing
      numbers such as ``23`` are ignored;
    * the number of distinct lexicon entries that occur as whole words (or,
      for multi-word entries, as whole-word phrases);
    * the share of ASCII letters among all non-whitespace characters.

    Text that is empty, shorter than two characters after stripping, or that
    contains any Arabic-script character is never reported as Arabizi.

    Words are matched as whole tokens, so lexicon entries embedded in longer
    words ("teaching" contains "ach") do not count. Some entries are also
    ordinary English words (e.g. "had"), so short English sentences can still
    be misclassified.
    """
    if not text or len(text.strip()) < 2:
        return False

    # Arabic script present: by definition this is not Arabizi.
    if _ARABIC_SCRIPT_RE.search(text):
        return False

    lowered = text.lower()
    tokens = _TOKEN_RE.findall(lowered)

    # Distinct substitution digits, counted only in tokens that mix digits
    # with letters.
    digits_seen = set()
    for token in tokens:
        if not token.isdigit():
            digits_seen.update(_ARABIZI_DIGITS.intersection(token))
    number_count = len(digits_seen)

    # Distinct lexicon hits: whole-token words plus whole-word phrases.
    word_count = len(_ARABIZI_WORDS.intersection(tokens))
    padded = f" {' '.join(tokens)} "
    word_count += sum(1 for phrase in _ARABIZI_PHRASES if f' {phrase} ' in padded)

    # Ratio inputs are measured on the text with all whitespace removed.
    compact = ''.join(lowered.split())
    total_chars = len(compact)
    latin_letters = sum(1 for c in compact if c.isalpha() and ord(c) < 128)

    # Strong indicators: several substitutions or several Darija words.
    if number_count >= 2 or word_count >= 2:
        return True

    # Medium indicators: one signal plus a predominantly Latin-letter text.
    if number_count >= 1 and latin_letters > total_chars * 0.7:
        return True
    if word_count >= 1 and latin_letters > total_chars * 0.8:
        return True

    return False
def determine_response_language(user_input):
    """Pick the language/script the reply should use.

    Returns one of ``'arabizi'``, ``'arabic'`` or ``'english'``. Arabizi
    detection is tried first; otherwise the presence of any Arabic-script
    character selects ``'arabic'``, and everything else is ``'english'``.
    """
    if detect_arabizi(user_input):
        return 'arabizi'

    # Arabic Unicode blocks, including the presentation forms.
    arabic_script = r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]'
    contains_arabic = re.search(arabic_script, user_input) is not None

    # Latin-only text without Arabizi markers is treated as English.
    return 'arabic' if contains_arabic else 'english'
def create_system_prompt(response_lang):
    """Return the system prompt matching the desired response language.

    ``'arabizi'`` and ``'arabic'`` select their dedicated prompts; any other
    value yields the English prompt.
    """
    arabizi_prompt = """You are Atlas-Chat, an AI assistant specialized in Moroccan Arabic (Darija).
CRITICAL INSTRUCTION: The user has written in Arabizi (Latin script), so you MUST respond ONLY in Arabizi using Latin letters and numbers.
ARABIZI RULES YOU MUST FOLLOW:
- Use ONLY Latin letters (a-z) and numbers for Arabic sounds
- Use these number substitutions: 3=ع, 7=ح, 9=ق, 2=ء, 5=خ, 6=ط, 8=غ
- Write naturally in Moroccan Darija but with Latin script
- Examples: "ana" (أنا), "hna" (حنا), "3la" (على), "7na" (حنا), "wach" (واش)
- Do NOT use any Arabic script characters
- Do NOT switch to English unless the user specifically asks for translation
Respond naturally in Arabizi about Moroccan culture, language, and general topics."""
    arabic_prompt = """You are Atlas-Chat, an AI assistant specialized in Moroccan Arabic (Darija). Respond in Arabic script (Darija) as this is what the user is using. Be helpful and culturally aware about Morocco and its traditions."""
    english_prompt = """You are Atlas-Chat, an AI assistant specialized in Moroccan Arabic (Darija) but also fluent in English. The user has written in English, so respond in English while being knowledgeable about Moroccan culture and language."""

    if response_lang == 'arabizi':
        return arabizi_prompt
    if response_lang == 'arabic':
        return arabic_prompt
    # Everything else falls back to English.
    return english_prompt
def _build_chat_messages(system_prompt, message, response_lang):
    """Return the single-turn message list passed to the generation pipeline.

    The instructions are embedded in the user turn rather than sent as a
    separate ``system`` message: chat templates that do not define a system
    role (Gemma-family templates are one such case) raise on it, whereas a
    plain user turn is accepted by every template.
    """
    if response_lang == 'arabizi':
        # Extra emphasis for Arabizi responses.
        content = f"""System: {system_prompt}
User message (in Arabizi): {message}
Remember: Respond ONLY in Arabizi (Latin letters + numbers). Do not use Arabic script."""
    else:
        content = f"System: {system_prompt}\nUser message: {message}"
    return [{"role": "user", "content": content}]


def chat_with_atlas(message, history):
    """Generate a reply from the Atlas-Chat model in the user's language/script.

    Args:
        message: The user's latest message. ``None``, empty and
            whitespace-only values produce a fixed greeting without touching
            the model.
        history: Previous turns as supplied by the chat UI. It is accepted
            for interface compatibility but not used; every request is
            answered as a single-turn conversation.

    Returns:
        The model's reply as a string. If an Arabizi reply contains Arabic
        script, a canned Arabizi sentence is returned instead. Any exception
        raised while loading the model or generating is converted into a
        bilingual error message rather than propagated.
    """
    if not message or not message.strip():
        return "مرحبا! أهلا وسهلا. Please enter a message! / Ahlan wa sahlan!"

    try:
        # Load model if not already loaded.
        model = load_model()

        # Pick the reply language and the matching instructions.
        response_lang = determine_response_language(message)
        system_prompt = create_system_prompt(response_lang)
        messages = _build_chat_messages(system_prompt, message, response_lang)

        # Generate response.
        outputs = model(
            messages,
            max_new_tokens=256,
            temperature=0.1,
            do_sample=True,
            pad_token_id=model.tokenizer.eos_token_id,
        )

        # The pipeline returns the whole conversation; the last entry is the
        # newly generated assistant turn.
        response = outputs[0]["generated_text"][-1]["content"].strip()

        if response_lang == 'arabizi':
            # An Arabizi reply must not contain Arabic script; if any leaked
            # through, replace the reply with a fixed Arabizi fallback.
            arabic_pattern = r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]'
            if re.search(arabic_pattern, response):
                response = "ana Atlas-Chat, kay3jebni n7der m3ak! chno bghiti t3ref 3la lmaghrib? (I'm Atlas-Chat, I'd love to chat with you! What do you want to know about Morocco?)"

        return response
    except Exception as e:
        # Top-level UI boundary: report the failure to the user instead of
        # crashing the chat callback.
        return f"عذراً، واجهت خطأ: {str(e)}. جرب مرة أخرى! / Sorry, error occurred. Try again!"
# Build the Gradio chat UI around chat_with_atlas.

# Markdown shown under the title.
_APP_DESCRIPTION = """
**مرحبا بك في أطلس شات!** Welcome to Atlas-Chat! 🇲🇦
I'm an AI assistant specialized in **Moroccan Arabic (Darija)** with smart language detection:
- **Arabic Script (العربية)** → I respond in Arabic
- **Arabizi (3arabi bi 7oruf latin)** → I respond in Arabizi
- **English** → I respond in English
**جرب هذه الأسئلة / Try these questions:**
"""

# Sample prompts covering Arabic script, Arabizi and English.
_EXAMPLE_PROMPTS = [
    "شكون لي صنعك؟",
    "shkoun li sna3ek?",
    "اشنو هو الطاجين؟",
    "achno howa tajine?",
    "شنو كيتسمى المنتخب المغربي؟",
    "chno kaytsma lmontakhab lmaghribi?",
    "What is Morocco famous for?",
    "Tell me about Casablanca",
    "كيفاش نقدر نتعلم الدارجة؟",
    "kifash n9der nt3elem darija?",
]

demo = gr.ChatInterface(
    fn=chat_with_atlas,
    title="🏔️ Atlas-Chat: Moroccan Arabic AI Assistant",
    description=_APP_DESCRIPTION,
    examples=_EXAMPLE_PROMPTS,
    cache_examples=False,
)
# Launch the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()