File size: 8,938 Bytes
4f4e064
f536a8a
 
b97cadb
4f4e064
f536a8a
 
4f4e064
f536a8a
 
 
 
 
 
 
 
 
 
 
 
 
4f4e064
b97cadb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f536a8a
b97cadb
f536a8a
b97cadb
f536a8a
 
 
 
 
b97cadb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f536a8a
 
 
 
 
 
 
 
 
 
 
 
b97cadb
 
 
 
 
 
 
 
 
f536a8a
 
 
b97cadb
4f4e064
f536a8a
4f4e064
f536a8a
 
 
 
 
b97cadb
 
 
 
 
f536a8a
 
 
 
 
b97cadb
 
 
f536a8a
b97cadb
f536a8a
 
b97cadb
 
4f4e064
0615894
4f4e064
 
f536a8a
4f4e064
f536a8a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
import gradio as gr
import torch
from transformers import pipeline
import re

# Module-level cache: holds the pipeline once it has been built.
pipe = None


def load_model():
    """Return the shared Atlas-Chat text-generation pipeline.

    The pipeline is built on the first call and stored in the module-level
    ``pipe``; every later call returns that same object without reloading.
    """
    global pipe
    if pipe is not None:
        return pipe

    print("🏔️ Loading Atlas-Chat-2B model...")
    # Use the GPU when torch reports one, otherwise run on the CPU.
    target_device = "cuda" if torch.cuda.is_available() else "cpu"
    pipe = pipeline(
        "text-generation",
        model="MBZUAI-Paris/Atlas-Chat-2B",
        model_kwargs={"torch_dtype": torch.bfloat16},
        device=target_device,
    )
    print("✅ Model loaded successfully!")
    return pipe

def detect_arabizi(text):
    """
    Detect if input text is written in Arabizi (Latin script with numbers).

    The check is a heuristic built from three signals:

    * digits 2/3/7/9 used as letters, i.e. written directly next to a Latin
      letter inside a word ("sna3ek", "n9der"). Standalone numbers such as
      "1979" or "23" and English ordinals ("3rd", "7th") are not counted.
    * known Darija words/phrases, matched as whole words so that short
      entries like "ki" or "had" do not fire inside unrelated words.
    * the share of ASCII Latin letters among the non-whitespace characters.

    Args:
        text: The user input to classify. ``None`` and empty strings are
            accepted.

    Returns:
        True if the text looks like Arabizi, False otherwise. Text shorter
        than two characters (after stripping) or containing any Arabic-script
        character is never Arabizi.
    """
    if not text or len(text.strip()) < 2:
        return False

    # Check for Arabic script - if present, it's NOT Arabizi
    arabic_pattern = r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]'
    if re.search(arabic_pattern, text):
        return False

    lowered = text.lower()

    # Arabizi word/phrase indicators
    arabizi_patterns = [
        'wach', 'wash', 'ach', 'achno', 'chno', 'shno',  # What
        'kif', 'kifash', 'ki', 'kayf',  # How
        'feen', 'fin', 'fen',  # Where
        'imta', 'meta', 'waqt',  # When
        '3la', '3ala', 'ala',  # On/about
        'hna', '7na', 'ahna',  # We/us
        'nta', 'nti', 'ntuma',  # You
        'howa', 'hiya', 'huma',  # He/she/they
        'ma3', 'maa3', 'maak', 'maaki',  # With
        'had', 'hadchi', 'hada', 'hadi',  # This
        'bghit', 'bghiti', 'bgha',  # Want
        'galt', 'galti', 'gal',  # Said
        'rah', 'raha', 'rahi',  # Going
        'kan', 'kanu', 'kana',  # Was/were
        'ghadi', 'ghad', 'gha',  # Will/going to
        'daba', 'dak', 'dakchi',  # Now/that
        'bzf', 'bzzaf', 'bezzaf',  # A lot
        'chway', 'chwiya', 'shwiya',  # A little
        'khoya', 'khuya', 'akhi',  # Brother
        'khti', 'khtiya', 'ukhti',  # Sister
        'allah', 'llah', 'rabi',  # God
        'inchallah', 'insha allah',  # God willing
        'hamdulillah', 'alhamdulillah',  # Praise God
        'salam', 'salamu aleikum',  # Peace
        'baraka', 'barakallahu',  # Blessing
        'yallah', 'yalla', 'hya'  # Come on/let's go
    ]
    single_word_patterns = {p for p in arabizi_patterns if ' ' not in p}
    phrase_patterns = [p for p in arabizi_patterns if ' ' in p]

    # Split into alphanumeric words so patterns are compared against whole
    # words instead of arbitrary substrings of the space-stripped text.
    tokens = re.findall(r'[a-z0-9]+', lowered)
    token_set = set(tokens)
    # Padded with spaces so multi-word phrases only match on word boundaries.
    padded_words = " " + " ".join(tokens) + " "

    # Count distinct Arabizi words/phrases present in the text.
    arabizi_word_count = len(token_set & single_word_patterns)
    arabizi_word_count += sum(
        1 for phrase in phrase_patterns if f" {phrase} " in padded_words
    )

    # Count distinct substitution digits that touch a Latin letter. A digit
    # surrounded only by other digits/punctuation is an ordinary number, and
    # the negative lookahead skips English ordinals such as "3rd" or "7th".
    embedded_digit_pattern = (
        r'(?<=[a-z])[2379]'
        r'|[2379](?=[a-z])(?!(?:nd|rd|th)\b)'
    )
    arabizi_number_count = len(set(re.findall(embedded_digit_pattern, lowered)))

    # Strong indicators: several substitutions or several Darija words.
    if arabizi_number_count >= 2 or arabizi_word_count >= 2:
        return True

    # Share of ASCII Latin letters among all non-whitespace characters.
    compact_text = "".join(lowered.split())
    total_chars = len(compact_text)
    latin_letters = sum(1 for c in compact_text if 'a' <= c <= 'z')

    # Medium indicators: a single hit only counts in mostly-Latin text.
    if arabizi_number_count >= 1 and latin_letters > total_chars * 0.7:
        return True

    if arabizi_word_count >= 1 and latin_letters > total_chars * 0.8:
        return True

    return False

def determine_response_language(user_input):
    """
    Pick the language/script the reply should be written in.

    Returns 'arabizi' when detect_arabizi() flags the input, 'arabic' when
    the input contains at least one Arabic-script character, and 'english'
    for everything else.
    """
    if detect_arabizi(user_input):
        return 'arabizi'

    contains_arabic = re.search(
        r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]',
        user_input,
    )
    # Latin-only text without Arabizi indicators falls through to English.
    return 'arabic' if contains_arabic else 'english'

def create_system_prompt(response_lang):
    """Return the system prompt matching the requested response language.

    'arabizi' yields the detailed Latin-script instructions, 'arabic' the
    Arabic-script prompt, and any other value the English prompt.
    """
    if response_lang == 'arabizi':
        # Assembled line by line; the trailing space on the first line is
        # part of the prompt text.
        arabizi_lines = (
            'You are Atlas-Chat, an AI assistant specialized in Moroccan Arabic (Darija). ',
            '',
            'CRITICAL INSTRUCTION: The user has written in Arabizi (Latin script), so you MUST respond ONLY in Arabizi using Latin letters and numbers.',
            '',
            'ARABIZI RULES YOU MUST FOLLOW:',
            '- Use ONLY Latin letters (a-z) and numbers for Arabic sounds',
            '- Use these number substitutions: 3=ع, 7=ح, 9=ق, 2=ء, 5=خ, 6=ط, 8=غ',
            '- Write naturally in Moroccan Darija but with Latin script',
            '- Examples: "ana" (أنا), "hna" (حنا), "3la" (على), "7na" (حنا), "wach" (واش)',
            '- Do NOT use any Arabic script characters',
            '- Do NOT switch to English unless the user specifically asks for translation',
            '',
            'Respond naturally in Arabizi about Moroccan culture, language, and general topics.',
        )
        return "\n".join(arabizi_lines)

    if response_lang == 'arabic':
        return (
            "You are Atlas-Chat, an AI assistant specialized in Moroccan Arabic (Darija). "
            "Respond in Arabic script (Darija) as this is what the user is using. "
            "Be helpful and culturally aware about Morocco and its traditions."
        )

    # Every other value is treated as English.
    return (
        "You are Atlas-Chat, an AI assistant specialized in Moroccan Arabic (Darija) "
        "but also fluent in English. "
        "The user has written in English, so respond in English "
        "while being knowledgeable about Moroccan culture and language."
    )

def _build_chat_messages(message, response_lang, system_prompt):
    """Pack the instructions and the user's text into a single user turn."""
    if response_lang == 'arabizi':
        # Extra emphasis for Arabizi responses
        content = f"""System: {system_prompt}

User message (in Arabizi): {message}

Remember: Respond ONLY in Arabizi (Latin letters + numbers). Do not use Arabic script."""
    else:
        content = f"System: {system_prompt}\n\nUser message: {message}"

    # Only a "user" role is sent. Folding the instructions into the user turn
    # works whether or not the model's chat template accepts a "system" role.
    # NOTE(review): Gemma-family chat templates are documented as rejecting a
    # system role - confirm against the Atlas-Chat tokenizer's template.
    return [{"role": "user", "content": content}]


def chat_with_atlas(message, history):
    """Generate a response from the Atlas-Chat model with language detection.

    Args:
        message: The user's latest message. ``None``, empty, or
            whitespace-only input returns a greeting without touching the
            model.
        history: Previous turns supplied by the chat UI. Accepted for
            signature compatibility; it is not used when building the prompt.

    Returns:
        The model's reply as a string. If the reply to an Arabizi message
        contains Arabic script, a fixed Arabizi fallback is returned instead.
        Any exception raised while loading the model or generating is turned
        into a bilingual error message rather than propagated.
    """
    if not message or not message.strip():
        return "مرحبا! أهلا وسهلا. Please enter a message! / Ahlan wa sahlan!"

    try:
        # Load model if not already loaded
        model = load_model()

        # Pick the reply language and the matching instructions
        response_lang = determine_response_language(message)
        system_prompt = create_system_prompt(response_lang)
        messages = _build_chat_messages(message, response_lang, system_prompt)

        # Generate response
        outputs = model(
            messages,
            max_new_tokens=256,
            temperature=0.1,
            do_sample=True,
            pad_token_id=model.tokenizer.eos_token_id
        )

        # The pipeline returns the whole conversation; the last entry is the
        # newly generated assistant turn.
        response = outputs[0]["generated_text"][-1]["content"].strip()

        # Post-process for Arabizi if needed
        if response_lang == 'arabizi':
            # If Arabic script leaked through, swap in a fallback Arabizi reply
            arabic_pattern = r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]'
            if re.search(arabic_pattern, response):
                response = "ana Atlas-Chat, kay3jebni n7der m3ak! chno bghiti t3ref 3la lmaghrib? (I'm Atlas-Chat, I'd love to chat with you! What do you want to know about Morocco?)"

        return response

    except Exception as e:
        # Top-level UI boundary: report the failure in the chat instead of
        # crashing the handler.
        return f"عذراً، واجهت خطأ: {str(e)}. جرب مرة أخرى! / Sorry, error occurred. Try again!"

# Markdown shown under the title. Built line by line so the significant
# whitespace (indentation, trailing double-space line break) stays explicit.
_APP_DESCRIPTION = "\n".join([
    "",
    "    **مرحبا بك في أطلس شات!** Welcome to Atlas-Chat! 🇲🇦",
    "    ",
    "    I'm an AI assistant specialized in **Moroccan Arabic (Darija)** with smart language detection:",
    "    ",
    "    - **Arabic Script (العربية)** → I respond in Arabic",
    "    - **Arabizi (3arabi bi 7oruf latin)** → I respond in Arabizi  ",
    "    - **English** → I respond in English",
    "    ",
    "    **جرب هذه الأسئلة / Try these questions:**",
    "    ",
])

# Clickable sample prompts covering Arabic script, Arabizi and English.
_EXAMPLE_PROMPTS = [
    "شكون لي صنعك؟",
    "shkoun li sna3ek?",
    "اشنو هو الطاجين؟",
    "achno howa tajine?",
    "شنو كيتسمى المنتخب المغربي؟",
    "chno kaytsma lmontakhab lmaghribi?",
    "What is Morocco famous for?",
    "Tell me about Casablanca",
    "كيفاش نقدر نتعلم الدارجة؟",
    "kifash n9der nt3elem darija?",
]

# Chat UI wired to the language-aware response function.
demo = gr.ChatInterface(
    fn=chat_with_atlas,
    title="🏔️ Atlas-Chat: Moroccan Arabic AI Assistant",
    description=_APP_DESCRIPTION,
    examples=_EXAMPLE_PROMPTS,
    cache_examples=False,
)

# Start the web server only when executed as a script.
if __name__ == "__main__":
    demo.launch()