import gradio as gr
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
import re


atlas_pipe = None
transliteration_tokenizer = None
transliteration_model = None


def load_models():
    """Load both Atlas-Chat and Transliteration models"""
    global atlas_pipe, transliteration_tokenizer, transliteration_model

    if atlas_pipe is None:
        print("Loading Atlas-Chat-2B model...")
        atlas_pipe = pipeline(
            "text-generation",
            model="MBZUAI-Paris/Atlas-Chat-2B",
            model_kwargs={"torch_dtype": torch.bfloat16},
            device="cuda" if torch.cuda.is_available() else "cpu"
        )
        print("Atlas-Chat model loaded!")

    if transliteration_tokenizer is None or transliteration_model is None:
        print("Loading Transliteration model...")
        transliteration_tokenizer = AutoTokenizer.from_pretrained("atlasia/Transliteration-Moroccan-Darija")
        transliteration_model = AutoModelForSeq2SeqLM.from_pretrained("atlasia/Transliteration-Moroccan-Darija")
        print("Transliteration model loaded!")

    return atlas_pipe, transliteration_tokenizer, transliteration_model
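
# Usage note for load_models() above (illustrative): the first call downloads and
# initializes both models; later calls return the cached globals immediately, e.g.
#   pipe, tok, model = load_models()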


def detect_arabizi(text):
    """
    Detect if input text is written in Arabizi (Latin script with numbers).
    Returns True if Arabizi is detected.
    """
    if not text or len(text.strip()) < 2:
        return False

    # Any Arabic-script character means the text is not Arabizi
    arabic_pattern = r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]'
    if re.search(arabic_pattern, text):
        return False

    # Digits commonly used in Arabizi in place of Arabic letters (e.g. 3 for ع, 7 for ح)
    arabizi_numbers = ['2', '3', '7', '9', '5', '6', '8']
    has_arabizi_numbers = any(num in text for num in arabizi_numbers)

    # Common Darija words and particles as written in Arabizi
    arabizi_patterns = [
        'wach', 'wash', 'ach', 'achno', 'chno', 'shno', 'shkoun', 'chkoun',
        'kif', 'kifash', 'ki', 'kayf', 'kien', 'kima',
        'feen', 'fin', 'fen', 'fain', 'mnin',
        'imta', 'meta', 'waqt', 'mata', 'emta',
        'hna', 'ahna', 'ana', 'nta', 'nti', 'ntuma', 'ntouma',
        'howa', 'hiya', 'huma', 'houma', 'hoa', 'hia',
        'had', 'hadchi', 'hada', 'hadi', 'hadou', 'hadouk',
        'bghit', 'bghiti', 'bgha', 'bghina', 'bghitiou',
        'galt', 'galti', 'gal', 'galet', 'galou',
        'rah', 'raha', 'rahi', 'rahom', 'rahin',
        'kan', 'kanu', 'kana', 'kanet', 'kano',
        'ghadi', 'ghad', 'gha', 'ghadia', 'ghadiyin',
        'daba', 'dak', 'dakchi', 'dik', 'dok',
        'bzf', 'bzzaf', 'bezzaf', 'bzaaaaf',
        'chway', 'chwiya', 'shwiya', 'chwia',
        'khoya', 'khuya', 'akhi', 'kho',
        'khti', 'khtiya', 'ukhti', 'kht',
        'mama', 'baba', 'lwaldin', 'lwalidin',
        'salam', 'salamu aleikum', 'slm',
        'yallah', 'yalla', 'hya', 'aji',
        'mabghitsh', 'mabghach', 'makansh', 'machi',
        'walakin', 'walaken', 'ama', 'mais',
        'kayn', 'makaynsh', 'chi', 'tayi'
    ]

    text_lower = text.lower()
    has_arabizi_words = any(pattern in text_lower for pattern in arabizi_patterns)

    # Decision: digits plus keywords is a strong signal; otherwise require the text
    # to be mostly alphabetic alongside at least one cue
    if has_arabizi_numbers and has_arabizi_words:
        return True
    if has_arabizi_numbers and len([c for c in text if c.isalpha()]) > len(text) * 0.6:
        return True
    if has_arabizi_words and len([c for c in text if c.isalpha()]) > len(text) * 0.7:
        return True

    return False
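
# Illustrative behavior of detect_arabizi() above (expected results given the
# heuristics, not guarantees):
#   detect_arabizi("wach kayn atay f lmaghrib?")   -> True   (Arabizi keywords, mostly letters)
#   detect_arabizi("3lach lmaghrib zwien bzzaf?")  -> True   (Arabizi digits + keywords)
#   detect_arabizi("شكون لي صنعك؟")                -> False  (Arabic script detected)
#   detect_arabizi("ok")                           -> False  (no Arabizi cues)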


def arabizi_to_arabic_ai(arabizi_text):
    """
    Convert Arabizi text to Arabic using the specialized AI model.
    """
    try:
        _, tokenizer, model = load_models()

        # Tokenize the Arabizi input for the seq2seq transliteration model
        input_tokens = tokenizer(arabizi_text, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Generate the Arabic-script transliteration
        with torch.no_grad():
            output_tokens = model.generate(
                **input_tokens,
                max_length=512,
                num_beams=4,
                early_stopping=True,
                no_repeat_ngram_size=2
            )

        arabic_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)

        return arabic_text.strip()

    except Exception as e:
        print(f"Error in Arabizi→Arabic conversion: {e}")
        # Fall back to the raw Arabizi text if conversion fails
        return arabizi_text
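
# Illustrative call of arabizi_to_arabic_ai() above. The exact output depends on the
# atlasia/Transliteration-Moroccan-Darija checkpoint, so the Arabic shown is approximate:
#   arabizi_to_arabic_ai("kifash n9der nt3elem darija?")
#   # -> roughly "كيفاش نقدر نتعلم الدارجة؟"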


def arabic_to_arabizi(arabic_text):
    """
    Convert Arabic script to Arabizi using character mappings
    (kept as a backup since no reverse model is available).
    """
    if not arabic_text:
        return arabic_text

    # Whole-word mappings, applied first so common Darija words keep their usual Arabizi spelling
    word_mappings = {
        'أنا': 'ana', 'نتا': 'nta', 'نتي': 'nti', 'هوا': 'howa', 'هيا': 'hiya',
        'حنا': 'hna', 'أحنا': 'ahna', 'نتوما': 'ntuma', 'هوما': 'huma',
        'شكون': 'shkoun', 'أشنو': 'achno', 'شنو': 'chno', 'واش': 'wach',
        'كيفاش': 'kifash', 'كيف': 'kif', 'فين': 'feen', 'منين': 'mnin',
        'إمتا': 'imta', 'متا': 'meta', 'علاش': '3lach', 'أش': 'ach',
        'بغيت': 'bghit', 'بغيتي': 'bghiti', 'بغا': 'bgha', 'بغينا': 'bghina',
        'كان': 'kan', 'كانا': 'kana', 'كانت': 'kanet', 'كانو': 'kanu',
        'قلت': 'galt', 'قلتي': 'galti', 'قال': 'gal', 'قالت': 'galet',
        'راح': 'rah', 'راها': 'raha', 'راهي': 'rahi', 'راهم': 'rahom',
        'غادي': 'ghadi', 'غاد': 'ghad', 'غا': 'gha',
        'هاد': 'had', 'هادا': 'hada', 'هادي': 'hadi', 'هادشي': 'hadchi',
        'داك': 'dak', 'ديك': 'dik', 'داكشي': 'dakchi',
        'بزاف': 'bzzaf', 'شوية': 'chwiya', 'كولشي': 'kolchi',
        'ماشي': 'machi', 'مابغيتش': 'mabghitsh', 'ماكاينش': 'makainch',
        'دابا': 'daba', 'توا': 'tawa', 'غدا': 'ghda',
        'ماما': 'mama', 'بابا': 'baba', 'خويا': 'khoya', 'ختي': 'khti',
        'سلام': 'salam', 'يالاه': 'yallah', 'هيا': 'hya',
        'المغرب': 'lmaghrib', 'مغرب': 'maghrib',
        'طاجين': 'tajine', 'أتاي': 'atay', 'خوبز': 'khobz',
        'كاين': 'kayn', 'ماكاينش': 'makaynsh', 'شي': 'chi',
        'زوين': 'zwin', 'زوينا': 'zwina', 'مزيان': 'mzyan', 'مزيانا': 'mzyana',
        'درت': 'dert', 'درتي': 'derti', 'دار': 'dar', 'درات': 'derat',
        'مشيت': 'mchit', 'مشيتي': 'mchiti', 'مشا': 'mcha', 'مشات': 'mchat',
        'جيت': 'jit', 'جيتي': 'jiti', 'جا': 'ja', 'جات': 'jat',
        'شفت': 'cheft', 'شفتي': 'chefti', 'شاف': 'chaf', 'شافت': 'chafat',
        'سمعت': 'sme3t', 'سمعتي': 'sme3ti', 'سمع': 'sma3', 'سمعات': 'sma3at',
        'أكلت': 'klit', 'أكلتي': 'kliti', 'كلا': 'kla', 'كلات': 'klat',
        'شربت': 'chrebt', 'شربتي': 'chrebti', 'شرب': 'chreb', 'شربات': 'chrebat',
        'نعست': 'ne3st', 'نعستي': 'ne3sti', 'نعس': 'ne3s', 'نعسات': 'ne3sat',
        'خرجت': 'khrjt', 'خرجتي': 'khrjti', 'خرج': 'khrj', 'خرجات': 'khrjat',
        'دخلت': 'dkhlt', 'دخلتي': 'dkhlti', 'دخل': 'dkhl', 'دخلات': 'dkhlat',
        'قريت': 'qrit', 'قريتي': 'qriti', 'قرا': 'qra', 'قرات': 'qrat',
        'كتبت': 'ktebt', 'كتبتي': 'ktebti', 'كتب': 'kteb', 'كتبات': 'ktebat',
        'لعبت': 'l3ebt', 'لعبتي': 'l3ebti', 'لعب': 'l3eb', 'لعبات': 'l3ebat',
        'خدمت': 'khdmt', 'خدمتي': 'khdmti', 'خدم': 'khdm', 'خدمات': 'khdmat',
        'صليت': 'sllit', 'صليتي': 'slliti', 'صلا': 'slla', 'صلات': 'sllat',
        'طبخت': '6bkht', 'طبختي': '6bkhti', 'طبخ': '6bekh', 'طبخات': '6bekhat',
        'واحد': 'wa7ed', 'جوج': 'joj', 'تلاتا': 'tlata', 'ربعا': 'reb3a',
        'خمسا': 'khamsa', 'ستا': 'setta', 'سبعا': 'seb3a', 'تمنيا': 'tmnya',
        'تسعا': 'tes3a', 'عشرا': '3echra', 'حداش': '7dach', 'طناش': '6nach',
        'نهار': 'nhar', 'ليلا': 'lila', 'صباح': 'sba7', 'عشيا': '3echiya',
        'أمس': 'ems', 'البارح': 'lbare7', 'غدا': 'ghda', 'بعد غدا': 'b3d ghda',
        'دار': 'dar', 'بيت': 'bit', 'شارع': 'char3', 'مدينا': 'mdina',
        'كرهوبا': 'karhouba', 'طوموبيل': 'tomobil', 'قطار': 'q6ar', 'باص': 'bas',
        'ماكلا': 'makla', 'شراب': 'chrab', 'لما': 'lma', 'عطش': '36ch',
        'جوع': 'jo3', 'شبعان': 'cheb3an', 'عيان': '3yyan', 'صحيح': 's7i7',
        'مريض': 'mrid', 'دكتور': 'doktor', 'سبيطار': 'sbitar', 'دوا': 'dwa',
        'فلوس': 'flous', 'درهم': 'derhem', 'ريال': 'riyal', 'اليورو': 'lyoro',
        'خدما': 'khedma', 'معلم': 'mo3alim', 'طالب': 'talib', 'أستاذ': 'ostaz',
        'كتاب': 'ktab', 'قلم': 'qalam', 'كاغط': 'kaghet', 'طابلا': 'tabla'
    }

    # Character-level fallback for anything not covered by the word mappings
    char_mappings = {
        'ا': 'a', 'ب': 'b', 'ت': 't', 'ث': 'th', 'ج': 'j', 'ح': '7',
        'خ': 'kh', 'د': 'd', 'ذ': 'dh', 'ر': 'r', 'ز': 'z', 'س': 's',
        'ش': 'sh', 'ص': 's', 'ض': 'd', 'ط': '6', 'ظ': 'z', 'ع': '3',
        'غ': 'gh', 'ف': 'f', 'ق': '9', 'ك': 'k', 'ل': 'l', 'م': 'm',
        'ن': 'n', 'ه': 'h', 'و': 'w', 'ي': 'y', 'ء': '2',
        'آ': 'aa', 'أ': 'a', 'إ': 'i', 'ة': 'a', 'ى': 'a',
        '؟': '?', '،': ',', '؛': ';', '：': ':', '！': '!',
        'َ': 'a', 'ُ': 'o', 'ِ': 'i', 'ً': 'an', 'ٌ': 'on', 'ٍ': 'in'
    }

    result = arabic_text

    # Replace whole words first, using word boundaries to avoid touching substrings
    for arabic_word, arabizi_word in word_mappings.items():
        result = re.sub(r'\b' + re.escape(arabic_word) + r'\b', arabizi_word, result)

    # Then transliterate any remaining Arabic characters one by one
    for arabic_char, arabizi_char in char_mappings.items():
        result = result.replace(arabic_char, arabizi_char)

    return result.strip()
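
# Illustrative call of the rule-based arabic_to_arabizi() fallback above (word
# mappings are applied first, then the character table handles whatever is left):
#   arabic_to_arabizi("كاين شي أتاي؟")   # -> roughly "kayn chi atay?"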


def chat_with_atlas(message, history):
    """Generate response from Atlas-Chat model with AI-powered Arabizi conversion"""
    if not message.strip():
        return "ahlan wa sahlan! kifash n9der n3awnek? / مرحبا! كيفاش نقدر نعاونك؟"

    try:
        atlas_model, _, _ = load_models()

        # Decide whether the user typed Arabizi (Latin letters plus digits)
        is_arabizi_input = detect_arabizi(message)

        print("\n" + "=" * 60)
        print("DEBUG LOG - FULL CONVERSION PIPELINE")
        print("=" * 60)
        print(f"ORIGINAL INPUT: '{message}'")
        print(f"ARABIZI DETECTED: {is_arabizi_input}")

        if is_arabizi_input:
            print("\nSTEP 1: Converting Arabizi to Arabic...")
            arabic_input = arabizi_to_arabic_ai(message)
            print(f"ARABIC CONVERSION: '{arabic_input}'")
            model_input = arabic_input
        else:
            print("\nNO CONVERSION NEEDED - Using original input")
            model_input = message

        print("\nSTEP 2: Sending to Atlas-Chat model...")
        print(f"MODEL INPUT: '{model_input}'")

        messages = [{"role": "user", "content": model_input}]

        outputs = atlas_model(
            messages,
            max_new_tokens=256,
            temperature=0.1,
            do_sample=True,
            pad_token_id=atlas_model.tokenizer.eos_token_id
        )

        response = outputs[0]["generated_text"][-1]["content"].strip()
        print(f"MODEL RESPONSE (Arabic): '{response}'")

        if is_arabizi_input:
            print("\nSTEP 3: Converting response back to Arabizi...")
            arabizi_response = arabic_to_arabizi(response)
            print(f"FINAL ARABIZI RESPONSE: '{arabizi_response}'")
            print("=" * 60)
            print("FINAL OUTPUT TO USER:", arabizi_response)
            print("=" * 60 + "\n")
            return arabizi_response
        else:
            print("\nNO BACK-CONVERSION NEEDED")
            print("=" * 60)
            print("FINAL OUTPUT TO USER:", response)
            print("=" * 60 + "\n")
            return response

    except Exception as e:
        print(f"\nERROR OCCURRED: {str(e)}")
        print("=" * 60 + "\n")

        # Match the error message to the user's script
        if detect_arabizi(message):
            return f"sorry, kan chi mochkil: {str(e)}. 3awd jar'b!"
        else:
            return f"عذراً، واجهت خطأ: {str(e)}. جرب مرة أخرى! / Sorry, error occurred: {str(e)}. Try again!"
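
# End-to-end flow of chat_with_atlas() above (illustrative): Arabizi input is
# transliterated to Arabic, answered by Atlas-Chat, then mapped back to Arabizi;
# Arabic or English input is passed to the model unchanged, e.g.
#   chat_with_atlas("wach kayn atay f lmaghrib?", [])   # returns an Arabizi reply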


demo = gr.ChatInterface(
    fn=chat_with_atlas,
    title="Atlas-Chat: Advanced Moroccan Arabic AI",
    description="""
**مرحبا بك في أطلس شات المطور!** Welcome to Advanced Atlas-Chat!

**AI-Powered Language Detection & Conversion:**
- **Arabic Script (العربية)** → AI responds in Arabic
- **Arabizi (3arabi bi 7oruf latin)** → AI-powered conversion → Arabizi response
- **English** → AI responds in English

**Professional Arabizi Conversion**
- Uses a specialized AI model trained on Moroccan Darija
- Context-aware conversion: "kayn chi" → "كاين شي"
- Handles complex phrases accurately

**جرب هذه الأسئلة / Try these questions:**
    """,
    examples=[
        "شكون لي صنعك؟",
        "shkoun li sna3ek?",
        "اشنو هو الطاجين؟",
        "achno howa tajine?",
        "شنو كيتسمى المنتخب المغربي؟",
        "chno kaytsma lmontakhab lmaghribi?",
        "What is Morocco famous for?",
        "كيفاش نقدر نتعلم الدارجة؟",
        "kifash n9der nt3elem darija?",
        "wach kayn atay f lmaghrib?",
        "3lach lmaghrib zwien bzzaf?",
        "kifash nsali tajine?",
        "chno homa l2aklat lmaghribiya?",
        "kayn chi restaurants zwinin f casa?",
        "mr7ba! kif dayr?"
    ],
    cache_examples=False
)
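
# Note: cache_examples=False prevents Gradio from pre-running every example through
# the model at startup; set it to True only if precomputing those responses is acceptable.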


if __name__ == "__main__":
    demo.launch()