|
import gradio as gr |
|
import torch |
|
import torch.nn.functional as F |
|
import numpy as np |
|
import plotly.express as px |
|
import pandas as pd |
|
import spaces |
|
from typing import List, Tuple |
|
from torch import Tensor |
|
from transformers import AutoTokenizer, AutoModel |
|
|
|
|
|
# Module-wide embedder instance. It starts as None and is assigned by
# process_with_embedder (lazily, on first use) and by the nested
# update_embedder_dim handler inside create_demo.
embedder = None
|
|
|
class QwenEmbedder:
    """Tokenizer + model wrapper that turns texts into L2-normalised embeddings.

    ``embedding_dim`` selects how many leading components of the pooled
    vector are kept before normalisation.  Earlier revisions reduced the
    dimension through a freshly constructed ``torch.nn.Linear(768, dim)``;
    that layer was never trained (so it scrambled the embedding space) and
    its hard-coded 768 input width does not match the 1024-wide output the
    Qwen3-Embedding-0.6B model card lists.  The model card advertises
    user-defined output dimensions, so truncation followed by
    re-normalisation is used instead.
    """

    MODEL_NAME = 'Qwen/Qwen3-Embedding-0.6B'

    def __init__(self, embedding_dim=768):
        """Load the tokenizer/model and remember the requested output width.

        Args:
            embedding_dim: Number of leading embedding components to keep.
                Coerced with ``int()`` because callers may pass a float
                (for example a UI slider value).

        Raises:
            ValueError: If ``embedding_dim`` is smaller than 1.
        """
        embedding_dim = int(embedding_dim)
        if embedding_dim < 1:
            # Fail before the (expensive) model download/load.
            raise ValueError(f"embedding_dim must be >= 1, got {embedding_dim}")

        self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL_NAME, padding_side='left')
        self.model = AutoModel.from_pretrained(self.MODEL_NAME)
        self.eod_id = self.tokenizer.convert_tokens_to_ids("<|endoftext|>")
        self.max_length = 8192
        self.embedding_dim = embedding_dim
        # Kept (always None) so code that inspects `projection` keeps working.
        self.projection = None

    def to_device(self, device):
        """Move the model (and a projection layer, if one was attached) to ``device``.

        Returns:
            ``self``, so the call can be chained.
        """
        self.model = self.model.to(device)
        if self.projection is not None:
            self.projection = self.projection.to(device)
        return self

    def _reduce_and_normalize(self, embeddings: Tensor) -> Tensor:
        """Keep the first ``embedding_dim`` columns and L2-normalise each row.

        When ``embedding_dim`` is at least the current width the vectors are
        only normalised.
        """
        if embeddings.shape[1] > self.embedding_dim:
            embeddings = embeddings[:, :self.embedding_dim]
        return F.normalize(embeddings, p=2, dim=1)

    def get_embeddings(self, texts: List[str], with_instruction: bool = False) -> Tensor:
        """Embed ``texts`` and return one unit-length row per input text.

        Args:
            texts: Strings to embed.
            with_instruction: When true, each text is wrapped with a generic
                instruction via ``get_detailed_instruct`` before tokenising.

        Returns:
            A tensor on the model's device with one normalised row per text.
        """
        if with_instruction:
            task = 'Process and understand the following text'
            texts = [get_detailed_instruct(task, text) for text in texts]

        batch_dict = tokenize(self.tokenizer, texts, self.eod_id, self.max_length)
        batch_dict = {k: v.to(self.model.device) for k, v in batch_dict.items()}

        # Inference only: everything stays inside no_grad so the result can be
        # converted with .numpy() by callers without detaching.
        with torch.no_grad():
            outputs = self.model(**batch_dict)
            pooled = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
            return self._reduce_and_normalize(pooled)
|
|
|
@spaces.GPU(duration=120)
def initialize_embedder(embedding_dim=768):
    """Create a QwenEmbedder and place it on CUDA when available, otherwise CPU.

    Args:
        embedding_dim: Forwarded unchanged to ``QwenEmbedder``.

    Returns:
        The embedder, after ``to_device`` has been applied.
    """
    target = "cpu"
    if torch.cuda.is_available():
        target = "cuda"
    print(f"Initializing embedder on device: {target}")

    return QwenEmbedder(embedding_dim=embedding_dim).to_device(target)
|
|
|
@spaces.GPU(duration=120)
def process_with_embedder(fn_name, *args):
    """Run one of the embedder operations, selected by name.

    The module-level ``embedder`` is created on first use.  The operation is
    then called as ``operation(embedder, *args)`` and its result returned.

    Args:
        fn_name: Name of the operation; must be one of the functions listed
            below.
        *args: Positional arguments passed after the embedder.

    Raises:
        KeyError: If ``fn_name`` is not a known operation.
    """
    global embedder
    if embedder is None:
        embedder = initialize_embedder()

    # Every dispatch key is the handler's own function name.
    operations = (
        compute_similarity,
        rerank_documents,
        process_batch_embeddings,
        process_retrieval,
        process_cross_lingual,
        classify_text,
        cluster_documents,
        analyze_sentiment,
        extract_concepts,
    )
    dispatch = {operation.__name__: operation for operation in operations}

    handler = dispatch[fn_name]
    return handler(embedder, *args)
|
|
|
|
|
# Import-time device probe: pick CUDA when torch reports it, place a one-element
# tensor there, and log where that tensor actually ended up.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
zero = torch.Tensor([0]).to(device)
print(f"Device being used: {zero.device}")
|
|
|
def last_token_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    """Select, for every sequence, the hidden state of its last attended token.

    If the final mask column is set for every row (the batch is left-padded,
    or not padded at all) the last position is taken directly.  Otherwise the
    index of each row's last attended token is derived from the mask sum.

    Args:
        last_hidden_states: Hidden states indexed as [row, position, ...].
        attention_mask: Mask indexed as [row, position]; attended positions
            count as 1 in the sums below.

    Returns:
        One hidden-state vector per row.
    """
    row_count = attention_mask.shape[0]
    if attention_mask[:, -1].sum() == row_count:
        return last_hidden_states[:, -1]

    last_positions = attention_mask.sum(dim=1) - 1
    row_index = torch.arange(last_hidden_states.shape[0], device=last_hidden_states.device)
    return last_hidden_states[row_index, last_positions]
|
|
|
def get_detailed_instruct(task_description: str, query: str) -> str:
    """Combine a task description and a query into the two-line instruct prompt."""
    return 'Instruct: {}\nQuery: {}'.format(task_description, query)
|
|
|
def tokenize(tokenizer, input_texts, eod_id, max_length):
    """Tokenise texts, append ``eod_id`` to every sequence, then pad to tensors.

    Args:
        tokenizer: Callable tokenizer that also provides ``pad``.
        input_texts: Texts to encode.
        eod_id: Token id appended to each encoded sequence.
        max_length: Length budget; the tokenizer truncates to ``max_length - 2``.

    Returns:
        Whatever ``tokenizer.pad(..., padding=True, return_tensors="pt")`` returns.
    """
    encoded = tokenizer(
        input_texts,
        padding=False,
        truncation=True,
        max_length=max_length - 2,
    )
    # Extend ids and mask together so they stay the same length.
    for token_ids in encoded["input_ids"]:
        token_ids.append(eod_id)
    for mask in encoded["attention_mask"]:
        mask.append(1)
    return tokenizer.pad(encoded, padding=True, return_tensors="pt")
|
|
|
def compute_similarity(embedder: QwenEmbedder, text1: str, text2: str) -> float:
    """Return the cosine similarity of two texts, rounded to three decimals."""
    pair = embedder.get_embeddings([text1, text2])
    first, second = pair[0:1], pair[1:2]
    score = torch.cosine_similarity(first, second).item()
    return round(score, 3)
|
|
|
def rerank_documents(embedder: QwenEmbedder, query: str, documents: str) -> List[Tuple[str, float]]:
    """Rank newline-separated documents by relevance to ``query``.

    The query is wrapped in a retrieval instruction; documents are embedded
    as-is.  Relevance is the dot product of the embeddings returned by
    ``embedder.get_embeddings``.

    Args:
        embedder: Object providing ``get_embeddings``.
        query: Search query.
        documents: One document per line; blank lines are ignored.

    Returns:
        ``(document, score)`` pairs, best first, scores rounded to three
        decimals.  An empty list when no non-blank document was supplied.
    """
    docs_list = [doc.strip() for doc in documents.split('\n') if doc.strip()]
    if not docs_list:
        # Same early exit the batch/retrieval helpers use: never hand an
        # empty batch to the embedder.
        return []

    task = 'Given a search query, retrieve relevant passages that answer the query'
    query_with_instruct = get_detailed_instruct(task, query)

    query_embedding = embedder.get_embeddings([query_with_instruct])
    doc_embeddings = embedder.get_embeddings(docs_list)

    scores = (query_embedding @ doc_embeddings.T).squeeze(0).tolist()
    ranked = sorted(zip(docs_list, scores), key=lambda pair: pair[1], reverse=True)

    return [(doc, round(score, 3)) for doc, score in ranked]
|
|
|
def process_batch_embeddings(embedder: QwenEmbedder, texts: str) -> pd.DataFrame:
    """Build the pairwise similarity matrix of newline-separated texts.

    Returns:
        A DataFrame labelled by the texts on both axes, rounded to three
        decimals; an empty DataFrame when no non-blank line is present.
    """
    stripped_lines = (line.strip() for line in texts.split('\n'))
    labels = [line for line in stripped_lines if line]
    if not labels:
        return pd.DataFrame()

    vectors = embedder.get_embeddings(labels)
    pairwise = (vectors @ vectors.T).cpu().numpy()

    return pd.DataFrame(pairwise, index=labels, columns=labels).round(3)
|
|
|
def process_retrieval(embedder: QwenEmbedder, task_prompt: str, queries: str, documents: str) -> pd.DataFrame:
    """Score every query against every document.

    Queries and documents are given one per line (blank lines ignored).  Each
    query is prefixed with ``task_prompt`` through ``get_detailed_instruct``;
    documents are embedded unchanged.

    Returns:
        A DataFrame with queries as rows and documents as columns, rounded to
        three decimals; an empty DataFrame if either list is empty.
    """
    query_list = [line for line in map(str.strip, queries.split('\n')) if line]
    doc_list = [line for line in map(str.strip, documents.split('\n')) if line]
    if not (query_list and doc_list):
        return pd.DataFrame()

    prompted = [get_detailed_instruct(task_prompt, item) for item in query_list]
    query_embeddings = embedder.get_embeddings(prompted)
    doc_embeddings = embedder.get_embeddings(doc_list)

    relevance = (query_embeddings @ doc_embeddings.T).cpu().numpy()
    table = pd.DataFrame(relevance, index=query_list, columns=doc_list)
    return table.round(3)
|
|
|
def process_cross_lingual(embedder: QwenEmbedder, arabic_text: str, english_text: str) -> dict:
    """Compare an Arabic and an English text.

    Returns:
        ``{"similarity": value}`` where ``value`` is the cosine similarity of
        the two embeddings rounded to three decimals.
    """
    vectors = embedder.get_embeddings([arabic_text, english_text])
    score = torch.cosine_similarity(vectors[0:1], vectors[1:2]).item()
    return {"similarity": round(score, 3)}
|
|
|
def classify_text(embedder: QwenEmbedder, text: str, categories: str) -> List[Tuple[str, float]]:
    """Rank newline-separated categories by similarity to ``text``.

    Args:
        embedder: Object providing ``get_embeddings``.
        text: Text to classify.
        categories: One category per line; blank lines are ignored.

    Returns:
        ``(category, score)`` pairs, best first, scores rounded to three
        decimals.  An empty list when no non-blank category was supplied.
    """
    cat_list = [c.strip() for c in categories.split('\n') if c.strip()]
    if not cat_list:
        # Consistent with the other helpers: never embed an empty batch.
        return []

    text_embedding = embedder.get_embeddings([text])
    cat_embeddings = embedder.get_embeddings(cat_list)

    scores = (text_embedding @ cat_embeddings.T).squeeze(0).tolist()
    ranked = sorted(zip(cat_list, scores), key=lambda pair: pair[1], reverse=True)
    return [(cat, round(score, 3)) for cat, score in ranked]
|
|
|
def cluster_documents(embedder: QwenEmbedder, documents: str, num_clusters: int) -> pd.DataFrame:
    """Cluster newline-separated documents with KMeans over their embeddings.

    For every cluster the member document whose embedding has the highest
    cosine similarity to the cluster centre is reported as its
    representative.

    Args:
        embedder: Object providing ``get_embeddings``.
        documents: One document per line; blank lines are ignored.
        num_clusters: Requested number of clusters.  Coerced with ``int()``
            because it may arrive as a float (for example from a UI slider),
            which ``range`` would reject.

    Returns:
        A DataFrame with columns ``Document``, ``Cluster`` and
        ``Cluster Center Document``, sorted by cluster.  An empty DataFrame
        when there are fewer documents than clusters or ``num_clusters < 1``.
    """
    from sklearn.cluster import KMeans

    num_clusters = int(num_clusters)
    doc_list = [doc.strip() for doc in documents.split('\n') if doc.strip()]
    if num_clusters < 1 or len(doc_list) < num_clusters:
        return pd.DataFrame()

    # Embed once and keep everything on the CPU as NumPy.  The earlier code
    # re-embedded each cluster and compared those (model-device) tensors with
    # a CPU tensor built from the centroid, which mismatches devices on CUDA.
    vectors = embedder.get_embeddings(doc_list).cpu().numpy()

    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    clusters = kmeans.fit_predict(vectors)

    cluster_center_docs = {}
    for cluster_id, center in enumerate(kmeans.cluster_centers_):
        member_idx = np.flatnonzero(clusters == cluster_id)
        if member_idx.size == 0:
            # No document carries this label, so no row will look it up.
            continue
        members = vectors[member_idx]
        norms = np.linalg.norm(members, axis=1) * np.linalg.norm(center)
        similarities = (members @ center) / np.maximum(norms, 1e-8)
        best = member_idx[int(similarities.argmax())]
        cluster_center_docs[cluster_id] = doc_list[best]

    df = pd.DataFrame({
        'Document': doc_list,
        'Cluster': clusters,
        'Cluster Center Document': [cluster_center_docs[int(c)] for c in clusters],
    })
    return df.sort_values('Cluster')
|
|
|
def analyze_sentiment(embedder: QwenEmbedder, text: str) -> Tuple[str, dict]:
    """Score ``text`` against five fixed Arabic sentiment anchor sentences.

    Returns:
        A pair ``(best_label, scores)`` where ``scores`` maps every sentiment
        label to its similarity (rounded to three decimals), ordered from
        highest to lowest.
    """
    anchors = {
        "very_positive": "هذا رائع جداً ومدهش! أنا سعيد للغاية",
        "positive": "هذا جيد وممتع",
        "neutral": "هذا عادي ومقبول",
        "negative": "هذا سيء ومزعج",
        "very_negative": "هذا فظيع جداً ومحبط للغاية"
    }
    labels = list(anchors.keys())
    anchor_sentences = list(anchors.values())

    text_embedding = embedder.get_embeddings([text])
    anchor_embeddings = embedder.get_embeddings(anchor_sentences)

    similarities = (text_embedding @ anchor_embeddings.T).squeeze(0).tolist()
    ranked = sorted(zip(labels, similarities), key=lambda item: item[1], reverse=True)

    rounded_scores = {}
    for label, value in ranked:
        rounded_scores[label] = round(float(value), 3)

    top_label = ranked[0][0]
    return (top_label, rounded_scores)
|
|
|
def extract_concepts(embedder: QwenEmbedder, text: str, concept_type: str) -> List[Tuple[str, float]]:
    """Rank a fixed list of Arabic concept phrases by similarity to ``text``.

    Args:
        embedder: Object providing ``get_embeddings``.
        text: Text to analyse.
        concept_type: ``"emotions"``, ``"topics"`` or ``"themes"``; any other
            value falls back to the ``"topics"`` list.

    Returns:
        ``(concept, score)`` pairs, best first, scores rounded to three
        decimals.
    """
    emotions = [
        "الفرح والسعادة",
        "الحزن والأسى",
        "الغضب والإحباط",
        "الخوف والقلق",
        "الحب والعاطفة",
        "الأمل والتفاؤل"
    ]
    topics = [
        "السياسة والحكم",
        "الاقتصاد والمال",
        "العلوم والتكنولوجيا",
        "الفن والثقافة",
        "الرياضة والترفيه",
        "التعليم والمعرفة"
    ]
    themes = [
        "العدالة والمساواة",
        "التقدم والتطور",
        "التقاليد والتراث",
        "الحرية والاستقلال",
        "التعاون والوحدة",
        "الإبداع والابتكار"
    ]
    concept_anchors = {"emotions": emotions, "topics": topics, "themes": themes}

    if concept_type in concept_anchors:
        anchors = concept_anchors[concept_type]
    else:
        anchors = concept_anchors["topics"]

    text_embedding = embedder.get_embeddings([text])
    anchor_embeddings = embedder.get_embeddings(anchors)

    similarities = (text_embedding @ anchor_embeddings.T).squeeze(0)
    ranked = [(phrase, float(value)) for phrase, value in zip(anchors, similarities)]
    ranked.sort(key=lambda item: item[1], reverse=True)

    return [(phrase, round(value, 3)) for phrase, value in ranked]
|
|
|
|
|
# Stylesheet handed to gr.Blocks(css=...) in create_demo. It declares the
# colour variables and the classes referenced through elem_classes
# ("sidebar", "settings", "tab-content") and by the inline HTML/Markdown
# snippets ("features-grid", "feature-card", "feature-icon", "description",
# "example").
custom_css = """
:root {
    --primary-color: #2196F3;
    --secondary-color: #1976D2;
    --background-color: #f8f9fa;
    --sidebar-bg: #ffffff;
    --text-color: #333333;
    --border-color: #e0e0e0;
}

.container {
    max-width: 1200px;
    margin: auto;
    padding: 20px;
}

.sidebar {
    background-color: var(--sidebar-bg);
    border-right: 1px solid var(--border-color);
    padding: 20px;
    margin-right: 20px;
    position: sticky;
    top: 0;
    height: 100vh;
    overflow-y: auto;
}

.main-content {
    background-color: var(--background-color);
    padding: 20px;
    border-radius: 10px;
}

.features-grid {
    display: grid;
    grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
    gap: 15px;
    margin: 15px 0;
}

.feature-card {
    background: white;
    padding: 15px;
    border-radius: 6px;
    box-shadow: 0 1px 3px rgba(0,0,0,0.1);
    transition: all 0.3s ease;
    border: 1px solid var(--border-color);
    text-align: center;
}

.feature-card:hover {
    transform: translateY(-3px);
    box-shadow: 0 3px 6px rgba(0,0,0,0.15);
    border-color: var(--primary-color);
}

.feature-icon {
    font-size: 24px;
    margin-bottom: 10px;
    color: var(--primary-color);
}

.feature-card h3 {
    color: var(--text-color);
    margin: 8px 0;
    font-size: 0.95em;
    font-weight: 600;
}

.feature-card p {
    color: #666;
    font-size: 0.8em;
    line-height: 1.3;
    margin: 5px 0;
}

.features-summary {
    margin: 40px 0;
    padding: 30px;
    background: white;
    border-radius: 12px;
    box-shadow: 0 2px 8px rgba(0,0,0,0.1);
}

.features-summary h2 {
    color: var(--text-color);
    margin-bottom: 25px;
    text-align: center;
    font-size: 1.5em;
}

.feature-list {
    display: grid;
    grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
    gap: 30px;
}

.feature-group {
    padding: 20px;
    background: var(--background-color);
    border-radius: 8px;
    border: 1px solid var(--border-color);
}

.feature-group h3 {
    color: var(--primary-color);
    margin-bottom: 15px;
    font-size: 1.2em;
}

.feature-group ul {
    list-style: none;
    padding: 0;
    margin: 0;
}

.feature-group li {
    padding: 8px 0;
    color: var(--text-color);
    position: relative;
    padding-left: 20px;
}

.feature-group li:before {
    content: "•";
    color: var(--primary-color);
    position: absolute;
    left: 0;
}

.description {
    margin: 20px 0;
    padding: 15px;
    border-radius: 8px;
    background-color: #ffffff;
    box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}

.example {
    margin: 10px 0;
    padding: 15px;
    border-left: 4px solid var(--primary-color);
    background-color: #ffffff;
    box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}

.warning {
    color: #721c24;
    background-color: #f8d7da;
    border: 1px solid #f5c6cb;
    padding: 15px;
    border-radius: 8px;
    margin: 10px 0;
}

.settings {
    background-color: #ffffff;
    padding: 20px;
    border-radius: 8px;
    box-shadow: 0 2px 4px rgba(0,0,0,0.1);
    margin: 20px 0;
}

.tab-content {
    padding: 20px;
    background-color: #ffffff;
    border-radius: 8px;
    box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}

.heading {
    color: var(--text-color);
    margin-bottom: 20px;
    padding-bottom: 10px;
    border-bottom: 2px solid var(--primary-color);
}

button.primary {
    background-color: var(--primary-color) !important;
}

button.secondary {
    background-color: var(--secondary-color) !important;
}
"""
|
|
|
|
|
def create_demo():
    """Build the Gradio Blocks UI and wire every tab to its embedder operation.

    Layout: a sidebar column (navigation, embedding-dimension settings,
    usage guide) next to a main column holding a feature overview and one
    tab per operation.  Each button calls ``process_with_embedder`` with the
    name of the operation to run.

    Returns:
        The assembled ``gr.Blocks`` instance (not yet launched).
    """
    demo = gr.Blocks(title="Advanced Text Processing with Qwen", css=custom_css, theme=gr.themes.Soft())

    with demo:
        with gr.Row():

            # ---- Sidebar: navigation, model settings, documentation ----
            with gr.Column(scale=1, elem_classes="sidebar"):
                gr.Markdown("""
                # Qwen Embeddings

                ### Navigation
                - [Configuration](#configuration)
                - [Features](#features)
                - [Documentation](#documentation)
                """)

                with gr.Accordion("Configuration", open=True):
                    gr.Markdown("""
                    ### Model Settings
                    Configure the embedding model parameters below.
                    """)

                    embedding_dim = gr.Slider(
                        minimum=32,
                        maximum=1024,
                        value=768,
                        step=32,
                        label="Embedding Dimension",
                        elem_classes="settings"
                    )
                    update_dim_btn = gr.Button("Update Dimension", variant="secondary")
                    dim_status = gr.Textbox(label="Status", interactive=False)

                with gr.Accordion("Documentation", open=False):
                    gr.Markdown("""
                    ### Usage Guide

                    1. **Embedding Dimension**
                    - 32-128: Fast, simple tasks
                    - 256-512: Balanced performance
                    - 768: Default, full model
                    - 1024: Maximum detail

                    2. **Best Practices**
                    - Use appropriate dimensions for your task
                    - Consider batch size for multiple documents
                    - Test different settings for optimal results
                    """)

            # ---- Main column: intro, feature cards, one tab per operation ----
            with gr.Column(scale=4):
                gr.Markdown("""
                # Advanced Text Processing Suite

                Welcome to the Advanced Text Processing Suite powered by Qwen Embeddings.
                This tool provides state-of-the-art text analysis capabilities with support for Arabic and multiple languages.
                """)

                gr.HTML("""
                <div class="features-grid">
                <div class="feature-card">
                <div class="feature-icon">🔄</div>
                <h3>Text Similarity</h3>
                <p>Compare text meanings</p>
                </div>
                <div class="feature-card">
                <div class="feature-icon">🔍</div>
                <h3>Semantic Search</h3>
                <p>Find relevant docs</p>
                </div>
                <div class="feature-card">
                <div class="feature-icon">📊</div>
                <h3>Batch Analysis</h3>
                <p>Process multiple texts</p>
                </div>
                <div class="feature-card">
                <div class="feature-icon">🎯</div>
                <h3>Multi-Query</h3>
                <p>Advanced retrieval</p>
                </div>
                <div class="feature-card">
                <div class="feature-icon">🌐</div>
                <h3>Cross-Lingual</h3>
                <p>Cross-language match</p>
                </div>
                <div class="feature-card">
                <div class="feature-icon">🏷️</div>
                <h3>Classification</h3>
                <p>Categorize texts</p>
                </div>
                <div class="feature-card">
                <div class="feature-icon">🔮</div>
                <h3>Clustering</h3>
                <p>Group documents</p>
                </div>
                <div class="feature-card">
                <div class="feature-icon">😊</div>
                <h3>Sentiment</h3>
                <p>Analyze emotions</p>
                </div>
                <div class="feature-card">
                <div class="feature-icon">🎨</div>
                <h3>Concepts</h3>
                <p>Extract themes</p>
                </div>
                </div>
                """)

                with gr.Tabs():

                    with gr.Tab("Text Similarity Analysis"):
                        with gr.Column(elem_classes="tab-content"):
                            gr.Markdown("""
                            ### Text Similarity Analysis
                            Compare the semantic similarity between two texts. The score ranges from 0 (completely different) to 1 (identical meaning).

                            <div class="example">
                            <strong>Try these Arabic examples:</strong><br>
                            • "أحب القراءة كثيراً" and "القراءة من أحب هواياتي"<br>
                            • "السماء صافية اليوم" and "الطقس حار جداً"
                            </div>
                            """)

                            with gr.Row():
                                text1 = gr.Textbox(
                                    label="First Text",
                                    lines=3,
                                    placeholder="Enter first text here...",
                                    value="أحب القراءة كثيراً"
                                )
                                text2 = gr.Textbox(
                                    label="Second Text",
                                    lines=3,
                                    placeholder="Enter second text here...",
                                    value="القراءة من أحب هواياتي"
                                )
                            similarity_btn = gr.Button("Calculate Similarity", variant="primary")
                            similarity_score = gr.Number(label="Similarity Score")

                            similarity_btn.click(
                                fn=lambda t1, t2: process_with_embedder('compute_similarity', t1, t2),
                                inputs=[text1, text2],
                                outputs=similarity_score
                            )

                    with gr.Tab("Semantic Search & Reranking"):
                        with gr.Column(elem_classes="tab-content"):
                            gr.Markdown("""
                            ### Semantic Search & Document Reranking
                            Search through a collection of documents and rank them by semantic relevance to your query.

                            <div class="example">
                            <strong>Try these Arabic queries:</strong><br>
                            • "ما هي عواصم الدول العربية؟"<br>
                            • "أين تقع أكبر المدن العربية؟"<br>
                            • "ما هي المراكز الثقافية العربية؟"
                            </div>
                            """)

                            query_text = gr.Textbox(
                                label="Search Query",
                                placeholder="Enter your search query...",
                                value="ما هي عواصم الدول العربية؟"
                            )
                            documents_text = gr.Textbox(
                                label="Documents Collection (one per line)",
                                lines=10,
                                placeholder="Enter documents here, one per line...",
                                value="""القاهرة هي عاصمة جمهورية مصر العربية وأكبر مدنها.
الرياض هي عاصمة المملكة العربية السعودية ومركزها الاقتصادي.
دمشق هي أقدم عاصمة مأهولة في التاريخ وهي عاصمة سوريا.
بغداد عاصمة العراق وتقع على نهر دجلة.
الدار البيضاء أكبر مدن المغرب وعاصمته الاقتصادية.
تونس هي عاصمة الجمهورية التونسية ومركزها الثقافي."""
                            )
                            rerank_btn = gr.Button("Search & Rank", variant="primary")
                            rerank_results = gr.Dataframe(
                                headers=["Document", "Relevance Score"],
                                label="Search Results"
                            )

                            rerank_btn.click(
                                fn=lambda q, d: process_with_embedder('rerank_documents', q, d),
                                inputs=[query_text, documents_text],
                                outputs=rerank_results
                            )

                    with gr.Tab("Batch Similarity Analysis"):
                        with gr.Column(elem_classes="tab-content"):
                            gr.Markdown("""
                            ### Batch Similarity Analysis
                            Analyze semantic relationships between multiple texts simultaneously.

                            <div class="example">
                            <strong>The example shows Arabic proverbs about friendship:</strong><br>
                            See how the model captures the semantic relationships between similar themes.
                            </div>
                            """)

                            batch_texts = gr.Textbox(
                                label="Input Texts (one per line)",
                                lines=10,
                                placeholder="Enter texts here, one per line...",
                                value="""الصديق وقت الضيق.
الصديق الحقيقي يظهر عند الشدائد.
عند المحن تعرف إخوانك.
وقت الشدة بتعرف صحابك.
الصاحب ساحب."""
                            )
                            process_btn = gr.Button("Analyze Relationships", variant="primary")
                            similarity_matrix = gr.Dataframe(
                                label="Similarity Matrix",
                                wrap=True
                            )

                            process_btn.click(
                                fn=lambda t: process_with_embedder('process_batch_embeddings', t),
                                inputs=[batch_texts],
                                outputs=[similarity_matrix]
                            )

                    with gr.Tab("Multi-Query Retrieval"):
                        with gr.Column(elem_classes="tab-content"):
                            gr.Markdown("""
                            ### Multi-Query Document Retrieval
                            Match multiple queries against multiple documents simultaneously using semantic search.

                            <div class="description">
                            This tab implements the exact retrieval logic from the Qwen example, allowing you to:
                            - Define a custom task prompt
                            - Input multiple queries
                            - Input multiple documents
                            - See all query-document match scores in a matrix
                            </div>

                            <div class="example">
                            <strong>Try these examples:</strong><br>
                            <strong>Task prompt:</strong> "Given a web search query, retrieve relevant passages that answer the query"<br>
                            <strong>Queries:</strong>
                            • "ما هي أكبر المدن العربية؟"
                            • "أين تقع أهم المراكز الثقافية؟"<br>
                            <strong>Documents:</strong> Use the example documents or add your own
                            </div>
                            """)

                            task_prompt = gr.Textbox(
                                label="Task Prompt",
                                placeholder="Enter the task description here...",
                                value="Given a web search query, retrieve relevant passages that answer the query",
                                lines=2
                            )

                            with gr.Row():
                                queries_text = gr.Textbox(
                                    label="Queries (one per line)",
                                    placeholder="Enter your queries here, one per line...",
                                    value="""ما هي أكبر المدن العربية؟
أين تقع أهم المراكز الثقافية؟""",
                                    lines=5
                                )
                                # Named separately from the reranking tab's
                                # `documents_text` so neither component
                                # shadows the other.
                                retrieval_documents_text = gr.Textbox(
                                    label="Documents (one per line)",
                                    placeholder="Enter your documents here, one per line...",
                                    value="""القاهرة هي أكبر مدينة عربية وعاصمة مصر، وتضم العديد من المعالم الثقافية والتاريخية.
الرياض عاصمة المملكة العربية السعودية ومركز ثقافي واقتصادي مهم.
دبي مدينة عالمية في الإمارات العربية المتحدة ومركز تجاري رئيسي.
بيروت عاصمة لبنان ومركز ثقافي مهم في العالم العربي.""",
                                    lines=5
                                )

                            retrieve_btn = gr.Button("Process Retrieval", variant="primary")
                            retrieval_matrix = gr.Dataframe(
                                label="Query-Document Relevance Matrix",
                                wrap=True
                            )

                            gr.Markdown("""
                            <div class="description">
                            <strong>How to read the results:</strong>
                            - Each row represents a query
                            - Each column represents a document
                            - Values show the relevance score (0-1) between each query-document pair
                            - Higher scores indicate better matches
                            </div>
                            """)

                            retrieve_btn.click(
                                fn=lambda p, q, d: process_with_embedder('process_retrieval', p, q, d),
                                inputs=[task_prompt, queries_text, retrieval_documents_text],
                                outputs=[retrieval_matrix]
                            )

                    with gr.Tab("Cross-Lingual Matching"):
                        with gr.Column(elem_classes="tab-content"):
                            gr.Markdown("""
                            ### Cross-Lingual Semantic Matching
                            Compare the meaning of texts across Arabic and English languages.

                            <div class="description">
                            This feature demonstrates the model's ability to understand semantic similarity across different languages.
                            Try comparing similar concepts expressed in Arabic and English to see how well the model captures cross-lingual meaning.
                            </div>

                            <div class="example">
                            <strong>Try these examples:</strong><br>
                            <strong>Arabic:</strong> "القراءة غذاء العقل والروح"<br>
                            <strong>English:</strong> "Reading nourishes the mind and soul"<br>
                            Or try your own pairs of semantically similar texts in both languages.
                            </div>
                            """)

                            with gr.Row():
                                arabic_text = gr.Textbox(
                                    label="Arabic Text",
                                    placeholder="Enter Arabic text here...",
                                    value="القراءة غذاء العقل والروح",
                                    lines=3
                                )
                                english_text = gr.Textbox(
                                    label="English Text",
                                    placeholder="Enter English text here...",
                                    value="Reading nourishes the mind and soul",
                                    lines=3
                                )

                            match_btn = gr.Button("Compare Texts", variant="primary")
                            with gr.Row():
                                cross_lingual_score = gr.Number(
                                    label="Cross-Lingual Similarity Score",
                                    value=None
                                )

                            gr.Markdown("""
                            <div class="description">
                            <strong>Understanding the score:</strong>
                            - Score ranges from 0 (completely different meaning) to 1 (same meaning)
                            - Scores above 0.7 usually indicate strong semantic similarity
                            - The model considers the meaning, not just word-for-word translation
                            </div>
                            """)

                            # process_cross_lingual returns {"similarity": x};
                            # the Number output needs the bare number, so
                            # unwrap it here instead of passing the dict on.
                            match_btn.click(
                                fn=lambda a, e: process_with_embedder('process_cross_lingual', a, e)["similarity"],
                                inputs=[arabic_text, english_text],
                                outputs=[cross_lingual_score]
                            )

                    with gr.Tab("Text Classification"):
                        with gr.Column(elem_classes="tab-content"):
                            gr.Markdown("""
                            ### Text Classification
                            Classify text into predefined categories using semantic similarity.

                            <div class="description">
                            The model will compare your text against each category and rank them by relevance.
                            You can define your own categories or use the provided examples.
                            </div>
                            """)

                            input_text = gr.Textbox(
                                label="Input Text",
                                placeholder="Enter the text to classify...",
                                value="الذكاء الاصطناعي يغير طريقة عملنا وتفكيرنا في المستقبل",
                                lines=3
                            )

                            categories_text = gr.Textbox(
                                label="Categories (one per line)",
                                placeholder="Enter categories here...",
                                value="""التكنولوجيا والابتكار
الاقتصاد والأعمال
التعليم والتدريب
الثقافة والفنون
الصحة والطب""",
                                lines=5
                            )

                            classify_btn = gr.Button("Classify Text", variant="primary")
                            classification_results = gr.Dataframe(
                                headers=["Category", "Relevance Score"],
                                label="Classification Results"
                            )

                            classify_btn.click(
                                fn=lambda t, c: process_with_embedder('classify_text', t, c),
                                inputs=[input_text, categories_text],
                                outputs=classification_results
                            )

                    with gr.Tab("Document Clustering"):
                        with gr.Column(elem_classes="tab-content"):
                            gr.Markdown("""
                            ### Document Clustering
                            Group similar documents together using semantic clustering.

                            <div class="description">
                            This feature will:
                            - Group similar documents into clusters
                            - Identify the most representative document for each cluster
                            - Help discover themes and patterns in your document collection
                            </div>
                            """)

                            cluster_docs = gr.Textbox(
                                label="Documents (one per line)",
                                placeholder="Enter documents to cluster...",
                                value="""الذكاء الاصطناعي يفتح آفاقاً جديدة في مجال الطب.
الروبوتات تساعد الأطباء في إجراء العمليات الجراحية.
التعلم الآلي يحسن من دقة التشخيص الطبي.
الفن يعبر عن مشاعر الإنسان وأحاسيسه.
الموسيقى لغة عالمية تتخطى حدود الثقافات.
الرسم والنحت من أقدم أشكال التعبير الفني.
التجارة الإلكترونية تغير نمط التسوق التقليدي.
التسوق عبر الإنترنت يوفر الوقت والجهد.
المتاجر الرقمية تتيح خيارات أوسع للمستهلكين.""",
                                lines=10
                            )

                            num_clusters = gr.Slider(
                                minimum=2,
                                maximum=10,
                                value=3,
                                step=1,
                                label="Number of Clusters"
                            )

                            cluster_btn = gr.Button("Cluster Documents", variant="primary")
                            clustering_results = gr.Dataframe(
                                label="Clustering Results"
                            )

                            cluster_btn.click(
                                fn=lambda d, n: process_with_embedder('cluster_documents', d, n),
                                inputs=[cluster_docs, num_clusters],
                                outputs=clustering_results
                            )

                    with gr.Tab("Sentiment Analysis"):
                        with gr.Column(elem_classes="tab-content"):
                            gr.Markdown("""
                            ### Arabic Sentiment Analysis
                            Analyze the sentiment of Arabic text using semantic similarity to sentiment anchors.

                            <div class="description">
                            The model will compare your text against predefined sentiment anchors and determine:
                            - The overall sentiment
                            - Confidence scores for each sentiment level
                            </div>
                            """)

                            sentiment_text = gr.Textbox(
                                label="Text to Analyze",
                                placeholder="Enter text to analyze sentiment...",
                                value="هذا المشروع رائع جداً وسيحدث تغييراً إيجابياً في حياة الكثيرين",
                                lines=3
                            )

                            analyze_btn = gr.Button("Analyze Sentiment", variant="primary")

                            with gr.Row():
                                sentiment_label = gr.Label(label="Overall Sentiment")
                                sentiment_scores = gr.Json(label="Detailed Scores")

                            analyze_btn.click(
                                fn=lambda t: process_with_embedder('analyze_sentiment', t),
                                inputs=[sentiment_text],
                                outputs=[sentiment_label, sentiment_scores]
                            )

                    with gr.Tab("Concept Extraction"):
                        with gr.Column(elem_classes="tab-content"):
                            gr.Markdown("""
                            ### Concept Extraction
                            Extract key concepts and themes from Arabic text.

                            <div class="description">
                            Analyze text to identify:
                            - Emotional content
                            - Main topics
                            - Underlying themes
                            </div>
                            """)

                            concept_text = gr.Textbox(
                                label="Text to Analyze",
                                placeholder="Enter text to analyze...",
                                value="نحن نؤمن بأهمية التعليم والابتكار لبناء مستقبل أفضل لأجيالنا القادمة",
                                lines=3
                            )

                            concept_type = gr.Radio(
                                choices=["emotions", "topics", "themes"],
                                value="themes",
                                label="Concept Type"
                            )

                            extract_btn = gr.Button("Extract Concepts", variant="primary")
                            concept_results = gr.Dataframe(
                                headers=["Concept", "Relevance Score"],
                                label="Extracted Concepts"
                            )

                            extract_btn.click(
                                fn=lambda t, c: process_with_embedder('extract_concepts', t, c),
                                inputs=[concept_text, concept_type],
                                outputs=concept_results
                            )

        @spaces.GPU(duration=120)
        def update_embedder_dim(dim):
            """Rebuild the module-level embedder with a new dimension.

            Returns a status message; failures are reported as text instead
            of being raised, because this is the UI boundary.
            """
            global embedder
            try:
                embedder = initialize_embedder(embedding_dim=dim)
                return f"Successfully updated embedding dimension to {dim}"
            except Exception as e:
                return f"Error updating dimension: {str(e)}"

        update_dim_btn.click(
            fn=update_embedder_dim,
            inputs=[embedding_dim],
            outputs=dim_status
        )

    return demo
|
|
|
if __name__ == "__main__":
    # Script entry point: build the UI, enable the request queue, then serve.
    demo = create_demo()
    demo.queue().launch()