Omartificial-Intelligence-Space committed on
Commit
e01888b
·
verified ·
1 Parent(s): a2c9fec

Upload 2 files

Browse files
Files changed (2) hide show
  1. qwen_embedding_app.py +1014 -0
  2. requirements.txt +10 -0
qwen_embedding_app.py ADDED
@@ -0,0 +1,1014 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import torch.nn.functional as F
4
+ import numpy as np
5
+ import plotly.express as px
6
+ import pandas as pd
7
+ import spaces
8
+ from typing import List, Tuple
9
+ from torch import Tensor
10
+ from transformers import AutoTokenizer, AutoModel
11
+
12
+ # Check for GPU support and configure appropriately
13
# Prefer the CUDA device when torch reports one, otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Probe tensor placed on the chosen device; its `.device` is what gets logged.
zero = torch.Tensor([0]).to(device)
print(f"Device being used: {zero.device}")
16
+
17
def last_token_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    """Pick, for every sequence in the batch, the hidden state of its last real token.

    Args:
        last_hidden_states: Tensor indexed as ``[batch, position, ...]``.
        attention_mask: Tensor indexed as ``[batch, position]``; summed along
            ``dim=1`` to locate the last attended position of each row.

    Returns:
        One hidden-state vector per batch row.
    """
    # If every row attends to the final position, the batch is left-padded and
    # the last column already holds each sequence's final token.
    rows_attending_last = attention_mask[:, -1].sum()
    if rows_attending_last == attention_mask.shape[0]:
        return last_hidden_states[:, -1]

    # Otherwise: index each row at (number of attended tokens - 1).
    row_index = torch.arange(last_hidden_states.shape[0], device=last_hidden_states.device)
    last_position = attention_mask.sum(dim=1) - 1
    return last_hidden_states[row_index, last_position]
25
+
26
def get_detailed_instruct(task_description: str, query: str) -> str:
    """Build the two-line ``Instruct: ... / Query: ...`` prompt for a query.

    Args:
        task_description: Text placed after ``Instruct: `` on the first line.
        query: Text placed after ``Query: `` on the second line.

    Returns:
        The combined prompt string.
    """
    template = 'Instruct: {}\nQuery: {}'
    return template.format(task_description, query)
28
+
29
def tokenize(tokenizer, input_texts, eod_id, max_length):
    """Tokenize texts, append ``eod_id`` to every sequence, then pad into tensors.

    Args:
        tokenizer: Callable tokenizer that also exposes ``pad``.
            # NOTE(review): presumably a Hugging Face tokenizer — confirm.
        input_texts: Texts handed to the tokenizer in one call.
        eod_id: Token id appended to the end of each encoded sequence.
        max_length: Length budget; the tokenizer truncates to ``max_length - 2``.

    Returns:
        Whatever ``tokenizer.pad(..., padding=True, return_tensors="pt")`` yields.
    """
    encoded = tokenizer(
        input_texts,
        padding=False,
        truncation=True,
        max_length=max_length - 2,
    )
    # Extend ids and mask in lockstep so the appended token is attended to.
    for token_ids, mask in zip(encoded["input_ids"], encoded["attention_mask"]):
        token_ids.append(eod_id)
        mask.append(1)
    return tokenizer.pad(encoded, padding=True, return_tensors="pt")
36
+
37
class QwenEmbedder:
    """Text embedder backed by the ``Qwen/Qwen3-Embedding-0.6B`` checkpoint.

    Texts are tokenized with left padding, run through the model, pooled with
    ``last_token_pool``, shortened to ``embedding_dim`` components and
    L2-normalised.

    Attributes are plain instance fields: ``tokenizer``, ``model``, ``eod_id``,
    ``max_length``, ``embedding_dim`` and ``projection``.
    """

    MODEL_NAME = 'Qwen/Qwen3-Embedding-0.6B'

    def __init__(self, embedding_dim=768):
        """Load tokenizer and model and validate the requested output width.

        Args:
            embedding_dim: Number of leading embedding components to keep.
                Must be between 1 and the model's hidden size.

        Raises:
            ValueError: If ``embedding_dim`` is outside that range.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL_NAME, padding_side='left')
        self.model = AutoModel.from_pretrained(self.MODEL_NAME)
        # NOTE: on a GPU the model can instead be loaded with
        # attn_implementation="flash_attention_2", torch_dtype=torch.float16
        # and moved with .cuda() for better throughput.
        self.eod_id = self.tokenizer.convert_tokens_to_ids("<|endoftext|>")
        self.max_length = 8192

        # Take the real width from the loaded config instead of assuming 768:
        # a hard-coded Linear(768, dim) does not match the pooled hidden state
        # and fails with a shape error as soon as a non-default size is chosen.
        hidden_size = self.model.config.hidden_size
        embedding_dim = int(embedding_dim)  # UI sliders may deliver floats
        if not 1 <= embedding_dim <= hidden_size:
            raise ValueError(
                f"embedding_dim must be between 1 and {hidden_size}, got {embedding_dim}"
            )
        self.embedding_dim = embedding_dim
        # Kept only so existing attribute access keeps working. Reduction is
        # done by truncation, not by an untrained random linear layer.
        self.projection = None

    def get_embeddings(self, texts: List[str], with_instruction: bool = False) -> Tensor:
        """Embed ``texts`` and return one unit-length row per text.

        Args:
            texts: Texts to embed.
            with_instruction: When true, each text is first wrapped with a
                generic task instruction via ``get_detailed_instruct``.

        Returns:
            Tensor with one L2-normalised row per input text, holding at most
            ``embedding_dim`` components.
        """
        if with_instruction:
            task = 'Process and understand the following text'
            texts = [get_detailed_instruct(task, text) for text in texts]

        batch_dict = tokenize(self.tokenizer, texts, self.eod_id, self.max_length)
        batch_dict.to(self.model.device)

        with torch.no_grad():
            outputs = self.model(**batch_dict)
            embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])

            # Keep the leading components, then renormalise. Per the model card
            # the checkpoint supports reduced output dimensions this way (MRL).
            if self.embedding_dim < embeddings.shape[1]:
                embeddings = embeddings[:, :self.embedding_dim]

            embeddings = F.normalize(embeddings, p=2, dim=1)

        return embeddings
70
+
71
def compute_similarity(embedder: QwenEmbedder, text1: str, text2: str) -> float:
    """Return the cosine similarity of two texts, rounded to three decimals.

    Args:
        embedder: Object whose ``get_embeddings`` returns one row per text.
        text1: First text.
        text2: Second text.
    """
    pair = embedder.get_embeddings([text1, text2])
    # Slices (not plain indexing) keep the leading axis cosine_similarity needs.
    first_row, second_row = pair[:1], pair[1:2]
    score = torch.cosine_similarity(first_row, second_row)
    return round(score.item(), 3)
75
+
76
def rerank_documents(embedder: QwenEmbedder, query: str, documents: str) -> List[Tuple[str, float]]:
    """Rank newline-separated documents by similarity to an instructed query.

    Args:
        embedder: Object whose ``get_embeddings`` returns one row per text.
        query: Search query; wrapped with a retrieval instruction before embedding.
        documents: Documents, one per line. Blank lines are ignored.

    Returns:
        ``(document, score)`` pairs sorted by descending score, with scores
        rounded to three decimals. Empty when no non-blank document is given.
    """
    docs_list = [doc.strip() for doc in documents.split('\n') if doc.strip()]
    # Nothing to rank: return early rather than feed an empty batch to the model.
    if not docs_list:
        return []

    # Only the query gets the instruction prefix; documents are embedded as-is.
    task = 'Given a search query, retrieve relevant passages that answer the query'
    query_with_instruct = get_detailed_instruct(task, query)

    query_embedding = embedder.get_embeddings([query_with_instruct])
    doc_embeddings = embedder.get_embeddings(docs_list)

    # One dot product per document against the single query row.
    scores = (query_embedding @ doc_embeddings.T).squeeze(0).tolist()
    ranked = sorted(zip(docs_list, scores), key=lambda pair: pair[1], reverse=True)

    return [(doc, round(score, 3)) for doc, score in ranked]
93
+
94
def process_batch_embeddings(embedder: QwenEmbedder, texts: str) -> pd.DataFrame:
    """Build the pairwise similarity table for newline-separated texts.

    Args:
        embedder: Object whose ``get_embeddings`` returns one row per text.
        texts: Texts, one per line. Blank lines are ignored.

    Returns:
        Square DataFrame labelled by the texts on both axes, rounded to three
        decimals. An empty DataFrame when no non-blank text is supplied.
    """
    text_list = []
    for raw_line in texts.split('\n'):
        cleaned = raw_line.strip()
        if cleaned:
            text_list.append(cleaned)

    if not text_list:
        return pd.DataFrame()

    vectors = embedder.get_embeddings(text_list)
    # Every row against every row.
    pairwise = (vectors @ vectors.T).cpu().numpy()

    table = pd.DataFrame(pairwise, index=text_list, columns=text_list)
    return table.round(3)
110
+
111
def process_retrieval(embedder: QwenEmbedder, task_prompt: str, queries: str, documents: str) -> pd.DataFrame:
    """Score every query against every document.

    Args:
        embedder: Object whose ``get_embeddings`` returns one row per text.
        task_prompt: Instruction prepended to each query via ``get_detailed_instruct``.
        queries: Queries, one per line. Blank lines are ignored.
        documents: Documents, one per line. Blank lines are ignored.

    Returns:
        DataFrame with queries as rows and documents as columns, rounded to
        three decimals. An empty DataFrame if either list is empty.
    """
    def _non_blank_lines(block: str) -> list:
        # Split on newlines, trim, and drop lines that end up empty.
        return [line.strip() for line in block.split('\n') if line.strip()]

    query_list = _non_blank_lines(queries)
    doc_list = _non_blank_lines(documents)
    if not (query_list and doc_list):
        return pd.DataFrame()

    # Queries carry the task instruction; documents are embedded unchanged.
    prompted = [get_detailed_instruct(task_prompt, query) for query in query_list]
    query_vectors = embedder.get_embeddings(prompted)
    doc_vectors = embedder.get_embeddings(doc_list)

    relevance = (query_vectors @ doc_vectors.T).cpu().numpy()
    return pd.DataFrame(relevance, index=query_list, columns=doc_list).round(3)
132
+
133
def process_cross_lingual(embedder: QwenEmbedder, arabic_text: str, english_text: str) -> dict:
    """Compare an Arabic and an English text by cosine similarity.

    Args:
        embedder: Object whose ``get_embeddings`` returns one row per text.
        arabic_text: First text of the pair.
        english_text: Second text of the pair.

    Returns:
        ``{"similarity": score}`` with the score rounded to three decimals.
    """
    pair = embedder.get_embeddings([arabic_text, english_text])
    score = torch.cosine_similarity(pair[:1], pair[1:2]).item()
    return dict(similarity=round(score, 3))
138
+
139
def classify_text(embedder: QwenEmbedder, text: str, categories: str) -> List[Tuple[str, float]]:
    """Rank newline-separated category labels by similarity to ``text``.

    Args:
        embedder: Object whose ``get_embeddings`` returns one row per text.
        text: Text to classify.
        categories: Candidate labels, one per line. Blank lines are ignored.

    Returns:
        ``(category, score)`` pairs sorted by descending score, with scores
        rounded to three decimals. Empty when no category is given.
    """
    cat_list = [c.strip() for c in categories.split('\n') if c.strip()]
    # No labels to compare against: skip the model call instead of crashing.
    if not cat_list:
        return []

    text_embedding = embedder.get_embeddings([text])
    cat_embeddings = embedder.get_embeddings(cat_list)

    scores = (text_embedding @ cat_embeddings.T).squeeze(0).tolist()
    ranked = sorted(zip(cat_list, scores), key=lambda pair: pair[1], reverse=True)
    return [(cat, round(score, 3)) for cat, score in ranked]
147
+
148
def cluster_documents(embedder: QwenEmbedder, documents: str, num_clusters: int) -> pd.DataFrame:
    """Cluster newline-separated documents with k-means over their embeddings.

    Args:
        embedder: Object whose ``get_embeddings`` returns one row per text.
        documents: Documents, one per line. Blank lines are ignored.
        num_clusters: Desired number of clusters. Coerced to ``int``.

    Returns:
        DataFrame with columns ``Document``, ``Cluster`` and
        ``Cluster Center Document`` (the member closest to its cluster centre),
        sorted by cluster. An empty DataFrame when ``num_clusters`` is below 1
        or exceeds the number of documents.
    """
    from sklearn.cluster import KMeans

    num_clusters = int(num_clusters)  # a UI slider may deliver a float
    doc_list = [doc.strip() for doc in documents.split('\n') if doc.strip()]
    if num_clusters < 1 or len(doc_list) < num_clusters:
        return pd.DataFrame()

    # Embed once and reuse the rows below. Running each cluster through the
    # model a second time is wasted work, and comparing those tensors with a
    # CPU-built centre tensor breaks when the model sits on another device.
    vectors = embedder.get_embeddings(doc_list).cpu().numpy()

    # n_init is pinned so results do not depend on the scikit-learn default.
    kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
    clusters = kmeans.fit_predict(vectors)

    # Representative document per cluster: the member best aligned with the
    # centre. get_embeddings L2-normalises its rows, so ordering members by
    # dot product gives the same winner as ordering them by cosine similarity.
    representatives = {}
    for label, center in enumerate(kmeans.cluster_centers_):
        members = [idx for idx, assigned in enumerate(clusters) if assigned == label]
        if not members:
            # Only labels that occur in `clusters` are looked up below.
            continue
        closeness = vectors[members] @ center
        representatives[label] = doc_list[members[int(closeness.argmax())]]

    df = pd.DataFrame({
        'Document': doc_list,
        'Cluster': clusters,
        'Cluster Center Document': [representatives[int(label)] for label in clusters],
    })
    return df.sort_values('Cluster')
179
+
180
def analyze_sentiment(embedder: QwenEmbedder, text: str) -> Tuple[str, dict]:
    """Score ``text`` against five fixed Arabic sentiment anchor sentences.

    Args:
        embedder: Object whose ``get_embeddings`` returns one row per text.
        text: Text to analyse.

    Returns:
        ``(best_label, scores)`` where ``best_label`` is the highest-scoring
        anchor key and ``scores`` maps every anchor key to its score rounded to
        three decimals, in descending score order.
    """
    # Reference sentences, one per sentiment level.
    anchors = {
        "very_positive": "هذا رائع جداً ومدهش! أنا سعيد للغاية",
        "positive": "هذا جيد وممتع",
        "neutral": "هذا عادي ومقبول",
        "negative": "هذا سيء ومزعج",
        "very_negative": "هذا فظيع جداً ومحبط للغاية"
    }
    labels = list(anchors.keys())
    anchor_sentences = list(anchors.values())

    text_vector = embedder.get_embeddings([text])
    anchor_vectors = embedder.get_embeddings(anchor_sentences)

    # One score per anchor, paired with its label and ranked best-first.
    raw_scores = (text_vector @ anchor_vectors.T).squeeze(0).tolist()
    ranked = sorted(zip(labels, raw_scores), key=lambda item: item[1], reverse=True)

    best_label = ranked[0][0]
    score_table = {}
    for label, value in ranked:
        score_table[label] = round(float(value), 3)
    return (best_label, score_table)
204
+
205
def extract_concepts(embedder: QwenEmbedder, text: str, concept_type: str) -> List[Tuple[str, float]]:
    """Rank a fixed list of Arabic concept phrases by similarity to ``text``.

    Args:
        embedder: Object whose ``get_embeddings`` returns one row per text.
        text: Text to analyse.
        concept_type: One of ``"emotions"``, ``"topics"`` or ``"themes"``.
            Any other value falls back to the ``"topics"`` list.

    Returns:
        ``(concept, score)`` pairs sorted by descending score, with scores
        rounded to three decimals.
    """
    # Candidate phrases for each supported concept family.
    concept_anchors = {
        "emotions": [
            "الفرح والسعادة",
            "الحزن والأسى",
            "الغضب والإحباط",
            "الخوف والقلق",
            "الحب والعاطفة",
            "الأمل والتفاؤل"
        ],
        "topics": [
            "السياسة والحكم",
            "الاقتصاد والمال",
            "العلوم والتكنولوجيا",
            "الفن والثقافة",
            "الرياضة والترفيه",
            "التعليم والمعرفة"
        ],
        "themes": [
            "العدالة والمساواة",
            "التقدم والتطور",
            "التقاليد والتراث",
            "الحرية والاستقلال",
            "التعاون والوحدة",
            "الإبداع والابتكار"
        ]
    }

    if concept_type in concept_anchors:
        anchors = concept_anchors[concept_type]
    else:
        anchors = concept_anchors["topics"]

    text_vector = embedder.get_embeddings([text])
    anchor_vectors = embedder.get_embeddings(anchors)

    # One score per candidate phrase, ranked best-first.
    raw_scores = (text_vector @ anchor_vectors.T).squeeze(0)
    ranked = sorted(
        ((phrase, float(value)) for phrase, value in zip(anchors, raw_scores)),
        key=lambda item: item[1],
        reverse=True,
    )
    return [(phrase, round(value, 3)) for phrase, value in ranked]
246
+
247
+ # Add a function to reinitialize embedder with new dimension
248
def reinitialize_embedder(dim: int) -> str:
    """Replace the module-level ``embedder`` with one built for ``dim``.

    Args:
        dim: Embedding dimension passed to ``QwenEmbedder``.

    Returns:
        A status message naming the new dimension. The function returns this
        string, not the embedder, so the annotation is ``str``.
    """
    global embedder
    embedder = QwenEmbedder(embedding_dim=dim)
    return f"Embedder reinitialized with dimension: {dim}"
252
+
253
+ # Initialize the embedder with default dimension
254
+ embedder = QwenEmbedder()
255
+
256
+ # Update the CSS to improve feature visibility
257
+ custom_css = """
258
+ :root {
259
+ --primary-color: #2196F3;
260
+ --secondary-color: #1976D2;
261
+ --background-color: #f8f9fa;
262
+ --sidebar-bg: #ffffff;
263
+ --text-color: #333333;
264
+ --border-color: #e0e0e0;
265
+ }
266
+
267
+ .container {
268
+ max-width: 1200px;
269
+ margin: auto;
270
+ padding: 20px;
271
+ }
272
+
273
+ .sidebar {
274
+ background-color: var(--sidebar-bg);
275
+ border-right: 1px solid var(--border-color);
276
+ padding: 20px;
277
+ margin-right: 20px;
278
+ position: sticky;
279
+ top: 0;
280
+ height: 100vh;
281
+ overflow-y: auto;
282
+ }
283
+
284
+ .main-content {
285
+ background-color: var(--background-color);
286
+ padding: 20px;
287
+ border-radius: 10px;
288
+ }
289
+
290
+ .features-grid {
291
+ display: grid;
292
+ grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
293
+ gap: 20px;
294
+ margin: 20px 0;
295
+ }
296
+
297
+ .feature-card {
298
+ background: white;
299
+ padding: 20px;
300
+ border-radius: 8px;
301
+ box-shadow: 0 2px 4px rgba(0,0,0,0.1);
302
+ transition: all 0.3s ease;
303
+ border: 1px solid var(--border-color);
304
+ }
305
+
306
+ .feature-card:hover {
307
+ transform: translateY(-5px);
308
+ box-shadow: 0 4px 8px rgba(0,0,0,0.2);
309
+ border-color: var(--primary-color);
310
+ }
311
+
312
+ .feature-icon {
313
+ font-size: 28px;
314
+ margin-bottom: 15px;
315
+ color: var(--primary-color);
316
+ }
317
+
318
+ .feature-card h3 {
319
+ color: var(--text-color);
320
+ margin: 10px 0;
321
+ font-size: 1.1em;
322
+ }
323
+
324
+ .feature-card p {
325
+ color: #666;
326
+ font-size: 0.9em;
327
+ line-height: 1.4;
328
+ }
329
+
330
+ .features-summary {
331
+ margin: 40px 0;
332
+ padding: 30px;
333
+ background: white;
334
+ border-radius: 12px;
335
+ box-shadow: 0 2px 8px rgba(0,0,0,0.1);
336
+ }
337
+
338
+ .features-summary h2 {
339
+ color: var(--text-color);
340
+ margin-bottom: 25px;
341
+ text-align: center;
342
+ font-size: 1.5em;
343
+ }
344
+
345
+ .feature-list {
346
+ display: grid;
347
+ grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
348
+ gap: 30px;
349
+ }
350
+
351
+ .feature-group {
352
+ padding: 20px;
353
+ background: var(--background-color);
354
+ border-radius: 8px;
355
+ border: 1px solid var(--border-color);
356
+ }
357
+
358
+ .feature-group h3 {
359
+ color: var(--primary-color);
360
+ margin-bottom: 15px;
361
+ font-size: 1.2em;
362
+ }
363
+
364
+ .feature-group ul {
365
+ list-style: none;
366
+ padding: 0;
367
+ margin: 0;
368
+ }
369
+
370
+ .feature-group li {
371
+ padding: 8px 0;
372
+ color: var(--text-color);
373
+ position: relative;
374
+ padding-left: 20px;
375
+ }
376
+
377
+ .feature-group li:before {
378
+ content: "•";
379
+ color: var(--primary-color);
380
+ position: absolute;
381
+ left: 0;
382
+ }
383
+
384
+ .description {
385
+ margin: 20px 0;
386
+ padding: 15px;
387
+ border-radius: 8px;
388
+ background-color: #ffffff;
389
+ box-shadow: 0 2px 4px rgba(0,0,0,0.1);
390
+ }
391
+
392
+ .example {
393
+ margin: 10px 0;
394
+ padding: 15px;
395
+ border-left: 4px solid var(--primary-color);
396
+ background-color: #ffffff;
397
+ box-shadow: 0 2px 4px rgba(0,0,0,0.1);
398
+ }
399
+
400
+ .warning {
401
+ color: #721c24;
402
+ background-color: #f8d7da;
403
+ border: 1px solid #f5c6cb;
404
+ padding: 15px;
405
+ border-radius: 8px;
406
+ margin: 10px 0;
407
+ }
408
+
409
+ .settings {
410
+ background-color: #ffffff;
411
+ padding: 20px;
412
+ border-radius: 8px;
413
+ box-shadow: 0 2px 4px rgba(0,0,0,0.1);
414
+ margin: 20px 0;
415
+ }
416
+
417
+ .tab-content {
418
+ padding: 20px;
419
+ background-color: #ffffff;
420
+ border-radius: 8px;
421
+ box-shadow: 0 2px 4px rgba(0,0,0,0.1);
422
+ }
423
+
424
+ .heading {
425
+ color: var(--text-color);
426
+ margin-bottom: 20px;
427
+ padding-bottom: 10px;
428
+ border-bottom: 2px solid var(--primary-color);
429
+ }
430
+
431
+ button.primary {
432
+ background-color: var(--primary-color) !important;
433
+ }
434
+
435
+ button.secondary {
436
+ background-color: var(--secondary-color) !important;
437
+ }
438
+ """
439
+
440
+ # Create the Gradio interface
441
+ with gr.Blocks(title="Advanced Text Processing with Qwen", css=custom_css, theme=gr.themes.Soft()) as demo:
442
+ # Store embedder in state
443
+ state = gr.State(embedder)
444
+
445
+ with gr.Row():
446
+ # Sidebar
447
+ with gr.Column(scale=1, elem_classes="sidebar"):
448
+ gr.Markdown("""
449
+ # Qwen Embeddings
450
+
451
+ ### Navigation
452
+ - [Configuration](#configuration)
453
+ - [Features](#features)
454
+ - [Documentation](#documentation)
455
+ """)
456
+
457
+ with gr.Accordion("Configuration", open=True):
458
+ gr.Markdown("""
459
+ ### Model Settings
460
+ Configure the embedding model parameters below.
461
+ """)
462
+
463
+ embedding_dim = gr.Slider(
464
+ minimum=32,
465
+ maximum=1024,
466
+ value=768,
467
+ step=32,
468
+ label="Embedding Dimension",
469
+ elem_classes="settings"
470
+ )
471
+ update_dim_btn = gr.Button("Update Dimension", variant="secondary")
472
+ dim_status = gr.Textbox(label="Status", interactive=False)
473
+
474
+ with gr.Accordion("Documentation", open=False):
475
+ gr.Markdown("""
476
+ ### Usage Guide
477
+
478
+ 1. **Embedding Dimension**
479
+ - 32-128: Fast, simple tasks
480
+ - 256-512: Balanced performance
481
+ - 768: Default, full model
482
+ - 1024: Maximum detail
483
+
484
+ 2. **Best Practices**
485
+ - Use appropriate dimensions for your task
486
+ - Consider batch size for multiple documents
487
+ - Test different settings for optimal results
488
+ """)
489
+
490
+ # Main Content
491
+ with gr.Column(scale=4):
492
+ gr.Markdown("""
493
+ # Advanced Text Processing Suite
494
+
495
+ Welcome to the Advanced Text Processing Suite powered by Qwen Embeddings.
496
+ This tool provides state-of-the-art text analysis capabilities with support for Arabic and multiple languages.
497
+ """)
498
+
499
+ # Feature Grid
500
+ gr.HTML("""
501
+ <div class="features-grid">
502
+ <div class="feature-card">
503
+ <div class="feature-icon">🔄</div>
504
+ <h3>Text Similarity</h3>
505
+ <p>Compare semantic meaning between texts</p>
506
+ </div>
507
+ <div class="feature-card">
508
+ <div class="feature-icon">🔍</div>
509
+ <h3>Semantic Search</h3>
510
+ <p>Find relevant documents by meaning</p>
511
+ </div>
512
+ <div class="feature-card">
513
+ <div class="feature-icon">📊</div>
514
+ <h3>Batch Analysis</h3>
515
+ <p>Process multiple texts simultaneously</p>
516
+ </div>
517
+ <div class="feature-card">
518
+ <div class="feature-icon">🎯</div>
519
+ <h3>Multi-Query Retrieval</h3>
520
+ <p>Match queries with relevant documents</p>
521
+ </div>
522
+ <div class="feature-card">
523
+ <div class="feature-icon">🌐</div>
524
+ <h3>Cross-Lingual</h3>
525
+ <p>Match meaning across languages</p>
526
+ </div>
527
+ <div class="feature-card">
528
+ <div class="feature-icon">🏷️</div>
529
+ <h3>Text Classification</h3>
530
+ <p>Categorize text into predefined classes</p>
531
+ </div>
532
+ <div class="feature-card">
533
+ <div class="feature-icon">🔮</div>
534
+ <h3>Document Clustering</h3>
535
+ <p>Group similar documents together</p>
536
+ </div>
537
+ <div class="feature-card">
538
+ <div class="feature-icon">😊</div>
539
+ <h3>Sentiment Analysis</h3>
540
+ <p>Analyze emotional content in text</p>
541
+ </div>
542
+ <div class="feature-card">
543
+ <div class="feature-icon">🎨</div>
544
+ <h3>Concept Extraction</h3>
545
+ <p>Identify key themes and topics</p>
546
+ </div>
547
+ </div>
548
+
549
+ <div class="features-summary">
550
+ <h2>Advanced Features</h2>
551
+ <div class="feature-list">
552
+ <div class="feature-group">
553
+ <h3>Text Analysis</h3>
554
+ <ul>
555
+ <li>Semantic similarity scoring</li>
556
+ <li>Cross-language understanding</li>
557
+ <li>Batch text processing</li>
558
+ <li>Emotion detection</li>
559
+ </ul>
560
+ </div>
561
+ <div class="feature-group">
562
+ <h3>Document Processing</h3>
563
+ <ul>
564
+ <li>Smart document search</li>
565
+ <li>Automated clustering</li>
566
+ <li>Theme extraction</li>
567
+ <li>Content categorization</li>
568
+ </ul>
569
+ </div>
570
+ <div class="feature-group">
571
+ <h3>Model Configuration</h3>
572
+ <ul>
573
+ <li>Adjustable embedding dimensions</li>
574
+ <li>GPU acceleration support</li>
575
+ <li>Batch size optimization</li>
576
+ <li>Multi-language support</li>
577
+ </ul>
578
+ </div>
579
+ </div>
580
+ </div>
581
+ """)
582
+
583
+ with gr.Tabs() as tabs:
584
+ # Text Similarity Tab
585
+ with gr.Tab("Text Similarity Analysis"):
586
+ with gr.Column(elem_classes="tab-content"):
587
+ gr.Markdown("""
588
+ ### Text Similarity Analysis
589
+ Compare the semantic similarity between two texts. The score ranges from 0 (completely different) to 1 (identical meaning).
590
+
591
+ <div class="example">
592
+ <strong>Try these Arabic examples:</strong><br>
593
+ • "أحب القراءة كثيراً" and "القراءة من أحب هواياتي"<br>
594
+ • "السماء صافية اليوم" and "الطقس حار جداً"
595
+ </div>
596
+ """)
597
+
598
+ with gr.Row():
599
+ text1 = gr.Textbox(
600
+ label="First Text",
601
+ lines=3,
602
+ placeholder="Enter first text here...",
603
+ value="أحب القراءة كثيراً"
604
+ )
605
+ text2 = gr.Textbox(
606
+ label="Second Text",
607
+ lines=3,
608
+ placeholder="Enter second text here...",
609
+ value="القراءة من أحب هواياتي"
610
+ )
611
+ similarity_btn = gr.Button("Calculate Similarity", variant="primary")
612
+ similarity_score = gr.Number(label="Similarity Score")
613
+
614
# A gr.State listed in `inputs` reaches the callback as the stored object
# itself (here the embedder), so it is used directly. It has no `.value`.
similarity_btn.click(
    fn=lambda t1, t2, s: compute_similarity(s, t1, t2),
    inputs=[text1, text2, state],
    outputs=similarity_score
)
619
+
620
+ # Document Reranking Tab
621
+ with gr.Tab("Semantic Search & Reranking"):
622
+ with gr.Column(elem_classes="tab-content"):
623
+ gr.Markdown("""
624
+ ### Semantic Search & Document Reranking
625
+ Search through a collection of documents and rank them by semantic relevance to your query.
626
+
627
+ <div class="example">
628
+ <strong>Try these Arabic queries:</strong><br>
629
+ • "ما هي عواصم الدول العربية؟"<br>
630
+ • "أين تقع أكبر المدن العربية؟"<br>
631
+ • "ما هي المراكز الثقافية العربية؟"
632
+ </div>
633
+ """)
634
+
635
+ query_text = gr.Textbox(
636
+ label="Search Query",
637
+ placeholder="Enter your search query...",
638
+ value="ما هي عواصم الدول العربية؟"
639
+ )
640
+ documents_text = gr.Textbox(
641
+ label="Documents Collection (one per line)",
642
+ lines=10,
643
+ placeholder="Enter documents here, one per line...",
644
+ value="""القاهرة هي عاصمة جمهورية مصر العربية وأكبر مدنها.
645
+ الرياض هي عاصمة المملكة العربية السعودية ومركزها الاقتصادي.
646
+ دمشق هي أقدم عاصمة مأهولة في التاريخ وهي عاصمة سوريا.
647
+ بغداد عاصمة العراق وتقع على نهر دجلة.
648
+ الدار البيضاء أكبر مدن المغرب وعاصمته الاقتصادية.
649
+ تونس هي عاصمة الجمهورية التونسية ومركزها الثقافي."""
650
+ )
651
+ rerank_btn = gr.Button("Search & Rank", variant="primary")
652
+ rerank_results = gr.Dataframe(
653
+ headers=["Document", "Relevance Score"],
654
+ label="Search Results"
655
+ )
656
+
657
# A gr.State listed in `inputs` reaches the callback as the stored object
# itself (here the embedder), so it is used directly. It has no `.value`.
rerank_btn.click(
    fn=lambda q, d, s: rerank_documents(s, q, d),
    inputs=[query_text, documents_text, state],
    outputs=rerank_results
)
662
+
663
+ # Batch Analysis Tab
664
+ with gr.Tab("Batch Similarity Analysis"):
665
+ with gr.Column(elem_classes="tab-content"):
666
+ gr.Markdown("""
667
+ ### Batch Similarity Analysis
668
+ Analyze semantic relationships between multiple texts simultaneously.
669
+
670
+ <div class="example">
671
+ <strong>The example shows Arabic proverbs about friendship:</strong><br>
672
+ See how the model captures the semantic relationships between similar themes.
673
+ </div>
674
+ """)
675
+
676
+ batch_texts = gr.Textbox(
677
+ label="Input Texts (one per line)",
678
+ lines=10,
679
+ placeholder="Enter texts here, one per line...",
680
+ value="""الصديق وقت الضيق.
681
+ الصديق الحقيقي يظهر عند الشدائد.
682
+ عند المحن تعرف إخوانك.
683
+ وقت الشدة بتعرف صحابك.
684
+ الصاحب ساحب."""
685
+ )
686
+ process_btn = gr.Button("Analyze Relationships", variant="primary")
687
+ similarity_matrix = gr.Dataframe(
688
+ label="Similarity Matrix",
689
+ wrap=True
690
+ )
691
+
692
# A gr.State listed in `inputs` reaches the callback as the stored object
# itself (here the embedder), so it is used directly. It has no `.value`.
process_btn.click(
    fn=lambda t, s: process_batch_embeddings(s, t),
    inputs=[batch_texts, state],
    outputs=[similarity_matrix]
)
697
+
698
+ # Add new Retrieval Tab
699
+ with gr.Tab("Multi-Query Retrieval"):
700
+ with gr.Column(elem_classes="tab-content"):
701
+ gr.Markdown("""
702
+ ### Multi-Query Document Retrieval
703
+ Match multiple queries against multiple documents simultaneously using semantic search.
704
+
705
+ <div class="description">
706
+ This tab implements the exact retrieval logic from the Qwen example, allowing you to:
707
+ - Define a custom task prompt
708
+ - Input multiple queries
709
+ - Input multiple documents
710
+ - See all query-document match scores in a matrix
711
+ </div>
712
+
713
+ <div class="example">
714
+ <strong>Try these examples:</strong><br>
715
+ <strong>Task prompt:</strong> "Given a web search query, retrieve relevant passages that answer the query"<br>
716
+ <strong>Queries:</strong>
717
+ • "ما هي أكبر المدن العربية؟"
718
+ • "أين تقع أهم المراكز الثقافية؟"<br>
719
+ <strong>Documents:</strong> Use the example documents or add your own
720
+ </div>
721
+ """)
722
+
723
+ task_prompt = gr.Textbox(
724
+ label="Task Prompt",
725
+ placeholder="Enter the task description here...",
726
+ value="Given a web search query, retrieve relevant passages that answer the query",
727
+ lines=2
728
+ )
729
+
730
+ with gr.Row():
731
+ queries_text = gr.Textbox(
732
+ label="Queries (one per line)",
733
+ placeholder="Enter your queries here, one per line...",
734
+ value="""ما هي أكبر المدن العربية؟
735
+ أين تقع أهم المراكز الثقافية؟""",
736
+ lines=5
737
+ )
738
+ documents_text = gr.Textbox(
739
+ label="Documents (one per line)",
740
+ placeholder="Enter your documents here, one per line...",
741
+ value="""القاهرة هي أكبر مدينة عربية وعاصمة مصر، وتضم العديد من المعالم الثقافية والتاريخية.
742
+ الرياض عاصمة المملكة العربية السعودية ومركز ثقافي واقتصادي مهم.
743
+ دبي مدينة عالمية في الإمارات العربية المتحدة ومركز تجاري رئيسي.
744
+ بيروت عاصمة لبنان ومركز ثقافي مهم في العالم العربي.""",
745
+ lines=5
746
+ )
747
+
748
+ retrieve_btn = gr.Button("Process Retrieval", variant="primary")
749
+ retrieval_matrix = gr.Dataframe(
750
+ label="Query-Document Relevance Matrix",
751
+ wrap=True
752
+ )
753
+
754
+ gr.Markdown("""
755
+ <div class="description">
756
+ <strong>How to read the results:</strong>
757
+ - Each row represents a query
758
+ - Each column represents a document
759
+ - Values show the relevance score (0-1) between each query-document pair
760
+ - Higher scores indicate better matches
761
+ </div>
762
+ """)
763
+
764
# A gr.State listed in `inputs` reaches the callback as the stored object
# itself (here the embedder), so it is used directly. It has no `.value`.
retrieve_btn.click(
    fn=lambda p, q, d, s: process_retrieval(s, p, q, d),
    inputs=[task_prompt, queries_text, documents_text, state],
    outputs=[retrieval_matrix]
)
769
+
770
+ # Add Cross-Lingual Tab after the Multi-Query Retrieval tab
771
+ with gr.Tab("Cross-Lingual Matching"):
772
+ with gr.Column(elem_classes="tab-content"):
773
+ gr.Markdown("""
774
+ ### Cross-Lingual Semantic Matching
775
+ Compare the meaning of texts across Arabic and English languages.
776
+
777
+ <div class="description">
778
+ This feature demonstrates the model's ability to understand semantic similarity across different languages.
779
+ Try comparing similar concepts expressed in Arabic and English to see how well the model captures cross-lingual meaning.
780
+ </div>
781
+
782
+ <div class="example">
783
+ <strong>Try these examples:</strong><br>
784
+ <strong>Arabic:</strong> "القراءة غذاء العقل والروح"<br>
785
+ <strong>English:</strong> "Reading nourishes the mind and soul"<br>
786
+ Or try your own pairs of semantically similar texts in both languages.
787
+ </div>
788
+ """)
789
+
790
+ with gr.Row():
791
+ arabic_text = gr.Textbox(
792
+ label="Arabic Text",
793
+ placeholder="Enter Arabic text here...",
794
+ value="القراءة غذاء العقل والروح",
795
+ lines=3
796
+ )
797
+ english_text = gr.Textbox(
798
+ label="English Text",
799
+ placeholder="Enter English text here...",
800
+ value="Reading nourishes the mind and soul",
801
+ lines=3
802
+ )
803
+
804
+ match_btn = gr.Button("Compare Texts", variant="primary")
805
+ with gr.Row():
806
+ cross_lingual_score = gr.Number(
807
+ label="Cross-Lingual Similarity Score",
808
+ value=None
809
+ )
810
+
811
+ gr.Markdown("""
812
+ <div class="description">
813
+ <strong>Understanding the score:</strong>
814
+ - Score ranges from 0 (completely different meaning) to 1 (same meaning)
815
+ - Scores above 0.7 usually indicate strong semantic similarity
816
+ - The model considers the meaning, not just word-for-word translation
817
+ </div>
818
+ """)
819
+
820
# A gr.State listed in `inputs` reaches the callback as the stored object
# itself (here the embedder), so it is used directly. It has no `.value`.
# Only the numeric score is forwarded to the Number output.
match_btn.click(
    fn=lambda a, e, s: process_cross_lingual(s, a, e)["similarity"],
    inputs=[arabic_text, english_text, state],
    outputs=[cross_lingual_score]
)
825
+
826
+ # Add Text Classification Tab
827
+ with gr.Tab("Text Classification"):
828
+ with gr.Column(elem_classes="tab-content"):
829
+ gr.Markdown("""
830
+ ### Text Classification
831
+ Classify text into predefined categories using semantic similarity.
832
+
833
+ <div class="description">
834
+ The model will compare your text against each category and rank them by relevance.
835
+ You can define your own categories or use the provided examples.
836
+ </div>
837
+ """)
838
+
839
+ input_text = gr.Textbox(
840
+ label="Input Text",
841
+ placeholder="Enter the text to classify...",
842
+ value="الذكاء الاصطناعي يغير طريقة عملنا وتفكيرنا في المستقبل",
843
+ lines=3
844
+ )
845
+
846
+ categories_text = gr.Textbox(
847
+ label="Categories (one per line)",
848
+ placeholder="Enter categories here...",
849
+ value="""التكنولوجيا والابتكار
850
+ الاقتصاد والأعمال
851
+ التعليم والتدريب
852
+ الثقافة والفنون
853
+ الصحة والطب""",
854
+ lines=5
855
+ )
856
+
857
+ classify_btn = gr.Button("Classify Text", variant="primary")
858
+ classification_results = gr.Dataframe(
859
+ headers=["Category", "Relevance Score"],
860
+ label="Classification Results"
861
+ )
862
+
863
# A gr.State listed in `inputs` reaches the callback as the stored object
# itself (here the embedder), so it is used directly. It has no `.value`.
classify_btn.click(
    fn=lambda t, c, s: classify_text(s, t, c),
    inputs=[input_text, categories_text, state],
    outputs=classification_results
)
868
+
869
+ # Add Document Clustering Tab
870
+ with gr.Tab("Document Clustering"):
871
+ with gr.Column(elem_classes="tab-content"):
872
+ gr.Markdown("""
873
+ ### Document Clustering
874
+ Group similar documents together using semantic clustering.
875
+
876
+ <div class="description">
877
+ This feature will:
878
+ - Group similar documents into clusters
879
+ - Identify the most representative document for each cluster
880
+ - Help discover themes and patterns in your document collection
881
+ </div>
882
+ """)
883
+
884
+ cluster_docs = gr.Textbox(
885
+ label="Documents (one per line)",
886
+ placeholder="Enter documents to cluster...",
887
+ value="""الذكاء الاصطناعي يفتح آفاقاً جديدة في مجال الطب.
888
+ الروبوتات تساعد الأطباء في إجراء العمليات الجراحية.
889
+ التعلم الآلي يحسن من دقة التشخيص الطبي.
890
+ الفن يعبر عن مشاعر الإنسان وأحاسيسه.
891
+ الموسيقى لغة عالمية تتخطى حدود الثقافات.
892
+ الرسم والنحت من أقدم أشكال التعبير الفني.
893
+ التجارة الإلكترونية تغير نمط التسوق التقليدي.
894
+ التسوق عبر الإنترنت يوفر الوقت والجهد.
895
+ المتاجر الرقمية تتيح خيارات أوسع للمستهلكين.""",
896
+ lines=10
897
+ )
898
+
899
+ num_clusters = gr.Slider(
900
+ minimum=2,
901
+ maximum=10,
902
+ value=3,
903
+ step=1,
904
+ label="Number of Clusters"
905
+ )
906
+
907
+ cluster_btn = gr.Button("Cluster Documents", variant="primary")
908
+ clustering_results = gr.Dataframe(
909
+ label="Clustering Results"
910
+ )
911
+
912
# A gr.State listed in `inputs` reaches the callback as the stored object
# itself (here the embedder), so it is used directly. It has no `.value`.
cluster_btn.click(
    fn=lambda d, n, s: cluster_documents(s, d, n),
    inputs=[cluster_docs, num_clusters, state],
    outputs=clustering_results
)
917
+
918
+ # Add Sentiment Analysis Tab
919
+ with gr.Tab("Sentiment Analysis"):
920
+ with gr.Column(elem_classes="tab-content"):
921
+ gr.Markdown("""
922
+ ### Arabic Sentiment Analysis
923
+ Analyze the sentiment of Arabic text using semantic similarity to sentiment anchors.
924
+
925
+ <div class="description">
926
+ The model will compare your text against predefined sentiment anchors and determine:
927
+ - The overall sentiment
928
+ - Confidence scores for each sentiment level
929
+ </div>
930
+ """)
931
+
932
+ sentiment_text = gr.Textbox(
933
+ label="Text to Analyze",
934
+ placeholder="Enter text to analyze sentiment...",
935
+ value="هذا المشروع رائع جداً وسيحدث تغييراً إيجابياً في حياة الكثيرين",
936
+ lines=3
937
+ )
938
+
939
+ analyze_btn = gr.Button("Analyze Sentiment", variant="primary")
940
+
941
+ with gr.Row():
942
+ sentiment_label = gr.Label(label="Overall Sentiment")
943
+ sentiment_scores = gr.Json(label="Detailed Scores")
944
+
945
# A gr.State listed in `inputs` reaches the callback as the stored object
# itself (here the embedder), so it is used directly. It has no `.value`.
# The returned (label, scores) tuple fills the two outputs in order.
analyze_btn.click(
    fn=lambda t, s: analyze_sentiment(s, t),
    inputs=[sentiment_text, state],
    outputs=[sentiment_label, sentiment_scores]
)
950
+
951
+ # Add Concept Extraction Tab
952
+ with gr.Tab("Concept Extraction"):
953
+ with gr.Column(elem_classes="tab-content"):
954
+ gr.Markdown("""
955
+ ### Concept Extraction
956
+ Extract key concepts and themes from Arabic text.
957
+
958
+ <div class="description">
959
+ Analyze text to identify:
960
+ - Emotional content
961
+ - Main topics
962
+ - Underlying themes
963
+ </div>
964
+ """)
965
+
966
+ concept_text = gr.Textbox(
967
+ label="Text to Analyze",
968
+ placeholder="Enter text to analyze...",
969
+ value="نحن نؤمن بأهمية التعليم والابتكار لبناء مستقبل أفضل لأجيالنا القادمة",
970
+ lines=3
971
+ )
972
+
973
+ concept_type = gr.Radio(
974
+ choices=["emotions", "topics", "themes"],
975
+ value="themes",
976
+ label="Concept Type"
977
+ )
978
+
979
+ extract_btn = gr.Button("Extract Concepts", variant="primary")
980
+ concept_results = gr.Dataframe(
981
+ headers=["Concept", "Relevance Score"],
982
+ label="Extracted Concepts"
983
+ )
984
+
985
# A gr.State listed in `inputs` reaches the callback as the stored object
# itself (here the embedder), so it is used directly. It has no `.value`.
extract_btn.click(
    fn=lambda t, c, s: extract_concepts(s, t, c),
    inputs=[concept_text, concept_type, state],
    outputs=concept_results
)
990
+
991
+ # Fix dimension update functionality
992
def update_embedder_dim(dim, state):
    """Build an embedder for ``dim`` and hand it back as the new State value.

    Args:
        dim: Requested embedding dimension from the slider.
        state: Current State value, i.e. the embedder object in use.

    Returns:
        ``(state_value, status_message)``. On success the first item is the new
        embedder. On failure it is the unchanged ``state`` so the app keeps
        working with the previous embedder.
    """
    try:
        new_embedder = QwenEmbedder(embedding_dim=int(dim))
    except Exception as e:  # UI boundary: report the failure in the status box
        return state, f"Error updating dimension: {str(e)}"
    # The State is updated through the returned value. Setting `.value` on the
    # object received as input would only add an attribute to the old embedder.
    return new_embedder, f"Successfully updated embedding dimension to {dim}"
999
+
1000
+ update_dim_btn.click(
1001
+ fn=update_embedder_dim,
1002
+ inputs=[embedding_dim, state],
1003
+ outputs=[state, dim_status]
1004
+ )
1005
+
1006
+ # Wrap the demo creation in the spaces decorator
1007
@spaces.GPU(duration=120)
def create_demo():
    """Return the module-level Gradio ``demo`` object.

    NOTE(review): the GPU decorator wraps only this accessor; the functions
    that call the model are not decorated — confirm this is intended.
    """
    return demo
1011
+
1012
+ if __name__ == "__main__":
1013
+ demo = create_demo()
1014
+ demo.launch()
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio>=4.0.0
2
+ numpy>=1.21.0
3
+ requests>=2.26.0
4
+ scipy>=1.7.0
5
+ sentence-transformers>=2.2.0
6
+ torch>=2.0.0
7
+ scikit-learn>=1.0.0
8
+ transformers>=4.51.0
9
+ plotly>=5.18.0
10
+ pandas>=2.0.0