File size: 48,363 Bytes
f63c425 5224f4e f4c0f01 b7669f4 f4c0f01 b7669f4 f4c0f01 39d753a b7669f4 9e85002 5224f4e f4c0f01 6116543 f4c0f01 5224f4e 6116543 9e85002 f4c0f01 39d753a f63c425 f4c0f01 b7669f4 39d753a b7669f4 f63c425 39d753a b7669f4 f4c0f01 f63c425 f4c0f01 f63c425 9e85002 b7669f4 9e85002 39d753a f4c0f01 b7669f4 f4c0f01 9e85002 f4c0f01 39d753a 9e85002 6116543 9e85002 5224f4e f4c0f01 39d753a f63c425 f4c0f01 f63c425 5224f4e 9e85002 f4c0f01 39d753a f4c0f01 9e85002 f4c0f01 9e85002 f4c0f01 f63c425 f4c0f01 5224f4e 9e85002 f4c0f01 39d753a f4c0f01 9e85002 f4c0f01 9e85002 f4c0f01 5224f4e f63c425 39d753a f4c0f01 39d753a f4c0f01 b7669f4 f63c425 39d753a f4c0f01 b7669f4 f63c425 39d753a f4c0f01 b7669f4 f4c0f01 5224f4e f63c425 f4c0f01 b7669f4 f4c0f01 5224f4e 9e85002 f4c0f01 39d753a f4c0f01 b7669f4 5224f4e f4c0f01 5224f4e 39d753a 6116543 39d753a f63c425 39d753a b7669f4 39d753a 6116543 9e85002 b7669f4 6116543 39d753a 6116543 5224f4e 39d753a b7669f4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 
528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 |
# Modified app.py for Hugging Face Spaces - Vision 2030 Virtual Assistant with Enhanced Knowledge Base
import gradio as gr
import time
import logging
import os
import re
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import PyPDF2
import io
import json
from langdetect import detect
from sentence_transformers import SentenceTransformer
import faiss
import torch
import spaces
# Configure logging: records at INFO and above go to a console stream handler
# using a timestamped "time - logger - level - message" layout.
logging.basicConfig(
    handlers=[logging.StreamHandler()],
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger('vision2030_assistant')

# Probe CUDA once at import time; the embedding code consults this flag
# before moving models to the GPU.
has_gpu = torch.cuda.is_available()
logger.info(f"GPU available: {has_gpu}")
class Vision2030Assistant:
def __init__(self):
    """Set up the assistant: embedders, knowledge base, indices, eval data and metrics.

    The setup steps run in dependency order: the embedders and the text
    collections must exist before the FAISS indices are built.
    """
    logger.info("Initializing Vision 2030 Assistant...")

    setup_steps = (
        self.load_embedding_models,            # embedding models (or fallbacks)
        self._create_enhanced_knowledge_base,  # fills the English/Arabic text lists
        self._create_sample_data,              # hook for extra sample texts
        self._create_indices,                  # FAISS indices over the texts
        self._create_sample_eval_data,         # questions with reference answers
    )
    for run_step in setup_steps:
        run_step()

    # Running metrics collected while the assistant is in use.
    self.metrics = {
        "response_times": [],
        "user_ratings": [],
        "factual_accuracy": []
    }
    self.response_history = []

    # generate_response() falls back to this retrieval-based generator.
    self.original_generate_response = self._basic_generate_response

    logger.info("Vision 2030 Assistant initialized successfully")
def _create_enhanced_knowledge_base(self):
    """Build the bilingual Vision 2030 knowledge base and flatten it for indexing.

    Sets three attributes:
      * ``vision2030_knowledge`` - dict mapping a topic name to
        ``{"en": [...], "ar": [...]}`` lists of statements.
      * ``english_texts`` / ``arabic_texts`` - every statement of that
        language, in topic order, ready to be embedded.
    """
    logger.info("Creating enhanced Vision 2030 knowledge base")

    knowledge = {
        "general": {
            "en": [
                "Vision 2030 is Saudi Arabia's strategic framework to reduce dependence on oil, diversify the economy, and develop public sectors.",
                "The key pillars of Vision 2030 are a vibrant society, a thriving economy, and an ambitious nation.",
                "Vision 2030 was announced by Crown Prince Mohammed bin Salman in April 2016.",
                "The true wealth of Saudi Arabia, as mentioned in Vision 2030, is its people and their potential."
            ],
            "ar": [
                "رؤية 2030 هي الإطار الاستراتيجي للمملكة العربية السعودية للحد من الاعتماد على النفط وتنويع الاقتصاد وتطوير القطاعات العامة.",
                "الركائز الرئيسية لرؤية 2030 هي مجتمع حيوي، واقتصاد مزدهر، ووطن طموح.",
                "تم الإعلان عن رؤية 2030 من قبل ولي العهد محمد بن سلمان في أبريل 2016.",
                "الثروة الحقيقية للمملكة العربية السعودية، كما ذكر في رؤية 2030، هي شعبها وإمكاناته."
            ]
        },
        "real_wealth": {
            "en": [
                "The real wealth of Saudi Arabia, as emphasized in Vision 2030, is its people, particularly the youth.",
                "Vision 2030 recognizes that the Saudi people, with their strong values and capabilities, are the true wealth of the nation.",
                "The document states: 'Our people are our most valuable asset and the enablers of our success'."
            ],
            "ar": [
                "الثروة الحقيقية للمملكة العربية السعودية، كما أكدت رؤية 2030، هي شعبها، وخاصة الشباب.",
                "تعترف رؤية 2030 بأن الشعب السعودي، بقيمه وقدراته القوية، هو الثروة الحقيقية للأمة.",
                "تنص الوثيقة على: 'شعبنا هو أثمن أصولنا وأساس نجاحنا'."
            ]
        },
        "global_gateway": {
            "en": [
                "Saudi Arabia aims to strengthen its position as a global gateway by leveraging its strategic location between Asia, Europe, and Africa.",
                "The Kingdom plans to build a unique logistical hub connecting three continents and improve infrastructure to facilitate trade.",
                "Vision 2030 intends to establish special economic zones with competitive regulations to attract international investors.",
                "The plan includes enhancing seaports, building regional connectivity through railways, and expanding airports."
            ],
            "ar": [
                "تهدف المملكة العربية السعودية إلى تعزيز مكانتها كبوابة عالمية من خلال الاستفادة من موقعها الاستراتيجي بين آسيا وأوروبا وأفريقيا.",
                "تخطط المملكة لبناء مركز لوجستي فريد يربط بين ثلاث قارات وتحسين البنية التحتية لتسهيل التجارة.",
                "تعتزم رؤية 2030 إنشاء مناطق اقتصادية خاصة ذات لوائح تنافسية لجذب المستثمرين الدوليين.",
                "تتضمن الخطة تعزيز الموانئ البحرية، وبناء الربط الإقليمي من خلال السكك الحديدية، وتوسيع المطارات."
            ]
        },
        "tourism": {
            "en": [
                "Vision 2030 aims to develop tourism as a key non-oil sector, including religious, cultural, and leisure tourism.",
                "The plan includes developing the Red Sea as a world-class luxury tourist destination, with a focus on sustainability.",
                "Vision 2030 targets increasing tourism's contribution to GDP from 3% to 10% and hosting 100 million tourists annually by 2030.",
                "The Al-Ula region is being developed as a major archaeological and cultural tourism destination."
            ],
            "ar": [
                "تهدف رؤية 2030 إلى تطوير السياحة كقطاع غير نفطي رئيسي، بما في ذلك السياحة الدينية والثقافية والترفيهية.",
                "تتضمن الخطة تطوير البحر الأحمر كوجهة سياحية فاخرة على مستوى عالمي، مع التركيز على الاستدامة.",
                "تستهدف رؤية 2030 زيادة مساهمة السياحة في الناتج المحلي الإجمالي من 3٪ إلى 10٪ واستضافة 100 مليون سائح سنويًا بحلول عام 2030.",
                "يتم تطوير منطقة العلا كوجهة سياحية أثرية وثقافية رئيسية."
            ]
        },
        "youth": {
            "en": [
                "Vision 2030 recognizes youth as the Kingdom's most valuable resource, with 60% of the population under 30 years old.",
                "The plan aims to reduce youth unemployment from 30% to 7% through education reform and economic growth.",
                "Vision 2030 includes building a culture of entrepreneurship to harness the creative energy of Saudi youth.",
                "The plan supports youth development programs, sports initiatives, and enhanced educational opportunities."
            ],
            "ar": [
                "تعترف رؤية 2030 بالشباب كأثمن موارد المملكة، حيث يشكلون 60٪ من السكان تحت سن 30 عامًا.",
                "تهدف الخطة إلى خفض بطالة الشباب من 30٪ إلى 7٪ من خلال إصلاح التعليم والنمو الاقتصادي.",
                "تتضمن رؤية 2030 بناء ثقافة ريادة الأعمال للاستفادة من الطاقة الإبداعية للشباب السعودي.",
                "تدعم الخطة برامج تنمية الشباب، والمبادرات الرياضية، وتعزيز الفرص التعليمية."
            ]
        },
        "women": {
            "en": [
                "Vision 2030 aims to increase women's participation in the workforce from 22% to 30%.",
                "The plan supports women's rights and empowerment across economic, social, and political spheres.",
                "Vision 2030 has already resulted in policy changes allowing women to drive, travel independently, and participate more fully in public life.",
                "The plan includes initiatives to increase female leadership positions in both public and private sectors."
            ],
            "ar": [
                "تهدف رؤية 2030 إلى زيادة مشاركة المرأة في القوى العاملة من 22٪ إلى 30٪.",
                "تدعم الخطة حقوق المرأة وتمكينها في المجالات الاقتصادية والاجتماعية والسياسية.",
                "أدت رؤية 2030 بالفعل إلى تغييرات في السياسات تسمح للمرأة بالقيادة، والسفر بشكل مستقل، والمشاركة بشكل أكبر في الحياة العامة.",
                "تتضمن الخطة مبادرات لزيادة المناصب القيادية النسائية في القطاعين العام والخاص."
            ]
        },
        "projects": {
            "en": [
                "NEOM is a planned cross-border smart city in the Tabuk Province of northwestern Saudi Arabia, a key project of Vision 2030.",
                "The Red Sea Project is a Vision 2030 initiative to develop luxury tourism destinations across 50 islands off Saudi Arabia's Red Sea coast.",
                "Qiddiya is an entertainment mega-project being built in Riyadh as part of Vision 2030, intended to be the world's largest entertainment city.",
                "The Line is a revolutionary urban development project within NEOM featuring a 170 km-long linear city without cars or streets.",
                "AMAALA is an ultra-luxury tourism project on the Red Sea that focuses on wellness, healthy living, and meditation."
            ],
            "ar": [
                "نيوم هي مدينة ذكية مخططة عبر الحدود في مقاطعة تبوك شمال غرب المملكة العربية السعودية، وهي مشروع رئيسي من رؤية 2030.",
                "مشروع البحر الأحمر هو مبادرة رؤية 2030 لتطوير وجهات سياحية فاخرة عبر 50 جزيرة قبالة ساحل البحر الأحمر السعودي.",
                "القدية هي مشروع ترفيهي ضخم يتم بناؤه في الرياض كجزء من رؤية 2030، ويهدف إلى أن يكون أكبر مدينة ترفيهية في العالم.",
                "ذا لاين هو مشروع تطوير حضري ثوري ضمن نيوم يتميز بمدينة خطية طولها 170 كم بدون سيارات أو شوارع.",
                "أمالا هو مشروع سياحي فائق الفخامة على البحر الأحمر يركز على العافية والحياة الصحية والتأمل."
            ]
        },
        "economic_goals": {
            "en": [
                "Vision 2030 targets increasing the private sector's contribution to GDP from 40% to 65%.",
                "One goal of Vision 2030 is to increase foreign direct investment from 3.8% to 5.7% of GDP.",
                "Vision 2030 aims to raise the share of non-oil exports in non-oil GDP from 16% to 50%.",
                "The plan targets increasing SME contribution to GDP from 20% to 35%.",
                "Vision 2030 aims to lower the unemployment rate from 11.6% to 7%."
            ],
            "ar": [
                "تستهدف رؤية 2030 زيادة مساهمة القطاع الخاص في الناتج المحلي الإجمالي من 40٪ إلى 65٪.",
                "أحد أهداف رؤية 2030 هو زيادة الاستثمار الأجنبي المباشر من 3.8٪ إلى 5.7٪ من الناتج المحلي الإجمالي.",
                "تهدف رؤية 2030 إلى رفع حصة الصادرات غير النفطية في الناتج المحلي الإجمالي غير النفطي من 16٪ إلى 50٪.",
                "تستهدف الخطة زيادة مساهمة المنشآت الصغيرة والمتوسطة في الناتج المحلي الإجمالي من 20٪ إلى 35٪.",
                "تهدف رؤية 2030 إلى خفض معدل البطالة من 11.6٪ إلى 7٪."
            ]
        },
        "digital_transformation": {
            "en": [
                "Vision 2030 includes plans to develop the digital infrastructure and support for tech startups in Saudi Arabia.",
                "The plan aims to increase internet penetration to 95% of households in urban areas and 65% in rural areas.",
                "Vision 2030 focuses on building a digital economy, enhancing e-government services, and developing digital skills.",
                "The plan includes initiatives to position Saudi Arabia as a leader in the Fourth Industrial Revolution technologies."
            ],
            "ar": [
                "تتضمن رؤية 2030 خططًا لتطوير البنية التحتية الرقمية ودعم الشركات الناشئة التكنولوجية في المملكة العربية السعودية.",
                "تهدف الخطة إلى زيادة انتشار الإنترنت إلى 95٪ من الأسر في المناطق الحضرية و 65٪ في المناطق الريفية.",
                "تركز رؤية 2030 على بناء اقتصاد رقمي، وتعزيز خدمات الحكومة الإلكترونية، وتطوير المهارات الرقمية.",
                "تتضمن الخطة مبادرات لوضع المملكة العربية السعودية كرائدة في تقنيات الثورة الصناعية الرابعة."
            ]
        }
    }
    self.vision2030_knowledge = knowledge

    # Flatten per-topic lists into one list per language. Dicts keep their
    # insertion order, so the texts come out in the topic order written above.
    self.english_texts = [
        statement
        for topic in knowledge.values()
        for statement in topic["en"]
    ]
    self.arabic_texts = [
        statement
        for topic in knowledge.values()
        for statement in topic["ar"]
    ]

    logger.info(f"Created enhanced knowledge base: {len(self.english_texts)} English, {len(self.arabic_texts)} Arabic texts")
@spaces.GPU
def load_embedding_models(self):
    """Load the Arabic and English sentence-embedding models.

    Sets ``self.arabic_embedder`` and ``self.english_embedder``. When
    ``has_gpu`` is true both models are moved to CUDA. If loading or moving
    raises, the error is logged and ``_create_fallback_embedders`` installs
    stand-in embedders instead, so this method itself does not raise.
    """
    logger.info("Loading embedding models...")
    try:
        self.arabic_embedder = SentenceTransformer('CAMeL-Lab/bert-base-arabic-camelbert-ca')
        self.english_embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

        if has_gpu:
            # Same order as the loads above: Arabic first, then English.
            for attr_name in ("arabic_embedder", "english_embedder"):
                setattr(self, attr_name, getattr(self, attr_name).to('cuda'))
            logger.info("Models moved to GPU")

        logger.info("Embedding models loaded successfully")
    except Exception as e:
        logger.error(f"Error loading embedding models: {str(e)}")
        # Keep the assistant usable without the real models.
        self._create_fallback_embedders()
def _create_fallback_embedders(self):
    """Install deterministic stand-in embedders for when model loading fails.

    Each stand-in exposes ``encode(text)`` and returns a pseudo-random
    ``float32`` vector of length ``dim`` (384 by default) seeded from the
    MD5 hash of the text, so the same text always maps to the same vector.
    The vectors carry no semantic meaning; they only keep the indexing and
    retrieval code paths running.
    """
    import hashlib  # only needed on this fallback path

    logger.warning("Using fallback embedding methods")

    class SimpleEmbedder:
        """Minimal object offering the ``encode`` method the callers use."""

        def __init__(self, dim=384):
            self.dim = dim

        def encode(self, text):
            # Reduce the text's MD5 digest to a 32-bit seed.
            seed = int(hashlib.md5(text.encode()).hexdigest(), 16) % 2**32
            # A private RandomState yields the same numbers as seeding the
            # global generator with this seed, but does not reset
            # np.random's shared state for every other caller.
            return np.random.RandomState(seed).randn(self.dim).astype(np.float32)

    self.arabic_embedder = SimpleEmbedder()
    self.english_embedder = SimpleEmbedder()
def _create_sample_data(self):
    """Placeholder for additional sample data; currently it only logs.

    The text collections are already filled by
    ``_create_enhanced_knowledge_base``; extra sample texts would be added
    here if needed.
    """
    logger.info("Creating additional sample data")
    return None
@spaces.GPU
def _create_indices(self):
    """Embed the English and Arabic texts and build one FAISS L2 index per language.

    Sets ``english_vectors`` / ``arabic_vectors`` and, when the matching
    text list is non-empty, ``english_index`` / ``arabic_index``. An empty
    text list only produces a warning and leaves that index unset.

    Raises:
        Exception: any error while building the indices is logged and
            re-raised unchanged.
    """
    logger.info("Creating FAISS indices for text retrieval")
    try:
        self.english_vectors = self._embed_texts(self.english_embedder, self.english_texts, "English")
        if self.english_vectors:
            self.english_index = self._build_flat_index(self.english_vectors)
            logger.info(f"Created English index with {len(self.english_vectors)} vectors")
        else:
            logger.warning("No English texts to index")

        self.arabic_vectors = self._embed_texts(self.arabic_embedder, self.arabic_texts, "Arabic")
        if self.arabic_vectors:
            self.arabic_index = self._build_flat_index(self.arabic_vectors)
            logger.info(f"Created Arabic index with {len(self.arabic_vectors)} vectors")
        else:
            logger.warning("No Arabic texts to index")
    except Exception as e:
        logger.error(f"Error creating FAISS indices: {str(e)}")
        raise

def _embed_texts(self, embedder, texts, label):
    """Encode every text, one vector per text, in the same order as ``texts``.

    A text whose encoding raises gets a random placeholder vector so the
    vector list stays aligned with the text list. The placeholder takes its
    length from the vectors that did encode (384 if none did), because all
    rows of an index must share one dimension.
    """
    vectors = []
    failed_positions = []
    for position, text in enumerate(texts):
        try:
            if has_gpu and callable(getattr(embedder, 'to', None)):
                # A real model on GPU (the fallback embedder has no .to()).
                with torch.no_grad():
                    vec = embedder.encode(text)
            else:
                vec = embedder.encode(text)
        except Exception as e:
            logger.error(f"Error encoding {label} text: {str(e)}")
            vec = None
            failed_positions.append(position)
        vectors.append(vec)

    if failed_positions:
        dim = next((len(v) for v in vectors if v is not None), 384)
        for position in failed_positions:
            vectors[position] = np.random.randn(dim).astype(np.float32)
    return vectors

def _build_flat_index(self, vectors):
    """Stack ``vectors`` into a float32 matrix and load it into a new IndexFlatL2."""
    matrix = np.array(vectors, dtype=np.float32)
    index = faiss.IndexFlatL2(matrix.shape[1])
    index.add(matrix)
    return index
def _create_sample_eval_data(self):
    """Populate ``self.eval_data`` with sample questions and their reference answers.

    Every entry is a dict with the keys ``question``, ``lang`` ("en" or
    "ar") and ``reference_answer``.
    """
    # (question, language code, reference answer)
    samples = (
        (
            "What are the key pillars of Vision 2030?",
            "en",
            "The key pillars of Vision 2030 are a vibrant society, a thriving economy, and an ambitious nation.",
        ),
        (
            "ما هي الركائز الرئيسية لرؤية 2030؟",
            "ar",
            "الركائز الرئيسية لرؤية 2030 هي مجتمع حيوي، واقتصاد مزدهر، ووطن طموح.",
        ),
        (
            "What is NEOM?",
            "en",
            "NEOM is a planned cross-border smart city in the Tabuk Province of northwestern Saudi Arabia, a key project of Vision 2030.",
        ),
        (
            "ما هو مشروع البحر الأحمر؟",
            "ar",
            "مشروع البحر الأحمر هو مبادرة رؤية 2030 لتطوير وجهات سياحية فاخرة عبر 50 جزيرة قبالة ساحل البحر الأحمر السعودي.",
        ),
        (
            "ما هي الثروة الحقيقية التي تعتز بها المملكة كما وردت في الرؤية؟",
            "ar",
            "الثروة الحقيقية للمملكة العربية السعودية، كما أكدت رؤية 2030، هي شعبها، وخاصة الشباب.",
        ),
        (
            "كيف تسعى المملكة إلى تعزيز مكانتها كبوابة للعالم؟",
            "ar",
            "تهدف المملكة العربية السعودية إلى تعزيز مكانتها كبوابة عالمية من خلال الاستفادة من موقعها الاستراتيجي بين آسيا وأوروبا وأفريقيا.",
        ),
    )
    self.eval_data = [
        {"question": question, "lang": lang, "reference_answer": reference}
        for question, lang, reference in samples
    ]
    logger.info(f"Created {len(self.eval_data)} sample evaluation examples")
@spaces.GPU
def retrieve_context(self, query, lang):
    """Return the two stored texts nearest to ``query``, joined by a newline.

    Args:
        query: the user's question.
        lang: ``"ar"`` searches the Arabic index and texts; any other value
            searches the English ones.

    Returns:
        The joined context string, or ``""`` if anything raises (the error
        is logged, not propagated).
    """
    start_time = time.time()
    try:
        # Pick the per-language attributes by name: arabic_* or english_*.
        prefix = "arabic" if lang == "ar" else "english"
        embedder = getattr(self, f"{prefix}_embedder")

        real_model_on_gpu = has_gpu and hasattr(embedder, 'to') and callable(getattr(embedder, 'to'))
        if real_model_on_gpu:
            with torch.no_grad():
                query_vec = embedder.encode(query)
        else:
            query_vec = embedder.encode(query)

        # Top 2 most relevant chunks; the distances are not used.
        _, neighbour_ids = getattr(self, f"{prefix}_index").search(np.array([query_vec]), k=2)
        texts = getattr(self, f"{prefix}_texts")
        # Skip ids that do not point into the text list (e.g. negative ids).
        context = "\n".join(texts[i] for i in neighbour_ids[0] if 0 <= i < len(texts))

        retrieval_time = time.time() - start_time
        logger.info(f"Retrieved context in {retrieval_time:.2f}s")
        return context
    except Exception as e:
        logger.error(f"Error retrieving context: {str(e)}")
        return ""
def _basic_generate_response(self, user_input):
    """Answer a question with ordered keyword rules, else with retrieved context.

    The input language is detected and reduced to Arabic ("ar") or English
    ("en", also used when detection fails). The first rule whose keyword
    occurs in the input supplies a canned reply. Only when no rule matches
    is ``retrieve_context`` called, so matched questions skip the embedding
    and index search. Every answered question is recorded through
    ``_record_metrics``.

    Args:
        user_input: the user's question.

    Returns:
        The reply text; ``""`` for empty or whitespace-only input (nothing
        is recorded in that case); a generic apology if generation raises.
    """
    if not user_input or user_input.strip() == "":
        return ""

    start_time = time.time()

    # Default response in case of failure
    default_response = {
        "en": "I apologize, but I couldn't process your request properly. Please try again.",
        "ar": "أعتذر، لم أتمكن من معالجة طلبك بشكل صحيح. الرجاء المحاولة مرة أخرى."
    }

    # Ordered (keywords, reply) rules: the first rule with any keyword
    # contained in the input wins, so specific topics precede the generic
    # "what is" rule.
    arabic_rules = (
        (("ركائز", "اركان"),
         "الركائز الرئيسية لرؤية 2030 هي مجتمع حيوي، واقتصاد مزدهر، ووطن طموح."),
        (("نيوم",),
         "نيوم هي مدينة ذكية مخططة عبر الحدود في مقاطعة تبوك شمال غرب المملكة العربية السعودية، وهي مشروع رئيسي من رؤية 2030."),
        (("البحر الأحمر", "البحر الاحمر"),
         "مشروع البحر الأحمر هو مبادرة رؤية 2030 لتطوير وجهات سياحية فاخرة عبر 50 جزيرة قبالة ساحل البحر الأحمر السعودي."),
        (("المرأة", "النساء"),
         "تهدف رؤية 2030 إلى زيادة مشاركة المرأة في القوى العاملة من 22٪ إلى 30٪."),
        (("القدية",),
         "القدية هي مشروع ترفيهي ضخم يتم بناؤه في الرياض كجزء من رؤية 2030، ويهدف إلى أن يكون أكبر مدينة ترفيهية في العالم."),
        (("ماهي", "ما هي"),
         "رؤية 2030 هي الإطار الاستراتيجي للمملكة العربية السعودية للحد من الاعتماد على النفط وتنويع الاقتصاد وتطوير القطاعات العامة. الركائز الرئيسية لرؤية 2030 هي مجتمع حيوي، واقتصاد مزدهر، ووطن طموح."),
    )
    # English keywords are matched against the lower-cased input.
    # NOTE(review): "key" is a plain substring test, so it also fires on
    # words such as "turkey"; kept as-is to preserve existing answers.
    english_rules = (
        (("pillar", "key"),
         "The key pillars of Vision 2030 are a vibrant society, a thriving economy, and an ambitious nation."),
        (("neom",),
         "NEOM is a planned cross-border smart city in the Tabuk Province of northwestern Saudi Arabia, a key project of Vision 2030."),
        (("red sea",),
         "The Red Sea Project is a Vision 2030 initiative to develop luxury tourism destinations across 50 islands off Saudi Arabia's Red Sea coast."),
        (("women", "female"),
         "Vision 2030 aims to increase women's participation in the workforce from 22% to 30%."),
        (("qiddiya",),
         "Qiddiya is an entertainment mega-project being built in Riyadh as part of Vision 2030, intended to be the world's largest entertainment city."),
        (("what is",),
         "Vision 2030 is Saudi Arabia's strategic framework to reduce dependence on oil, diversify the economy, and develop public sectors. The key pillars are a vibrant society, a thriving economy, and an ambitious nation."),
    )

    # Bound before the try so the error path and the bookkeeping below can
    # always read it.
    lang = "en"
    try:
        # Detect language, simplified to Arabic vs. non-Arabic.
        try:
            lang = detect(user_input)
            if lang != "ar":
                lang = "en"
        except Exception:
            # Detection can fail on short or feature-less input.
            lang = "en"
        logger.info(f"Detected language: {lang}")

        if lang == "ar":
            haystack = user_input
            rules = arabic_rules
            not_found = "لم أتمكن من العثور على معلومات كافية حول هذا السؤال."
        else:
            haystack = user_input.lower()
            rules = english_rules
            not_found = "I couldn't find enough information about this question."

        reply = next(
            (answer for keywords, answer in rules
             if any(keyword in haystack for keyword in keywords)),
            None,
        )
        if reply is None:
            # No rule matched: answer with the retrieved context, if any.
            context = self.retrieve_context(user_input, lang)
            reply = context if context else not_found
    except Exception as e:
        logger.error(f"Error generating response: {str(e)}")
        reply = default_response.get(lang, default_response["en"])

    # Record response time and store the interaction for later evaluation.
    self._record_metrics(user_input, reply, lang, start_time)
    return reply
def generate_response(self, user_input):
    """Answer a question, trying topic-specific rules before the basic generator.

    The input language is detected and reduced to Arabic ("ar") or English
    ("en", also used when detection fails). The topic rules are checked in
    order; the first topic with a keyword in the input answers with the
    first knowledge-base statement of that topic in the detected language,
    and the interaction is recorded through ``_record_metrics``. If no
    topic matches, or anything raises, the question is handed to
    ``self.original_generate_response``.

    Args:
        user_input: the user's question.

    Returns:
        The reply text, or ``""`` for empty or whitespace-only input.
    """
    if not user_input or user_input.strip() == "":
        return ""

    start_time = time.time()

    # Ordered (keywords, knowledge-base topic) rules; first match wins.
    arabic_topics = (
        (("الثروة الحقيقية", "أثمن", "ثروة"), "real_wealth"),
        (("بوابة للعالم", "مكانتها", "موقعها الاستراتيجي", "تعزيز مكانتها"), "global_gateway"),
        (("الشباب",), "youth"),
        (("المرأة", "النساء"), "women"),
        (("سياحة", "السياحة"), "tourism"),
    )
    # English keywords are matched against the lower-cased input.
    english_topics = (
        (("real wealth", "valuable asset"), "real_wealth"),
        (("global gateway", "strategic location"), "global_gateway"),
        (("youth", "young"), "youth"),
        (("women", "female"), "women"),
        (("tourism", "tourist"), "tourism"),
    )

    try:
        # Detect language, simplified to Arabic vs. non-Arabic.
        try:
            lang = detect(user_input)
            if lang != "ar":
                lang = "en"
        except Exception:
            # Detection can fail on short or feature-less input.
            lang = "en"
        logger.info(f"Detected language: {lang}")

        if lang == "ar":
            haystack, topics = user_input, arabic_topics
        else:
            haystack, topics = user_input.lower(), english_topics

        for keywords, topic in topics:
            if any(keyword in haystack for keyword in keywords):
                # lang is "ar" or "en", which are the knowledge-base keys.
                response = self.vision2030_knowledge[topic][lang][0]
                self._record_metrics(user_input, response, lang, start_time)
                return response

        # If no specific match, fall back to the regular response generation
        # (which records its own metrics).
        return self.original_generate_response(user_input)
    except Exception as e:
        logger.error(f"Error in enhanced generation: {str(e)}")
        # Fall back to regular generation
        return self.original_generate_response(user_input)
def _record_metrics(self, user_input, response, lang, start_time):
    """Log how long a response took and archive the interaction.

    Appends the elapsed seconds since ``start_time`` (a ``time.time()``
    value) to ``self.metrics["response_times"]`` and appends a dict
    describing the exchange to ``self.response_history``.
    """
    response_time = time.time() - start_time
    self.metrics["response_times"].append(response_time)
    logger.info(f"Generated response in {response_time:.2f}s")

    # Keep the exchange itself for later evaluation.
    self.response_history.append({
        "timestamp": datetime.now().isoformat(),
        "user_input": user_input,
        "response": response,
        "language": lang,
        "response_time": response_time
    })
def evaluate_factual_accuracy(self, response, reference):
    """Score a response by the share of reference keywords it contains.

    Both texts are lower-cased and split into word tokens, and a small set
    of English and Arabic stopwords is discarded. The score is the fraction
    of the remaining reference tokens that also occur in the response.

    Returns:
        A value between 0 and 1; ``0`` when the reference has no keywords.
    """
    # This is a simplified approach - in production, use more sophisticated methods
    word_pattern = r'\b\w+\b'
    # Common stopwords for both languages (simplified approach)
    stopwords = {
        "the", "is", "a", "an", "and", "or", "of", "to", "in", "for", "with", "by", "on", "at",
        "في", "من", "إلى", "على", "و", "هي", "هو", "عن", "مع",
    }

    reference_keywords = set(re.findall(word_pattern, reference.lower())) - stopwords
    response_keywords = set(re.findall(word_pattern, response.lower())) - stopwords

    if not reference_keywords:
        return 0
    matched = reference_keywords & response_keywords
    return len(matched) / len(reference_keywords)
@spaces.GPU
def evaluate_on_test_set(self):
    """Run every example in ``self.eval_data`` through the assistant and score it.

    Each question is answered with ``generate_response`` and scored against
    its reference answer with ``evaluate_factual_accuracy``. The scores are
    still appended to ``self.metrics["factual_accuracy"]``, but the reported
    averages cover only this run, so they always agree with the
    ``detailed_results`` returned alongside them.

    Returns:
        dict with ``average_factual_accuracy``, ``average_response_time``
        and ``detailed_results`` (one dict per example with ``question``,
        ``reference``, ``response`` and ``factual_accuracy``).
    """
    logger.info("Running evaluation on test set")

    # Response times recorded before this point belong to earlier chats or
    # evaluation runs and must not leak into this run's average.
    first_new_timing = len(self.metrics["response_times"])

    eval_results = []
    for example in self.eval_data:
        # Generate response
        response = self.generate_response(example["question"])
        # Calculate factual accuracy
        accuracy = self.evaluate_factual_accuracy(response, example["reference_answer"])
        eval_results.append({
            "question": example["question"],
            "reference": example["reference_answer"],
            "response": response,
            "factual_accuracy": accuracy
        })
        self.metrics["factual_accuracy"].append(accuracy)

    run_accuracies = [result["factual_accuracy"] for result in eval_results]
    run_timings = self.metrics["response_times"][first_new_timing:]

    avg_accuracy = sum(run_accuracies) / len(run_accuracies) if run_accuracies else 0
    avg_response_time = sum(run_timings) / len(run_timings) if run_timings else 0

    results = {
        "average_factual_accuracy": avg_accuracy,
        "average_response_time": avg_response_time,
        "detailed_results": eval_results
    }
    logger.info(f"Evaluation results: Factual accuracy = {avg_accuracy:.2f}, Avg response time = {avg_response_time:.2f}s")
    return results
def visualize_evaluation_results(self, results):
    """Plot factual accuracy per question and per language.

    Args:
        results: dict as returned by ``evaluate_on_test_set``; reads
            ``detailed_results`` (entries with ``question`` and
            ``factual_accuracy``) and ``average_factual_accuracy``.

    Returns:
        The matplotlib figure holding two stacked bar charts, each with a
        red line and legend entry for the overall average.
    """
    df = pd.DataFrame(results["detailed_results"])
    overall = results["average_factual_accuracy"]

    def language_label(question):
        """Label a question Arabic/English; English if detection fails."""
        try:
            return "Arabic" if detect(question) == "ar" else "English"
        except Exception:
            return "English"

    # Explicit Axes objects instead of pyplot's implicit "current axes", so
    # every call below is unambiguous about which chart it draws on.
    fig, (question_ax, language_ax) = plt.subplots(2, 1, figsize=(12, 8))

    # Bar chart of factual accuracy by question
    question_ax.bar(range(len(df)), df["factual_accuracy"], color="skyblue")
    question_ax.axhline(y=overall, color='r', linestyle='-',
                        label=f"Avg: {overall:.2f}")
    question_ax.set_xlabel("Question Index")
    question_ax.set_ylabel("Factual Accuracy")
    question_ax.set_title("Factual Accuracy by Question")
    question_ax.set_ylim(0, 1.1)
    question_ax.legend()

    # Add language information and group by it
    df["language"] = df["question"].apply(language_label)
    lang_accuracy = df.groupby("language")["factual_accuracy"].mean()

    # Bar chart of accuracy by language
    language_ax.bar(lang_accuracy.index, lang_accuracy.values, color=["lightblue", "lightgreen"])
    language_ax.axhline(y=overall, color='r', linestyle='-',
                        label=f"Overall: {overall:.2f}")
    language_ax.set_xlabel("Language")
    language_ax.set_ylabel("Average Factual Accuracy")
    language_ax.set_title("Factual Accuracy by Language")
    language_ax.set_ylim(0, 1.1)
    # Without this call the "Overall" label on the line is never shown.
    language_ax.legend()

    # Add value labels above the bars
    for i, v in enumerate(lang_accuracy):
        language_ax.text(i, v + 0.05, f"{v:.2f}", ha='center')

    fig.tight_layout()
    return fig
def record_user_feedback(self, user_input, response, rating, feedback_text=""):
    """Track the rating a user gave to one assistant response.

    Args:
        user_input: The message the user sent.
        response: The reply that is being rated.
        rating: The rating value; appended to ``self.metrics["user_ratings"]``.
        feedback_text: Optional free-text comment.

    Returns:
        Always ``True``.
    """
    # Full record of the interaction. It is only assembled here; nothing in
    # this method persists it (a production system would store it in a
    # database).
    feedback_record = dict(
        timestamp=datetime.now().isoformat(),
        user_input=user_input,
        response=response,
        rating=rating,
        feedback_text=feedback_text,
    )

    ratings = self.metrics["user_ratings"]
    ratings.append(rating)

    logger.info(f"Recorded user feedback: rating={rating}")
    return True
@spaces.GPU
def process_uploaded_pdf(self, file):
    """Extract text from an uploaded PDF and add it to the knowledge base.

    The extracted text is split into chunks on blank lines. Each chunk is
    routed to the Arabic or English text list by language detection, and the
    indices are rebuilt via ``self._create_indices()``.

    Args:
        file: Raw PDF content as bytes (it is wrapped in ``io.BytesIO``), or
            ``None`` when nothing was uploaded.

    Returns:
        A user-facing status string: a success summary, a notice that no file
        or no extractable text was found, or an error description.
    """
    if file is None:
        return "No file uploaded. Please select a PDF file."

    try:
        logger.info("Processing uploaded file")

        reader = PyPDF2.PdfReader(io.BytesIO(file))

        # Pages whose extract_text() result is falsy are skipped; every kept
        # page is terminated with a newline.
        page_texts = (page.extract_text() for page in reader.pages)
        full_text = "".join(f"{text}\n" for text in page_texts if text)

        if not full_text.strip():
            return "The uploaded PDF doesn't contain extractable text. Please try another file."

        # Split on blank lines and drop chunks that are only whitespace.
        chunks = [chunk.strip() for chunk in re.split(r'\n\s*\n', full_text) if chunk.strip()]

        english_chunks = []
        arabic_chunks = []
        for chunk in chunks:
            try:
                lang = detect(chunk)
            except Exception:
                # Best effort: if language detection fails, assume English.
                # Unlike a bare ``except:``, this lets KeyboardInterrupt and
                # SystemExit propagate.
                lang = None
            if lang == "ar":
                arabic_chunks.append(chunk)
            else:
                english_chunks.append(chunk)

        # Add the extracted chunks to the knowledge base and rebuild indices.
        self.english_texts.extend(english_chunks)
        self.arabic_texts.extend(arabic_chunks)
        self._create_indices()

        logger.info(f"Successfully processed PDF: {len(arabic_chunks)} Arabic chunks, {len(english_chunks)} English chunks")
        return f"✅ Successfully processed the PDF! Found {len(arabic_chunks)} Arabic and {len(english_chunks)} English text segments."
    except Exception as e:
        # Boundary handler: report any failure to the UI instead of raising.
        logger.error(f"Error processing PDF: {str(e)}")
        return f"❌ Error processing the PDF: {str(e)}. Please try another file."
# Create the Gradio interface
def create_interface():
    """Assemble the Gradio Blocks app around a single ``Vision2030Assistant``.

    The UI has three tabs: a chat tab with a feedback form, an evaluation tab
    that runs the test set and plots the outcome, and a PDF upload tab.

    Returns:
        The ``gr.Blocks`` object; launching it is left to the caller.
    """
    assistant = Vision2030Assistant()

    def chat(message, history):
        """Append the assistant's reply to ``history`` and clear the textbox."""
        # Blank or whitespace-only messages leave the conversation untouched.
        if message and message.strip() != "":
            history.append((message, assistant.generate_response(message)))
        return history, ""

    def provide_feedback(history, rating, feedback_text):
        """Record feedback for the most recent exchange in ``history``."""
        if not history:
            return "No conversation found to rate."
        latest = history[-1]
        assistant.record_user_feedback(latest[0], latest[1], rating, feedback_text)
        return f"Thank you for your feedback! (Rating: {rating}/5)"

    @spaces.GPU
    def run_evaluation():
        """Run the test-set evaluation; return a text summary and a figure."""
        results = assistant.evaluate_on_test_set()
        details = results['detailed_results']

        # Header of the report, followed by one section per evaluated question.
        parts = [
            "\n"
            "Evaluation Results:\n"
            "------------------\n"
            f"Total questions evaluated: {len(details)}\n"
            f"Overall factual accuracy: {results['average_factual_accuracy']:.2f}\n"
            f"Average response time: {results['average_response_time']:.4f} seconds\n"
            "Detailed Results:\n"
        ]
        for number, item in enumerate(details, start=1):
            parts.append(f"\nQ{number}: {item['question']}\n")
            parts.append(f"Reference: {item['reference']}\n")
            parts.append(f"Response: {item['response']}\n")
            parts.append(f"Accuracy: {item['factual_accuracy']:.2f}\n")
            parts.append("-" * 40 + "\n")
        summary = "".join(parts)

        fig = assistant.visualize_evaluation_results(results)
        return summary, fig

    with gr.Blocks() as demo:
        gr.Markdown("# Vision 2030 Virtual Assistant 🌟")
        gr.Markdown("Ask questions about Saudi Arabia's Vision 2030 in both Arabic and English")

        # --- Chat tab: conversation plus feedback form ---
        with gr.Tab("Chat"):
            chatbot = gr.Chatbot(height=400)
            msg = gr.Textbox(label="Your Question", placeholder="Ask about Vision 2030...")
            with gr.Row():
                submit_btn = gr.Button("Submit")
                clear_btn = gr.Button("Clear Chat")
            gr.Markdown("### Provide Feedback")
            with gr.Row():
                rating = gr.Slider(minimum=1, maximum=5, step=1, value=3, label="Rate the Response (1-5)")
                feedback_text = gr.Textbox(label="Additional Comments (Optional)")
            feedback_btn = gr.Button("Submit Feedback")
            feedback_result = gr.Textbox(label="Feedback Status")

        # --- Evaluation tab: textual report and chart ---
        with gr.Tab("Evaluation"):
            evaluate_btn = gr.Button("Run Evaluation on Test Set")
            eval_output = gr.Textbox(label="Evaluation Results", lines=20)
            eval_chart = gr.Plot(label="Evaluation Metrics")

        # --- Upload tab: feed a PDF into the knowledge base ---
        with gr.Tab("Upload PDF"):
            gr.Markdown(
                "\n"
                "### Upload a Vision 2030 PDF Document\n"
                "Upload a PDF document to enhance the assistant's knowledge base.\n"
            )
            with gr.Row():
                file_input = gr.File(
                    label="Select PDF File",
                    file_types=[".pdf"],
                    type="binary",  # the PDF handler wraps the upload in io.BytesIO
                )
            with gr.Row():
                upload_btn = gr.Button("Process PDF", variant="primary")
            with gr.Row():
                upload_status = gr.Textbox(
                    label="Upload Status",
                    placeholder="Upload status will appear here...",
                    interactive=False,
                )
            gr.Markdown(
                "\n"
                "### Notes:\n"
                "- The PDF should contain text that can be extracted (not scanned images)\n"
                "- After uploading, return to the Chat tab to ask questions about the uploaded content\n"
            )

        # Event wiring; registration order is kept as in the original.
        msg.submit(fn=chat, inputs=[msg, chatbot], outputs=[chatbot, msg])
        submit_btn.click(fn=chat, inputs=[msg, chatbot], outputs=[chatbot, msg])
        clear_btn.click(fn=lambda: [], inputs=None, outputs=chatbot)
        feedback_btn.click(fn=provide_feedback, inputs=[chatbot, rating, feedback_text], outputs=feedback_result)
        evaluate_btn.click(fn=run_evaluation, inputs=None, outputs=[eval_output, eval_chart])
        upload_btn.click(fn=assistant.process_uploaded_pdf, inputs=[file_input], outputs=[upload_status])

    return demo
# Build the interface when the module runs and launch it.
demo = create_interface()
demo.launch()