Update app.py
app.py CHANGED
@@ -361,27 +361,63 @@ class Vision2030Assistant:
 
     @spaces.GPU
     def retrieve_context(self, query, lang):
-        """Retrieve relevant context"""
+        """Retrieve relevant context with priority to PDF content"""
         start_time = time.time()
 
         try:
+            # First check if we have PDF content
+            if hasattr(self, 'has_pdf_content') and self.has_pdf_content:
+                # Try to retrieve from PDF content first
+                if lang == "ar" and hasattr(self, 'pdf_arabic_index'):
+                    if has_gpu and hasattr(self.arabic_embedder, 'to'):
+                        with torch.no_grad():
+                            query_vec = self.arabic_embedder.encode(query)
+                    else:
+                        query_vec = self.arabic_embedder.encode(query)
+
+                    D, I = self.pdf_arabic_index.search(np.array([query_vec]), k=2)
+
+                    # If we found good matches in the PDF
+                    if D[0][0] < 1.0:  # Check if the distance is small enough
+                        context = "\n".join([self.pdf_arabic_texts[i] for i in I[0] if i < len(self.pdf_arabic_texts) and i >= 0])
+                        if context.strip():
+                            logger.info("Retrieved context from PDF (Arabic)")
+                            return context
+
+                elif lang == "en" and hasattr(self, 'pdf_english_index'):
+                    if has_gpu and hasattr(self.english_embedder, 'to'):
+                        with torch.no_grad():
+                            query_vec = self.english_embedder.encode(query)
+                    else:
+                        query_vec = self.english_embedder.encode(query)
+
+                    D, I = self.pdf_english_index.search(np.array([query_vec]), k=2)
+
+                    # If we found good matches in the PDF
+                    if D[0][0] < 1.0:  # Check if the distance is small enough
+                        context = "\n".join([self.pdf_english_texts[i] for i in I[0] if i < len(self.pdf_english_texts) and i >= 0])
+                        if context.strip():
+                            logger.info("Retrieved context from PDF (English)")
+                            return context
+
+            # Fall back to the pre-built knowledge base if no good PDF matches
             if lang == "ar":
-                if has_gpu and hasattr(self.arabic_embedder, 'to')
+                if has_gpu and hasattr(self.arabic_embedder, 'to'):
                    with torch.no_grad():
                        query_vec = self.arabic_embedder.encode(query)
                else:
                    query_vec = self.arabic_embedder.encode(query)
 
-                D, I = self.arabic_index.search(np.array([query_vec]), k=2)
+                D, I = self.arabic_index.search(np.array([query_vec]), k=2)
                context = "\n".join([self.arabic_texts[i] for i in I[0] if i < len(self.arabic_texts) and i >= 0])
            else:
-                if has_gpu and hasattr(self.english_embedder, 'to')
+                if has_gpu and hasattr(self.english_embedder, 'to'):
                    with torch.no_grad():
                        query_vec = self.english_embedder.encode(query)
                else:
                    query_vec = self.english_embedder.encode(query)
 
-                D, I = self.english_index.search(np.array([query_vec]), k=2)
+                D, I = self.english_index.search(np.array([query_vec]), k=2)
                context = "\n".join([self.english_texts[i] for i in I[0] if i < len(self.english_texts) and i >= 0])
 
            retrieval_time = time.time() - start_time
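The heart of this hunk is a threshold-gated fallback: search the index built from the uploaded PDF first, and only consult the pre-built knowledge base when the best PDF match is too far away. One detail worth knowing: faiss.IndexFlatL2 returns squared L2 distances, so the D[0][0] < 1.0 cutoff is in squared-distance units. A minimal, runnable sketch of the same pattern (fake_embed is a hash-based stand-in for the app's sentence-transformer embedders, and the corpus strings are invented for illustration):

    import faiss
    import numpy as np

    DIM = 8

    def fake_embed(text):
        # Deterministic stand-in for a real sentence embedder.
        rng = np.random.default_rng(abs(hash(text)) % (2**32))
        return rng.standard_normal(DIM).astype("float32")

    def build_index(texts):
        index = faiss.IndexFlatL2(DIM)
        index.add(np.stack([fake_embed(t) for t in texts]))
        return index

    pdf_texts = ["Chunk taken from an uploaded PDF."]
    kb_texts = ["Entry from the pre-built knowledge base."]
    pdf_index = build_index(pdf_texts)
    kb_index = build_index(kb_texts)

    def retrieve(query, threshold=1.0):
        vec = np.array([fake_embed(query)])
        D, I = pdf_index.search(vec, 1)
        if D[0][0] < threshold:            # close enough: prefer the PDF
            return pdf_texts[I[0][0]], "pdf"
        D, I = kb_index.search(vec, 1)     # otherwise fall back to the KB
        return kb_texts[I[0][0]], "knowledge_base"

    print(retrieve("economic diversification"))

With random stand-in vectors the cutoff will almost always route the query to the fallback; with real embeddings it is exactly this comparison that decides whether PDF content wins.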
@@ -710,65 +746,117 @@ class Vision2030Assistant:
 
        logger.info(f"Recorded user feedback: rating={rating}")
 
        return True
+
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            full_text = ""
-            for page_num in range(len(reader.pages)):
-                page = reader.pages[page_num]
-                extracted_text = page.extract_text()
-                if extracted_text:
-                    full_text += extracted_text + "\n"
-
-            if not full_text.strip():
-                return "The uploaded PDF doesn't contain extractable text. Please try another file."
-
-            # Process the extracted text
-            chunks = [chunk.strip() for chunk in re.split(r'\n\s*\n', full_text) if chunk.strip()]
-
-            # Categorize text by language
-            english_chunks = []
-            arabic_chunks = []
-
-            for chunk in chunks:
-                try:
-                    lang = detect(chunk)
-                    if lang == "ar":
-                        arabic_chunks.append(chunk)
-                    else:
-                        english_chunks.append(chunk)
-                except:
-                    # If language detection fails, assume English
-                    english_chunks.append(chunk)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    @spaces.GPU
+    def process_uploaded_pdf(self, file):
+        """Process uploaded PDF and prioritize its content for answering questions"""
+        if file is None:
+            return "No file uploaded. Please select a PDF file."
+
+        try:
+            logger.info(f"Processing uploaded file")
+
+            # Convert bytes to file-like object
+            file_stream = io.BytesIO(file)
+
+            # Use PyPDF2 to read the file content
+            reader = PyPDF2.PdfReader(file_stream)
+
+            # Extract text from the PDF
+            full_text = ""
+            for page_num in range(len(reader.pages)):
+                page = reader.pages[page_num]
+                extracted_text = page.extract_text()
+                if extracted_text:
+                    full_text += extracted_text + "\n"
+
+            if not full_text.strip():
+                return "The uploaded PDF doesn't contain extractable text. Please try another file."
+
+            # Process the extracted text with better chunking
+            # Break into meaningful chunks by headings or paragraphs
+            chunks = []
+            paragraphs = re.split(r'\n\s*\n', full_text)
+
+            for paragraph in paragraphs:
+                if len(paragraph) > 400:  # For very long paragraphs
+                    # Try to split by logical sections
+                    sections = re.split(r'(?:[.!?])\s+(?=[A-Z]|[\u0621-\u064A])', paragraph)
+                    chunks.extend([s.strip() for s in sections if len(s.strip()) > 50])
+                else:
+                    if len(paragraph.strip()) > 50:  # Only add non-trivial chunks
+                        chunks.append(paragraph.strip())
+
+            # Categorize text by language
+            english_chunks = []
+            arabic_chunks = []
+
+            for chunk in chunks:
+                try:
+                    lang = detect(chunk)
+                    if lang == "ar":
+                        arabic_chunks.append(chunk)
+                    else:
+                        english_chunks.append(chunk)
+                except:
+                    # If language detection fails, try to determine by character set
+                    if any('\u0600' <= c <= '\u06FF' for c in chunk):
+                        arabic_chunks.append(chunk)
+                    else:
+                        english_chunks.append(chunk)
+
+            # IMPORTANT: Create separate indices for PDF content to prioritize it
+            self.pdf_english_texts = english_chunks
+            self.pdf_arabic_texts = arabic_chunks
+
+            # Process and embed English PDF texts
+            self.pdf_english_vectors = []
+            for text in english_chunks:
+                try:
+                    if has_gpu and hasattr(self.english_embedder, 'to'):
+                        with torch.no_grad():
+                            vec = self.english_embedder.encode(text)
+                    else:
+                        vec = self.english_embedder.encode(text)
+                    self.pdf_english_vectors.append(vec)
+                except Exception as e:
+                    logger.error(f"Error encoding English PDF text: {str(e)}")
+                    continue
+
+            # Process and embed Arabic PDF texts
+            self.pdf_arabic_vectors = []
+            for text in arabic_chunks:
+                try:
+                    if has_gpu and hasattr(self.arabic_embedder, 'to'):
+                        with torch.no_grad():
+                            vec = self.arabic_embedder.encode(text)
+                    else:
+                        vec = self.arabic_embedder.encode(text)
+                    self.pdf_arabic_vectors.append(vec)
+                except Exception as e:
+                    logger.error(f"Error encoding Arabic PDF text: {str(e)}")
+                    continue
+
+            # Create PDF-specific indices
+            if self.pdf_english_vectors:
+                self.pdf_english_index = faiss.IndexFlatL2(len(self.pdf_english_vectors[0]))
+                self.pdf_english_index.add(np.array(self.pdf_english_vectors))
+
+            if self.pdf_arabic_vectors:
+                self.pdf_arabic_index = faiss.IndexFlatL2(len(self.pdf_arabic_vectors[0]))
+                self.pdf_arabic_index.add(np.array(self.pdf_arabic_vectors))
+
+            # Set flag to indicate PDF content is available
+            self.has_pdf_content = True
+
+            logger.info(f"Successfully processed PDF: {len(arabic_chunks)} Arabic chunks, {len(english_chunks)} English chunks")
+
+            return f"✅ Successfully processed the PDF! Found {len(arabic_chunks)} Arabic and {len(english_chunks)} English text segments. PDF content will now be prioritized when answering questions."
+
+        except Exception as e:
+            logger.error(f"Error processing PDF: {str(e)}")
+            return f"❌ Error processing the PDF: {str(e)}. Please try another file."
 
 # Create the Gradio interface
 def create_interface():
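The chunking heuristics in process_uploaded_pdf are easy to sanity-check in isolation. A small sketch (sample strings invented) of what the sentence-splitting regex and the Arabic-block fallback actually do:

    import re

    paragraph = ("Vision 2030 has three pillars. It was announced in 2016! "
                 "الرؤية تهدف إلى تنويع الاقتصاد.")

    # Split on ./!/? followed by whitespace and a Latin capital or an Arabic
    # letter; the punctuation is consumed, so it disappears from the pieces.
    sections = re.split(r'(?:[.!?])\s+(?=[A-Z]|[\u0621-\u064A])', paragraph)
    print(sections)
    # ['Vision 2030 has three pillars', 'It was announced in 2016',
    #  'الرؤية تهدف إلى تنويع الاقتصاد.']

    # The except-branch fallback: call a chunk Arabic if it contains any
    # character from the U+0600-U+06FF Arabic block.
    def looks_arabic(chunk):
        return any('\u0600' <= c <= '\u06FF' for c in chunk)

    print([looks_arabic(s) for s in sections])  # [False, False, True]

Keep in mind that the diff then drops pieces of 50 characters or fewer, so fragments as short as these samples would not survive the len(s.strip()) > 50 filter.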
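The diff stops short of showing how process_uploaded_pdf is hooked into the interface. One plausible wiring, assuming Gradio's binary File component; the tab, the component names, and the assistant variable are illustrative, not part of this commit:

    import gradio as gr

    def add_pdf_tab(assistant):
        # Call this from inside the gr.Blocks context in create_interface();
        # `assistant` is assumed to be the Vision2030Assistant instance.
        with gr.Tab("Upload PDF"):
            # type="binary" hands the handler raw bytes, which matches
            # process_uploaded_pdf wrapping its argument in io.BytesIO.
            pdf_file = gr.File(label="Vision 2030 PDF", type="binary")
            status = gr.Textbox(label="Status", interactive=False)
            process_btn = gr.Button("Process PDF")
            process_btn.click(fn=assistant.process_uploaded_pdf,
                              inputs=pdf_file,
                              outputs=status)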