abdull4h committed d30267d (verified; 1 parent: f63c425)

Update app.py

Files changed (1):
  1. app.py: +149 -61

app.py CHANGED
@@ -361,27 +361,63 @@ class Vision2030Assistant:
 
     @spaces.GPU
     def retrieve_context(self, query, lang):
-        """Retrieve relevant context for a query based on language"""
+        """Retrieve relevant context with priority to PDF content"""
         start_time = time.time()
 
         try:
+            # First check if we have PDF content
+            if hasattr(self, 'has_pdf_content') and self.has_pdf_content:
+                # Try to retrieve from PDF content first
+                if lang == "ar" and hasattr(self, 'pdf_arabic_index'):
+                    if has_gpu and hasattr(self.arabic_embedder, 'to'):
+                        with torch.no_grad():
+                            query_vec = self.arabic_embedder.encode(query)
+                    else:
+                        query_vec = self.arabic_embedder.encode(query)
+
+                    D, I = self.pdf_arabic_index.search(np.array([query_vec]), k=2)
+
+                    # If we found good matches in the PDF
+                    if D[0][0] < 1.0:  # Check if the distance is small enough
+                        context = "\n".join([self.pdf_arabic_texts[i] for i in I[0] if i < len(self.pdf_arabic_texts) and i >= 0])
+                        if context.strip():
+                            logger.info("Retrieved context from PDF (Arabic)")
+                            return context
+
+                elif lang == "en" and hasattr(self, 'pdf_english_index'):
+                    if has_gpu and hasattr(self.english_embedder, 'to'):
+                        with torch.no_grad():
+                            query_vec = self.english_embedder.encode(query)
+                    else:
+                        query_vec = self.english_embedder.encode(query)
+
+                    D, I = self.pdf_english_index.search(np.array([query_vec]), k=2)
+
+                    # If we found good matches in the PDF
+                    if D[0][0] < 1.0:  # Check if the distance is small enough
+                        context = "\n".join([self.pdf_english_texts[i] for i in I[0] if i < len(self.pdf_english_texts) and i >= 0])
+                        if context.strip():
+                            logger.info("Retrieved context from PDF (English)")
+                            return context
+
+            # Fall back to the pre-built knowledge base if no good PDF matches
             if lang == "ar":
-                if has_gpu and hasattr(self.arabic_embedder, 'to') and callable(getattr(self.arabic_embedder, 'to')):
+                if has_gpu and hasattr(self.arabic_embedder, 'to'):
                     with torch.no_grad():
                         query_vec = self.arabic_embedder.encode(query)
                 else:
                     query_vec = self.arabic_embedder.encode(query)
 
-                D, I = self.arabic_index.search(np.array([query_vec]), k=2)  # Get top 2 most relevant chunks
+                D, I = self.arabic_index.search(np.array([query_vec]), k=2)
                 context = "\n".join([self.arabic_texts[i] for i in I[0] if i < len(self.arabic_texts) and i >= 0])
             else:
-                if has_gpu and hasattr(self.english_embedder, 'to') and callable(getattr(self.english_embedder, 'to')):
+                if has_gpu and hasattr(self.english_embedder, 'to'):
                     with torch.no_grad():
                         query_vec = self.english_embedder.encode(query)
                 else:
                     query_vec = self.english_embedder.encode(query)
 
-                D, I = self.english_index.search(np.array([query_vec]), k=2)  # Get top 2 most relevant chunks
+                D, I = self.english_index.search(np.array([query_vec]), k=2)
                 context = "\n".join([self.english_texts[i] for i in I[0] if i < len(self.english_texts) and i >= 0])
 
             retrieval_time = time.time() - start_time
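The new retrieval path is a two-tier search: query the PDF-specific FAISS index first, accept its hits only when the nearest-neighbor L2 distance beats a fixed threshold, and otherwise fall back to the pre-built knowledge base. A minimal standalone sketch of that pattern (illustrative, not code from the commit; the function name, the max_dist parameter, and the float32 casting are assumptions):

    import numpy as np
    import faiss

    def search_with_fallback(query_vec, pdf_index, pdf_texts,
                             base_index, base_texts, k=2, max_dist=1.0):
        """Try the PDF index first; fall back to the base index on weak matches."""
        q = np.array([query_vec], dtype="float32")
        if pdf_index is not None:
            D, I = pdf_index.search(q, k)
            if D[0][0] < max_dist:  # smaller L2 distance = closer match
                hits = [pdf_texts[i] for i in I[0] if 0 <= i < len(pdf_texts)]
                if any(h.strip() for h in hits):
                    return "\n".join(hits)
        D, I = base_index.search(q, k)
        return "\n".join(base_texts[i] for i in I[0] if 0 <= i < len(base_texts))

Note that the 1.0 cutoff is a heuristic carried over from the diff; a sensible value depends on whether the embedder returns normalized vectors.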
@@ -710,65 +746,117 @@ class Vision2030Assistant:
         logger.info(f"Recorded user feedback: rating={rating}")
 
         return True
+
 
-    @spaces.GPU
-    def process_uploaded_pdf(self, file):
-        """Process uploaded PDF and extract text content"""
-        if file is None:
-            return "No file uploaded. Please select a PDF file."
-
-        try:
-            logger.info(f"Processing uploaded file")
-
-            # Convert bytes to file-like object
-            file_stream = io.BytesIO(file)
-
-            # Use PyPDF2 to read the file content
-            reader = PyPDF2.PdfReader(file_stream)
-
-            # Extract text from the PDF
-            full_text = ""
-            for page_num in range(len(reader.pages)):
-                page = reader.pages[page_num]
-                extracted_text = page.extract_text()
-                if extracted_text:
-                    full_text += extracted_text + "\n"
-
-            if not full_text.strip():
-                return "The uploaded PDF doesn't contain extractable text. Please try another file."
-
-            # Process the extracted text
-            chunks = [chunk.strip() for chunk in re.split(r'\n\s*\n', full_text) if chunk.strip()]
-
-            # Categorize text by language
-            english_chunks = []
-            arabic_chunks = []
-
-            for chunk in chunks:
-                try:
-                    lang = detect(chunk)
-                    if lang == "ar":
-                        arabic_chunks.append(chunk)
-                    else:
-                        english_chunks.append(chunk)
-                except:
-                    # If language detection fails, assume English
+    @spaces.GPU
+    def process_uploaded_pdf(self, file):
+        """Process uploaded PDF and prioritize its content for answering questions"""
+        if file is None:
+            return "No file uploaded. Please select a PDF file."
+
+        try:
+            logger.info(f"Processing uploaded file")
+
+            # Convert bytes to file-like object
+            file_stream = io.BytesIO(file)
+
+            # Use PyPDF2 to read the file content
+            reader = PyPDF2.PdfReader(file_stream)
+
+            # Extract text from the PDF
+            full_text = ""
+            for page_num in range(len(reader.pages)):
+                page = reader.pages[page_num]
+                extracted_text = page.extract_text()
+                if extracted_text:
+                    full_text += extracted_text + "\n"
+
+            if not full_text.strip():
+                return "The uploaded PDF doesn't contain extractable text. Please try another file."
+
+            # Process the extracted text with better chunking
+            # Break into meaningful chunks by headings or paragraphs
+            chunks = []
+            paragraphs = re.split(r'\n\s*\n', full_text)
+
+            for paragraph in paragraphs:
+                if len(paragraph) > 400:  # For very long paragraphs
+                    # Try to split by logical sections
+                    sections = re.split(r'(?:[.!?])\s+(?=[A-Z]|[\u0621-\u064A])', paragraph)
+                    chunks.extend([s.strip() for s in sections if len(s.strip()) > 50])
+                else:
+                    if len(paragraph.strip()) > 50:  # Only add non-trivial chunks
+                        chunks.append(paragraph.strip())
+
+            # Categorize text by language
+            english_chunks = []
+            arabic_chunks = []
+
+            for chunk in chunks:
+                try:
+                    lang = detect(chunk)
+                    if lang == "ar":
+                        arabic_chunks.append(chunk)
+                    else:
                         english_chunks.append(chunk)
-
-            # Add the extracted chunks to our knowledge base
-            self.english_texts.extend(english_chunks)
-            self.arabic_texts.extend(arabic_chunks)
-
-            # Recreate indices
-            self._create_indices()
-
-            logger.info(f"Successfully processed PDF: {len(arabic_chunks)} Arabic chunks, {len(english_chunks)} English chunks")
-
-            return f"✅ Successfully processed the PDF! Found {len(arabic_chunks)} Arabic and {len(english_chunks)} English text segments."
-
-        except Exception as e:
-            logger.error(f"Error processing PDF: {str(e)}")
-            return f"❌ Error processing the PDF: {str(e)}. Please try another file."
+                except:
+                    # If language detection fails, try to determine by character set
+                    if any('\u0600' <= c <= '\u06FF' for c in chunk):
+                        arabic_chunks.append(chunk)
+                    else:
+                        english_chunks.append(chunk)
+
+            # IMPORTANT: Create separate indices for PDF content to prioritize it
+            self.pdf_english_texts = english_chunks
+            self.pdf_arabic_texts = arabic_chunks
+
+            # Process and embed English PDF texts
+            self.pdf_english_vectors = []
+            for text in english_chunks:
+                try:
+                    if has_gpu and hasattr(self.english_embedder, 'to'):
+                        with torch.no_grad():
+                            vec = self.english_embedder.encode(text)
+                    else:
+                        vec = self.english_embedder.encode(text)
+                    self.pdf_english_vectors.append(vec)
+                except Exception as e:
+                    logger.error(f"Error encoding English PDF text: {str(e)}")
+                    continue
+
+            # Process and embed Arabic PDF texts
+            self.pdf_arabic_vectors = []
+            for text in arabic_chunks:
+                try:
+                    if has_gpu and hasattr(self.arabic_embedder, 'to'):
+                        with torch.no_grad():
+                            vec = self.arabic_embedder.encode(text)
+                    else:
+                        vec = self.arabic_embedder.encode(text)
+                    self.pdf_arabic_vectors.append(vec)
+                except Exception as e:
+                    logger.error(f"Error encoding Arabic PDF text: {str(e)}")
+                    continue
+
+            # Create PDF-specific indices
+            if self.pdf_english_vectors:
+                self.pdf_english_index = faiss.IndexFlatL2(len(self.pdf_english_vectors[0]))
+                self.pdf_english_index.add(np.array(self.pdf_english_vectors))
+
+            if self.pdf_arabic_vectors:
+                self.pdf_arabic_index = faiss.IndexFlatL2(len(self.pdf_arabic_vectors[0]))
+                self.pdf_arabic_index.add(np.array(self.pdf_arabic_vectors))
+
+            # Set flag to indicate PDF content is available
+            self.has_pdf_content = True
+
+            logger.info(f"Successfully processed PDF: {len(arabic_chunks)} Arabic chunks, {len(english_chunks)} English chunks")
+
+            return f"✅ Successfully processed the PDF! Found {len(arabic_chunks)} Arabic and {len(english_chunks)} English text segments. PDF content will now be prioritized when answering questions."
+
+        except Exception as e:
+            logger.error(f"Error processing PDF: {str(e)}")
+            return f"❌ Error processing the PDF: {str(e)}. Please try another file."
 
     # Create the Gradio interface
     def create_interface():
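The chunking change is the core of the improved PDF handling: text is split into paragraphs on blank lines, and any paragraph over 400 characters is further split at sentence boundaries, where the lookahead accepts either a Latin capital or an Arabic letter (U+0621 to U+064A) as the start of the next sentence. A self-contained sketch of that strategy (the function name and keyword parameters are illustrative; the thresholds and regexes are copied from the diff):

    import re

    def chunk_text(full_text, long_para=400, min_chunk=50):
        """Split text into paragraph chunks; sentence-split overly long ones."""
        chunks = []
        for paragraph in re.split(r'\n\s*\n', full_text):
            if len(paragraph) > long_para:
                # Split at ., !, ? followed by a Latin capital or an Arabic letter
                sections = re.split(r'(?:[.!?])\s+(?=[A-Z]|[\u0621-\u064A])', paragraph)
                chunks.extend(s.strip() for s in sections if len(s.strip()) > min_chunk)
            elif len(paragraph.strip()) > min_chunk:
                chunks.append(paragraph.strip())
        return chunks

The 50-character floor drops headers, page numbers, and other PDF extraction noise before anything is embedded.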
 
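Language routing also degrades more gracefully now: when detect() raises (it does so on very short or mixed-script chunks), the commit falls back to scanning for characters in the Arabic Unicode block U+0600 to U+06FF instead of blindly assuming English. A sketch of that fallback in isolation; note the diff uses a bare except, while this sketch narrows it to LangDetectException, the exception type exported by the langdetect package (route_chunk is an illustrative name):

    from langdetect import detect, LangDetectException

    def route_chunk(chunk, arabic_chunks, english_chunks):
        """Append chunk to the matching language bucket, with a charset fallback."""
        try:
            lang = detect(chunk)
        except LangDetectException:
            # Detection failed: fall back to checking for Arabic-block characters
            lang = "ar" if any('\u0600' <= c <= '\u06FF' for c in chunk) else "en"
        (arabic_chunks if lang == "ar" else english_chunks).append(chunk)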
 
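Finally, each batch of uploaded chunks gets its own faiss.IndexFlatL2, with the dimension read off the first encoded vector; the guard against empty vector lists matters because the index constructor needs a dimension. A condensed sketch of that step (build_pdf_index and the embedder argument are illustrative; encode() is assumed to return a 1-D numpy vector, as SentenceTransformer-style embedders do):

    import numpy as np
    import faiss

    def build_pdf_index(embedder, chunks):
        """Encode chunks and pack them into a flat L2 index; None if empty."""
        vectors = [embedder.encode(c) for c in chunks]
        if not vectors:
            return None  # mirrors the commit's guard: no index without vectors
        index = faiss.IndexFlatL2(len(vectors[0]))     # dimension from first vector
        index.add(np.array(vectors, dtype="float32"))  # faiss expects float32
        return index

Keeping these per-upload indices separate from the pre-built knowledge base, rather than extending self.english_texts and self.arabic_texts and recreating the shared indices as the old code did, is what lets retrieve_context consult the PDF content first and fall back cleanly.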