Update app.py
app.py CHANGED
@@ -361,27 +361,63 @@ class Vision2030Assistant:
 
     @spaces.GPU
     def retrieve_context(self, query, lang):
-        """Retrieve relevant context"""
+        """Retrieve relevant context with priority to PDF content"""
         start_time = time.time()
 
         try:
+            # First check if we have PDF content
+            if hasattr(self, 'has_pdf_content') and self.has_pdf_content:
+                # Try to retrieve from PDF content first
+                if lang == "ar" and hasattr(self, 'pdf_arabic_index'):
+                    if has_gpu and hasattr(self.arabic_embedder, 'to'):
+                        with torch.no_grad():
+                            query_vec = self.arabic_embedder.encode(query)
+                    else:
+                        query_vec = self.arabic_embedder.encode(query)
+
+                    D, I = self.pdf_arabic_index.search(np.array([query_vec]), k=2)
+
+                    # If we found good matches in the PDF
+                    if D[0][0] < 1.0:  # Check if the distance is small enough
+                        context = "\n".join([self.pdf_arabic_texts[i] for i in I[0] if i < len(self.pdf_arabic_texts) and i >= 0])
+                        if context.strip():
+                            logger.info("Retrieved context from PDF (Arabic)")
+                            return context
+
+                elif lang == "en" and hasattr(self, 'pdf_english_index'):
+                    if has_gpu and hasattr(self.english_embedder, 'to'):
+                        with torch.no_grad():
+                            query_vec = self.english_embedder.encode(query)
+                    else:
+                        query_vec = self.english_embedder.encode(query)
+
+                    D, I = self.pdf_english_index.search(np.array([query_vec]), k=2)
+
+                    # If we found good matches in the PDF
+                    if D[0][0] < 1.0:  # Check if the distance is small enough
+                        context = "\n".join([self.pdf_english_texts[i] for i in I[0] if i < len(self.pdf_english_texts) and i >= 0])
+                        if context.strip():
+                            logger.info("Retrieved context from PDF (English)")
+                            return context
+
+            # Fall back to the pre-built knowledge base if no good PDF matches
             if lang == "ar":
-                if has_gpu and hasattr(self.arabic_embedder, 'to')
+                if has_gpu and hasattr(self.arabic_embedder, 'to'):
                    with torch.no_grad():
                        query_vec = self.arabic_embedder.encode(query)
                else:
                    query_vec = self.arabic_embedder.encode(query)
 
-                D, I = self.arabic_index.search(np.array([query_vec]), k=2)
+                D, I = self.arabic_index.search(np.array([query_vec]), k=2)
                context = "\n".join([self.arabic_texts[i] for i in I[0] if i < len(self.arabic_texts) and i >= 0])
            else:
-                if has_gpu and hasattr(self.english_embedder, 'to')
+                if has_gpu and hasattr(self.english_embedder, 'to'):
                    with torch.no_grad():
                        query_vec = self.english_embedder.encode(query)
                else:
                    query_vec = self.english_embedder.encode(query)
 
-                D, I = self.english_index.search(np.array([query_vec]), k=2)
+                D, I = self.english_index.search(np.array([query_vec]), k=2)
                context = "\n".join([self.english_texts[i] for i in I[0] if i < len(self.english_texts) and i >= 0])
 
            retrieval_time = time.time() - start_time
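The heart of this hunk is a threshold-gated fallback: search the index built from the uploaded PDF first, and only consult the pre-built knowledge base when the best PDF match is too far away. One detail worth knowing: faiss.IndexFlatL2 returns squared L2 distances, so the D[0][0] < 1.0 cutoff is in squared-distance units. A minimal, runnable sketch of the same pattern (fake_embed is a hash-based stand-in for the app's sentence-transformer embedders, and the corpus strings are invented for illustration):

    import faiss
    import numpy as np

    DIM = 8

    def fake_embed(text):
        # Deterministic stand-in for a real sentence embedder.
        rng = np.random.default_rng(abs(hash(text)) % (2**32))
        return rng.standard_normal(DIM).astype("float32")

    def build_index(texts):
        index = faiss.IndexFlatL2(DIM)
        index.add(np.stack([fake_embed(t) for t in texts]))
        return index

    pdf_texts = ["Chunk taken from an uploaded PDF."]
    kb_texts = ["Entry from the pre-built knowledge base."]
    pdf_index = build_index(pdf_texts)
    kb_index = build_index(kb_texts)

    def retrieve(query, threshold=1.0):
        vec = np.array([fake_embed(query)])
        D, I = pdf_index.search(vec, 1)
        if D[0][0] < threshold:            # close enough: prefer the PDF
            return pdf_texts[I[0][0]], "pdf"
        D, I = kb_index.search(vec, 1)     # otherwise fall back to the KB
        return kb_texts[I[0][0]], "knowledge_base"

    print(retrieve("economic diversification"))

With random stand-in vectors the cutoff will almost always route the query to the fallback; with real embeddings it is exactly this comparison that decides whether PDF content wins.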
@@ -710,65 +746,117 @@ class Vision2030Assistant:
 
        logger.info(f"Recorded user feedback: rating={rating}")
 
        return True
+
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            full_text = ""
-            for page_num in range(len(reader.pages)):
-                page = reader.pages[page_num]
-                extracted_text = page.extract_text()
-                if extracted_text:
-                    full_text += extracted_text + "\n"
-
-            if not full_text.strip():
-                return "The uploaded PDF doesn't contain extractable text. Please try another file."
-
-            # Process the extracted text
-            chunks = [chunk.strip() for chunk in re.split(r'\n\s*\n', full_text) if chunk.strip()]
-
-            # Categorize text by language
-            english_chunks = []
-            arabic_chunks = []
-
-            for chunk in chunks:
-                try:
-                    lang = detect(chunk)
-                    if lang == "ar":
-                        arabic_chunks.append(chunk)
-                    else:
-                        english_chunks.append(chunk)
-                except:
-                    # If language detection fails, assume English
-                    english_chunks.append(chunk)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    @spaces.GPU
+    def process_uploaded_pdf(self, file):
+        """Process uploaded PDF and prioritize its content for answering questions"""
+        if file is None:
+            return "No file uploaded. Please select a PDF file."
+
+        try:
+            logger.info(f"Processing uploaded file")
+
+            # Convert bytes to file-like object
+            file_stream = io.BytesIO(file)
+
+            # Use PyPDF2 to read the file content
+            reader = PyPDF2.PdfReader(file_stream)
+
+            # Extract text from the PDF
+            full_text = ""
+            for page_num in range(len(reader.pages)):
+                page = reader.pages[page_num]
+                extracted_text = page.extract_text()
+                if extracted_text:
+                    full_text += extracted_text + "\n"
+
+            if not full_text.strip():
+                return "The uploaded PDF doesn't contain extractable text. Please try another file."
+
+            # Process the extracted text with better chunking
+            # Break into meaningful chunks by headings or paragraphs
+            chunks = []
+            paragraphs = re.split(r'\n\s*\n', full_text)
+
+            for paragraph in paragraphs:
+                if len(paragraph) > 400:  # For very long paragraphs
+                    # Try to split by logical sections
+                    sections = re.split(r'(?:[.!?])\s+(?=[A-Z]|[\u0621-\u064A])', paragraph)
+                    chunks.extend([s.strip() for s in sections if len(s.strip()) > 50])
+                else:
+                    if len(paragraph.strip()) > 50:  # Only add non-trivial chunks
+                        chunks.append(paragraph.strip())
+
+            # Categorize text by language
+            english_chunks = []
+            arabic_chunks = []
+
+            for chunk in chunks:
+                try:
+                    lang = detect(chunk)
+                    if lang == "ar":
+                        arabic_chunks.append(chunk)
+                    else:
+                        english_chunks.append(chunk)
+                except:
+                    # If language detection fails, try to determine by character set
+                    if any('\u0600' <= c <= '\u06FF' for c in chunk):
+                        arabic_chunks.append(chunk)
+                    else:
+                        english_chunks.append(chunk)
+
+            # IMPORTANT: Create separate indices for PDF content to prioritize it
+            self.pdf_english_texts = english_chunks
+            self.pdf_arabic_texts = arabic_chunks
+
+            # Process and embed English PDF texts
+            self.pdf_english_vectors = []
+            for text in english_chunks:
+                try:
+                    if has_gpu and hasattr(self.english_embedder, 'to'):
+                        with torch.no_grad():
+                            vec = self.english_embedder.encode(text)
+                    else:
+                        vec = self.english_embedder.encode(text)
+                    self.pdf_english_vectors.append(vec)
+                except Exception as e:
+                    logger.error(f"Error encoding English PDF text: {str(e)}")
+                    continue
+
+            # Process and embed Arabic PDF texts
+            self.pdf_arabic_vectors = []
+            for text in arabic_chunks:
+                try:
+                    if has_gpu and hasattr(self.arabic_embedder, 'to'):
+                        with torch.no_grad():
+                            vec = self.arabic_embedder.encode(text)
+                    else:
+                        vec = self.arabic_embedder.encode(text)
+                    self.pdf_arabic_vectors.append(vec)
+                except Exception as e:
+                    logger.error(f"Error encoding Arabic PDF text: {str(e)}")
+                    continue
+
+            # Create PDF-specific indices
+            if self.pdf_english_vectors:
+                self.pdf_english_index = faiss.IndexFlatL2(len(self.pdf_english_vectors[0]))
+                self.pdf_english_index.add(np.array(self.pdf_english_vectors))
+
+            if self.pdf_arabic_vectors:
+                self.pdf_arabic_index = faiss.IndexFlatL2(len(self.pdf_arabic_vectors[0]))
+                self.pdf_arabic_index.add(np.array(self.pdf_arabic_vectors))
+
+            # Set flag to indicate PDF content is available
+            self.has_pdf_content = True
+
+            logger.info(f"Successfully processed PDF: {len(arabic_chunks)} Arabic chunks, {len(english_chunks)} English chunks")
+
+            return f"✅ Successfully processed the PDF! Found {len(arabic_chunks)} Arabic and {len(english_chunks)} English text segments. PDF content will now be prioritized when answering questions."
+
+        except Exception as e:
+            logger.error(f"Error processing PDF: {str(e)}")
+            return f"❌ Error processing the PDF: {str(e)}. Please try another file."
 
 # Create the Gradio interface
 def create_interface():
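The chunking heuristics in process_uploaded_pdf are easy to sanity-check in isolation. A small sketch (sample strings invented) of what the sentence-splitting regex and the Arabic-block fallback actually do:

    import re

    paragraph = ("Vision 2030 has three pillars. It was announced in 2016! "
                 "الرؤية تهدف إلى تنويع الاقتصاد.")

    # Split on ./!/? followed by whitespace and a Latin capital or an Arabic
    # letter; the punctuation is consumed, so it disappears from the pieces.
    sections = re.split(r'(?:[.!?])\s+(?=[A-Z]|[\u0621-\u064A])', paragraph)
    print(sections)
    # ['Vision 2030 has three pillars', 'It was announced in 2016',
    #  'الرؤية تهدف إلى تنويع الاقتصاد.']

    # The except-branch fallback: call a chunk Arabic if it contains any
    # character from the U+0600-U+06FF Arabic block.
    def looks_arabic(chunk):
        return any('\u0600' <= c <= '\u06FF' for c in chunk)

    print([looks_arabic(s) for s in sections])  # [False, False, True]

Keep in mind that the diff then drops pieces of 50 characters or fewer, so fragments as short as these samples would not survive the len(s.strip()) > 50 filter.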
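The diff stops short of showing how process_uploaded_pdf is hooked into the interface. One plausible wiring, assuming Gradio's binary File component; the tab, the component names, and the assistant variable are illustrative, not part of this commit:

    import gradio as gr

    def add_pdf_tab(assistant):
        # Call this from inside the gr.Blocks context in create_interface();
        # `assistant` is assumed to be the Vision2030Assistant instance.
        with gr.Tab("Upload PDF"):
            # type="binary" hands the handler raw bytes, which matches
            # process_uploaded_pdf wrapping its argument in io.BytesIO.
            pdf_file = gr.File(label="Vision 2030 PDF", type="binary")
            status = gr.Textbox(label="Status", interactive=False)
            process_btn = gr.Button("Process PDF")
            process_btn.click(fn=assistant.process_uploaded_pdf,
                              inputs=pdf_file,
                              outputs=status)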