Spaces:

Kazel
/

demo-updated

Running on Zero

App Files Community

Kazel committed 14 days ago

Commit

af476a6

1 Parent(s): 4e19c9e

logging

Browse files

Files changed (2) hide show

app.py +50 -37
score_utilizer.py +7 -7

app.py CHANGED Viewed

@@ -538,8 +538,9 @@ class PDFSearchApp:
             return str(e)
-    def search_documents(self, query, num_results):
         print(f"Searching for query: {query}")
         if not query:
             print("Please enter a search query")
@@ -594,22 +595,15 @@ class PDFSearchApp:
             middleware = Middleware(collection_name, create_collection=False)
-            # Enhanced multi-page retrieval with vision-guided chunking approach
-            # Get more results than requested to allow for intelligent filtering
-            # Request 3x the number of results for better selection
-            search_results = middleware.search([query], topk=max(num_results * 3, 20))[0]
-            # 🎯 DYNAMIC OPTIMIZATION: Determine optimal page count based on query complexity
-            query_complexity = self._analyze_query_complexity(query)
-            optimal_count = self.get_optimal_page_count(search_results, query_complexity)
-            # Use the optimal count if it's different from requested
-            if optimal_count != num_results:
-                print(f"\n🎯 DYNAMIC OPTIMIZATION APPLIED:")
-                print(f"   Requested pages: {num_results}")
-                print(f"   Optimal pages: {optimal_count}")
-                print(f"   Query complexity: {query_complexity}")
-                num_results = optimal_count
             # 📊 COMPREHENSIVE SEARCH RESULTS LOGGING
             print(f"\n🔍 SEARCH RESULTS SUMMARY")
@@ -652,12 +646,12 @@ class PDFSearchApp:
             if not search_results:
                 return "No search results found", "--", "No search results found for your query", [], None, None, None, None
-            # Implement intelligent multi-page selection based on research
-            selected_results = self._select_relevant_pages_new_format(search_results, query, num_results)
             # 📊 SELECTION LOGGING - Show which pages were selected
             print(f"\n🎯 PAGE SELECTION RESULTS")
-            print(f"📄 Requested: {num_results} pages")
             print(f"📄 Selected: {len(selected_results)} pages")
             print(f"📄 Selection rate: {len(selected_results)/len(search_results)*100:.1f}% of available results")
             print("-" * 60)
@@ -814,23 +808,47 @@ class PDFSearchApp:
             # Return exactly 7 outputs to match Gradio expectations
             return error_msg, "--", error_msg, [], None, None, None, None
-    def _select_relevant_pages_new_format(self, search_results, query, num_results):
         """
-        Intelligent page selection for new Milvus format: (score, doc_id)
-        Enhanced to automatically use highest-scoring pages with dynamic thresholds
         """
-        if len(search_results) <= num_results:
-            return search_results
-        # Sort by relevance score
         sorted_results = sorted(search_results, key=lambda x: x[0], reverse=True)
-        # 🎯 ENHANCED SELECTION: Use highest-scoring pages with dynamic thresholds
-        selected = self._select_highest_scoring_pages(sorted_results, query, num_results)
-        print(f"Requested {num_results} pages, selected {len(selected)} pages using enhanced scoring")
-        return selected
     def _select_highest_scoring_pages(self, sorted_results, query, num_results):
         """
@@ -3436,13 +3454,8 @@ def create_ui():
                     placeholder="Ask about any topic in your documents...",
                     lines=2
                 )
-                num_results = gr.Slider(
-                    minimum=1,
-                    maximum=10,
-                    value=3,
-                    step=1,
-                    label="Number of pages to retrieve and cite"
-                )
                 search_btn = gr.Button("Search Documents", variant="primary")
                 gr.Markdown("### Results")
@@ -3502,7 +3515,7 @@ def create_ui():
         # Query events
         search_btn.click(
             fn=app.search_documents,
-            inputs=[query_input, num_results],
             outputs=[path, images, llm_answer, cited_pages_display, csv_download, doc_download, excel_download]
         )

             return str(e)
+    def search_documents(self, query):
         print(f"Searching for query: {query}")
+        print(f"🎯 MODE: Returning only TOP 3 highest-scoring pages")
         if not query:
             print("Please enter a search query")
             middleware = Middleware(collection_name, create_collection=False)
+            # 🎯 TOP 3 PAGES MODE: Always return only the top 3 highest-scoring pages
+            # Get more results than needed to allow for intelligent filtering
+            search_results = middleware.search([query], topk=20)[0]  # Get 20 results for better selection
+            # Fixed to always return top 3 pages
+            num_results = 3
+            print(f"\n🎯 TOP 3 PAGES MODE:")
+            print(f"   Always returning: {num_results} highest-scoring pages")
+            print(f"   Selection strategy: Score-based prioritization")
             # 📊 COMPREHENSIVE SEARCH RESULTS LOGGING
             print(f"\n🔍 SEARCH RESULTS SUMMARY")
             if not search_results:
                 return "No search results found", "--", "No search results found for your query", [], None, None, None, None
+            # 🎯 TOP 3 SELECTION: Always select exactly the top 3 highest-scoring pages
+            selected_results = self._select_top_3_pages(search_results, query)
             # 📊 SELECTION LOGGING - Show which pages were selected
             print(f"\n🎯 PAGE SELECTION RESULTS")
+            print(f"📄 Mode: Top 3 highest-scoring pages")
             print(f"📄 Selected: {len(selected_results)} pages")
             print(f"📄 Selection rate: {len(selected_results)/len(search_results)*100:.1f}% of available results")
             print("-" * 60)
             # Return exactly 7 outputs to match Gradio expectations
             return error_msg, "--", error_msg, [], None, None, None, None
+    def _select_top_3_pages(self, search_results, query):
         """
+        Select exactly the top 3 highest-scoring pages
+        Simplified selection focused on the best 3 pages only
         """
+        if not search_results:
+            return []
+        # Sort by relevance score (highest first)
         sorted_results = sorted(search_results, key=lambda x: x[0], reverse=True)
+        # Always return exactly the top 3 pages
+        top_3 = sorted_results[:3]
+        print(f"\n🎯 TOP 3 PAGES SELECTION:")
+        print(f"📊 Total available results: {len(search_results)}")
+        print(f"🎯 Selected: Top 3 highest-scoring pages")
+        # Log the selected pages with scores
+        for i, (score, doc_id) in enumerate(top_3, 1):
+            page_num = doc_id + 1
+            relevance_level = self._get_relevance_level(score)
+            print(f"   {i}. Page {page_num:2d} (doc_id: {doc_id:2d}) | Score: {score:8.4f} | {relevance_level}")
+        # Calculate selection quality metrics
+        if top_3:
+            scores = [result[0] for result in top_3]
+            avg_score = sum(scores) / len(scores)
+            print(f"\n📊 TOP 3 SELECTION QUALITY:")
+            print(f"   Average score: {avg_score:.4f}")
+            print(f"   Highest score: {scores[0]:.4f}")
+            print(f"   Lowest score: {scores[-1]:.4f}")
+            print(f"   Score range: {scores[0] - scores[-1]:.4f}")
+        return top_3
+    def _select_relevant_pages_new_format(self, search_results, query, num_results):
+        """
+        Legacy function - kept for compatibility but now redirects to top 3 selection
+        """
+        return self._select_top_3_pages(search_results, query)
     def _select_highest_scoring_pages(self, sorted_results, query, num_results):
         """
                     placeholder="Ask about any topic in your documents...",
                     lines=2
                 )
+                # Removed number of pages input - always returns top 3 pages
+                gr.Markdown("🎯 **Top 3 Pages Mode**: System automatically returns the 3 highest-scoring pages")
                 search_btn = gr.Button("Search Documents", variant="primary")
                 gr.Markdown("### Results")
         # Query events
         search_btn.click(
             fn=app.search_documents,
+            inputs=[query_input],
             outputs=[path, images, llm_answer, cited_pages_display, csv_download, doc_download, excel_download]
         )

score_utilizer.py CHANGED Viewed

@@ -154,7 +154,7 @@ class ScoreUtilizer:
         return stats
-    def get_highest_scoring_pages(self, parsed_data: Dict, count: int = 5) -> List[Dict]:
         """
         Get the highest-scoring pages from parsed data
@@ -227,8 +227,8 @@ class ScoreUtilizer:
         report.append("=" * 60)
         # Top pages summary
-        top_pages = self.get_highest_scoring_pages(parsed_data, 5)
-        report.append(f"\n🏆 TOP 5 HIGHEST-SCORING PAGES:")
         for i, page in enumerate(top_pages, 1):
             report.append(f"   {i}. Page {page['page_number']} - Score: {page['score']:.4f} ({page['relevance_level']})")
@@ -253,10 +253,10 @@ class ScoreUtilizer:
         # Usage suggestions
         report.append(f"\n💡 USAGE SUGGESTIONS:")
-        report.append(f"   1. Feed top 3 pages to language model for focused responses")
-        report.append(f"   2. Use excellent pages for critical information extraction")
-        report.append(f"   3. Include very good pages for comprehensive analysis")
-        report.append(f"   4. Consider page diversity for balanced coverage")
         report.append("=" * 60)

         return stats
+    def get_highest_scoring_pages(self, parsed_data: Dict, count: int = 3) -> List[Dict]:
         """
         Get the highest-scoring pages from parsed data
         report.append("=" * 60)
         # Top pages summary
+        top_pages = self.get_highest_scoring_pages(parsed_data, 3)
+        report.append(f"\n🏆 TOP 3 HIGHEST-SCORING PAGES:")
         for i, page in enumerate(top_pages, 1):
             report.append(f"   {i}. Page {page['page_number']} - Score: {page['score']:.4f} ({page['relevance_level']})")
         # Usage suggestions
         report.append(f"\n💡 USAGE SUGGESTIONS:")
+        report.append(f"   1. System automatically uses top 3 pages for RAG responses")
+        report.append(f"   2. Excellent pages provide primary context")
+        report.append(f"   3. Very good pages ensure comprehensive coverage")
+        report.append(f"   4. Top 3 selection optimizes response quality")
         report.append("=" * 60)