Kazel committed on
Commit
af476a6
·
1 Parent(s): 4e19c9e
Files changed (2) hide show
  1. app.py +50 -37
  2. score_utilizer.py +7 -7
app.py CHANGED
@@ -538,8 +538,9 @@ class PDFSearchApp:
538
  return str(e)
539
 
540
 
541
- def search_documents(self, query, num_results):
542
  print(f"Searching for query: {query}")
 
543
 
544
  if not query:
545
  print("Please enter a search query")
@@ -594,22 +595,15 @@ class PDFSearchApp:
594
 
595
  middleware = Middleware(collection_name, create_collection=False)
596
 
597
- # Enhanced multi-page retrieval with vision-guided chunking approach
598
- # Get more results than requested to allow for intelligent filtering
599
- # Request 3x the number of results for better selection
600
- search_results = middleware.search([query], topk=max(num_results * 3, 20))[0]
601
 
602
- # 🎯 DYNAMIC OPTIMIZATION: Determine optimal page count based on query complexity
603
- query_complexity = self._analyze_query_complexity(query)
604
- optimal_count = self.get_optimal_page_count(search_results, query_complexity)
605
-
606
- # Use the optimal count if it's different from requested
607
- if optimal_count != num_results:
608
- print(f"\n🎯 DYNAMIC OPTIMIZATION APPLIED:")
609
- print(f" Requested pages: {num_results}")
610
- print(f" Optimal pages: {optimal_count}")
611
- print(f" Query complexity: {query_complexity}")
612
- num_results = optimal_count
613
 
614
  # 📊 COMPREHENSIVE SEARCH RESULTS LOGGING
615
  print(f"\n🔍 SEARCH RESULTS SUMMARY")
@@ -652,12 +646,12 @@ class PDFSearchApp:
652
  if not search_results:
653
  return "No search results found", "--", "No search results found for your query", [], None, None, None, None
654
 
655
- # Implement intelligent multi-page selection based on research
656
- selected_results = self._select_relevant_pages_new_format(search_results, query, num_results)
657
 
658
  # 📊 SELECTION LOGGING - Show which pages were selected
659
  print(f"\n🎯 PAGE SELECTION RESULTS")
660
- print(f"📄 Requested: {num_results} pages")
661
  print(f"📄 Selected: {len(selected_results)} pages")
662
  print(f"📄 Selection rate: {len(selected_results)/len(search_results)*100:.1f}% of available results")
663
  print("-" * 60)
@@ -814,23 +808,47 @@ class PDFSearchApp:
814
  # Return exactly 7 outputs to match Gradio expectations
815
  return error_msg, "--", error_msg, [], None, None, None, None
816
 
817
- def _select_relevant_pages_new_format(self, search_results, query, num_results):
818
  """
819
- Intelligent page selection for new Milvus format: (score, doc_id)
820
- Enhanced to automatically use highest-scoring pages with dynamic thresholds
821
  """
822
- if len(search_results) <= num_results:
823
- return search_results
824
 
825
- # Sort by relevance score
826
  sorted_results = sorted(search_results, key=lambda x: x[0], reverse=True)
827
 
828
- # 🎯 ENHANCED SELECTION: Use highest-scoring pages with dynamic thresholds
829
- selected = self._select_highest_scoring_pages(sorted_results, query, num_results)
830
 
831
- print(f"Requested {num_results} pages, selected {len(selected)} pages using enhanced scoring")
 
 
832
 
833
- return selected
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
834
 
835
  def _select_highest_scoring_pages(self, sorted_results, query, num_results):
836
  """
@@ -3436,13 +3454,8 @@ def create_ui():
3436
  placeholder="Ask about any topic in your documents...",
3437
  lines=2
3438
  )
3439
- num_results = gr.Slider(
3440
- minimum=1,
3441
- maximum=10,
3442
- value=3,
3443
- step=1,
3444
- label="Number of pages to retrieve and cite"
3445
- )
3446
  search_btn = gr.Button("Search Documents", variant="primary")
3447
 
3448
  gr.Markdown("### Results")
@@ -3502,7 +3515,7 @@ def create_ui():
3502
  # Query events
3503
  search_btn.click(
3504
  fn=app.search_documents,
3505
- inputs=[query_input, num_results],
3506
  outputs=[path, images, llm_answer, cited_pages_display, csv_download, doc_download, excel_download]
3507
  )
3508
 
 
538
  return str(e)
539
 
540
 
541
+ def search_documents(self, query):
542
  print(f"Searching for query: {query}")
543
+ print(f"🎯 MODE: Returning only TOP 3 highest-scoring pages")
544
 
545
  if not query:
546
  print("Please enter a search query")
 
595
 
596
  middleware = Middleware(collection_name, create_collection=False)
597
 
598
+ # 🎯 TOP 3 PAGES MODE: Always return only the top 3 highest-scoring pages
599
+ # Get more results than needed to allow for intelligent filtering
600
+ search_results = middleware.search([query], topk=20)[0] # Get 20 results for better selection
 
601
 
602
+ # Fixed to always return top 3 pages
603
+ num_results = 3
604
+ print(f"\n🎯 TOP 3 PAGES MODE:")
605
+ print(f" Always returning: {num_results} highest-scoring pages")
606
+ print(f" Selection strategy: Score-based prioritization")
 
 
 
 
 
 
607
 
608
  # 📊 COMPREHENSIVE SEARCH RESULTS LOGGING
609
  print(f"\n🔍 SEARCH RESULTS SUMMARY")
 
646
  if not search_results:
647
  return "No search results found", "--", "No search results found for your query", [], None, None, None, None
648
 
649
+ # 🎯 TOP 3 SELECTION: Always select exactly the top 3 highest-scoring pages
650
+ selected_results = self._select_top_3_pages(search_results, query)
651
 
652
  # 📊 SELECTION LOGGING - Show which pages were selected
653
  print(f"\n🎯 PAGE SELECTION RESULTS")
654
+ print(f"📄 Mode: Top 3 highest-scoring pages")
655
  print(f"📄 Selected: {len(selected_results)} pages")
656
  print(f"📄 Selection rate: {len(selected_results)/len(search_results)*100:.1f}% of available results")
657
  print("-" * 60)
 
808
  # Return exactly 7 outputs to match Gradio expectations
809
  return error_msg, "--", error_msg, [], None, None, None, None
810
 
811
+ def _select_top_3_pages(self, search_results, query):
812
  """
813
+ Select exactly the top 3 highest-scoring pages
814
+ Simplified selection focused on the best 3 pages only
815
  """
816
+ if not search_results:
817
+ return []
818
 
819
+ # Sort by relevance score (highest first)
820
  sorted_results = sorted(search_results, key=lambda x: x[0], reverse=True)
821
 
822
+ # Always return exactly the top 3 pages
823
+ top_3 = sorted_results[:3]
824
 
825
+ print(f"\n🎯 TOP 3 PAGES SELECTION:")
826
+ print(f"📊 Total available results: {len(search_results)}")
827
+ print(f"🎯 Selected: Top 3 highest-scoring pages")
828
 
829
+ # Log the selected pages with scores
830
+ for i, (score, doc_id) in enumerate(top_3, 1):
831
+ page_num = doc_id + 1
832
+ relevance_level = self._get_relevance_level(score)
833
+ print(f" {i}. Page {page_num:2d} (doc_id: {doc_id:2d}) | Score: {score:8.4f} | {relevance_level}")
834
+
835
+ # Calculate selection quality metrics
836
+ if top_3:
837
+ scores = [result[0] for result in top_3]
838
+ avg_score = sum(scores) / len(scores)
839
+ print(f"\n📊 TOP 3 SELECTION QUALITY:")
840
+ print(f" Average score: {avg_score:.4f}")
841
+ print(f" Highest score: {scores[0]:.4f}")
842
+ print(f" Lowest score: {scores[-1]:.4f}")
843
+ print(f" Score range: {scores[0] - scores[-1]:.4f}")
844
+
845
+ return top_3
846
+
847
+ def _select_relevant_pages_new_format(self, search_results, query, num_results):
848
+ """
849
+ Legacy function - kept for compatibility but now redirects to top 3 selection
850
+ """
851
+ return self._select_top_3_pages(search_results, query)
852
 
853
  def _select_highest_scoring_pages(self, sorted_results, query, num_results):
854
  """
 
3454
  placeholder="Ask about any topic in your documents...",
3455
  lines=2
3456
  )
3457
+ # Removed number of pages input - always returns top 3 pages
3458
+ gr.Markdown("🎯 **Top 3 Pages Mode**: System automatically returns the 3 highest-scoring pages")
 
 
 
 
 
3459
  search_btn = gr.Button("Search Documents", variant="primary")
3460
 
3461
  gr.Markdown("### Results")
 
3515
  # Query events
3516
  search_btn.click(
3517
  fn=app.search_documents,
3518
+ inputs=[query_input],
3519
  outputs=[path, images, llm_answer, cited_pages_display, csv_download, doc_download, excel_download]
3520
  )
3521
 
score_utilizer.py CHANGED
@@ -154,7 +154,7 @@ class ScoreUtilizer:
154
 
155
  return stats
156
 
157
- def get_highest_scoring_pages(self, parsed_data: Dict, count: int = 5) -> List[Dict]:
158
  """
159
  Get the highest-scoring pages from parsed data
160
 
@@ -227,8 +227,8 @@ class ScoreUtilizer:
227
  report.append("=" * 60)
228
 
229
  # Top pages summary
230
- top_pages = self.get_highest_scoring_pages(parsed_data, 5)
231
- report.append(f"\n🏆 TOP 5 HIGHEST-SCORING PAGES:")
232
  for i, page in enumerate(top_pages, 1):
233
  report.append(f" {i}. Page {page['page_number']} - Score: {page['score']:.4f} ({page['relevance_level']})")
234
 
@@ -253,10 +253,10 @@ class ScoreUtilizer:
253
 
254
  # Usage suggestions
255
  report.append(f"\n💡 USAGE SUGGESTIONS:")
256
- report.append(f" 1. Feed top 3 pages to language model for focused responses")
257
- report.append(f" 2. Use excellent pages for critical information extraction")
258
- report.append(f" 3. Include very good pages for comprehensive analysis")
259
- report.append(f" 4. Consider page diversity for balanced coverage")
260
 
261
  report.append("=" * 60)
262
 
 
154
 
155
  return stats
156
 
157
+ def get_highest_scoring_pages(self, parsed_data: Dict, count: int = 3) -> List[Dict]:
158
  """
159
  Get the highest-scoring pages from parsed data
160
 
 
227
  report.append("=" * 60)
228
 
229
  # Top pages summary
230
+ top_pages = self.get_highest_scoring_pages(parsed_data, 3)
231
+ report.append(f"\n🏆 TOP 3 HIGHEST-SCORING PAGES:")
232
  for i, page in enumerate(top_pages, 1):
233
  report.append(f" {i}. Page {page['page_number']} - Score: {page['score']:.4f} ({page['relevance_level']})")
234
 
 
253
 
254
  # Usage suggestions
255
  report.append(f"\n💡 USAGE SUGGESTIONS:")
256
+ report.append(f" 1. System automatically uses top 3 pages for RAG responses")
257
+ report.append(f" 2. Excellent pages provide primary context")
258
+ report.append(f" 3. Very good pages ensure comprehensive coverage")
259
+ report.append(f" 4. Top 3 selection optimizes response quality")
260
 
261
  report.append("=" * 60)
262