Spaces:
Running
on
Zero
Running
on
Zero
logging
Browse files- app.py +50 -37
- score_utilizer.py +7 -7
app.py
CHANGED
@@ -538,8 +538,9 @@ class PDFSearchApp:
|
|
538 |
return str(e)
|
539 |
|
540 |
|
541 |
-
def search_documents(self, query
|
542 |
print(f"Searching for query: {query}")
|
|
|
543 |
|
544 |
if not query:
|
545 |
print("Please enter a search query")
|
@@ -594,22 +595,15 @@ class PDFSearchApp:
|
|
594 |
|
595 |
middleware = Middleware(collection_name, create_collection=False)
|
596 |
|
597 |
-
#
|
598 |
-
# Get more results than
|
599 |
-
|
600 |
-
search_results = middleware.search([query], topk=max(num_results * 3, 20))[0]
|
601 |
|
602 |
-
#
|
603 |
-
|
604 |
-
|
605 |
-
|
606 |
-
|
607 |
-
if optimal_count != num_results:
|
608 |
-
print(f"\n🎯 DYNAMIC OPTIMIZATION APPLIED:")
|
609 |
-
print(f" Requested pages: {num_results}")
|
610 |
-
print(f" Optimal pages: {optimal_count}")
|
611 |
-
print(f" Query complexity: {query_complexity}")
|
612 |
-
num_results = optimal_count
|
613 |
|
614 |
# π COMPREHENSIVE SEARCH RESULTS LOGGING
|
615 |
print(f"\nπ SEARCH RESULTS SUMMARY")
|
@@ -652,12 +646,12 @@ class PDFSearchApp:
|
|
652 |
if not search_results:
|
653 |
return "No search results found", "--", "No search results found for your query", [], None, None, None, None
|
654 |
|
655 |
-
#
|
656 |
-
selected_results = self.
|
657 |
|
658 |
# π SELECTION LOGGING - Show which pages were selected
|
659 |
print(f"\n🎯 PAGE SELECTION RESULTS")
|
660 |
-
print(f"π
|
661 |
print(f"π Selected: {len(selected_results)} pages")
|
662 |
print(f"π Selection rate: {len(selected_results)/len(search_results)*100:.1f}% of available results")
|
663 |
print("-" * 60)
|
@@ -814,23 +808,47 @@ class PDFSearchApp:
|
|
814 |
# Return exactly 7 outputs to match Gradio expectations
|
815 |
return error_msg, "--", error_msg, [], None, None, None, None
|
816 |
|
817 |
-
def
|
818 |
"""
|
819 |
-
|
820 |
-
|
821 |
"""
|
822 |
-
if
|
823 |
-
return
|
824 |
|
825 |
-
# Sort by relevance score
|
826 |
sorted_results = sorted(search_results, key=lambda x: x[0], reverse=True)
|
827 |
|
828 |
-
#
|
829 |
-
|
830 |
|
831 |
-
print(f"
|
|
|
|
|
832 |
|
833 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
834 |
|
835 |
def _select_highest_scoring_pages(self, sorted_results, query, num_results):
|
836 |
"""
|
@@ -3436,13 +3454,8 @@ def create_ui():
|
|
3436 |
placeholder="Ask about any topic in your documents...",
|
3437 |
lines=2
|
3438 |
)
|
3439 |
-
|
3440 |
-
|
3441 |
-
maximum=10,
|
3442 |
-
value=3,
|
3443 |
-
step=1,
|
3444 |
-
label="Number of pages to retrieve and cite"
|
3445 |
-
)
|
3446 |
search_btn = gr.Button("Search Documents", variant="primary")
|
3447 |
|
3448 |
gr.Markdown("### Results")
|
@@ -3502,7 +3515,7 @@ def create_ui():
|
|
3502 |
# Query events
|
3503 |
search_btn.click(
|
3504 |
fn=app.search_documents,
|
3505 |
-
inputs=[query_input
|
3506 |
outputs=[path, images, llm_answer, cited_pages_display, csv_download, doc_download, excel_download]
|
3507 |
)
|
3508 |
|
|
|
538 |
return str(e)
|
539 |
|
540 |
|
541 |
+
def search_documents(self, query):
|
542 |
print(f"Searching for query: {query}")
|
543 |
+
print(f"🎯 MODE: Returning only TOP 3 highest-scoring pages")
|
544 |
|
545 |
if not query:
|
546 |
print("Please enter a search query")
|
|
|
595 |
|
596 |
middleware = Middleware(collection_name, create_collection=False)
|
597 |
|
598 |
+
# 🎯 TOP 3 PAGES MODE: Always return only the top 3 highest-scoring pages
|
599 |
+
# Get more results than needed to allow for intelligent filtering
|
600 |
+
search_results = middleware.search([query], topk=20)[0] # Get 20 results for better selection
|
|
|
601 |
|
602 |
+
# Fixed to always return top 3 pages
|
603 |
+
num_results = 3
|
604 |
+
print(f"\n🎯 TOP 3 PAGES MODE:")
|
605 |
+
print(f" Always returning: {num_results} highest-scoring pages")
|
606 |
+
print(f" Selection strategy: Score-based prioritization")
|
|
|
|
|
|
|
|
|
|
|
|
|
607 |
|
608 |
# π COMPREHENSIVE SEARCH RESULTS LOGGING
|
609 |
print(f"\nπ SEARCH RESULTS SUMMARY")
|
|
|
646 |
if not search_results:
|
647 |
return "No search results found", "--", "No search results found for your query", [], None, None, None, None
|
648 |
|
649 |
+
# 🎯 TOP 3 SELECTION: Always select exactly the top 3 highest-scoring pages
|
650 |
+
selected_results = self._select_top_3_pages(search_results, query)
|
651 |
|
652 |
# π SELECTION LOGGING - Show which pages were selected
|
653 |
print(f"\n🎯 PAGE SELECTION RESULTS")
|
654 |
+
print(f"π Mode: Top 3 highest-scoring pages")
|
655 |
print(f"π Selected: {len(selected_results)} pages")
|
656 |
print(f"π Selection rate: {len(selected_results)/len(search_results)*100:.1f}% of available results")
|
657 |
print("-" * 60)
|
|
|
808 |
# Return exactly 7 outputs to match Gradio expectations
|
809 |
return error_msg, "--", error_msg, [], None, None, None, None
|
810 |
|
811 |
+
def _select_top_3_pages(self, search_results, query):
|
812 |
"""
|
813 |
+
Select exactly the top 3 highest-scoring pages
|
814 |
+
Simplified selection focused on the best 3 pages only
|
815 |
"""
|
816 |
+
if not search_results:
|
817 |
+
return []
|
818 |
|
819 |
+
# Sort by relevance score (highest first)
|
820 |
sorted_results = sorted(search_results, key=lambda x: x[0], reverse=True)
|
821 |
|
822 |
+
# Always return exactly the top 3 pages
|
823 |
+
top_3 = sorted_results[:3]
|
824 |
|
825 |
+
print(f"\n🎯 TOP 3 PAGES SELECTION:")
|
826 |
+
print(f"π Total available results: {len(search_results)}")
|
827 |
+
print(f"🎯 Selected: Top 3 highest-scoring pages")
|
828 |
|
829 |
+
# Log the selected pages with scores
|
830 |
+
for i, (score, doc_id) in enumerate(top_3, 1):
|
831 |
+
page_num = doc_id + 1
|
832 |
+
relevance_level = self._get_relevance_level(score)
|
833 |
+
print(f" {i}. Page {page_num:2d} (doc_id: {doc_id:2d}) | Score: {score:8.4f} | {relevance_level}")
|
834 |
+
|
835 |
+
# Calculate selection quality metrics
|
836 |
+
if top_3:
|
837 |
+
scores = [result[0] for result in top_3]
|
838 |
+
avg_score = sum(scores) / len(scores)
|
839 |
+
print(f"\nπ TOP 3 SELECTION QUALITY:")
|
840 |
+
print(f" Average score: {avg_score:.4f}")
|
841 |
+
print(f" Highest score: {scores[0]:.4f}")
|
842 |
+
print(f" Lowest score: {scores[-1]:.4f}")
|
843 |
+
print(f" Score range: {scores[0] - scores[-1]:.4f}")
|
844 |
+
|
845 |
+
return top_3
|
846 |
+
|
847 |
+
def _select_relevant_pages_new_format(self, search_results, query, num_results):
|
848 |
+
"""
|
849 |
+
Legacy function - kept for compatibility but now redirects to top 3 selection
|
850 |
+
"""
|
851 |
+
return self._select_top_3_pages(search_results, query)
|
852 |
|
853 |
def _select_highest_scoring_pages(self, sorted_results, query, num_results):
|
854 |
"""
|
|
|
3454 |
placeholder="Ask about any topic in your documents...",
|
3455 |
lines=2
|
3456 |
)
|
3457 |
+
# Removed number of pages input - always returns top 3 pages
|
3458 |
+
gr.Markdown("🎯 **Top 3 Pages Mode**: System automatically returns the 3 highest-scoring pages")
|
|
|
|
|
|
|
|
|
|
|
3459 |
search_btn = gr.Button("Search Documents", variant="primary")
|
3460 |
|
3461 |
gr.Markdown("### Results")
|
|
|
3515 |
# Query events
|
3516 |
search_btn.click(
|
3517 |
fn=app.search_documents,
|
3518 |
+
inputs=[query_input],
|
3519 |
outputs=[path, images, llm_answer, cited_pages_display, csv_download, doc_download, excel_download]
|
3520 |
)
|
3521 |
|
score_utilizer.py
CHANGED
@@ -154,7 +154,7 @@ class ScoreUtilizer:
|
|
154 |
|
155 |
return stats
|
156 |
|
157 |
-
def get_highest_scoring_pages(self, parsed_data: Dict, count: int =
|
158 |
"""
|
159 |
Get the highest-scoring pages from parsed data
|
160 |
|
@@ -227,8 +227,8 @@ class ScoreUtilizer:
|
|
227 |
report.append("=" * 60)
|
228 |
|
229 |
# Top pages summary
|
230 |
-
top_pages = self.get_highest_scoring_pages(parsed_data,
|
231 |
-
report.append(f"\nπ TOP
|
232 |
for i, page in enumerate(top_pages, 1):
|
233 |
report.append(f" {i}. Page {page['page_number']} - Score: {page['score']:.4f} ({page['relevance_level']})")
|
234 |
|
@@ -253,10 +253,10 @@ class ScoreUtilizer:
|
|
253 |
|
254 |
# Usage suggestions
|
255 |
report.append(f"\n💡 USAGE SUGGESTIONS:")
|
256 |
-
report.append(f" 1.
|
257 |
-
report.append(f" 2.
|
258 |
-
report.append(f" 3.
|
259 |
-
report.append(f" 4.
|
260 |
|
261 |
report.append("=" * 60)
|
262 |
|
|
|
154 |
|
155 |
return stats
|
156 |
|
157 |
+
def get_highest_scoring_pages(self, parsed_data: Dict, count: int = 3) -> List[Dict]:
|
158 |
"""
|
159 |
Get the highest-scoring pages from parsed data
|
160 |
|
|
|
227 |
report.append("=" * 60)
|
228 |
|
229 |
# Top pages summary
|
230 |
+
top_pages = self.get_highest_scoring_pages(parsed_data, 3)
|
231 |
+
report.append(f"\nπ TOP 3 HIGHEST-SCORING PAGES:")
|
232 |
for i, page in enumerate(top_pages, 1):
|
233 |
report.append(f" {i}. Page {page['page_number']} - Score: {page['score']:.4f} ({page['relevance_level']})")
|
234 |
|
|
|
253 |
|
254 |
# Usage suggestions
|
255 |
report.append(f"\n💡 USAGE SUGGESTIONS:")
|
256 |
+
report.append(f" 1. System automatically uses top 3 pages for RAG responses")
|
257 |
+
report.append(f" 2. Excellent pages provide primary context")
|
258 |
+
report.append(f" 3. Very good pages ensure comprehensive coverage")
|
259 |
+
report.append(f" 4. Top 3 selection optimizes response quality")
|
260 |
|
261 |
report.append("=" * 60)
|
262 |
|