Spaces:

awacke1
/

CodeCompetitionClaudeVsGPT

Sleeping

App Files Files Community

awacke1 committed on Dec 20, 2024

Commit

fcc5344

verified ·

1 Parent(s): a2236b2

Update app.py

Browse files

Files changed (1) hide show

app.py +61 -15

app.py CHANGED Viewed

@@ -124,14 +124,14 @@ class FastDatasetSearcher:
         return load_dataset_page(self.dataset_id, self.token, page, ROWS_PER_PAGE)
     def quick_search(self, query, df):
-        """Enhanced search with improved relevance filtering"""
         if df.empty or not query.strip():
             return df
         try:
-            # Define relevance thresholds
-            MIN_KEYWORD_MATCHES = 0.1
-            MIN_SEMANTIC_SCORE = 0.3
             # Get searchable columns
             searchable_cols = []
@@ -150,34 +150,55 @@ class FastDatasetSearcher:
             for _, row in df.iterrows():
                 text_parts = []
                 row_matched = False
-                # Check for direct matches
-                for col in searchable_cols:
                     val = row[col]
                     if val is not None:
                         val_str = str(val).lower()
-                        if any(term in val_str for term in query_terms):
                             row_matched = True
                         text_parts.append(str(val))
                 text = ' '.join(text_parts)
                 if text.strip():
-                    # Calculate term-based keyword score
-                    text_terms = set(text.lower().split())
-                    matching_terms = query_terms.intersection(text_terms)
                     keyword_score = len(matching_terms) / len(query_terms)
                     # Calculate semantic score
                     text_embedding = self.text_model.encode([text], show_progress_bar=False)[0]
                     semantic_score = float(cosine_similarity([query_embedding], [text_embedding])[0][0])
-                    # Weighted combination
-                    combined_score = 0.7 * keyword_score + 0.3 * semantic_score
-                    # Boost exact matches
-                    if row_matched:
-                        combined_score *= 1.5
                 else:
                     combined_score = 0.0
                     row_matched = False
@@ -460,6 +481,7 @@ def perform_arxiv_lookup(query, vocal_summary=True, titles_summary=True, full_au
             st.audio(audio_file_full)
 def render_result(result):
     score = result.get('relevance_score', 0)
     result_filtered = {k: v for k, v in result.items()
                       if k not in ['relevance_score', 'video_embed', 'description_embed', 'audio_embed']}
@@ -469,12 +491,36 @@ def render_result(result):
     cols = st.columns([2, 1])
     with cols[0]:
         for key, value in result_filtered.items():
             if isinstance(value, (str, int, float)):
                 st.write(f"**{key}:** {value}")
     with cols[1]:
         st.metric("Relevance Score", f"{score:.2%}")
 def main():
     st.title("🎥 Advanced Video & Dataset Search with Voice")

         return load_dataset_page(self.dataset_id, self.token, page, ROWS_PER_PAGE)
     def quick_search(self, query, df):
+        """Enhanced search with strict token matching and semantic relevance"""
         if df.empty or not query.strip():
             return df
         try:
+            # Define stricter thresholds
+            MIN_SEMANTIC_SCORE = 0.5  # Higher semantic threshold
+            EXACT_MATCH_BOOST = 2.0   # Boost for exact matches
             # Get searchable columns
             searchable_cols = []
             for _, row in df.iterrows():
                 text_parts = []
                 row_matched = False
+                exact_match = False
+                # Prioritize description and matched_text fields
+                priority_fields = ['description', 'matched_text']
+                other_fields = [col for col in searchable_cols if col not in priority_fields]
+                # First check priority fields for exact matches
+                for col in priority_fields:
+                    if col in row:
+                        val = row[col]
+                        if val is not None:
+                            val_str = str(val).lower()
+                            # Check for exact token matches
+                            if query_lower in val_str.split():
+                                exact_match = True
+                            if any(term in val_str.split() for term in query_terms):
+                                row_matched = True
+                            text_parts.append(str(val))
+                # Then check other fields
+                for col in other_fields:
                     val = row[col]
                     if val is not None:
                         val_str = str(val).lower()
+                        if query_lower in val_str.split():
+                            exact_match = True
+                        if any(term in val_str.split() for term in query_terms):
                             row_matched = True
                         text_parts.append(str(val))
                 text = ' '.join(text_parts)
                 if text.strip():
+                    # Calculate exact token matches
+                    text_tokens = set(text.lower().split())
+                    matching_terms = query_terms.intersection(text_tokens)
                     keyword_score = len(matching_terms) / len(query_terms)
                     # Calculate semantic score
                     text_embedding = self.text_model.encode([text], show_progress_bar=False)[0]
                     semantic_score = float(cosine_similarity([query_embedding], [text_embedding])[0][0])
+                    # Weighted scoring with priority for exact matches
+                    combined_score = 0.8 * keyword_score + 0.2 * semantic_score
+                    if exact_match:
+                        combined_score *= EXACT_MATCH_BOOST
+                    elif row_matched:
+                        combined_score *= 1.2
                 else:
                     combined_score = 0.0
                     row_matched = False
             st.audio(audio_file_full)
 def render_result(result):
+    """Render a search result with voice selection and TTS options"""
     score = result.get('relevance_score', 0)
     result_filtered = {k: v for k, v in result.items()
                       if k not in ['relevance_score', 'video_embed', 'description_embed', 'audio_embed']}
     cols = st.columns([2, 1])
     with cols[0]:
+        text_content = []  # Collect text for TTS
         for key, value in result_filtered.items():
             if isinstance(value, (str, int, float)):
                 st.write(f"**{key}:** {value}")
+                if isinstance(value, str) and len(value.strip()) > 0:
+                    text_content.append(f"{key}: {value}")
     with cols[1]:
         st.metric("Relevance Score", f"{score:.2%}")
+        # Voice selection for TTS
+        voices = {
+            "Aria (US Female)": "en-US-AriaNeural",
+            "Guy (US Male)": "en-US-GuyNeural",
+            "Sonia (UK Female)": "en-GB-SoniaNeural",
+            "Tony (UK Male)": "en-GB-TonyNeural",
+            "Jenny (US Female)": "en-US-JennyNeural"
+        }
+        selected_voice = st.selectbox(
+            "Select Voice",
+            list(voices.keys()),
+            key=f"voice_{result.get('video_id', '')}"
+        )
+        if st.button("🔊 Read Description", key=f"read_{result.get('video_id', '')}"):
+            text_to_read = ". ".join(text_content)
+            audio_file = asyncio.run(generate_speech(text_to_read, voices[selected_voice]))
+            if audio_file:
+                st.audio(audio_file)
 def main():
     st.title("🎥 Advanced Video & Dataset Search with Voice")