Spaces:

awacke1
/

CodeCompetitionClaudeVsGPT

Sleeping

App Files Files Community

awacke1 committed on Dec 19, 2024

Commit

0b7e2f0

verified ·

1 Parent(s): ae1d609

Update app.py

Browse files

Files changed (1) hide show

app.py +220 -48

app.py CHANGED Viewed

@@ -37,39 +37,153 @@ if 'tts_voice' not in st.session_state:
 if 'arxiv_last_query' not in st.session_state:
     st.session_state['arxiv_last_query'] = ""
 class VideoSearch:
     def __init__(self):
         self.text_model = SentenceTransformer('all-MiniLM-L6-v2')
         self.load_dataset()
     def fetch_dataset_rows(self):
-        """Fetch dataset from Hugging Face API"""
         try:
-            url = "https://datasets-server.huggingface.co/first-rows?dataset=omegalabsinc%2Fomega-multimodal&config=default&split=train"
-            response = requests.get(url, timeout=30)
-            if response.status_code == 200:
-                data = response.json()
-                if 'rows' in data:
-                    processed_rows = []
-                    for row_data in data['rows']:
-                        row = row_data.get('row', row_data)
-                        for key in row:
-                            if any(term in key.lower() for term in ['embed', 'vector', 'encoding']):
-                                if isinstance(row[key], str):
-                                    try:
-                                        row[key] = [float(x.strip()) for x in row[key].strip('[]').split(',') if x.strip()]
-                                    except:
-                                        continue
-                        processed_rows.append(row)
-                    df = pd.DataFrame(processed_rows)
-                    st.session_state['search_columns'] = [col for col in df.columns
-                                                        if col not in ['video_embed', 'description_embed', 'audio_embed']]
-                    return df
             return self.load_example_data()
-        except:
             return self.load_example_data()
     def prepare_features(self):
         """Prepare embeddings with adaptive field detection"""
         try:
@@ -110,22 +224,6 @@ class VideoSearch:
             num_rows = len(self.dataset)
             self.video_embeds = np.random.randn(num_rows, 384)
             self.text_embeds = np.random.randn(num_rows, 384)
-    def load_example_data(self):
-        """Load example data as fallback"""
-        example_data = [
-            {
-                "video_id": "cd21da96-fcca-4c94-a60f-0b1e4e1e29fc",
-                "youtube_id": "IO-vwtyicn4",
-                "description": "This video shows a close-up of an ancient text carved into a surface.",
-                "views": 45489,
-                "start_time": 1452,
-                "end_time": 1458,
-                "video_embed": [0.014160037972033024, -0.003111184574663639, -0.016604168340563774],
-                "description_embed": [-0.05835828185081482, 0.02589797042310238, 0.11952091753482819]
-            }
-        ]
-        return pd.DataFrame(example_data)
     def load_dataset(self):
         self.dataset = self.fetch_dataset_rows()
@@ -174,9 +272,7 @@ async def generate_speech(text, voice=None):
         return None
 def transcribe_audio(audio_path):
-    """Placeholder for ASR transcription (no OpenAI/Anthropic).
-       Integrate your own ASR model or API here."""
-    # For now, just return a message:
     return "ASR not implemented. Integrate a local model or another service here."
 def show_file_manager():
@@ -215,12 +311,10 @@ def show_file_manager():
 def arxiv_search(query, max_results=5):
     """Perform a simple Arxiv search using their API and return top results."""
     base_url = "http://export.arxiv.org/api/query?"
-    # Encode the query
     search_url = base_url + f"search_query={quote(query)}&start=0&max_results={max_results}"
     r = requests.get(search_url)
     if r.status_code == 200:
         root = ET.fromstring(r.text)
-        # Namespace handling
         ns = {'atom': 'http://www.w3.org/2005/Atom'}
         entries = root.findall('atom:entry', ns)
         results = []
@@ -248,7 +342,6 @@ def perform_arxiv_lookup(q, vocal_summary=True, titles_summary=True, full_audio=
         if link:
             st.markdown(f"[View Paper]({link})")
-    # TTS Options
     if vocal_summary:
         spoken_text = f"Here are some Arxiv results for {q}. "
         if titles_summary:
@@ -278,7 +371,7 @@ def main():
     search = VideoSearch()
     # Create tabs
-    tab1, tab2, tab3, tab4 = st.tabs(["🔍 Search", "🎙️ Voice Input", "📚 Arxiv", "📂 Files"])
     # ---- Tab 1: Video Search ----
     with tab1:
@@ -332,7 +425,6 @@ def main():
     # ---- Tab 2: Voice Input ----
     with tab2:
         st.subheader("Voice Input")
         st.write("🎙️ Record your voice:")
         audio_bytes = audio_recorder()
         if audio_bytes:
@@ -373,6 +465,86 @@ def main():
     with tab4:
         show_file_manager()
     # Sidebar
     with st.sidebar:
         st.subheader("⚙️ Settings & History")
@@ -392,4 +564,4 @@ def main():
                      key="tts_voice")
 if __name__ == "__main__":
-    main()

 if 'arxiv_last_query' not in st.session_state:
     st.session_state['arxiv_last_query'] = ""
def fetch_dataset_info(dataset_id):
    """Return the Hub API metadata for *dataset_id*, or ``None`` on failure.

    Issues a GET against ``https://huggingface.co/api/datasets/<dataset_id>``
    with a 30 second timeout. The decoded JSON body is returned only for an
    HTTP 200 reply; any other status yields ``None``. Exceptions raised by
    the request or by JSON decoding are reported with ``st.warning`` and
    also yield ``None``.
    """
    endpoint = f"https://huggingface.co/api/datasets/{dataset_id}"
    try:
        reply = requests.get(endpoint, timeout=30)
        # Decoding stays inside the try so a malformed body is reported
        # through the same warning path as a network error.
        return reply.json() if reply.status_code == 200 else None
    except Exception as e:
        st.warning(f"Error fetching dataset info: {e}")
        return None
def fetch_dataset_rows(dataset_id, config="default", split="train", max_rows=100):
    """Fetch preview rows for one config/split from the datasets-server API.

    Args:
        dataset_id: Hugging Face dataset ID, e.g. ``"owner/name"``.
        config: Dataset configuration name.
        split: Split name within that configuration.
        max_rows: Upper bound on the number of rows returned. ``None``
            disables the cap; negative values are treated as 0.

    Returns:
        A list of row dicts. String values stored under keys whose name
        contains "embed", "vector" or "encoding" are parsed into lists of
        floats when they look like a comma-separated list; values that fail
        to parse are left untouched. Every row is tagged with ``_config``
        and ``_split``. An empty list is returned for a non-200 reply, for
        a payload without a ``rows`` key, or when an exception occurs (the
        exception is reported with ``st.warning``).
    """
    url = "https://datasets-server.huggingface.co/first-rows"
    # Let requests build the query string so the "/" in dataset IDs (and any
    # special characters in config/split names) are percent-encoded.
    query = {"dataset": dataset_id, "config": config, "split": split}
    embedding_terms = ('embed', 'vector', 'encoding')
    try:
        response = requests.get(url, params=query, timeout=30)
        if response.status_code != 200:
            return []
        data = response.json()
        if 'rows' not in data:
            return []

        raw_rows = data['rows']
        if max_rows is not None:
            # Previously this parameter was accepted but ignored.
            raw_rows = raw_rows[:max(max_rows, 0)]

        processed_rows = []
        for row_data in raw_rows:
            # Entries are normally wrapped as {"row": {...}}; fall back to
            # the entry itself when that wrapper is absent.
            row = row_data.get('row', row_data)
            # Assigning to an existing key does not resize the dict, so it
            # is safe to update values while iterating over items().
            for key, value in row.items():
                if not isinstance(value, str):
                    continue
                if not any(term in key.lower() for term in embedding_terms):
                    continue
                try:
                    row[key] = [
                        float(x.strip())
                        for x in value.strip('[]').split(',')
                        if x.strip()
                    ]
                except ValueError:
                    # Not a numeric list after all; keep the raw string.
                    continue
            row['_config'] = config
            row['_split'] = split
            processed_rows.append(row)
        return processed_rows
    except Exception as e:
        st.warning(f"Error fetching rows for {config}/{split}: {e}")
    return []
def search_dataset(dataset_id, search_text, include_configs=None, include_splits=None):
    """Search the preview rows of a dataset across its configs and splits.

    A row matches when ``search_text`` occurs (case-insensitively) in the
    space-joined string form of its ``str``/``int``/``float`` values. An
    empty or ``None`` ``search_text`` matches every row.

    Args:
        dataset_id (str): The Hugging Face dataset ID.
        search_text (str): Text to look for in the scalar fields of each row.
        include_configs (list): Specific configs to search, or None for all.
        include_splits (list): Specific splits to search, or None for all.

    Returns:
        tuple: ``(DataFrame of results, list of configs searched, sorted
        list of splits searched)``. Matching rows carry ``_matched_text``
        and ``_relevance_score`` (number of occurrences of the search text;
        0 when the search text is empty) and are ordered by descending
        score, ties keeping fetch order. When the dataset info cannot be
        fetched the result is ``(empty DataFrame, [], [])``.
    """
    dataset_info = fetch_dataset_info(dataset_id)
    if not dataset_info:
        return pd.DataFrame(), [], []

    # NOTE(review): confirm the Hub API payload actually exposes
    # 'config_names'; when it does not, only 'default' is searched.
    configs = include_configs if include_configs else dataset_info.get('config_names', ['default'])

    # Normalise the needle once instead of lowercasing it for every row.
    needle = (search_text or "").lower()
    splits_url = "https://datasets-server.huggingface.co/splits"
    all_rows = []
    available_splits = set()

    for config in configs:
        try:
            # Passing params lets requests percent-encode the "/" in the
            # dataset ID and any special characters in the config name.
            splits_response = requests.get(
                splits_url,
                params={"dataset": dataset_id, "config": config},
                timeout=30,
            )
            if splits_response.status_code != 200:
                # Surface the failure instead of silently skipping the config.
                st.warning(
                    f"Could not list splits for config {config} "
                    f"(HTTP {splits_response.status_code})"
                )
                continue

            splits_data = splits_response.json()
            splits = [split['split'] for split in splits_data.get('splits', [])]
            if not splits:
                splits = ['train']  # fallback to train if no splits found
            if include_splits:
                splits = [s for s in splits if s in include_splits]
            available_splits.update(splits)

            for split in splits:
                for row in fetch_dataset_rows(dataset_id, config, split):
                    text_content = ' '.join(
                        str(v) for v in row.values() if isinstance(v, (str, int, float))
                    )
                    haystack = text_content.lower()
                    if needle in haystack:
                        row['_matched_text'] = text_content
                        # str.count("") returns len + 1, which would rank
                        # rows by text length; an empty query scores 0.
                        row['_relevance_score'] = haystack.count(needle) if needle else 0
                        all_rows.append(row)
        except Exception as e:
            st.warning(f"Error processing config {config}: {e}")
            continue

    # Sorted so callers see a deterministic order rather than set order.
    split_names = sorted(available_splits)
    if all_rows:
        df = pd.DataFrame(all_rows)
        # mergesort is stable, so equally scored rows keep their fetch order.
        df = df.sort_values('_relevance_score', ascending=False, kind='mergesort')
        return df, configs, split_names
    return pd.DataFrame(), configs, split_names
 class VideoSearch:
     def __init__(self):
         self.text_model = SentenceTransformer('all-MiniLM-L6-v2')
+        self.dataset_id = "omegalabsinc/omega-multimodal"
         self.load_dataset()
     def fetch_dataset_rows(self):
+        """Fetch dataset with enhanced search capabilities"""
         try:
+            # First try to get all available data
+            df, configs, splits = search_dataset(
+                self.dataset_id,
+                "",  # empty search text to get all data
+                include_configs=None,  # all configs
+                include_splits=None    # all splits
+            )
+            if not df.empty:
+                st.session_state['search_columns'] = [col for col in df.columns
+                    if col not in ['video_embed', 'description_embed', 'audio_embed']
+                    and not col.startswith('_')]
+                return df
             return self.load_example_data()
+        except Exception as e:
+            st.warning(f"Error loading dataset: {e}")
             return self.load_example_data()
+    def load_example_data(self):
+        """Load example data as fallback"""
+        example_data = [
+            {
+                "video_id": "cd21da96-fcca-4c94-a60f-0b1e4e1e29fc",
+                "youtube_id": "IO-vwtyicn4",
+                "description": "This video shows a close-up of an ancient text carved into a surface.",
+                "views": 45489,
+                "start_time": 1452,
+                "end_time": 1458,
+                "video_embed": [0.014160037972033024, -0.003111184574663639, -0.016604168340563774],
+                "description_embed": [-0.05835828185081482, 0.02589797042310238, 0.11952091753482819]
+            }
+        ]
+        return pd.DataFrame(example_data)
     def prepare_features(self):
         """Prepare embeddings with adaptive field detection"""
         try:
             num_rows = len(self.dataset)
             self.video_embeds = np.random.randn(num_rows, 384)
             self.text_embeds = np.random.randn(num_rows, 384)
     def load_dataset(self):
         self.dataset = self.fetch_dataset_rows()
         return None
 def transcribe_audio(audio_path):
+    """Placeholder for ASR transcription"""
     return "ASR not implemented. Integrate a local model or another service here."
 def show_file_manager():
 def arxiv_search(query, max_results=5):
     """Perform a simple Arxiv search using their API and return top results."""
     base_url = "http://export.arxiv.org/api/query?"
     search_url = base_url + f"search_query={quote(query)}&start=0&max_results={max_results}"
     r = requests.get(search_url)
     if r.status_code == 200:
         root = ET.fromstring(r.text)
         ns = {'atom': 'http://www.w3.org/2005/Atom'}
         entries = root.findall('atom:entry', ns)
         results = []
         if link:
             st.markdown(f"[View Paper]({link})")
     if vocal_summary:
         spoken_text = f"Here are some Arxiv results for {q}. "
         if titles_summary:
     search = VideoSearch()
     # Create tabs
+    tab1, tab2, tab3, tab4, tab5 = st.tabs(["🔍 Search", "🎙️ Voice Input", "📚 Arxiv", "📂 Files", "🔍 Advanced Search"])
     # ---- Tab 1: Video Search ----
     with tab1:
     # ---- Tab 2: Voice Input ----
     with tab2:
         st.subheader("Voice Input")
         st.write("🎙️ Record your voice:")
         audio_bytes = audio_recorder()
         if audio_bytes:
     with tab4:
         show_file_manager()
+    # ---- Tab 5: Advanced Dataset Search ----
+    with tab5:
+        st.subheader("Advanced Dataset Search")
+        # Dataset input
+        dataset_id = st.text_input("Dataset ID:", value="omegalabsinc/omega-multimodal")
+        # Search configuration
+        col1, col2 = st.columns([2, 1])
+        with col1:
+            search_text = st.text_input("Search text:",
+                placeholder="Enter text to search across all fields")
+        # Get available configs and splits
+        if dataset_id:
+            dataset_info = fetch_dataset_info(dataset_id)
+            if dataset_info:
+                configs = dataset_info.get('config_names', ['default'])
+                with col2:
+                    selected_configs = st.multiselect(
+                        "Configurations:",
+                        options=configs,
+                        default=['default'] if 'default' in configs else None
+                    )
+                # Fetch available splits
+                if selected_configs:
+                    all_splits = set()
+                    for config in selected_configs:
+                        splits_url = f"https://datasets-server.huggingface.co/splits?dataset={dataset_id}&config={config}"
+                        try:
+                            response = requests.get(splits_url, timeout=30)
+                            if response.status_code == 200:
+                                splits_data = response.json()
+                                splits = [split['split'] for split in splits_data.get('splits', [])]
+                                all_splits.update(splits)
+                        except Exception as e:
+                            st.warning(f"Error fetching splits for {config}: {e}")
+                    selected_splits = st.multiselect(
+                        "Splits:",
+                        options=list(all_splits),
+                        default=['train'] if 'train' in all_splits else None
+                    )
+                    if st.button("🔍 Search Dataset"):
+                        with st.spinner("Searching dataset..."):
+                            results_df, _, _ = search_dataset(
+                                dataset_id,
+                                search_text,
+                                include_configs=selected_configs,
+                                include_splits=selected_splits
+                            )
+                            if not results_df.empty:
+                                st.write(f"Found {len(results_df)} results")
+                                # Display results in expandable sections
+                                for idx, row in results_df.iterrows():
+                                    with st.expander(
+                                        f"Result {idx+1} (Config: {row['_config']}, Split: {row['_split']}, Score: {row['_relevance_score']})"
+                                    ):
+                                        # Display all fields except internal ones
+                                        for col in row.index:
+                                            if not col.startswith('_') and not any(
+                                                term in col.lower()
+                                                for term in ['embed', 'vector', 'encoding']
+                                            ):
+                                                st.write(f"**{col}:** {row[col]}")
+                                        # Add buttons for audio/video if available
+                                        if 'youtube_id' in row:
+                                            st.video(
+                                                f"https://youtube.com/watch?v={row['youtube_id']}&t={row.get('start_time', 0)}"
+                                            )
+                            else:
+                                st.warning("No results found.")
+            else:
+                st.error("Unable to fetch dataset information.")
     # Sidebar
     with st.sidebar:
         st.subheader("⚙️ Settings & History")
                      key="tts_voice")
 if __name__ == "__main__":
+    main()