Update search_utils.py
search_utils.py
CHANGED  +41 -200
@@ -24,208 +24,49 @@ logger = logging.getLogger("MetadataManager")

 class MetadataManager:
     def __init__(self):
-        self.
-        self.
-        self.shard_map = {}
-        self.loaded_shards = {}
+        self.metadata_path = Path("combined.parquet")
+        self.df = None
         self.total_docs = 0
-        self.api_cache = {}

         logger.info("Initializing MetadataManager")
-        self.
-        self._unzip_if_needed()
-        self._build_shard_map()
+        self._load_metadata()
         logger.info(f"Total documents indexed: {self.total_docs}")
-
-
-
-        "
-        self.cache_dir.mkdir(parents=True, exist_ok=True)
-        self.shard_dir.mkdir(parents=True, exist_ok=True)
-
-    def _unzip_if_needed(self):
-        """Extract the ZIP archive if no parquet files are found."""
-        zip_path = Path("metadata_shards.zip")
-        if not any(self.shard_dir.rglob("*.parquet")):
-            logger.info("No parquet files found, checking for zip archive")
-            if not zip_path.exists():
-                raise FileNotFoundError(f"Metadata ZIP file not found at {zip_path}")
-            logger.info(f"Extracting {zip_path} to {self.shard_dir}")
-            try:
-                with zipfile.ZipFile(zip_path, 'r') as zip_ref:
-                    zip_root = self._get_zip_root(zip_ref)
-                    zip_ref.extractall(self.shard_dir)
-                    if zip_root:
-                        nested_dir = self.shard_dir / zip_root
-                        if nested_dir.exists():
-                            self._flatten_directory(nested_dir, self.shard_dir)
-                            nested_dir.rmdir()
-                parquet_files = list(self.shard_dir.rglob("*.parquet"))
-                if not parquet_files:
-                    raise RuntimeError("Extraction completed but no parquet files found")
-                logger.info(f"Found {len(parquet_files)} parquet files after extraction")
-            except Exception as e:
-                logger.error(f"Failed to extract zip file: {str(e)}")
-                self._clean_failed_extraction()
-                raise
-
-    def _get_zip_root(self, zip_ref):
-        """Identify the common root directory within the ZIP file."""
-        try:
-            first_file = zip_ref.namelist()[0]
-            if '/' in first_file:
-                return first_file.split('/')[0]
-            return ""
-        except Exception as e:
-            logger.warning(f"Error detecting zip root: {str(e)}")
-            return ""
-
-    def _flatten_directory(self, src_dir, dest_dir):
-        """Move files from a nested directory up to the destination."""
-        for item in src_dir.iterdir():
-            if item.is_dir():
-                self._flatten_directory(item, dest_dir)
-                item.rmdir()
-            else:
-                target = dest_dir / item.name
-                if target.exists():
-                    target.unlink()
-                item.rename(target)
-
-    def _clean_failed_extraction(self):
-        """Clean up files from a failed extraction attempt."""
-        logger.info("Cleaning up failed extraction")
-        for item in self.shard_dir.iterdir():
-            if item.is_dir():
-                shutil.rmtree(item)
-            else:
-                item.unlink()
-
-    def _build_shard_map(self):
-        """Build a map from global index ranges to shard filenames."""
-        logger.info("Building shard map from parquet files")
-        parquet_files = list(self.shard_dir.glob("*.parquet"))
-        if not parquet_files:
-            raise FileNotFoundError("No parquet files found after extraction")
-        parquet_files = sorted(parquet_files, key=lambda x: int(x.stem.split("_")[1]))
-        expected_start = 0
-        for f in parquet_files:
-            try:
-                parts = f.stem.split("_")
-                if len(parts) != 3:
-                    raise ValueError("Invalid filename format")
-                start = int(parts[1])
-                end = int(parts[2])
-                if start != expected_start:
-                    raise ValueError(f"Non-contiguous shard start: expected {expected_start}, got {start}")
-                if end <= start:
-                    raise ValueError(f"Invalid shard range: {start}-{end}")
-                self.shard_map[(start, end)] = f.name
-                self.total_docs = end + 1
-                expected_start = end + 1
-                logger.debug(f"Mapped shard {f.name}: indices {start}-{end}")
-            except Exception as e:
-                logger.error(f"Error processing shard {f.name}: {str(e)}")
-                raise RuntimeError("Invalid shard structure") from e
-        logger.info(f"Validated {len(self.shard_map)} continuous shards")
-        logger.info(f"Total document count: {self.total_docs}")
-        sorted_ranges = sorted(self.shard_map.keys())
-        for i in range(1, len(sorted_ranges)):
-            prev_end = sorted_ranges[i-1][1]
-            curr_start = sorted_ranges[i][0]
-            if curr_start != prev_end + 1:
-                logger.warning(f"Gap or overlap detected between shards: {prev_end} to {curr_start}")
-
-    def _process_shard(self, shard, local_indices):
-        """Load a shard (if not already loaded) and retrieve the specified rows."""
+
+    def _load_metadata(self):
+        """Load the combined parquet file directly"""
+        logger.info("Loading metadata from combined.parquet")
         try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            )
-            # Convert source to string type explicitly
-            self.loaded_shards[shard]['source'] = self.loaded_shards[shard]['source'].astype(str)
-            # Convert source strings to lists
-            self.loaded_shards[shard]['source'] = self.loaded_shards[shard]['source'].apply(
-                lambda x: x.split("; ") if isinstance(x, str) else []
-            )
-            # Handle missing summaries
-            self.loaded_shards[shard]['summary'] = self.loaded_shards[shard]['summary'].fillna("")
-            logger.info(f"Loaded shard {shard} with {len(self.loaded_shards[shard])} rows")
-
-        except Exception as e:
-            logger.error(f"Failed to read parquet file {shard}: {str(e)}")
-            return pd.DataFrame(columns=["title", "summary", "similarity", "source", "authors"])
-        df = self.loaded_shards[shard]
-        df_len = len(df)
-        valid_local_indices = [idx for idx in local_indices if 0 <= idx < df_len]
-        if len(valid_local_indices) != len(local_indices):
-            logger.warning(f"Filtered {len(local_indices) - len(valid_local_indices)} out-of-bounds indices in shard {shard}")
-        if valid_local_indices:
-            chunk = df.iloc[valid_local_indices]
-            logger.info(f"Retrieved {len(chunk)} records from shard {shard}")
-            return chunk
-
+            # Load the parquet file
+            self.df = pd.read_parquet(self.metadata_path)
+
+            # Clean and format the data
+            self.df['source'] = self.df['source'].apply(
+                lambda x: [
+                    url.strip()
+                    for url in str(x).split(';')
+                    if url.strip()
+                ]
+            )
+            self.total_docs = len(self.df)
+
+            logger.info(f"Successfully loaded {self.total_docs} documents")
         except Exception as e:
-            logger.error(f"
-
-
+            logger.error(f"Failed to load metadata: {str(e)}")
+            raise
+
     def get_metadata(self, global_indices):
-        """Retrieve metadata for
+        """Retrieve metadata for given indices"""
         if isinstance(global_indices, np.ndarray) and global_indices.size == 0:
-
-            return pd.DataFrame(columns=["title", "summary", "similarity", "source"])
-
-        indices_list = global_indices.tolist() if isinstance(global_indices, np.ndarray) else global_indices
-        logger.info(f"Retrieving metadata for {len(indices_list)} indices")
-        valid_indices = [idx for idx in indices_list if 0 <= idx < self.total_docs]
-        invalid_count = len(indices_list) - len(valid_indices)
-        if invalid_count > 0:
-            logger.warning(f"Filtered out {invalid_count} invalid indices")
-        if not valid_indices:
-            logger.warning("No valid indices remain after filtering")
-            return pd.DataFrame(columns=["title", "summary", "similarity", "source"])
+            return pd.DataFrame(columns=["title", "summary", 'authors', "similarity", "source"])

-
-
-
-
-
-
-
-
-                    break
-            if not found:
-                logger.warning(f"Index {idx} not found in any shard range")
-
-        # Process shards concurrently
-        results = []
-        with concurrent.futures.ThreadPoolExecutor() as executor:
-            futures = [executor.submit(self._process_shard, shard, local_indices)
-                       for shard, local_indices in shard_groups.items()]
-            for future in concurrent.futures.as_completed(futures):
-                df_chunk = future.result()
-                if not df_chunk.empty:
-                    results.append(df_chunk)
-
-        if results:
-            combined = pd.concat(results).reset_index(drop=True)
-            logger.info(f"Combined metadata: {len(combined)} records from {len(results)} shards")
-            return combined
-        else:
-            logger.warning("No metadata records retrieved")
-            return pd.DataFrame(columns=["title", "summary", "similarity", "source"])
+        try:
+            # Directly index the DataFrame
+            results = self.df.iloc[global_indices].copy()
+            return results.reset_index(drop=True)
+        except Exception as e:
+            logger.error(f"Metadata retrieval failed: {str(e)}")
+            return pd.DataFrame(columns=["title", "summary", "similarity", "source", 'authors'])
+


 class SemanticSearch:
@@ -383,13 +224,13 @@ class SemanticSearch:
         results['similarity'] = distances

         # Ensure URL lists are properly formatted
-        results['source'] = results['source'].apply(
-
-
-
-
-
-        )
+        # results['source'] = results['source'].apply(
+        #     lambda x: [
+        #         url.strip().rstrip(')')  # Clean trailing parentheses and whitespace
+        #         for url in str(x).split(';')  # Split on semicolons
+        #         if url.strip()  # Remove empty strings
+        #     ] if isinstance(x, (str, list)) else []
+        # )

         # Deduplicate and sort
         required_columns = ["title", "summary", "authors", "source", "similarity"]
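The removed get_metadata then fanned those per-shard lookups out over a thread pool and concatenated the chunks. A minimal sketch of that pattern, with load_chunk standing in for _process_shard and a hypothetical shard_groups mapping:

import concurrent.futures
import pandas as pd

def load_chunk(shard, local_indices):
    # Stand-in for _process_shard: it would read the shard's parquet file
    # and select the requested rows with .iloc.
    return pd.DataFrame({"title": [f"{shard}#{i}" for i in local_indices]})

shard_groups = {"meta_0_999.parquet": [5, 7], "meta_1000_1999.parquet": [0]}

results = []
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = [executor.submit(load_chunk, shard, idxs) for shard, idxs in shard_groups.items()]
    for future in concurrent.futures.as_completed(futures):
        chunk = future.result()
        if not chunk.empty:
            results.append(chunk)

combined = pd.concat(results).reset_index(drop=True) if results else pd.DataFrame()
print(len(combined))  # 3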
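On the new path, _load_metadata reads combined.parquet once and splits the semicolon-separated source column into lists of URLs. A small sketch of that cleaning step on made-up rows; the summary fillna mirrors what the removed shard loader did, while the new code itself only normalizes source:

import pandas as pd

df = pd.DataFrame({
    "title": ["Paper A", "Paper B"],
    "summary": ["First abstract", None],
    "authors": ["Doe, J.", "Roe, R."],
    "source": ["https://arxiv.org/abs/1234.5678; https://example.org/a", ""],
})

# Same transformation as in _load_metadata: split on ';' and drop empties.
df["source"] = df["source"].apply(
    lambda x: [url.strip() for url in str(x).split(";") if url.strip()]
)
df["summary"] = df["summary"].fillna("")  # carried over from the old shard loader

print(df["source"].tolist())
# [['https://arxiv.org/abs/1234.5678', 'https://example.org/a'], []]

Because source already arrives as a list of URLs, the per-result formatting in SemanticSearch is commented out in the second hunk.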