Spaces:
Running
Running
Update search_utils.py
Browse files - search_utils.py +4 -2
search_utils.py
CHANGED
@@ -38,17 +38,19 @@ class MetadataManager:
|
|
38 |
try:
|
39 |
# Load the parquet file
|
40 |
self.df = pd.read_parquet(self.metadata_path)
|
|
|
41 |
# Clean and format the data
|
42 |
self.df['source'] = self.df['source'].apply(
|
43 |
lambda x: [
|
44 |
url.strip()
|
45 |
for url in str(x).split(';')
|
46 |
-
if url.strip()
|
47 |
]
|
48 |
)
|
49 |
-
# Convert list of sources to a single string
|
50 |
self.df['source'] = self.df['source'].apply(lambda x: ' | '.join(x) if isinstance(x, list) else x)
|
51 |
|
|
|
52 |
self.total_docs = len(self.df)
|
53 |
|
54 |
|
|
|
38 |
try:
|
39 |
# Load the parquet file
|
40 |
self.df = pd.read_parquet(self.metadata_path)
|
41 |
+
|
42 |
# Clean and format the data
|
43 |
self.df['source'] = self.df['source'].apply(
|
44 |
lambda x: [
|
45 |
url.strip()
|
46 |
for url in str(x).split(';')
|
47 |
+
if url.strip() and url.startswith('http')
|
48 |
]
|
49 |
)
|
50 |
+
# Convert the list of sources to a single string, separated by ' | '
|
51 |
self.df['source'] = self.df['source'].apply(lambda x: ' | '.join(x) if isinstance(x, list) else x)
|
52 |
|
53 |
+
|
54 |
self.total_docs = len(self.df)
|
55 |
|
56 |
|