Testys committed on
Commit
4e58098
·
verified ·
1 Parent(s): 8b5c2a1

Update search_utils.py

Browse files
Files changed (1) hide show
  1. search_utils.py +4 -2
search_utils.py CHANGED
@@ -38,17 +38,19 @@ class MetadataManager:
38
  try:
39
  # Load the parquet file
40
  self.df = pd.read_parquet(self.metadata_path)
 
41
  # Clean and format the data
42
  self.df['source'] = self.df['source'].apply(
43
  lambda x: [
44
  url.strip()
45
  for url in str(x).split(';')
46
- if url.strip()
47
  ]
48
  )
49
- # Convert list of sources to a single string
50
  self.df['source'] = self.df['source'].apply(lambda x: ' | '.join(x) if isinstance(x, list) else x)
51
 
 
52
  self.total_docs = len(self.df)
53
 
54
 
 
38
  try:
39
  # Load the parquet file
40
  self.df = pd.read_parquet(self.metadata_path)
41
+
42
  # Clean and format the data
43
  self.df['source'] = self.df['source'].apply(
44
  lambda x: [
45
  url.strip()
46
  for url in str(x).split(';')
47
+ if url.strip() and url.startswith('http')
48
  ]
49
  )
50
+ # Convert list of sources to a single string, separated by ' | '
51
  self.df['source'] = self.df['source'].apply(lambda x: ' | '.join(x) if isinstance(x, list) else x)
52
 
53
+
54
  self.total_docs = len(self.df)
55
 
56