vumichien committed on
Commit
0477818
·
1 Parent(s): aeda459

Enhance prediction process by adding missing columns with defaults, ensuring correct data types, and improving error handling. Update cached embeddings with new size.

Browse files
data/cached_embeddings_unit.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:244d9e2e89d023dfcd8eb8eddf81d4295b3028e2ebb19b01638af08432edb6c8
3
- size 730951
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:917d6d46ef5e75ddca3f081169eb9f9323eab50dbed95583037907c26c855ae0
3
+ size 734106
routes/predict.py CHANGED
@@ -174,15 +174,35 @@ async def predict(
174
  try:
175
  # Abstract mapping
176
  if sentence_service.df_abstract_map_data is not None:
177
- # Ensure required columns exist for AbstractSimilarityMapper
178
  required_columns_for_abstract = {
 
179
  "摘要グループ": "",
180
- "確定": "",
 
 
181
  }
182
 
183
- for col, default_value in required_columns_for_abstract.items():
 
184
  if col not in df_output_data.columns:
185
- df_output_data[col] = default_value
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
 
187
  abstract_similarity_mapper = AbstractSimilarityMapper(
188
  cached_embedding_helper=sentence_service.abstract_cached_embedding_helper,
@@ -190,9 +210,16 @@ async def predict(
190
  )
191
  abstract_similarity_mapper.predict_input(df_input_data=df_output_data)
192
 
 
 
193
  except Exception as e:
194
  print(f"Error processing AbstractSimilarityMapper: {e}")
195
- raise HTTPException(status_code=500, detail=str(e))
 
 
 
 
 
196
 
197
  try:
198
  # Name and abstract mapping
@@ -290,25 +317,6 @@ async def predict(
290
  # Fill NaN values and ensure all output columns have proper values
291
  df_output_data = df_output_data.fillna("")
292
 
293
- # Convert columns to string to avoid dtype issues
294
- string_columns = [
295
- "摘要グループ",
296
- "確定",
297
- "出力_基準中科目",
298
- "出力_中科目",
299
- "出力_項目名",
300
- "出力_標準名称",
301
- "出力_基準名称",
302
- "出力_単位",
303
- "出力_集計用単位",
304
- "出力_標準単位",
305
- "出力_基準単位",
306
- "外部・内部区分",
307
- ]
308
- for col in string_columns:
309
- if col in df_output_data.columns:
310
- df_output_data[col] = df_output_data[col].astype(str).replace("nan", "")
311
-
312
  # Debug: Print available columns to see what we have
313
  print(f"Available columns after processing: {list(df_output_data.columns)}")
314
 
 
174
  try:
175
  # Abstract mapping
176
  if sentence_service.df_abstract_map_data is not None:
177
+ # Ensure required columns exist before AbstractSimilarityMapper
178
  required_columns_for_abstract = {
179
+ "標準科目": "",
180
  "摘要グループ": "",
181
+ "確定": "未確定",
182
+ "摘要": "",
183
+ "備考": "",
184
  }
185
 
186
+ # Add missing columns with appropriate defaults
187
+ for col, default_val in required_columns_for_abstract.items():
188
  if col not in df_output_data.columns:
189
+ df_output_data[col] = default_val
190
+ print(
191
+ f"DEBUG: Added missing column '{col}' with default value '{default_val}'"
192
+ )
193
+
194
+ # Ensure data types are correct (convert to string to avoid type issues)
195
+ for col in ["標準科目", "摘要グループ", "確定", "摘要", "備考"]:
196
+ if col in df_output_data.columns:
197
+ df_output_data[col] = df_output_data[col].astype(str).fillna("")
198
+
199
+ # Debug: Print sample data before AbstractSimilarityMapper
200
+ print(f"DEBUG: Sample data before AbstractSimilarityMapper:")
201
+ print(
202
+ df_output_data[["標準科目", "摘要グループ", "確定", "摘要", "備考"]]
203
+ .head(3)
204
+ .to_string()
205
+ )
206
 
207
  abstract_similarity_mapper = AbstractSimilarityMapper(
208
  cached_embedding_helper=sentence_service.abstract_cached_embedding_helper,
 
210
  )
211
  abstract_similarity_mapper.predict_input(df_input_data=df_output_data)
212
 
213
+ print(f"DEBUG: AbstractSimilarityMapper completed successfully")
214
+
215
  except Exception as e:
216
  print(f"Error processing AbstractSimilarityMapper: {e}")
217
+ print(f"DEBUG: Full error traceback:")
218
+ import traceback
219
+
220
+ traceback.print_exc()
221
+ # Don't raise the exception, continue processing
222
+ print(f"DEBUG: Continuing without AbstractSimilarityMapper...")
223
 
224
  try:
225
  # Name and abstract mapping
 
317
  # Fill NaN values and ensure all output columns have proper values
318
  df_output_data = df_output_data.fillna("")
319
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
320
  # Debug: Print available columns to see what we have
321
  print(f"Available columns after processing: {list(df_output_data.columns)}")
322
 
services/sentence_transformer_service.py CHANGED
@@ -211,6 +211,13 @@ class SentenceTransformerService:
211
  print(
212
  f"Loaded abstract map data: {len(self.df_abstract_map_data)} entries"
213
  )
 
 
 
 
 
 
 
214
 
215
  # Load name and subject map data
216
  name_and_subject_map_file = os.path.join(
 
211
  print(
212
  f"Loaded abstract map data: {len(self.df_abstract_map_data)} entries"
213
  )
214
+ print(
215
+ f"DEBUG: Abstract map data columns: {list(self.df_abstract_map_data.columns)}"
216
+ )
217
+ print(f"DEBUG: Abstract map data sample:")
218
+ print(self.df_abstract_map_data.head(3).to_string())
219
+ else:
220
+ print(f"DEBUG: Abstract map file not found: {abstract_map_file}")
221
 
222
  # Load name and subject map data
223
  name_and_subject_map_file = os.path.join(