Sin2pi committed · verified
Commit 45a9c69 · 1 Parent(s): 75becfe

Update modelA.py

Files changed (1):
  1. modelA.py +26 -31
modelA.py CHANGED
@@ -1089,7 +1089,7 @@ class Echo(nn.Module):
             "eos_token_id": self.eos_token_id,
         })
         return Config()
-
+
 def setup_tokenizer(token: str):
     from tokenizers import Tokenizer
     tokenizer = Tokenizer.from_file("./tokenizer.json")
@@ -1101,22 +1101,23 @@ def setup_tokenizer(token: str):
         ids = [id for id in ids if id not in sp_ids]
         return ids
 
-    def bdec(ids_list, skip_special_tokens=True):
+    def bdec(ids_list, skip_special_tokens=True, pad_token_id=0, bos_token_id=1, eos_token_id=2):
         results = []
         for ids in ids_list:
+            if isinstance(ids, torch.Tensor):
+                ids = ids.tolist()
+            ids = [int(id) for id in ids if id != -100]
             if skip_special_tokens:
-                if ids and ids[0] == 1:
+                ids = [id for id in ids if id not in (pad_token_id, bos_token_id, eos_token_id)]
+
+                if ids and ids and ids[0] == bos_token_id:
                     ids = ids[1:]
-                while ids and ids[-1] in [0, 2]:
+                while ids and ids[-1] == eos_token_id:
                     ids = ids[:-1]
-
-            if isinstance(ids, torch.Tensor):
-                ids = ids.tolist()
-            elif isinstance(ids, np.ndarray):
-                ids = ids.tolist()
             results.append(tokenizer.decode(ids))
         return results
 
+
     def save_pretrained(save_dir):
         os.makedirs(save_dir, exist_ok=True)
         tokenizer.save(f"{save_dir}/tokenizer.json")
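Note: the reworked bdec above now converts tensor rows to plain lists and strips the -100 ignore-index fill before removing special tokens. A minimal, self-contained sketch of that cleanup path, with a hypothetical stub_decode standing in for the tokenizer.decode closure the real bdec uses:

import torch

def stub_decode(ids):
    # hypothetical stand-in for the tokenizer.decode closure inside setup_tokenizer
    return " ".join(f"<{i}>" for i in ids)

def bdec_sketch(ids_list, skip_special_tokens=True, pad_token_id=0, bos_token_id=1, eos_token_id=2):
    results = []
    for ids in ids_list:
        if isinstance(ids, torch.Tensor):
            ids = ids.tolist()                      # accept tensors as well as lists
        ids = [int(i) for i in ids if i != -100]    # drop the -100 ignore-index fill
        if skip_special_tokens:
            ids = [i for i in ids if i not in (pad_token_id, bos_token_id, eos_token_id)]
        results.append(stub_decode(ids))
    return results

print(bdec_sketch([torch.tensor([1, 5, 6, 7, 2, 0, 0]), [1, 8, -100, 9, 2]]))
# ['<5> <6> <7>', '<8> <9>']

The trailing bos/eos trimming in the committed version becomes a no-op once the special-token filter has run, so the sketch omits it.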
@@ -1400,44 +1401,38 @@ def wer_batch(references, hypotheses):
         total_words += len(ref_words)
     return (total_errors / total_words) * 100 if total_words > 0 else 0.0
 
+def clean_ids(ids, pad_token_id=0):
+    if isinstance(ids, torch.Tensor):
+        ids = ids.tolist()
+    return [int(id) for id in ids if id != -100 and id != pad_token_id]
+
+def clean_batch(batch_ids, pad_token_id=0):
+    return [clean_ids(seq, pad_token_id) for seq in batch_ids]
+
 def compute_metrics(pred, tokenizer=None, model=None, print_pred=False, num_samples=0, optimizer=None, scheduler=None):
-    pred_ids = pred.predictions
+
     label_ids = pred.label_ids
-    if isinstance(pred_ids, tuple):
-        pred_ids = pred_ids[0]
-    if hasattr(pred_ids, "ndim") and pred_ids.ndim == 3:
-        if not isinstance(pred_ids, torch.Tensor):
-            pred_ids = torch.tensor(pred_ids)
-        pred_ids = pred_ids.argmax(dim=-1)
-
-    pred_ids = pred_ids.tolist()
-    label_ids = label_ids.tolist()
-    pad_token_id = tokenizer.pad_token_id if hasattr(tokenizer, 'pad_token_id') else 0
-    label_ids = [[pad_token_id if token == -100 else token for token in seq] for seq in label_ids]
+    pred_ids = pred.predictions[0]
+    label_ids = clean_batch(label_ids, pad_token_id=tokenizer.pad_token_id)
+    pred_ids = clean_batch(pred_ids, pad_token_id=tokenizer.pad_token_id)
+    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=False)
+    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=False)
 
     if print_pred:
         for i in range(min(num_samples, len(pred_ids))):
-
-            pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=False)
-            label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=False)
-
             print(f"Pred tokens: {pred_ids[i]}")
             print(f"Label tokens: {label_ids[i]}")
             print(f"Pred: '{pred_str[i]}'")
             print(f"Label: '{label_str[i]}'")
-
             print("-" * 40)
-
-    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
-    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)
+
     wer = wer_batch(label_str, pred_str)
     if model is not None:
-        trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) / 1_000_000
+        trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) / 1000000
         efficiency_score = (100 - wer) / trainable_params if trainable_params > 0 else 0.0
     else:
         trainable_params = 0.0
         efficiency_score = 0.0
-
     return {
         "wer": float(wer),
         "efficiency_score": float(efficiency_score),
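Note: the compute_metrics rewrite above assumes pred.predictions is a tuple (hence predictions[0]) and routes both id batches through clean_batch before decoding. A rough smoke test of that path, assuming the updated compute_metrics and its helpers can be imported from modelA, with a hypothetical StubTokenizer in place of the real tokenizer:

import types
import torch
from modelA import compute_metrics   # assumes modelA.py exposes the updated function on import

class StubTokenizer:
    # hypothetical stand-in: only the attributes compute_metrics actually touches
    pad_token_id = 0
    def batch_decode(self, batch_ids, skip_special_tokens=False):
        return [" ".join(str(i) for i in ids) for ids in batch_ids]

pred = types.SimpleNamespace(
    predictions=(torch.tensor([[1, 5, 6, 2, 0]]),),   # tuple, so predictions[0] holds the id batch
    label_ids=torch.tensor([[1, 5, 6, 2, -100]]),     # -100 marks ignored label positions
)

metrics = compute_metrics(pred, tokenizer=StubTokenizer(), print_pred=True, num_samples=1)
print(metrics)

With matching cleaned ids on both sides, wer should come back as 0.0, and efficiency_score stays 0.0 because no model is passed.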