Update modelA.py
modelA.py CHANGED
@@ -1089,7 +1089,7 @@ class Echo(nn.Module):
             "eos_token_id": self.eos_token_id,
         })
         return Config()
-
+
 def setup_tokenizer(token: str):
     from tokenizers import Tokenizer
     tokenizer = Tokenizer.from_file("./tokenizer.json")
@@ -1101,22 +1101,23 @@ def setup_tokenizer(token: str):
     ids = [id for id in ids if id not in sp_ids]
     return ids
 
-def bdec(ids_list, skip_special_tokens=True):
+def bdec(ids_list, skip_special_tokens=True, pad_token_id=0, bos_token_id=1, eos_token_id=2):
     results = []
     for ids in ids_list:
+        if isinstance(ids, torch.Tensor):
+            ids = ids.tolist()
+        ids = [int(id) for id in ids if id != -100]
         if skip_special_tokens:
-
+            ids = [id for id in ids if id not in (pad_token_id, bos_token_id, eos_token_id)]
+
+        if ids and ids[0] == bos_token_id:
             ids = ids[1:]
-        while ids and ids[-1]
+        while ids and ids[-1] == eos_token_id:
             ids = ids[:-1]
-
-        if isinstance(ids, torch.Tensor):
-            ids = ids.tolist()
-        elif isinstance(ids, np.ndarray):
-            ids = ids.tolist()
         results.append(tokenizer.decode(ids))
     return results
 
+
 def save_pretrained(save_dir):
     os.makedirs(save_dir, exist_ok=True)
     tokenizer.save(f"{save_dir}/tokenizer.json")
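For reference, a minimal sketch of how the revised bdec behaves. It assumes bdec and the tokenizer loaded by setup_tokenizer are in scope, and it uses the default special-token ids from the new signature (pad=0, bos=1, eos=2); the input batch is invented for illustration.

import torch

batch = [torch.tensor([1, 42, 77, 2, 0, -100])]  # hypothetical bos, text ids, eos, pad, label fill
texts = bdec(batch, skip_special_tokens=True)
# The tensor is converted with .tolist(), the -100 fill is dropped, and pad/bos/eos ids
# are filtered out, so only [42, 77] reaches tokenizer.decode.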
@@ -1400,44 +1401,38 @@ def wer_batch(references, hypotheses):
         total_words += len(ref_words)
     return (total_errors / total_words) * 100 if total_words > 0 else 0.0
 
+def clean_ids(ids, pad_token_id=0):
+    if isinstance(ids, torch.Tensor):
+        ids = ids.tolist()
+    return [int(id) for id in ids if id != -100 and id != pad_token_id]
+
+def clean_batch(batch_ids, pad_token_id=0):
+    return [clean_ids(seq, pad_token_id) for seq in batch_ids]
+
 def compute_metrics(pred, tokenizer=None, model=None, print_pred=False, num_samples=0, optimizer=None, scheduler=None):
-
+
     label_ids = pred.label_ids
-
-
-
-
-
-    pred_ids = pred_ids.argmax(dim=-1)
-
-    pred_ids = pred_ids.tolist()
-    label_ids = label_ids.tolist()
-    pad_token_id = tokenizer.pad_token_id if hasattr(tokenizer, 'pad_token_id') else 0
-    label_ids = [[pad_token_id if token == -100 else token for token in seq] for seq in label_ids]
+    pred_ids = pred.predictions[0]
+    label_ids = clean_batch(label_ids, pad_token_id=tokenizer.pad_token_id)
+    pred_ids = clean_batch(pred_ids, pad_token_id=tokenizer.pad_token_id)
+    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=False)
+    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=False)
 
     if print_pred:
         for i in range(min(num_samples, len(pred_ids))):
-
-            pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=False)
-            label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=False)
-
             print(f"Pred tokens: {pred_ids[i]}")
             print(f"Label tokens: {label_ids[i]}")
             print(f"Pred: '{pred_str[i]}'")
             print(f"Label: '{label_str[i]}'")
-
             print("-" * 40)
-
-    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
-    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)
+
     wer = wer_batch(label_str, pred_str)
     if model is not None:
-        trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) /
+        trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) / 1000000
         efficiency_score = (100 - wer) / trainable_params if trainable_params > 0 else 0.0
     else:
         trainable_params = 0.0
         efficiency_score = 0.0
-
     return {
         "wer": float(wer),
        "efficiency_score": float(efficiency_score),
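A small worked example of the new cleaning helpers and the efficiency metric. The id values and parameter count below are invented for illustration, and clean_batch is assumed to be called with the tokenizer's pad id, as in the new compute_metrics.

clean_batch([[1, 42, 77, -100, 0], [7, 8, 0, -100, -100]], pad_token_id=0)
# -> [[1, 42, 77], [7, 8]]   (-100 label fill and pad ids removed; other special ids are kept)

# efficiency_score = (100 - WER) / trainable parameters in millions
wer = 20.0                      # hypothetical 20% word error rate
trainable_params = 50.0         # hypothetical 50M trainable parameters
efficiency_score = (100 - wer) / trainable_params   # -> 1.6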
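The reworked compute_metrics reads pred.predictions[0] and pred.label_ids, which matches the shape of transformers.EvalPrediction. A hedged sketch of how it might be invoked from an evaluation loop; the pred_ids, labels, tokenizer, and model objects are assumed to exist and are not part of this commit.

from transformers import EvalPrediction

# Hypothetical eval-step output: token ids (already argmaxed) and padded label ids.
metrics = compute_metrics(
    EvalPrediction(predictions=(pred_ids,), label_ids=labels),
    tokenizer=tokenizer, model=model, print_pred=True, num_samples=2,
)
print(metrics["wer"], metrics["efficiency_score"])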