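# Fine-tunes Helsinki-NLP/opus-mt-en-hi for English-to-Hindi translation on a
# local parallel corpus. The script assumes 'hindi_dataset.tsv' exists next to
# it, with header columns 'english' and 'hindi' (the names used when tokenizing).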
from datasets import load_dataset
from transformers import MarianMTModel, MarianTokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
# Load dataset (a local TSV with 'english' and 'hindi' columns)
dataset = load_dataset('csv', data_files='hindi_dataset.tsv', delimiter='\t')
# A single data file yields only a 'train' split, so hold out a test set
# for the Trainer's eval_dataset below
dataset = dataset['train'].train_test_split(test_size=0.1)
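# The TSV is assumed to look like this (tab-separated, header row included);
# the example row is illustrative, not from the actual file:
#   english<TAB>hindi
#   How are you?<TAB>आप कैसे हैं?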
# Load MarianMT tokenizer for translation task
tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-hi')
# Tokenize English (source) and Hindi (target) in one pass; text_target runs
# the Hindi sentences through the tokenizer and stores them as the 'labels'
# field the model trains against. Padding is left to the data collator below,
# which pads each batch dynamically instead of always padding to max_length.
def tokenize_function(examples):
    return tokenizer(
        examples['english'],
        text_target=examples['hindi'],
        truncation=True,
        max_length=128,
    )

tokenized_datasets = dataset.map(tokenize_function, batched=True)
# Load MarianMT model for translation
model = MarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-en-hi')
# Data collator for padding: pads inputs and labels per batch and uses the
# model to prepare decoder_input_ids, so pass the model rather than None
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
# Define training arguments; Seq2SeqTrainingArguments is needed because
# predict_with_generate is not accepted by the plain TrainingArguments
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    predict_with_generate=True,
)
# Initialize Seq2SeqTrainer, which can generate during evaluation
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
)
# Start training
trainer.train()
# Save the model
trainer.save_model('./my_hindi_translation_model')
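# The saved directory can be reloaded later with from_pretrained; save_model
# also writes the tokenizer files because the tokenizer was passed to the Trainer:
# model = MarianMTModel.from_pretrained('./my_hindi_translation_model')
# tokenizer = MarianTokenizer.from_pretrained('./my_hindi_translation_model')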
# Evaluate the model
results = trainer.evaluate()
print(results)
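# Optional sketch, assuming the `evaluate` and `sacrebleu` packages are
# installed: with predict_with_generate=True, a compute_metrics function like
# this one could be passed to Seq2SeqTrainer(compute_metrics=...) so that
# evaluate() reports BLEU alongside the loss.
import numpy as np
import evaluate

bleu = evaluate.load("sacrebleu")

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    # Labels use -100 for positions the loss ignores; swap in the pad token
    # so they can be decoded back to text
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = bleu.compute(predictions=decoded_preds,
                          references=[[label] for label in decoded_labels])
    return {"bleu": result["score"]}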
# Generate a prediction
model.eval()
inputs = tokenizer("How are you?", return_tensors="pt")
outputs = model.generate(**inputs, max_length=128)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
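# Optional sketch: translating several sentences at once. Padding the batch
# and passing the attention mask via **batch keeps the padded positions
# from influencing generation.
sentences = ["Good morning.", "Where is the station?"]
batch = tokenizer(sentences, return_tensors="pt", padding=True)
outputs = model.generate(**batch, max_length=128)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))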