from datasets import load_dataset
from transformers import (
    MarianMTModel,
    MarianTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)

# Load dataset (TSV with 'english' and 'hindi' columns); load_dataset puts a single
# file under the 'train' split, so carve out a test split (ratio is illustrative)
dataset = load_dataset('csv', data_files='hindi_dataset.tsv', delimiter='\t')
dataset = dataset['train'].train_test_split(test_size=0.1)

# Load MarianMT tokenizer for translation task
tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-hi')

# Tokenize English (source) and Hindi (target) in one pass; text_target populates
# the 'labels' field the model trains on. Padding is left to the data collator.
def tokenize_function(examples):
    return tokenizer(
        examples['english'],
        text_target=examples['hindi'],
        truncation=True,
        max_length=128,
    )

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Load MarianMT model for translation
model = MarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-en-hi')

# Data collator: pads inputs dynamically and pads labels with -100 so padding is ignored in the loss
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Define training arguments (Seq2SeqTrainingArguments is required for predict_with_generate)
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    predict_with_generate=True,
)
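
# Optional sketch: a BLEU metric for generation-based evaluation. This is not part of
# the original script; it assumes the `evaluate` and `sacrebleu` packages are installed
# and could be passed to the trainer below via compute_metrics= if desired.
import numpy as np
import evaluate

bleu_metric = evaluate.load("sacrebleu")

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    # Labels padded with -100 must be mapped back to the pad token before decoding
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = bleu_metric.compute(
        predictions=decoded_preds,
        references=[[label] for label in decoded_labels],
    )
    return {"bleu": result["score"]}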

# Initialize the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Start training
trainer.train()

# Save the model
trainer.save_model('./my_hindi_translation_model')

# Evaluate the model
results = trainer.evaluate()
print(results)

# Generate a prediction (move inputs to the model's device in case training ran on GPU)
model.eval()
inputs = tokenizer("How are you?", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_length=128)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
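
# Optional sketch: reload the saved checkpoint for later inference. This assumes the
# tokenizer was saved alongside the model in './my_hindi_translation_model' (the
# trainer saves it there when a tokenizer is passed in).
reloaded_tokenizer = MarianTokenizer.from_pretrained('./my_hindi_translation_model')
reloaded_model = MarianMTModel.from_pretrained('./my_hindi_translation_model')
reloaded_inputs = reloaded_tokenizer("Where is the station?", return_tensors="pt")
print(reloaded_tokenizer.decode(
    reloaded_model.generate(**reloaded_inputs, max_length=128)[0],
    skip_special_tokens=True,
))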