Redmind committed on
Commit
ac1e0c6
·
verified ·
1 Parent(s): 9271a6a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +66 -37
app.py CHANGED
@@ -1,65 +1,94 @@
1
- from datasets import load_dataset
2
- from transformers import MarianMTModel, MarianTokenizer, TrainingArguments, Trainer, DataCollatorForSeq2Seq
 
 
3
 
4
- # Load dataset
5
- dataset = load_dataset('csv', data_files='hindi_dataset.tsv', delimiter='\t')
 
6
 
7
- # Load MarianMT tokenizer for translation task
8
- tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-hi')
9
 
10
- # Tokenize the English text (source language)
11
- def tokenize_function(examples):
12
- return tokenizer(examples['english'], truncation=True, padding='max_length', max_length=128)
13
 
14
- # Tokenize both English and Hindi sentences
15
- tokenized_datasets = dataset.map(tokenize_function, batched=True)
 
 
 
16
 
17
- def tokenize_target_function(examples):
18
- return tokenizer(examples['hindi'], truncation=True, padding='max_length', max_length=128)
 
 
19
 
20
- tokenized_datasets = tokenized_datasets.map(tokenize_target_function, batched=True)
 
 
21
 
22
- # Data Collator for padding sequences
23
- data_collator = DataCollatorForSeq2Seq(tokenizer, model=None)
 
 
24
 
25
- # Load MarianMT model for translation
26
- model = MarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-en-hi')
 
27
 
28
- # Define training arguments
29
- training_args = TrainingArguments(
30
- output_dir='./results',
31
- evaluation_strategy="epoch",
32
  learning_rate=2e-5,
33
  per_device_train_batch_size=16,
34
  per_device_eval_batch_size=16,
35
  num_train_epochs=3,
36
  weight_decay=0.01,
37
- save_total_limit=2,
38
  predict_with_generate=True,
 
 
 
39
  )
40
 
41
- # Initialize Trainer
42
- trainer = Trainer(
 
 
 
 
 
 
 
 
 
 
 
43
  model=model,
44
  args=training_args,
45
  train_dataset=tokenized_datasets['train'],
46
  eval_dataset=tokenized_datasets['test'],
47
  tokenizer=tokenizer,
48
- data_collator=data_collator,
49
  )
50
 
51
- # Start training
52
  trainer.train()
53
 
54
- # Save the model
55
- trainer.save_model('./my_hindi_translation_model')
56
-
57
  # Evaluate the model
58
- results = trainer.evaluate()
59
- print(results)
 
 
 
 
 
 
60
 
61
- # Generate a prediction
62
- model.eval()
63
- inputs = tokenizer("How are you?", return_tensors="pt")
64
- outputs = model.generate(inputs["input_ids"], max_length=128)
65
- print(tokenizer.decode(outputs[0], skip_special_tokens=True))
 
1
+ from transformers import MarianTokenizer, MarianMTModel, Seq2SeqTrainingArguments, Seq2SeqTrainer
2
+ from datasets import Dataset, DatasetDict
3
+ import pandas as pd
4
+ import torch
5
 
6
# Load the parallel corpus. The TSV must provide a 'source' (English) column
# and a 'target' (Hindi) column; the tokenization functions in this script
# read exactly those two columns.
file_path = "hindi_dataset.tsv"  # Update with your actual file path
data = pd.read_csv(file_path, delimiter="\t")

# Empty cells are loaded by pandas as NaN (a float), which the tokenizer
# rejects, so drop rows that are missing either side of the sentence pair.
data = data.dropna(subset=["source", "target"])

# Convert the dataset to Hugging Face Dataset. preserve_index=False keeps the
# (no longer contiguous) pandas index from becoming an extra dataset column.
hf_dataset = Dataset.from_pandas(data, preserve_index=False)

# Split the dataset into train and test subsets. A fixed seed makes the
# 80/20 split, and therefore the evaluation numbers, reproducible across runs.
split_dataset = hf_dataset.train_test_split(test_size=0.2, seed=42)

# Create a DatasetDict with train and test splits
dataset = DatasetDict({
    "train": split_dataset["train"],
    "test": split_dataset["test"],
})

# Load the tokenizer and model
model_name = "Helsinki-NLP/opus-mt-en-hi"  # Pre-trained English-to-Hindi model
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
26
 
27
+ # Tokenize source (English) text
28
def tokenize_function(examples):
    """Encode the 'source' column of *examples* with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 128 tokens. Whatever the
    tokenizer call returns is passed back unchanged.
    """
    source_texts = examples['source']
    encoding_options = dict(truncation=True, padding='max_length', max_length=128)
    return tokenizer(source_texts, **encoding_options)
30
 
31
+ # Tokenize target (Hindi) text
32
def tokenize_target_function(examples):
    """Tokenize the 'target' (Hindi) text and return it as a ``labels`` column.

    The encoded ids are returned under the key ``labels`` only. Returning the
    tokenizer output directly would yield ``input_ids``/``attention_mask``
    columns that overwrite the source-side encodings when this function is
    applied with ``Dataset.map``, and would leave the dataset without the
    ``labels`` column that training requires.

    Args:
        examples: A mapping with a 'target' entry holding either a list of
            sentences (batched ``map``) or a single sentence string.

    Returns:
        ``{"labels": ids}`` where ``ids`` is truncated/padded to 128 tokens and
        every padding position is replaced by -100 so the loss ignores it.
    """
    targets = examples['target']
    # `text_target=` selects the target-language vocabulary/settings; it is the
    # supported replacement for the deprecated `as_target_tokenizer()` context.
    encoded = tokenizer(
        text_target=targets,
        truncation=True,
        padding='max_length',
        max_length=128,
    )
    pad_id = tokenizer.pad_token_id

    def _mask_padding(sequence):
        # -100 is the index ignored by the cross-entropy loss.
        return [token if token != pad_id else -100 for token in sequence]

    token_ids = encoded["input_ids"]
    if isinstance(targets, str):
        # Non-batched call: a single flat sequence of ids.
        return {"labels": _mask_padding(token_ids)}
    return {"labels": [_mask_padding(sequence) for sequence in token_ids]}
35
 
36
# Run both tokenization passes over every split: source side first, then the
# target side on the result of the first pass.
tokenized_datasets = dataset.map(tokenize_function, batched=True).map(
    tokenize_target_function, batched=True
)

# Configuration for the fine-tuning run, grouped by purpose.
training_args = Seq2SeqTrainingArguments(
    # Optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluation
    eval_strategy="epoch",
    predict_with_generate=True,
    # Checkpoints
    output_dir="./results",
    save_steps=500,
    save_total_limit=3,
    # Logging
    logging_dir="./logs",
    logging_steps=10,
)
55
 
56
+ # Data collator to pad sequences to the same length
57
def data_collator(features, pad_token_id=None, label_pad_token_id=-100):
    """Pad a list of tokenized examples into a batch of tensors.

    Each of ``input_ids``, ``attention_mask`` and ``labels`` that is present in
    the examples is right-padded to the longest sequence for that key in the
    batch. The input feature lists are not modified.

    Args:
        features: Non-empty list of dicts mapping column names to sequences of
            token ids.
        pad_token_id: Fill value for ``input_ids``. When ``None``, the
            module-level ``tokenizer.pad_token_id`` is used.
        label_pad_token_id: Fill value for ``labels``. The default of -100 is
            the index the loss ignores, so padding never contributes to it.

    Returns:
        Dict mapping each present key to a ``torch.Tensor`` of shape
        ``(len(features), longest_sequence_for_that_key)``.

    Raises:
        ValueError: If ``features`` is empty.
    """
    if not features:
        raise ValueError("data_collator received an empty batch")
    if pad_token_id is None:
        pad_token_id = tokenizer.pad_token_id

    # Per-key fill values: padding with a literal 0 would inject a real
    # vocabulary id into input_ids and labels.
    fill_values = {
        "input_ids": pad_token_id,
        "attention_mask": 0,
        "labels": label_pad_token_id,
    }

    batch = {}
    for key, fill in fill_values.items():
        # Keys absent from the examples (e.g. no labels) are simply skipped.
        if key not in features[0]:
            continue
        rows = []
        for feature in features:
            values = feature[key]
            # Copy so the caller's feature lists are never mutated.
            rows.append(values.tolist() if hasattr(values, "tolist") else list(values))
        width = max(len(row) for row in rows)
        batch[key] = torch.tensor(
            [row + [fill] * (width - len(row)) for row in rows]
        )
    return batch
66
+
67
# Wire the model, training arguments, tokenized splits, tokenizer and the
# custom collator into a seq2seq trainer.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
)

# Fine-tune on the 'train' split.
trainer.train()

# Score the fine-tuned model on the 'test' split and report the metrics.
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)
83
+
84
+ # Test the model with sample inputs
85
def translate_text(text):
    """Translate English text with the module-level model and tokenizer.

    Args:
        text: A single sentence or a list of sentences.

    Returns:
        A list of decoded translations, one per generated sequence, with
        special tokens removed.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    # Training may have moved the model to an accelerator; the inputs must be
    # on the same device or generate() fails with a device mismatch.
    inputs = inputs.to(model.device)

    # Generate in eval mode so dropout is disabled, then restore whatever mode
    # the model was in so calling this mid-training has no lasting effect.
    was_training = model.training
    model.eval()
    try:
        translated = model.generate(**inputs)
    finally:
        model.train(was_training)
    return [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
89
 
90
# Smoke-test: translate one English sentence and print the pair, one per line.
sample_text = "How are you?"
hindi_translation = translate_text(sample_text)
print(f"English: {sample_text}", f"Hindi: {hindi_translation[0]}", sep="\n")