JoeArmani
committed on
Commit
·
3190e1e
0
Parent(s):
Initial commit
Browse files
This view is limited to 50 files because it contains too many changes.
See raw diff
- .DS_Store +0 -0
- __pycache__/back_translator.cpython-310.pyc +0 -0
- __pycache__/dialogue_augmenter.cpython-310.pyc +0 -0
- __pycache__/paraphraser.cpython-310.pyc +0 -0
- __pycache__/pipeline_config.cpython-310.pyc +0 -0
- __pycache__/processing_pipeline.cpython-310.pyc +0 -0
- __pycache__/quality_metrics.cpython-310.pyc +0 -0
- __pycache__/schema_guided_dialogue_processor.cpython-310.pyc +0 -0
- __pycache__/taskmaster_processor.cpython-310.pyc +0 -0
- augmented_combined_dataset.json +0 -0
- back_translator.py +56 -0
- datasets/.DS_Store +0 -0
- datasets/schema_guided/dialogues_001.json +0 -0
- datasets/schema_guided/dialogues_002.json +0 -0
- datasets/schema_guided/dialogues_003.json +0 -0
- datasets/schema_guided/dialogues_004.json +0 -0
- datasets/schema_guided/dialogues_005.json +0 -0
- datasets/schema_guided/dialogues_006.json +0 -0
- datasets/schema_guided/dialogues_007.json +0 -0
- datasets/schema_guided/dialogues_008.json +0 -0
- datasets/schema_guided/dialogues_009.json +0 -0
- datasets/schema_guided/dialogues_010.json +0 -0
- datasets/schema_guided/dialogues_011.json +0 -0
- datasets/schema_guided/dialogues_012.json +0 -0
- datasets/schema_guided/dialogues_013.json +0 -0
- datasets/schema_guided/dialogues_014.json +0 -0
- datasets/schema_guided/dialogues_015.json +0 -0
- datasets/schema_guided/dialogues_016.json +0 -0
- datasets/schema_guided/dialogues_017.json +0 -0
- datasets/schema_guided/dialogues_018.json +0 -0
- datasets/schema_guided/dialogues_019.json +0 -0
- datasets/schema_guided/dialogues_020.json +0 -0
- datasets/schema_guided/dialogues_021.json +0 -0
- datasets/schema_guided/dialogues_022.json +0 -0
- datasets/schema_guided/dialogues_023.json +0 -0
- datasets/schema_guided/dialogues_024.json +0 -0
- datasets/schema_guided/dialogues_025.json +0 -0
- datasets/schema_guided/dialogues_026.json +0 -0
- datasets/schema_guided/dialogues_027.json +0 -0
- datasets/schema_guided/dialogues_028.json +0 -0
- datasets/schema_guided/dialogues_029.json +0 -0
- datasets/schema_guided/dialogues_030.json +0 -0
- datasets/schema_guided/dialogues_031.json +0 -0
- datasets/schema_guided/dialogues_032.json +0 -0
- datasets/schema_guided/dialogues_033.json +0 -0
- datasets/schema_guided/dialogues_034.json +0 -0
- datasets/schema_guided/dialogues_035.json +0 -0
- datasets/schema_guided/dialogues_036.json +0 -0
- datasets/schema_guided/dialogues_037.json +0 -0
- datasets/schema_guided/dialogues_038.json +0 -0
.DS_Store
ADDED
Binary file (8.2 kB). View file
|
|
__pycache__/back_translator.cpython-310.pyc
ADDED
Binary file (2.25 kB). View file
|
|
__pycache__/dialogue_augmenter.cpython-310.pyc
ADDED
Binary file (14.5 kB). View file
|
|
__pycache__/paraphraser.cpython-310.pyc
ADDED
Binary file (1.55 kB). View file
|
|
__pycache__/pipeline_config.cpython-310.pyc
ADDED
Binary file (2.06 kB). View file
|
|
__pycache__/processing_pipeline.cpython-310.pyc
ADDED
Binary file (6.51 kB). View file
|
|
__pycache__/quality_metrics.cpython-310.pyc
ADDED
Binary file (4.49 kB). View file
|
|
__pycache__/schema_guided_dialogue_processor.cpython-310.pyc
ADDED
Binary file (5.82 kB). View file
|
|
__pycache__/taskmaster_processor.cpython-310.pyc
ADDED
Binary file (5.72 kB). View file
|
|
augmented_combined_dataset.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
back_translator.py
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import (
|
2 |
+
MarianMTModel,
|
3 |
+
MarianTokenizer,
|
4 |
+
)
|
5 |
+
|
6 |
+
class BackTranslator:
    """
    Perform back-translation through a pivot language to paraphrase text.

    The text is translated source -> pivot -> target -> source using three
    Helsinki-NLP MarianMT checkpoints. With the default arguments the chain
    is English -> German -> Spanish -> English.

    Args:
        source_lang: Source language (default: 'en')
        pivot_lang: Pivot language (default: 'de')
        target_lang: Target language (default: 'es')
    Examples:
        back_translator = BackTranslator()
        back_translator.back_translate("Hello, how are you?")
    """
    def __init__(self, source_lang='en', pivot_lang='de', target_lang='es'):
        # Forward: source -> pivot (English to German by default)
        pivot_forward_model_name = f'Helsinki-NLP/opus-mt-{source_lang}-{pivot_lang}'
        self.tokenizer_pivot_forward = MarianTokenizer.from_pretrained(pivot_forward_model_name)
        self.model_pivot_forward = MarianMTModel.from_pretrained(pivot_forward_model_name)

        # Pivot translation model: pivot -> target (German to Spanish by default)
        pivot_backward_model_name = f'Helsinki-NLP/opus-mt-{pivot_lang}-{target_lang}'
        self.tokenizer_pivot_backward = MarianTokenizer.from_pretrained(pivot_backward_model_name)
        self.model_pivot_backward = MarianMTModel.from_pretrained(pivot_backward_model_name)

        # Backward: target -> source (Spanish to English by default)
        backward_model_name = f'Helsinki-NLP/opus-mt-{target_lang}-{source_lang}'
        self.tokenizer_backward = MarianTokenizer.from_pretrained(backward_model_name)
        self.model_backward = MarianMTModel.from_pretrained(backward_model_name)

    def back_translate(self, text):
        """
        Perform back-translation through the pivot and target languages to
        generate a variation of the input text.

        Args:
            text (str): The input text to be back-translated

        Returns:
            str: The back-translated text. Empty or whitespace-only input is
                returned unchanged without running any model.
        """
        # Nothing to translate: running the models on blank input can only
        # produce text that was never in the source, so hand it back as-is.
        if not text.strip():
            return text

        # 1. Source to pivot
        pivot_text = self._translate(text, self.tokenizer_pivot_forward, self.model_pivot_forward)

        # 2. Pivot to target
        target_text = self._translate(pivot_text, self.tokenizer_pivot_backward, self.model_pivot_backward)

        # 3. Target back to source
        return self._translate(target_text, self.tokenizer_backward, self.model_backward)

    @staticmethod
    def _translate(text, tokenizer, model):
        """Translate a single string with one tokenizer/model pair and return the decoded text."""
        # The tokenizer is given a one-element batch; truncation=True is kept
        # from the original call (presumably caps input at the model's
        # maximum length - confirm against the tokenizer docs).
        encoded = tokenizer([text], padding=True, truncation=True, return_tensors='pt')
        generated = model.generate(**encoded)
        # One input in, so take the first (only) decoded sequence.
        return tokenizer.batch_decode(generated, skip_special_tokens=True)[0]
|
datasets/.DS_Store
ADDED
Binary file (8.2 kB). View file
|
|
datasets/schema_guided/dialogues_001.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
datasets/schema_guided/dialogues_002.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
datasets/schema_guided/dialogues_003.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
datasets/schema_guided/dialogues_004.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
datasets/schema_guided/dialogues_005.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
datasets/schema_guided/dialogues_006.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
datasets/schema_guided/dialogues_007.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
datasets/schema_guided/dialogues_008.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
datasets/schema_guided/dialogues_009.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
datasets/schema_guided/dialogues_010.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
datasets/schema_guided/dialogues_011.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
datasets/schema_guided/dialogues_012.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
datasets/schema_guided/dialogues_013.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
datasets/schema_guided/dialogues_014.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
datasets/schema_guided/dialogues_015.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
datasets/schema_guided/dialogues_016.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
datasets/schema_guided/dialogues_017.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
datasets/schema_guided/dialogues_018.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
datasets/schema_guided/dialogues_019.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
datasets/schema_guided/dialogues_020.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
datasets/schema_guided/dialogues_021.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
datasets/schema_guided/dialogues_022.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
datasets/schema_guided/dialogues_023.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
datasets/schema_guided/dialogues_024.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
datasets/schema_guided/dialogues_025.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
datasets/schema_guided/dialogues_026.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
datasets/schema_guided/dialogues_027.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
datasets/schema_guided/dialogues_028.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
datasets/schema_guided/dialogues_029.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
datasets/schema_guided/dialogues_030.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
datasets/schema_guided/dialogues_031.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
datasets/schema_guided/dialogues_032.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
datasets/schema_guided/dialogues_033.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
datasets/schema_guided/dialogues_034.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
datasets/schema_guided/dialogues_035.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
datasets/schema_guided/dialogues_036.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
datasets/schema_guided/dialogues_037.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
datasets/schema_guided/dialogues_038.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|