JoeArmani committed on
Commit
3190e1e
·
0 Parent(s):

Initial commit

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .DS_Store +0 -0
  2. __pycache__/back_translator.cpython-310.pyc +0 -0
  3. __pycache__/dialogue_augmenter.cpython-310.pyc +0 -0
  4. __pycache__/paraphraser.cpython-310.pyc +0 -0
  5. __pycache__/pipeline_config.cpython-310.pyc +0 -0
  6. __pycache__/processing_pipeline.cpython-310.pyc +0 -0
  7. __pycache__/quality_metrics.cpython-310.pyc +0 -0
  8. __pycache__/schema_guided_dialogue_processor.cpython-310.pyc +0 -0
  9. __pycache__/taskmaster_processor.cpython-310.pyc +0 -0
  10. augmented_combined_dataset.json +0 -0
  11. back_translator.py +56 -0
  12. datasets/.DS_Store +0 -0
  13. datasets/schema_guided/dialogues_001.json +0 -0
  14. datasets/schema_guided/dialogues_002.json +0 -0
  15. datasets/schema_guided/dialogues_003.json +0 -0
  16. datasets/schema_guided/dialogues_004.json +0 -0
  17. datasets/schema_guided/dialogues_005.json +0 -0
  18. datasets/schema_guided/dialogues_006.json +0 -0
  19. datasets/schema_guided/dialogues_007.json +0 -0
  20. datasets/schema_guided/dialogues_008.json +0 -0
  21. datasets/schema_guided/dialogues_009.json +0 -0
  22. datasets/schema_guided/dialogues_010.json +0 -0
  23. datasets/schema_guided/dialogues_011.json +0 -0
  24. datasets/schema_guided/dialogues_012.json +0 -0
  25. datasets/schema_guided/dialogues_013.json +0 -0
  26. datasets/schema_guided/dialogues_014.json +0 -0
  27. datasets/schema_guided/dialogues_015.json +0 -0
  28. datasets/schema_guided/dialogues_016.json +0 -0
  29. datasets/schema_guided/dialogues_017.json +0 -0
  30. datasets/schema_guided/dialogues_018.json +0 -0
  31. datasets/schema_guided/dialogues_019.json +0 -0
  32. datasets/schema_guided/dialogues_020.json +0 -0
  33. datasets/schema_guided/dialogues_021.json +0 -0
  34. datasets/schema_guided/dialogues_022.json +0 -0
  35. datasets/schema_guided/dialogues_023.json +0 -0
  36. datasets/schema_guided/dialogues_024.json +0 -0
  37. datasets/schema_guided/dialogues_025.json +0 -0
  38. datasets/schema_guided/dialogues_026.json +0 -0
  39. datasets/schema_guided/dialogues_027.json +0 -0
  40. datasets/schema_guided/dialogues_028.json +0 -0
  41. datasets/schema_guided/dialogues_029.json +0 -0
  42. datasets/schema_guided/dialogues_030.json +0 -0
  43. datasets/schema_guided/dialogues_031.json +0 -0
  44. datasets/schema_guided/dialogues_032.json +0 -0
  45. datasets/schema_guided/dialogues_033.json +0 -0
  46. datasets/schema_guided/dialogues_034.json +0 -0
  47. datasets/schema_guided/dialogues_035.json +0 -0
  48. datasets/schema_guided/dialogues_036.json +0 -0
  49. datasets/schema_guided/dialogues_037.json +0 -0
  50. datasets/schema_guided/dialogues_038.json +0 -0
.DS_Store ADDED
Binary file (8.2 kB). View file
 
__pycache__/back_translator.cpython-310.pyc ADDED
Binary file (2.25 kB). View file
 
__pycache__/dialogue_augmenter.cpython-310.pyc ADDED
Binary file (14.5 kB). View file
 
__pycache__/paraphraser.cpython-310.pyc ADDED
Binary file (1.55 kB). View file
 
__pycache__/pipeline_config.cpython-310.pyc ADDED
Binary file (2.06 kB). View file
 
__pycache__/processing_pipeline.cpython-310.pyc ADDED
Binary file (6.51 kB). View file
 
__pycache__/quality_metrics.cpython-310.pyc ADDED
Binary file (4.49 kB). View file
 
__pycache__/schema_guided_dialogue_processor.cpython-310.pyc ADDED
Binary file (5.82 kB). View file
 
__pycache__/taskmaster_processor.cpython-310.pyc ADDED
Binary file (5.72 kB). View file
 
augmented_combined_dataset.json ADDED
The diff for this file is too large to render. See raw diff
 
back_translator.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import (
2
+ MarianMTModel,
3
+ MarianTokenizer,
4
+ )
5
+
6
class BackTranslator:
    """
    Generate text variations by back-translating through a chain of languages.

    Text is translated source -> pivot -> target -> source. With the default
    arguments that is English -> German -> Spanish -> English. Each hop loads
    the `Helsinki-NLP/opus-mt-{from}-{to}` MarianMT checkpoint, so a checkpoint
    must exist for every language pair in the chain.

    Args:
        source_lang: Source language (default: 'en')
        pivot_lang: Pivot language (default: 'de')
        target_lang: Target language (default: 'es')
    Examples:
        back_translator = BackTranslator()
        back_translator.back_translate("Hello, how are you?")
    """

    def __init__(self, source_lang='en', pivot_lang='de', target_lang='es'):
        # Hop 1: source -> pivot (English to German by default)
        pivot_forward_model_name = f'Helsinki-NLP/opus-mt-{source_lang}-{pivot_lang}'
        self.tokenizer_pivot_forward = MarianTokenizer.from_pretrained(pivot_forward_model_name)
        self.model_pivot_forward = MarianMTModel.from_pretrained(pivot_forward_model_name)

        # Hop 2: pivot -> target (German to Spanish by default)
        pivot_backward_model_name = f'Helsinki-NLP/opus-mt-{pivot_lang}-{target_lang}'
        self.tokenizer_pivot_backward = MarianTokenizer.from_pretrained(pivot_backward_model_name)
        self.model_pivot_backward = MarianMTModel.from_pretrained(pivot_backward_model_name)

        # Hop 3: target -> source (Spanish to English by default)
        backward_model_name = f'Helsinki-NLP/opus-mt-{target_lang}-{source_lang}'
        self.tokenizer_backward = MarianTokenizer.from_pretrained(backward_model_name)
        self.model_backward = MarianMTModel.from_pretrained(backward_model_name)

    @staticmethod
    def _translate(text, tokenizer, model):
        """Translate one string with the given tokenizer/model pair and return the decoded text."""
        # The text is wrapped in a one-element batch; [0] unwraps the single result.
        # NOTE(review): truncation=True presumably clips over-long inputs to the
        # model's maximum length, silently dropping the tail - confirm.
        encoded = tokenizer([text], padding=True, truncation=True, return_tensors='pt')
        generated = model.generate(**encoded)
        return tokenizer.batch_decode(generated, skip_special_tokens=True)[0]

    def back_translate(self, text):
        """
        Perform back-translation through the pivot and target languages to generate a text variation.

        Args:
            text (str): The input text to be back-translated

        Returns:
            str: The back-translated text. Empty or whitespace-only input is
            returned unchanged without running the models.
        """
        # Nothing to translate: skip three model passes whose output for blank
        # input would be meaningless.
        if isinstance(text, str) and not text.strip():
            return text

        # 1. Source to pivot (English to German by default)
        pivot_text = self._translate(text, self.tokenizer_pivot_forward, self.model_pivot_forward)

        # 2. Pivot to target (German to Spanish by default)
        tgt_text_back = self._translate(pivot_text, self.tokenizer_pivot_backward, self.model_pivot_backward)

        # 3. Target back to source (Spanish to English by default)
        src_text = self._translate(tgt_text_back, self.tokenizer_backward, self.model_backward)
        return src_text
datasets/.DS_Store ADDED
Binary file (8.2 kB). View file
 
datasets/schema_guided/dialogues_001.json ADDED
The diff for this file is too large to render. See raw diff
 
datasets/schema_guided/dialogues_002.json ADDED
The diff for this file is too large to render. See raw diff
 
datasets/schema_guided/dialogues_003.json ADDED
The diff for this file is too large to render. See raw diff
 
datasets/schema_guided/dialogues_004.json ADDED
The diff for this file is too large to render. See raw diff
 
datasets/schema_guided/dialogues_005.json ADDED
The diff for this file is too large to render. See raw diff
 
datasets/schema_guided/dialogues_006.json ADDED
The diff for this file is too large to render. See raw diff
 
datasets/schema_guided/dialogues_007.json ADDED
The diff for this file is too large to render. See raw diff
 
datasets/schema_guided/dialogues_008.json ADDED
The diff for this file is too large to render. See raw diff
 
datasets/schema_guided/dialogues_009.json ADDED
The diff for this file is too large to render. See raw diff
 
datasets/schema_guided/dialogues_010.json ADDED
The diff for this file is too large to render. See raw diff
 
datasets/schema_guided/dialogues_011.json ADDED
The diff for this file is too large to render. See raw diff
 
datasets/schema_guided/dialogues_012.json ADDED
The diff for this file is too large to render. See raw diff
 
datasets/schema_guided/dialogues_013.json ADDED
The diff for this file is too large to render. See raw diff
 
datasets/schema_guided/dialogues_014.json ADDED
The diff for this file is too large to render. See raw diff
 
datasets/schema_guided/dialogues_015.json ADDED
The diff for this file is too large to render. See raw diff
 
datasets/schema_guided/dialogues_016.json ADDED
The diff for this file is too large to render. See raw diff
 
datasets/schema_guided/dialogues_017.json ADDED
The diff for this file is too large to render. See raw diff
 
datasets/schema_guided/dialogues_018.json ADDED
The diff for this file is too large to render. See raw diff
 
datasets/schema_guided/dialogues_019.json ADDED
The diff for this file is too large to render. See raw diff
 
datasets/schema_guided/dialogues_020.json ADDED
The diff for this file is too large to render. See raw diff
 
datasets/schema_guided/dialogues_021.json ADDED
The diff for this file is too large to render. See raw diff
 
datasets/schema_guided/dialogues_022.json ADDED
The diff for this file is too large to render. See raw diff
 
datasets/schema_guided/dialogues_023.json ADDED
The diff for this file is too large to render. See raw diff
 
datasets/schema_guided/dialogues_024.json ADDED
The diff for this file is too large to render. See raw diff
 
datasets/schema_guided/dialogues_025.json ADDED
The diff for this file is too large to render. See raw diff
 
datasets/schema_guided/dialogues_026.json ADDED
The diff for this file is too large to render. See raw diff
 
datasets/schema_guided/dialogues_027.json ADDED
The diff for this file is too large to render. See raw diff
 
datasets/schema_guided/dialogues_028.json ADDED
The diff for this file is too large to render. See raw diff
 
datasets/schema_guided/dialogues_029.json ADDED
The diff for this file is too large to render. See raw diff
 
datasets/schema_guided/dialogues_030.json ADDED
The diff for this file is too large to render. See raw diff
 
datasets/schema_guided/dialogues_031.json ADDED
The diff for this file is too large to render. See raw diff
 
datasets/schema_guided/dialogues_032.json ADDED
The diff for this file is too large to render. See raw diff
 
datasets/schema_guided/dialogues_033.json ADDED
The diff for this file is too large to render. See raw diff
 
datasets/schema_guided/dialogues_034.json ADDED
The diff for this file is too large to render. See raw diff
 
datasets/schema_guided/dialogues_035.json ADDED
The diff for this file is too large to render. See raw diff
 
datasets/schema_guided/dialogues_036.json ADDED
The diff for this file is too large to render. See raw diff
 
datasets/schema_guided/dialogues_037.json ADDED
The diff for this file is too large to render. See raw diff
 
datasets/schema_guided/dialogues_038.json ADDED
The diff for this file is too large to render. See raw diff