wony617 committed on
Commit
22556a8
·
1 Parent(s): ec613d7

Add agent workflow

Browse files
agent/toctree_handler.py ADDED
@@ -0,0 +1,298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import yaml
2
+ import requests
3
+ from typing import Dict, List, Any
4
+ import os
5
+
6
class TocTreeHandler:
    """Keep the Korean docs ``_toctree.yml`` in step with the English one.

    The handler downloads the English and Korean toctrees of the
    ``huggingface/transformers`` documentation, derives or translates Korean
    titles, asks an LLM to insert a newly translated page at the matching
    position of the Korean toctree, and commits the result through a GitHub
    PR agent.
    """

    # Seconds to wait for GitHub before giving up; without a timeout
    # ``requests`` can block forever on a stalled connection.
    REQUEST_TIMEOUT = 30

    # Class-level default so the attribute always exists, even when
    # ``process_pr_commit`` has not run or has failed.
    updated_ko_toctree = None

    def __init__(self):
        self.en_toctree_url = "https://raw.githubusercontent.com/huggingface/transformers/main/docs/source/en/_toctree.yml"
        self.ko_toctree_url = "https://raw.githubusercontent.com/huggingface/transformers/main/docs/source/ko/_toctree.yml"
        self.local_docs_path = "docs/source/ko"
        # Set by process_pr_commit(), consumed by commit_and_push_toctree().
        self.updated_ko_toctree = None

    def fetch_toctree(self, url: str) -> Any:
        """Download ``url`` and return its content parsed as YAML.

        Raises:
            requests.RequestException: on network errors, timeouts or a
                non-2xx HTTP status.
            yaml.YAMLError: if the payload is not valid YAML.
        """
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        return yaml.safe_load(response.text)

    def get_en_toctree(self) -> Any:
        """Return the parsed English toctree."""
        return self.fetch_toctree(self.en_toctree_url)

    def get_ko_toctree(self) -> Any:
        """Return the parsed Korean toctree."""
        return self.fetch_toctree(self.ko_toctree_url)

    def extract_title_mappings(self, en_data: List[Dict], ko_data: List[Dict]) -> Dict[str, str]:
        """Map English titles to Korean titles for pages present in both trees.

        Entries are matched by their ``local`` path at any nesting depth
        rather than by list position: the Korean toctree is usually a subset
        of the English one, so positional pairing stops lining up as soon as
        a single page is missing.

        Args:
            en_data: Parsed English toctree (list of entry dicts, optionally
                nested through ``sections``).
            ko_data: Parsed Korean toctree with the same layout.

        Returns:
            ``{english_title: korean_title}`` for every ``local`` found in
            both trees where both titles are non-empty.
        """

        def walk(entries):
            # Depth-first over every dict entry; anything malformed is skipped.
            if not isinstance(entries, list):
                return
            for entry in entries:
                if isinstance(entry, dict):
                    yield entry
                    yield from walk(entry.get('sections'))

        ko_titles_by_local: Dict[str, str] = {}
        for ko_entry in walk(ko_data):
            local = ko_entry.get('local')
            title = ko_entry.get('title', '')
            if local and title:
                # First occurrence wins if a path is listed twice.
                ko_titles_by_local.setdefault(local, title)

        mappings: Dict[str, str] = {}
        for en_entry in walk(en_data):
            en_title = en_entry.get('title', '')
            ko_title = ko_titles_by_local.get(en_entry.get('local'))
            if en_title and ko_title:
                mappings[en_title] = ko_title
        return mappings

    def translate_title(self, en_title: str) -> str:
        """Translate ``en_title`` to Korean with the project's LLM helper.

        Returns the English title unchanged when the helper cannot be
        imported, the call fails, or the model returns an empty string.
        """
        try:
            # Imported lazily so the handler can be used without the
            # translator package being importable.
            from translator.content import llm_translate

            prompt = f"""Translate the following English documentation title to Korean. Return only the translated title, nothing else.

English title: {en_title}

Korean title:"""

            # llm_translate returns (callback_result, text); only the text is used.
            _, translated_title = llm_translate(prompt)
            return translated_title.strip() or en_title
        except Exception as e:
            print(f"Error translating title '{en_title}': {e}")
            return en_title

    def create_local_toctree(self, en_title: str, local_file_path: str) -> Dict[str, str]:
        """Build a ``{'local', 'title'}`` toctree entry with a Korean title.

        The title comes from the existing English/Korean mapping when
        available, otherwise from an LLM translation. On any failure the
        entry is returned with the English title.
        """
        try:
            en_data = self.get_en_toctree()
            ko_data = self.get_ko_toctree()

            known_titles = self.extract_title_mappings(en_data, ko_data)
            ko_title = known_titles.get(en_title) or self.translate_title(en_title)

            return {
                'local': local_file_path,
                'title': ko_title
            }
        except Exception as e:
            print(f"Error creating local toctree: {e}")
            return {
                'local': local_file_path,
                'title': en_title
            }

    def update_local_toctree_file(self, new_entries: List[Dict[str, str]]):
        """Append ``new_entries`` to the local ``_toctree.yml``, creating it if needed.

        Entries already present (exact dict equality) are not duplicated.
        """
        toctree_path = os.path.join(self.local_docs_path, "_toctree.yml")

        os.makedirs(self.local_docs_path, exist_ok=True)

        existing_data = []
        if os.path.exists(toctree_path):
            with open(toctree_path, 'r', encoding='utf-8') as f:
                # An empty file parses to None; treat it as an empty toctree.
                existing_data = yaml.safe_load(f) or []

        for entry in new_entries:
            if entry not in existing_data:
                existing_data.append(entry)

        with open(toctree_path, 'w', encoding='utf-8') as f:
            yaml.dump(existing_data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    @staticmethod
    def _extract_yaml_block(response: str) -> str:
        """Return the body of the first ```yaml fence, or the whole text if none."""
        fence = "```yaml"
        start = response.find(fence)
        if start == -1:
            return response.strip()
        start += len(fence)
        end = response.find("```", start)
        if end == -1:
            # Unterminated fence: keep everything after the opener instead of
            # letting find()'s -1 chop off the final character.
            end = len(response)
        return response[start:end].strip()

    def create_updated_toctree_with_llm(self, en_toctree_yaml: str, ko_toctree_yaml: str, target_local: str) -> Any:
        """Ask the LLM for the Korean toctree with ``target_local`` inserted.

        Args:
            en_toctree_yaml: English toctree serialised as YAML.
            ko_toctree_yaml: Current Korean toctree serialised as YAML.
            target_local: ``local`` path of the page to add.

        Returns:
            The parsed, updated Korean toctree (a list), or ``None`` when the
            LLM call fails or its answer is not a YAML list.
        """
        try:
            from translator.content import llm_translate

            prompt = f"""You are given English and Korean toctree YAML structures. You need to:

1. Find the entry(local, title) with `- local: {target_local}` in the English toctree
2. Translate its title to Korean
3. Insert this new entry into the Korean toctree at the same position as it appears in the English toctree
4. Return the complete updated Korean toctree

English toctree YAML:
```yaml
{en_toctree_yaml}
```

Current Korean toctree YAML:
```yaml
{ko_toctree_yaml}
```

Target local path to add: "{target_local}"

Return the complete updated Korean toctree in YAML format:
```yaml
# Updated Korean toctree with new entry inserted at correct position
[complete toctree structure here]
```

Important positioning rules:
- Find the exact position (index and nesting level) of the target entry in the English toctree
- Count from the beginning: if it's the 5th item in English toctree, it should be the 5th item in Korean toctree
- If it's inside a 'sections' array, maintain that nesting structure
- Keep all existing Korean entries in their current positions
- Insert the new Korean entry at the exact same position as the English entry
- If there are gaps in positions (missing entries), maintain those gaps
- Preserve the exact YAML structure: {{local: "path", title: "title"}} or {{local: "path", title: "title", sections: [...]}}

Example: If English entry is at position [2] (3rd item), insert Korean entry at position [2] in Korean toctree
Example: If English entry is at position [1]['sections'][0] (1st item in sections of 2nd entry), insert at same nested position"""

            _, response = llm_translate(prompt)
            response = response.strip()

            try:
                updated_ko_toctree = yaml.safe_load(self._extract_yaml_block(response))
            except Exception as e:
                print(f"Failed to parse LLM YAML response: {e}")
                print(f"Response was: {response}")
                return None

            # The toctree is a top-level YAML list; anything else (a bare
            # string, a mapping) means the model did not follow the format
            # and must not be committed.
            if not isinstance(updated_ko_toctree, list):
                print("Failed to parse LLM YAML response: result is not a list")
                print(f"Response was: {response}")
                return None
            return updated_ko_toctree

        except Exception as e:
            print(f"Error using LLM to create updated toctree: {e}")
            return None

    def process_pr_commit(self, en_titles: List[str], local_paths: List[str], filepath: str):
        """Compute the updated Korean toctree for ``filepath`` and store it.

        The result is kept in ``self.updated_ko_toctree`` for
        ``commit_and_push_toctree``. ``en_titles`` and ``local_paths`` are
        accepted for interface compatibility but not used.

        Returns:
            Always an empty list.
        """
        # Drop any result from a previous call so a failure here can never
        # lead to committing a stale toctree.
        self.updated_ko_toctree = None

        filepath_without_prefix = filepath.replace("docs/source/en/", "").replace(".md", "")

        en_toctree = self.get_en_toctree()
        ko_toctree = self.get_ko_toctree()

        # sort_keys=False keeps each entry's keys in file order, matching how
        # the toctree is written back elsewhere in this class.
        en_toctree_yaml = yaml.dump(en_toctree, allow_unicode=True, default_flow_style=False, sort_keys=False)
        ko_toctree_yaml = yaml.dump(ko_toctree, allow_unicode=True, default_flow_style=False, sort_keys=False)

        updated_ko_toctree = self.create_updated_toctree_with_llm(en_toctree_yaml, ko_toctree_yaml, filepath_without_prefix)

        if not updated_ko_toctree:
            print(f"Failed to create updated Korean toctree for local: {filepath_without_prefix}")
            return []

        print("LLM successfully updated Korean toctree")

        self.updated_ko_toctree = updated_ko_toctree

        print(f"Updated Korean toctree has {len(updated_ko_toctree)} items")

        return []

    def commit_and_push_toctree(self, pr_agent, owner: str, repo_name: str, branch_name: str):
        """Commit the stored Korean toctree to ``branch_name`` as its own commit.

        Args:
            pr_agent: Object exposing ``create_or_update_file(...)``; its
                return value is expected to be a string starting with
                ``"SUCCESS"`` on success.
            owner: Repository owner.
            repo_name: Repository name.
            branch_name: Branch that receives the commit.

        Returns:
            ``{"status": "success" | "error", "message": ...}``; on success
            the dict also carries ``"commit_message"``.
        """
        try:
            if not getattr(self, 'updated_ko_toctree', None):
                print("No updated Korean toctree available")
                return {"status": "error", "message": "No updated toctree to commit"}

            toctree_content = yaml.dump(
                self.updated_ko_toctree,
                allow_unicode=True,
                default_flow_style=False,
                sort_keys=False,
            )

            commit_message = "docs: update Korean documentation table of contents"

            file_result = pr_agent.create_or_update_file(
                owner=owner,
                repo_name=repo_name,
                path="docs/source/ko/_toctree.yml",
                message=commit_message,
                content=toctree_content,
                branch_name=branch_name
            )

            if file_result.startswith("SUCCESS"):
                return {
                    "status": "success",
                    "message": f"Toctree committed successfully: {file_result}",
                    "commit_message": commit_message
                }
            return {
                "status": "error",
                "message": f"Toctree commit failed: {file_result}"
            }

        except Exception as e:
            return {
                "status": "error",
                "message": f"Error committing toctree: {str(e)}"
            }

    def update_toctree_after_translation(
        self,
        translation_result: dict,
        en_title: str,
        filepath: str,
        pr_agent,
        github_config: dict
    ) -> dict:
        """Update toctree after successful translation PR.

        Args:
            translation_result: Result from translation PR workflow; must
                contain ``"status"`` and, when not an error, ``"branch"``.
            en_title: English title for toctree mapping.
            filepath: Original file path.
            pr_agent: GitHub PR agent instance.
            github_config: GitHub configuration dictionary with ``"owner"``
                and ``"repo_name"``.

        Returns:
            Dictionary with the toctree update result, or ``None`` when the
            translation failed or ``en_title`` is empty.
        """
        if translation_result["status"] == "error" or not en_title:
            return None

        try:
            local_path = filepath.split("/")[-1].replace(".md", "")

            # Stores the LLM-updated toctree on self (or clears it on failure).
            self.process_pr_commit([en_title], [local_path], filepath)

            return self.commit_and_push_toctree(
                pr_agent=pr_agent,
                owner=github_config["owner"],
                repo_name=github_config["repo_name"],
                branch_name=translation_result["branch"]
            )

        except Exception as e:
            return {
                "status": "error",
                "message": f"Error updating toctree: {str(e)}"
            }
translator/prompt_glossary.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Prompt fragment instructing the LLM to apply the project's English -> Korean
# glossary when revising a translation. Each glossary line is
# "- <English term>: <Korean term>"; every English term is listed once.
PROMPT_WITH_GLOSSARY = """
You have a glossary of terms with their Korean translations. When translating a sentence, you need to check if any of the words in the sentence are in the glossary, and if so, translate them according to the provided Korean terms. Here is the glossary:

🔹 Glossary (English → Korean):
- revision: 개정
- method: 메소드
- secrets: 비밀값
- search helper: 검색 헬퍼
- logging level: 로그 레벨
- workflow: 워크플로우
- corner case: 코너 케이스
- tokenization: 토큰화
- architecture: 아키텍처
- attention mask: 어텐션 마스크
- backbone: 백본
- argmax: argmax
- beam search: 빔 서치
- clustering: 군집화
- configuration: 구성
- context: 문맥
- cross entropy: 교차 엔트로피
- cross-attention: 크로스 어텐션
- dictionary: 딕셔너리
- entry: 엔트리
- few shot: 퓨샷
- flatten: 평탄화
- ground truth: 정답
- head: 헤드
- helper function: 헬퍼 함수
- image captioning: 이미지 캡셔닝
- image patch: 이미지 패치
- inference: 추론
- instance: 인스턴스
- Instantiate: 인스턴스화
- knowledge distillation: 지식 증류
- labels: 레이블
- large language models (LLM): 대규모 언어 모델
- layer: 레이어
- learning rate scheduler: Learning Rate Scheduler
- localization: 로컬리제이션
- log mel-filter bank: 로그 멜 필터 뱅크
- look-up table: 룩업 테이블
- loss function: 손실 함수
- machine learning: 머신 러닝
- mapping: 매핑
- masked language modeling (MLM): 마스크드 언어 모델
- malware: 악성코드
- metric: 지표
- mixed precision: 혼합 정밀도
- modality: 모달리티
- monolingual model: 단일 언어 모델
- multi gpu: 다중 GPU
- multilingual model: 다국어 모델
- parsing: 파싱
- perplexity (PPL): 펄플렉서티(Perplexity)
- pipeline: 파이프라인
- pixel values: 픽셀 값
- pooling: 풀링
- position IDs: 위치 ID
- preprocessing: 전처리
- prompt: 프롬프트
- pythonic: 파이써닉
- query: 쿼리
- question answering: 질의 응답
- raw audio waveform: 원시 오디오 파형
- recurrent neural network (RNN): 순환 신경망
- accelerator: 가속기
- Accelerate: Accelerate
- arguments: 인수
- augmentation: 증강
- autoencoding models: 오토인코딩 모델
- autoregressive models: 자기회귀 모델
- backward: 역방향
- bounding box: 바운딩 박스
- causal language modeling: 인과적 언어 모델링(causal language modeling)
- channel: 채널
- checkpoint: 체크포인트(checkpoint)
- chunk: 묶음
- computer vision: 컴퓨터 비전
- convolution: 합성곱
- crop: 자르기
- custom: 사용자 정의
- customize: 맞춤 설정하다
- data collator: 데이터 콜레이터
- dataset: 데이터 세트
- decoder input IDs: 디코더 입력 ID
- decoder models: 디코더 모델
- deep learning (DL): 딥러닝
- directory: 디렉터리
- distributed training: 분산 학습
- downstream: 다운스트림
- encoder models: 인코더 모델
- entity: 개체
- epoch: 에폭
- evaluation method: 평가 방법
- feature extraction: 특성 추출
- feature matrix: 특성 행렬(feature matrix)
- fine-tuning: 미세 조정
- finetuned models: 미세 조정 모델
- hidden state: 은닉 상태
- hyperparameter: 하이퍼파라미터
- learning: 학습
- load: 가져오다
- optimizer: 옵티마이저
- pad (padding): 패드 (패딩)
- parameter: 매개변수
- pretrained model: 사전훈련된 모델
- separator (* [SEP]를 부르는 이름): 분할 토큰
- sequence: 시퀀스
- silent error: 조용한 오류
- token: 토큰
- tokenizer: 토크나이저
- training: 훈련

📌 Instructions:
1. Whenever a source term from the glossary appears **in any form** (full match or partial match within a larger phrase), **replace it with the exact Korean translation** from the glossary, keeping the rest of the phrase in Korean.
   - Example: “Attention Interface” → “어텐션 인터페이스”
   - Example: “Architecture details” → “아키텍처 상세”
2. Non-glossary words should be translated naturally, respecting context and technical nuance.

Please revise the translated sentences accordingly using the terms provided in this glossary.
"""