Merge pull request #2 from Jwaminju/update-translator
Browse files- README.md +1 -1
- agent/handler.py +81 -17
- agent/workflow.py +68 -25
- app.py +54 -11
- pr_generator/agent.py +1 -1
- translation_result/docs/source/en/accelerator_selection.md +13 -13
- translator/content.py +95 -27
- translator/retriever.py +54 -0
README.md
CHANGED
@@ -54,7 +54,7 @@ This project was specifically created to solve [Hugging Face Transformers Issue
|
|
54 |
|
55 |
## π₯ Demo Video
|
56 |
|
57 |
-
[
|
58 |
|
59 |
*Watch the complete walkthrough: from setup to PR creation in under 5 minutes*
|
60 |
|
|
|
54 |
|
55 |
## π₯ Demo Video
|
56 |
|
57 |
+
[Hugging Face i18n Agent Demo](https://youtu.be/J2MBMNk7la8?si=7867ztaU2nPN0UEo)
|
58 |
|
59 |
*Watch the complete walkthrough: from setup to PR creation in under 5 minutes*
|
60 |
|
agent/handler.py
CHANGED
@@ -8,10 +8,12 @@ import gradio as gr
|
|
8 |
|
9 |
from agent.workflow import (
|
10 |
report_translation_target_files,
|
|
|
11 |
translate_docs_interactive,
|
12 |
generate_github_pr,
|
13 |
)
|
14 |
from pr_generator.searcher import find_reference_pr_simple_stream
|
|
|
15 |
|
16 |
|
17 |
# State management
|
@@ -21,6 +23,7 @@ class ChatState:
|
|
21 |
self.target_language = "ko"
|
22 |
self.k_files = 10
|
23 |
self.files_to_translate = []
|
|
|
24 |
self.current_file_content = {"translated": ""}
|
25 |
self.pr_result = None # Store PR creation result
|
26 |
# GitHub configuration
|
@@ -70,22 +73,29 @@ def process_file_search_handler(lang: str, k: int, history: list) -> tuple:
|
|
70 |
state.step = "find_files"
|
71 |
|
72 |
status_report, files_list = report_translation_target_files(lang, k)
|
73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
|
75 |
response = f"""**β
File search completed!**
|
76 |
|
77 |
**Status Report:**
|
78 |
{status_report}
|
79 |
-
|
80 |
**π Found first {len(state.files_to_translate)} files to translate:**
|
81 |
"""
|
82 |
|
83 |
if state.files_to_translate:
|
84 |
-
for i, file in enumerate(state.files_to_translate
|
85 |
response += f"\n{i}. `{file}`"
|
86 |
|
87 |
-
if len(state.files_to_translate) > 5:
|
88 |
-
|
89 |
|
90 |
response += "\n\n**π Ready to start translation?**\nI can begin translating these files one by one. Would you like to proceed?"
|
91 |
else:
|
@@ -96,7 +106,18 @@ def process_file_search_handler(lang: str, k: int, history: list) -> tuple:
|
|
96 |
cleared_input = ""
|
97 |
selected_tab = 1 if state.files_to_translate else 0
|
98 |
|
99 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
100 |
|
101 |
|
102 |
def start_translation_process():
|
@@ -108,8 +129,8 @@ def start_translation_process():
|
|
108 |
|
109 |
# Call translation function (simplified for demo)
|
110 |
try:
|
111 |
-
|
112 |
-
state.target_language, [[current_file]]
|
113 |
)
|
114 |
|
115 |
state.current_file_content = {"translated": translated}
|
@@ -124,18 +145,24 @@ def start_translation_process():
|
|
124 |
original_file_link = (
|
125 |
"https://github.com/huggingface/transformers/blob/main/" + current_file
|
126 |
)
|
|
|
|
|
|
|
127 |
response = (
|
128 |
-
f"""π Translation for: `{current_file}
|
129 |
"**π Original Content Link:**\n"
|
130 |
""
|
131 |
f"{original_file_link}\n"
|
132 |
"**π Translated Content:**\n"
|
133 |
-
f"\n```\n\n{_extract_content_for_display(translated)}
|
134 |
-
|
|
|
|
|
|
|
|
|
135 |
)
|
136 |
-
|
137 |
-
|
138 |
-
print("extracted")
|
139 |
|
140 |
except Exception as e:
|
141 |
response = f"β Translation failed: {str(e)}"
|
@@ -191,12 +218,14 @@ def handle_user_message(message, history):
|
|
191 |
# User wants to start translation
|
192 |
if state.files_to_translate:
|
193 |
state.step = "translate"
|
194 |
-
response = start_translation_process()
|
|
|
|
|
|
|
195 |
else:
|
196 |
response = (
|
197 |
"β No files available for translation. Please search for files first."
|
198 |
)
|
199 |
-
|
200 |
# Handle GitHub PR creation - This part is removed as approve_handler is the main entry point
|
201 |
else:
|
202 |
# General response
|
@@ -288,14 +317,44 @@ def update_github_config(token, owner, repo, reference_pr_url):
|
|
288 |
return f"β
GitHub configuration updated: {owner}/{repo}"
|
289 |
|
290 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
291 |
def send_message(message, history):
|
292 |
new_history, cleared_input = handle_user_message(message, history)
|
293 |
return new_history, cleared_input, update_status()
|
294 |
|
295 |
|
296 |
# Button handlers with tab switching
|
297 |
-
def start_translate_handler(history, anthropic_key):
|
298 |
os.environ["ANTHROPIC_API_KEY"] = anthropic_key
|
|
|
|
|
|
|
299 |
new_hist, cleared_input = handle_user_message("start translation", history)
|
300 |
selected_tabs = 2 if state.current_file_content["translated"] else 0
|
301 |
return new_hist, cleared_input, update_status(), gr.Tabs(selected=selected_tabs)
|
@@ -363,11 +422,16 @@ def approve_handler(history, owner, repo, reference_pr_url):
|
|
363 |
translated_content = state.current_file_content["translated"]
|
364 |
response += "\n\nπ **Generating GitHub PR...**"
|
365 |
|
|
|
|
|
|
|
|
|
366 |
pr_response = generate_github_pr(
|
367 |
target_language=state.target_language,
|
368 |
filepath=current_file,
|
369 |
translated_content=translated_content,
|
370 |
github_config=state.github_config,
|
|
|
371 |
)
|
372 |
response += f"\n{pr_response}"
|
373 |
else:
|
|
|
8 |
|
9 |
from agent.workflow import (
|
10 |
report_translation_target_files,
|
11 |
+
report_in_translation_status_files,
|
12 |
translate_docs_interactive,
|
13 |
generate_github_pr,
|
14 |
)
|
15 |
from pr_generator.searcher import find_reference_pr_simple_stream
|
16 |
+
from translator.content import get_full_prompt, get_content, preprocess_content
|
17 |
|
18 |
|
19 |
# State management
|
|
|
23 |
self.target_language = "ko"
|
24 |
self.k_files = 10
|
25 |
self.files_to_translate = []
|
26 |
+
self.additional_instruction = ""
|
27 |
self.current_file_content = {"translated": ""}
|
28 |
self.pr_result = None # Store PR creation result
|
29 |
# GitHub configuration
|
|
|
73 |
state.step = "find_files"
|
74 |
|
75 |
status_report, files_list = report_translation_target_files(lang, k)
|
76 |
+
in_progress_status_report, in_progress_docs = report_in_translation_status_files(
|
77 |
+
lang
|
78 |
+
)
|
79 |
+
state.files_to_translate = (
|
80 |
+
[file[0] for file in files_list if file[0] not in in_progress_docs]
|
81 |
+
if files_list
|
82 |
+
else []
|
83 |
+
)
|
84 |
|
85 |
response = f"""**β
File search completed!**
|
86 |
|
87 |
**Status Report:**
|
88 |
{status_report}
|
89 |
+
{in_progress_status_report}
|
90 |
**π Found first {len(state.files_to_translate)} files to translate:**
|
91 |
"""
|
92 |
|
93 |
if state.files_to_translate:
|
94 |
+
for i, file in enumerate(state.files_to_translate, 1):
|
95 |
response += f"\n{i}. `{file}`"
|
96 |
|
97 |
+
# if len(state.files_to_translate) > 5:
|
98 |
+
# response += f"\n... and {len(state.files_to_translate) - 5} more files"
|
99 |
|
100 |
response += "\n\n**π Ready to start translation?**\nI can begin translating these files one by one. Would you like to proceed?"
|
101 |
else:
|
|
|
106 |
cleared_input = ""
|
107 |
selected_tab = 1 if state.files_to_translate else 0
|
108 |
|
109 |
+
# λλ‘λ€μ΄ choicesλ‘ μΈ νμΌ λ¦¬μ€νΈ λ°ν μΆκ°
|
110 |
+
return (
|
111 |
+
history,
|
112 |
+
cleared_input,
|
113 |
+
update_status(),
|
114 |
+
gr.Tabs(selected=selected_tab),
|
115 |
+
update_dropdown_choices(state.files_to_translate),
|
116 |
+
)
|
117 |
+
|
118 |
+
|
119 |
+
def update_dropdown_choices(file_list):
|
120 |
+
return gr.update(choices=file_list, value=None)
|
121 |
|
122 |
|
123 |
def start_translation_process():
|
|
|
129 |
|
130 |
# Call translation function (simplified for demo)
|
131 |
try:
|
132 |
+
translated = translate_docs_interactive(
|
133 |
+
state.target_language, [[current_file]], state.additional_instruction
|
134 |
)
|
135 |
|
136 |
state.current_file_content = {"translated": translated}
|
|
|
145 |
original_file_link = (
|
146 |
"https://github.com/huggingface/transformers/blob/main/" + current_file
|
147 |
)
|
148 |
+
print("Compeleted translation:\n")
|
149 |
+
print(translated)
|
150 |
+
print("----------------------------")
|
151 |
response = (
|
152 |
+
f"""π Translation for: `{current_file}`\n"""
|
153 |
"**π Original Content Link:**\n"
|
154 |
""
|
155 |
f"{original_file_link}\n"
|
156 |
"**π Translated Content:**\n"
|
157 |
+
# f"\n```\n\n{_extract_content_for_display(translated)}\n```"
|
158 |
+
# "\n```\n\n"
|
159 |
+
# f"\n{translated}\n"
|
160 |
+
# f"```"
|
161 |
+
# f"{status}\n"
|
162 |
+
# "β
Translation completed. The code block will be added when generating PR."
|
163 |
)
|
164 |
+
return response, translated
|
165 |
+
|
|
|
166 |
|
167 |
except Exception as e:
|
168 |
response = f"β Translation failed: {str(e)}"
|
|
|
218 |
# User wants to start translation
|
219 |
if state.files_to_translate:
|
220 |
state.step = "translate"
|
221 |
+
response, translated = start_translation_process()
|
222 |
+
history.append([message, response])
|
223 |
+
history.append(["", translated])
|
224 |
+
return history, ""
|
225 |
else:
|
226 |
response = (
|
227 |
"β No files available for translation. Please search for files first."
|
228 |
)
|
|
|
229 |
# Handle GitHub PR creation - This part is removed as approve_handler is the main entry point
|
230 |
else:
|
231 |
# General response
|
|
|
317 |
return f"β
GitHub configuration updated: {owner}/{repo}"
|
318 |
|
319 |
|
320 |
+
def update_prompt_preview(language, file_path, additional_instruction):
|
321 |
+
"""Update prompt preview based on current settings"""
|
322 |
+
if not file_path.strip():
|
323 |
+
return "Select a file to see the prompt preview..."
|
324 |
+
|
325 |
+
try:
|
326 |
+
# Get language name
|
327 |
+
if language == "ko":
|
328 |
+
translation_lang = "Korean"
|
329 |
+
else:
|
330 |
+
translation_lang = language
|
331 |
+
|
332 |
+
# Get sample content (first 500 characters)
|
333 |
+
content = get_content(file_path)
|
334 |
+
to_translate = preprocess_content(content)
|
335 |
+
|
336 |
+
# Truncate for preview
|
337 |
+
sample_content = to_translate[:500] + ("..." if len(to_translate) > 500 else "")
|
338 |
+
|
339 |
+
# Generate prompt
|
340 |
+
prompt = get_full_prompt(translation_lang, sample_content, additional_instruction)
|
341 |
+
|
342 |
+
return prompt
|
343 |
+
except Exception as e:
|
344 |
+
return f"Error generating prompt preview: {str(e)}"
|
345 |
+
|
346 |
+
|
347 |
def send_message(message, history):
|
348 |
new_history, cleared_input = handle_user_message(message, history)
|
349 |
return new_history, cleared_input, update_status()
|
350 |
|
351 |
|
352 |
# Button handlers with tab switching
|
353 |
+
def start_translate_handler(history, anthropic_key, file_to_translate, additional_instruction=""):
|
354 |
os.environ["ANTHROPIC_API_KEY"] = anthropic_key
|
355 |
+
|
356 |
+
state.additional_instruction = additional_instruction
|
357 |
+
state.files_to_translate = [file_to_translate]
|
358 |
new_hist, cleared_input = handle_user_message("start translation", history)
|
359 |
selected_tabs = 2 if state.current_file_content["translated"] else 0
|
360 |
return new_hist, cleared_input, update_status(), gr.Tabs(selected=selected_tabs)
|
|
|
422 |
translated_content = state.current_file_content["translated"]
|
423 |
response += "\n\nπ **Generating GitHub PR...**"
|
424 |
|
425 |
+
# Extract title from file for toctree mapping
|
426 |
+
file_name = current_file.split("/")[-1].replace(".md", "").replace("_", " ").title()
|
427 |
+
print(file_name)
|
428 |
+
|
429 |
pr_response = generate_github_pr(
|
430 |
target_language=state.target_language,
|
431 |
filepath=current_file,
|
432 |
translated_content=translated_content,
|
433 |
github_config=state.github_config,
|
434 |
+
en_title=file_name,
|
435 |
)
|
436 |
response += f"\n{pr_response}"
|
437 |
else:
|
agent/workflow.py
CHANGED
@@ -11,7 +11,7 @@ from translator.content import (
|
|
11 |
llm_translate,
|
12 |
preprocess_content,
|
13 |
)
|
14 |
-
from translator.retriever import report
|
15 |
|
16 |
# GitHub PR Agent import
|
17 |
try:
|
@@ -38,8 +38,34 @@ def report_translation_target_files(
|
|
38 |
return status_report, [[file] for file in filepath_list]
|
39 |
|
40 |
|
41 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
"""Translate documentation."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
# step 1. Get content from file path
|
44 |
content = get_content(file_path)
|
45 |
to_translate = preprocess_content(content)
|
@@ -47,21 +73,25 @@ def translate_docs(lang: str, file_path: str) -> tuple[str, str]:
|
|
47 |
# step 2. Prepare prompt with docs content
|
48 |
if lang == "ko":
|
49 |
translation_lang = "Korean"
|
50 |
-
to_translate_with_prompt = get_full_prompt(translation_lang, to_translate)
|
|
|
|
|
51 |
|
52 |
# step 3. Translate with LLM
|
53 |
# TODO: MCP clilent λκΈΈ λΆλΆ
|
54 |
callback_result, translated_content = llm_translate(to_translate_with_prompt)
|
55 |
-
|
|
|
56 |
# step 4. Add scaffold to translation result
|
57 |
translated_doc = fill_scaffold(content, to_translate, translated_content)
|
58 |
-
|
|
|
59 |
return callback_result, translated_doc
|
60 |
|
61 |
|
62 |
def translate_docs_interactive(
|
63 |
-
translate_lang: str, selected_files: list[list[str]]
|
64 |
-
) -> tuple[str, str
|
65 |
"""Interactive translation function that processes files one by one.
|
66 |
|
67 |
Args:
|
@@ -70,27 +100,17 @@ def translate_docs_interactive(
|
|
70 |
"""
|
71 |
# Extract file paths from the dataframe format
|
72 |
file_paths = [row[0] for row in selected_files if row and len(row) > 0]
|
73 |
-
if not file_paths:
|
74 |
-
return (
|
75 |
-
"No files selected for translation.",
|
76 |
-
gr.update(visible=False),
|
77 |
-
gr.update(visible=False),
|
78 |
-
gr.update(visible=False),
|
79 |
-
[],
|
80 |
-
0,
|
81 |
-
)
|
82 |
|
83 |
# Start with the first file
|
84 |
current_file = file_paths[0]
|
85 |
|
86 |
status = f"β
Translation completed: `{current_file}` β `{translate_lang}`\n\n"
|
87 |
-
callback_result, translated_content = translate_docs(translate_lang, current_file)
|
88 |
status += f"π° Used token and cost: \n```\n{callback_result}\n```"
|
89 |
|
90 |
-
|
91 |
-
status += f"\n### π Note: Currently, only the first file has been translated.\n> The remaining {len(file_paths) - 1} files have not been processed yet, as the system is in its beta version"
|
92 |
|
93 |
-
return
|
94 |
|
95 |
|
96 |
def generate_github_pr(
|
@@ -98,6 +118,7 @@ def generate_github_pr(
|
|
98 |
filepath: str,
|
99 |
translated_content: str = None,
|
100 |
github_config: dict = None,
|
|
|
101 |
) -> str:
|
102 |
"""Generate a GitHub PR for translated documentation.
|
103 |
|
@@ -106,6 +127,7 @@ def generate_github_pr(
|
|
106 |
filepath: Original file path (e.g., "docs/source/en/accelerator_selection.md")
|
107 |
translated_content: Translated content (if None, read from file)
|
108 |
github_config: GitHub configuration dictionary
|
|
|
109 |
|
110 |
Returns:
|
111 |
PR creation result message
|
@@ -149,9 +171,7 @@ def generate_github_pr(
|
|
149 |
print(f" π File: {filepath}")
|
150 |
print(f" π Language: {target_language}")
|
151 |
print(f" π Reference PR: {github_config['reference_pr_url']}")
|
152 |
-
print(
|
153 |
-
f" π Repository: {github_config['owner']}/{github_config['repo_name']}"
|
154 |
-
)
|
155 |
|
156 |
agent = GitHubPRAgent()
|
157 |
result = agent.run_translation_pr_workflow(
|
@@ -163,14 +183,37 @@ def generate_github_pr(
|
|
163 |
repo_name=github_config["repo_name"],
|
164 |
base_branch=github_config.get("base_branch", "main"),
|
165 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
166 |
|
167 |
# Process result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
168 |
if result["status"] == "success":
|
169 |
return f"""β
**GitHub PR Creation Successful!**
|
170 |
|
171 |
π **PR URL:** {result["pr_url"]}
|
172 |
πΏ **Branch:** {result["branch"]}
|
173 |
-
π **File:** {result["file_path"]}
|
174 |
|
175 |
{result["message"]}"""
|
176 |
|
@@ -178,7 +221,7 @@ def generate_github_pr(
|
|
178 |
return f"""β οΈ **Partial Success**
|
179 |
|
180 |
πΏ **Branch:** {result["branch"]}
|
181 |
-
π **File:** {result["file_path"]}
|
182 |
|
183 |
{result["message"]}
|
184 |
|
|
|
11 |
llm_translate,
|
12 |
preprocess_content,
|
13 |
)
|
14 |
+
from translator.retriever import report, get_github_issue_open_pr
|
15 |
|
16 |
# GitHub PR Agent import
|
17 |
try:
|
|
|
38 |
return status_report, [[file] for file in filepath_list]
|
39 |
|
40 |
|
41 |
+
def report_in_translation_status_files(translate_lang: str) -> tuple[str, list[str]]:
|
42 |
+
docs, pr_info_list = get_github_issue_open_pr(translate_lang)
|
43 |
+
|
44 |
+
status_report = ""
|
45 |
+
if docs:
|
46 |
+
status_report = f"""\nπ€ Found {len(docs)} in progress for translation.
|
47 |
+
"""
|
48 |
+
for i, file in enumerate(docs):
|
49 |
+
status_report += f"\n{i+1}. `{file}`: {pr_info_list[i]}"
|
50 |
+
status_report += "\n"
|
51 |
+
return status_report, docs
|
52 |
+
|
53 |
+
|
54 |
+
def translate_docs(lang: str, file_path: str, additional_instruction: str = "") -> tuple[str, str]:
|
55 |
"""Translate documentation."""
|
56 |
+
# Check if translation already exists
|
57 |
+
translation_file_path = (
|
58 |
+
Path(__file__).resolve().parent.parent
|
59 |
+
/ f"translation_result/{file_path}"
|
60 |
+
)
|
61 |
+
|
62 |
+
if translation_file_path.exists():
|
63 |
+
print(f"π Found existing translation: {translation_file_path}")
|
64 |
+
with open(translation_file_path, "r", encoding="utf-8") as f:
|
65 |
+
existing_content = f.read()
|
66 |
+
if existing_content.strip():
|
67 |
+
return "Existing translation loaded (no tokens used)", existing_content
|
68 |
+
|
69 |
# step 1. Get content from file path
|
70 |
content = get_content(file_path)
|
71 |
to_translate = preprocess_content(content)
|
|
|
73 |
# step 2. Prepare prompt with docs content
|
74 |
if lang == "ko":
|
75 |
translation_lang = "Korean"
|
76 |
+
to_translate_with_prompt = get_full_prompt(translation_lang, to_translate, additional_instruction)
|
77 |
+
|
78 |
+
print("to_translate_with_prompt:\n", to_translate_with_prompt)
|
79 |
|
80 |
# step 3. Translate with LLM
|
81 |
# TODO: MCP clilent λκΈΈ λΆλΆ
|
82 |
callback_result, translated_content = llm_translate(to_translate_with_prompt)
|
83 |
+
print("translated_content:\n")
|
84 |
+
print(translated_content)
|
85 |
# step 4. Add scaffold to translation result
|
86 |
translated_doc = fill_scaffold(content, to_translate, translated_content)
|
87 |
+
print("translated_doc:\n")
|
88 |
+
print(translated_doc)
|
89 |
return callback_result, translated_doc
|
90 |
|
91 |
|
92 |
def translate_docs_interactive(
|
93 |
+
translate_lang: str, selected_files: list[list[str]], additional_instruction: str = ""
|
94 |
+
) -> tuple[str, str]:
|
95 |
"""Interactive translation function that processes files one by one.
|
96 |
|
97 |
Args:
|
|
|
100 |
"""
|
101 |
# Extract file paths from the dataframe format
|
102 |
file_paths = [row[0] for row in selected_files if row and len(row) > 0]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
103 |
|
104 |
# Start with the first file
|
105 |
current_file = file_paths[0]
|
106 |
|
107 |
status = f"β
Translation completed: `{current_file}` β `{translate_lang}`\n\n"
|
108 |
+
callback_result, translated_content = translate_docs(translate_lang, current_file, additional_instruction)
|
109 |
status += f"π° Used token and cost: \n```\n{callback_result}\n```"
|
110 |
|
111 |
+
print(status)
|
|
|
112 |
|
113 |
+
return translated_content
|
114 |
|
115 |
|
116 |
def generate_github_pr(
|
|
|
118 |
filepath: str,
|
119 |
translated_content: str = None,
|
120 |
github_config: dict = None,
|
121 |
+
en_title: str = None,
|
122 |
) -> str:
|
123 |
"""Generate a GitHub PR for translated documentation.
|
124 |
|
|
|
127 |
filepath: Original file path (e.g., "docs/source/en/accelerator_selection.md")
|
128 |
translated_content: Translated content (if None, read from file)
|
129 |
github_config: GitHub configuration dictionary
|
130 |
+
en_title: English title for toctree mapping
|
131 |
|
132 |
Returns:
|
133 |
PR creation result message
|
|
|
171 |
print(f" π File: {filepath}")
|
172 |
print(f" π Language: {target_language}")
|
173 |
print(f" π Reference PR: {github_config['reference_pr_url']}")
|
174 |
+
print(f" π Repository: {github_config['owner']}/{github_config['repo_name']}")
|
|
|
|
|
175 |
|
176 |
agent = GitHubPRAgent()
|
177 |
result = agent.run_translation_pr_workflow(
|
|
|
183 |
repo_name=github_config["repo_name"],
|
184 |
base_branch=github_config.get("base_branch", "main"),
|
185 |
)
|
186 |
+
# result = {
|
187 |
+
# 'status': 'partial_success',
|
188 |
+
# 'branch': 'ko-attention_interface',
|
189 |
+
# 'file_path': 'docs/source/ko/attention_interface.md',
|
190 |
+
# 'message': 'File was saved and commit was successful.\nPR creation failed: ERROR: Existing PR found: https://github.com/Jwaminju/transformers/pull/1', 'error_details': 'ERROR: Existing PR found: https://github.com/Jwaminju/transformers/pull/1'
|
191 |
+
# }
|
192 |
+
# Process toctree update after successful translation PR
|
193 |
+
toctree_result = None
|
194 |
+
if en_title:
|
195 |
+
from agent.toctree_handler import TocTreeHandler
|
196 |
+
toctree_handler = TocTreeHandler()
|
197 |
+
toctree_result = toctree_handler.update_toctree_after_translation(
|
198 |
+
result, en_title, filepath, agent, github_config
|
199 |
+
)
|
200 |
+
print("toctree_result:", toctree_result)
|
201 |
|
202 |
# Process result
|
203 |
+
# Generate toctree status message (shared for both success and partial_success)
|
204 |
+
toctree_status = ""
|
205 |
+
if toctree_result:
|
206 |
+
if toctree_result["status"] == "success":
|
207 |
+
toctree_status = f"\nπ **Toctree Updated:** β
{toctree_result['message']}"
|
208 |
+
else:
|
209 |
+
toctree_status = f"\nπ **Toctree Update Failed:** β {toctree_result['message']}"
|
210 |
+
|
211 |
if result["status"] == "success":
|
212 |
return f"""β
**GitHub PR Creation Successful!**
|
213 |
|
214 |
π **PR URL:** {result["pr_url"]}
|
215 |
πΏ **Branch:** {result["branch"]}
|
216 |
+
π **File:** {result["file_path"]}{toctree_status}
|
217 |
|
218 |
{result["message"]}"""
|
219 |
|
|
|
221 |
return f"""β οΈ **Partial Success**
|
222 |
|
223 |
πΏ **Branch:** {result["branch"]}
|
224 |
+
π **File:** {result["file_path"]}{toctree_status}
|
225 |
|
226 |
{result["message"]}
|
227 |
|
app.py
CHANGED
@@ -14,6 +14,7 @@ from agent.handler import (
|
|
14 |
send_message,
|
15 |
start_translate_handler,
|
16 |
sync_language_displays,
|
|
|
17 |
update_status,
|
18 |
update_github_config,
|
19 |
)
|
@@ -30,7 +31,7 @@ css = """
|
|
30 |
background: rgba(255, 255, 180, 0.25);
|
31 |
border-radius: 18px;
|
32 |
box-shadow: 0 4px 24px rgba(0,0,0,0.08);
|
33 |
-
padding: 1.
|
34 |
backdrop-filter: blur(8px);
|
35 |
border: 1px solid rgba(255,255,180,0.25);
|
36 |
width: 100%;
|
@@ -40,10 +41,12 @@ css = """
|
|
40 |
background: rgba(255, 255, 180, 0.25);
|
41 |
border-radius: 18px;
|
42 |
box-shadow: 0 4px 24px rgba(0,0,0,0.08);
|
43 |
-
padding: 1.
|
44 |
backdrop-filter: blur(8px);
|
45 |
border: 1px solid rgba(255,255,180,0.25);
|
46 |
width: 100%;
|
|
|
|
|
47 |
}
|
48 |
.status-card {
|
49 |
width: 100%
|
@@ -91,7 +94,6 @@ css = """
|
|
91 |
with gr.Blocks(
|
92 |
css=css, title=" π Hugging Face Transformers Docs i18n made easy"
|
93 |
) as demo:
|
94 |
-
|
95 |
# Title
|
96 |
with open("images/hfkr_logo.png", "rb") as img_file:
|
97 |
base64_img = base64.b64encode(img_file.read()).decode()
|
@@ -105,11 +107,12 @@ with gr.Blocks(
|
|
105 |
# Content
|
106 |
with gr.Row():
|
107 |
# Chat interface
|
108 |
-
with gr.Column(scale=
|
109 |
gr.Markdown("### π Hugging Face i18n Agent")
|
110 |
|
111 |
chatbot = gr.Chatbot(
|
112 |
-
value=[[None, get_welcome_message()]], scale=1, height=585
|
|
|
113 |
)
|
114 |
|
115 |
# Controller interface
|
@@ -122,16 +125,15 @@ with gr.Blocks(
|
|
122 |
with gr.Tabs(elem_classes="simple-tabs") as control_tabs:
|
123 |
with gr.TabItem("1. Find Files", id=0):
|
124 |
with gr.Group():
|
125 |
-
lang_dropdown = gr.
|
126 |
choices=[language.value for language in Languages],
|
127 |
label="π Translate To",
|
128 |
value="ko",
|
129 |
)
|
130 |
k_input = gr.Number(
|
131 |
label="π First k missing translated docs",
|
132 |
-
value=
|
133 |
minimum=1,
|
134 |
-
maximum=100,
|
135 |
)
|
136 |
find_btn = gr.Button(
|
137 |
"π Find Files to Translate",
|
@@ -140,6 +142,17 @@ with gr.Blocks(
|
|
140 |
|
141 |
with gr.TabItem("2. Translate", id=1):
|
142 |
with gr.Group():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
143 |
translate_lang_display = gr.Dropdown(
|
144 |
choices=[language.value for language in Languages],
|
145 |
label="π Translation Language",
|
@@ -150,6 +163,21 @@ with gr.Blocks(
|
|
150 |
label="π Anthropic API key for translation generation",
|
151 |
type="password",
|
152 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
153 |
start_translate_btn = gr.Button(
|
154 |
"π Start Translation", elem_classes="action-button"
|
155 |
)
|
@@ -186,7 +214,7 @@ with gr.Blocks(
|
|
186 |
|
187 |
# Chat Controller
|
188 |
with gr.Column(elem_classes=["control-panel"]):
|
189 |
-
gr.Markdown("### π¬ Chat with agent")
|
190 |
msg_input = gr.Textbox(
|
191 |
placeholder="Type your message here... (e.g. 'what', 'how', or 'help')",
|
192 |
container=False,
|
@@ -199,7 +227,7 @@ with gr.Blocks(
|
|
199 |
find_btn.click(
|
200 |
fn=process_file_search_handler,
|
201 |
inputs=[lang_dropdown, k_input, chatbot],
|
202 |
-
outputs=[chatbot, msg_input, status_display, control_tabs],
|
203 |
)
|
204 |
|
205 |
# Sync language across tabs
|
@@ -209,10 +237,17 @@ with gr.Blocks(
|
|
209 |
outputs=[translate_lang_display],
|
210 |
)
|
211 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
212 |
# Button event handlers
|
213 |
start_translate_btn.click(
|
214 |
fn=start_translate_handler,
|
215 |
-
inputs=[chatbot, anthropic_key],
|
216 |
outputs=[chatbot, msg_input, status_display, control_tabs],
|
217 |
)
|
218 |
|
@@ -247,5 +282,13 @@ with gr.Blocks(
|
|
247 |
outputs=[chatbot, msg_input, status_display],
|
248 |
)
|
249 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
250 |
root_path = os.environ.get("GRADIO_ROOT_PATH")
|
251 |
demo.launch(root_path=root_path)
|
|
|
14 |
send_message,
|
15 |
start_translate_handler,
|
16 |
sync_language_displays,
|
17 |
+
update_prompt_preview,
|
18 |
update_status,
|
19 |
update_github_config,
|
20 |
)
|
|
|
31 |
background: rgba(255, 255, 180, 0.25);
|
32 |
border-radius: 18px;
|
33 |
box-shadow: 0 4px 24px rgba(0,0,0,0.08);
|
34 |
+
padding: 1.0em;
|
35 |
backdrop-filter: blur(8px);
|
36 |
border: 1px solid rgba(255,255,180,0.25);
|
37 |
width: 100%;
|
|
|
41 |
background: rgba(255, 255, 180, 0.25);
|
42 |
border-radius: 18px;
|
43 |
box-shadow: 0 4px 24px rgba(0,0,0,0.08);
|
44 |
+
padding: 1.0em;
|
45 |
backdrop-filter: blur(8px);
|
46 |
border: 1px solid rgba(255,255,180,0.25);
|
47 |
width: 100%;
|
48 |
+
overflow: visible !important;
|
49 |
+
|
50 |
}
|
51 |
.status-card {
|
52 |
width: 100%
|
|
|
94 |
with gr.Blocks(
|
95 |
css=css, title=" π Hugging Face Transformers Docs i18n made easy"
|
96 |
) as demo:
|
|
|
97 |
# Title
|
98 |
with open("images/hfkr_logo.png", "rb") as img_file:
|
99 |
base64_img = base64.b64encode(img_file.read()).decode()
|
|
|
107 |
# Content
|
108 |
with gr.Row():
|
109 |
# Chat interface
|
110 |
+
with gr.Column(scale=3, elem_classes=["chat-container"]):
|
111 |
gr.Markdown("### π Hugging Face i18n Agent")
|
112 |
|
113 |
chatbot = gr.Chatbot(
|
114 |
+
value=[[None, get_welcome_message()]], scale=1, height=585,
|
115 |
+
show_copy_button=True
|
116 |
)
|
117 |
|
118 |
# Controller interface
|
|
|
125 |
with gr.Tabs(elem_classes="simple-tabs") as control_tabs:
|
126 |
with gr.TabItem("1. Find Files", id=0):
|
127 |
with gr.Group():
|
128 |
+
lang_dropdown = gr.Radio(
|
129 |
choices=[language.value for language in Languages],
|
130 |
label="π Translate To",
|
131 |
value="ko",
|
132 |
)
|
133 |
k_input = gr.Number(
|
134 |
label="π First k missing translated docs",
|
135 |
+
value=10,
|
136 |
minimum=1,
|
|
|
137 |
)
|
138 |
find_btn = gr.Button(
|
139 |
"π Find Files to Translate",
|
|
|
142 |
|
143 |
with gr.TabItem("2. Translate", id=1):
|
144 |
with gr.Group():
|
145 |
+
files_to_translate = gr.Radio(
|
146 |
+
choices=[],
|
147 |
+
label="π Select a file to translate",
|
148 |
+
interactive=True,
|
149 |
+
value=None,
|
150 |
+
)
|
151 |
+
file_to_translate_input = gr.Textbox(
|
152 |
+
label="π Select in the dropdown or write the file path to translate",
|
153 |
+
value="",
|
154 |
+
)
|
155 |
+
|
156 |
translate_lang_display = gr.Dropdown(
|
157 |
choices=[language.value for language in Languages],
|
158 |
label="π Translation Language",
|
|
|
163 |
label="π Anthropic API key for translation generation",
|
164 |
type="password",
|
165 |
)
|
166 |
+
additional_instruction = gr.Textbox(
|
167 |
+
label="π Additional instructions (Optional - e.g., custom glossary)",
|
168 |
+
placeholder="Example: Translate 'model' as 'λͺ¨λΈ' consistently",
|
169 |
+
lines=2,
|
170 |
+
)
|
171 |
+
|
172 |
+
with gr.Accordion("π Preview Prompt", open=False):
|
173 |
+
prompt_preview = gr.Textbox(
|
174 |
+
label="Current Translation Prompt",
|
175 |
+
lines=8,
|
176 |
+
interactive=False,
|
177 |
+
placeholder="Select a file and language to see the prompt preview...",
|
178 |
+
show_copy_button=True,
|
179 |
+
)
|
180 |
+
|
181 |
start_translate_btn = gr.Button(
|
182 |
"π Start Translation", elem_classes="action-button"
|
183 |
)
|
|
|
214 |
|
215 |
# Chat Controller
|
216 |
with gr.Column(elem_classes=["control-panel"]):
|
217 |
+
gr.Markdown("### π¬ Chat with agent (Only simple chat is available)")
|
218 |
msg_input = gr.Textbox(
|
219 |
placeholder="Type your message here... (e.g. 'what', 'how', or 'help')",
|
220 |
container=False,
|
|
|
227 |
find_btn.click(
|
228 |
fn=process_file_search_handler,
|
229 |
inputs=[lang_dropdown, k_input, chatbot],
|
230 |
+
outputs=[chatbot, msg_input, status_display, control_tabs, files_to_translate],
|
231 |
)
|
232 |
|
233 |
# Sync language across tabs
|
|
|
237 |
outputs=[translate_lang_display],
|
238 |
)
|
239 |
|
240 |
+
#
|
241 |
+
files_to_translate.change(
|
242 |
+
fn=lambda x: x,
|
243 |
+
inputs=[files_to_translate],
|
244 |
+
outputs=[file_to_translate_input],
|
245 |
+
)
|
246 |
+
|
247 |
# Button event handlers
|
248 |
start_translate_btn.click(
|
249 |
fn=start_translate_handler,
|
250 |
+
inputs=[chatbot, anthropic_key, file_to_translate_input, additional_instruction],
|
251 |
outputs=[chatbot, msg_input, status_display, control_tabs],
|
252 |
)
|
253 |
|
|
|
282 |
outputs=[chatbot, msg_input, status_display],
|
283 |
)
|
284 |
|
285 |
+
# Update prompt preview when inputs change
|
286 |
+
for input_component in [translate_lang_display, file_to_translate_input, additional_instruction]:
|
287 |
+
input_component.change(
|
288 |
+
fn=update_prompt_preview,
|
289 |
+
inputs=[translate_lang_display, file_to_translate_input, additional_instruction],
|
290 |
+
outputs=[prompt_preview],
|
291 |
+
)
|
292 |
+
|
293 |
root_path = os.environ.get("GRADIO_ROOT_PATH")
|
294 |
demo.launch(root_path=root_path)
|
pr_generator/agent.py
CHANGED
@@ -518,7 +518,7 @@ Please return only the commit message. No other explanation is needed."""
|
|
518 |
"status": "partial_success",
|
519 |
"branch": branch_name,
|
520 |
"file_path": target_filepath,
|
521 |
-
"message": f"File was saved
|
522 |
"error_details": pr_result,
|
523 |
}
|
524 |
elif "successful" in pr_result and "http" in pr_result:
|
|
|
518 |
"status": "partial_success",
|
519 |
"branch": branch_name,
|
520 |
"file_path": target_filepath,
|
521 |
+
"message": f"File was saved and commit was successful.\nPR creation failed: {pr_result}",
|
522 |
"error_details": pr_result,
|
523 |
}
|
524 |
elif "successful" in pr_result and "http" in pr_result:
|
translation_result/docs/source/en/accelerator_selection.md
CHANGED
@@ -16,7 +16,7 @@ rendered properly in your Markdown viewer.
|
|
16 |
|
17 |
# κ°μκΈ° μ ν [[accelerator-selection]]
|
18 |
|
19 |
-
λΆμ°
|
20 |
|
21 |
μ΄ κ°μ΄λλ μ¬μ©ν κ°μκΈ°μ μμ μ¬μ© μμλ₯Ό μ ννλ λ°©λ²μ 보μ¬μ€λλ€.
|
22 |
|
@@ -27,7 +27,7 @@ rendered properly in your Markdown viewer.
|
|
27 |
<hfoptions id="select-accelerator">
|
28 |
<hfoption id="torchrun">
|
29 |
|
30 |
-
`--nproc_per_node`λ₯Ό μ¬μ©νμ¬ μ¬μ©ν κ°μκΈ° μλ₯Ό
|
31 |
|
32 |
```bash
|
33 |
torchrun --nproc_per_node=2 trainer-program.py ...
|
@@ -36,7 +36,7 @@ torchrun --nproc_per_node=2 trainer-program.py ...
|
|
36 |
</hfoption>
|
37 |
<hfoption id="Accelerate">
|
38 |
|
39 |
-
`--num_processes`λ₯Ό μ¬μ©νμ¬ μ¬μ©ν κ°μκΈ° μλ₯Ό
|
40 |
|
41 |
```bash
|
42 |
accelerate launch --num_processes 2 trainer-program.py ...
|
@@ -45,7 +45,7 @@ accelerate launch --num_processes 2 trainer-program.py ...
|
|
45 |
</hfoption>
|
46 |
<hfoption id="DeepSpeed">
|
47 |
|
48 |
-
`--num_gpus`λ₯Ό μ¬μ©νμ¬ μ¬μ©ν GPU μλ₯Ό
|
49 |
|
50 |
```bash
|
51 |
deepspeed --num_gpus 2 trainer-program.py ...
|
@@ -55,7 +55,7 @@ deepspeed --num_gpus 2 trainer-program.py ...
|
|
55 |
</hfoptions>
|
56 |
|
57 |
## κ°μκΈ° μμ [[order-of-accelerators]]
|
58 |
-
μ¬μ©ν νΉμ κ°μκΈ°μ κ·Έ μμλ₯Ό μ ννλ €λ©΄ νλμ¨μ΄μ μ ν©ν νκ²½ λ³μλ₯Ό μ¬μ©νμΈμ. μ΄λ κ°
|
59 |
|
60 |
μλ₯Ό λ€μ΄, 4κ°μ κ°μκΈ°(0, 1, 2, 3)κ° μκ³ κ°μκΈ° 0κ³Ό 2λ§ μ€ννκ³ μΆλ€λ©΄:
|
61 |
|
@@ -66,7 +66,7 @@ deepspeed --num_gpus 2 trainer-program.py ...
|
|
66 |
CUDA_VISIBLE_DEVICES=0,2 torchrun trainer-program.py ...
|
67 |
```
|
68 |
|
69 |
-
GPU 0κ³Ό 2λ§ PyTorch
|
70 |
μμλ₯Ό λ°κΎΈλ €λ©΄ (GPU 2λ₯Ό `cuda:0`μΌλ‘, GPU 0μ `cuda:1`λ‘ μ¬μ©):
|
71 |
|
72 |
|
@@ -80,15 +80,15 @@ GPU μμ΄ μ€ννλ €λ©΄:
|
|
80 |
CUDA_VISIBLE_DEVICES= python trainer-program.py ...
|
81 |
```
|
82 |
|
83 |
-
`CUDA_DEVICE_ORDER`λ₯Ό μ¬μ©νμ¬ CUDA
|
84 |
|
85 |
-
- PCIe λ²μ€ ID
|
86 |
|
87 |
```bash
|
88 |
$hf_i18n_placeholder21export CUDA_DEVICE_ORDER=PCI_BUS_ID
|
89 |
```
|
90 |
|
91 |
-
-
|
92 |
|
93 |
```bash
|
94 |
export CUDA_DEVICE_ORDER=FASTEST_FIRST
|
@@ -101,7 +101,7 @@ $hf_i18n_placeholder21export CUDA_DEVICE_ORDER=PCI_BUS_ID
|
|
101 |
ZE_AFFINITY_MASK=0,2 torchrun trainer-program.py ...
|
102 |
```
|
103 |
|
104 |
-
XPU 0κ³Ό 2λ§ PyTorch
|
105 |
μμλ₯Ό λ°κΎΈλ €λ©΄ (XPU 2λ₯Ό `xpu:0`μΌλ‘, XPU 0μ `xpu:1`λ‘ μ¬μ©):
|
106 |
|
107 |
```bash
|
@@ -109,13 +109,13 @@ ZE_AFFINITY_MASK=2,0 torchrun trainer-program.py ...
|
|
109 |
```
|
110 |
|
111 |
|
112 |
-
|
113 |
|
114 |
```bash
|
115 |
export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
|
116 |
```
|
117 |
|
118 |
-
Intel XPU
|
119 |
|
120 |
</hfoption>
|
121 |
</hfoptions>
|
@@ -123,5 +123,5 @@ Intel XPUμ μ₯μΉ μ΄κ±° λ° μ λ ¬μ λν μμΈν μ 보λ [Level Zero]
|
|
123 |
|
124 |
|
125 |
> [!WARNING]
|
126 |
-
> νκ²½ λ³μλ λͺ
λ Ήμ€μ μΆκ°νλ λμ
|
127 |
```
|
|
|
16 |
|
17 |
# κ°μκΈ° μ ν [[accelerator-selection]]
|
18 |
|
19 |
+
λΆμ° νμ΅ μ€μλ μ¬μ©ν κ°μκΈ°(CUDA, XPU, MPS, HPU λ±)μ μμ μμλ₯Ό μ§μ ν μ μμ΅λλ€. μ΄λ μλ‘ λ€λ₯Έ μ»΄ν¨ν
μ±λ₯μ κ°μ§ κ°μκΈ°κ° μμ λ λ λΉ λ₯Έ κ°μκΈ°λ₯Ό λ¨Όμ μ¬μ©νκ³ μΆμ κ²½μ°μ μ μ©ν μ μμ΅λλ€. λλ μ¬μ© κ°λ₯ν κ°μκΈ°μ μΌλΆλ§ μ¬μ©ν μλ μμ΅λλ€. μ ν κ³Όμ μ [DistributedDataParallel](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html)κ³Ό [DataParallel](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html) λͺ¨λμμ μλν©λλ€. Accelerateλ [DeepSpeed integration](./main_classes/deepspeed)λ νμνμ§ μμ΅λλ€.
|
20 |
|
21 |
μ΄ κ°μ΄λλ μ¬μ©ν κ°μκΈ°μ μμ μ¬μ© μμλ₯Ό μ ννλ λ°©λ²μ 보μ¬μ€λλ€.
|
22 |
|
|
|
27 |
<hfoptions id="select-accelerator">
|
28 |
<hfoption id="torchrun">
|
29 |
|
30 |
+
`--nproc_per_node`λ₯Ό μ¬μ©νμ¬ μ¬μ©ν κ°μκΈ° μλ₯Ό μ νν©λλ€.
|
31 |
|
32 |
```bash
|
33 |
torchrun --nproc_per_node=2 trainer-program.py ...
|
|
|
36 |
</hfoption>
|
37 |
<hfoption id="Accelerate">
|
38 |
|
39 |
+
`--num_processes`λ₯Ό μ¬μ©νμ¬ μ¬μ©ν κ°μκΈ° μλ₯Ό μ νν©λλ€.
|
40 |
|
41 |
```bash
|
42 |
accelerate launch --num_processes 2 trainer-program.py ...
|
|
|
45 |
</hfoption>
|
46 |
<hfoption id="DeepSpeed">
|
47 |
|
48 |
+
`--num_gpus`λ₯Ό μ¬μ©νμ¬ μ¬μ©ν GPU μλ₯Ό μ νν©λλ€.
|
49 |
|
50 |
```bash
|
51 |
deepspeed --num_gpus 2 trainer-program.py ...
|
|
|
55 |
</hfoptions>
|
56 |
|
57 |
## κ°μκΈ° μμ [[order-of-accelerators]]
|
58 |
+
μ¬μ©ν νΉμ κ°μκΈ°μ κ·Έ μμλ₯Ό μ ννλ €λ©΄ νλμ¨μ΄μ μ ν©ν νκ²½ λ³μλ₯Ό μ¬μ©νμΈμ. μ΄λ μ’
μ’
κ° μ€νμ λν΄ λͺ
λ Ήμ€μμ μ€μ λμ§λ§, `~/.bashrc`λ λ€λ₯Έ μμ κ΅¬μ± νμΌμ μΆκ°ν μλ μμ΅λλ€.
|
59 |
|
60 |
μλ₯Ό λ€μ΄, 4κ°μ κ°μκΈ°(0, 1, 2, 3)κ° μκ³ κ°μκΈ° 0κ³Ό 2λ§ μ€ννκ³ μΆλ€λ©΄:
|
61 |
|
|
|
66 |
CUDA_VISIBLE_DEVICES=0,2 torchrun trainer-program.py ...
|
67 |
```
|
68 |
|
69 |
+
GPU 0κ³Ό 2λ§ PyTorchμμ "보μ΄λ©°" κ°κ° `cuda:0`κ³Ό `cuda:1`λ‘ λ§€νλ©λλ€.
|
70 |
μμλ₯Ό λ°κΎΈλ €λ©΄ (GPU 2λ₯Ό `cuda:0`μΌλ‘, GPU 0μ `cuda:1`λ‘ μ¬μ©):
|
71 |
|
72 |
|
|
|
80 |
CUDA_VISIBLE_DEVICES= python trainer-program.py ...
|
81 |
```
|
82 |
|
83 |
+
`CUDA_DEVICE_ORDER`λ₯Ό μ¬μ©νμ¬ CUDA μ₯μΉμ μμλ₯Ό μ μ΄ν μλ μμ΅λλ€:
|
84 |
|
85 |
+
- PCIe λ²μ€ ID μμ (`nvidia-smi`μ μΌμΉ):
|
86 |
|
87 |
```bash
|
88 |
export CUDA_DEVICE_ORDER=PCI_BUS_ID
|
89 |
```
|
90 |
|
91 |
+
- μ»΄ν¨ν
μ±λ₯ μμ (κ°μ₯ λΉ λ₯Έ κ²λΆν°):
|
92 |
|
93 |
```bash
|
94 |
export CUDA_DEVICE_ORDER=FASTEST_FIRST
|
|
|
101 |
ZE_AFFINITY_MASK=0,2 torchrun trainer-program.py ...
|
102 |
```
|
103 |
|
104 |
+
XPU 0κ³Ό 2λ§ PyTorchμμ "보μ΄λ©°" κ°κ° `xpu:0`κ³Ό `xpu:1`λ‘ λ§€νλ©λλ€.
|
105 |
μμλ₯Ό λ°κΎΈλ €λ©΄ (XPU 2λ₯Ό `xpu:0`μΌλ‘, XPU 0μ `xpu:1`λ‘ μ¬μ©):
|
106 |
|
107 |
```bash
|
|
|
109 |
```
|
110 |
|
111 |
|
112 |
+
λ€μμ μ¬μ©νμ¬ Intel XPUμ μμλ₯Ό μ μ΄ν μλ μμ΅λλ€:
|
113 |
|
114 |
```bash
|
115 |
export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
|
116 |
```
|
117 |
|
118 |
+
Intel XPUμμμ μ₯μΉ μ΄κ±° λ° μ λ ¬μ λν μμΈν μ 보λ [Level Zero](https://github.com/oneapi-src/level-zero/blob/master/README.md?plain=1#L87) λ¬Έμλ₯Ό μ°Έμ‘°νμΈμ.
|
119 |
|
120 |
</hfoption>
|
121 |
</hfoptions>
|
|
|
123 |
|
124 |
|
125 |
> [!WARNING]
|
126 |
+
> νκ²½ λ³μλ λͺ
λ Ήμ€μ μΆκ°νλ λμ λ΄λ³΄λΌ μ μμ΅λλ€. νκ²½ λ³μκ° μ΄λ»κ² μ€μ λμλμ§ μμ΄λ²λ¦¬κ³ μλͺ»λ κ°μκΈ°λ₯Ό μ¬μ©νκ² λ μ μμ΄ νΌλμ μΌκΈ°ν μ μμΌλ―λ‘ κΆμ₯νμ§ μμ΅λλ€. λμ , κ°μ λͺ
λ Ήμ€μμ νΉμ νλ ¨ μ€νμ μν΄ νκ²½ λ³μλ₯Ό μ€μ νλ κ²μ΄ μΌλ°μ μΈ κ΄λ‘μ
λλ€.
|
127 |
```
|
translator/content.py
CHANGED
@@ -5,8 +5,13 @@ import requests
|
|
5 |
from langchain.callbacks import get_openai_callback
|
6 |
from langchain_anthropic import ChatAnthropic
|
7 |
|
|
|
|
|
8 |
|
9 |
def get_content(filepath: str) -> str:
|
|
|
|
|
|
|
10 |
url = string.Template(
|
11 |
"https://raw.githubusercontent.com/huggingface/" "transformers/main/$filepath"
|
12 |
).safe_substitute(filepath=filepath)
|
@@ -24,24 +29,31 @@ def preprocess_content(content: str) -> str:
|
|
24 |
## ignore top license comment
|
25 |
to_translate = content[content.find("#") :]
|
26 |
## remove code blocks from text
|
27 |
-
to_translate = re.sub(r"```.*?```", "", to_translate, flags=re.DOTALL)
|
28 |
## remove markdown tables from text
|
29 |
-
to_translate = re.sub(r"^\|.*\|$\n?", "", to_translate, flags=re.MULTILINE)
|
30 |
## remove empty lines from text
|
31 |
to_translate = re.sub(r"\n\n+", "\n\n", to_translate)
|
32 |
-
|
33 |
return to_translate
|
34 |
|
35 |
|
36 |
-
def get_full_prompt(language: str, to_translate: str) -> str:
|
37 |
-
|
38 |
"What do these sentences about Hugging Face Transformers "
|
39 |
"(a machine learning library) mean in $language? "
|
40 |
"Please do not translate the word after a π€ emoji "
|
41 |
-
"as it is a product name. Output
|
42 |
-
"
|
43 |
).safe_substitute(language=language)
|
44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
|
46 |
|
47 |
def split_markdown_sections(markdown: str) -> list:
|
@@ -64,33 +76,89 @@ def make_scaffold(content: str, to_translate: str) -> string.Template:
|
|
64 |
scaffold = content
|
65 |
for i, text in enumerate(to_translate.split("\n\n")):
|
66 |
scaffold = scaffold.replace(text, f"$hf_i18n_placeholder{i}", 1)
|
|
|
|
|
67 |
return string.Template(scaffold)
|
68 |
|
69 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
70 |
def fill_scaffold(content: str, to_translate: str, translated: str) -> str:
|
71 |
scaffold = make_scaffold(content, to_translate)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
72 |
divided = split_markdown_sections(to_translate)
|
|
|
|
|
73 |
anchors = get_anchors(divided)
|
74 |
-
|
75 |
-
translated
|
76 |
-
|
77 |
-
translated
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
92 |
translated_doc = scaffold.safe_substitute(
|
93 |
-
{f"hf_i18n_placeholder{i}": text for i, text in enumerate(
|
94 |
)
|
95 |
return translated_doc
|
96 |
|
|
|
5 |
from langchain.callbacks import get_openai_callback
|
6 |
from langchain_anthropic import ChatAnthropic
|
7 |
|
8 |
+
from translator.prompt_glossary import PROMPT_WITH_GLOSSARY
|
9 |
+
|
10 |
|
11 |
def get_content(filepath: str) -> str:
|
12 |
+
if filepath == "":
|
13 |
+
raise ValueError("No files selected for translation.")
|
14 |
+
|
15 |
url = string.Template(
|
16 |
"https://raw.githubusercontent.com/huggingface/" "transformers/main/$filepath"
|
17 |
).safe_substitute(filepath=filepath)
|
|
|
29 |
## ignore top license comment
|
30 |
to_translate = content[content.find("#") :]
|
31 |
## remove code blocks from text
|
32 |
+
# to_translate = re.sub(r"```.*?```", "", to_translate, flags=re.DOTALL)
|
33 |
## remove markdown tables from text
|
34 |
+
# to_translate = re.sub(r"^\|.*\|$\n?", "", to_translate, flags=re.MULTILINE)
|
35 |
## remove empty lines from text
|
36 |
to_translate = re.sub(r"\n\n+", "\n\n", to_translate)
|
|
|
37 |
return to_translate
|
38 |
|
39 |
|
40 |
def get_full_prompt(language: str, to_translate: str, additional_instruction: str = "") -> str:
    """Build the full translation prompt sent to the LLM.

    Args:
        language: Target language name substituted into the prompt template.
        to_translate: Markdown content to translate; embedded in a ```md fence.
        additional_instruction: Optional extra guidance appended at the end.

    Returns:
        The complete prompt string: base instructions, the fenced markdown to
        translate, the glossary prompt, and any additional instructions.
    """
    # safe_substitute only fills $language, so stray `$` in the text is harmless.
    # NOTE: the original literals ran "…content intact" straight into
    # "No explanations…" (missing separator) and left an unbalanced `**`
    # after "markdown file"; both fixed below.
    base_prompt = string.Template(
        "What do these sentences about Hugging Face Transformers "
        "(a machine learning library) mean in $language? "
        "Please do not translate the word after a 🤗 emoji "
        "as it is a product name. Output the complete markdown file, "
        "with prose translated and all other content intact. "
        "No explanations or extras—only the translated markdown. "
        "Also translate all comments within code blocks as well."
    ).safe_substitute(language=language)

    # Open the fenced block that will hold the source markdown.
    base_prompt += "\n\n```md"

    full_prompt = "\n".join([base_prompt, to_translate.strip(), "```", PROMPT_WITH_GLOSSARY])

    if additional_instruction.strip():
        full_prompt += f"\n\n🗒️ Additional instructions: {additional_instruction.strip()}"

    return full_prompt
|
57 |
|
58 |
|
59 |
def split_markdown_sections(markdown: str) -> list:
|
|
|
76 |
scaffold = content
|
77 |
for i, text in enumerate(to_translate.split("\n\n")):
|
78 |
scaffold = scaffold.replace(text, f"$hf_i18n_placeholder{i}", 1)
|
79 |
+
print("inner scaffold:")
|
80 |
+
print(scaffold)
|
81 |
return string.Template(scaffold)
|
82 |
|
83 |
|
84 |
def is_in_code_block(text: str, position: int) -> bool:
    """Tell whether the character at `position` in `text` lies inside a fenced code block.

    Counts the ``` fence markers that precede `position`; an odd count means a
    fence was opened but not yet closed, i.e. the position is inside a block.
    """
    fences_seen = text[:position].count("```")
    return fences_seen % 2 != 0
|
89 |
+
|
90 |
+
|
91 |
def fill_scaffold(content: str, to_translate: str, translated: str) -> str:
    """Re-insert translated sections into the original document scaffold.

    The scaffold (built by `make_scaffold`) is the original `content` with each
    translatable "\n\n"-separated section replaced by a `$hf_i18n_placeholderN`
    template slot. This function re-appends the original `[[anchor]]` ids to
    the translated headers, then substitutes the translated sections back into
    the scaffold. The many print() calls are debug tracing left in on purpose.

    Returns the reconstructed document, or an "Error: ..." string when the
    section counts cannot be reconciled.
    """
    scaffold = make_scaffold(content, to_translate)
    print("scaffold:")
    print(scaffold.template)

    # Get original text sections to maintain structure
    original_sections = to_translate.split("\n\n")

    # Split markdown sections to get headers and anchors
    # (split_markdown_sections yields triples; [1::3] selects the header texts)
    divided = split_markdown_sections(to_translate)
    print("divided:")
    print(divided)
    anchors = get_anchors(divided)

    # Split translated content by markdown sections
    translated_divided = split_markdown_sections(translated)
    print("translated divided:")
    print(translated_divided)

    # Ensure we have the same number of headers as the original
    if len(translated_divided[1::3]) != len(anchors):
        print(f"Warning: Header count mismatch. Original: {len(anchors)}, Translated: {len(translated_divided[1::3])}")
        # Adjust anchors list to match translated headers
        if len(translated_divided[1::3]) < len(anchors):
            anchors = anchors[:len(translated_divided[1::3])]
        else:
            # Add empty anchors for extra headers
            anchors.extend([""] * (len(translated_divided[1::3]) - len(anchors)))

    # Add anchors to translated headers only if they're not in code blocks
    for i, korean_title in enumerate(translated_divided[1::3]):
        if i < len(anchors):
            # Find the position of this header in the original translated text
            header_pos = translated.find(korean_title.strip())
            if header_pos != -1 and not is_in_code_block(translated, header_pos):
                translated_divided[1 + i * 3] = f"{korean_title} {anchors[i]}"
            else:
                translated_divided[1 + i * 3] = korean_title

    # Reconstruct translated content with proper structure
    reconstructed_translated = "".join([
        "".join(translated_divided[i * 3 : i * 3 + 3])
        for i in range(len(translated_divided) // 3)
    ])

    # Split by double newlines to match original structure
    translated_sections = reconstructed_translated.split("\n\n")

    print("scaffold template count:")
    print(scaffold.template.count("$hf_i18n_placeholder"))
    print("original sections length:")
    print(len(original_sections))
    print("translated sections length:")
    print(len(translated_sections))

    # Ensure section counts match
    placeholder_count = scaffold.template.count("$hf_i18n_placeholder")

    if len(translated_sections) < placeholder_count:
        # Add empty sections if translated has fewer sections
        translated_sections.extend([""] * (placeholder_count - len(translated_sections)))
    elif len(translated_sections) > placeholder_count:
        # Truncate if translated has more sections
        translated_sections = translated_sections[:placeholder_count]

    # Final check
    if len(translated_sections) != placeholder_count:
        return f"Error: Section count mismatch. Expected: {placeholder_count}, Got: {len(translated_sections)}"

    translated_doc = scaffold.safe_substitute(
        {f"hf_i18n_placeholder{i}": text for i, text in enumerate(translated_sections)}
    )
    return translated_doc
|
164 |
|
translator/retriever.py
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
import os
|
2 |
from pathlib import Path
|
3 |
|
@@ -25,6 +26,59 @@ def get_github_repo_files():
|
|
25 |
return file_paths
|
26 |
|
27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
def retrieve(summary: Summary, table_size: int = 10) -> tuple[str, list[str]]:
|
29 |
"""
|
30 |
Retrieve missing docs
|
|
|
1 |
+
import re
|
2 |
import os
|
3 |
from pathlib import Path
|
4 |
|
|
|
26 |
return file_paths
|
27 |
|
28 |
|
29 |
def get_github_issue_open_pr(lang: str = "ko"):
    """Collect open i18n translation PRs on huggingface/transformers.

    Pages through the GitHub REST API for open pull requests, keeps the ones
    whose title starts with the Korean i18n prefix, and extracts the doc file
    each PR translates from the backtick-quoted filename in its title.

    Args:
        lang: Target language code; only "ko" is currently supported.

    Returns:
        A tuple (filenames, pr_info_list): "docs/source/en/<file>.md" paths
        parsed from PR titles, and the corresponding PR web URLs.

    Raises:
        ValueError: If `lang` is not supported.
        RuntimeError: If the GitHub API returns a non-200 status.
    """
    if lang == "ko":
        # Tracking issue for the Korean translation effort; currently unused
        # here but kept as a reference for the supported-language check.
        issue_id = "20179"  # noqa: F841
    else:
        raise ValueError(
            "No Github issue has been registered to the server. (Only 'ko' is supported - please contact us to support this.)"
        )

    headers = {
        "Accept": "application/vnd.github+json",
    }

    all_open_prs = []
    page = 1
    per_page = 100  # Maximum allowed by GitHub API

    while True:
        url = f"https://api.github.com/repos/huggingface/transformers/pulls?state=open&page={page}&per_page={per_page}"
        # A timeout keeps the caller from hanging forever on a stalled connection.
        response = requests.get(url, headers=headers, timeout=30)

        if response.status_code != 200:
            raise RuntimeError(f"GitHub API error: {response.status_code} {response.text}")

        page_prs = response.json()
        if not page_prs:  # No more PRs
            break

        all_open_prs.extend(page_prs)
        page += 1

        # Break if we got less than per_page results (last page)
        if len(page_prs) < per_page:
            break

    filtered_prs = [pr for pr in all_open_prs if pr["title"].startswith("🌐 [i18n-KO]")]

    # Titles look like: 🌐 [i18n-KO] Translated `accelerator_selection.md` to Korean
    pattern = re.compile(r"`([^`]+\.md)`")

    filenames = [
        "docs/source/en/" + match.group(1)
        for pr in filtered_prs
        if (match := pattern.search(pr["title"]))
    ]
    # Use the PR's `number` field directly instead of parsing it out of the API
    # URL. The original expression also nested double quotes inside a
    # double-quoted f-string ({pr["url"]...}), a SyntaxError on Python < 3.12.
    # NOTE(review): filenames is filtered by the regex match while pr_info_list
    # is not, so the two lists can differ in length — confirm callers expect that.
    pr_info_list = [
        f"https://github.com/huggingface/transformers/pull/{pr['number']}"
        for pr in filtered_prs
    ]
    return filenames, pr_info_list
|
80 |
+
|
81 |
+
|
82 |
def retrieve(summary: Summary, table_size: int = 10) -> tuple[str, list[str]]:
|
83 |
"""
|
84 |
Retrieve missing docs
|