Merge pull request #2 from Jwaminju/update-translator
Browse files- README.md +1 -1
- agent/handler.py +81 -17
- agent/workflow.py +68 -25
- app.py +54 -11
- pr_generator/agent.py +1 -1
- translation_result/docs/source/en/accelerator_selection.md +13 -13
- translator/content.py +95 -27
- translator/retriever.py +54 -0
README.md
CHANGED
@@ -54,7 +54,7 @@ This project was specifically created to solve [Hugging Face Transformers Issue
|
|
54 |
|
55 |
## π₯ Demo Video
|
56 |
|
57 |
-
[
|
58 |
|
59 |
*Watch the complete walkthrough: from setup to PR creation in under 5 minutes*
|
60 |
|
|
|
54 |
|
55 |
## π₯ Demo Video
|
56 |
|
57 |
+
[Hugging Face i18n Agent Demo](https://youtu.be/J2MBMNk7la8?si=7867ztaU2nPN0UEo)
|
58 |
|
59 |
*Watch the complete walkthrough: from setup to PR creation in under 5 minutes*
|
60 |
|
agent/handler.py
CHANGED
@@ -8,10 +8,12 @@ import gradio as gr
|
|
8 |
|
9 |
from agent.workflow import (
|
10 |
report_translation_target_files,
|
|
|
11 |
translate_docs_interactive,
|
12 |
generate_github_pr,
|
13 |
)
|
14 |
from pr_generator.searcher import find_reference_pr_simple_stream
|
|
|
15 |
|
16 |
|
17 |
# State management
|
@@ -21,6 +23,7 @@ class ChatState:
|
|
21 |
self.target_language = "ko"
|
22 |
self.k_files = 10
|
23 |
self.files_to_translate = []
|
|
|
24 |
self.current_file_content = {"translated": ""}
|
25 |
self.pr_result = None # Store PR creation result
|
26 |
# GitHub configuration
|
@@ -70,22 +73,29 @@ def process_file_search_handler(lang: str, k: int, history: list) -> tuple:
|
|
70 |
state.step = "find_files"
|
71 |
|
72 |
status_report, files_list = report_translation_target_files(lang, k)
|
73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
|
75 |
response = f"""**β
File search completed!**
|
76 |
|
77 |
**Status Report:**
|
78 |
{status_report}
|
79 |
-
|
80 |
**π Found first {len(state.files_to_translate)} files to translate:**
|
81 |
"""
|
82 |
|
83 |
if state.files_to_translate:
|
84 |
-
for i, file in enumerate(state.files_to_translate
|
85 |
response += f"\n{i}. `{file}`"
|
86 |
|
87 |
-
if len(state.files_to_translate) > 5:
|
88 |
-
|
89 |
|
90 |
response += "\n\n**π Ready to start translation?**\nI can begin translating these files one by one. Would you like to proceed?"
|
91 |
else:
|
@@ -96,7 +106,18 @@ def process_file_search_handler(lang: str, k: int, history: list) -> tuple:
|
|
96 |
cleared_input = ""
|
97 |
selected_tab = 1 if state.files_to_translate else 0
|
98 |
|
99 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
100 |
|
101 |
|
102 |
def start_translation_process():
|
@@ -108,8 +129,8 @@ def start_translation_process():
|
|
108 |
|
109 |
# Call translation function (simplified for demo)
|
110 |
try:
|
111 |
-
|
112 |
-
state.target_language, [[current_file]]
|
113 |
)
|
114 |
|
115 |
state.current_file_content = {"translated": translated}
|
@@ -124,18 +145,24 @@ def start_translation_process():
|
|
124 |
original_file_link = (
|
125 |
"https://github.com/huggingface/transformers/blob/main/" + current_file
|
126 |
)
|
|
|
|
|
|
|
127 |
response = (
|
128 |
-
f"""π Translation for: `{current_file}
|
129 |
"**π Original Content Link:**\n"
|
130 |
""
|
131 |
f"{original_file_link}\n"
|
132 |
"**π Translated Content:**\n"
|
133 |
-
f"\n```\n\n{_extract_content_for_display(translated)}
|
134 |
-
|
|
|
|
|
|
|
|
|
135 |
)
|
136 |
-
|
137 |
-
|
138 |
-
print("extracted")
|
139 |
|
140 |
except Exception as e:
|
141 |
response = f"β Translation failed: {str(e)}"
|
@@ -191,12 +218,14 @@ def handle_user_message(message, history):
|
|
191 |
# User wants to start translation
|
192 |
if state.files_to_translate:
|
193 |
state.step = "translate"
|
194 |
-
response = start_translation_process()
|
|
|
|
|
|
|
195 |
else:
|
196 |
response = (
|
197 |
"β No files available for translation. Please search for files first."
|
198 |
)
|
199 |
-
|
200 |
# Handle GitHub PR creation - This part is removed as approve_handler is the main entry point
|
201 |
else:
|
202 |
# General response
|
@@ -288,14 +317,44 @@ def update_github_config(token, owner, repo, reference_pr_url):
|
|
288 |
return f"β
GitHub configuration updated: {owner}/{repo}"
|
289 |
|
290 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
291 |
def send_message(message, history):
|
292 |
new_history, cleared_input = handle_user_message(message, history)
|
293 |
return new_history, cleared_input, update_status()
|
294 |
|
295 |
|
296 |
# Button handlers with tab switching
|
297 |
-
def start_translate_handler(history, anthropic_key):
|
298 |
os.environ["ANTHROPIC_API_KEY"] = anthropic_key
|
|
|
|
|
|
|
299 |
new_hist, cleared_input = handle_user_message("start translation", history)
|
300 |
selected_tabs = 2 if state.current_file_content["translated"] else 0
|
301 |
return new_hist, cleared_input, update_status(), gr.Tabs(selected=selected_tabs)
|
@@ -363,11 +422,16 @@ def approve_handler(history, owner, repo, reference_pr_url):
|
|
363 |
translated_content = state.current_file_content["translated"]
|
364 |
response += "\n\nπ **Generating GitHub PR...**"
|
365 |
|
|
|
|
|
|
|
|
|
366 |
pr_response = generate_github_pr(
|
367 |
target_language=state.target_language,
|
368 |
filepath=current_file,
|
369 |
translated_content=translated_content,
|
370 |
github_config=state.github_config,
|
|
|
371 |
)
|
372 |
response += f"\n{pr_response}"
|
373 |
else:
|
|
|
8 |
|
9 |
from agent.workflow import (
|
10 |
report_translation_target_files,
|
11 |
+
report_in_translation_status_files,
|
12 |
translate_docs_interactive,
|
13 |
generate_github_pr,
|
14 |
)
|
15 |
from pr_generator.searcher import find_reference_pr_simple_stream
|
16 |
+
from translator.content import get_full_prompt, get_content, preprocess_content
|
17 |
|
18 |
|
19 |
# State management
|
|
|
23 |
self.target_language = "ko"
|
24 |
self.k_files = 10
|
25 |
self.files_to_translate = []
|
26 |
+
self.additional_instruction = ""
|
27 |
self.current_file_content = {"translated": ""}
|
28 |
self.pr_result = None # Store PR creation result
|
29 |
# GitHub configuration
|
|
|
73 |
state.step = "find_files"
|
74 |
|
75 |
status_report, files_list = report_translation_target_files(lang, k)
|
76 |
+
in_progress_status_report, in_progress_docs = report_in_translation_status_files(
|
77 |
+
lang
|
78 |
+
)
|
79 |
+
state.files_to_translate = (
|
80 |
+
[file[0] for file in files_list if file[0] not in in_progress_docs]
|
81 |
+
if files_list
|
82 |
+
else []
|
83 |
+
)
|
84 |
|
85 |
response = f"""**β
File search completed!**
|
86 |
|
87 |
**Status Report:**
|
88 |
{status_report}
|
89 |
+
{in_progress_status_report}
|
90 |
**π Found first {len(state.files_to_translate)} files to translate:**
|
91 |
"""
|
92 |
|
93 |
if state.files_to_translate:
|
94 |
+
for i, file in enumerate(state.files_to_translate, 1):
|
95 |
response += f"\n{i}. `{file}`"
|
96 |
|
97 |
+
# if len(state.files_to_translate) > 5:
|
98 |
+
# response += f"\n... and {len(state.files_to_translate) - 5} more files"
|
99 |
|
100 |
response += "\n\n**π Ready to start translation?**\nI can begin translating these files one by one. Would you like to proceed?"
|
101 |
else:
|
|
|
106 |
cleared_input = ""
|
107 |
selected_tab = 1 if state.files_to_translate else 0
|
108 |
|
109 |
+
# λλ‘λ€μ΄ choicesλ‘ μΈ νμΌ λ¦¬μ€νΈ λ°ν μΆκ°
|
110 |
+
return (
|
111 |
+
history,
|
112 |
+
cleared_input,
|
113 |
+
update_status(),
|
114 |
+
gr.Tabs(selected=selected_tab),
|
115 |
+
update_dropdown_choices(state.files_to_translate),
|
116 |
+
)
|
117 |
+
|
118 |
+
|
119 |
+
def update_dropdown_choices(file_list):
|
120 |
+
return gr.update(choices=file_list, value=None)
|
121 |
|
122 |
|
123 |
def start_translation_process():
|
|
|
129 |
|
130 |
# Call translation function (simplified for demo)
|
131 |
try:
|
132 |
+
translated = translate_docs_interactive(
|
133 |
+
state.target_language, [[current_file]], state.additional_instruction
|
134 |
)
|
135 |
|
136 |
state.current_file_content = {"translated": translated}
|
|
|
145 |
original_file_link = (
|
146 |
"https://github.com/huggingface/transformers/blob/main/" + current_file
|
147 |
)
|
148 |
+
print("Compeleted translation:\n")
|
149 |
+
print(translated)
|
150 |
+
print("----------------------------")
|
151 |
response = (
|
152 |
+
f"""π Translation for: `{current_file}`\n"""
|
153 |
"**π Original Content Link:**\n"
|
154 |
""
|
155 |
f"{original_file_link}\n"
|
156 |
"**π Translated Content:**\n"
|
157 |
+
# f"\n```\n\n{_extract_content_for_display(translated)}\n```"
|
158 |
+
# "\n```\n\n"
|
159 |
+
# f"\n{translated}\n"
|
160 |
+
# f"```"
|
161 |
+
# f"{status}\n"
|
162 |
+
# "β
Translation completed. The code block will be added when generating PR."
|
163 |
)
|
164 |
+
return response, translated
|
165 |
+
|
|
|
166 |
|
167 |
except Exception as e:
|
168 |
response = f"β Translation failed: {str(e)}"
|
|
|
218 |
# User wants to start translation
|
219 |
if state.files_to_translate:
|
220 |
state.step = "translate"
|
221 |
+
response, translated = start_translation_process()
|
222 |
+
history.append([message, response])
|
223 |
+
history.append(["", translated])
|
224 |
+
return history, ""
|
225 |
else:
|
226 |
response = (
|
227 |
"β No files available for translation. Please search for files first."
|
228 |
)
|
|
|
229 |
# Handle GitHub PR creation - This part is removed as approve_handler is the main entry point
|
230 |
else:
|
231 |
# General response
|
|
|
317 |
return f"β
GitHub configuration updated: {owner}/{repo}"
|
318 |
|
319 |
|
320 |
+
def update_prompt_preview(language, file_path, additional_instruction):
|
321 |
+
"""Update prompt preview based on current settings"""
|
322 |
+
if not file_path.strip():
|
323 |
+
return "Select a file to see the prompt preview..."
|
324 |
+
|
325 |
+
try:
|
326 |
+
# Get language name
|
327 |
+
if language == "ko":
|
328 |
+
translation_lang = "Korean"
|
329 |
+
else:
|
330 |
+
translation_lang = language
|
331 |
+
|
332 |
+
# Get sample content (first 500 characters)
|
333 |
+
content = get_content(file_path)
|
334 |
+
to_translate = preprocess_content(content)
|
335 |
+
|
336 |
+
# Truncate for preview
|
337 |
+
sample_content = to_translate[:500] + ("..." if len(to_translate) > 500 else "")
|
338 |
+
|
339 |
+
# Generate prompt
|
340 |
+
prompt = get_full_prompt(translation_lang, sample_content, additional_instruction)
|
341 |
+
|
342 |
+
return prompt
|
343 |
+
except Exception as e:
|
344 |
+
return f"Error generating prompt preview: {str(e)}"
|
345 |
+
|
346 |
+
|
347 |
def send_message(message, history):
|
348 |
new_history, cleared_input = handle_user_message(message, history)
|
349 |
return new_history, cleared_input, update_status()
|
350 |
|
351 |
|
352 |
# Button handlers with tab switching
|
353 |
+
def start_translate_handler(history, anthropic_key, file_to_translate, additional_instruction=""):
|
354 |
os.environ["ANTHROPIC_API_KEY"] = anthropic_key
|
355 |
+
|
356 |
+
state.additional_instruction = additional_instruction
|
357 |
+
state.files_to_translate = [file_to_translate]
|
358 |
new_hist, cleared_input = handle_user_message("start translation", history)
|
359 |
selected_tabs = 2 if state.current_file_content["translated"] else 0
|
360 |
return new_hist, cleared_input, update_status(), gr.Tabs(selected=selected_tabs)
|
|
|
422 |
translated_content = state.current_file_content["translated"]
|
423 |
response += "\n\nπ **Generating GitHub PR...**"
|
424 |
|
425 |
+
# Extract title from file for toctree mapping
|
426 |
+
file_name = current_file.split("/")[-1].replace(".md", "").replace("_", " ").title()
|
427 |
+
print(file_name)
|
428 |
+
|
429 |
pr_response = generate_github_pr(
|
430 |
target_language=state.target_language,
|
431 |
filepath=current_file,
|
432 |
translated_content=translated_content,
|
433 |
github_config=state.github_config,
|
434 |
+
en_title=file_name,
|
435 |
)
|
436 |
response += f"\n{pr_response}"
|
437 |
else:
|
agent/workflow.py
CHANGED
@@ -11,7 +11,7 @@ from translator.content import (
|
|
11 |
llm_translate,
|
12 |
preprocess_content,
|
13 |
)
|
14 |
-
from translator.retriever import report
|
15 |
|
16 |
# GitHub PR Agent import
|
17 |
try:
|
@@ -38,8 +38,34 @@ def report_translation_target_files(
|
|
38 |
return status_report, [[file] for file in filepath_list]
|
39 |
|
40 |
|
41 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
"""Translate documentation."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
# step 1. Get content from file path
|
44 |
content = get_content(file_path)
|
45 |
to_translate = preprocess_content(content)
|
@@ -47,21 +73,25 @@ def translate_docs(lang: str, file_path: str) -> tuple[str, str]:
|
|
47 |
# step 2. Prepare prompt with docs content
|
48 |
if lang == "ko":
|
49 |
translation_lang = "Korean"
|
50 |
-
to_translate_with_prompt = get_full_prompt(translation_lang, to_translate)
|
|
|
|
|
51 |
|
52 |
# step 3. Translate with LLM
|
53 |
# TODO: MCP clilent λκΈΈ λΆλΆ
|
54 |
callback_result, translated_content = llm_translate(to_translate_with_prompt)
|
55 |
-
|
|
|
56 |
# step 4. Add scaffold to translation result
|
57 |
translated_doc = fill_scaffold(content, to_translate, translated_content)
|
58 |
-
|
|
|
59 |
return callback_result, translated_doc
|
60 |
|
61 |
|
62 |
def translate_docs_interactive(
|
63 |
-
translate_lang: str, selected_files: list[list[str]]
|
64 |
-
) -> tuple[str, str
|
65 |
"""Interactive translation function that processes files one by one.
|
66 |
|
67 |
Args:
|
@@ -70,27 +100,17 @@ def translate_docs_interactive(
|
|
70 |
"""
|
71 |
# Extract file paths from the dataframe format
|
72 |
file_paths = [row[0] for row in selected_files if row and len(row) > 0]
|
73 |
-
if not file_paths:
|
74 |
-
return (
|
75 |
-
"No files selected for translation.",
|
76 |
-
gr.update(visible=False),
|
77 |
-
gr.update(visible=False),
|
78 |
-
gr.update(visible=False),
|
79 |
-
[],
|
80 |
-
0,
|
81 |
-
)
|
82 |
|
83 |
# Start with the first file
|
84 |
current_file = file_paths[0]
|
85 |
|
86 |
status = f"β
Translation completed: `{current_file}` β `{translate_lang}`\n\n"
|
87 |
-
callback_result, translated_content = translate_docs(translate_lang, current_file)
|
88 |
status += f"π° Used token and cost: \n```\n{callback_result}\n```"
|
89 |
|
90 |
-
|
91 |
-
status += f"\n### π Note: Currently, only the first file has been translated.\n> The remaining {len(file_paths) - 1} files have not been processed yet, as the system is in its beta version"
|
92 |
|
93 |
-
return
|
94 |
|
95 |
|
96 |
def generate_github_pr(
|
@@ -98,6 +118,7 @@ def generate_github_pr(
|
|
98 |
filepath: str,
|
99 |
translated_content: str = None,
|
100 |
github_config: dict = None,
|
|
|
101 |
) -> str:
|
102 |
"""Generate a GitHub PR for translated documentation.
|
103 |
|
@@ -106,6 +127,7 @@ def generate_github_pr(
|
|
106 |
filepath: Original file path (e.g., "docs/source/en/accelerator_selection.md")
|
107 |
translated_content: Translated content (if None, read from file)
|
108 |
github_config: GitHub configuration dictionary
|
|
|
109 |
|
110 |
Returns:
|
111 |
PR creation result message
|
@@ -149,9 +171,7 @@ def generate_github_pr(
|
|
149 |
print(f" π File: {filepath}")
|
150 |
print(f" π Language: {target_language}")
|
151 |
print(f" π Reference PR: {github_config['reference_pr_url']}")
|
152 |
-
print(
|
153 |
-
f" π Repository: {github_config['owner']}/{github_config['repo_name']}"
|
154 |
-
)
|
155 |
|
156 |
agent = GitHubPRAgent()
|
157 |
result = agent.run_translation_pr_workflow(
|
@@ -163,14 +183,37 @@ def generate_github_pr(
|
|
163 |
repo_name=github_config["repo_name"],
|
164 |
base_branch=github_config.get("base_branch", "main"),
|
165 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
166 |
|
167 |
# Process result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
168 |
if result["status"] == "success":
|
169 |
return f"""β
**GitHub PR Creation Successful!**
|
170 |
|
171 |
π **PR URL:** {result["pr_url"]}
|
172 |
πΏ **Branch:** {result["branch"]}
|
173 |
-
π **File:** {result["file_path"]}
|
174 |
|
175 |
{result["message"]}"""
|
176 |
|
@@ -178,7 +221,7 @@ def generate_github_pr(
|
|
178 |
return f"""β οΈ **Partial Success**
|
179 |
|
180 |
πΏ **Branch:** {result["branch"]}
|
181 |
-
π **File:** {result["file_path"]}
|
182 |
|
183 |
{result["message"]}
|
184 |
|
|
|
11 |
llm_translate,
|
12 |
preprocess_content,
|
13 |
)
|
14 |
+
from translator.retriever import report, get_github_issue_open_pr
|
15 |
|
16 |
# GitHub PR Agent import
|
17 |
try:
|
|
|
38 |
return status_report, [[file] for file in filepath_list]
|
39 |
|
40 |
|
41 |
+
def report_in_translation_status_files(translate_lang: str) -> tuple[str, list[str]]:
|
42 |
+
docs, pr_info_list = get_github_issue_open_pr(translate_lang)
|
43 |
+
|
44 |
+
status_report = ""
|
45 |
+
if docs:
|
46 |
+
status_report = f"""\nπ€ Found {len(docs)} in progress for translation.
|
47 |
+
"""
|
48 |
+
for i, file in enumerate(docs):
|
49 |
+
status_report += f"\n{i+1}. `{file}`: {pr_info_list[i]}"
|
50 |
+
status_report += "\n"
|
51 |
+
return status_report, docs
|
52 |
+
|
53 |
+
|
54 |
+
def translate_docs(lang: str, file_path: str, additional_instruction: str = "") -> tuple[str, str]:
|
55 |
"""Translate documentation."""
|
56 |
+
# Check if translation already exists
|
57 |
+
translation_file_path = (
|
58 |
+
Path(__file__).resolve().parent.parent
|
59 |
+
/ f"translation_result/{file_path}"
|
60 |
+
)
|
61 |
+
|
62 |
+
if translation_file_path.exists():
|
63 |
+
print(f"π Found existing translation: {translation_file_path}")
|
64 |
+
with open(translation_file_path, "r", encoding="utf-8") as f:
|
65 |
+
existing_content = f.read()
|
66 |
+
if existing_content.strip():
|
67 |
+
return "Existing translation loaded (no tokens used)", existing_content
|
68 |
+
|
69 |
# step 1. Get content from file path
|
70 |
content = get_content(file_path)
|
71 |
to_translate = preprocess_content(content)
|
|
|
73 |
# step 2. Prepare prompt with docs content
|
74 |
if lang == "ko":
|
75 |
translation_lang = "Korean"
|
76 |
+
to_translate_with_prompt = get_full_prompt(translation_lang, to_translate, additional_instruction)
|
77 |
+
|
78 |
+
print("to_translate_with_prompt:\n", to_translate_with_prompt)
|
79 |
|
80 |
# step 3. Translate with LLM
|
81 |
# TODO: MCP clilent λκΈΈ λΆλΆ
|
82 |
callback_result, translated_content = llm_translate(to_translate_with_prompt)
|
83 |
+
print("translated_content:\n")
|
84 |
+
print(translated_content)
|
85 |
# step 4. Add scaffold to translation result
|
86 |
translated_doc = fill_scaffold(content, to_translate, translated_content)
|
87 |
+
print("translated_doc:\n")
|
88 |
+
print(translated_doc)
|
89 |
return callback_result, translated_doc
|
90 |
|
91 |
|
92 |
def translate_docs_interactive(
|
93 |
+
translate_lang: str, selected_files: list[list[str]], additional_instruction: str = ""
|
94 |
+
) -> tuple[str, str]:
|
95 |
"""Interactive translation function that processes files one by one.
|
96 |
|
97 |
Args:
|
|
|
100 |
"""
|
101 |
# Extract file paths from the dataframe format
|
102 |
file_paths = [row[0] for row in selected_files if row and len(row) > 0]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
103 |
|
104 |
# Start with the first file
|
105 |
current_file = file_paths[0]
|
106 |
|
107 |
status = f"β
Translation completed: `{current_file}` β `{translate_lang}`\n\n"
|
108 |
+
callback_result, translated_content = translate_docs(translate_lang, current_file, additional_instruction)
|
109 |
status += f"π° Used token and cost: \n```\n{callback_result}\n```"
|
110 |
|
111 |
+
print(status)
|
|
|
112 |
|
113 |
+
return translated_content
|
114 |
|
115 |
|
116 |
def generate_github_pr(
|
|
|
118 |
filepath: str,
|
119 |
translated_content: str = None,
|
120 |
github_config: dict = None,
|
121 |
+
en_title: str = None,
|
122 |
) -> str:
|
123 |
"""Generate a GitHub PR for translated documentation.
|
124 |
|
|
|
127 |
filepath: Original file path (e.g., "docs/source/en/accelerator_selection.md")
|
128 |
translated_content: Translated content (if None, read from file)
|
129 |
github_config: GitHub configuration dictionary
|
130 |
+
en_title: English title for toctree mapping
|
131 |
|
132 |
Returns:
|
133 |
PR creation result message
|
|
|
171 |
print(f" π File: {filepath}")
|
172 |
print(f" π Language: {target_language}")
|
173 |
print(f" π Reference PR: {github_config['reference_pr_url']}")
|
174 |
+
print(f" π Repository: {github_config['owner']}/{github_config['repo_name']}")
|
|
|
|
|
175 |
|
176 |
agent = GitHubPRAgent()
|
177 |
result = agent.run_translation_pr_workflow(
|
|
|
183 |
repo_name=github_config["repo_name"],
|
184 |
base_branch=github_config.get("base_branch", "main"),
|
185 |
)
|
186 |
+
# result = {
|
187 |
+
# 'status': 'partial_success',
|
188 |
+
# 'branch': 'ko-attention_interface',
|
189 |
+
# 'file_path': 'docs/source/ko/attention_interface.md',
|
190 |
+
# 'message': 'File was saved and commit was successful.\nPR creation failed: ERROR: Existing PR found: https://github.com/Jwaminju/transformers/pull/1', 'error_details': 'ERROR: Existing PR found: https://github.com/Jwaminju/transformers/pull/1'
|
191 |
+
# }
|
192 |
+
# Process toctree update after successful translation PR
|
193 |
+
toctree_result = None
|
194 |
+
if en_title:
|
195 |
+
from agent.toctree_handler import TocTreeHandler
|
196 |
+
toctree_handler = TocTreeHandler()
|
197 |
+
toctree_result = toctree_handler.update_toctree_after_translation(
|
198 |
+
result, en_title, filepath, agent, github_config
|
199 |
+
)
|
200 |
+
print("toctree_result:", toctree_result)
|
201 |
|
202 |
# Process result
|
203 |
+
# Generate toctree status message (shared for both success and partial_success)
|
204 |
+
toctree_status = ""
|
205 |
+
if toctree_result:
|
206 |
+
if toctree_result["status"] == "success":
|
207 |
+
toctree_status = f"\nπ **Toctree Updated:** β
{toctree_result['message']}"
|
208 |
+
else:
|
209 |
+
toctree_status = f"\nπ **Toctree Update Failed:** β {toctree_result['message']}"
|
210 |
+
|
211 |
if result["status"] == "success":
|
212 |
return f"""β
**GitHub PR Creation Successful!**
|
213 |
|
214 |
π **PR URL:** {result["pr_url"]}
|
215 |
πΏ **Branch:** {result["branch"]}
|
216 |
+
π **File:** {result["file_path"]}{toctree_status}
|
217 |
|
218 |
{result["message"]}"""
|
219 |
|
|
|
221 |
return f"""β οΈ **Partial Success**
|
222 |
|
223 |
πΏ **Branch:** {result["branch"]}
|
224 |
+
π **File:** {result["file_path"]}{toctree_status}
|
225 |
|
226 |
{result["message"]}
|
227 |
|
app.py
CHANGED
@@ -14,6 +14,7 @@ from agent.handler import (
|
|
14 |
send_message,
|
15 |
start_translate_handler,
|
16 |
sync_language_displays,
|
|
|
17 |
update_status,
|
18 |
update_github_config,
|
19 |
)
|
@@ -30,7 +31,7 @@ css = """
|
|
30 |
background: rgba(255, 255, 180, 0.25);
|
31 |
border-radius: 18px;
|
32 |
box-shadow: 0 4px 24px rgba(0,0,0,0.08);
|
33 |
-
padding: 1.
|
34 |
backdrop-filter: blur(8px);
|
35 |
border: 1px solid rgba(255,255,180,0.25);
|
36 |
width: 100%;
|
@@ -40,10 +41,12 @@ css = """
|
|
40 |
background: rgba(255, 255, 180, 0.25);
|
41 |
border-radius: 18px;
|
42 |
box-shadow: 0 4px 24px rgba(0,0,0,0.08);
|
43 |
-
padding: 1.
|
44 |
backdrop-filter: blur(8px);
|
45 |
border: 1px solid rgba(255,255,180,0.25);
|
46 |
width: 100%;
|
|
|
|
|
47 |
}
|
48 |
.status-card {
|
49 |
width: 100%
|
@@ -91,7 +94,6 @@ css = """
|
|
91 |
with gr.Blocks(
|
92 |
css=css, title=" π Hugging Face Transformers Docs i18n made easy"
|
93 |
) as demo:
|
94 |
-
|
95 |
# Title
|
96 |
with open("images/hfkr_logo.png", "rb") as img_file:
|
97 |
base64_img = base64.b64encode(img_file.read()).decode()
|
@@ -105,11 +107,12 @@ with gr.Blocks(
|
|
105 |
# Content
|
106 |
with gr.Row():
|
107 |
# Chat interface
|
108 |
-
with gr.Column(scale=
|
109 |
gr.Markdown("### π Hugging Face i18n Agent")
|
110 |
|
111 |
chatbot = gr.Chatbot(
|
112 |
-
value=[[None, get_welcome_message()]], scale=1, height=585
|
|
|
113 |
)
|
114 |
|
115 |
# Controller interface
|
@@ -122,16 +125,15 @@ with gr.Blocks(
|
|
122 |
with gr.Tabs(elem_classes="simple-tabs") as control_tabs:
|
123 |
with gr.TabItem("1. Find Files", id=0):
|
124 |
with gr.Group():
|
125 |
-
lang_dropdown = gr.
|
126 |
choices=[language.value for language in Languages],
|
127 |
label="π Translate To",
|
128 |
value="ko",
|
129 |
)
|
130 |
k_input = gr.Number(
|
131 |
label="π First k missing translated docs",
|
132 |
-
value=
|
133 |
minimum=1,
|
134 |
-
maximum=100,
|
135 |
)
|
136 |
find_btn = gr.Button(
|
137 |
"π Find Files to Translate",
|
@@ -140,6 +142,17 @@ with gr.Blocks(
|
|
140 |
|
141 |
with gr.TabItem("2. Translate", id=1):
|
142 |
with gr.Group():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
143 |
translate_lang_display = gr.Dropdown(
|
144 |
choices=[language.value for language in Languages],
|
145 |
label="π Translation Language",
|
@@ -150,6 +163,21 @@ with gr.Blocks(
|
|
150 |
label="π Anthropic API key for translation generation",
|
151 |
type="password",
|
152 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
153 |
start_translate_btn = gr.Button(
|
154 |
"π Start Translation", elem_classes="action-button"
|
155 |
)
|
@@ -186,7 +214,7 @@ with gr.Blocks(
|
|
186 |
|
187 |
# Chat Controller
|
188 |
with gr.Column(elem_classes=["control-panel"]):
|
189 |
-
gr.Markdown("### π¬ Chat with agent")
|
190 |
msg_input = gr.Textbox(
|
191 |
placeholder="Type your message here... (e.g. 'what', 'how', or 'help')",
|
192 |
container=False,
|
@@ -199,7 +227,7 @@ with gr.Blocks(
|
|
199 |
find_btn.click(
|
200 |
fn=process_file_search_handler,
|
201 |
inputs=[lang_dropdown, k_input, chatbot],
|
202 |
-
outputs=[chatbot, msg_input, status_display, control_tabs],
|
203 |
)
|
204 |
|
205 |
# Sync language across tabs
|
@@ -209,10 +237,17 @@ with gr.Blocks(
|
|
209 |
outputs=[translate_lang_display],
|
210 |
)
|
211 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
212 |
# Button event handlers
|
213 |
start_translate_btn.click(
|
214 |
fn=start_translate_handler,
|
215 |
-
inputs=[chatbot, anthropic_key],
|
216 |
outputs=[chatbot, msg_input, status_display, control_tabs],
|
217 |
)
|
218 |
|
@@ -247,5 +282,13 @@ with gr.Blocks(
|
|
247 |
outputs=[chatbot, msg_input, status_display],
|
248 |
)
|
249 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
250 |
root_path = os.environ.get("GRADIO_ROOT_PATH")
|
251 |
demo.launch(root_path=root_path)
|
|
|
14 |
send_message,
|
15 |
start_translate_handler,
|
16 |
sync_language_displays,
|
17 |
+
update_prompt_preview,
|
18 |
update_status,
|
19 |
update_github_config,
|
20 |
)
|
|
|
31 |
background: rgba(255, 255, 180, 0.25);
|
32 |
border-radius: 18px;
|
33 |
box-shadow: 0 4px 24px rgba(0,0,0,0.08);
|
34 |
+
padding: 1.0em;
|
35 |
backdrop-filter: blur(8px);
|
36 |
border: 1px solid rgba(255,255,180,0.25);
|
37 |
width: 100%;
|
|
|
41 |
background: rgba(255, 255, 180, 0.25);
|
42 |
border-radius: 18px;
|
43 |
box-shadow: 0 4px 24px rgba(0,0,0,0.08);
|
44 |
+
padding: 1.0em;
|
45 |
backdrop-filter: blur(8px);
|
46 |
border: 1px solid rgba(255,255,180,0.25);
|
47 |
width: 100%;
|
48 |
+
overflow: visible !important;
|
49 |
+
|
50 |
}
|
51 |
.status-card {
|
52 |
width: 100%
|
|
|
94 |
with gr.Blocks(
|
95 |
css=css, title=" π Hugging Face Transformers Docs i18n made easy"
|
96 |
) as demo:
|
|
|
97 |
# Title
|
98 |
with open("images/hfkr_logo.png", "rb") as img_file:
|
99 |
base64_img = base64.b64encode(img_file.read()).decode()
|
|
|
107 |
# Content
|
108 |
with gr.Row():
|
109 |
# Chat interface
|
110 |
+
with gr.Column(scale=3, elem_classes=["chat-container"]):
|
111 |
gr.Markdown("### π Hugging Face i18n Agent")
|
112 |
|
113 |
chatbot = gr.Chatbot(
|
114 |
+
value=[[None, get_welcome_message()]], scale=1, height=585,
|
115 |
+
show_copy_button=True
|
116 |
)
|
117 |
|
118 |
# Controller interface
|
|
|
125 |
with gr.Tabs(elem_classes="simple-tabs") as control_tabs:
|
126 |
with gr.TabItem("1. Find Files", id=0):
|
127 |
with gr.Group():
|
128 |
+
lang_dropdown = gr.Radio(
|
129 |
choices=[language.value for language in Languages],
|
130 |
label="π Translate To",
|
131 |
value="ko",
|
132 |
)
|
133 |
k_input = gr.Number(
|
134 |
label="π First k missing translated docs",
|
135 |
+
value=10,
|
136 |
minimum=1,
|
|
|
137 |
)
|
138 |
find_btn = gr.Button(
|
139 |
"π Find Files to Translate",
|
|
|
142 |
|
143 |
with gr.TabItem("2. Translate", id=1):
|
144 |
with gr.Group():
|
145 |
+
files_to_translate = gr.Radio(
|
146 |
+
choices=[],
|
147 |
+
label="π Select a file to translate",
|
148 |
+
interactive=True,
|
149 |
+
value=None,
|
150 |
+
)
|
151 |
+
file_to_translate_input = gr.Textbox(
|
152 |
+
label="π Select in the dropdown or write the file path to translate",
|
153 |
+
value="",
|
154 |
+
)
|
155 |
+
|
156 |
translate_lang_display = gr.Dropdown(
|
157 |
choices=[language.value for language in Languages],
|
158 |
label="π Translation Language",
|
|
|
163 |
label="π Anthropic API key for translation generation",
|
164 |
type="password",
|
165 |
)
|
166 |
+
additional_instruction = gr.Textbox(
|
167 |
+
label="π Additional instructions (Optional - e.g., custom glossary)",
|
168 |
+
placeholder="Example: Translate 'model' as 'λͺ¨λΈ' consistently",
|
169 |
+
lines=2,
|
170 |
+
)
|
171 |
+
|
172 |
+
with gr.Accordion("π Preview Prompt", open=False):
|
173 |
+
prompt_preview = gr.Textbox(
|
174 |
+
label="Current Translation Prompt",
|
175 |
+
lines=8,
|
176 |
+
interactive=False,
|
177 |
+
placeholder="Select a file and language to see the prompt preview...",
|
178 |
+
show_copy_button=True,
|
179 |
+
)
|
180 |
+
|
181 |
start_translate_btn = gr.Button(
|
182 |
"π Start Translation", elem_classes="action-button"
|
183 |
)
|
|
|
214 |
|
215 |
# Chat Controller
|
216 |
with gr.Column(elem_classes=["control-panel"]):
|
217 |
+
gr.Markdown("### π¬ Chat with agent (Only simple chat is available)")
|
218 |
msg_input = gr.Textbox(
|
219 |
placeholder="Type your message here... (e.g. 'what', 'how', or 'help')",
|
220 |
container=False,
|
|
|
227 |
find_btn.click(
|
228 |
fn=process_file_search_handler,
|
229 |
inputs=[lang_dropdown, k_input, chatbot],
|
230 |
+
outputs=[chatbot, msg_input, status_display, control_tabs, files_to_translate],
|
231 |
)
|
232 |
|
233 |
# Sync language across tabs
|
|
|
237 |
outputs=[translate_lang_display],
|
238 |
)
|
239 |
|
240 |
+
#
|
241 |
+
files_to_translate.change(
|
242 |
+
fn=lambda x: x,
|
243 |
+
inputs=[files_to_translate],
|
244 |
+
outputs=[file_to_translate_input],
|
245 |
+
)
|
246 |
+
|
247 |
# Button event handlers
|
248 |
start_translate_btn.click(
|
249 |
fn=start_translate_handler,
|
250 |
+
inputs=[chatbot, anthropic_key, file_to_translate_input, additional_instruction],
|
251 |
outputs=[chatbot, msg_input, status_display, control_tabs],
|
252 |
)
|
253 |
|
|
|
282 |
outputs=[chatbot, msg_input, status_display],
|
283 |
)
|
284 |
|
285 |
+
# Update prompt preview when inputs change
|
286 |
+
for input_component in [translate_lang_display, file_to_translate_input, additional_instruction]:
|
287 |
+
input_component.change(
|
288 |
+
fn=update_prompt_preview,
|
289 |
+
inputs=[translate_lang_display, file_to_translate_input, additional_instruction],
|
290 |
+
outputs=[prompt_preview],
|
291 |
+
)
|
292 |
+
|
293 |
root_path = os.environ.get("GRADIO_ROOT_PATH")
|
294 |
demo.launch(root_path=root_path)
|
pr_generator/agent.py
CHANGED
@@ -518,7 +518,7 @@ Please return only the commit message. No other explanation is needed."""
|
|
518 |
"status": "partial_success",
|
519 |
"branch": branch_name,
|
520 |
"file_path": target_filepath,
|
521 |
-
"message": f"File was saved
|
522 |
"error_details": pr_result,
|
523 |
}
|
524 |
elif "successful" in pr_result and "http" in pr_result:
|
|
|
518 |
"status": "partial_success",
|
519 |
"branch": branch_name,
|
520 |
"file_path": target_filepath,
|
521 |
+
"message": f"File was saved and commit was successful.\nPR creation failed: {pr_result}",
|
522 |
"error_details": pr_result,
|
523 |
}
|
524 |
elif "successful" in pr_result and "http" in pr_result:
|
translation_result/docs/source/en/accelerator_selection.md
CHANGED
@@ -16,7 +16,7 @@ rendered properly in your Markdown viewer.
|
|
16 |
|
17 |
# κ°μκΈ° μ ν [[accelerator-selection]]
|
18 |
|
19 |
-
λΆμ°
|
20 |
|
21 |
μ΄ κ°μ΄λλ μ¬μ©ν κ°μκΈ°μ μμ μ¬μ© μμλ₯Ό μ ννλ λ°©λ²μ 보μ¬μ€λλ€.
|
22 |
|
@@ -27,7 +27,7 @@ rendered properly in your Markdown viewer.
|
|
27 |
<hfoptions id="select-accelerator">
|
28 |
<hfoption id="torchrun">
|
29 |
|
30 |
-
`--nproc_per_node`λ₯Ό μ¬μ©νμ¬ μ¬μ©ν κ°μκΈ° μλ₯Ό
|
31 |
|
32 |
```bash
|
33 |
torchrun --nproc_per_node=2 trainer-program.py ...
|
@@ -36,7 +36,7 @@ torchrun --nproc_per_node=2 trainer-program.py ...
|
|
36 |
</hfoption>
|
37 |
<hfoption id="Accelerate">
|
38 |
|
39 |
-
`--num_processes`λ₯Ό μ¬μ©νμ¬ μ¬μ©ν κ°μκΈ° μλ₯Ό
|
40 |
|
41 |
```bash
|
42 |
accelerate launch --num_processes 2 trainer-program.py ...
|
@@ -45,7 +45,7 @@ accelerate launch --num_processes 2 trainer-program.py ...
|
|
45 |
</hfoption>
|
46 |
<hfoption id="DeepSpeed">
|
47 |
|
48 |
-
`--num_gpus`λ₯Ό μ¬μ©νμ¬ μ¬μ©ν GPU μλ₯Ό
|
49 |
|
50 |
```bash
|
51 |
deepspeed --num_gpus 2 trainer-program.py ...
|
@@ -55,7 +55,7 @@ deepspeed --num_gpus 2 trainer-program.py ...
|
|
55 |
</hfoptions>
|
56 |
|
57 |
## κ°μκΈ° μμ [[order-of-accelerators]]
|
58 |
-
μ¬μ©ν νΉμ κ°μκΈ°μ κ·Έ μμλ₯Ό μ ννλ €λ©΄ νλμ¨μ΄μ μ ν©ν νκ²½ λ³μλ₯Ό μ¬μ©νμΈμ. μ΄λ κ°
|
59 |
|
60 |
μλ₯Ό λ€μ΄, 4κ°μ κ°μκΈ°(0, 1, 2, 3)κ° μκ³ κ°μκΈ° 0κ³Ό 2λ§ μ€ννκ³ μΆλ€λ©΄:
|
61 |
|
@@ -66,7 +66,7 @@ deepspeed --num_gpus 2 trainer-program.py ...
|
|
66 |
CUDA_VISIBLE_DEVICES=0,2 torchrun trainer-program.py ...
|
67 |
```
|
68 |
|
69 |
-
GPU 0κ³Ό 2λ§ PyTorch
|
70 |
μμλ₯Ό λ°κΎΈλ €λ©΄ (GPU 2λ₯Ό `cuda:0`μΌλ‘, GPU 0μ `cuda:1`λ‘ μ¬μ©):
|
71 |
|
72 |
|
@@ -80,15 +80,15 @@ GPU μμ΄ μ€ννλ €λ©΄:
|
|
80 |
CUDA_VISIBLE_DEVICES= python trainer-program.py ...
|
81 |
```
|
82 |
|
83 |
-
`CUDA_DEVICE_ORDER`λ₯Ό μ¬μ©νμ¬ CUDA
|
84 |
|
85 |
-
- PCIe λ²μ€ ID
|
86 |
|
87 |
```bash
|
88 |
$hf_i18n_placeholder21export CUDA_DEVICE_ORDER=PCI_BUS_ID
|
89 |
```
|
90 |
|
91 |
-
-
|
92 |
|
93 |
```bash
|
94 |
export CUDA_DEVICE_ORDER=FASTEST_FIRST
|
@@ -101,7 +101,7 @@ $hf_i18n_placeholder21export CUDA_DEVICE_ORDER=PCI_BUS_ID
|
|
101 |
ZE_AFFINITY_MASK=0,2 torchrun trainer-program.py ...
|
102 |
```
|
103 |
|
104 |
-
XPU 0κ³Ό 2λ§ PyTorch
|
105 |
μμλ₯Ό λ°κΎΈλ €λ©΄ (XPU 2λ₯Ό `xpu:0`μΌλ‘, XPU 0μ `xpu:1`λ‘ μ¬μ©):
|
106 |
|
107 |
```bash
|
@@ -109,13 +109,13 @@ ZE_AFFINITY_MASK=2,0 torchrun trainer-program.py ...
|
|
109 |
```
|
110 |
|
111 |
|
112 |
-
|
113 |
|
114 |
```bash
|
115 |
export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
|
116 |
```
|
117 |
|
118 |
-
Intel XPU
|
119 |
|
120 |
</hfoption>
|
121 |
</hfoptions>
|
@@ -123,5 +123,5 @@ Intel XPUμ μ₯μΉ μ΄κ±° λ° μ λ ¬μ λν μμΈν μ 보λ [Level Zero]
|
|
123 |
|
124 |
|
125 |
> [!WARNING]
|
126 |
-
> νκ²½ λ³μλ λͺ
λ Ήμ€μ μΆκ°νλ λμ
|
127 |
```
|
|
|
16 |
|
17 |
# κ°μκΈ° μ ν [[accelerator-selection]]
|
18 |
|
19 |
+
λΆμ° νμ΅ μ€μλ μ¬μ©ν κ°μκΈ°(CUDA, XPU, MPS, HPU λ±)μ μμ μμλ₯Ό μ§μ ν μ μμ΅λλ€. μ΄λ μλ‘ λ€λ₯Έ μ»΄ν¨ν
μ±λ₯μ κ°μ§ κ°μκΈ°κ° μμ λ λ λΉ λ₯Έ κ°μκΈ°λ₯Ό λ¨Όμ μ¬μ©νκ³ μΆμ κ²½μ°μ μ μ©ν μ μμ΅λλ€. λλ μ¬μ© κ°λ₯ν κ°μκΈ°μ μΌλΆλ§ μ¬μ©ν μλ μμ΅λλ€. μ ν κ³Όμ μ [DistributedDataParallel](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html)κ³Ό [DataParallel](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html) λͺ¨λμμ μλν©λλ€. Accelerateλ [DeepSpeed integration](./main_classes/deepspeed)λ νμνμ§ μμ΅λλ€.
|
20 |
|
21 |
μ΄ κ°μ΄λλ μ¬μ©ν κ°μκΈ°μ μμ μ¬μ© μμλ₯Ό μ ννλ λ°©λ²μ 보μ¬μ€λλ€.
|
22 |
|
|
|
27 |
<hfoptions id="select-accelerator">
|
28 |
<hfoption id="torchrun">
|
29 |
|
30 |
+
`--nproc_per_node`λ₯Ό μ¬μ©νμ¬ μ¬μ©ν κ°μκΈ° μλ₯Ό μ νν©λλ€.
|
31 |
|
32 |
```bash
|
33 |
torchrun --nproc_per_node=2 trainer-program.py ...
|
|
|
36 |
</hfoption>
|
37 |
<hfoption id="Accelerate">
|
38 |
|
39 |
+
`--num_processes`λ₯Ό μ¬μ©νμ¬ μ¬μ©ν κ°μκΈ° μλ₯Ό μ νν©λλ€.
|
40 |
|
41 |
```bash
|
42 |
accelerate launch --num_processes 2 trainer-program.py ...
|
|
|
45 |
</hfoption>
|
46 |
<hfoption id="DeepSpeed">
|
47 |
|
48 |
+
`--num_gpus`λ₯Ό μ¬μ©νμ¬ μ¬μ©ν GPU μλ₯Ό μ νν©λλ€.
|
49 |
|
50 |
```bash
|
51 |
deepspeed --num_gpus 2 trainer-program.py ...
|
|
|
55 |
</hfoptions>
|
56 |
|
57 |
## κ°μκΈ° μμ [[order-of-accelerators]]
|
58 |
+
μ¬μ©ν νΉμ κ°μκΈ°μ κ·Έ μμλ₯Ό μ ννλ €λ©΄ νλμ¨μ΄μ μ ν©ν νκ²½ λ³μλ₯Ό μ¬μ©νμΈμ. μ΄λ μ’
μ’
κ° μ€νμ λν΄ λͺ
λ Ήμ€μμ μ€μ λμ§λ§, `~/.bashrc`λ λ€λ₯Έ μμ κ΅¬μ± νμΌμ μΆκ°ν μλ μμ΅λλ€.
|
59 |
|
60 |
μλ₯Ό λ€μ΄, 4κ°μ κ°μκΈ°(0, 1, 2, 3)κ° μκ³ κ°μκΈ° 0κ³Ό 2λ§ μ€ννκ³ μΆλ€λ©΄:
|
61 |
|
|
|
66 |
CUDA_VISIBLE_DEVICES=0,2 torchrun trainer-program.py ...
|
67 |
```
|
68 |
|
69 |
+
GPU 0κ³Ό 2λ§ PyTorchμμ "보μ΄λ©°" κ°κ° `cuda:0`κ³Ό `cuda:1`λ‘ λ§€νλ©λλ€.
|
70 |
μμλ₯Ό λ°κΎΈλ €λ©΄ (GPU 2λ₯Ό `cuda:0`μΌλ‘, GPU 0μ `cuda:1`λ‘ μ¬μ©):
|
71 |
|
72 |
|
|
|
80 |
CUDA_VISIBLE_DEVICES= python trainer-program.py ...
|
81 |
```
|
82 |
|
83 |
+
`CUDA_DEVICE_ORDER`λ₯Ό μ¬μ©νμ¬ CUDA μ₯μΉμ μμλ₯Ό μ μ΄ν μλ μμ΅λλ€:
|
84 |
|
85 |
+
- PCIe λ²μ€ ID μμ (`nvidia-smi`μ μΌμΉ):
|
86 |
|
87 |
```bash
|
88 |
export CUDA_DEVICE_ORDER=PCI_BUS_ID
|
89 |
```
|
90 |
|
91 |
+
- μ»΄ν¨ν
μ±λ₯ μμ (κ°μ₯ λΉ λ₯Έ κ²λΆν°):
|
92 |
|
93 |
```bash
|
94 |
export CUDA_DEVICE_ORDER=FASTEST_FIRST
|
|
|
101 |
ZE_AFFINITY_MASK=0,2 torchrun trainer-program.py ...
|
102 |
```
|
103 |
|
104 |
+
XPU 0κ³Ό 2λ§ PyTorchμμ "보μ΄λ©°" κ°κ° `xpu:0`κ³Ό `xpu:1`λ‘ λ§€νλ©λλ€.
|
105 |
μμλ₯Ό λ°κΎΈλ €λ©΄ (XPU 2λ₯Ό `xpu:0`μΌλ‘, XPU 0μ `xpu:1`λ‘ μ¬μ©):
|
106 |
|
107 |
```bash
|
|
|
109 |
```
|
110 |
|
111 |
|
112 |
+
λ€μμ μ¬μ©νμ¬ Intel XPUμ μμλ₯Ό μ μ΄ν μλ μμ΅λλ€:
|
113 |
|
114 |
```bash
|
115 |
export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
|
116 |
```
|
117 |
|
118 |
+
Intel XPUμμμ μ₯μΉ μ΄κ±° λ° μ λ ¬μ λν μμΈν μ 보λ [Level Zero](https://github.com/oneapi-src/level-zero/blob/master/README.md?plain=1#L87) λ¬Έμλ₯Ό μ°Έμ‘°νμΈμ.
|
119 |
|
120 |
</hfoption>
|
121 |
</hfoptions>
|
|
|
123 |
|
124 |
|
125 |
> [!WARNING]
|
126 |
+
> νκ²½ λ³μλ λͺ
λ Ήμ€μ μΆκ°νλ λμ λ΄λ³΄λΌ μ μμ΅λλ€. νκ²½ λ³μκ° μ΄λ»κ² μ€μ λμλμ§ μμ΄λ²λ¦¬κ³ μλͺ»λ κ°μκΈ°λ₯Ό μ¬μ©νκ² λ μ μμ΄ νΌλμ μΌκΈ°ν μ μμΌλ―λ‘ κΆμ₯νμ§ μμ΅λλ€. λμ , κ°μ λͺ
λ Ήμ€μμ νΉμ νλ ¨ μ€νμ μν΄ νκ²½ λ³μλ₯Ό μ€μ νλ κ²μ΄ μΌλ°μ μΈ κ΄λ‘μ
λλ€.
|
127 |
```
|
translator/content.py
CHANGED
@@ -5,8 +5,13 @@ import requests
|
|
5 |
from langchain.callbacks import get_openai_callback
|
6 |
from langchain_anthropic import ChatAnthropic
|
7 |
|
|
|
|
|
8 |
|
9 |
def get_content(filepath: str) -> str:
|
|
|
|
|
|
|
10 |
url = string.Template(
|
11 |
"https://raw.githubusercontent.com/huggingface/" "transformers/main/$filepath"
|
12 |
).safe_substitute(filepath=filepath)
|
@@ -24,24 +29,31 @@ def preprocess_content(content: str) -> str:
|
|
24 |
## ignore top license comment
|
25 |
to_translate = content[content.find("#") :]
|
26 |
## remove code blocks from text
|
27 |
-
to_translate = re.sub(r"```.*?```", "", to_translate, flags=re.DOTALL)
|
28 |
## remove markdown tables from text
|
29 |
-
to_translate = re.sub(r"^\|.*\|$\n?", "", to_translate, flags=re.MULTILINE)
|
30 |
## remove empty lines from text
|
31 |
to_translate = re.sub(r"\n\n+", "\n\n", to_translate)
|
32 |
-
|
33 |
return to_translate
|
34 |
|
35 |
|
36 |
-
def get_full_prompt(language: str, to_translate: str) -> str:
|
37 |
-
|
38 |
"What do these sentences about Hugging Face Transformers "
|
39 |
"(a machine learning library) mean in $language? "
|
40 |
"Please do not translate the word after a π€ emoji "
|
41 |
-
"as it is a product name. Output
|
42 |
-
"
|
43 |
).safe_substitute(language=language)
|
44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
|
46 |
|
47 |
def split_markdown_sections(markdown: str) -> list:
|
@@ -64,33 +76,89 @@ def make_scaffold(content: str, to_translate: str) -> string.Template:
|
|
64 |
scaffold = content
|
65 |
for i, text in enumerate(to_translate.split("\n\n")):
|
66 |
scaffold = scaffold.replace(text, f"$hf_i18n_placeholder{i}", 1)
|
|
|
|
|
67 |
return string.Template(scaffold)
|
68 |
|
69 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
70 |
def fill_scaffold(content: str, to_translate: str, translated: str) -> str:
|
71 |
scaffold = make_scaffold(content, to_translate)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
72 |
divided = split_markdown_sections(to_translate)
|
|
|
|
|
73 |
anchors = get_anchors(divided)
|
74 |
-
|
75 |
-
translated
|
76 |
-
|
77 |
-
translated
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
92 |
translated_doc = scaffold.safe_substitute(
|
93 |
-
{f"hf_i18n_placeholder{i}": text for i, text in enumerate(
|
94 |
)
|
95 |
return translated_doc
|
96 |
|
|
|
5 |
from langchain.callbacks import get_openai_callback
|
6 |
from langchain_anthropic import ChatAnthropic
|
7 |
|
8 |
+
from translator.prompt_glossary import PROMPT_WITH_GLOSSARY
|
9 |
+
|
10 |
|
11 |
def get_content(filepath: str) -> str:
|
12 |
+
if filepath == "":
|
13 |
+
raise ValueError("No files selected for translation.")
|
14 |
+
|
15 |
url = string.Template(
|
16 |
"https://raw.githubusercontent.com/huggingface/" "transformers/main/$filepath"
|
17 |
).safe_substitute(filepath=filepath)
|
|
|
29 |
## ignore top license comment
|
30 |
to_translate = content[content.find("#") :]
|
31 |
## remove code blocks from text
|
32 |
+
# to_translate = re.sub(r"```.*?```", "", to_translate, flags=re.DOTALL)
|
33 |
## remove markdown tables from text
|
34 |
+
# to_translate = re.sub(r"^\|.*\|$\n?", "", to_translate, flags=re.MULTILINE)
|
35 |
## remove empty lines from text
|
36 |
to_translate = re.sub(r"\n\n+", "\n\n", to_translate)
|
|
|
37 |
return to_translate
|
38 |
|
39 |
|
40 |
def get_full_prompt(language: str, to_translate: str, additional_instruction: str = "") -> str:
    """Build the full translation prompt sent to the LLM.

    Args:
        language: Target language name substituted into the prompt template.
        to_translate: Markdown content to translate; embedded in a ```md fence.
        additional_instruction: Optional extra guidance appended at the end.

    Returns:
        The complete prompt string: base instructions, the fenced markdown to
        translate, the glossary prompt, and any additional instructions.
    """
    # safe_substitute only fills $language, so stray `$` in the text is harmless.
    # NOTE: the original literals ran "…content intact" straight into
    # "No explanations…" (missing separator) and left an unbalanced `**`
    # after "markdown file"; both fixed below.
    base_prompt = string.Template(
        "What do these sentences about Hugging Face Transformers "
        "(a machine learning library) mean in $language? "
        "Please do not translate the word after a 🤗 emoji "
        "as it is a product name. Output the complete markdown file, "
        "with prose translated and all other content intact. "
        "No explanations or extras—only the translated markdown. "
        "Also translate all comments within code blocks as well."
    ).safe_substitute(language=language)

    # Open the fenced block that will hold the source markdown.
    base_prompt += "\n\n```md"

    full_prompt = "\n".join([base_prompt, to_translate.strip(), "```", PROMPT_WITH_GLOSSARY])

    if additional_instruction.strip():
        full_prompt += f"\n\n🗒️ Additional instructions: {additional_instruction.strip()}"

    return full_prompt
|
57 |
|
58 |
|
59 |
def split_markdown_sections(markdown: str) -> list:
|
|
|
76 |
scaffold = content
|
77 |
for i, text in enumerate(to_translate.split("\n\n")):
|
78 |
scaffold = scaffold.replace(text, f"$hf_i18n_placeholder{i}", 1)
|
79 |
+
print("inner scaffold:")
|
80 |
+
print(scaffold)
|
81 |
return string.Template(scaffold)
|
82 |
|
83 |
|
84 |
def is_in_code_block(text: str, position: int) -> bool:
    """Tell whether the character at `position` in `text` lies inside a fenced code block.

    Counts the ``` fence markers that precede `position`; an odd count means a
    fence was opened but not yet closed, i.e. the position is inside a block.
    """
    fences_seen = text[:position].count("```")
    return fences_seen % 2 != 0
|
89 |
+
|
90 |
+
|
91 |
def fill_scaffold(content: str, to_translate: str, translated: str) -> str:
    """Re-insert translated sections into the original document scaffold.

    The scaffold (built by `make_scaffold`) is the original `content` with each
    translatable "\n\n"-separated section replaced by a `$hf_i18n_placeholderN`
    template slot. This function re-appends the original `[[anchor]]` ids to
    the translated headers, then substitutes the translated sections back into
    the scaffold. The many print() calls are debug tracing left in on purpose.

    Returns the reconstructed document, or an "Error: ..." string when the
    section counts cannot be reconciled.
    """
    scaffold = make_scaffold(content, to_translate)
    print("scaffold:")
    print(scaffold.template)

    # Get original text sections to maintain structure
    original_sections = to_translate.split("\n\n")

    # Split markdown sections to get headers and anchors
    # (split_markdown_sections yields triples; [1::3] selects the header texts)
    divided = split_markdown_sections(to_translate)
    print("divided:")
    print(divided)
    anchors = get_anchors(divided)

    # Split translated content by markdown sections
    translated_divided = split_markdown_sections(translated)
    print("translated divided:")
    print(translated_divided)

    # Ensure we have the same number of headers as the original
    if len(translated_divided[1::3]) != len(anchors):
        print(f"Warning: Header count mismatch. Original: {len(anchors)}, Translated: {len(translated_divided[1::3])}")
        # Adjust anchors list to match translated headers
        if len(translated_divided[1::3]) < len(anchors):
            anchors = anchors[:len(translated_divided[1::3])]
        else:
            # Add empty anchors for extra headers
            anchors.extend([""] * (len(translated_divided[1::3]) - len(anchors)))

    # Add anchors to translated headers only if they're not in code blocks
    for i, korean_title in enumerate(translated_divided[1::3]):
        if i < len(anchors):
            # Find the position of this header in the original translated text
            header_pos = translated.find(korean_title.strip())
            if header_pos != -1 and not is_in_code_block(translated, header_pos):
                translated_divided[1 + i * 3] = f"{korean_title} {anchors[i]}"
            else:
                translated_divided[1 + i * 3] = korean_title

    # Reconstruct translated content with proper structure
    reconstructed_translated = "".join([
        "".join(translated_divided[i * 3 : i * 3 + 3])
        for i in range(len(translated_divided) // 3)
    ])

    # Split by double newlines to match original structure
    translated_sections = reconstructed_translated.split("\n\n")

    print("scaffold template count:")
    print(scaffold.template.count("$hf_i18n_placeholder"))
    print("original sections length:")
    print(len(original_sections))
    print("translated sections length:")
    print(len(translated_sections))

    # Ensure section counts match
    placeholder_count = scaffold.template.count("$hf_i18n_placeholder")

    if len(translated_sections) < placeholder_count:
        # Add empty sections if translated has fewer sections
        translated_sections.extend([""] * (placeholder_count - len(translated_sections)))
    elif len(translated_sections) > placeholder_count:
        # Truncate if translated has more sections
        translated_sections = translated_sections[:placeholder_count]

    # Final check
    if len(translated_sections) != placeholder_count:
        return f"Error: Section count mismatch. Expected: {placeholder_count}, Got: {len(translated_sections)}"

    translated_doc = scaffold.safe_substitute(
        {f"hf_i18n_placeholder{i}": text for i, text in enumerate(translated_sections)}
    )
    return translated_doc
|
164 |
|
translator/retriever.py
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
import os
|
2 |
from pathlib import Path
|
3 |
|
@@ -25,6 +26,59 @@ def get_github_repo_files():
|
|
25 |
return file_paths
|
26 |
|
27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
def retrieve(summary: Summary, table_size: int = 10) -> tuple[str, list[str]]:
|
29 |
"""
|
30 |
Retrieve missing docs
|
|
|
1 |
+
import re
|
2 |
import os
|
3 |
from pathlib import Path
|
4 |
|
|
|
26 |
return file_paths
|
27 |
|
28 |
|
29 |
def get_github_issue_open_pr(lang: str = "ko"):
    """Collect open i18n translation PRs on huggingface/transformers.

    Pages through the GitHub REST API for open pull requests, keeps the ones
    whose title starts with the Korean i18n prefix, and extracts the doc file
    each PR translates from the backtick-quoted filename in its title.

    Args:
        lang: Target language code; only "ko" is currently supported.

    Returns:
        A tuple (filenames, pr_info_list): "docs/source/en/<file>.md" paths
        parsed from PR titles, and the corresponding PR web URLs.

    Raises:
        ValueError: If `lang` is not supported.
        RuntimeError: If the GitHub API returns a non-200 status.
    """
    if lang == "ko":
        # Tracking issue for the Korean translation effort; currently unused
        # here but kept as a reference for the supported-language check.
        issue_id = "20179"  # noqa: F841
    else:
        raise ValueError(
            "No Github issue has been registered to the server. (Only 'ko' is supported - please contact us to support this.)"
        )

    headers = {
        "Accept": "application/vnd.github+json",
    }

    all_open_prs = []
    page = 1
    per_page = 100  # Maximum allowed by GitHub API

    while True:
        url = f"https://api.github.com/repos/huggingface/transformers/pulls?state=open&page={page}&per_page={per_page}"
        # A timeout keeps the caller from hanging forever on a stalled connection.
        response = requests.get(url, headers=headers, timeout=30)

        if response.status_code != 200:
            raise RuntimeError(f"GitHub API error: {response.status_code} {response.text}")

        page_prs = response.json()
        if not page_prs:  # No more PRs
            break

        all_open_prs.extend(page_prs)
        page += 1

        # Break if we got less than per_page results (last page)
        if len(page_prs) < per_page:
            break

    filtered_prs = [pr for pr in all_open_prs if pr["title"].startswith("🌐 [i18n-KO]")]

    # Titles look like: 🌐 [i18n-KO] Translated `accelerator_selection.md` to Korean
    pattern = re.compile(r"`([^`]+\.md)`")

    filenames = [
        "docs/source/en/" + match.group(1)
        for pr in filtered_prs
        if (match := pattern.search(pr["title"]))
    ]
    # Use the PR's `number` field directly instead of parsing it out of the API
    # URL. The original expression also nested double quotes inside a
    # double-quoted f-string ({pr["url"]...}), a SyntaxError on Python < 3.12.
    # NOTE(review): filenames is filtered by the regex match while pr_info_list
    # is not, so the two lists can differ in length — confirm callers expect that.
    pr_info_list = [
        f"https://github.com/huggingface/transformers/pull/{pr['number']}"
        for pr in filtered_prs
    ]
    return filenames, pr_info_list
|
80 |
+
|
81 |
+
|
82 |
def retrieve(summary: Summary, table_size: int = 10) -> tuple[str, list[str]]:
|
83 |
"""
|
84 |
Retrieve missing docs
|