wony617
committed on
Commit
·
8957aec
1
Parent(s):
10adc15
Fix translation doc finder
Browse files
- agent/handler.py +2 -6
- agent/toctree_handler.py +16 -47
- agent/workflow.py +22 -16
- pr_generator/agent.py +23 -10
agent/handler.py
CHANGED
@@ -8,7 +8,6 @@ import gradio as gr
|
|
8 |
|
9 |
from agent.workflow import (
|
10 |
report_translation_target_files,
|
11 |
-
report_in_translation_status_files,
|
12 |
translate_docs_interactive,
|
13 |
generate_github_pr,
|
14 |
)
|
@@ -73,11 +72,8 @@ def process_file_search_handler(lang: str, k: int, history: list) -> tuple:
|
|
73 |
state.step = "find_files"
|
74 |
|
75 |
status_report, files_list = report_translation_target_files(lang, k)
|
76 |
-
in_progress_status_report, in_progress_docs = report_in_translation_status_files(
|
77 |
-
lang
|
78 |
-
)
|
79 |
state.files_to_translate = (
|
80 |
-
[file[0] for file in files_list
|
81 |
if files_list
|
82 |
else []
|
83 |
)
|
@@ -86,7 +82,7 @@ def process_file_search_handler(lang: str, k: int, history: list) -> tuple:
|
|
86 |
|
87 |
**Status Report:**
|
88 |
{status_report}
|
89 |
-
|
90 |
**π Found first {len(state.files_to_translate)} files to translate:**
|
91 |
"""
|
92 |
|
|
|
8 |
|
9 |
from agent.workflow import (
|
10 |
report_translation_target_files,
|
|
|
11 |
translate_docs_interactive,
|
12 |
generate_github_pr,
|
13 |
)
|
|
|
72 |
state.step = "find_files"
|
73 |
|
74 |
status_report, files_list = report_translation_target_files(lang, k)
|
|
|
|
|
|
|
75 |
state.files_to_translate = (
|
76 |
+
[file[0] for file in files_list]
|
77 |
if files_list
|
78 |
else []
|
79 |
)
|
|
|
82 |
|
83 |
**Status Report:**
|
84 |
{status_report}
|
85 |
+
|
86 |
**π Found first {len(state.files_to_translate)} files to translate:**
|
87 |
"""
|
88 |
|
agent/toctree_handler.py
CHANGED
@@ -90,26 +90,7 @@ Korean title:"""
|
|
90 |
'local': local_file_path,
|
91 |
'title': en_title
|
92 |
}
|
93 |
-
|
94 |
-
def update_local_toctree_file(self, new_entries: List[Dict[str, str]]):
|
95 |
-
"""Update or create local _toctree.yml file"""
|
96 |
-
toctree_path = os.path.join(self.local_docs_path, "_toctree.yml")
|
97 |
-
|
98 |
-
os.makedirs(self.local_docs_path, exist_ok=True)
|
99 |
-
|
100 |
-
if os.path.exists(toctree_path):
|
101 |
-
with open(toctree_path, 'r', encoding='utf-8') as f:
|
102 |
-
existing_data = yaml.safe_load(f) or []
|
103 |
-
else:
|
104 |
-
existing_data = []
|
105 |
-
|
106 |
-
for entry in new_entries:
|
107 |
-
if entry not in existing_data:
|
108 |
-
existing_data.append(entry)
|
109 |
-
|
110 |
-
with open(toctree_path, 'w', encoding='utf-8') as f:
|
111 |
-
yaml.dump(existing_data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
|
112 |
-
|
113 |
def create_updated_toctree_with_llm(self, en_toctree_yaml: str, ko_toctree_yaml: str, target_local: str) -> dict:
|
114 |
"""Use LLM to create updated Korean toctree with new entry at correct position"""
|
115 |
try:
|
@@ -177,7 +158,7 @@ Example: If English entry is at position [1]['sections'][0] (1st item in section
|
|
177 |
print(f"Error using LLM to create updated toctree: {e}")
|
178 |
return None
|
179 |
|
180 |
-
def process_pr_commit(self,
|
181 |
"""Process PR commit by using LLM to create complete updated Korean toctree"""
|
182 |
# Get filepath without prefix
|
183 |
filepath_without_prefix = filepath.replace("docs/source/en/", "").replace(".md", "")
|
@@ -194,16 +175,12 @@ Example: If English entry is at position [1]['sections'][0] (1st item in section
|
|
194 |
|
195 |
if not updated_ko_toctree:
|
196 |
print(f"Failed to create updated Korean toctree for local: {filepath_without_prefix}")
|
197 |
-
return
|
198 |
|
199 |
print(f"LLM successfully updated Korean toctree")
|
200 |
|
201 |
# Store the updated toctree for commit
|
202 |
self.updated_ko_toctree = updated_ko_toctree
|
203 |
-
|
204 |
-
print(f"Updated Korean toctree has {len(updated_ko_toctree)} items")
|
205 |
-
|
206 |
-
return []
|
207 |
|
208 |
def commit_and_push_toctree(self, pr_agent, owner: str, repo_name: str, branch_name: str):
|
209 |
"""Commit and push toctree updates as a separate commit"""
|
@@ -219,7 +196,7 @@ Example: If English entry is at position [1]['sections'][0] (1st item in section
|
|
219 |
toctree_content = yaml.dump(ko_data, allow_unicode=True, default_flow_style=False, sort_keys=False)
|
220 |
|
221 |
# Create toctree commit message
|
222 |
-
commit_message = "docs: update Korean documentation table of contents
|
223 |
|
224 |
# Commit toctree file
|
225 |
file_result = pr_agent.create_or_update_file(
|
@@ -252,7 +229,6 @@ Example: If English entry is at position [1]['sections'][0] (1st item in section
|
|
252 |
def update_toctree_after_translation(
|
253 |
self,
|
254 |
translation_result: dict,
|
255 |
-
en_title: str,
|
256 |
filepath: str,
|
257 |
pr_agent,
|
258 |
github_config: dict
|
@@ -261,7 +237,6 @@ Example: If English entry is at position [1]['sections'][0] (1st item in section
|
|
261 |
|
262 |
Args:
|
263 |
translation_result: Result from translation PR workflow
|
264 |
-
en_title: English title for toctree mapping
|
265 |
filepath: Original file path
|
266 |
pr_agent: GitHub PR agent instance
|
267 |
github_config: GitHub configuration dictionary
|
@@ -269,28 +244,22 @@ Example: If English entry is at position [1]['sections'][0] (1st item in section
|
|
269 |
Returns:
|
270 |
Dictionary with toctree update result
|
271 |
"""
|
272 |
-
if translation_result["status"] == "error"
|
273 |
return None
|
274 |
|
275 |
try:
|
276 |
-
|
277 |
-
|
278 |
-
# Create new toctree entries
|
279 |
-
new_entries = self.process_pr_commit([en_title], [local_path], filepath)
|
280 |
-
print("self.updated_ko_toctree = updated_ko_toctree:", self.updated_ko_toctree)
|
281 |
# Commit toctree as separate commit
|
282 |
-
|
283 |
-
|
284 |
-
|
285 |
-
|
286 |
-
|
287 |
-
|
288 |
-
|
289 |
-
|
290 |
-
|
291 |
-
# 'commit_message': 'docs: update Korean documentation table of contents'
|
292 |
-
# }
|
293 |
-
|
294 |
except Exception as e:
|
295 |
return {
|
296 |
"status": "error",
|
|
|
90 |
'local': local_file_path,
|
91 |
'title': en_title
|
92 |
}
|
93 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
def create_updated_toctree_with_llm(self, en_toctree_yaml: str, ko_toctree_yaml: str, target_local: str) -> dict:
|
95 |
"""Use LLM to create updated Korean toctree with new entry at correct position"""
|
96 |
try:
|
|
|
158 |
print(f"Error using LLM to create updated toctree: {e}")
|
159 |
return None
|
160 |
|
161 |
+
def process_pr_commit(self, filepath: str):
|
162 |
"""Process PR commit by using LLM to create complete updated Korean toctree"""
|
163 |
# Get filepath without prefix
|
164 |
filepath_without_prefix = filepath.replace("docs/source/en/", "").replace(".md", "")
|
|
|
175 |
|
176 |
if not updated_ko_toctree:
|
177 |
print(f"Failed to create updated Korean toctree for local: {filepath_without_prefix}")
|
178 |
+
return
|
179 |
|
180 |
print(f"LLM successfully updated Korean toctree")
|
181 |
|
182 |
# Store the updated toctree for commit
|
183 |
self.updated_ko_toctree = updated_ko_toctree
|
|
|
|
|
|
|
|
|
184 |
|
185 |
def commit_and_push_toctree(self, pr_agent, owner: str, repo_name: str, branch_name: str):
|
186 |
"""Commit and push toctree updates as a separate commit"""
|
|
|
196 |
toctree_content = yaml.dump(ko_data, allow_unicode=True, default_flow_style=False, sort_keys=False)
|
197 |
|
198 |
# Create toctree commit message
|
199 |
+
commit_message = "docs: update Korean documentation table of contents"
|
200 |
|
201 |
# Commit toctree file
|
202 |
file_result = pr_agent.create_or_update_file(
|
|
|
229 |
def update_toctree_after_translation(
|
230 |
self,
|
231 |
translation_result: dict,
|
|
|
232 |
filepath: str,
|
233 |
pr_agent,
|
234 |
github_config: dict
|
|
|
237 |
|
238 |
Args:
|
239 |
translation_result: Result from translation PR workflow
|
|
|
240 |
filepath: Original file path
|
241 |
pr_agent: GitHub PR agent instance
|
242 |
github_config: GitHub configuration dictionary
|
|
|
244 |
Returns:
|
245 |
Dictionary with toctree update result
|
246 |
"""
|
247 |
+
if translation_result["status"] == "error":
|
248 |
return None
|
249 |
|
250 |
try:
|
251 |
+
# Process toctree update with LLM
|
252 |
+
self.process_pr_commit(filepath)
|
|
|
|
|
|
|
253 |
# Commit toctree as separate commit
|
254 |
+
print("self.updated_ko_toctree:", self.updated_ko_toctree)
|
255 |
+
if self.updated_ko_toctree:
|
256 |
+
return self.commit_and_push_toctree(
|
257 |
+
pr_agent=pr_agent,
|
258 |
+
owner=github_config["owner"],
|
259 |
+
repo_name=github_config["repo_name"],
|
260 |
+
branch_name=translation_result["branch"]
|
261 |
+
)
|
262 |
+
|
|
|
|
|
|
|
263 |
except Exception as e:
|
264 |
return {
|
265 |
"status": "error",
|
agent/workflow.py
CHANGED
@@ -28,27 +28,34 @@ except ImportError as e:
|
|
28 |
def report_translation_target_files(
|
29 |
translate_lang: str, top_k: int = 1
|
30 |
) -> tuple[str, list[list[str]]]:
|
31 |
-
"""Return the top-k files that need translation.
|
32 |
|
33 |
Args:
|
34 |
translate_lang: Target language to translate
|
35 |
top_k: Number of top-first files to return for translation. (Default 1)
|
36 |
"""
|
37 |
-
|
38 |
-
|
|
|
|
|
|
|
39 |
|
|
|
|
|
40 |
|
41 |
-
|
42 |
-
|
|
|
|
|
|
|
43 |
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
"""
|
48 |
-
for i, file in enumerate(docs):
|
49 |
status_report += f"\n{i+1}. `{file}`: {pr_info_list[i]}"
|
50 |
-
status_report += "\n"
|
51 |
-
|
|
|
52 |
|
53 |
|
54 |
def translate_docs(lang: str, file_path: str, additional_instruction: str = "") -> tuple[str, str]:
|
@@ -58,14 +65,14 @@ def translate_docs(lang: str, file_path: str, additional_instruction: str = "")
|
|
58 |
Path(__file__).resolve().parent.parent
|
59 |
/ f"translation_result/{file_path}"
|
60 |
)
|
61 |
-
|
62 |
if translation_file_path.exists():
|
63 |
print(f"π Found existing translation: {translation_file_path}")
|
64 |
with open(translation_file_path, "r", encoding="utf-8") as f:
|
65 |
existing_content = f.read()
|
66 |
if existing_content.strip():
|
67 |
return "Existing translation loaded (no tokens used)", existing_content
|
68 |
-
|
69 |
# step 1. Get content from file path
|
70 |
content = get_content(file_path)
|
71 |
to_translate = preprocess_content(content)
|
@@ -195,9 +202,8 @@ def generate_github_pr(
|
|
195 |
from agent.toctree_handler import TocTreeHandler
|
196 |
toctree_handler = TocTreeHandler()
|
197 |
toctree_result = toctree_handler.update_toctree_after_translation(
|
198 |
-
result,
|
199 |
)
|
200 |
-
print("toctree_result:", toctree_result)
|
201 |
|
202 |
# Process result
|
203 |
# Generate toctree status message (shared for both success and partial_success)
|
|
|
28 |
def report_translation_target_files(
|
29 |
translate_lang: str, top_k: int = 1
|
30 |
) -> tuple[str, list[list[str]]]:
|
31 |
+
"""Return the top-k files that need translation, excluding files already in progress.
|
32 |
|
33 |
Args:
|
34 |
translate_lang: Target language to translate
|
35 |
top_k: Number of top-first files to return for translation. (Default 1)
|
36 |
"""
|
37 |
+
# Get files in progress
|
38 |
+
docs_in_progress, pr_info_list = get_github_issue_open_pr(translate_lang)
|
39 |
+
|
40 |
+
# Get all available files for translation
|
41 |
+
all_status_report, all_filepath_list = report(translate_lang, top_k * 2) # Get more to account for filtering
|
42 |
|
43 |
+
# Filter out files that are already in progress
|
44 |
+
available_files = [f for f in all_filepath_list if f not in docs_in_progress]
|
45 |
|
46 |
+
# Take only the requested number
|
47 |
+
filepath_list = available_files[:top_k]
|
48 |
+
|
49 |
+
# Build combined status report
|
50 |
+
status_report = all_status_report
|
51 |
|
52 |
+
if docs_in_progress:
|
53 |
+
status_report += f"\n\nπ€ Found {len(docs_in_progress)} files in progress for translation:"
|
54 |
+
for i, file in enumerate(docs_in_progress):
|
|
|
|
|
55 |
status_report += f"\n{i+1}. `{file}`: {pr_info_list[i]}"
|
56 |
+
status_report += f"\n\nπ Showing {len(filepath_list)} available files (excluding in-progress):"
|
57 |
+
|
58 |
+
return status_report, [[file] for file in filepath_list]
|
59 |
|
60 |
|
61 |
def translate_docs(lang: str, file_path: str, additional_instruction: str = "") -> tuple[str, str]:
|
|
|
65 |
Path(__file__).resolve().parent.parent
|
66 |
/ f"translation_result/{file_path}"
|
67 |
)
|
68 |
+
|
69 |
if translation_file_path.exists():
|
70 |
print(f"π Found existing translation: {translation_file_path}")
|
71 |
with open(translation_file_path, "r", encoding="utf-8") as f:
|
72 |
existing_content = f.read()
|
73 |
if existing_content.strip():
|
74 |
return "Existing translation loaded (no tokens used)", existing_content
|
75 |
+
|
76 |
# step 1. Get content from file path
|
77 |
content = get_content(file_path)
|
78 |
to_translate = preprocess_content(content)
|
|
|
202 |
from agent.toctree_handler import TocTreeHandler
|
203 |
toctree_handler = TocTreeHandler()
|
204 |
toctree_result = toctree_handler.update_toctree_after_translation(
|
205 |
+
result, filepath, agent, github_config
|
206 |
)
|
|
|
207 |
|
208 |
# Process result
|
209 |
# Generate toctree status message (shared for both success and partial_success)
|
pr_generator/agent.py
CHANGED
@@ -94,10 +94,17 @@ class GitHubPRAgent:
|
|
94 |
if existing_pr:
|
95 |
return f"ERROR: {existing_pr}"
|
96 |
|
97 |
-
# 3. Verify head
|
98 |
repo = self.github_client.get_repo(f"{owner}/{repo_name}")
|
99 |
try:
|
100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
101 |
base_branch = repo.get_branch(base)
|
102 |
|
103 |
# 4. Check if head and base branches point to the same commit
|
@@ -159,7 +166,9 @@ class GitHubPRAgent:
|
|
159 |
"""Check if there's an existing PR with the same head and base."""
|
160 |
try:
|
161 |
repo = self.github_client.get_repo(f"{owner}/{repo_name}")
|
162 |
-
|
|
|
|
|
163 |
for pr in pulls:
|
164 |
return f"Existing PR found: {pr.html_url}"
|
165 |
return None
|
@@ -448,12 +457,12 @@ Please return only the commit message. No other explanation is needed."""
|
|
448 |
pr_analysis["head_branch"], target_language, file_name
|
449 |
)
|
450 |
|
451 |
-
# 3. Get main branch SHA and create branch
|
452 |
-
|
453 |
-
main_branch =
|
454 |
main_sha = main_branch.commit.sha
|
455 |
|
456 |
-
print(f"🌿 Creating branch: {branch_name}")
|
457 |
branch_result = self.create_branch(owner, repo_name, branch_name, main_sha)
|
458 |
|
459 |
# Check branch creation result
|
@@ -466,8 +475,11 @@ Please return only the commit message. No other explanation is needed."""
|
|
466 |
elif branch_result.startswith("WARNING"):
|
467 |
print(f"⚠️ {branch_result}")
|
468 |
# Continue if branch already exists
|
|
|
|
|
469 |
else:
|
470 |
-
print(f"{branch_result}")
|
|
|
471 |
|
472 |
# 4. Generate commit message and save file
|
473 |
commit_messages = [commit["message"] for commit in pr_analysis["commits"]]
|
@@ -506,10 +518,11 @@ Please return only the commit message. No other explanation is needed."""
|
|
506 |
)
|
507 |
|
508 |
print(f"π Creating PR: {pr_title}")
|
509 |
-
print(f" Head: {branch_name} → Base: {base_branch}")
|
510 |
|
|
|
511 |
pr_result = self.create_pull_request(
|
512 |
-
|
513 |
)
|
514 |
|
515 |
if pr_result.startswith("ERROR"):
|
|
|
94 |
if existing_pr:
|
95 |
return f"ERROR: {existing_pr}"
|
96 |
|
97 |
+
# 3. Verify head and base branches exist
|
98 |
repo = self.github_client.get_repo(f"{owner}/{repo_name}")
|
99 |
try:
|
100 |
+
# For fork-to-upstream PR, head format is "fork_owner:branch_name"
|
101 |
+
if ":" in head:
|
102 |
+
fork_owner, branch_name = head.split(":", 1)
|
103 |
+
fork_repo = self.github_client.get_repo(f"{fork_owner}/{repo_name}")
|
104 |
+
head_branch = fork_repo.get_branch(branch_name)
|
105 |
+
else:
|
106 |
+
head_branch = repo.get_branch(head)
|
107 |
+
|
108 |
base_branch = repo.get_branch(base)
|
109 |
|
110 |
# 4. Check if head and base branches point to the same commit
|
|
|
166 |
"""Check if there's an existing PR with the same head and base."""
|
167 |
try:
|
168 |
repo = self.github_client.get_repo(f"{owner}/{repo_name}")
|
169 |
+
# For head parameter, use exactly what was passed (could be "fork_owner:branch" or just "branch")
|
170 |
+
search_head = head if ":" in head else f"{owner}:{head}"
|
171 |
+
pulls = repo.get_pulls(state="open", head=search_head, base=base)
|
172 |
for pr in pulls:
|
173 |
return f"Existing PR found: {pr.html_url}"
|
174 |
return None
|
|
|
457 |
pr_analysis["head_branch"], target_language, file_name
|
458 |
)
|
459 |
|
460 |
+
# 3. Get main branch SHA from upstream and create branch in fork
|
461 |
+
upstream_repo = self.github_client.get_repo(f"huggingface/{repo_name}")
|
462 |
+
main_branch = upstream_repo.get_branch(base_branch)
|
463 |
main_sha = main_branch.commit.sha
|
464 |
|
465 |
+
print(f"🌿 Creating branch: {branch_name} in fork repository")
|
466 |
branch_result = self.create_branch(owner, repo_name, branch_name, main_sha)
|
467 |
|
468 |
# Check branch creation result
|
|
|
475 |
elif branch_result.startswith("WARNING"):
|
476 |
print(f"⚠️ {branch_result}")
|
477 |
# Continue if branch already exists
|
478 |
+
elif branch_result.startswith("SUCCESS"):
|
479 |
+
print(f"✅ {branch_result}")
|
480 |
else:
|
481 |
+
print(f"⚠️ Unexpected branch creation result: {branch_result}")
|
482 |
+
# Continue anyway, might still work
|
483 |
|
484 |
# 4. Generate commit message and save file
|
485 |
commit_messages = [commit["message"] for commit in pr_analysis["commits"]]
|
|
|
518 |
)
|
519 |
|
520 |
print(f"π Creating PR: {pr_title}")
|
521 |
+
print(f" Head: {owner}:{branch_name} → Base: huggingface:{base_branch}")
|
522 |
|
523 |
+
# Create PR from fork to upstream repository
|
524 |
pr_result = self.create_pull_request(
|
525 |
+
"huggingface", "transformers", pr_title, f"{owner}:{branch_name}", base_branch, pr_body
|
526 |
)
|
527 |
|
528 |
if pr_result.startswith("ERROR"):
|