wony617 committed on
Commit
8957aec
·
1 Parent(s): 10adc15

Fix translation doc finder

Browse files
agent/handler.py CHANGED
@@ -8,7 +8,6 @@ import gradio as gr
8
 
9
  from agent.workflow import (
10
  report_translation_target_files,
11
- report_in_translation_status_files,
12
  translate_docs_interactive,
13
  generate_github_pr,
14
  )
@@ -73,11 +72,8 @@ def process_file_search_handler(lang: str, k: int, history: list) -> tuple:
73
  state.step = "find_files"
74
 
75
  status_report, files_list = report_translation_target_files(lang, k)
76
- in_progress_status_report, in_progress_docs = report_in_translation_status_files(
77
- lang
78
- )
79
  state.files_to_translate = (
80
- [file[0] for file in files_list if file[0] not in in_progress_docs]
81
  if files_list
82
  else []
83
  )
@@ -86,7 +82,7 @@ def process_file_search_handler(lang: str, k: int, history: list) -> tuple:
86
 
87
  **Status Report:**
88
  {status_report}
89
- {in_progress_status_report}
90
  **πŸ“ Found first {len(state.files_to_translate)} files to translate:**
91
  """
92
 
 
8
 
9
  from agent.workflow import (
10
  report_translation_target_files,
 
11
  translate_docs_interactive,
12
  generate_github_pr,
13
  )
 
72
  state.step = "find_files"
73
 
74
  status_report, files_list = report_translation_target_files(lang, k)
 
 
 
75
  state.files_to_translate = (
76
+ [file[0] for file in files_list]
77
  if files_list
78
  else []
79
  )
 
82
 
83
  **Status Report:**
84
  {status_report}
85
+
86
  **πŸ“ Found first {len(state.files_to_translate)} files to translate:**
87
  """
88
 
agent/toctree_handler.py CHANGED
@@ -90,26 +90,7 @@ Korean title:"""
90
  'local': local_file_path,
91
  'title': en_title
92
  }
93
-
94
- def update_local_toctree_file(self, new_entries: List[Dict[str, str]]):
95
- """Update or create local _toctree.yml file"""
96
- toctree_path = os.path.join(self.local_docs_path, "_toctree.yml")
97
-
98
- os.makedirs(self.local_docs_path, exist_ok=True)
99
-
100
- if os.path.exists(toctree_path):
101
- with open(toctree_path, 'r', encoding='utf-8') as f:
102
- existing_data = yaml.safe_load(f) or []
103
- else:
104
- existing_data = []
105
-
106
- for entry in new_entries:
107
- if entry not in existing_data:
108
- existing_data.append(entry)
109
-
110
- with open(toctree_path, 'w', encoding='utf-8') as f:
111
- yaml.dump(existing_data, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
112
-
113
  def create_updated_toctree_with_llm(self, en_toctree_yaml: str, ko_toctree_yaml: str, target_local: str) -> dict:
114
  """Use LLM to create updated Korean toctree with new entry at correct position"""
115
  try:
@@ -177,7 +158,7 @@ Example: If English entry is at position [1]['sections'][0] (1st item in section
177
  print(f"Error using LLM to create updated toctree: {e}")
178
  return None
179
 
180
- def process_pr_commit(self, en_titles: List[str], local_paths: List[str], filepath: str):
181
  """Process PR commit by using LLM to create complete updated Korean toctree"""
182
  # Get filepath without prefix
183
  filepath_without_prefix = filepath.replace("docs/source/en/", "").replace(".md", "")
@@ -194,16 +175,12 @@ Example: If English entry is at position [1]['sections'][0] (1st item in section
194
 
195
  if not updated_ko_toctree:
196
  print(f"Failed to create updated Korean toctree for local: {filepath_without_prefix}")
197
- return []
198
 
199
  print(f"LLM successfully updated Korean toctree")
200
 
201
  # Store the updated toctree for commit
202
  self.updated_ko_toctree = updated_ko_toctree
203
-
204
- print(f"Updated Korean toctree has {len(updated_ko_toctree)} items")
205
-
206
- return []
207
 
208
  def commit_and_push_toctree(self, pr_agent, owner: str, repo_name: str, branch_name: str):
209
  """Commit and push toctree updates as a separate commit"""
@@ -219,7 +196,7 @@ Example: If English entry is at position [1]['sections'][0] (1st item in section
219
  toctree_content = yaml.dump(ko_data, allow_unicode=True, default_flow_style=False, sort_keys=False)
220
 
221
  # Create toctree commit message
222
- commit_message = "docs: update Korean documentation table of contents - test"
223
 
224
  # Commit toctree file
225
  file_result = pr_agent.create_or_update_file(
@@ -252,7 +229,6 @@ Example: If English entry is at position [1]['sections'][0] (1st item in section
252
  def update_toctree_after_translation(
253
  self,
254
  translation_result: dict,
255
- en_title: str,
256
  filepath: str,
257
  pr_agent,
258
  github_config: dict
@@ -261,7 +237,6 @@ Example: If English entry is at position [1]['sections'][0] (1st item in section
261
 
262
  Args:
263
  translation_result: Result from translation PR workflow
264
- en_title: English title for toctree mapping
265
  filepath: Original file path
266
  pr_agent: GitHub PR agent instance
267
  github_config: GitHub configuration dictionary
@@ -269,28 +244,22 @@ Example: If English entry is at position [1]['sections'][0] (1st item in section
269
  Returns:
270
  Dictionary with toctree update result
271
  """
272
- if translation_result["status"] == "error" or not en_title:
273
  return None
274
 
275
  try:
276
- local_path = filepath.split("/")[-1].replace(".md", "")
277
-
278
- # Create new toctree entries
279
- new_entries = self.process_pr_commit([en_title], [local_path], filepath)
280
- print("self.updated_ko_toctree = updated_ko_toctree:", self.updated_ko_toctree)
281
  # Commit toctree as separate commit
282
- return self.commit_and_push_toctree(
283
- pr_agent=pr_agent,
284
- owner=github_config["owner"],
285
- repo_name=github_config["repo_name"],
286
- branch_name=translation_result["branch"]
287
- )
288
- # return {
289
- # 'status': 'success',
290
- # 'message': 'Toctree committed successfully: SUCCESS: File updated - docs/source/ko/_toctree.yml',
291
- # 'commit_message': 'docs: update Korean documentation table of contents'
292
- # }
293
-
294
  except Exception as e:
295
  return {
296
  "status": "error",
 
90
  'local': local_file_path,
91
  'title': en_title
92
  }
93
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  def create_updated_toctree_with_llm(self, en_toctree_yaml: str, ko_toctree_yaml: str, target_local: str) -> dict:
95
  """Use LLM to create updated Korean toctree with new entry at correct position"""
96
  try:
 
158
  print(f"Error using LLM to create updated toctree: {e}")
159
  return None
160
 
161
+ def process_pr_commit(self, filepath: str):
162
  """Process PR commit by using LLM to create complete updated Korean toctree"""
163
  # Get filepath without prefix
164
  filepath_without_prefix = filepath.replace("docs/source/en/", "").replace(".md", "")
 
175
 
176
  if not updated_ko_toctree:
177
  print(f"Failed to create updated Korean toctree for local: {filepath_without_prefix}")
178
+ return
179
 
180
  print(f"LLM successfully updated Korean toctree")
181
 
182
  # Store the updated toctree for commit
183
  self.updated_ko_toctree = updated_ko_toctree
 
 
 
 
184
 
185
  def commit_and_push_toctree(self, pr_agent, owner: str, repo_name: str, branch_name: str):
186
  """Commit and push toctree updates as a separate commit"""
 
196
  toctree_content = yaml.dump(ko_data, allow_unicode=True, default_flow_style=False, sort_keys=False)
197
 
198
  # Create toctree commit message
199
+ commit_message = "docs: update Korean documentation table of contents"
200
 
201
  # Commit toctree file
202
  file_result = pr_agent.create_or_update_file(
 
229
  def update_toctree_after_translation(
230
  self,
231
  translation_result: dict,
 
232
  filepath: str,
233
  pr_agent,
234
  github_config: dict
 
237
 
238
  Args:
239
  translation_result: Result from translation PR workflow
 
240
  filepath: Original file path
241
  pr_agent: GitHub PR agent instance
242
  github_config: GitHub configuration dictionary
 
244
  Returns:
245
  Dictionary with toctree update result
246
  """
247
+ if translation_result["status"] == "error":
248
  return None
249
 
250
  try:
251
+ # Process toctree update with LLM
252
+ self.process_pr_commit(filepath)
 
 
 
253
  # Commit toctree as separate commit
254
+ print("self.updated_ko_toctree:", self.updated_ko_toctree:)
255
+ if self.updated_ko_toctree:
256
+ return self.commit_and_push_toctree(
257
+ pr_agent=pr_agent,
258
+ owner=github_config["owner"],
259
+ repo_name=github_config["repo_name"],
260
+ branch_name=translation_result["branch"]
261
+ )
262
+
 
 
 
263
  except Exception as e:
264
  return {
265
  "status": "error",
agent/workflow.py CHANGED
@@ -28,27 +28,34 @@ except ImportError as e:
28
  def report_translation_target_files(
29
  translate_lang: str, top_k: int = 1
30
  ) -> tuple[str, list[list[str]]]:
31
- """Return the top-k files that need translation.
32
 
33
  Args:
34
  translate_lang: Target language to translate
35
  top_k: Number of top-first files to return for translation. (Default 1)
36
  """
37
- status_report, filepath_list = report(translate_lang, top_k)
38
- return status_report, [[file] for file in filepath_list]
 
 
 
39
 
 
 
40
 
41
- def report_in_translation_status_files(translate_lang: str) -> tuple[str, list[str]]:
42
- docs, pr_info_list = get_github_issue_open_pr(translate_lang)
 
 
 
43
 
44
- status_report = ""
45
- if docs:
46
- status_report = f"""\n🤖 Found {len(docs)} in progress for translation.
47
- """
48
- for i, file in enumerate(docs):
49
  status_report += f"\n{i+1}. `{file}`: {pr_info_list[i]}"
50
- status_report += "\n"
51
- return status_report, docs
 
52
 
53
 
54
  def translate_docs(lang: str, file_path: str, additional_instruction: str = "") -> tuple[str, str]:
@@ -58,14 +65,14 @@ def translate_docs(lang: str, file_path: str, additional_instruction: str = "")
58
  Path(__file__).resolve().parent.parent
59
  / f"translation_result/{file_path}"
60
  )
61
-
62
  if translation_file_path.exists():
63
  print(f"📄 Found existing translation: {translation_file_path}")
64
  with open(translation_file_path, "r", encoding="utf-8") as f:
65
  existing_content = f.read()
66
  if existing_content.strip():
67
  return "Existing translation loaded (no tokens used)", existing_content
68
-
69
  # step 1. Get content from file path
70
  content = get_content(file_path)
71
  to_translate = preprocess_content(content)
@@ -195,9 +202,8 @@ def generate_github_pr(
195
  from agent.toctree_handler import TocTreeHandler
196
  toctree_handler = TocTreeHandler()
197
  toctree_result = toctree_handler.update_toctree_after_translation(
198
- result, en_title, filepath, agent, github_config
199
  )
200
- print("toctree_result:", toctree_result)
201
 
202
  # Process result
203
  # Generate toctree status message (shared for both success and partial_success)
 
28
  def report_translation_target_files(
29
  translate_lang: str, top_k: int = 1
30
  ) -> tuple[str, list[list[str]]]:
31
+ """Return the top-k files that need translation, excluding files already in progress.
32
 
33
  Args:
34
  translate_lang: Target language to translate
35
  top_k: Number of top-first files to return for translation. (Default 1)
36
  """
37
+ # Get files in progress
38
+ docs_in_progress, pr_info_list = get_github_issue_open_pr(translate_lang)
39
+
40
+ # Get all available files for translation
41
+ all_status_report, all_filepath_list = report(translate_lang, top_k * 2) # Get more to account for filtering
42
 
43
+ # Filter out files that are already in progress
44
+ available_files = [f for f in all_filepath_list if f not in docs_in_progress]
45
 
46
+ # Take only the requested number
47
+ filepath_list = available_files[:top_k]
48
+
49
+ # Build combined status report
50
+ status_report = all_status_report
51
 
52
+ if docs_in_progress:
53
+ status_report += f"\n\n🤖 Found {len(docs_in_progress)} files in progress for translation:"
54
+ for i, file in enumerate(docs_in_progress):
 
 
55
  status_report += f"\n{i+1}. `{file}`: {pr_info_list[i]}"
56
+ status_report += f"\n\n📋 Showing {len(filepath_list)} available files (excluding in-progress):"
57
+
58
+ return status_report, [[file] for file in filepath_list]
59
 
60
 
61
  def translate_docs(lang: str, file_path: str, additional_instruction: str = "") -> tuple[str, str]:
 
65
  Path(__file__).resolve().parent.parent
66
  / f"translation_result/{file_path}"
67
  )
68
+
69
  if translation_file_path.exists():
70
  print(f"📄 Found existing translation: {translation_file_path}")
71
  with open(translation_file_path, "r", encoding="utf-8") as f:
72
  existing_content = f.read()
73
  if existing_content.strip():
74
  return "Existing translation loaded (no tokens used)", existing_content
75
+
76
  # step 1. Get content from file path
77
  content = get_content(file_path)
78
  to_translate = preprocess_content(content)
 
202
  from agent.toctree_handler import TocTreeHandler
203
  toctree_handler = TocTreeHandler()
204
  toctree_result = toctree_handler.update_toctree_after_translation(
205
+ result, filepath, agent, github_config
206
  )
 
207
 
208
  # Process result
209
  # Generate toctree status message (shared for both success and partial_success)
pr_generator/agent.py CHANGED
@@ -94,10 +94,17 @@ class GitHubPRAgent:
94
  if existing_pr:
95
  return f"ERROR: {existing_pr}"
96
 
97
- # 3. Verify head branch exists
98
  repo = self.github_client.get_repo(f"{owner}/{repo_name}")
99
  try:
100
- head_branch = repo.get_branch(head)
 
 
 
 
 
 
 
101
  base_branch = repo.get_branch(base)
102
 
103
  # 4. Check if head and base branches point to the same commit
@@ -159,7 +166,9 @@ class GitHubPRAgent:
159
  """Check if there's an existing PR with the same head and base."""
160
  try:
161
  repo = self.github_client.get_repo(f"{owner}/{repo_name}")
162
- pulls = repo.get_pulls(state="open", head=f"{owner}:{head}", base=base)
 
 
163
  for pr in pulls:
164
  return f"Existing PR found: {pr.html_url}"
165
  return None
@@ -448,12 +457,12 @@ Please return only the commit message. No other explanation is needed."""
448
  pr_analysis["head_branch"], target_language, file_name
449
  )
450
 
451
- # 3. Get main branch SHA and create branch
452
- repo = self.github_client.get_repo(f"{owner}/{repo_name}")
453
- main_branch = repo.get_branch(base_branch)
454
  main_sha = main_branch.commit.sha
455
 
456
- print(f"🌿 Creating branch: {branch_name}")
457
  branch_result = self.create_branch(owner, repo_name, branch_name, main_sha)
458
 
459
  # Check branch creation result
@@ -466,8 +475,11 @@ Please return only the commit message. No other explanation is needed."""
466
  elif branch_result.startswith("WARNING"):
467
  print(f"⚠️ {branch_result}")
468
  # Continue if branch already exists
 
 
469
  else:
470
- print(f"{branch_result}")
 
471
 
472
  # 4. Generate commit message and save file
473
  commit_messages = [commit["message"] for commit in pr_analysis["commits"]]
@@ -506,10 +518,11 @@ Please return only the commit message. No other explanation is needed."""
506
  )
507
 
508
  print(f"🔄 Creating PR: {pr_title}")
509
- print(f" Head: {branch_name} → Base: {base_branch}")
510
 
 
511
  pr_result = self.create_pull_request(
512
- owner, repo_name, pr_title, branch_name, base_branch, pr_body
513
  )
514
 
515
  if pr_result.startswith("ERROR"):
 
94
  if existing_pr:
95
  return f"ERROR: {existing_pr}"
96
 
97
+ # 3. Verify head and base branches exist
98
  repo = self.github_client.get_repo(f"{owner}/{repo_name}")
99
  try:
100
+ # For fork-to-upstream PR, head format is "fork_owner:branch_name"
101
+ if ":" in head:
102
+ fork_owner, branch_name = head.split(":", 1)
103
+ fork_repo = self.github_client.get_repo(f"{fork_owner}/{repo_name}")
104
+ head_branch = fork_repo.get_branch(branch_name)
105
+ else:
106
+ head_branch = repo.get_branch(head)
107
+
108
  base_branch = repo.get_branch(base)
109
 
110
  # 4. Check if head and base branches point to the same commit
 
166
  """Check if there's an existing PR with the same head and base."""
167
  try:
168
  repo = self.github_client.get_repo(f"{owner}/{repo_name}")
169
+ # For head parameter, use exactly what was passed (could be "fork_owner:branch" or just "branch")
170
+ search_head = head if ":" in head else f"{owner}:{head}"
171
+ pulls = repo.get_pulls(state="open", head=search_head, base=base)
172
  for pr in pulls:
173
  return f"Existing PR found: {pr.html_url}"
174
  return None
 
457
  pr_analysis["head_branch"], target_language, file_name
458
  )
459
 
460
+ # 3. Get main branch SHA from upstream and create branch in fork
461
+ upstream_repo = self.github_client.get_repo(f"huggingface/{repo_name}")
462
+ main_branch = upstream_repo.get_branch(base_branch)
463
  main_sha = main_branch.commit.sha
464
 
465
+ print(f"🌿 Creating branch: {branch_name} in fork repository")
466
  branch_result = self.create_branch(owner, repo_name, branch_name, main_sha)
467
 
468
  # Check branch creation result
 
475
  elif branch_result.startswith("WARNING"):
476
  print(f"⚠️ {branch_result}")
477
  # Continue if branch already exists
478
+ elif branch_result.startswith("SUCCESS"):
479
+ print(f"✅ {branch_result}")
480
  else:
481
+ print(f"⚠️ Unexpected branch creation result: {branch_result}")
482
+ # Continue anyway, might still work
483
 
484
  # 4. Generate commit message and save file
485
  commit_messages = [commit["message"] for commit in pr_analysis["commits"]]
 
518
  )
519
 
520
  print(f"🔄 Creating PR: {pr_title}")
521
+ print(f" Head: {owner}:{branch_name} → Base: huggingface:{base_branch}")
522
 
523
+ # Create PR from fork to upstream repository
524
  pr_result = self.create_pull_request(
525
+ "huggingface", "transformers", pr_title, f"{owner}:{branch_name}", base_branch, pr_body
526
  )
527
 
528
  if pr_result.startswith("ERROR"):