bakrianoo committed on
Commit
c065ba1
·
1 Parent(s): e39d0f6

translate xml from wikipedia

Browse files
app.py CHANGED
@@ -44,7 +44,7 @@ def extract_wikipedia_content(wiki_url, api_key, model_id, base_url, target_lang
44
 
45
  def translate_content(content, article_title, artice_summary, target_lang, api_key, model_id, base_url):
46
 
47
- llm_client = init_llm_client(api_key, model_id, base_url)
48
 
49
  translation_prompt = get_translate_prompt(
50
  article_title=article_title,
@@ -53,22 +53,39 @@ def translate_content(content, article_title, artice_summary, target_lang, api_k
53
  target_lang=target_lang
54
  )
55
 
56
- # Call the LLM to get the translation
57
- response = llm_client.responses.create(
 
58
  messages=[
59
  {"role": "user", "content": translation_prompt}
60
  ],
61
- model=model_id,
62
  max_tokens=2000,
63
  temperature=0.5
64
  )
65
 
66
- decoded_object = json_repair.loads(response.choices[0].message['content'])
67
  if 'output_content' in decoded_object:
68
  return decoded_object['output_content']
69
 
70
  return "Error: Translation output not found in the response."
71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  def update_ui_with_sections(sections_dict):
73
  """
74
  Creates a list of components to display in the sections area
@@ -76,20 +93,46 @@ def update_ui_with_sections(sections_dict):
76
  components = []
77
 
78
  if not sections_dict:
79
- return [gr.update(visible=False) for _ in range(10)] # Assuming max 10 sections
 
 
 
 
 
 
 
 
80
 
81
  # Create visible components for available sections
82
  for section_name, section_content in sections_dict.items():
 
83
  components.append(gr.update(
84
  value=section_content,
85
  label=f"Section: {section_name}",
86
  visible=True
87
  ))
 
 
 
 
 
 
 
 
 
 
 
 
 
88
 
89
  # Hide any unused components
90
- remaining = 100 - len(components) # Assuming max 100 sections
91
  for _ in range(remaining):
92
- components.append(gr.update(visible=False))
 
 
 
 
93
 
94
  return components
95
 
@@ -115,6 +158,7 @@ with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
115
  model_id = gr.Textbox(
116
  label="OpenAI Model ID",
117
  placeholder="gpt-4.1-mini",
 
118
  )
119
 
120
  base_url = gr.Textbox(
@@ -125,7 +169,7 @@ with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
125
 
126
  target_language = gr.Dropdown(
127
  choices=list(LANGUAGES.keys()),
128
- value="Spanish",
129
  label="Target Language",
130
  )
131
 
@@ -178,13 +222,31 @@ with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
178
  visible=False # Hidden by default as it's usually large
179
  )
180
 
181
- # Pre-define section textboxes (limit to 100 for simplicity)
182
  gr.Markdown("### Article Sections")
183
  with gr.Column() as sections_container:
184
- section_textboxes = [
185
- gr.Textbox(visible=False, lines=4)
186
- for _ in range(100) # Support up to 100 sections
187
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
188
 
189
  # Connect the extract button to the function
190
  extract_button.click(
@@ -201,7 +263,7 @@ with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
201
  ).then(
202
  fn=update_ui_with_sections,
203
  inputs=[sections_state],
204
- outputs=section_textboxes
205
  )
206
 
207
  # Launch the app
 
44
 
45
  def translate_content(content, article_title, artice_summary, target_lang, api_key, model_id, base_url):
46
 
47
+ llm_client = init_llm_client(api_key, base_url=base_url)
48
 
49
  translation_prompt = get_translate_prompt(
50
  article_title=article_title,
 
53
  target_lang=target_lang
54
  )
55
 
56
+ # Call the LLM to get the translation - updating params to match OpenAI's requirements
57
+ response = llm_client.chat.completions.create(
58
+ model=model_id,
59
  messages=[
60
  {"role": "user", "content": translation_prompt}
61
  ],
 
62
  max_tokens=2000,
63
  temperature=0.5
64
  )
65
 
66
+ decoded_object = json_repair.loads(response.choices[0].message.content)
67
  if 'output_content' in decoded_object:
68
  return decoded_object['output_content']
69
 
70
  return "Error: Translation output not found in the response."
71
 
72
def translate_section(section_content, article_title, article_summary, target_lang, api_key, model_id, base_url):
    """
    Translate one Wikipedia article section via the configured LLM.

    Returns the translated text, or a human-readable error string when
    either the section content or the API key is missing.
    """
    inputs_missing = not section_content or not api_key
    if inputs_missing:
        return "Please provide content and API key for translation."

    # Delegate to the full-content translator. NOTE: the downstream
    # keyword is (mis)spelled `artice_summary`; keep it to match.
    return translate_content(
        content=section_content,
        article_title=article_title,
        artice_summary=article_summary,
        target_lang=target_lang,
        api_key=api_key,
        model_id=model_id,
        base_url=base_url,
    )
88
+
89
def update_ui_with_sections(sections_dict):
    """
    Build the flat list of gr.update() objects for the pre-created section UI.

    The UI pre-creates 100 section slots, each with three components —
    content textbox, translate button, translation output — so this returns
    one update per component, in (textbox, button, output) order per slot:
    visible updates for each entry in `sections_dict`, hidden updates for
    every unused slot.

    Args:
        sections_dict: mapping of section name -> section content; may be
            empty or None when extraction produced nothing.

    Returns:
        List of gr.update(...) values, length 3 * 100 when at most 100
        sections are supplied (the UI assumes max 100 slots).
    """
    max_slots = 100  # must match the number of slots created in the UI layout

    def _hidden_slot():
        # One fully-hidden (textbox, button, output) triple for an unused slot.
        return [
            gr.update(visible=False),  # section textbox
            gr.update(visible=False),  # translate button
            gr.update(visible=False),  # translation output
        ]

    # Treat None the same as "no sections"; the generic path below then
    # yields 300 hidden updates, which is what the old special-case
    # early-return branch produced by hand.
    sections = sections_dict or {}

    components = []
    for section_name, section_content in sections.items():
        # Section content textbox.
        components.append(gr.update(
            value=section_content,
            label=f"Section: {section_name}",
            visible=True
        ))
        # Per-section translate button.
        components.append(gr.update(
            visible=True,
            value=f"Translate {section_name}"
        ))
        # Translation output, cleared for the new content.
        components.append(gr.update(
            visible=True,
            value="",
            label=f"Translation: {section_name}"
        ))

    # Hide any unused slots (range() is empty if sections exceed max_slots,
    # matching the previous behavior).
    for _ in range(max_slots - len(sections)):
        components.extend(_hidden_slot())

    return components
138
 
 
158
  model_id = gr.Textbox(
159
  label="OpenAI Model ID",
160
  placeholder="gpt-4.1-mini",
161
+ value="gpt-4.1-mini",
162
  )
163
 
164
  base_url = gr.Textbox(
 
169
 
170
  target_language = gr.Dropdown(
171
  choices=list(LANGUAGES.keys()),
172
+ value="Arabic",
173
  label="Target Language",
174
  )
175
 
 
222
  visible=False # Hidden by default as it's usually large
223
  )
224
 
225
+ # Pre-define section textboxes and related components
226
  gr.Markdown("### Article Sections")
227
  with gr.Column() as sections_container:
228
+ section_components = []
229
+ for i in range(100): # Support up to 100 sections
230
+ with gr.Row():
231
+ section_textbox = gr.Textbox(visible=False, lines=4)
232
+ translate_btn = gr.Button("Translate", visible=False)
233
+ translation_output = gr.Textbox(visible=False, lines=4)
234
+ section_components.extend([section_textbox, translate_btn, translation_output])
235
+
236
+ # Connect the translate button to the translation function
237
+ translate_btn.click(
238
+ fn=translate_section,
239
+ inputs=[
240
+ section_textbox,
241
+ article_title,
242
+ aticle_summary,
243
+ target_language,
244
+ api_key,
245
+ model_id,
246
+ base_url
247
+ ],
248
+ outputs=translation_output
249
+ )
250
 
251
  # Connect the extract button to the function
252
  extract_button.click(
 
263
  ).then(
264
  fn=update_ui_with_sections,
265
  inputs=[sections_state],
266
+ outputs=section_components
267
  )
268
 
269
  # Launch the app
utils/__pycache__/llm_parser.cpython-310.pyc CHANGED
Binary files a/utils/__pycache__/llm_parser.cpython-310.pyc and b/utils/__pycache__/llm_parser.cpython-310.pyc differ
 
utils/__pycache__/llm_prompts.cpython-310.pyc CHANGED
Binary files a/utils/__pycache__/llm_prompts.cpython-310.pyc and b/utils/__pycache__/llm_prompts.cpython-310.pyc differ
 
utils/__pycache__/wikipedia_extractor.cpython-310.pyc CHANGED
Binary files a/utils/__pycache__/wikipedia_extractor.cpython-310.pyc and b/utils/__pycache__/wikipedia_extractor.cpython-310.pyc differ
 
utils/llm_parser.py CHANGED
@@ -1,15 +1,18 @@
1
  import os
2
  from openai import OpenAI
 
3
 
4
-
5
- def init_llm_client(api_key, model_id, base_url=None):
6
  """
7
  Initialize the OpenAI client with the provided API key and model ID.
8
  """
9
- if base_url:
 
10
  os.environ["OPENAI_API_BASE"] = base_url
 
 
 
11
  os.environ["OPENAI_API_KEY"] = api_key
12
- os.environ["OPENAI_MODEL_ID"] = model_id
13
 
14
- return OpenAI(api_key=api_key, model_id=model_id, base_url=base_url)
15
 
 
1
  import os
2
  from openai import OpenAI
3
+ import urllib
4
 
5
def init_llm_client(api_key, base_url=None):
    """
    Initialize and return an OpenAI client.

    Args:
        api_key: OpenAI-compatible API key; also exported as
            OPENAI_API_KEY for libraries that read the environment.
        base_url: optional API endpoint. Only honoured when it looks like
            an http(s) URL; anything else (e.g. the empty string a blank
            Gradio textbox submits) falls back to the default endpoint.

    Returns:
        openai.OpenAI: a configured client instance.
    """
    # A blank textbox arrives as "" — truthiness already rejects it, so the
    # old `len(base_url) > 0` check was redundant; only accept real URLs.
    if base_url and base_url.startswith("http"):
        os.environ["OPENAI_API_BASE"] = base_url
    else:
        base_url = None

    os.environ["OPENAI_API_KEY"] = api_key

    return OpenAI(api_key=api_key, base_url=base_url)
18
 
utils/llm_prompts.py CHANGED
@@ -35,11 +35,11 @@ def get_translate_prompt(article_title, artice_summary, original_content, target
35
 
36
  "# Output Format\n"
37
  "Return a single JSON object with the following structure:\n"
38
- "```json\n" +
39
- json.dumps({
40
- "output_content": "The complete translated XML content with all tags preserved",
41
- }, indent=4, ensure_ascii=False) +
42
- "\n```\n\n"
43
 
44
  "# Translation Quality Guidelines\n"
45
  "- Accuracy: Ensure factual information is preserved exactly\n"
 
35
 
36
  "# Output Format\n"
37
  "Return a single JSON object with the following structure:\n"
38
+ "```json\n"
39
+ "{{\n"
40
+ " \"output_content\": \"The complete translated XML content with all tags preserved\"\n"
41
+ "}}\n"
42
+ "```\n\n"
43
 
44
  "# Translation Quality Guidelines\n"
45
  "- Accuracy: Ensure factual information is preserved exactly\n"