Spaces:
Sleeping
Sleeping
translate xml from wikipedia
Browse files
app.py
CHANGED
@@ -44,7 +44,7 @@ def extract_wikipedia_content(wiki_url, api_key, model_id, base_url, target_lang
|
|
44 |
|
45 |
def translate_content(content, article_title, artice_summary, target_lang, api_key, model_id, base_url):
|
46 |
|
47 |
-
llm_client = init_llm_client(api_key,
|
48 |
|
49 |
translation_prompt = get_translate_prompt(
|
50 |
article_title=article_title,
|
@@ -53,22 +53,39 @@ def translate_content(content, article_title, artice_summary, target_lang, api_k
|
|
53 |
target_lang=target_lang
|
54 |
)
|
55 |
|
56 |
-
# Call the LLM to get the translation
|
57 |
-
response = llm_client.
|
|
|
58 |
messages=[
|
59 |
{"role": "user", "content": translation_prompt}
|
60 |
],
|
61 |
-
model=model_id,
|
62 |
max_tokens=2000,
|
63 |
temperature=0.5
|
64 |
)
|
65 |
|
66 |
-
decoded_object = json_repair.loads(response.choices[0].message
|
67 |
if 'output_content' in decoded_object:
|
68 |
return decoded_object['output_content']
|
69 |
|
70 |
return "Error: Translation output not found in the response."
|
71 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
72 |
def update_ui_with_sections(sections_dict):
|
73 |
"""
|
74 |
Creates a list of components to display in the sections area
|
@@ -76,20 +93,46 @@ def update_ui_with_sections(sections_dict):
|
|
76 |
components = []
|
77 |
|
78 |
if not sections_dict:
|
79 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
80 |
|
81 |
# Create visible components for available sections
|
82 |
for section_name, section_content in sections_dict.items():
|
|
|
83 |
components.append(gr.update(
|
84 |
value=section_content,
|
85 |
label=f"Section: {section_name}",
|
86 |
visible=True
|
87 |
))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
88 |
|
89 |
# Hide any unused components
|
90 |
-
remaining = 100 - len(
|
91 |
for _ in range(remaining):
|
92 |
-
components.
|
|
|
|
|
|
|
|
|
93 |
|
94 |
return components
|
95 |
|
@@ -115,6 +158,7 @@ with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
|
|
115 |
model_id = gr.Textbox(
|
116 |
label="OpenAI Model ID",
|
117 |
placeholder="gpt-4.1-mini",
|
|
|
118 |
)
|
119 |
|
120 |
base_url = gr.Textbox(
|
@@ -125,7 +169,7 @@ with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
|
|
125 |
|
126 |
target_language = gr.Dropdown(
|
127 |
choices=list(LANGUAGES.keys()),
|
128 |
-
value="
|
129 |
label="Target Language",
|
130 |
)
|
131 |
|
@@ -178,13 +222,31 @@ with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
|
|
178 |
visible=False # Hidden by default as it's usually large
|
179 |
)
|
180 |
|
181 |
-
# Pre-define section textboxes
|
182 |
gr.Markdown("### Article Sections")
|
183 |
with gr.Column() as sections_container:
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
188 |
|
189 |
# Connect the extract button to the function
|
190 |
extract_button.click(
|
@@ -201,7 +263,7 @@ with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
|
|
201 |
).then(
|
202 |
fn=update_ui_with_sections,
|
203 |
inputs=[sections_state],
|
204 |
-
outputs=
|
205 |
)
|
206 |
|
207 |
# Launch the app
|
|
|
44 |
|
45 |
def translate_content(content, article_title, artice_summary, target_lang, api_key, model_id, base_url):
|
46 |
|
47 |
+
llm_client = init_llm_client(api_key, base_url=base_url)
|
48 |
|
49 |
translation_prompt = get_translate_prompt(
|
50 |
article_title=article_title,
|
|
|
53 |
target_lang=target_lang
|
54 |
)
|
55 |
|
56 |
+
# Call the LLM to get the translation - updating params to match OpenAI's requirements
|
57 |
+
response = llm_client.chat.completions.create(
|
58 |
+
model=model_id,
|
59 |
messages=[
|
60 |
{"role": "user", "content": translation_prompt}
|
61 |
],
|
|
|
62 |
max_tokens=2000,
|
63 |
temperature=0.5
|
64 |
)
|
65 |
|
66 |
+
decoded_object = json_repair.loads(response.choices[0].message.content)
|
67 |
if 'output_content' in decoded_object:
|
68 |
return decoded_object['output_content']
|
69 |
|
70 |
return "Error: Translation output not found in the response."
|
71 |
|
72 |
+
def translate_section(section_content, article_title, article_summary, target_lang, api_key, model_id, base_url):
    """
    Translate a single section of the Wikipedia article.

    Validates the inputs, then delegates the actual work to
    ``translate_content``.

    Args:
        section_content: Text of the section to translate.
        article_title: Title of the article, forwarded unchanged.
        article_summary: Summary of the article, forwarded unchanged.
        target_lang: Target language, forwarded unchanged.
        api_key: API key forwarded to ``translate_content``.
        model_id: Model identifier forwarded to ``translate_content``.
        base_url: Optional custom API endpoint, forwarded unchanged.

    Returns:
        The value returned by ``translate_content``, or a fixed message
        asking for content and an API key when either one is missing.
    """
    def _is_blank(value):
        # Empty, None and whitespace-only values are all unusable.
        return not value or not str(value).strip()

    # Reject whitespace-only input here rather than spending an API call
    # on a request that cannot produce a useful translation.
    if _is_blank(section_content) or _is_blank(api_key):
        return "Please provide content and API key for translation."

    return translate_content(
        content=section_content,
        article_title=article_title,
        # NOTE: translate_content spells this keyword "artice_summary".
        artice_summary=article_summary,
        target_lang=target_lang,
        api_key=api_key,
        model_id=model_id,
        base_url=base_url,
    )
|
88 |
+
|
89 |
def update_ui_with_sections(sections_dict, max_sections=100):
    """
    Creates a list of component updates to display in the sections area.

    The UI pre-allocates a fixed number of rows, each made of three
    components in this order: section textbox, translate button,
    translation output. This function always returns exactly
    ``3 * max_sections`` updates so the result lines up one-to-one with
    those components.

    Args:
        sections_dict: Mapping of section name to section content. May be
            empty or None, in which case every row is hidden.
        max_sections: Number of pre-allocated rows. Must match the number
            of rows created when the UI is built (100).

    Returns:
        A flat list of ``gr.update`` objects: visible triples for the
        available sections followed by hidden triples for unused rows.
    """
    components = []

    # Sections beyond the pre-allocated rows cannot be displayed. Drop them
    # so the number of updates never exceeds the number of output components
    # (presumably Gradio rejects a mismatched count - confirm against docs).
    shown_sections = list((sections_dict or {}).items())[:max_sections]

    # Create visible components for available sections
    for section_name, section_content in shown_sections:
        components.extend([
            # Update for section content textbox
            gr.update(
                value=section_content,
                label=f"Section: {section_name}",
                visible=True,
            ),
            # Update for translate button
            gr.update(
                visible=True,
                value=f"Translate {section_name}",
            ),
            # Update for translation output (cleared for the new article)
            gr.update(
                visible=True,
                value="",
                label=f"Translation: {section_name}",
            ),
        ])

    # Hide any unused components; with no sections this hides every row.
    for _ in range(max_sections - len(shown_sections)):
        components.extend([
            gr.update(visible=False),  # section textbox
            gr.update(visible=False),  # translate button
            gr.update(visible=False),  # translation output
        ])

    return components
|
138 |
|
|
|
158 |
model_id = gr.Textbox(
|
159 |
label="OpenAI Model ID",
|
160 |
placeholder="gpt-4.1-mini",
|
161 |
+
value="gpt-4.1-mini",
|
162 |
)
|
163 |
|
164 |
base_url = gr.Textbox(
|
|
|
169 |
|
170 |
target_language = gr.Dropdown(
|
171 |
choices=list(LANGUAGES.keys()),
|
172 |
+
value="Arabic",
|
173 |
label="Target Language",
|
174 |
)
|
175 |
|
|
|
222 |
visible=False # Hidden by default as it's usually large
|
223 |
)
|
224 |
|
225 |
+
# Pre-define section textboxes and related components
|
226 |
gr.Markdown("### Article Sections")
|
227 |
with gr.Column() as sections_container:
|
228 |
+
section_components = []
|
229 |
+
for i in range(100): # Support up to 100 sections
|
230 |
+
with gr.Row():
|
231 |
+
section_textbox = gr.Textbox(visible=False, lines=4)
|
232 |
+
translate_btn = gr.Button("Translate", visible=False)
|
233 |
+
translation_output = gr.Textbox(visible=False, lines=4)
|
234 |
+
section_components.extend([section_textbox, translate_btn, translation_output])
|
235 |
+
|
236 |
+
# Connect the translate button to the translation function
|
237 |
+
translate_btn.click(
|
238 |
+
fn=translate_section,
|
239 |
+
inputs=[
|
240 |
+
section_textbox,
|
241 |
+
article_title,
|
242 |
+
aticle_summary,
|
243 |
+
target_language,
|
244 |
+
api_key,
|
245 |
+
model_id,
|
246 |
+
base_url
|
247 |
+
],
|
248 |
+
outputs=translation_output
|
249 |
+
)
|
250 |
|
251 |
# Connect the extract button to the function
|
252 |
extract_button.click(
|
|
|
263 |
).then(
|
264 |
fn=update_ui_with_sections,
|
265 |
inputs=[sections_state],
|
266 |
+
outputs=section_components
|
267 |
)
|
268 |
|
269 |
# Launch the app
|
utils/__pycache__/llm_parser.cpython-310.pyc
CHANGED
Binary files a/utils/__pycache__/llm_parser.cpython-310.pyc and b/utils/__pycache__/llm_parser.cpython-310.pyc differ
|
|
utils/__pycache__/llm_prompts.cpython-310.pyc
CHANGED
Binary files a/utils/__pycache__/llm_prompts.cpython-310.pyc and b/utils/__pycache__/llm_prompts.cpython-310.pyc differ
|
|
utils/__pycache__/wikipedia_extractor.cpython-310.pyc
CHANGED
Binary files a/utils/__pycache__/wikipedia_extractor.cpython-310.pyc and b/utils/__pycache__/wikipedia_extractor.cpython-310.pyc differ
|
|
utils/llm_parser.py
CHANGED
@@ -1,15 +1,18 @@
|
|
1 |
import os
|
2 |
from openai import OpenAI
|
|
|
3 |
|
4 |
-
|
5 |
-
def init_llm_client(api_key, model_id, base_url=None):
|
6 |
"""
|
7 |
Initialize the OpenAI client with the provided API key and model ID.
|
8 |
"""
|
9 |
-
|
|
|
10 |
os.environ["OPENAI_API_BASE"] = base_url
|
|
|
|
|
|
|
11 |
os.environ["OPENAI_API_KEY"] = api_key
|
12 |
-
os.environ["OPENAI_MODEL_ID"] = model_id
|
13 |
|
14 |
-
return OpenAI(api_key=api_key,
|
15 |
|
|
|
1 |
import os
|
2 |
from openai import OpenAI
|
3 |
+
import urllib
|
4 |
|
5 |
+
def init_llm_client(api_key, base_url=None):
    """
    Initialize the OpenAI client with the provided API key and base URL.

    Args:
        api_key: API key passed to the client and exported as
            ``OPENAI_API_KEY``.
        base_url: Optional custom endpoint. It is used only when it is a
            string starting with "http" (after trimming surrounding
            whitespace); any other value is replaced by None so the
            client receives no explicit base URL.

    Returns:
        An ``OpenAI`` client instance.
    """
    import logging

    # A pasted URL often carries stray whitespace; without trimming it would
    # fail the "http" prefix check and be silently ignored.
    base_url = base_url.strip() if isinstance(base_url, str) else None
    if base_url and base_url.startswith("http"):
        os.environ["OPENAI_API_BASE"] = base_url
    else:
        base_url = None
    logging.getLogger(__name__).debug("base_url %s", base_url)

    # NOTE(review): os.environ is process-wide, so the key and base URL set
    # here persist for later calls in the same process - confirm this is
    # intended if several users share one running app.
    os.environ["OPENAI_API_KEY"] = api_key

    return OpenAI(api_key=api_key, base_url=base_url)
|
18 |
|
utils/llm_prompts.py
CHANGED
@@ -35,11 +35,11 @@ def get_translate_prompt(article_title, artice_summary, original_content, target
|
|
35 |
|
36 |
"# Output Format\n"
|
37 |
"Return a single JSON object with the following structure:\n"
|
38 |
-
"```json\n"
|
39 |
-
|
40 |
-
|
41 |
-
}
|
42 |
-
"
|
43 |
|
44 |
"# Translation Quality Guidelines\n"
|
45 |
"- Accuracy: Ensure factual information is preserved exactly\n"
|
|
|
35 |
|
36 |
"# Output Format\n"
|
37 |
"Return a single JSON object with the following structure:\n"
|
38 |
+
"```json\n"
|
39 |
+
"{{\n"
|
40 |
+
" \"output_content\": \"The complete translated XML content with all tags preserved\"\n"
|
41 |
+
"}}\n"
|
42 |
+
"```\n\n"
|
43 |
|
44 |
"# Translation Quality Guidelines\n"
|
45 |
"- Accuracy: Ensure factual information is preserved exactly\n"
|