Spaces:
Running
Running
File size: 24,387 Bytes
ce96e8f c25ce6b e424603 e39d0f6 61057b2 ce96e8f 06c3d9b ce96e8f 99acdbc ce96e8f 571e415 4257826 ce96e8f 99acdbc 30561b6 99acdbc e424603 4257826 94260c3 4257826 e424603 94260c3 186544d c25ce6b c065ba1 c25ce6b 99acdbc c25ce6b 94260c3 186544d c25ce6b c065ba1 c25ce6b c065ba1 efca194 e39d0f6 efca194 186544d c065ba1 efca194 c065ba1 99acdbc 30561b6 99acdbc efca194 c065ba1 94260c3 99acdbc c065ba1 efca194 186544d efca194 c065ba1 efca194 c065ba1 efca194 61057b2 d7a887c 61057b2 d7a887c 61057b2 d7a887c 61057b2 d7a887c 61057b2 d7a887c 61057b2 fc1df00 efca194 e424603 efca194 e424603 efca194 e424603 efca194 e424603 efca194 fc1df00 efca194 fc1df00 efca194 fc1df00 efca194 e424603 efca194 ce96e8f 4257826 ce96e8f c673286 ce96e8f f792136 e424603 f792136 e424603 f792136 99acdbc f792136 ce96e8f f792136 ce96e8f c065ba1 ce96e8f c065ba1 ce96e8f 94260c3 99acdbc 4257826 94260c3 efca194 186544d ce96e8f c673286 ce96e8f f792136 ce96e8f e424603 ce96e8f e424603 b9a6ee5 e424603 b9a6ee5 e424603 b9a6ee5 e424603 b9a6ee5 e424603 efca194 f039cae efca194 c065ba1 e424603 c065ba1 4257826 b9a6ee5 4257826 b9a6ee5 4257826 efca194 c065ba1 efca194 c065ba1 94260c3 c065ba1 99acdbc c065ba1 efca194 f039cae efca194 c065ba1 efca194 f039cae ce96e8f 4257826 e424603 c065ba1 ce96e8f f792136 61057b2 efca194 61057b2 efca194 f039cae efca194 61057b2 d7a887c 61057b2 d7a887c 61057b2 efca194 ce96e8f e424603 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 
528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 |
import html
import json
import os
import re
import tempfile
from xml.sax.saxutils import quoteattr

import gradio as gr
import json_repair

from utils import (extract_wiki_id, get_wiki_details,
                   init_llm_client, split_content_into_sections,
                   get_translate_prompt)
# Languages offered in the "Target Language" dropdown, keyed by display name.
LANGUAGES = {
    "Arabic": "ar",
    "Arabic-Extended": "ar-x-extended",
    "English": "en",
    "Spanish": "es",
    "French": "fr",
    "German": "de",
    "Italian": "it",
    "Portuguese": "pt",
    "Russian": "ru",
    "Japanese": "ja",
    "Chinese": "zh",
    "Hindi": "hi",
    "Korean": "ko",
    "Custom": "custom",  # lets the user type a language that is not listed
}

# Placeholders for the debug widgets; both names are rebound to Gradio
# components when the UI is built further down in this file.
debug_display = debug_header = None
def extract_wikipedia_content(wiki_url, api_key, model_id, base_url, target_lang, custom_lang, content_format, chunking):
    """
    Fetch a Wikipedia article and prepare its sections for translation.

    Args:
        wiki_url: Article URL; handed to ``extract_wiki_id``.
        api_key, model_id, base_url, target_lang, custom_lang: Accepted because
            the UI passes them as inputs; extraction itself does not use them.
        content_format: ``"XML"`` selects the article's ``wiki_xml``; any other
            value selects its ``content``.
        chunking: When truthy the article is split with
            ``split_content_into_sections``; otherwise the whole article is
            returned as a single ``"Full Article"`` section.

    Returns:
        A 6-tuple ``(status message, page id, title, summary, wiki XML,
        sections)``. On failure the middle four items are ``None`` and
        ``sections`` is an empty dict.
    """
    wiki_id = extract_wiki_id(wiki_url)
    if not wiki_id:
        return "Invalid Wikipedia URL. Please check the URL and try again.", None, None, None, None, {}

    # Get the details of the Wikipedia article
    wiki_details = get_wiki_details(wiki_id)
    # NOTE(review): get_wiki_details' failure mode is not visible from here; a
    # falsy result is reported to the user instead of raising on the lookups below.
    if not wiki_details:
        return "Could not retrieve the Wikipedia article. Please check the URL and try again.", None, None, None, None, {}

    # Choose the source text once so the chunked and unchunked paths always agree.
    if content_format == "XML":
        source_text = wiki_details['wiki_xml']
    else:
        source_text = wiki_details['content']

    if chunking:
        content_sections = split_content_into_sections(source_text, content_format)
    else:
        # Chunking disabled: the entire article is one section.
        content_sections = {"Full Article": source_text}

    return (
        f"Extraction complete! Sections: {len(content_sections)}",
        wiki_details['pageid'],
        wiki_details['title'],
        wiki_details['summary'],
        wiki_details['wiki_xml'],
        content_sections
    )
def translate_content(content, article_title, artice_summary, content_format,
                      target_lang, api_key, model_id, base_url, preference_prompt=None, debug_mode=False,
                      max_tokens=2000, temperature=0.5):
    """
    Translate one piece of article content with a chat-completion call.

    Args:
        content: Text (or XML) to translate.
        article_title: Title of the article, forwarded to the prompt builder.
        artice_summary: Summary of the article, forwarded to the prompt builder
            (the misspelled name is kept because callers pass it by keyword).
        content_format: Format label forwarded to the prompt builder.
        target_lang: Target language, used as given; the caller is expected to
            have resolved any custom language already.
        api_key: API key handed to ``init_llm_client``.
        model_id: Model name sent with the request.
        base_url: Base URL handed to ``init_llm_client``.
        preference_prompt: Optional extra instructions for the prompt builder.
        debug_mode: When true, also return a dict describing the LLM call.
        max_tokens: Completion token limit sent with the request.
        temperature: Sampling temperature sent with the request.

    Returns:
        The translated text, or an error string when the reply has no
        ``output_content`` field. With ``debug_mode`` the return value is a
        ``(text, debug_info)`` tuple where ``debug_info`` has the keys
        ``prompt``, ``response``, ``usage`` and ``model``.
    """
    llm_client = init_llm_client(api_key, base_url=base_url)

    translation_prompt = get_translate_prompt(
        article_title=article_title,
        artice_summary=artice_summary,
        original_content=content,
        target_lang=target_lang,
        content_format=content_format,
        preference_prompt=preference_prompt
    )

    # Call the LLM to get the translation
    response = llm_client.chat.completions.create(
        model=model_id,
        messages=[
            {"role": "user", "content": translation_prompt}
        ],
        max_tokens=max_tokens,
        temperature=temperature
    )

    # The message content can be missing; treat that as an empty reply.
    raw_response = response.choices[0].message.content or ""
    decoded_object = json_repair.loads(raw_response)

    # The parsed reply is only usable when it is a mapping: on a str the `in`
    # test would be a substring check and the lookup below would raise.
    if isinstance(decoded_object, dict) and 'output_content' in decoded_object:
        translation = decoded_object['output_content']
    else:
        translation = "Error: Translation output not found in the response."

    if not debug_mode:
        return translation

    # Read usage defensively so a reply without token accounting still
    # produces debug info instead of an AttributeError.
    usage = getattr(response, "usage", None)
    debug_info = {
        "prompt": translation_prompt,
        "response": raw_response,
        "usage": {
            "prompt_tokens": getattr(usage, "prompt_tokens", None),
            "completion_tokens": getattr(usage, "completion_tokens", None),
            "total_tokens": getattr(usage, "total_tokens", None)
        },
        "model": model_id
    }
    return translation, debug_info
def translate_section(section_content, article_title, article_summary, content_format,
                      target_lang, custom_lang, api_key, model_id, base_url, preference_prompt=None, debug_mode=False):
    """
    Translate a single section of the Wikipedia article.

    Args:
        section_content: Text of the section to translate.
        article_title: Title of the article the section belongs to.
        article_summary: Summary of the article.
        content_format: Format label forwarded to ``translate_content``.
        target_lang: Language chosen in the dropdown; ``"Custom"`` means the
            language is taken from ``custom_lang``.
        custom_lang: Free-text language name, used only with ``"Custom"``.
        api_key, model_id, base_url: LLM connection settings.
        preference_prompt: Optional extra translation instructions.
        debug_mode: When true, debug information about the call is returned.

    Returns:
        ``(translation, debug_info)``. ``debug_info`` is ``None`` unless
        ``debug_mode`` is on, and always ``None`` when a validation message is
        returned in place of a translation.
    """
    if not section_content or not api_key:
        return "Please provide content and API key for translation.", None

    if target_lang == "Custom":
        # Without a typed language the literal word "Custom" would be sent to
        # the model as the target language, so ask for one instead.
        actual_lang = (custom_lang or "").strip()
        if not actual_lang:
            return "Please enter a custom language name before translating.", None
    else:
        actual_lang = target_lang

    result = translate_content(
        content=section_content,
        article_title=article_title,
        artice_summary=article_summary,
        content_format=content_format,
        target_lang=actual_lang,
        api_key=api_key,
        model_id=model_id,
        base_url=base_url,
        preference_prompt=preference_prompt,
        debug_mode=debug_mode
    )

    if debug_mode:
        # In debug mode translate_content returns a (translation, debug_info) pair.
        translation, debug_info = result
        return translation, debug_info
    return result, None
def _fenced_block(text, info=""):
    """Wrap *text* in a Markdown code fence that the text itself cannot close."""
    text = "" if text is None else str(text)
    # A fence is only closed by a run of at least as many backticks, so use
    # one more than the longest run found in the text (never fewer than three).
    longest_run = max((len(run) for run in re.findall(r"`+", text)), default=0)
    fence = "`" * max(3, longest_run + 1)
    return f"{fence}{info}\n{text}\n{fence}\n"


def format_debug_info(debug_info):
    """
    Format debug information as Markdown for display.

    Args:
        debug_info: Dict with the keys ``model``, ``usage`` (holding
            ``prompt_tokens``, ``completion_tokens`` and ``total_tokens``),
            ``prompt`` and ``response``; or a falsy value when nothing has
            been recorded.

    Returns:
        A Markdown string. The prompt and the raw response are shown verbatim
        inside code fences sized so that backticks in either cannot break out.
    """
    if not debug_info:
        return "No debug information available."

    usage = debug_info['usage']
    parts = [
        "## LLM Debug Information\n\n",
        # Model and usage info
        f"### Model: {debug_info['model']}\n\n",
        "### Usage\n",
        f"- Prompt tokens: {usage['prompt_tokens']}\n",
        f"- Completion tokens: {usage['completion_tokens']}\n",
        f"- Total tokens: {usage['total_tokens']}\n\n",
        # Prompt exactly as it was sent
        "### Prompt\n",
        _fenced_block(debug_info['prompt']),
        "\n",
        # Raw response
        "### Raw Response\n",
        _fenced_block(debug_info['response'], "json"),
    ]
    return "".join(parts)
# Functions to generate downloadable content for original and translated articles
def _safe_filename_stem(title, default="article"):
    """Reduce *title* to a file-name stem with no path separators or reserved characters."""
    stem = re.sub(r'[\\/:*?"<>|\x00-\x1f]+', "_", str(title or ""))
    # Cap the length and drop edge dots/spaces so names such as ".." cannot survive.
    stem = stem[:100].strip(" .")
    return stem or default


def _render_article(download_format, title, summary, sections):
    """Render an article as HTML, JSON or plain text; return ``(content, extension)``."""
    if download_format == "HTML":
        # Titles and text come from external pages and model output, so escape them.
        parts = [
            f"<h1>{html.escape(str(title or ''))}</h1>",
            f"<p>{html.escape(str(summary or ''))}</p>",
        ]
        for heading, text in sections.items():
            parts.append(f"<h2>{html.escape(str(heading))}</h2>")
            parts.append(f"<p>{html.escape(str(text or ''))}</p>")
        return "\n".join(parts), "html"
    if download_format == "JSON":
        obj = {"title": title, "summary": summary, "sections": sections}
        return json.dumps(obj, ensure_ascii=False, indent=2), "json"
    # Any other value is treated as plain text.
    parts = [title or 'Article', summary or '']
    for heading, text in sections.items():
        parts.append(f"## {heading}\n{text or ''}")
    return "\n\n".join(parts), "txt"


def _write_download(filename, content):
    """Write *content* as UTF-8 to *filename* in a fresh temp directory; return the path."""
    # A directory per call keeps simultaneous downloads of the same article
    # from overwriting one another while preserving the readable file name.
    temp_dir = tempfile.mkdtemp(prefix="wiki_translator_")
    temp_path = os.path.join(temp_dir, filename)
    with open(temp_path, 'w', encoding='utf-8') as f:
        f.write(content)
    return temp_path


def generate_download_original(download_format, article_title, article_summary, article_xml, sections):
    """
    Generate a downloadable original content file in the specified format.

    Args:
        download_format: ``"Wikipedia XML"``, ``"HTML"`` or ``"JSON"``; any
            other value produces plain text.
        article_title: Article title; also used (sanitised) as the file name.
        article_summary: Article summary.
        article_xml: Article XML, written unchanged for ``"Wikipedia XML"``.
        sections: Mapping of section title to section text; may be empty or None.

    Returns:
        Path of the file that was written.
    """
    sections = sections or {}
    if download_format == "Wikipedia XML":
        content, extension = article_xml or "", "xml"
    else:
        content, extension = _render_article(download_format, article_title, article_summary, sections)
    return _write_download(f"{_safe_filename_stem(article_title)}.{extension}", content)


def generate_download_translated(download_format, article_title, article_summary, sections, *translations_values):
    """
    Generate downloadable translated content file from existing translations.

    Args:
        download_format: ``"Wikipedia XML"``, ``"HTML"`` or ``"JSON"``; any
            other value produces plain text.
        article_title: Article title; also used (sanitised) as the file name.
        article_summary: Article summary.
        sections: Mapping whose keys are the section titles, in display order.
        *translations_values: Translated texts, positionally matched to the
            section titles.

    Returns:
        Path of the file that was written.
    """
    # zip stops at the shorter input, so surplus titles or surplus UI values are ignored.
    translations = dict(zip(sections or {}, translations_values))

    if download_format == "Wikipedia XML":
        parts = [f"<article title={quoteattr(str(article_title or ''))}>"]
        for heading, text in translations.items():
            # Section bodies are written verbatim: with the "XML" content
            # format they are markup already and must not be escaped again.
            parts.append(f" <section title={quoteattr(str(heading))}>{text or ''}</section>")
        parts.append("</article>")
        content, extension = "\n".join(parts), "xml"
    else:
        content, extension = _render_article(download_format, article_title, article_summary, translations)
    return _write_download(f"{_safe_filename_stem(article_title)}_translated.{extension}", content)
# DOTALL lets a comment that spans a line break be removed as well.
_HTML_COMMENT_RE = re.compile(r'<!--.*?-->', re.DOTALL)


def clean_section_title(title):
    """
    Clean a section title of HTML comments, HTML entities and extra whitespace.

    Args:
        title: Raw section title; ``None`` is treated as an empty title.

    Returns:
        The title with comments removed, entities decoded and every run of
        whitespace collapsed to a single space, with none at either end.
    """
    if not title:
        return ""
    # Remove comments before decoding, so an escaped "&lt;!--" stays visible text.
    without_comments = _HTML_COMMENT_RE.sub('', str(title))
    decoded = html.unescape(without_comments)
    # split() with no argument also discards leading and trailing whitespace.
    return ' '.join(decoded.split())
# Update the pre-built section widgets with the sections of an extracted article
def update_ui_with_sections(sections, max_sections=100):
    """
    Build the component updates that display the article's sections.

    Each section slot is four components, in this order: section textbox,
    translate button, translation output, debug button.

    Args:
        sections: Dictionary of section titles and content; may be empty or None.
        max_sections: Number of section slots to produce updates for. Sections
            beyond this count are not shown.

    Returns:
        List of ``4 * max_sections`` updates: filled slots first, then hidden ones.
    """
    # Materialise the items once instead of rebuilding the key list per slot.
    shown = list((sections or {}).items())[:max_sections]

    results = []
    for section_title, section_content in shown:
        # Clean the section title for display
        clean_title = clean_section_title(section_title)
        results.extend([
            gr.update(visible=True, value=section_content, label=f"Section: {clean_title}"),
            gr.update(visible=True),  # Translate button
            gr.update(visible=True, value="", label=f"Translation: {clean_title}"),  # Translation output
            gr.update(visible=False)  # Debug button (hidden by default)
        ])

    # Hide all four components of every slot that has no section.
    unused_slots = max_sections - len(shown)
    results.extend(gr.update(visible=False) for _ in range(4 * unused_slots))
    return results
# Build the Gradio interface
# NOTE(review): the container nesting below was reconstructed from a
# whitespace-flattened copy of this file - confirm the layout against the
# running app before relying on it.
with gr.Blocks(theme=gr.themes.Monochrome(), css="""
.odd-section { background-color: rgb(228 213 213); padding: 15px; border-radius: 8px; margin: 10px 0; }
""") as demo:
    gr.Markdown("# Wikipedia Translator")
    gr.Markdown("""
**Translate Wikipedia articles into any language using AI**
This tool helps you:
- Extract content from any Wikipedia article
- Translate it into your chosen language using OpenAI's models
- Maintain the article's structure and formatting
- Download the translated content in various formats
Start by configuring your API settings in the sidebar and entering a Wikipedia URL below.
---
Developed by [Abu Bakr Soliman](https://www.linkedin.com/in/bakrianoo/)
""")

    # Values shared between callbacks
    sections_state = gr.State({})
    sidebar_expanded = gr.State(True)  # the sidebar starts out expanded

    def toggle_sidebar(expanded):
        """Flip the sidebar between shown and hidden.

        Returns the new expanded flag followed by updates for the sidebar's
        visibility, the main column's scale and the "show" button's visibility.
        """
        now_expanded = not expanded
        main_scale = 2 if now_expanded else 3
        return (
            now_expanded,
            gr.update(visible=now_expanded),
            gr.update(scale=main_scale),
            gr.update(visible=not now_expanded),  # the show button mirrors the sidebar
        )

    def toggle_custom_language(target_lang):
        """Show the custom-language box only while "Custom" is selected."""
        return gr.update(visible=(target_lang == "Custom"))

    def _debug_button_visibility(info, enabled):
        """Show a section's debug button when debug mode is on and info exists."""
        return gr.update(visible=enabled and info is not None)

    def _reveal_debug_panel():
        """Make the sidebar's debug header and debug text visible."""
        return gr.update(visible=True), gr.update(visible=True)

    with gr.Row() as main_layout:
        # ---- Sidebar: configuration ----
        with gr.Column(scale=1, visible=True) as sidebar:
            # Hide button sits at the very top of the sidebar
            sidebar_toggle = gr.Button("« Hide Sidebar", scale=0)
            gr.Markdown("### Configuration")
            with gr.Group():
                api_key = gr.Textbox(
                    label="OpenAI API Key",
                    placeholder="sk-...",
                    type="password",
                )
                model_id = gr.Textbox(
                    label="OpenAI Model ID",
                    placeholder="gpt-4.1-mini",
                    value="gpt-4.1-mini",
                )
                base_url = gr.Textbox(
                    label="OpenAI API Base URL (Optional)",
                    placeholder="https://api.openai.com/v1",
                    info="Leave default unless using a proxy"
                )
                target_language = gr.Dropdown(
                    choices=list(LANGUAGES.keys()),
                    value="Arabic",
                    label="Target Language",
                )
                custom_language = gr.Textbox(
                    label="Custom Language",
                    placeholder="Enter language name (e.g., Swedish, Dutch, etc.)",
                    visible=False,
                    info="Specify your desired language if not in the list above"
                )
                # The dropdown decides whether the custom-language box is shown
                target_language.change(
                    fn=toggle_custom_language,
                    inputs=[target_language],
                    outputs=[custom_language]
                )
                # Chunking switch comes before the content format
                chunking = gr.Checkbox(
                    label="Enable Content Chunking",
                    value=False,
                    info="Split content into sections for individual translation"
                )
                content_format = gr.Radio(
                    choices=["Text", "XML"],
                    value="XML",
                    label="Content Format",
                    info="Choose how to display article content"
                )
                # Debug mode switch
                debug_mode = gr.Checkbox(
                    label="Debug Mode",
                    value=False,
                    info="Show detailed information about LLM calls"
                )

            # Free-form translation preferences
            gr.Markdown("### Translation Preferences")
            preference_prompt = gr.Textbox(
                label="Additional Translation Preferences",
                placeholder="Enter any specific translation preferences or instructions...",
                lines=5,
                info="Optional: Add specific preferences for how the translation should be performed"
            )

            # Collapsible "About" section
            with gr.Accordion("About", open=False):
                gr.Markdown("""
This tool extracts content from Wikipedia articles and translates them into your selected language using OpenAI's language models.
1. Configure your API settings
2. Enter a Wikipedia URL
3. Click Extract to process the article
---
Developed by [Abu Bakr Soliman](https://www.linkedin.com/in/bakrianoo/)
""")

        # ---- Main content area ----
        with gr.Column(scale=2) as main_content:
            # This button is only visible while the sidebar is hidden
            with gr.Row():
                sidebar_show_btn = gr.Button("» Show Sidebar", visible=False, scale=0)
                with gr.Column(scale=1):
                    gr.Markdown("### Wikipedia Article")

            wiki_url = gr.Textbox(
                label="Wikipedia URL",
                placeholder="https://en.wikipedia.org/wiki/Artificial_intelligence",
                info="Enter the full URL of the Wikipedia article"
            )
            extract_button = gr.Button("Extract and Prepare for Translation", variant="primary")
            output = gr.Markdown(label="Status")

            # Read-only fields filled in by extraction
            article_pageid = gr.Textbox(
                label="Article Page ID",
                placeholder="Page ID will appear here after extraction",
                interactive=False,
                show_copy_button=True
            )
            article_title = gr.Textbox(
                label="Article Title",
                placeholder="Title will appear here after extraction",
                interactive=False,
                show_copy_button=True
            )
            aticle_summary = gr.Textbox(
                label="Article Summary",
                placeholder="Summary will appear here after extraction",
                interactive=False,
                show_copy_button=True
            )
            article_xml = gr.Textbox(
                label="Article XML",
                placeholder="XML will appear here after extraction",
                interactive=False,
                visible=False,  # kept hidden because the XML is usually large
                show_copy_button=True
            )

            # Debug info from the most recent translation call
            debug_info_state = gr.State(None)

            # Section widgets are created up front and revealed after extraction
            gr.Markdown("### Article Sections")
            with gr.Column() as sections_container:
                section_components = []
                # Inputs every translate button passes after its own section textbox
                shared_translate_inputs = [
                    article_title,
                    aticle_summary,
                    content_format,
                    target_language,
                    custom_language,
                    api_key,
                    model_id,
                    base_url,
                    preference_prompt,
                    debug_mode,
                ]
                for section_index in range(100):  # up to 100 sections are supported
                    # Every other slot gets the shaded "odd-section" style
                    slot_classes = ["odd-section"] if section_index % 2 == 0 else []
                    with gr.Column(elem_classes=slot_classes) as section:
                        # Section content
                        section_textbox = gr.Textbox(visible=False, lines=4, show_copy_button=True)
                        with gr.Row():  # controls row
                            translate_btn = gr.Button("Translate", visible=False)
                            debug_btn = gr.Button("View Debug Info", visible=False)
                        # Translation output
                        translation_output = gr.Textbox(visible=False, lines=4, show_copy_button=True)
                        # Separator
                        gr.Markdown("---", visible=False)

                    section_components.extend([section_textbox, translate_btn, translation_output, debug_btn])

                    # Translate this section, then decide whether its debug
                    # button should appear.
                    translation_event = translate_btn.click(
                        fn=translate_section,
                        inputs=[section_textbox, *shared_translate_inputs],
                        outputs=[translation_output, debug_info_state]
                    )
                    translation_event.then(
                        fn=_debug_button_visibility,
                        inputs=[debug_info_state, debug_mode],
                        outputs=[debug_btn]
                    )

    # Extraction fills the article fields, then refreshes the section widgets
    extract_button.click(
        fn=extract_wikipedia_content,
        inputs=[wiki_url, api_key, model_id, base_url, target_language, custom_language, content_format, chunking],
        outputs=[
            output,
            article_pageid,
            article_title,
            aticle_summary,
            article_xml,
            sections_state,
        ]
    ).then(
        fn=update_ui_with_sections,
        inputs=[sections_state],
        outputs=section_components
    )

    # Both sidebar buttons run the same toggle
    for toggle_button in (sidebar_toggle, sidebar_show_btn):
        toggle_button.click(
            fn=toggle_sidebar,
            inputs=[sidebar_expanded],
            outputs=[sidebar_expanded, sidebar, main_content, sidebar_show_btn]
        )

    # Download controls and the debug panel go at the bottom of the sidebar
    with sidebar:
        download_format = gr.Dropdown(
            choices=["Wikipedia XML", "HTML", "JSON", "Plain Text"],
            value="Wikipedia XML",
            label="Download Format"
        )
        download_original_btn = gr.Button("Download Original")
        download_original_file = gr.File(label="Original Article")
        download_translated_btn = gr.Button("Download Translated")
        download_translated_file = gr.File(label="Translated Article")
        # Debug info display
        debug_header = gr.Markdown("### Debug Information", visible=False)
        debug_display = gr.Markdown(visible=False)

    # Each slot's debug button (4th component of the slot) renders the stored
    # debug info into the sidebar and then reveals the panel.
    for debug_btn in section_components[3::4]:
        debug_btn.click(
            fn=format_debug_info,
            inputs=[debug_info_state],
            outputs=[debug_display]
        ).then(
            fn=_reveal_debug_panel,
            outputs=[debug_header, debug_display]
        )

    # Download buttons
    download_original_btn.click(
        fn=generate_download_original,
        inputs=[download_format, article_title, aticle_summary, article_xml, sections_state],
        outputs=[download_original_file]
    )
    # The translation output is the 3rd component of every slot
    translation_outputs = section_components[2::4]
    download_translated_btn.click(
        fn=generate_download_translated,
        inputs=[download_format, article_title, aticle_summary, sections_state] + translation_outputs,
        outputs=[download_translated_file]
    )
# Launch the app only when this file is run as a script, not when it is imported
if __name__ == "__main__":
    demo.launch()