# Wikipedia Translator - Gradio app (copied from a Hugging Face Space page; status shown there: Running)
import gradio as gr | |
from utils import (extract_wiki_id, get_wiki_details, | |
init_llm_client, split_content_into_sections, | |
get_translate_prompt) | |
import json | |
import json_repair | |
import os | |
import tempfile | |
# Translation targets offered in the UI, as (display name, language code) pairs.
# Order matters: it is the order in which the choices are listed.
_LANGUAGE_TABLE = (
    ("Arabic", "ar"),
    ("Arabic-Extended", "ar-x-extended"),
    ("English", "en"),
    ("Spanish", "es"),
    ("French", "fr"),
    ("German", "de"),
    ("Italian", "it"),
    ("Portuguese", "pt"),
    ("Russian", "ru"),
    ("Japanese", "ja"),
    ("Chinese", "zh"),
    ("Hindi", "hi"),
    ("Korean", "ko"),
    # Selecting "Custom" makes the app use the free-text language field instead.
    ("Custom", "custom"),
)
LANGUAGES = dict(_LANGUAGE_TABLE)

# Module-level placeholders; both are rebound to Markdown components
# when the Gradio layout is built further down.
debug_display = None
debug_header = None
def extract_wikipedia_content(wiki_url, api_key, model_id, base_url, target_lang, custom_lang, content_format, chunking):
    """
    Fetch a Wikipedia article and prepare its content for translation.

    Args:
        wiki_url: URL of the Wikipedia article.
        api_key, model_id, base_url, target_lang, custom_lang: Accepted because
            the UI passes them, but not used during extraction.
        content_format: "XML" or "Text"; selects which representation of the
            article is split / returned as section content.
        chunking: When truthy, the article is split into sections; otherwise
            the whole article is returned as one "Full Article" section.

    Returns:
        A 6-tuple ``(status_message, pageid, title, summary, wiki_xml, sections)``.
        On failure the status message describes the problem, the middle four
        items are ``None`` and ``sections`` is an empty dict.
    """
    failure_tail = (None, None, None, None, {})

    wiki_id = extract_wiki_id(wiki_url)
    if not wiki_id:
        return ("Invalid Wikipedia URL. Please check the URL and try again.",) + failure_tail

    # Get the details of the Wikipedia article
    wiki_details = get_wiki_details(wiki_id)
    if not wiki_details:
        # Report through the status field instead of failing on the subscripts below.
        return ("Could not retrieve the Wikipedia article. Please check the URL and try again.",) + failure_tail

    if chunking:
        # Split content into sections when chunking is enabled
        source_key = 'wiki_xml' if content_format == "XML" else 'content'
        content_sections = split_content_into_sections(wiki_details[source_key], content_format)
    else:
        # Use entire content as a single section when chunking is disabled
        source_key = 'content' if content_format == "Text" else 'wiki_xml'
        content_sections = {"Full Article": wiki_details[source_key]}

    return (
        "Extraction complete! Sections: " + str(len(content_sections)),
        wiki_details['pageid'],
        wiki_details['title'],
        wiki_details['summary'],
        wiki_details['wiki_xml'],
        content_sections,
    )
def translate_content(content, article_title, artice_summary, content_format,
                      target_lang, api_key, model_id, base_url, preference_prompt=None, debug_mode=False,
                      max_tokens=2000, temperature=0.5):
    """
    Translate one piece of article content with a chat-completion call.

    Args:
        content: Text (or XML) to translate.
        article_title: Title of the article, passed to the prompt builder.
        artice_summary: Article summary, passed to the prompt builder
            (parameter name kept as-is because callers pass it by keyword).
        content_format: Format label forwarded to the prompt builder.
        target_lang: Target language, already resolved by the caller.
        api_key, base_url: Used to create the LLM client.
        model_id: Model name sent with the request.
        preference_prompt: Optional extra instructions for the prompt builder.
        debug_mode: When true, debug details are returned alongside the text.
        max_tokens: Completion token limit for the request. Long sections can
            be cut off at the default, so callers may raise it.
        temperature: Sampling temperature for the request.

    Returns:
        The translated text (or an error string) when ``debug_mode`` is false;
        otherwise a ``(translation, debug_info)`` tuple where ``debug_info``
        holds the prompt, raw response, token usage and model name.
    """
    llm_client = init_llm_client(api_key, base_url=base_url)

    # Use the target_lang as is - it should already be properly resolved
    # by the calling function (either a language code or custom value)
    translation_prompt = get_translate_prompt(
        article_title=article_title,
        artice_summary=artice_summary,
        original_content=content,
        target_lang=target_lang,
        content_format=content_format,
        preference_prompt=preference_prompt
    )

    # Call the LLM to get the translation
    response = llm_client.chat.completions.create(
        model=model_id,
        messages=[
            {"role": "user", "content": translation_prompt}
        ],
        max_tokens=max_tokens,
        temperature=temperature
    )

    # The message content can be missing; treat that as an empty reply.
    raw_response = response.choices[0].message.content or ""
    decoded_object = json_repair.loads(raw_response) if raw_response else None

    # Only a dict can carry the expected key. On a plain string the `in`
    # test would be a substring match and the subscript would then fail.
    if isinstance(decoded_object, dict) and 'output_content' in decoded_object:
        translation = decoded_object['output_content']
    else:
        translation = "Error: Translation output not found in the response."

    if not debug_mode:
        return translation

    # NOTE(review): usage is read defensively in case a backend omits it - confirm.
    usage = getattr(response, "usage", None)
    debug_info = {
        "prompt": translation_prompt,
        "response": raw_response,
        "usage": {
            "prompt_tokens": getattr(usage, "prompt_tokens", None),
            "completion_tokens": getattr(usage, "completion_tokens", None),
            "total_tokens": getattr(usage, "total_tokens", None)
        },
        "model": model_id
    }
    return translation, debug_info
def translate_section(section_content, article_title, article_summary, content_format,
                      target_lang, custom_lang, api_key, model_id, base_url, preference_prompt=None, debug_mode=False):
    """
    Translate a single section of the Wikipedia article.

    Args:
        section_content: Text of the section to translate.
        article_title: Title of the article the section belongs to.
        article_summary: Summary of the article.
        content_format: Format label forwarded to the translation call.
        target_lang: Selected target language; the value "Custom" defers to
            ``custom_lang`` when that is non-empty.
        custom_lang: Free-text language used when "Custom" is selected.
        api_key, model_id, base_url: LLM connection settings.
        preference_prompt: Optional extra translation instructions.
        debug_mode: When true, debug details of the LLM call are returned.

    Returns:
        Always a ``(translation, debug_info)`` pair. ``debug_info`` is ``None``
        unless ``debug_mode`` is enabled and a translation was attempted.
    """
    if not section_content or not api_key:
        return "Please provide content and API key for translation.", None

    # Use custom language if selected
    actual_lang = custom_lang if (target_lang == "Custom" and custom_lang) else target_lang

    result = translate_content(
        content=section_content,
        article_title=article_title,
        artice_summary=article_summary,
        content_format=content_format,
        target_lang=actual_lang,
        api_key=api_key,
        model_id=model_id,
        base_url=base_url,
        preference_prompt=preference_prompt,
        debug_mode=debug_mode
    )

    if debug_mode:
        # In debug mode the translation call already returns a pair.
        translation, debug_info = result
        return translation, debug_info
    return result, None
def format_debug_info(debug_info):
    """
    Format debug information as markdown for display.

    Args:
        debug_info: Dict with the keys "model", "usage" (itself a dict with
            "prompt_tokens", "completion_tokens", "total_tokens"), "prompt"
            and "response", or a falsy value when nothing is available.

    Returns:
        A markdown string; a short notice when ``debug_info`` is falsy.
    """
    if not debug_info:
        return "No debug information available."

    def _fence_safe(text):
        # Triple backticks inside the text would close the surrounding code
        # fence early, so they are removed from both prompt and response.
        return str(text or "").replace('```', '')

    usage = debug_info['usage']
    lines = [
        "## LLM Debug Information\n\n",
        # Model and usage info
        f"### Model: {debug_info['model']}\n\n",
        "### Usage\n",
        f"- Prompt tokens: {usage['prompt_tokens']}\n",
        f"- Completion tokens: {usage['completion_tokens']}\n",
        f"- Total tokens: {usage['total_tokens']}\n\n",
        # Prompt
        "### Prompt\n",
        f"```\n{_fence_safe(debug_info['prompt'])}\n```\n\n",
        # Raw response
        "### Raw Response\n",
        f"```json\n{_fence_safe(debug_info['response'])}\n```\n",
    ]
    return "".join(lines)
# Functions to generate downloadable content for original and translated articles
def _download_escape(value):
    """Return *value* as text escaped for use in HTML/XML content and attributes."""
    import html
    return html.escape("" if value is None else str(value), quote=True)


def _download_filename(article_title, suffix):
    """Build a file name from the article title that is safe on common filesystems."""
    import re
    # Path separators and other reserved characters (e.g. the "/" in "AC/DC")
    # would otherwise produce an invalid or unintended path.
    stem = re.sub(r'[\\/:*?"<>|\x00-\x1f]+', "_", article_title or "").strip(" .")
    return f"{(stem or 'article')[:150]}{suffix}"


def _write_download(filename, content):
    """Write *content* to *filename* inside a fresh temp directory and return the path."""
    # A per-call directory keeps the readable file name while preventing two
    # exports of the same title from overwriting each other.
    target_dir = tempfile.mkdtemp(prefix="wiki_translator_")
    temp_path = os.path.join(target_dir, filename)
    with open(temp_path, 'w', encoding='utf-8') as f:
        f.write(content)
    return temp_path


def _render_html(article_title, article_summary, sections):
    """Render title, summary and sections as escaped HTML headings and paragraphs."""
    parts = [
        f"<h1>{_download_escape(article_title or 'Article')}</h1>",
        f"<p>{_download_escape(article_summary)}</p>",
    ]
    for title, text in sections.items():
        parts.append(f"<h2>{_download_escape(title)}</h2>")
        parts.append(f"<p>{_download_escape(text)}</p>")
    return "\n".join(parts)


def _render_json(article_title, article_summary, sections):
    """Render title, summary and sections as pretty-printed JSON."""
    obj = {"title": article_title, "summary": article_summary, "sections": sections}
    return json.dumps(obj, ensure_ascii=False, indent=2)


def _render_plain_text(article_title, article_summary, sections):
    """Render title, summary and sections as plain text with '##' section headings."""
    parts = [article_title or 'Article', article_summary or '']
    parts.extend(f"## {title}\n{text}" for title, text in sections.items())
    return "\n\n".join(parts)


def generate_download_original(download_format, article_title, article_summary, article_xml, sections):
    """
    Generate a downloadable original content file in the specified format.

    Args:
        download_format: "Wikipedia XML", "HTML" or "JSON"; any other value
            produces plain text.
        article_title: Article title; also used for the file name.
        article_summary: Article summary.
        article_xml: Article XML, written as-is for "Wikipedia XML".
        sections: Mapping of section title to section text (may be None).

    Returns:
        Path of the written file.
    """
    sections = sections or {}
    if download_format == "Wikipedia XML":
        content, suffix = article_xml or "", ".xml"
    elif download_format == "HTML":
        content, suffix = _render_html(article_title, article_summary, sections), ".html"
    elif download_format == "JSON":
        content, suffix = _render_json(article_title, article_summary, sections), ".json"
    else:  # Plain Text
        content, suffix = _render_plain_text(article_title, article_summary, sections), ".txt"
    return _write_download(_download_filename(article_title, suffix), content)


def generate_download_translated(download_format, article_title, article_summary, sections, *translations_values):
    """
    Generate downloadable translated content file from existing translations.

    Args:
        download_format: "Wikipedia XML", "HTML" or "JSON"; any other value
            produces plain text.
        article_title: Article title; also used for the file name.
        article_summary: Article summary.
        sections: Mapping whose keys (in order) are the section titles.
        *translations_values: Translated texts, positionally matched to the
            section titles; surplus values on either side are ignored.

    Returns:
        Path of the written file.
    """
    # Build translations dict from provided UI values
    translations = dict(zip(sections or {}, translations_values))

    # Build downloadable content
    if download_format == "Wikipedia XML":
        parts = [f"<article title=\"{_download_escape(article_title)}\">"]
        for title, text in translations.items():
            # Attribute values are escaped; the section body is embedded unchanged.
            # NOTE(review): the body may itself be XML markup - confirm it should stay raw.
            parts.append(f" <section title=\"{_download_escape(title)}\">{text}</section>")
        parts.append("</article>")
        content, suffix = "\n".join(parts), "_translated.xml"
    elif download_format == "HTML":
        content, suffix = _render_html(article_title, article_summary, translations), "_translated.html"
    elif download_format == "JSON":
        content, suffix = _render_json(article_title, article_summary, translations), "_translated.json"
    else:
        content, suffix = _render_plain_text(article_title, article_summary, translations), "_translated.txt"
    return _write_download(_download_filename(article_title, suffix), content)
def clean_section_title(title):
    """
    Clean a section title of HTML comments, HTML entities and extra whitespace.

    Args:
        title: Raw section title; ``None`` or an empty value yields "".

    Returns:
        The title with ``<!-- ... -->`` comments removed, entities such as
        ``&amp;`` decoded, and runs of whitespace collapsed to single spaces.
    """
    import html
    import re

    if not title:
        return ""
    # DOTALL so that comments spanning several lines are removed as well.
    without_comments = re.sub(r'<!--.*?-->', '', str(title), flags=re.DOTALL)
    decoded = html.unescape(without_comments)
    # split()/join collapses whitespace and drops leading/trailing blanks.
    return ' '.join(decoded.split())
# Update the pre-built section widgets with sections from Wikipedia content
def update_ui_with_sections(sections, max_sections=100):
    """
    Build the component updates that display the article sections.

    Args:
        sections: Dictionary of section titles and content (may be None).
        max_sections: Number of section slots to produce updates for. The
            default of 100 matches the number of slots the UI pre-creates;
            sections beyond this count are not shown.

    Returns:
        A flat list with four updates per slot, in the order
        (section textbox, translate button, translation output, debug button).
    """
    # Materialise the items once instead of rebuilding the key list per slot.
    visible_sections = list((sections or {}).items())[:max_sections]

    results = []
    for section_title, section_content in visible_sections:
        # Clean the section title for display
        clean_title = clean_section_title(section_title)
        results.extend([
            gr.update(visible=True, value=section_content, label=f"Section: {clean_title}"),
            gr.update(visible=True),  # Translate button
            gr.update(visible=True, value="", label=f"Translation: {clean_title}"),  # Translation output
            gr.update(visible=False),  # Debug button (hidden by default)
        ])

    # Hide the four components of every unused slot
    for _ in range(max_sections - len(visible_sections)):
        results.extend(gr.update(visible=False) for _ in range(4))
    return results
# Create Gradio app
# NOTE(review): the original indentation of this block was lost, so the nesting
# of the layout containers below is reconstructed - confirm against the deployed UI.
with gr.Blocks(theme=gr.themes.Monochrome(), css="""
.odd-section { background-color: rgb(228 213 213); padding: 15px; border-radius: 8px; margin: 10px 0; }
""") as demo:
    gr.Markdown("# Wikipedia Translator")
    # Blank lines separate the paragraphs; in particular "---" must not directly
    # follow a text line, or Markdown treats that line as a heading.
    gr.Markdown(
        "**Translate Wikipedia articles into any language using AI**\n\n"
        "This tool helps you:\n\n"
        "- Extract content from any Wikipedia article\n"
        "- Translate it into your chosen language using OpenAI's models\n"
        "- Maintain the article's structure and formatting\n"
        "- Download the translated content in various formats\n\n"
        "Start by configuring your API settings in the sidebar and entering a Wikipedia URL below.\n\n"
        "---\n\n"
        "Developed by [Abu Bakr Soliman](https://www.linkedin.com/in/bakrianoo/)"
    )

    # State variables
    sections_state = gr.State({})
    sidebar_expanded = gr.State(True)  # Track sidebar state, default is expanded

    def toggle_sidebar(expanded):
        """Flip the sidebar state and return updates for the four affected outputs."""
        new_expanded = not expanded
        return (
            new_expanded,
            gr.update(visible=new_expanded),
            gr.update(scale=3 if not new_expanded else 2),
            gr.update(visible=not new_expanded)  # Control visibility of the show button
        )

    def toggle_custom_language(target_lang):
        """Show the custom language input only when "Custom" is selected."""
        return gr.update(visible=(target_lang == "Custom"))

    def debug_button_update(debug_info, debug_enabled):
        """Show a section's debug button only in debug mode and once debug info exists."""
        return gr.update(visible=bool(debug_enabled) and debug_info is not None)

    with gr.Row() as main_layout:
        # Sidebar for configuration
        with gr.Column(scale=1, visible=True) as sidebar:
            # Toggle button at the top of the sidebar
            sidebar_toggle = gr.Button("« Hide Sidebar", scale=0)
            gr.Markdown("### Configuration")
            with gr.Group():
                api_key = gr.Textbox(
                    label="OpenAI API Key",
                    placeholder="sk-...",
                    type="password",
                )
                model_id = gr.Textbox(
                    label="OpenAI Model ID",
                    placeholder="gpt-4.1-mini",
                    value="gpt-4.1-mini",
                )
                base_url = gr.Textbox(
                    label="OpenAI API Base URL (Optional)",
                    placeholder="https://api.openai.com/v1",
                    info="Leave default unless using a proxy"
                )
                target_language = gr.Dropdown(
                    choices=list(LANGUAGES.keys()),
                    value="Arabic",
                    label="Target Language",
                )
                custom_language = gr.Textbox(
                    label="Custom Language",
                    placeholder="Enter language name (e.g., Swedish, Dutch, etc.)",
                    visible=False,
                    info="Specify your desired language if not in the list above"
                )
                # Connect the dropdown to show/hide custom language input
                target_language.change(
                    fn=toggle_custom_language,
                    inputs=[target_language],
                    outputs=[custom_language]
                )
                # Chunking control before content format
                chunking = gr.Checkbox(
                    label="Enable Content Chunking",
                    value=False,
                    info="Split content into sections for individual translation"
                )
                content_format = gr.Radio(
                    choices=["Text", "XML"],
                    value="XML",
                    label="Content Format",
                    info="Choose how to display article content"
                )
                # Debug mode toggle
                debug_mode = gr.Checkbox(
                    label="Debug Mode",
                    value=False,
                    info="Show detailed information about LLM calls"
                )
            # Preference prompt section
            gr.Markdown("### Translation Preferences")
            preference_prompt = gr.Textbox(
                label="Additional Translation Preferences",
                placeholder="Enter any specific translation preferences or instructions...",
                lines=5,
                info="Optional: Add specific preferences for how the translation should be performed"
            )
            with gr.Accordion("About", open=False):
                gr.Markdown(
                    "This tool extracts content from Wikipedia articles and translates them "
                    "into your selected language using OpenAI's language models.\n\n"
                    "1. Configure your API settings\n"
                    "2. Enter a Wikipedia URL\n"
                    "3. Click Extract to process the article\n\n"
                    "---\n\n"
                    "Developed by [Abu Bakr Soliman](https://www.linkedin.com/in/bakrianoo/)"
                )

        # Main content area
        with gr.Column(scale=2) as main_content:
            # Button that brings the sidebar back while it is hidden
            with gr.Row():
                sidebar_show_btn = gr.Button("» Show Sidebar", visible=False, scale=0)
            with gr.Column(scale=1):
                gr.Markdown("### Wikipedia Article")
                wiki_url = gr.Textbox(
                    label="Wikipedia URL",
                    placeholder="https://en.wikipedia.org/wiki/Artificial_intelligence",
                    info="Enter the full URL of the Wikipedia article"
                )
                extract_button = gr.Button("Extract and Prepare for Translation", variant="primary")
                output = gr.Markdown(label="Status")
                # Results area
                article_pageid = gr.Textbox(
                    label="Article Page ID",
                    placeholder="Page ID will appear here after extraction",
                    interactive=False,
                    show_copy_button=True
                )
                article_title = gr.Textbox(
                    label="Article Title",
                    placeholder="Title will appear here after extraction",
                    interactive=False,
                    show_copy_button=True
                )
                article_summary = gr.Textbox(
                    label="Article Summary",
                    placeholder="Summary will appear here after extraction",
                    interactive=False,
                    show_copy_button=True
                )
                article_xml = gr.Textbox(
                    label="Article XML",
                    placeholder="XML will appear here after extraction",
                    interactive=False,
                    visible=False,  # Hidden by default as it's usually large
                    show_copy_button=True
                )

            # Pre-define section textboxes and related components
            gr.Markdown("### Article Sections")
            with gr.Column() as sections_container:
                # Flat list, four entries per section in the order
                # (section textbox, translate button, translation output, debug button);
                # update_ui_with_sections returns its updates in the same order.
                section_components = []
                # One debug-info state per section, so that a section's debug button
                # shows that section's LLM call rather than the most recent one.
                section_debug_states = []
                for i in range(100):  # Support up to 100 sections
                    # Every other section gets the shaded "odd-section" style
                    with gr.Column(elem_classes=["odd-section"] if i % 2 == 0 else []) as section:
                        # Section content
                        section_textbox = gr.Textbox(visible=False, lines=4, show_copy_button=True)
                        with gr.Row():  # Controls row
                            translate_btn = gr.Button("Translate", visible=False)
                            debug_btn = gr.Button("View Debug Info", visible=False)
                        # Translation output
                        translation_output = gr.Textbox(visible=False, lines=4, show_copy_button=True)
                        # Separator
                        gr.Markdown("---", visible=False)

                    section_debug_state = gr.State(None)
                    section_components.extend([section_textbox, translate_btn, translation_output, debug_btn])
                    section_debug_states.append(section_debug_state)

                    # Connect the translate button to the translation function
                    translate_btn.click(
                        fn=translate_section,
                        inputs=[
                            section_textbox,
                            article_title,
                            article_summary,
                            content_format,
                            target_language,
                            custom_language,
                            api_key,
                            model_id,
                            base_url,
                            preference_prompt,
                            debug_mode
                        ],
                        outputs=[translation_output, section_debug_state]
                    ).then(
                        # Show debug button only when debug mode is on and after translation
                        fn=debug_button_update,
                        inputs=[section_debug_state, debug_mode],
                        outputs=[debug_btn]
                    )

    # Connect the extract button to the function
    extract_button.click(
        fn=extract_wikipedia_content,
        inputs=[wiki_url, api_key, model_id, base_url, target_language, custom_language, content_format, chunking],
        outputs=[
            output,
            article_pageid,
            article_title,
            article_summary,
            article_xml,
            sections_state,
        ]
    ).then(
        fn=update_ui_with_sections,
        inputs=[sections_state],
        outputs=section_components
    )

    # Connect the sidebar toggle buttons (both run the same toggle)
    for toggle_button in (sidebar_toggle, sidebar_show_btn):
        toggle_button.click(
            fn=toggle_sidebar,
            inputs=[sidebar_expanded],
            outputs=[sidebar_expanded, sidebar, main_content, sidebar_show_btn]
        )

    # Add download options to the bottom of the sidebar
    with sidebar:
        download_format = gr.Dropdown(
            choices=["Wikipedia XML", "HTML", "JSON", "Plain Text"],
            value="Wikipedia XML",
            label="Download Format"
        )
        download_original_btn = gr.Button("Download Original")
        download_original_file = gr.File(label="Original Article")
        download_translated_btn = gr.Button("Download Translated")
        download_translated_file = gr.File(label="Translated Article")
        # Debug info display
        debug_header = gr.Markdown("### Debug Information", visible=False)
        debug_display = gr.Markdown(visible=False)

    # Each debug button renders its own section's debug info in the sidebar,
    # then reveals the debug header and display.
    debug_buttons = section_components[3::4]  # The debug button is the 4th component of each slot
    for debug_btn, section_debug_state in zip(debug_buttons, section_debug_states):
        debug_btn.click(
            fn=format_debug_info,
            inputs=[section_debug_state],
            outputs=[debug_display]
        ).then(
            fn=lambda: (gr.update(visible=True), gr.update(visible=True)),
            outputs=[debug_header, debug_display]
        )

    # Connect download buttons
    download_original_btn.click(
        fn=generate_download_original,
        inputs=[download_format, article_title, article_summary, article_xml, sections_state],
        outputs=[download_original_file]
    )

    # Existing translation outputs (3rd component of each slot) feed the translated download
    translation_outputs = section_components[2::4]
    download_translated_btn.click(
        fn=generate_download_translated,
        inputs=[download_format, article_title, article_summary, sections_state] + translation_outputs,
        outputs=[download_translated_file]
    )

# Launch the app
if __name__ == "__main__":
    demo.launch()