# Hugging Face Spaces status: Running
import html
import json
import os
import re
import tempfile

import gradio as gr
import json_repair

from utils import (extract_wiki_id, get_wiki_details,
                   init_llm_client, split_content_into_sections,
                   get_translate_prompt)
# Translation targets offered in the UI: display name -> language code.
LANGUAGES = dict(
    [
        ("Arabic", "ar"),
        ("Arabic-Extended", "ar-x-extended"),
        ("English", "en"),
        ("Spanish", "es"),
        ("French", "fr"),
        ("German", "de"),
        ("Italian", "it"),
        ("Portuguese", "pt"),
        ("Russian", "ru"),
        ("Japanese", "ja"),
        ("Chinese", "zh"),
        ("Hindi", "hi"),
        ("Korean", "ko"),
        # Sentinel entry: the handlers compare the selection against "Custom"
        # and then use the free-text language name instead.
        ("Custom", "custom"),
    ]
)

# Module-level placeholders; both names are rebound to Gradio components
# while the UI is being built further down the file.
debug_display = debug_header = None
def extract_wikipedia_content(wiki_url, api_key, model_id, base_url, target_lang,
                              custom_lang, content_format, chunking):
    """Fetch a Wikipedia article and prepare its sections for translation.

    Args:
        wiki_url: URL of the Wikipedia article to extract.
        api_key: Unused here; accepted because the UI passes it.
        model_id: Unused here; accepted because the UI passes it.
        base_url: Unused here; accepted because the UI passes it.
        target_lang: Unused here; accepted because the UI passes it.
        custom_lang: Unused here; accepted because the UI passes it.
        content_format: "XML" or "Text"; selects which representation of the
            article is placed in the returned sections.
        chunking: When truthy the article is split into sections, otherwise
            the whole article is returned as one "Full Article" section.

    Returns:
        A 6-tuple ``(status, pageid, title, summary, wiki_xml, sections)``.
        On failure the status carries the error message, the four article
        fields are ``None`` and ``sections`` is an empty dict.
    """
    wiki_id = extract_wiki_id(wiki_url)
    if not wiki_id:
        return ("Invalid Wikipedia URL. Please check the URL and try again.",
                None, None, None, None, {})

    # Get the details of the Wikipedia article
    wiki_details = get_wiki_details(wiki_id)
    if not wiki_details:
        # Without this guard a failed lookup would surface as a TypeError
        # when the result is subscripted below.
        return ("Could not retrieve the Wikipedia article. Please check the URL and try again.",
                None, None, None, None, {})

    if chunking:
        # Split the selected representation into individually translatable sections.
        source = wiki_details['wiki_xml'] if content_format == "XML" else wiki_details['content']
        content_sections = split_content_into_sections(source, content_format)
    else:
        # Chunking disabled: the entire article becomes a single section.
        whole = wiki_details['content'] if content_format == "Text" else wiki_details['wiki_xml']
        content_sections = {"Full Article": whole}

    return (
        "Extraction complete! Sections: " + str(len(content_sections)),
        wiki_details['pageid'],
        wiki_details['title'],
        wiki_details['summary'],
        wiki_details['wiki_xml'],
        content_sections,
    )
def translate_content(content, article_title, artice_summary, content_format,
                      target_lang, api_key, model_id, base_url, preference_prompt=None,
                      debug_mode=False, max_tokens=2000, temperature=0.5):
    """Translate one piece of article content with the configured LLM.

    Args:
        content: The text or XML to translate.
        article_title: Title of the article, passed to the prompt builder.
        artice_summary: Summary of the article, passed to the prompt builder
            (the misspelled name is part of the existing interface).
        content_format: Format label forwarded to the prompt builder.
        target_lang: Target language, already resolved by the caller.
        api_key: API key used to create the LLM client.
        model_id: Model identifier sent with the completion request.
        base_url: Base URL forwarded to the LLM client factory.
        preference_prompt: Optional extra instructions for the translation.
        debug_mode: When true, debug details are returned with the translation.
        max_tokens: Completion token limit for the request.
        temperature: Sampling temperature for the request.

    Returns:
        The translated text, or an error message when the model reply does not
        contain an ``output_content`` field. With ``debug_mode`` enabled the
        return value is ``(translation_or_error, debug_info)`` where
        ``debug_info`` holds the prompt, raw response, token usage and model.
    """
    llm_client = init_llm_client(api_key, base_url=base_url)

    # target_lang is used as-is: the caller has already resolved it to either
    # a listed language or the user's custom value.
    translation_prompt = get_translate_prompt(
        article_title=article_title,
        artice_summary=artice_summary,
        original_content=content,
        target_lang=target_lang,
        content_format=content_format,
        preference_prompt=preference_prompt
    )

    response = llm_client.chat.completions.create(
        model=model_id,
        messages=[
            {"role": "user", "content": translation_prompt}
        ],
        max_tokens=max_tokens,
        temperature=temperature
    )
    raw_response = response.choices[0].message.content

    # json_repair can yield a string or list for malformed replies; only a
    # dict can legitimately carry the 'output_content' field.
    decoded_object = json_repair.loads(raw_response or "")
    if isinstance(decoded_object, dict) and 'output_content' in decoded_object:
        translation = decoded_object['output_content']
    else:
        translation = "Error: Translation output not found in the response."

    if not debug_mode:
        return translation

    # Some OpenAI-compatible backends omit usage accounting; report None
    # for the counters instead of failing the whole translation.
    usage = getattr(response, "usage", None)
    debug_info = {
        "prompt": translation_prompt,
        "response": raw_response,
        "usage": {
            "prompt_tokens": getattr(usage, "prompt_tokens", None),
            "completion_tokens": getattr(usage, "completion_tokens", None),
            "total_tokens": getattr(usage, "total_tokens", None)
        },
        "model": model_id
    }
    return translation, debug_info
def translate_section(section_content, article_title, article_summary, content_format,
                      target_lang, custom_lang, api_key, model_id, base_url,
                      preference_prompt=None, debug_mode=False):
    """Translate a single section of the Wikipedia article.

    Args:
        section_content: Text or XML of the section to translate.
        article_title: Title of the article the section belongs to.
        article_summary: Summary of the article.
        content_format: Format label forwarded to the translator.
        target_lang: Selected language name, or "Custom".
        custom_lang: Free-text language name, used when target_lang is "Custom".
        api_key: API key for the LLM client.
        model_id: Model identifier.
        base_url: Base URL for the LLM client.
        preference_prompt: Optional extra translation instructions.
        debug_mode: When true, debug details are returned as the second item.

    Returns:
        A 2-tuple ``(translation_or_message, debug_info)``. ``debug_info`` is
        ``None`` unless ``debug_mode`` is enabled and a translation was run.
    """
    if not section_content or not api_key:
        return "Please provide content and API key for translation.", None

    # Resolve the language actually sent to the model.
    if target_lang == "Custom":
        actual_lang = (custom_lang or "").strip()
        if not actual_lang:
            # Otherwise the literal word "Custom" would be sent as the language.
            return "Please enter a custom language name before translating.", None
    else:
        actual_lang = target_lang

    result = translate_content(
        content=section_content,
        article_title=article_title,
        artice_summary=article_summary,
        content_format=content_format,
        target_lang=actual_lang,
        api_key=api_key,
        model_id=model_id,
        base_url=base_url,
        preference_prompt=preference_prompt,
        debug_mode=debug_mode
    )

    if debug_mode:
        # In debug mode translate_content returns (translation, debug_info).
        translation, debug_info = result
        return translation, debug_info
    return result, None
def format_debug_info(debug_info):
    """Format LLM debug information as markdown for display.

    Args:
        debug_info: Dict with the keys ``model``, ``usage`` (holding
            ``prompt_tokens``, ``completion_tokens`` and ``total_tokens``),
            ``prompt`` and ``response``; or a falsy value.

    Returns:
        A markdown string, or a fixed notice when ``debug_info`` is falsy.
    """
    if not debug_info:
        return "No debug information available."

    usage = debug_info['usage']
    prompt_text = debug_info['prompt'].replace('```', '')
    response_text = str(debug_info['response'])

    # Model replies are often wrapped in their own ``` fence, which would end
    # the surrounding code block early. Use a fence longer than any backtick
    # run in the reply so the raw response is shown unmodified.
    longest_run = max((len(run) for run in re.findall(r'`+', response_text)), default=0)
    fence = '`' * max(3, longest_run + 1)

    parts = [
        "## LLM Debug Information\n\n",
        # Model and usage info
        f"### Model: {debug_info['model']}\n\n",
        "### Usage\n",
        f"- Prompt tokens: {usage['prompt_tokens']}\n",
        f"- Completion tokens: {usage['completion_tokens']}\n",
        f"- Total tokens: {usage['total_tokens']}\n\n",
        # Prompt
        "### Prompt\n",
        f"```\n{prompt_text}\n```\n\n",
        # Raw response
        "### Raw Response\n",
        f"{fence}json\n{response_text}\n{fence}\n",
    ]
    return "".join(parts)
# Functions to generate downloadable content for original and translated articles
def _safe_filename(title, fallback="article"):
    """Reduce *title* to a single safe path component, or return *fallback*."""
    # Path separators and other reserved characters would otherwise let a
    # title such as "AC/DC" point outside the intended directory.
    name = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', str(title or '')).strip(' .')
    return name or fallback


def _render_download(download_format, article_title, article_summary, sections):
    """Render title, summary and sections as HTML, JSON or plain text.

    Returns a ``(content, file_extension)`` pair.
    """
    if download_format == "HTML":
        parts = [f"<h1>{html.escape(article_title or '')}</h1>",
                 f"<p>{html.escape(article_summary or '')}</p>"]
        for title, text in sections.items():
            parts.append(f"<h2>{html.escape(str(title))}</h2>")
            parts.append(f"<p>{html.escape(str(text or ''))}</p>")
        return "\n".join(parts), "html"
    if download_format == "JSON":
        obj = {"title": article_title, "summary": article_summary, "sections": sections}
        return json.dumps(obj, ensure_ascii=False, indent=2), "json"
    # Plain Text
    parts = [article_title or 'Article', article_summary or '']
    parts.extend(f"## {title}\n{text}" for title, text in sections.items())
    return "\n\n".join(parts), "txt"


def _write_download(content, filename):
    """Write *content* to *filename* in a fresh temp directory; return the path."""
    # A per-call directory keeps the readable filename while preventing
    # concurrent downloads of the same article from overwriting each other.
    temp_path = os.path.join(tempfile.mkdtemp(prefix="wiki_translator_"), filename)
    with open(temp_path, 'w', encoding='utf-8') as f:
        f.write(content)
    return temp_path


def generate_download_original(download_format, article_title, article_summary, article_xml, sections):
    """Generate a downloadable file of the original article.

    Args:
        download_format: "Wikipedia XML", "HTML", "JSON"; anything else
            produces plain text.
        article_title: Article title; also used (sanitised) as the file name.
        article_summary: Article summary.
        article_xml: Raw article XML, written as-is for "Wikipedia XML".
        sections: Mapping of section title to section content (may be None).

    Returns:
        Path of the written temporary file.
    """
    sections = sections or {}
    if download_format == "Wikipedia XML":
        content, extension = article_xml or "", "xml"
    else:
        content, extension = _render_download(download_format, article_title,
                                              article_summary, sections)
    return _write_download(content, f"{_safe_filename(article_title)}.{extension}")


def generate_download_translated(download_format, article_title, article_summary, sections, *translations_values):
    """Generate a downloadable file from the translations currently in the UI.

    Args:
        download_format: "Wikipedia XML", "HTML", "JSON"; anything else
            produces plain text.
        article_title: Article title; also used (sanitised) as the file name.
        article_summary: Article summary.
        sections: Mapping of section title to original content (may be None);
            only its keys and their order are used.
        *translations_values: Translated texts, positionally matched to the
            section titles. Surplus values are ignored.

    Returns:
        Path of the written temporary file.
    """
    sections = sections or {}
    # Pair titles with translations by position; zip stops at the shorter one.
    translations = dict(zip(sections, translations_values))

    if download_format == "Wikipedia XML":
        title_attr = html.escape(article_title or '', quote=True)
        parts = [f"<article title=\"{title_attr}\">"]
        for title, text in translations.items():
            section_attr = html.escape(str(title), quote=True)
            # Section bodies are embedded unchanged: with the XML content
            # format they are markup themselves.
            parts.append(f" <section title=\"{section_attr}\">{text}</section>")
        parts.append("</article>")
        content, extension = "\n".join(parts), "xml"
    else:
        content, extension = _render_download(download_format, article_title,
                                              article_summary, translations)
    return _write_download(content, f"{_safe_filename(article_title)}_translated.{extension}")
def clean_section_title(title):
    """Clean a section title for display.

    HTML comments are removed (including ones spanning several lines), HTML
    entities are decoded, and runs of whitespace are collapsed to single
    spaces with leading and trailing whitespace dropped.

    Args:
        title: Raw section title; ``None`` or an empty string yields ``""``.

    Returns:
        The cleaned title.
    """
    if not title:
        return ""
    # DOTALL lets '.' cross newlines so multi-line comments are removed too.
    without_comments = re.sub(r'<!--.*?-->', '', title, flags=re.DOTALL)
    # split()/join collapses whitespace and trims both ends in one step.
    return ' '.join(html.unescape(without_comments).split())
# Update the pre-built section components with sections from Wikipedia content
def update_ui_with_sections(sections, max_sections=100):
    """Build the component updates that display the article's sections.

    Four updates are produced per section slot, in this order: section
    textbox, translate button, translation output, debug button.

    Args:
        sections: Dictionary of section titles and content (may be None).
        max_sections: Number of section slots to produce updates for. Sections
            beyond this count are not shown.

    Returns:
        List of ``4 * max_sections`` updates: visible, populated components
        for each section, followed by hidden components for the unused slots.
    """
    # Materialise once instead of rebuilding the key list on every iteration.
    shown = list((sections or {}).items())[:max_sections]

    results = []
    for section_title, section_content in shown:
        # Clean the section title for display
        clean_title = clean_section_title(section_title)
        results.extend([
            gr.update(visible=True, value=section_content, label=f"Section: {clean_title}"),
            gr.update(visible=True),  # Translate button
            gr.update(visible=True, value="", label=f"Translation: {clean_title}"),  # Translation output
            gr.update(visible=False)  # Debug button (hidden by default)
        ])

    # Hide the four components of every unused slot.
    unused_slots = max_sections - len(shown)
    results.extend(gr.update(visible=False) for _ in range(4 * unused_slots))
    return results
| # Create Gradio app | |
| # Everything from here to the launch guard builds the UI inside one gr.Blocks context: | |
| # intro text, a configuration sidebar, the main extraction/translation area with 100 | |
| # pre-created section slots, and the event wiring between them. | |
| # NOTE(review): leading indentation is not visible in this copy of the file, so the exact | |
| # nesting of the layout contexts (Row/Column/Group/Accordion) should be confirmed. | |
| with gr.Blocks(theme=gr.themes.Monochrome(), css=""" | |
| .odd-section { background-color: rgb(228 213 213); padding: 15px; border-radius: 8px; margin: 10px 0; } | |
| """) as demo: | |
| gr.Markdown("# Wikipedia Translator") | |
| gr.Markdown(""" | |
| **Translate Wikipedia articles into any language using AI** | |
| This tool helps you: | |
| - Extract content from any Wikipedia article | |
| - Translate it into your chosen language using OpenAI's models | |
| - Maintain the article's structure and formatting | |
| - Download the translated content in various formats | |
| Start by configuring your API settings in the sidebar and entering a Wikipedia URL below. | |
| --- | |
| Developed by [Abu Bakr Soliman](https://www.linkedin.com/in/bakrianoo/) | |
| """) | |
| # State variables | |
| # sections_state holds the title -> content dict returned by extract_wikipedia_content | |
| sections_state = gr.State({}) | |
| sidebar_expanded = gr.State(True) # Track sidebar state, default is expanded | |
| def toggle_sidebar(expanded): | |
| """Toggle the sidebar visibility""" | |
| # The returned tuple is ordered to match the handlers' outputs: | |
| # [sidebar_expanded, sidebar, main_content, sidebar_show_btn] | |
| new_expanded = not expanded | |
| return ( | |
| new_expanded, | |
| gr.update(visible=new_expanded), | |
| gr.update(scale=3 if not new_expanded else 2), | |
| gr.update(visible=not new_expanded) # Control visibility of the show button | |
| ) | |
| # Function to show/hide custom language input based on selection | |
| def toggle_custom_language(target_lang): | |
| if target_lang == "Custom": | |
| return gr.update(visible=True) | |
| return gr.update(visible=False) | |
| with gr.Row() as main_layout: | |
| # Sidebar for configuration | |
| with gr.Column(scale=1, visible=True) as sidebar: | |
| # Add a toggle button at the top of the sidebar with updated icon | |
| sidebar_toggle = gr.Button("« Hide Sidebar", scale=0) | |
| gr.Markdown("### Configuration") | |
| with gr.Group(): | |
| api_key = gr.Textbox( | |
| label="OpenAI API Key", | |
| placeholder="sk-...", | |
| type="password", | |
| ) | |
| model_id = gr.Textbox( | |
| label="OpenAI Model ID", | |
| placeholder="gpt-4.1-mini", | |
| value="gpt-4.1-mini", | |
| ) | |
| base_url = gr.Textbox( | |
| label="OpenAI API Base URL (Optional)", | |
| placeholder="https://api.openai.com/v1", | |
| info="Leave default unless using a proxy" | |
| ) | |
| # Choices are the display names (keys) of LANGUAGES | |
| target_language = gr.Dropdown( | |
| choices=list(LANGUAGES.keys()), | |
| value="Arabic", | |
| label="Target Language", | |
| ) | |
| custom_language = gr.Textbox( | |
| label="Custom Language", | |
| placeholder="Enter language name (e.g., Swedish, Dutch, etc.)", | |
| visible=False, | |
| info="Specify your desired language if not in the list above" | |
| ) | |
| # Connect the dropdown to show/hide custom language input | |
| target_language.change( | |
| fn=toggle_custom_language, | |
| inputs=[target_language], | |
| outputs=[custom_language] | |
| ) | |
| # Add chunking control before content format | |
| chunking = gr.Checkbox( | |
| label="Enable Content Chunking", | |
| value=False, | |
| info="Split content into sections for individual translation" | |
| ) | |
| content_format = gr.Radio( | |
| choices=["Text", "XML"], | |
| value="XML", | |
| label="Content Format", | |
| info="Choose how to display article content" | |
| ) | |
| # Debug mode toggle | |
| debug_mode = gr.Checkbox( | |
| label="Debug Mode", | |
| value=False, | |
| info="Show detailed information about LLM calls" | |
| ) | |
| # Add preference prompt section | |
| gr.Markdown("### Translation Preferences") | |
| preference_prompt = gr.Textbox( | |
| label="Additional Translation Preferences", | |
| placeholder="Enter any specific translation preferences or instructions...", | |
| lines=5, | |
| info="Optional: Add specific preferences for how the translation should be performed" | |
| ) | |
| # Replace static About section with Accordion | |
| with gr.Accordion("About", open=False): | |
| gr.Markdown(""" | |
| This tool extracts content from Wikipedia articles and translates them into your selected language using OpenAI's language models. | |
| 1. Configure your API settings | |
| 2. Enter a Wikipedia URL | |
| 3. Click Extract to process the article | |
| --- | |
| Developed by [Abu Bakr Soliman](https://www.linkedin.com/in/bakrianoo/) | |
| """) | |
| # Main content area | |
| with gr.Column(scale=2) as main_content: | |
| # Show sidebar toggle button when sidebar is hidden (updated icon) | |
| with gr.Row(): | |
| sidebar_show_btn = gr.Button("» Show Sidebar", visible=False, scale=0) | |
| with gr.Column(scale=1): | |
| gr.Markdown("### Wikipedia Article") | |
| wiki_url = gr.Textbox( | |
| label="Wikipedia URL", | |
| placeholder="https://en.wikipedia.org/wiki/Artificial_intelligence", | |
| info="Enter the full URL of the Wikipedia article" | |
| ) | |
| extract_button = gr.Button("Extract and Prepare for Translation", variant="primary") | |
| output = gr.Markdown(label="Status") | |
| # Results area (will expand in the future) | |
| article_pageid = gr.Textbox( | |
| label="Article Page ID", | |
| placeholder="Page ID will appear here after extraction", | |
| interactive=False, | |
| show_copy_button=True | |
| ) | |
| article_title = gr.Textbox( | |
| label="Article Title", | |
| placeholder="Title will appear here after extraction", | |
| interactive=False, | |
| show_copy_button=True | |
| ) | |
| # NOTE: the variable name is spelled "aticle_summary" (sic) everywhere it is used below | |
| aticle_summary = gr.Textbox( | |
| label="Article Summary", | |
| placeholder="Summary will appear here after extraction", | |
| interactive=False, | |
| show_copy_button=True | |
| ) | |
| article_xml = gr.Textbox( | |
| label="Article XML", | |
| placeholder="XML will appear here after extraction", | |
| interactive=False, | |
| visible=False, # Hidden by default as it's usually large | |
| show_copy_button=True | |
| ) | |
| # Debug info state and modal components | |
| # A single State is shared by all sections: every translate button writes to it and | |
| # every debug button reads it, so it always holds the most recent translation's info. | |
| debug_info_state = gr.State(None) | |
| # Debug info is rendered only in the sidebar (debug_display, created further below), not in the main area | |
| # Pre-define section textboxes and related components | |
| gr.Markdown("### Article Sections") | |
| with gr.Column() as sections_container: | |
| section_components = [] | |
| # The slot count must match the range(100) used in update_ui_with_sections | |
| for i in range(100): # Support up to 100 sections | |
| with gr.Column(elem_classes=["odd-section"] if i % 2 == 0 else []) as section: # "odd-section" styling goes on even indices i (1st, 3rd, ... section) | |
| # Section content | |
| section_textbox = gr.Textbox(visible=False, lines=4, show_copy_button=True) | |
| with gr.Row(): # Controls row | |
| translate_btn = gr.Button("Translate", visible=False) | |
| debug_btn = gr.Button("View Debug Info", visible=False) | |
| # Translation output | |
| translation_output = gr.Textbox(visible=False, lines=4, show_copy_button=True) | |
| # Add separator | |
| # The separator is created hidden and is not added to section_components, | |
| # so update_ui_with_sections never makes it visible. | |
| gr.Markdown("---", visible=False) | |
| # Four components per slot, in the same order update_ui_with_sections emits its updates | |
| section_components.extend([section_textbox, translate_btn, translation_output, debug_btn]) | |
| # Connect the translate button to the translation function | |
| # translate_section returns (translation, debug_info), matching the two outputs | |
| result = translate_btn.click( | |
| fn=translate_section, | |
| inputs=[ | |
| section_textbox, | |
| article_title, | |
| aticle_summary, | |
| content_format, | |
| target_language, | |
| custom_language, | |
| api_key, | |
| model_id, | |
| base_url, | |
| preference_prompt, | |
| debug_mode | |
| ], | |
| outputs=[translation_output, debug_info_state] | |
| ) | |
| # Show debug button only when debug mode is on and after translation | |
| result.then( | |
| fn=lambda debug_info, debug_mode: gr.update(visible=debug_mode and debug_info is not None), | |
| inputs=[debug_info_state, debug_mode], | |
| outputs=[debug_btn] | |
| ) | |
| # The debug buttons' click handlers are attached later, once the sidebar | |
| # debug components exist | |
| # Connect the extract button to the function | |
| # The six outputs match the 6-tuple returned by extract_wikipedia_content; the | |
| # chained step then pushes the extracted sections into the pre-built slots. | |
| extract_button.click( | |
| fn=extract_wikipedia_content, | |
| inputs=[wiki_url, api_key, model_id, base_url, target_language, custom_language, content_format, chunking], | |
| outputs=[ | |
| output, | |
| article_pageid, | |
| article_title, | |
| aticle_summary, | |
| article_xml, | |
| sections_state, | |
| ] | |
| ).then( | |
| fn=update_ui_with_sections, | |
| inputs=[sections_state], | |
| outputs=section_components | |
| ) | |
| # Connect the sidebar toggle buttons | |
| # Both buttons call the same toggle, which flips the stored expanded flag | |
| sidebar_toggle.click( | |
| fn=toggle_sidebar, | |
| inputs=[sidebar_expanded], | |
| outputs=[sidebar_expanded, sidebar, main_content, sidebar_show_btn] | |
| ) | |
| sidebar_show_btn.click( | |
| fn=toggle_sidebar, | |
| inputs=[sidebar_expanded], | |
| outputs=[sidebar_expanded, sidebar, main_content, sidebar_show_btn] | |
| ) | |
| # Add download options to the bottom of the sidebar | |
| # Re-enters the existing sidebar column so these components are added to it | |
| with sidebar: | |
| download_format = gr.Dropdown( | |
| choices=["Wikipedia XML", "HTML", "JSON", "Plain Text"], | |
| value="Wikipedia XML", | |
| label="Download Format" | |
| ) | |
| download_original_btn = gr.Button("Download Original") | |
| download_original_file = gr.File(label="Original Article") | |
| download_translated_btn = gr.Button("Download Translated") | |
| download_translated_file = gr.File(label="Translated Article") | |
| # Debug info display | |
| # These rebind the module-level debug_header / debug_display placeholders | |
| debug_header = gr.Markdown("### Debug Information", visible=False) | |
| debug_display = gr.Markdown(visible=False) | |
| # Update the debug button click handler to show debug info in the sidebar | |
| # section_components stores four components per slot, hence the step of 4 | |
| for i in range(0, len(section_components), 4): | |
| debug_btn = section_components[i+3] # The debug button is the 4th component | |
| # Connect debug button directly to show debug info only in the sidebar | |
| # First render the markdown into debug_display, then reveal header and display | |
| debug_btn.click( | |
| fn=format_debug_info, | |
| inputs=[debug_info_state], | |
| outputs=[debug_display] | |
| ).then( | |
| fn=lambda: (gr.update(visible=True), gr.update(visible=True)), | |
| outputs=[debug_header, debug_display] | |
| ) | |
| # Connect download buttons | |
| download_original_btn.click( | |
| fn=generate_download_original, | |
| inputs=[download_format, article_title, aticle_summary, article_xml, sections_state], | |
| outputs=[download_original_file] | |
| ) | |
| # Prepare existing translation outputs for download | |
| # The translation output is the 3rd component of each slot; all 100 are passed | |
| # positionally and received as *translations_values by generate_download_translated. | |
| translation_outputs = [section_components[i+2] for i in range(0, len(section_components), 4)] | |
| download_translated_btn.click( | |
| fn=generate_download_translated, | |
| inputs=[download_format, article_title, aticle_summary, sections_state] + translation_outputs, | |
| outputs=[download_translated_file] | |
| ) | |
| # Launch the app | |
| # Only start the server when the file is run as a script, not when it is imported | |
| if __name__ == "__main__": | |
| demo.launch() | |