Spaces:

bakrianoo
/

wikipedia-translator

Sleeping

App Files Files Community

wikipedia-translator / app.py

bakrianoo

setup llm parser

c25ce6b 4 months ago

raw

history blame

6.63 kB

	import gradio as gr
	from utils import (extract_wiki_id, get_wiki_details,
	init_llm_client, split_content_into_sections,
	get_translate_prompt)
	import json

	# Translation targets: display name -> two-letter language code.
	# Insertion order is preserved, so it is also the order of the keys.
	LANGUAGES = dict(
	    [
	        ("Arabic", "ar"),
	        ("English", "en"),
	        ("Spanish", "es"),
	        ("French", "fr"),
	        ("German", "de"),
	        ("Italian", "it"),
	        ("Portuguese", "pt"),
	        ("Russian", "ru"),
	        ("Japanese", "ja"),
	        ("Chinese", "zh"),
	        ("Hindi", "hi"),
	        ("Korean", "ko"),
	    ]
	)

	def extract_wikipedia_content(wiki_url, api_key, model_id, base_url, target_lang):
	    """
	    Fetch a Wikipedia article and split its content into sections.

	    Resolves ``wiki_url`` to an article id with ``extract_wiki_id``, loads the
	    article through ``get_wiki_details`` and splits its ``wiki_xml`` with
	    ``split_content_into_sections``.

	    Args:
	        wiki_url: URL of the Wikipedia article to extract.
	        api_key: Not used by this function.
	        model_id: Not used by this function.
	        base_url: Not used by this function.
	        target_lang: Not used by this function.

	    Returns:
	        A 6-tuple ``(status, pageid, title, summary, wiki_xml, sections)``.
	        On failure ``status`` carries an error message, the four article
	        fields are ``None`` and ``sections`` is an empty dict.
	    """
	    # Shape of the non-status part of the result when nothing could be extracted.
	    no_article = (None, None, None, None, {})

	    wiki_id = extract_wiki_id(wiki_url)
	    if not wiki_id:
	        return ("Invalid Wikipedia URL. Please check the URL and try again.", *no_article)

	    # Get the details of the Wikipedia article
	    wiki_details = get_wiki_details(wiki_id)
	    # NOTE(review): assumes get_wiki_details reports a failed lookup with a
	    # falsy value (None / empty dict) - confirm against utils. Without this
	    # guard the subscripts below would raise on such a value.
	    if not wiki_details:
	        return ("Could not load the Wikipedia article. Please check the URL and try again.", *no_article)

	    content_sections = split_content_into_sections(wiki_details['wiki_xml'])

	    return (
	        f"Extraction complete! Sections: {len(content_sections)}",
	        wiki_details['pageid'],
	        wiki_details['title'],
	        wiki_details['summary'],
	        wiki_details['wiki_xml'],
	        content_sections,
	    )

	def translate_content(content, article_title, artice_summary, target_lang, api_key, model_id, base_url):
	    """
	    Translate a piece of article content with the configured LLM.

	    Builds the prompt with ``get_translate_prompt`` and sends it as a single
	    user message to the client returned by ``init_llm_client``.

	    Args:
	        content: Original text to translate.
	        article_title: Title of the article, passed to the prompt builder.
	        artice_summary: Summary of the article, passed to the prompt builder
	            (the parameter name is kept as-is for existing callers).
	        target_lang: Target language, passed to the prompt builder.
	        api_key: API key forwarded to ``init_llm_client``.
	        model_id: Model identifier; used for the client and for the request.
	        base_url: API base URL forwarded to ``init_llm_client``.

	    Returns:
	        The text content of the first choice in the model's reply.
	    """
	    llm_client = init_llm_client(api_key, model_id, base_url)

	    translation_prompt = get_translate_prompt(
	        article_title=article_title,
	        artice_summary=artice_summary,
	        original_content=content,
	        target_lang=target_lang,
	    )

	    # NOTE(review): assumes init_llm_client returns an OpenAI-compatible
	    # client - confirm in utils. ``messages`` and ``max_tokens`` are Chat
	    # Completions arguments (the Responses endpoint expects ``input`` and
	    # ``max_output_tokens``), so the request goes through chat.completions.
	    response = llm_client.chat.completions.create(
	        messages=[
	            {"role": "user", "content": translation_prompt},
	        ],
	        model=model_id,
	        max_tokens=2000,
	        temperature=0.5,
	    )

	    # Hand the translation back to the caller instead of discarding it.
	    return response.choices[0].message.content

	def update_ui_with_sections(sections_dict, max_sections=100):
	    """
	    Build the ``gr.update`` objects for the pre-created section textboxes.

	    Always returns exactly ``max_sections`` updates so the result lines up
	    one-to-one with the output components, whatever the number of sections.

	    Args:
	        sections_dict: Mapping of section name to section content. May be
	            empty or ``None``, in which case every textbox is hidden.
	        max_sections: Number of section textboxes available in the UI.
	            Sections beyond this count are not displayed.

	    Returns:
	        A list of ``max_sections`` updates: one visible update (value and
	        label set) per displayed section, followed by hidden updates for the
	        unused textboxes.
	    """
	    if not sections_dict:
	        return [gr.update(visible=False) for _ in range(max_sections)]

	    # Never emit more updates than there are textboxes to receive them.
	    shown_sections = list(sections_dict.items())[:max_sections]

	    # Create visible components for available sections
	    components = [
	        gr.update(
	            value=section_content,
	            label=f"Section: {section_name}",
	            visible=True,
	        )
	        for section_name, section_content in shown_sections
	    ]

	    # Hide any unused components
	    components.extend(
	        gr.update(visible=False) for _ in range(max_sections - len(components))
	    )

	    return components

	# Build the Gradio app: a configuration sidebar, the main article area, and
	# the event wiring that connects the Extract button to the callbacks.
	with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
	gr.Markdown("# Wikipedia Translator")

	# State variable to store sections; written by the extract callback and
	# read by the chained update_ui_with_sections callback.
	sections_state = gr.State({})

	with gr.Row():
	# Sidebar for configuration
	with gr.Column(scale=1):
	gr.Markdown("### Configuration")

	with gr.Group():
	api_key = gr.Textbox(
	label="OpenAI API Key",
	placeholder="sk-...",
	type="password",
	)

	# Only a placeholder is shown; no default value is set for this field.
	model_id = gr.Textbox(
	label="OpenAI Model ID",
	placeholder="gpt-4.1-mini",
	)

	# NOTE(review): the info text says "Leave default", but only a placeholder
	# is configured here, no value - confirm how an empty base URL is handled.
	base_url = gr.Textbox(
	label="OpenAI API Base URL (Optional)",
	placeholder="https://api.openai.com/v1",
	info="Leave default unless using a proxy"
	)

	# Choices are the LANGUAGES keys (display names such as "Spanish"),
	# not the language codes stored as values.
	target_language = gr.Dropdown(
	choices=list(LANGUAGES.keys()),
	value="Spanish",
	label="Target Language",
	)

	gr.Markdown("### About")
	gr.Markdown("""
	This tool extracts content from Wikipedia articles and translates them into your selected language using OpenAI's language models.

	1. Configure your API settings
	2. Enter a Wikipedia URL
	3. Click Extract to process the article
	""")

	# Main content area
	with gr.Column(scale=2):
	gr.Markdown("### Wikipedia Article")

	wiki_url = gr.Textbox(
	label="Wikipedia URL",
	placeholder="https://en.wikipedia.org/wiki/Artificial_intelligence",
	info="Enter the full URL of the Wikipedia article"
	)

	extract_button = gr.Button("Extract and Prepare for Translation", variant="primary")

	# Receives the status message returned by extract_wikipedia_content.
	output = gr.Markdown(label="Status")

	# Results area: read-only fields filled in by the extract callback.
	article_pageid = gr.Textbox(
	label="Article Page ID",
	placeholder="Page ID will appear here after extraction",
	interactive=False
	)

	article_title = gr.Textbox(
	label="Article Title",
	placeholder="Title will appear here after extraction",
	interactive=False
	)

	# NOTE(review): `aticle_summary` is a misspelling of "article_summary".
	# The same spelling is used in the click outputs below, so rename both together.
	aticle_summary = gr.Textbox(
	label="Article Summary",
	placeholder="Summary will appear here after extraction",
	interactive=False
	)

	article_xml = gr.Textbox(
	label="Article XML",
	placeholder="XML will appear here after extraction",
	interactive=False,
	visible=False # Hidden by default as it's usually large
	)

	# Pre-define section textboxes (limit to 100 for simplicity).
	# They start hidden; update_ui_with_sections supplies one gr.update per
	# textbox in this list, so the callback's result length must match it.
	gr.Markdown("### Article Sections")
	with gr.Column() as sections_container:
	section_textboxes = [
	gr.Textbox(visible=False, lines=4)
	for _ in range(100) # Support up to 100 sections
	]

	# Connect the extract button to the function.
	# Step 1 runs extract_wikipedia_content; its six return values map, in
	# order, onto the six components listed in `outputs`.
	extract_button.click(
	fn=extract_wikipedia_content,
	inputs=[wiki_url, api_key, model_id, base_url, target_language],
	outputs=[
	output,
	article_pageid,
	article_title,
	aticle_summary,
	article_xml,
	sections_state,
	]
	# Step 2 is chained with .then(): it reads the stored sections and
	# updates the section textboxes.
	).then(
	fn=update_ui_with_sections,
	inputs=[sections_state],
	outputs=section_textboxes
	)

	# Launch the app only when this file is executed as a script.
	if __name__ == "__main__":
	demo.launch()