Spaces:

bakrianoo
/

wikipedia-translator

Running

App Files Files Community

wikipedia-translator / utils.py

bakrianoo

specify the translated format

94260c3 about 1 month ago

raw

history blame

2.95 kB

	def extract_wiki_id(wiki_url):
	    """
	    Extract the Wikipedia page identifier from an article URL.

	    The identifier is the text that follows ``wiki/`` in the URL, up to
	    (but not including) the first ``#`` fragment or ``?`` query marker.
	    It is returned exactly as it appears in the URL.

	    Args:
	        wiki_url (str): Article URL, e.g.
	            ``https://en.wikipedia.org/wiki/Artificial_intelligence``.
	            Leading and trailing whitespace is ignored.

	    Returns:
	        str | None: The identifier, or ``None`` when ``wiki_url`` is not a
	        string or contains no ``wiki/<id>`` segment.
	    """
	    import re

	    # Non-string input (such as None) cannot contain an identifier;
	    # report "no match" instead of failing inside re.search.
	    if not isinstance(wiki_url, str):
	        return None

	    # Strip first so stray whitespace/newlines around a pasted URL do not
	    # become part of the captured identifier.
	    match = re.search(r'wiki/([^#?]+)', wiki_url.strip())
	    return match.group(1) if match else None

	def get_wiki_details(wiki_id):
	    """
	    Return article details for a Wikipedia ID (placeholder implementation).

	    ``wiki_id`` is not used yet: a fixed sample record is returned on every
	    call, built fresh each time.

	    Args:
	        wiki_id: Identifier of the article (currently ignored).

	    Returns:
	        dict: Record with the keys ``pageid``, ``title``, ``summary``,
	        ``wiki_xml`` and ``sections`` (a mapping of section name to text).
	    """
	    # TODO: replace the sample data below with a real lookup against the
	    # Wikipedia API or a backend service.
	    sample_sections = {
	        "Introduction": "AI Introduction content...",
	        "History": "AI History content...",
	        "Applications": "AI Applications content...",
	    }

	    details = dict(
	        pageid=123456,
	        title="Artificial Intelligence",
	        summary="AI is the simulation of human intelligence in machines.",
	        wiki_xml="<xml>...</xml>",
	        sections=sample_sections,
	    )
	    return details

	def init_llm_client(api_key, base_url="https://api.openai.com/v1"):
	    """
	    Configure the ``openai`` module and hand it back as the LLM client.

	    The settings are stored as attributes on the imported ``openai`` module
	    itself, so they are shared by every user of that module in the process.

	    Args:
	        api_key (str): Value assigned to ``openai.api_key``.
	        base_url (str): Value assigned to ``openai.api_base``.

	    Returns:
	        module: The configured ``openai`` module.
	    """
	    import openai as llm_module

	    # NOTE(review): whether ``api_base`` is the attribute honoured by the
	    # installed openai release is not visible from here - confirm.
	    llm_module.api_base = base_url
	    llm_module.api_key = api_key

	    return llm_module

	def split_content_into_sections(wiki_xml, content_format="Plain Text"):
	    """
	    Split the Wikipedia content into logical sections.

	    Every direct child of the XML root is treated as one section, keyed by
	    the child's tag name. If several children share a tag, the later one
	    overwrites the earlier entry.

	    Args:
	        wiki_xml (str): The XML content of the Wikipedia article.
	        content_format (str): ``"XML"`` returns each section serialized as
	            XML; any other value (default ``"Plain Text"``) returns the
	            section's text content only.

	    Returns:
	        dict: A dictionary mapping section names to their content, in
	        document order.

	    Raises:
	        xml.etree.ElementTree.ParseError: If ``wiki_xml`` is not
	            well-formed XML.
	    """
	    from xml.etree import ElementTree as ET

	    root = ET.fromstring(wiki_xml)
	    as_xml = content_format == "XML"

	    sections = {}
	    for child in root:
	        if as_xml:
	            # Serialize only when the XML form is actually requested.
	            sections[child.tag] = ET.tostring(child, encoding='unicode')
	        else:
	            # itertext() yields the text of the whole subtree in document
	            # order and leaves out the section element's own tail, which
	            # is text located after the section rather than inside it.
	            sections[child.tag] = " ".join(child.itertext()).strip()

	    return sections

	def get_translate_prompt(article_title, artice_summary, original_content, target_lang):
	    """
	    Build the prompt that asks the LLM to translate a piece of content.

	    Args:
	        article_title (str): Title of the article, included as context.
	        artice_summary (str): Summary of the article, included as context
	            (the parameter name is kept as-is for keyword callers).
	        original_content (str): The text to translate.
	        target_lang (str): Language to translate the content into.

	    Returns:
	        str: The prompt text: an instruction line naming the target
	        language, followed by the title, summary and content.
	    """
	    prompt = f"""
	You are a professional translator. Translate the following content to {target_lang}.

	Title: {article_title}
	Summary: {artice_summary}

	Content: {original_content}
	"""
	    return prompt