# Spaces: Running
def extract_wiki_id(wiki_url):
    """
    Extracts the Wikipedia ID from the given URL.

    The ID is whatever follows the first ``wiki/`` in the URL, up to (but not
    including) the first ``#`` fragment or ``?`` query marker.

    Args:
        wiki_url (str): A Wikipedia article URL, e.g.
            "https://en.wikipedia.org/wiki/Artificial_intelligence".

    Returns:
        str or None: The extracted ID, or None when the input is not a string
        or contains no ``wiki/<id>`` part.
    """
    import re

    # re.search raises TypeError on None / non-string input; treat such input
    # the same as "no ID found" so callers only have to check for None.
    if not isinstance(wiki_url, str):
        return None

    # Surrounding whitespace (e.g. a pasted URL with a trailing newline) would
    # otherwise be captured as part of the ID.
    match = re.search(r'wiki/([^#?]+)', wiki_url.strip())
    return match.group(1) if match else None
def get_wiki_details(wiki_id):
    """
    Placeholder function to get Wikipedia details using the wiki ID.

    Args:
        wiki_id (str): The Wikipedia page ID. Currently ignored: the same
            dummy record is returned for every ID.

    Returns:
        dict: A freshly built dictionary with the keys "pageid", "title",
        "summary", "wiki_xml" and "sections" (section name -> content).
    """
    # This should interact with the Wikipedia API or your backend service
    # For now, returning dummy data
    return {
        "pageid": 123456,
        "title": "Artificial Intelligence",
        "summary": "AI is the simulation of human intelligence in machines.",
        "wiki_xml": "<xml>...</xml>",
        "sections": {
            "Introduction": "AI Introduction content...",
            "History": "AI History content...",
            "Applications": "AI Applications content...",
        },
    }
def init_llm_client(api_key, base_url="https://api.openai.com/v1"):
    """
    Initializes the LLM client with the given API key and base URL.

    Configuration is applied to the global ``openai`` module, so it affects
    every user of that module in the process.

    Args:
        api_key (str): The API key to authenticate with.
        base_url (str): The API endpoint root. Defaults to the public OpenAI
            endpoint.

    Returns:
        module: The configured ``openai`` module.
    """
    import openai

    openai.api_key = api_key
    # The endpoint setting was renamed between major versions of the SDK:
    # openai<1.0 reads ``api_base``, openai>=1.0 reads ``base_url``. Set both
    # so a custom endpoint is honoured whichever version is installed; the
    # attribute a given version does not know about is simply unused.
    openai.api_base = base_url
    openai.base_url = base_url
    return openai
def split_content_into_sections(wiki_xml, content_format="Plain Text"):
    """
    Split the Wikipedia content into logical sections.

    Every direct child of the XML root element is treated as one section,
    keyed by its tag name. If several children share a tag, the last one wins.

    Args:
        wiki_xml (str): The XML content of the Wikipedia article
        content_format (str): The format to return the content in ("Plain Text" or "XML").
            Any value other than "XML" is treated as plain text.

    Returns:
        dict: A dictionary mapping section names to their content

    Raises:
        xml.etree.ElementTree.ParseError: If wiki_xml is not well-formed XML.
    """
    from xml.etree import ElementTree as ET

    # Parse the XML content
    root = ET.fromstring(wiki_xml)

    sections = {}
    for child in root:
        # Assuming each child of the root is a section
        section_name = child.tag

        if content_format == "XML":
            # Serialise only when XML output is actually requested.
            sections[section_name] = ET.tostring(child, encoding='unicode')
        else:  # Plain Text
            # itertext() walks the section in document order and yields only
            # text that lives inside it. The section's own tail (text that
            # follows its closing tag) belongs to the parent and is excluded.
            # Pieces are joined with a single space so adjacent fragments
            # from different elements do not run together.
            sections[section_name] = " ".join(child.itertext()).strip()

    return sections
def get_translate_prompt(article_title, artice_summary, original_content, target_lang):
    """
    Constructs the translation prompt for the LLM.

    Args:
        article_title (str): Title of the article being translated.
        artice_summary (str): Summary of the article. The parameter name is
            misspelled (sic) and is kept as-is so existing keyword calls
            continue to work.
        original_content (str): The content to translate.
        target_lang (str): The language to translate the content into.

    Returns:
        str: The prompt text, beginning and ending with a newline.
    """
    return f"""
You are a professional translator. Translate the following content to {target_lang}.
Title: {article_title}
Summary: {artice_summary}
Content: {original_content}
"""