Spaces:

bakrianoo
/

wikipedia-translator

Running

File size: 2,952 Bytes

94260c3

def extract_wiki_id(wiki_url):
    """
    Extracts the Wikipedia ID (the page slug) from the given URL.

    The ID is the text that follows "wiki/" in the URL, up to but not
    including any "?" query string or "#" fragment. It is returned exactly
    as it appears in the URL (no percent-decoding is applied).

    Args:
        wiki_url (str): A Wikipedia article URL, e.g.
            "https://en.wikipedia.org/wiki/Artificial_intelligence".

    Returns:
        str or None: The extracted ID, or None when wiki_url is not a
        string or contains no "wiki/<id>" part.
    """
    import re

    # A non-string value (e.g. None from an empty input field) would make
    # re.search raise TypeError; report it as "no ID found" instead.
    if not isinstance(wiki_url, str):
        return None

    # Strip surrounding whitespace so a pasted URL with a trailing space or
    # newline does not leak that whitespace into the returned ID.
    match = re.search(r'wiki/([^#?]+)', wiki_url.strip())
    return match.group(1) if match else None

def get_wiki_details(wiki_id):
    """
    Return the details of a Wikipedia article for the given wiki ID.

    This is a stub: wiki_id is currently ignored and the same hard-coded
    sample article is returned on every call (as a freshly built dict).

    Args:
        wiki_id (str): The Wikipedia ID of the article (unused for now).

    Returns:
        dict: Article details with the keys "pageid", "title", "summary",
        "wiki_xml" and "sections" (a mapping of section name to content).
    """
    # TODO: replace the dummy data below with a real lookup against the
    # Wikipedia API or a backend service.
    stub_sections = {
        "Introduction": "AI Introduction content...",
        "History": "AI History content...",
        "Applications": "AI Applications content...",
    }

    stub_details = {
        "pageid": 123456,
        "title": "Artificial Intelligence",
        "summary": "AI is the simulation of human intelligence in machines.",
        "wiki_xml": "<xml>...</xml>",
    }
    stub_details["sections"] = stub_sections

    return stub_details

def init_llm_client(api_key, base_url="https://api.openai.com/v1"):
    """
    Initializes the LLM client with the given API key and base URL.

    The settings are written onto the `openai` module itself, so they are
    shared by every user of that module in the current process.

    Args:
        api_key (str): The API key used to authenticate requests.
        base_url (str): Root URL of the OpenAI-compatible API endpoint.

    Returns:
        module: The configured `openai` module.
    """
    import openai

    openai.api_key = api_key

    # The endpoint setting is named differently across SDK generations:
    # releases before 1.0 read `api_base`, while 1.x reads `base_url`.
    # Setting both keeps a custom endpoint from being silently ignored,
    # whichever SDK version is installed.
    openai.api_base = base_url
    openai.base_url = base_url

    return openai

def split_content_into_sections(wiki_xml, content_format="Plain Text"):
    """
    Split the Wikipedia content into logical sections.

    Every direct child of the XML root element is treated as one section,
    keyed by its tag name. If several children share the same tag, the
    last one wins.

    Args:
        wiki_xml (str): The XML content of the Wikipedia article
        content_format (str): The format to return the content in
            ("Plain Text" or "XML"). Any value other than "XML" is
            handled as plain text.

    Returns:
        dict: A dictionary mapping section names to their content. In
        "XML" mode the content is the serialized element; otherwise it is
        the element's text fragments, in document order, joined with
        single spaces and stripped of surrounding whitespace.

    Raises:
        xml.etree.ElementTree.ParseError: If wiki_xml is not well-formed.
    """
    from xml.etree import ElementTree as ET

    # Parse the XML content
    root = ET.fromstring(wiki_xml)
    want_xml = content_format == "XML"

    sections = {}
    for child in root:
        # Assuming each child of the root is a section
        if want_xml:
            # Serialize only when XML is actually requested.
            sections[child.tag] = ET.tostring(child, encoding='unicode')
        else:
            # itertext() yields the text and tails of the whole subtree in
            # document order, so nested markup stays in reading order. It
            # never yields the section's own tail, which is text located
            # after the closing tag and therefore belongs to the root.
            sections[child.tag] = " ".join(child.itertext()).strip()

    return sections

def get_translate_prompt(article_title, artice_summary, original_content, target_lang):
    """
    Build the prompt that asks the LLM to translate a piece of content.

    Args:
        article_title (str): Title of the article, given as context.
        artice_summary (str): Summary of the article, given as context.
        original_content (str): The content to be translated.
        target_lang (str): The language to translate the content into.

    Returns:
        str: The prompt text. It starts with a newline and every
        non-blank line is indented by four spaces.
    """
    # One entry per line of the prompt; the leading empty entry and the
    # trailing four-space entry reproduce the edges of the template.
    prompt_lines = [
        "",
        f"    You are a professional translator. Translate the following content to {target_lang}.",
        "",
        f"    Title: {article_title}",
        f"    Summary: {artice_summary}",
        "",
        f"    Content: {original_content}",
        "    ",
    ]
    return "\n".join(prompt_lines)