specify the translated format
Changed files:
- app.py (+20 -6)
- utils.py (+91 -0)
- utils/__pycache__/llm_parser.cpython-310.pyc (+0 -0)
- utils/__pycache__/wikipedia_extractor.cpython-310.pyc (+0 -0)
- utils/llm_prompts.py (+9 -8)
- utils/wikipedia_extractor.py (+2 -1)
app.py
CHANGED
@@ -21,7 +21,7 @@ LANGUAGES = {
     "Korean": "ko"
 }
 
-def extract_wikipedia_content(wiki_url, api_key, model_id, base_url, target_lang):
+def extract_wikipedia_content(wiki_url, api_key, model_id, base_url, target_lang, content_format):
     """
     Function to extract content from Wikipedia URL (placeholder for now)
     """
@@ -31,7 +31,10 @@ def extract_wikipedia_content(wiki_url, api_key, model_id, base_url, target_lang
 
     # Get the details of the Wikipedia article
    wiki_details = get_wiki_details(wiki_id)
-    content_sections = split_content_into_sections(wiki_details['content'])
+    if content_format == "XML":
+        content_sections = split_content_into_sections(wiki_details['wiki_xml'], content_format)
+    else:
+        content_sections = split_content_into_sections(wiki_details['content'], content_format)
 
     return (
         "Extraction complete! Sections: " + str(len(content_sections)),
@@ -42,7 +45,8 @@ def extract_wikipedia_content(wiki_url, api_key, model_id, base_url, target_lang
         content_sections
     )
 
-def translate_content(content, article_title, artice_summary, target_lang, api_key, model_id, base_url):
+def translate_content(content, article_title, artice_summary, content_format,
+                      target_lang, api_key, model_id, base_url):
 
     llm_client = init_llm_client(api_key, base_url=base_url)
 
@@ -50,7 +54,8 @@ def translate_content(content, article_title, artice_summary, target_lang, api_k
         article_title=article_title,
         artice_summary=artice_summary,
         original_content=content,
-        target_lang=target_lang
+        target_lang=target_lang,
+        content_format=content_format
     )
 
     # Call the LLM to get the translation - updating params to match OpenAI's requirements
@@ -69,7 +74,7 @@ def translate_content(content, article_title, artice_summary, target_lang, api_k
 
     return "Error: Translation output not found in the response."
 
-def translate_section(section_content, article_title, article_summary, target_lang, api_key, model_id, base_url):
+def translate_section(section_content, article_title, article_summary, content_format, target_lang, api_key, model_id, base_url):
     """
     Translates a single section of the Wikipedia article
     """
@@ -80,6 +85,7 @@ def translate_section(section_content, article_title, article_summary, target_la
         content=section_content,
         article_title=article_title,
         artice_summary=article_summary,
+        content_format=content_format,
         target_lang=target_lang,
         api_key=api_key,
         model_id=model_id,
@@ -172,6 +178,13 @@ with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
                 value="Arabic",
                 label="Target Language",
             )
+
+            content_format = gr.Radio(
+                choices=["Text", "XML"],
+                value="XML",
+                label="Content Format",
+                info="Choose how to display article content"
+            )
 
             gr.Markdown("### About")
             gr.Markdown("""
@@ -240,6 +253,7 @@ with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
             section_textbox,
             article_title,
             aticle_summary,
+            content_format,
             target_language,
             api_key,
             model_id,
@@ -251,7 +265,7 @@ with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
     # Connect the extract button to the function
     extract_button.click(
         fn=extract_wikipedia_content,
-        inputs=[wiki_url, api_key, model_id, base_url, target_language],
+        inputs=[wiki_url, api_key, model_id, base_url, target_language, content_format],
         outputs=[
             output,
             article_pageid,
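Note on the wiring above: a gr.Radio value reaches a click handler as the selected choice string, so extract_wikipedia_content receives plain "XML" or "Text". A minimal, self-contained sketch of that pattern (the handler and labels here are illustrative stand-ins, not code from this Space):

import gradio as gr

def extract(url, content_format):
    # content_format arrives as the selected Radio choice, e.g. "XML" or "Text"
    return f"Would extract {url} as {content_format}"

with gr.Blocks() as demo:
    url = gr.Textbox(label="Wikipedia URL")
    fmt = gr.Radio(choices=["Text", "XML"], value="XML", label="Content Format")
    status = gr.Textbox(label="Status")
    gr.Button("Extract").click(fn=extract, inputs=[url, fmt], outputs=status)

demo.launch()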
utils.py
ADDED
@@ -0,0 +1,91 @@
+def extract_wiki_id(wiki_url):
+    """
+    Extracts the Wikipedia ID from the given URL.
+    """
+    import re
+
+    match = re.search(r'wiki/([^#?]+)', wiki_url)
+    return match.group(1) if match else None
+
+def get_wiki_details(wiki_id):
+    """
+    Placeholder function to get Wikipedia details using the wiki ID.
+    """
+    # This should interact with the Wikipedia API or your backend service
+    # For now, returning dummy data
+    return {
+        "pageid": 123456,
+        "title": "Artificial Intelligence",
+        "summary": "AI is the simulation of human intelligence in machines.",
+        "wiki_xml": "<xml>...</xml>",
+        "sections": {
+            "Introduction": "AI Introduction content...",
+            "History": "AI History content...",
+            "Applications": "AI Applications content...",
+        }
+    }
+
+def init_llm_client(api_key, base_url="https://api.openai.com/v1"):
+    """
+    Initializes the LLM client with the given API key and base URL.
+    """
+    import openai
+
+    openai.api_key = api_key
+    openai.api_base = base_url
+
+    return openai
+
+def split_content_into_sections(wiki_xml, content_format="Plain Text"):
+    """
+    Split the Wikipedia content into logical sections.
+
+    Args:
+        wiki_xml (str): The XML content of the Wikipedia article
+        content_format (str): The format to return the content in ("Plain Text" or "XML")
+
+    Returns:
+        dict: A dictionary mapping section names to their content
+    """
+    from xml.etree import ElementTree as ET
+
+    # Parse the XML content
+    root = ET.fromstring(wiki_xml)
+
+    sections = {}
+    for child in root:
+        # Assuming each child of the root is a section
+        section_name = child.tag
+        section_content = ET.tostring(child, encoding='unicode')
+
+        # Add to sections dictionary
+        if content_format == "XML":
+            sections[section_name] = section_content
+        else:  # Plain Text
+            try:
+                # Try to extract text content only
+                text_content = child.text if child.text else ""
+                for elem in child.iter():
+                    if elem.text and elem != child:
+                        text_content += " " + elem.text
+                    if elem.tail:
+                        text_content += " " + elem.tail
+                sections[section_name] = text_content.strip()
+            except Exception as e:
+                # Fallback in case of parsing issues
+                sections[section_name] = f"Error extracting text: {str(e)}"
+
+    return sections
+
+def get_translate_prompt(article_title, artice_summary, original_content, target_lang):
+    """
+    Constructs the translation prompt for the LLM.
+    """
+    return f"""
+    You are a professional translator. Translate the following content to {target_lang}.
+
+    Title: {article_title}
+    Summary: {artice_summary}
+
+    Content: {original_content}
+    """
utils/__pycache__/llm_parser.cpython-310.pyc
CHANGED
Binary files a/utils/__pycache__/llm_parser.cpython-310.pyc and b/utils/__pycache__/llm_parser.cpython-310.pyc differ
utils/__pycache__/wikipedia_extractor.cpython-310.pyc
CHANGED
Binary files a/utils/__pycache__/wikipedia_extractor.cpython-310.pyc and b/utils/__pycache__/wikipedia_extractor.cpython-310.pyc differ
utils/llm_prompts.py
CHANGED
@@ -1,15 +1,15 @@
 import json
 
-def get_translate_prompt(article_title, artice_summary, original_content, target_lang):
+def get_translate_prompt(article_title, artice_summary, content_format, original_content, target_lang):
     """
-    Function to get the translation prompt for the LLM to translate Wikipedia
+    Function to get the translation prompt for the LLM to translate Wikipedia {content_format} content
     with high quality and fidelity to the original.
     """
     # Define the prompt template
     translate_prompt = (
         "# Task\n"
         "You are an expert Wikipedia translator specializing in multilingual content adaptation. "
-        "Your task is to translate the provided XML content into {target_lang} while preserving the "
+        "Your task is to translate the provided {content_format} content into {target_lang} while preserving the "
         "academic tone, factual accuracy, and encyclopedic style of Wikipedia.\n\n"
 
         "# Article Original Title\n"
@@ -18,15 +18,15 @@ def get_translate_prompt(article_title, artice_summary, original_content, target
         "# Article Summary\n"
         "{article_summary}\n\n"
 
-        "# Article Original Content (XML format)\n"
+        "# Article Original Content ({content_format} format)\n"
         "{original_content}\n\n"
 
         "# Target Language\n"
         "{target_lang}\n\n"
 
         "# Instructions\n"
-        "1. Preserve all XML tags, attributes, and structure exactly as they appear\n"
-        "2. Translate only the text content between XML tags\n"
+        "1. If provided input is XML code, Preserve all XML tags, attributes, and structure exactly as they appear\n"
+        "2. If provided input is XML code, Translate only the text content between XML tags\n"
         "3. Maintain Wikipedia's neutral point of view and encyclopedic style\n"
         "4. Preserve proper nouns, scientific terminology, and citations appropriately\n"
         "5. Adapt cultural references or idioms to be understandable in the target language\n"
@@ -37,7 +37,7 @@ def get_translate_prompt(article_title, artice_summary, original_content, target
         "Return a single JSON object with the following structure:\n"
         "```json\n"
         "{{\n"
-        "  \"output_content\": \"The complete translated XML content with all tags preserved\"\n"
+        "  \"output_content\": \"The complete translated {content_format} content with all tags preserved\"\n"
         "}}\n"
         "```\n\n"
 
@@ -58,7 +58,8 @@ def get_translate_prompt(article_title, artice_summary, original_content, target
         article_title=article_title,
         article_summary=artice_summary,
         original_content=original_content,
-        target_lang=target_lang
+        target_lang=target_lang,
+        content_format=content_format
     )
 
     return formatted_prompt
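A subtlety this template depends on: the prompt is later passed through str.format(), so single braces like {content_format} are substitution fields while doubled braces survive as literal braces in the JSON example. A minimal illustration:

template = (
    "# Article Original Content ({content_format} format)\n"
    '{{"output_content": "..."}}'
)
print(template.format(content_format="XML"))
# # Article Original Content (XML format)
# {"output_content": "..."}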
utils/wikipedia_extractor.py
CHANGED
@@ -111,13 +111,14 @@ def get_wiki_xml(page_title):
         return None, {"error": f"An error occurred: {str(e)}"}
 
 # function to split content into sections using === [SECTION NAME] === regex pattern
-def split_content_into_sections(content: str) -> List[str]:
+def split_content_into_sections(content: str, content_format: str=None) -> List[str]:
 
     """
     Splits the content into sections using the === [SECTION NAME] === regex pattern.
 
     Args:
         content (str): The content to split.
+        content_format (str): The format to return the content in ("Plain Text" or "XML").
 
     Returns:
         dict: The sections dictionary.
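The function body falls outside this hunk, so as a sketch only (the sample text and regex are assumptions, not this repo's actual implementation), splitting on === Section Name === headings can be done with re.split and a capture group:

import re

text = "Intro text.\n=== History ===\nEarly work.\n=== Applications ===\nUses today."

# With a capture group, re.split alternates: [preamble, name1, body1, name2, body2, ...]
parts = re.split(r"^===\s*(.+?)\s*===\s*$", text, flags=re.MULTILINE)
sections = {"Introduction": parts[0].strip()}
sections.update((name, body.strip()) for name, body in zip(parts[1::2], parts[2::2]))
print(sections)  # {'Introduction': 'Intro text.', 'History': 'Early work.', 'Applications': 'Uses today.'}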