Spaces:
Running
Running
setup llm parser
Browse files- app.py +24 -1
- utils/__init__.py +2 -0
- utils/__pycache__/__init__.cpython-310.pyc +0 -0
- utils/__pycache__/llm_parser.cpython-310.pyc +0 -0
- utils/__pycache__/llm_prompts.cpython-310.pyc +0 -0
- utils/llm_parser.py +15 -0
- utils/llm_prompts.py +64 -0
app.py
CHANGED
@@ -1,5 +1,7 @@
|
|
1 |
import gradio as gr
|
2 |
-
from utils import extract_wiki_id, get_wiki_details,
|
|
|
|
|
3 |
import json
|
4 |
|
5 |
# Define language options for translation
|
@@ -39,6 +41,27 @@ def extract_wikipedia_content(wiki_url, api_key, model_id, base_url, target_lang
|
|
39 |
content_sections
|
40 |
)
|
41 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
def update_ui_with_sections(sections_dict):
|
43 |
"""
|
44 |
Creates a list of components to display in the sections area
|
|
|
1 |
import gradio as gr
|
2 |
+
from utils import (extract_wiki_id, get_wiki_details,
|
3 |
+
init_llm_client, split_content_into_sections,
|
4 |
+
get_translate_prompt)
|
5 |
import json
|
6 |
|
7 |
# Define language options for translation
|
|
|
41 |
content_sections
|
42 |
)
|
43 |
|
44 |
+
def translate_content(content, article_title, artice_summary, target_lang, api_key, model_id, base_url):
    """
    Translate a piece of article content with an OpenAI-compatible chat model.

    Builds the translation prompt via ``get_translate_prompt``, sends it as a
    single user message, and returns the text of the model's first choice.

    Args:
        content: Original content to translate (sent as ``original_content``).
        article_title: Title of the source article.
        artice_summary: Summary of the source article. The misspelled name is
            kept because it is the keyword ``get_translate_prompt`` expects
            and callers may already pass it by name.
        target_lang: Language to translate into.
        api_key: API key forwarded to ``init_llm_client``.
        model_id: Model identifier used for the completion request.
        base_url: Endpoint base URL forwarded to ``init_llm_client``.

    Returns:
        The raw message text produced by the model. No parsing is done here;
        extracting structured data from the reply is left to the caller.
    """
    llm_client = init_llm_client(api_key, model_id, base_url)

    translation_prompt = get_translate_prompt(
        article_title=article_title,
        artice_summary=artice_summary,
        original_content=content,
        target_lang=target_lang,
    )

    # `messages` / `max_tokens` are Chat Completions arguments, so the request
    # must go through `chat.completions` rather than the `responses` endpoint.
    response = llm_client.chat.completions.create(
        model=model_id,
        messages=[
            {"role": "user", "content": translation_prompt},
        ],
        max_tokens=2000,
        temperature=0.5,
    )

    # Hand the translation back to the caller instead of discarding it.
    return response.choices[0].message.content
|
64 |
+
|
65 |
def update_ui_with_sections(sections_dict):
|
66 |
"""
|
67 |
Creates a list of components to display in the sections area
|
utils/__init__.py
CHANGED
@@ -1 +1,3 @@
|
|
1 |
from .wikipedia_extractor import (extract_wiki_id, get_wiki_details, split_content_into_sections)
|
|
|
|
|
|
1 |
from .wikipedia_extractor import (extract_wiki_id, get_wiki_details, split_content_into_sections)
|
2 |
+
from .llm_parser import init_llm_client
|
3 |
+
from .llm_prompts import get_translate_prompt
|
utils/__pycache__/__init__.cpython-310.pyc
CHANGED
Binary files a/utils/__pycache__/__init__.cpython-310.pyc and b/utils/__pycache__/__init__.cpython-310.pyc differ
|
|
utils/__pycache__/llm_parser.cpython-310.pyc
ADDED
Binary file (560 Bytes). View file
|
|
utils/__pycache__/llm_prompts.cpython-310.pyc
ADDED
Binary file (2.24 kB). View file
|
|
utils/llm_parser.py
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from openai import OpenAI
|
3 |
+
|
4 |
+
|
5 |
+
def init_llm_client(api_key, model_id, base_url=None):
    """
    Create an OpenAI client for the given credentials and endpoint.

    Args:
        api_key: API key used to authenticate requests.
        model_id: Model identifier. It is only exported to the environment
            here; the model itself is selected per request, not on the client.
        base_url: Optional base URL of an OpenAI-compatible endpoint. A falsy
            value (``None`` or ``""``) lets the client use its default.

    Returns:
        A configured ``OpenAI`` client instance.
    """
    # NOTE(review): these writes are process-wide, so in a multi-user app the
    # most recent caller's values are visible to everyone. They are kept for
    # backward compatibility with any code that reads them.
    if base_url:
        os.environ["OPENAI_API_BASE"] = base_url
    os.environ["OPENAI_API_KEY"] = api_key
    os.environ["OPENAI_MODEL_ID"] = model_id

    # The client constructor has no `model_id` parameter; passing it raised
    # TypeError. An empty-string base_url is normalised to None so the client
    # falls back to its default endpoint instead of an empty URL.
    return OpenAI(api_key=api_key, base_url=base_url or None)
|
15 |
+
|
utils/llm_prompts.py
ADDED
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
|
3 |
+
def get_translate_prompt(article_title, artice_summary, original_content, target_lang):
    """
    Build the LLM prompt for translating Wikipedia XML content.

    Args:
        article_title: Original title of the article.
        artice_summary: Summary of the article. The misspelled parameter name
            is kept so existing keyword callers keep working.
        original_content: The XML content to translate.
        target_lang: Language the content should be translated into.

    Returns:
        The complete prompt text, ending with an opened ```json fence so the
        model continues with the JSON object.
    """
    # Example of the expected reply, rendered as pretty-printed JSON.
    output_example = json.dumps(
        {
            "translated_content": "The complete translated XML content with all tags preserved",
        },
        indent=4,
        ensure_ascii=False,
    )

    # Values are interpolated piece by piece with f-strings instead of calling
    # str.format on the whole template: the JSON example contains literal
    # braces, which str.format would treat as a replacement field and fail
    # with KeyError on every call.
    return (
        "# Task\n"
        "You are an expert Wikipedia translator specializing in multilingual content adaptation. "
        f"Your task is to translate the provided XML content into {target_lang} while preserving the "
        "academic tone, factual accuracy, and encyclopedic style of Wikipedia.\n\n"

        "# Article Original Title\n"
        f"{article_title}\n\n"

        "# Article Summary\n"
        f"{artice_summary}\n\n"

        "# Article Original Content (XML format)\n"
        f"{original_content}\n\n"

        "# Target Language\n"
        f"{target_lang}\n\n"

        "# Instructions\n"
        "1. Preserve all XML tags, attributes, and structure exactly as they appear\n"
        "2. Translate only the text content between XML tags\n"
        "3. Maintain Wikipedia's neutral point of view and encyclopedic style\n"
        "4. Preserve proper nouns, scientific terminology, and citations appropriately\n"
        "5. Adapt cultural references or idioms to be understandable in the target language\n"
        f"6. Use terminology consistent with the {target_lang} Wikipedia for similar topics\n"
        "7. Maintain the same paragraph structure and information hierarchy\n\n"

        "# Output Format\n"
        "Return a single JSON object with the following structure:\n"
        "```json\n"
        f"{output_example}"
        "\n```\n\n"

        "# Translation Quality Guidelines\n"
        "- Accuracy: Ensure factual information is preserved exactly\n"
        "- Completeness: Translate all content, don't summarize or omit information\n"
        "- Consistency: Use consistent terminology throughout the article\n"
        "- Fluency: Produce natural-sounding text in the target language\n"
        "- Formatting: Preserve all formatting elements, including lists, tables, and emphasis\n"

        "# Output json\n"
        "```json\n"
    )
|