bakrianoo commited on
Commit
94260c3
·
1 Parent(s): c065ba1

specify the translated format

Browse files
app.py CHANGED
@@ -21,7 +21,7 @@ LANGUAGES = {
21
  "Korean": "ko"
22
  }
23
 
24
- def extract_wikipedia_content(wiki_url, api_key, model_id, base_url, target_lang):
25
  """
26
  Function to extract content from Wikipedia URL (placeholder for now)
27
  """
@@ -31,7 +31,10 @@ def extract_wikipedia_content(wiki_url, api_key, model_id, base_url, target_lang
31
 
32
  # Get the details of the Wikipedia article
33
  wiki_details = get_wiki_details(wiki_id)
34
- content_sections = split_content_into_sections(wiki_details['wiki_xml'])
 
 
 
35
 
36
  return (
37
  "Extraction complete! Sections: " + str(len(content_sections)),
@@ -42,7 +45,8 @@ def extract_wikipedia_content(wiki_url, api_key, model_id, base_url, target_lang
42
  content_sections
43
  )
44
 
45
- def translate_content(content, article_title, artice_summary, target_lang, api_key, model_id, base_url):
 
46
 
47
  llm_client = init_llm_client(api_key, base_url=base_url)
48
 
@@ -50,7 +54,8 @@ def translate_content(content, article_title, artice_summary, target_lang, api_k
50
  article_title=article_title,
51
  artice_summary=artice_summary,
52
  original_content=content,
53
- target_lang=target_lang
 
54
  )
55
 
56
  # Call the LLM to get the translation - updating params to match OpenAI's requirements
@@ -69,7 +74,7 @@ def translate_content(content, article_title, artice_summary, target_lang, api_k
69
 
70
  return "Error: Translation output not found in the response."
71
 
72
- def translate_section(section_content, article_title, article_summary, target_lang, api_key, model_id, base_url):
73
  """
74
  Translates a single section of the Wikipedia article
75
  """
@@ -80,6 +85,7 @@ def translate_section(section_content, article_title, article_summary, target_la
80
  content=section_content,
81
  article_title=article_title,
82
  artice_summary=article_summary,
 
83
  target_lang=target_lang,
84
  api_key=api_key,
85
  model_id=model_id,
@@ -172,6 +178,13 @@ with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
172
  value="Arabic",
173
  label="Target Language",
174
  )
 
 
 
 
 
 
 
175
 
176
  gr.Markdown("### About")
177
  gr.Markdown("""
@@ -240,6 +253,7 @@ with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
240
  section_textbox,
241
  article_title,
242
  aticle_summary,
 
243
  target_language,
244
  api_key,
245
  model_id,
@@ -251,7 +265,7 @@ with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
251
  # Connect the extract button to the function
252
  extract_button.click(
253
  fn=extract_wikipedia_content,
254
- inputs=[wiki_url, api_key, model_id, base_url, target_language],
255
  outputs=[
256
  output,
257
  article_pageid,
 
21
  "Korean": "ko"
22
  }
23
 
24
+ def extract_wikipedia_content(wiki_url, api_key, model_id, base_url, target_lang, content_format):
25
  """
26
  Function to extract content from Wikipedia URL (placeholder for now)
27
  """
 
31
 
32
  # Get the details of the Wikipedia article
33
  wiki_details = get_wiki_details(wiki_id)
34
+ if content_format == "XML":
35
+ content_sections = split_content_into_sections(wiki_details['wiki_xml'], content_format)
36
+ else:
37
+ content_sections = split_content_into_sections(wiki_details['content'], content_format)
38
 
39
  return (
40
  "Extraction complete! Sections: " + str(len(content_sections)),
 
45
  content_sections
46
  )
47
 
48
+ def translate_content(content, article_title, artice_summary, content_format,
49
+ target_lang, api_key, model_id, base_url):
50
 
51
  llm_client = init_llm_client(api_key, base_url=base_url)
52
 
 
54
  article_title=article_title,
55
  artice_summary=artice_summary,
56
  original_content=content,
57
+ target_lang=target_lang,
58
+ content_format=content_format
59
  )
60
 
61
  # Call the LLM to get the translation - updating params to match OpenAI's requirements
 
74
 
75
  return "Error: Translation output not found in the response."
76
 
77
+ def translate_section(section_content, article_title, article_summary, content_format, target_lang, api_key, model_id, base_url):
78
  """
79
  Translates a single section of the Wikipedia article
80
  """
 
85
  content=section_content,
86
  article_title=article_title,
87
  artice_summary=article_summary,
88
+ content_format=content_format,
89
  target_lang=target_lang,
90
  api_key=api_key,
91
  model_id=model_id,
 
178
  value="Arabic",
179
  label="Target Language",
180
  )
181
+
182
+ content_format = gr.Radio(
183
+ choices=["Text", "XML"],
184
+ value="XML",
185
+ label="Content Format",
186
+ info="Choose how to display article content"
187
+ )
188
 
189
  gr.Markdown("### About")
190
  gr.Markdown("""
 
253
  section_textbox,
254
  article_title,
255
  aticle_summary,
256
+ content_format,
257
  target_language,
258
  api_key,
259
  model_id,
 
265
  # Connect the extract button to the function
266
  extract_button.click(
267
  fn=extract_wikipedia_content,
268
+ inputs=[wiki_url, api_key, model_id, base_url, target_language, content_format],
269
  outputs=[
270
  output,
271
  article_pageid,
utils.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def extract_wiki_id(wiki_url):
    """
    Extract the Wikipedia article identifier from a URL.

    Args:
        wiki_url (str): A Wikipedia article URL, e.g.
            "https://en.wikipedia.org/wiki/Artificial_intelligence".

    Returns:
        str | None: The path segment following "wiki/", trimmed at the
        first "#" or "?", or None when the URL contains no "wiki/" part.
    """
    import re

    found = re.search(r'wiki/([^#?]+)', wiki_url)
    if found is None:
        return None
    return found.group(1)
9
+
10
def get_wiki_details(wiki_id):
    """
    Placeholder function to get Wikipedia details using the wiki ID.

    Args:
        wiki_id (str): The Wikipedia article identifier. Currently unused;
            the function returns fixed dummy data regardless of its value.

    Returns:
        dict: Keys "pageid", "title", "summary", "wiki_xml" and "sections"
        (a mapping of section name to section content).
    """
    # This should interact with the Wikipedia API or your backend service.
    # For now we return a hard-coded stand-in article.
    dummy_sections = {
        "Introduction": "AI Introduction content...",
        "History": "AI History content...",
        "Applications": "AI Applications content...",
    }
    return {
        "pageid": 123456,
        "title": "Artificial Intelligence",
        "summary": "AI is the simulation of human intelligence in machines.",
        "wiki_xml": "<xml>...</xml>",
        "sections": dummy_sections,
    }
27
+
28
def init_llm_client(api_key, base_url="https://api.openai.com/v1"):
    """
    Initializes the LLM client with the given API key and base URL.

    Args:
        api_key (str): API key for the OpenAI-compatible endpoint.
        base_url (str): Base URL of the API; defaults to OpenAI's v1 endpoint.

    Returns:
        module: The configured ``openai`` module.

    NOTE(review): module-level ``api_key``/``api_base`` assignment is the
    pre-1.0 ``openai`` package interface — confirm the pinned dependency
    version matches.
    """
    import openai

    client = openai
    client.api_key = api_key
    client.api_base = base_url
    return client
38
+
39
def split_content_into_sections(wiki_xml, content_format="Plain Text"):
    """
    Split the Wikipedia content into logical sections.

    Args:
        wiki_xml (str): The XML content of the Wikipedia article.
        content_format (str): "XML" keeps each section as raw XML markup;
            any other value returns whitespace-normalized plain text.

    Returns:
        dict: A dictionary mapping section names (tag names of the root's
        direct children) to their content.

    Raises:
        xml.etree.ElementTree.ParseError: If wiki_xml is not well-formed XML.
    """
    from xml.etree import ElementTree as ET

    # Parse the XML content; each direct child of the root is one section.
    root = ET.fromstring(wiki_xml)

    sections = {}
    for child in root:
        if content_format == "XML":
            # Keep the section's markup verbatim.
            sections[child.tag] = ET.tostring(child, encoding='unicode')
        else:  # Plain Text
            # itertext() yields the element's own text plus all descendant
            # text in document order. The previous implementation walked
            # child.iter() and appended elem.tail for every element, which
            # wrongly included the section's OWN tail (text that belongs to
            # the parent, e.g. whitespace between sections) and injected
            # arbitrary extra spaces between fragments.
            text = "".join(child.itertext())
            # Collapse runs of whitespace/newlines into single spaces.
            sections[child.tag] = " ".join(text.split())

    return sections
79
+
80
def get_translate_prompt(article_title, artice_summary, original_content, target_lang):
    """
    Constructs the translation prompt for the LLM.

    NOTE(review): the parameter name ``artice_summary`` is a typo for
    ``article_summary``, but callers pass it by keyword, so renaming it
    here would break them. NOTE(review): this duplicates a richer
    version in utils/llm_prompts.py that also takes ``content_format``
    — confirm which one callers actually import.

    Args:
        article_title: Title of the Wikipedia article being translated.
        artice_summary: Short summary of the article (name typo preserved).
        original_content: The content to translate.
        target_lang: Name of the language to translate into.

    Returns:
        str: A prompt string embedding the title, summary, and content.
    """
    return f"""
    You are a professional translator. Translate the following content to {target_lang}.

    Title: {article_title}
    Summary: {artice_summary}

    Content: {original_content}
    """
utils/__pycache__/llm_parser.cpython-310.pyc CHANGED
Binary files a/utils/__pycache__/llm_parser.cpython-310.pyc and b/utils/__pycache__/llm_parser.cpython-310.pyc differ
 
utils/__pycache__/wikipedia_extractor.cpython-310.pyc CHANGED
Binary files a/utils/__pycache__/wikipedia_extractor.cpython-310.pyc and b/utils/__pycache__/wikipedia_extractor.cpython-310.pyc differ
 
utils/llm_prompts.py CHANGED
@@ -1,15 +1,15 @@
1
  import json
2
 
3
- def get_translate_prompt(article_title, artice_summary, original_content, target_lang):
4
  """
5
- Function to get the translation prompt for the LLM to translate Wikipedia XML content
6
  with high quality and fidelity to the original.
7
  """
8
  # Define the prompt template
9
  translate_prompt = (
10
  "# Task\n"
11
  "You are an expert Wikipedia translator specializing in multilingual content adaptation. "
12
- "Your task is to translate the provided XML content into {target_lang} while preserving the "
13
  "academic tone, factual accuracy, and encyclopedic style of Wikipedia.\n\n"
14
 
15
  "# Article Original Title\n"
@@ -18,15 +18,15 @@ def get_translate_prompt(article_title, artice_summary, original_content, target
18
  "# Article Summary\n"
19
  "{article_summary}\n\n"
20
 
21
- "# Article Original Content (XML format)\n"
22
  "{original_content}\n\n"
23
 
24
  "# Target Language\n"
25
  "{target_lang}\n\n"
26
 
27
  "# Instructions\n"
28
- "1. Preserve all XML tags, attributes, and structure exactly as they appear\n"
29
- "2. Translate only the text content between XML tags\n"
30
  "3. Maintain Wikipedia's neutral point of view and encyclopedic style\n"
31
  "4. Preserve proper nouns, scientific terminology, and citations appropriately\n"
32
  "5. Adapt cultural references or idioms to be understandable in the target language\n"
@@ -37,7 +37,7 @@ def get_translate_prompt(article_title, artice_summary, original_content, target
37
  "Return a single JSON object with the following structure:\n"
38
  "```json\n"
39
  "{{\n"
40
- " \"output_content\": \"The complete translated XML content with all tags preserved\"\n"
41
  "}}\n"
42
  "```\n\n"
43
 
@@ -58,7 +58,8 @@ def get_translate_prompt(article_title, artice_summary, original_content, target
58
  article_title=article_title,
59
  article_summary=artice_summary,
60
  original_content=original_content,
61
- target_lang=target_lang
 
62
  )
63
 
64
  return formatted_prompt
 
1
  import json
2
 
3
+ def get_translate_prompt(article_title, artice_summary, content_format, original_content, target_lang):
4
  """
5
+ Function to get the translation prompt for the LLM to translate Wikipedia {content_format} content
6
  with high quality and fidelity to the original.
7
  """
8
  # Define the prompt template
9
  translate_prompt = (
10
  "# Task\n"
11
  "You are an expert Wikipedia translator specializing in multilingual content adaptation. "
12
+ "Your task is to translate the provided {content_format} content into {target_lang} while preserving the "
13
  "academic tone, factual accuracy, and encyclopedic style of Wikipedia.\n\n"
14
 
15
  "# Article Original Title\n"
 
18
  "# Article Summary\n"
19
  "{article_summary}\n\n"
20
 
21
+ "# Article Original Content ({content_format} format)\n"
22
  "{original_content}\n\n"
23
 
24
  "# Target Language\n"
25
  "{target_lang}\n\n"
26
 
27
  "# Instructions\n"
28
+ "1. If provided input is XML code, Preserve all XML tags, attributes, and structure exactly as they appear\n"
29
+ "2. If provided input is XML code, Translate only the text content between XML tags\n"
30
  "3. Maintain Wikipedia's neutral point of view and encyclopedic style\n"
31
  "4. Preserve proper nouns, scientific terminology, and citations appropriately\n"
32
  "5. Adapt cultural references or idioms to be understandable in the target language\n"
 
37
  "Return a single JSON object with the following structure:\n"
38
  "```json\n"
39
  "{{\n"
40
+ " \"output_content\": \"The complete translated {content_format} content with all tags preserved\"\n"
41
  "}}\n"
42
  "```\n\n"
43
 
 
58
  article_title=article_title,
59
  article_summary=artice_summary,
60
  original_content=original_content,
61
+ target_lang=target_lang,
62
+ content_format=content_format
63
  )
64
 
65
  return formatted_prompt
utils/wikipedia_extractor.py CHANGED
@@ -111,13 +111,14 @@ def get_wiki_xml(page_title):
111
  return None, {"error": f"An error occurred: {str(e)}"}
112
 
113
  # function to split content into sections using === [SECTION NAME] === regex pattern
114
- def split_content_into_sections(content: str) -> List[str]:
115
 
116
  """
117
  Splits the content into sections using the === [SECTION NAME] === regex pattern.
118
 
119
  Args:
120
  content (str): The content to split.
 
121
 
122
  Returns:
123
  dict: The sections dictionary.
 
111
  return None, {"error": f"An error occurred: {str(e)}"}
112
 
113
  # function to split content into sections using === [SECTION NAME] === regex pattern
114
+ def split_content_into_sections(content: str, content_format: str=None) -> List[str]:
115
 
116
  """
117
  Splits the content into sections using the === [SECTION NAME] === regex pattern.
118
 
119
  Args:
120
  content (str): The content to split.
121
+ content_format (str): The format to return the content in ("Plain Text" or "XML").
122
 
123
  Returns:
124
  dict: The sections dictionary.