bakrianoo committed on
Commit e424603 · 1 Parent(s): ce96e8f

extract wikipedia details

app.py CHANGED
@@ -1,7 +1,6 @@
 import gradio as gr
-import wikipedia
-import openai
-import os
+from utils import extract_wiki_id, get_wiki_details, split_content_into_sections
+import json
 
 # Define language options for translation
 LANGUAGES = {
@@ -15,7 +14,6 @@ LANGUAGES = {
     "Russian": "ru",
     "Japanese": "ja",
     "Chinese": "zh",
-    "Arabic": "ar",
     "Hindi": "hi",
     "Korean": "ko"
 }
@@ -24,13 +22,54 @@ def extract_wikipedia_content(wiki_url, api_key, model_id, base_url, target_lang
     """
     Function to extract content from Wikipedia URL (placeholder for now)
     """
-    # Will implement the actual extraction and translation later
-    return f"Configuration saved. API Key: {api_key[:5]}..., Model: {model_id}, Target Language: {target_lang}"
+    wiki_id = extract_wiki_id(wiki_url)
+    if not wiki_id:
+        return "Invalid Wikipedia URL. Please check the URL and try again.", None, None, None, None, {}
+
+    # Get the details of the Wikipedia article
+    wiki_details = get_wiki_details(wiki_id)
+    content_sections = split_content_into_sections(wiki_details['wiki_xml'])
+
+    return (
+        "Extraction complete! Sections: " + str(len(content_sections)),
+        wiki_details['pageid'],
+        wiki_details['title'],
+        wiki_details['summary'],
+        wiki_details['wiki_xml'],
+        content_sections
+    )
+
+def update_ui_with_sections(sections_dict):
+    """
+    Creates a list of components to display in the sections area
+    """
+    components = []
+
+    if not sections_dict:
+        return [gr.update(visible=False) for _ in range(100)]  # Assuming max 100 sections
+
+    # Create visible components for available sections
+    for section_name, section_content in sections_dict.items():
+        components.append(gr.update(
+            value=section_content,
+            label=f"Section: {section_name}",
+            visible=True
+        ))
+
+    # Hide any unused components
+    remaining = 100 - len(components)  # Assuming max 100 sections
+    for _ in range(remaining):
+        components.append(gr.update(visible=False))
+
+    return components
 
 # Create Gradio app
-with gr.Blocks(theme=gr.themes.Monochrome()) as app:
+with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
     gr.Markdown("# Wikipedia Translator")
 
+    # State variable to store sections
+    sections_state = gr.State({})
+
     with gr.Row():
         # Sidebar for configuration
        with gr.Column(scale=1):
@@ -81,19 +120,60 @@ with gr.Blocks(theme=gr.themes.Monochrome()) as app:
 
            extract_button = gr.Button("Extract and Prepare for Translation", variant="primary")
 
-           output = gr.Textbox(label="Status")
+           output = gr.Markdown(label="Status")
 
            # Results area (will expand in the future)
-           article_info = gr.Textbox(label="Article Information", visible=False)
-           article_content = gr.Textbox(label="Article Content", visible=False)
+           article_pageid = gr.Textbox(
+               label="Article Page ID",
+               placeholder="Page ID will appear here after extraction",
+               interactive=False
+           )
+
+           article_title = gr.Textbox(
+               label="Article Title",
+               placeholder="Title will appear here after extraction",
+               interactive=False
+           )
+
+           article_summary = gr.Textbox(
+               label="Article Summary",
+               placeholder="Summary will appear here after extraction",
+               interactive=False
+           )
+
+           article_xml = gr.Textbox(
+               label="Article XML",
+               placeholder="XML will appear here after extraction",
+               interactive=False,
+               visible=False  # Hidden by default as it's usually large
+           )
+
+           # Pre-define section textboxes (limit to 100 for simplicity)
+           gr.Markdown("### Article Sections")
+           with gr.Column() as sections_container:
+               section_textboxes = [
+                   gr.Textbox(visible=False, lines=4)
+                   for _ in range(100)  # Support up to 100 sections
+               ]
 
    # Connect the extract button to the function
    extract_button.click(
        fn=extract_wikipedia_content,
        inputs=[wiki_url, api_key, model_id, base_url, target_language],
-       outputs=[output]
+       outputs=[
+           output,
+           article_pageid,
+           article_title,
+           article_summary,
+           article_xml,
+           sections_state,
+       ]
+   ).then(
+       fn=update_ui_with_sections,
+       inputs=[sections_state],
+       outputs=section_textboxes
    )
 
 # Launch the app
 if __name__ == "__main__":
-    app.launch()
+    demo.launch()
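
The new wiring in app.py stores the extracted sections dictionary in a gr.State and then, in a chained second step, fans it out onto a fixed pool of hidden textboxes. Below is a minimal, self-contained sketch of that click → then pattern; the fake_extract helper, the 5-slot pool, and the component names are illustrative stand-ins and not part of this commit.

import gradio as gr

MAX_SECTIONS = 5  # illustrative pool size; app.py uses 100

def fake_extract(url):
    # Stand-in for extract_wikipedia_content: returns a status string plus a sections dict
    sections = {"Intro": f"Content of {url}", "History": "..."}
    return f"Found {len(sections)} sections", sections

def render_sections(sections):
    # Map the dict onto the fixed pool of textboxes; hide any unused slots
    updates = [gr.update(visible=False) for _ in range(MAX_SECTIONS)]
    for i, (name, text) in enumerate(list(sections.items())[:MAX_SECTIONS]):
        updates[i] = gr.update(value=text, label=name, visible=True)
    return updates

with gr.Blocks() as demo:
    url = gr.Textbox(label="Wikipedia URL")
    status = gr.Markdown()
    sections_state = gr.State({})
    boxes = [gr.Textbox(visible=False, lines=2) for _ in range(MAX_SECTIONS)]
    btn = gr.Button("Extract")
    btn.click(fn=fake_extract, inputs=[url], outputs=[status, sections_state]).then(
        fn=render_sections, inputs=[sections_state], outputs=boxes
    )

if __name__ == "__main__":
    demo.launch()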
utils/__init__.py ADDED
@@ -0,0 +1 @@
+from .wikipedia_extractor import (extract_wiki_id, get_wiki_details, split_content_into_sections)
utils/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (292 Bytes).
 
utils/__pycache__/wikipedia_extractor.cpython-310.pyc ADDED
Binary file (1.43 kB).
 
utils/wikipedia_extractor.py ADDED
@@ -0,0 +1,138 @@
+import wikipedia
+from typing import List, Dict, Any
+import urllib.parse
+import requests
+import xml.etree.ElementTree as ET
+import re
+
+
+# Function to extract wiki id from a given url
+def extract_wiki_id(url: str) -> str:
+    """
+    Extracts the wiki id from a given url.
+
+    Args:
+        url (str): The url to extract the wiki id from.
+
+    Returns:
+        str: The extracted wiki id.
+    """
+
+    # validate the url is from wikipedia
+    if "wikipedia.org" not in url:
+        raise ValueError("URL is not from Wikipedia")
+
+    # Parse the URL
+    parsed_url = urllib.parse.urlparse(url)
+
+    # Extract the path from the parsed URL
+    path = parsed_url.path
+
+    # Split the path into parts
+    path_parts = path.split('/')
+
+    # The wiki id is the last part of the path
+    wiki_id = path_parts[-1]
+
+    # Remove any query parameters
+    if '?' in wiki_id:
+        wiki_id = wiki_id.split('?')[0]
+
+    # Remove any fragment identifiers
+    if '#' in wiki_id:
+        wiki_id = wiki_id.split('#')[0]
+
+    return wiki_id
+
+# Function to get all details dictionary from a given wiki id
+def get_wiki_details(wiki_id: str) -> Dict[str, Any]:
+    """
+    Gets all details dictionary from a given wiki id.
+
+    Args:
+        wiki_id (str): The wiki id to get the details from.
+
+    Returns:
+        dict: The details dictionary.
+    """
+
+    # Get the page object
+    page = wikipedia.page(wiki_id)
+
+    wiki_xml, has_error = get_wiki_xml(wiki_id)
+    if has_error or not wiki_xml:
+        print(f"Error fetching XML data: {has_error}")
+        return None
+
+    # Get the details dictionary
+    details = {
+        "title": page.title,
+        "wiki_xml": wiki_xml,
+        "pageid": page.pageid,
+        "url": page.url,
+        "content": page.content,
+        "summary": page.summary,
+        "images": page.images,
+        "links": page.links,
+        "categories": page.categories,
+        "references": page.references,
+        "sections": page.sections
+    }
+
+    return details
+
+# Function to get xml data from a given wiki id
+def get_wiki_xml(page_title):
+    try:
+
+        # MediaWiki API endpoint
+        url = "https://en.wikipedia.org/w/api.php"
+
+        # Parameters for XML format
+        params = {
+            "action": "query",
+            "titles": page_title,
+            "prop": "revisions",
+            "rvprop": "content",
+            "format": "xml"
+        }
+
+        # Make the request
+        response = requests.get(url, params=params)
+        xml_content = response.text
+
+        return xml_content, None
+
+    except wikipedia.exceptions.PageError:
+        return None, {"error": f"Page '{page_title}' does not exist"}
+    except wikipedia.exceptions.DisambiguationError as e:
+        return None, {"error": f"Disambiguation error: {e}"}
+    except Exception as e:
+        return None, {"error": f"An error occurred: {str(e)}"}
+
+# Function to split content into sections using === [SECTION NAME] === regex pattern
+def split_content_into_sections(content: str) -> Dict[str, str]:
+
+    """
+    Splits the content into sections using the === [SECTION NAME] === regex pattern.
+
+    Args:
+        content (str): The content to split.
+
+    Returns:
+        dict: The sections dictionary.
+    """
+
+    sections_dict = {}
+
+    # Split the content into sections using regex
+    sections = re.split(r'={2,}([^=]+)={2,}', content)
+
+    # Iterate over the sections and add them to the dictionary
+    for i in range(1, len(sections), 2):
+        section_name = sections[i].strip()
+        section_content = sections[i + 1].strip()
+        sections_dict[section_name] = section_content
+
+    return sections_dict
+
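
For reference, a minimal usage sketch of the three helpers exported by utils, mirroring the call order used in extract_wikipedia_content in app.py. The example URL and printed fields are illustrative only; running it requires network access plus the wikipedia and requests packages.

from utils import extract_wiki_id, get_wiki_details, split_content_into_sections

url = "https://en.wikipedia.org/wiki/Alan_Turing"  # any English-Wikipedia article URL
wiki_id = extract_wiki_id(url)                     # -> "Alan_Turing"

details = get_wiki_details(wiki_id)                # returns None if the XML fetch fails
if details is not None:
    sections = split_content_into_sections(details["wiki_xml"])
    print(details["title"], "|", len(sections), "sections found")
    for name in list(sections)[:3]:
        print("-", name)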