extract wikipedia details
- app.py +92 -12
- utils/__init__.py +1 -0
- utils/__pycache__/__init__.cpython-310.pyc +0 -0
- utils/__pycache__/wikipedia_extractor.cpython-310.pyc +0 -0
- utils/wikipedia_extractor.py +138 -0
app.py
CHANGED
@@ -1,7 +1,6 @@
 import gradio as gr
-import
-import
-import os
+from utils import extract_wiki_id, get_wiki_details, split_content_into_sections
+import json
 
 # Define language options for translation
 LANGUAGES = {
@@ -15,7 +14,6 @@ LANGUAGES = {
     "Russian": "ru",
     "Japanese": "ja",
     "Chinese": "zh",
-    "Arabic": "ar",
     "Hindi": "hi",
     "Korean": "ko"
 }
@@ -24,13 +22,54 @@ def extract_wikipedia_content(wiki_url, api_key, model_id, base_url, target_language):
     """
     Function to extract content from Wikipedia URL (placeholder for now)
     """
-
-
+    wiki_id = extract_wiki_id(wiki_url)
+    if not wiki_id:
+        return "Invalid Wikipedia URL. Please check the URL and try again.", None, None, None, None, {}
+
+    # Get the details of the Wikipedia article
+    wiki_details = get_wiki_details(wiki_id)
+    content_sections = split_content_into_sections(wiki_details['wiki_xml'])
+
+    return (
+        "Extraction complete! Sections: " + str(len(content_sections)),
+        wiki_details['pageid'],
+        wiki_details['title'],
+        wiki_details['summary'],
+        wiki_details['wiki_xml'],
+        content_sections
+    )
+
+def update_ui_with_sections(sections_dict):
+    """
+    Creates a list of components to display in the sections area
+    """
+    components = []
+
+    if not sections_dict:
+        return [gr.update(visible=False) for _ in range(100)]  # Match the 100 pre-created section textboxes
+
+    # Create visible components for available sections
+    for section_name, section_content in sections_dict.items():
+        components.append(gr.update(
+            value=section_content,
+            label=f"Section: {section_name}",
+            visible=True
+        ))
+
+    # Hide any unused components
+    remaining = 100 - len(components)  # Assuming max 100 sections
+    for _ in range(remaining):
+        components.append(gr.update(visible=False))
+
+    return components
 
 # Create Gradio app
-with gr.Blocks(theme=gr.themes.Monochrome()) as app:
+with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
     gr.Markdown("# Wikipedia Translator")
 
+    # State variable to store sections
+    sections_state = gr.State({})
+
     with gr.Row():
         # Sidebar for configuration
        with gr.Column(scale=1):
@@ -81,19 +120,60 @@ with gr.Blocks(theme=gr.themes.Monochrome()) as app:
 
         extract_button = gr.Button("Extract and Prepare for Translation", variant="primary")
 
-        output = gr.
+        output = gr.Markdown(label="Status")
 
         # Results area (will expand in the future)
-
-
+        article_pageid = gr.Textbox(
+            label="Article Page ID",
+            placeholder="Page ID will appear here after extraction",
+            interactive=False
+        )
+
+        article_title = gr.Textbox(
+            label="Article Title",
+            placeholder="Title will appear here after extraction",
+            interactive=False
+        )
+
+        article_summary = gr.Textbox(
+            label="Article Summary",
+            placeholder="Summary will appear here after extraction",
+            interactive=False
+        )
+
+        article_xml = gr.Textbox(
+            label="Article XML",
+            placeholder="XML will appear here after extraction",
+            interactive=False,
+            visible=False  # Hidden by default as it's usually large
+        )
+
+        # Pre-define section textboxes (limit to 100 for simplicity)
+        gr.Markdown("### Article Sections")
+        with gr.Column() as sections_container:
+            section_textboxes = [
+                gr.Textbox(visible=False, lines=4)
+                for _ in range(100)  # Support up to 100 sections
+            ]
 
     # Connect the extract button to the function
     extract_button.click(
         fn=extract_wikipedia_content,
         inputs=[wiki_url, api_key, model_id, base_url, target_language],
-        outputs=[
+        outputs=[
+            output,
+            article_pageid,
+            article_title,
+            article_summary,
+            article_xml,
+            sections_state,
+        ]
+    ).then(
+        fn=update_ui_with_sections,
+        inputs=[sections_state],
+        outputs=section_textboxes
     )
 
 # Launch the app
 if __name__ == "__main__":
-
+    demo.launch()
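The event wiring above uses the standard Gradio workaround for a variable number of outputs: pre-create a fixed pool of hidden components, stash the dynamic data in gr.State, and chain .click().then() to toggle visibility and values. A minimal self-contained sketch of the same pattern (the names fetch and render are made up for illustration, and the pool is shrunk from 100 boxes to 3):

import gradio as gr

MAX_BOXES = 3  # stand-in for the 100 pre-created section textboxes in app.py

def fetch(n):
    # hypothetical producer: returns a status string plus the dict kept in gr.State
    data = {f"part {i}": f"content {i}" for i in range(int(n))}
    return f"Got {len(data)} parts", data

def render(data):
    # one gr.update per pre-created box, visible only where content exists
    updates = [gr.update(value=v, label=k, visible=True) for k, v in data.items()]
    updates += [gr.update(visible=False)] * (MAX_BOXES - len(updates))
    return updates

with gr.Blocks() as demo:
    count = gr.Number(value=2, label="How many parts")
    go = gr.Button("Fetch")
    status = gr.Markdown()
    state = gr.State({})
    boxes = [gr.Textbox(visible=False) for _ in range(MAX_BOXES)]
    go.click(fetch, inputs=[count], outputs=[status, state]).then(
        render, inputs=[state], outputs=boxes
    )

if __name__ == "__main__":
    demo.launch()

Because the textboxes exist from the start, the .then() step only flips their visibility and contents, which is why update_ui_with_sections must always return exactly as many updates as there are boxes.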
utils/__init__.py
ADDED
@@ -0,0 +1 @@
+from .wikipedia_extractor import (extract_wiki_id, get_wiki_details, split_content_into_sections)
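This re-export lets app.py pull the helpers from the package root (from utils import ...). A quick offline check of the URL parser, assuming the wikipedia and requests dependencies are installed, since the package import pulls them in:

from utils import extract_wiki_id

# urlparse drops the query string and fragment before the path is split
print(extract_wiki_id("https://en.wikipedia.org/wiki/Gradio?foo=1#History"))
# -> Gradio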
utils/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (292 Bytes).
utils/__pycache__/wikipedia_extractor.cpython-310.pyc
ADDED
Binary file (1.43 kB).
utils/wikipedia_extractor.py
ADDED
@@ -0,0 +1,138 @@
+import wikipedia
+from typing import Dict, Any
+import urllib.parse
+import requests
+import xml.etree.ElementTree as ET
+import re
+
+
+# Function to extract wiki id from a given url
+def extract_wiki_id(url: str) -> str:
+    """
+    Extracts the wiki id from a given url.
+
+    Args:
+        url (str): The url to extract the wiki id from.
+
+    Returns:
+        str: The extracted wiki id.
+    """
+
+    # validate the url is from wikipedia
+    if "wikipedia.org" not in url:
+        raise ValueError("URL is not from Wikipedia")
+
+    # Parse the URL
+    parsed_url = urllib.parse.urlparse(url)
+
+    # Extract the path from the parsed URL
+    path = parsed_url.path
+
+    # Split the path into parts
+    path_parts = path.split('/')
+
+    # The wiki id is the last part of the path
+    wiki_id = path_parts[-1]
+
+    # Remove any query parameters
+    if '?' in wiki_id:
+        wiki_id = wiki_id.split('?')[0]
+
+    # Remove any fragment identifiers
+    if '#' in wiki_id:
+        wiki_id = wiki_id.split('#')[0]
+
+    return wiki_id
+
+# Function to get all details dictionary from a given wiki id
+def get_wiki_details(wiki_id: str) -> Dict[str, Any]:
+    """
+    Gets all details dictionary from a given wiki id.
+
+    Args:
+        wiki_id (str): The wiki id to get the details from.
+
+    Returns:
+        dict: The details dictionary.
+    """
+
+    # Get the page object
+    page = wikipedia.page(wiki_id)
+
+    wiki_xml, has_error = get_wiki_xml(wiki_id)
+    if has_error or not wiki_xml:
+        print(f"Error fetching XML data: {has_error}")
+        return None
+
+    # Get the details dictionary
+    details = {
+        "title": page.title,
+        "wiki_xml": wiki_xml,
+        "pageid": page.pageid,
+        "url": page.url,
+        "content": page.content,
+        "summary": page.summary,
+        "images": page.images,
+        "links": page.links,
+        "categories": page.categories,
+        "references": page.references,
+        "sections": page.sections
+    }
+
+    return details
+
+# function to get xml data from a given wiki id
+def get_wiki_xml(page_title):
+    try:
+
+        # MediaWiki API endpoint
+        url = "https://en.wikipedia.org/w/api.php"
+
+        # Parameters for XML format
+        params = {
+            "action": "query",
+            "titles": page_title,
+            "prop": "revisions",
+            "rvprop": "content",
+            "format": "xml"
+        }
+
+        # Make the request
+        response = requests.get(url, params=params)
+        xml_content = response.text
+
+        return xml_content, None
+
+    except wikipedia.exceptions.PageError:
+        return None, {"error": f"Page '{page_title}' does not exist"}
+    except wikipedia.exceptions.DisambiguationError as e:
+        return None, {"error": f"Disambiguation error: {e}"}
+    except Exception as e:
+        return None, {"error": f"An error occurred: {str(e)}"}
+
+# function to split content into sections using === [SECTION NAME] === regex pattern
+def split_content_into_sections(content: str) -> Dict[str, str]:
+
+    """
+    Splits the content into sections using the === [SECTION NAME] === regex pattern.
+
+    Args:
+        content (str): The content to split.
+
+    Returns:
+        dict: The sections dictionary.
+    """
+
+    sections_dict = {}
+
+    # Split the content into sections using regex
+    sections = re.split(r'={2,}([^=]+)={2,}', content)
+
+    # Iterate over the sections and add them to the dictionary
+    for i in range(1, len(sections), 2):
+        section_name = sections[i].strip()
+        section_content = sections[i + 1].strip()
+        sections_dict[section_name] = section_content
+
+    return sections_dict
+
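Taken together, the three helpers form a small pipeline: URL -> page title -> details dict -> sections dict. The section splitter is pure string processing, so it can be sanity-checked without network access; a short sketch using the same regex on a made-up wikitext snippet:

import re

sample = """Intro paragraph.
== History ==
Early days.
=== Details ===
Fine print.
== Usage ==
How it is used."""

parts = re.split(r'={2,}([^=]+)={2,}', sample)
sections = {parts[i].strip(): parts[i + 1].strip()
            for i in range(1, len(parts), 2)}
print(sections)
# {'History': 'Early days.', 'Details': 'Fine print.', 'Usage': 'How it is used.'}

Note that the preamble before the first heading (parts[0]) is discarded, since the loop starts at index 1, and that heading levels (== vs ===) are flattened into a single dictionary.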