bakrianoo commited on
Commit
49b1cdf
·
1 Parent(s): 06c3d9b

Fix searching for Wikipedia pages

Browse files
Files changed (1) hide show
  1. utils/wikipedia_extractor.py +63 -29
utils/wikipedia_extractor.py CHANGED
@@ -1,5 +1,5 @@
1
  import wikipedia
2
- from typing import List, Dict, Any
3
  import urllib.parse
4
  import requests
5
  import xml.etree.ElementTree as ET
@@ -42,44 +42,79 @@ def extract_wiki_id(url: str) -> str:
42
  if '#' in wiki_id:
43
  wiki_id = wiki_id.split('#')[0]
44
 
 
 
 
 
 
 
45
  return wiki_id
46
 
47
  # Function to get all details dictionary from a given wiki id
48
- def get_wiki_details(wiki_id: str) -> Dict[str, Any]:
49
  """
50
- Gets all details dictionary from a given wiki id.
51
 
52
  Args:
53
- wiki_id (str): The wiki id to get the details from.
54
 
55
  Returns:
56
- dict: The details dictionary.
57
  """
58
-
59
- # Get the page object
60
- page = wikipedia.page(wiki_id)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
- wiki_xml, has_error = get_wiki_xml(wiki_id)
63
- if has_error or not wiki_xml:
64
- print(f"Error fetching XML data: {has_error}")
 
 
 
 
 
65
  return None
66
-
67
- # Get the details dictionary
68
- details = {
69
- "title": page.title,
70
- "wiki_xml": wiki_xml,
71
- "pageid": page.pageid,
72
- "url": page.url,
73
- "content": page.content,
74
- "summary": page.summary,
75
- "images": page.images,
76
- "links": page.links,
77
- "categories": page.categories,
78
- "references": page.references,
79
- "sections": page.sections
80
- }
81
-
82
- return details
83
 
84
  # function to get XML data from a given wiki id
85
  def get_wiki_xml(page_title):
@@ -136,4 +171,3 @@ def split_content_into_sections(content: str, content_format: str=None) -> List[
136
  sections_dict[section_name] = section_content
137
 
138
  return sections_dict
139
-
 
1
  import wikipedia
2
+ from typing import List, Dict, Any, Optional, Tuple
3
  import urllib.parse
4
  import requests
5
  import xml.etree.ElementTree as ET
 
42
  if '#' in wiki_id:
43
  wiki_id = wiki_id.split('#')[0]
44
 
45
+ # URL decode the wiki id to handle special characters
46
+ wiki_id = urllib.parse.unquote(wiki_id)
47
+
48
+ # Replace underscores with spaces as Wikipedia API expects spaces
49
+ wiki_id = wiki_id.replace('_', ' ')
50
+
51
  return wiki_id
52
 
53
# Function to get all details dictionary from a given wiki id
def get_wiki_details(wiki_id_or_url: str) -> Optional[Dict[str, Any]]:
    """
    Gets all details dictionary from a given wiki id or URL.

    Args:
        wiki_id_or_url (str): The wiki id or URL to get the details from.

    Returns:
        dict: The details dictionary or None if there was an error.
    """
    try:
        # Check if input is a URL and extract wiki_id if it is
        if "wikipedia.org" in wiki_id_or_url:
            wiki_id = extract_wiki_id(wiki_id_or_url)
        else:
            wiki_id = wiki_id_or_url

        # Get the page object. auto_suggest=False prevents the wikipedia
        # package from silently redirecting to a differently-titled article.
        try:
            page = wikipedia.page(wiki_id, auto_suggest=False)
        except wikipedia.exceptions.PageError:
            # If direct page lookup fails, try search
            search_results = wikipedia.search(wiki_id)
            if not search_results:
                print(f"No results found for '{wiki_id}'")
                return None

            # Use the first search result
            try:
                page = wikipedia.page(search_results[0], auto_suggest=False)
                print(f"Using closest match: '{page.title}' for query '{wiki_id}'")
            except (wikipedia.exceptions.PageError, wikipedia.exceptions.DisambiguationError) as e:
                print(f"Error with search result: {e}")
                return None

        # Fetch the raw XML export for the resolved page title; get_wiki_xml
        # returns (xml, has_error), so bail out on any reported failure.
        wiki_xml, has_error = get_wiki_xml(page.title)
        if has_error or not wiki_xml:
            print(f"Error fetching XML data: {has_error}")
            return None

        # Get the details dictionary
        details = {
            "title": page.title,
            "wiki_xml": wiki_xml,
            "pageid": page.pageid,
            "url": page.url,
            "content": page.content,
            "summary": page.summary,
            "images": page.images,
            "links": page.links,
            "categories": page.categories,
            "references": page.references,
            "sections": page.sections
        }

        return details

    except wikipedia.exceptions.DisambiguationError as e:
        print(f"Disambiguation error: {e}")
        # Fixed: was an f-string with no placeholders (ruff F541).
        print("Please specify one of the options above.")
        return None
    except Exception as e:
        # Top-level boundary: report and signal failure to the caller.
        print(f"An error occurred: {str(e)}")
        return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
 
119
  # function to get XML data from a given wiki id
120
  def get_wiki_xml(page_title):
 
171
  sections_dict[section_name] = section_content
172
 
173
  return sections_dict