stillerman (HF Staff) committed
Commit f64fe29 · 1 Parent(s): fa70ae5

fixed parser

Files changed (2)
  1. .gitignore +1 -0
  2. wiki_parser.py +114 -86
.gitignore CHANGED
@@ -1,2 +1,3 @@
 *.bz2
 data
+__pycache__
wiki_parser.py CHANGED
@@ -1,10 +1,108 @@
1
  import bz2
2
  import json
3
  import re
4
- import xml.etree.ElementTree as ET
5
- from collections import defaultdict
6
- from pathlib import Path
7
  import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
  def parse_wiki_dump(dump_path, output_path, max_articles=None):
10
  """
@@ -20,99 +118,29 @@ def parse_wiki_dump(dump_path, output_path, max_articles=None):
20
  """
21
  print(f"Parsing Wikipedia dump: {dump_path}")
22
 
23
- # Dictionary to store articles and their links
24
- wiki_data = {}
25
- article_count = 0
 
26
 
27
- # Namespace mapping in Wikipedia XML
28
- ns = {
29
- 'xml': 'http://www.w3.org/XML/1998/namespace'
30
- }
31
-
32
- # Initialize XML parser
33
- context = ET.iterparse(bz2.open(dump_path, 'rt', encoding='utf-8'), events=('end',))
34
-
35
- # Process XML elements
36
- for event, elem in context:
37
- if elem.tag == '{http://www.mediawiki.org/xml/export-0.10/}page':
38
- # Extract title
39
- title_elem = elem.find('.//{http://www.mediawiki.org/xml/export-0.10/}title')
40
- if title_elem is None:
41
- continue
42
-
43
- title = title_elem.text
44
-
45
- # Skip non-main namespace
46
- ns_elem = elem.find('.//{http://www.mediawiki.org/xml/export-0.10/}ns')
47
- if ns_elem is not None and ns_elem.text != '0':
48
- elem.clear()
49
- continue
50
-
51
- # Extract content
52
- text_elem = elem.find('.//{http://www.mediawiki.org/xml/export-0.10/}text')
53
- if text_elem is None or text_elem.text is None:
54
- elem.clear()
55
- continue
56
-
57
- content = text_elem.text
58
-
59
- # Extract links from content
60
- links = extract_links(content)
61
-
62
- # Add to wiki data
63
- wiki_data[title] = {
64
- 'title': title,
65
- 'text': content,
66
- 'links': links
67
- }
68
-
69
- article_count += 1
70
-
71
- # Print progress
72
- if article_count % 1000 == 0:
73
- print(f"Processed {article_count} articles...")
74
-
75
- # Check if we've reached the maximum number of articles
76
- if max_articles and article_count >= max_articles:
77
- break
78
-
79
- # Clear element to save memory
80
- elem.clear()
81
 
82
- print(f"Extracted {article_count} articles with their links.")
83
 
84
  # Save data to JSON file
85
  output_file = os.path.join(output_path, 'wiki_data.json')
86
  with open(output_file, 'w', encoding='utf-8') as f:
87
- json.dump(wiki_data, f)
88
 
89
  print(f"Data saved to {output_file}")
90
  return output_file
91
 
92
- def extract_links(text):
93
- """Extract links from article wikitext"""
94
- # Pattern to match [[Link]] or [[Link|Text]] format
95
- links = re.findall(r'\[\[([^|\]]+)(?:\|[^\]]+)?\]\]', text)
96
-
97
- # Process links
98
- processed_links = []
99
- for link in links:
100
- # Skip non-article links
101
- if ':' in link and not link.startswith('Category:'):
102
- continue
103
-
104
- # Remove any section links (with #)
105
- link = link.split('#')[0].strip()
106
-
107
- # Skip empty links
108
- if not link:
109
- continue
110
-
111
- processed_links.append(link)
112
-
113
- # Remove duplicates and return
114
- return list(set(processed_links))
115
-
116
  if __name__ == "__main__":
117
  import argparse
118
 
 
1
  import bz2
2
  import json
3
  import re
 
 
 
4
  import os
5
+ from pathlib import Path
6
+ from xml.sax import make_parser, handler
7
+
8
+ class WikiContentHandler(handler.ContentHandler):
9
+ def __init__(self, max_articles=None):
10
+ self.wiki_data = {}
11
+ self.article_count = 0
12
+ self.max_articles = max_articles
13
+
14
+ # Current elements
15
+ self.current_title = None
16
+ self.current_text = None
17
+ self.current_ns = None
18
+ self.in_page = False
19
+ self.in_title = False
20
+ self.in_text = False
21
+ self.in_ns = False
22
+ self.buffer = []
23
+
24
+ def startElement(self, name, attrs):
25
+ if name == 'page':
26
+ self.in_page = True
27
+ self.current_title = None
28
+ self.current_text = None
29
+ self.current_ns = None
30
+ elif self.in_page and name == 'title':
31
+ self.in_title = True
32
+ self.buffer = []
33
+ elif self.in_page and name == 'ns':
34
+ self.in_ns = True
35
+ self.buffer = []
36
+ elif self.in_page and name == 'text':
37
+ self.in_text = True
38
+ self.buffer = []
39
+
40
+ def endElement(self, name):
41
+ if name == 'page':
42
+ self.in_page = False
43
+ # Only process main namespace articles (ns = 0)
44
+ if self.current_ns == '0' and self.current_title and self.current_text:
45
+ # Extract links
46
+ links = self.extract_links(self.current_text)
47
+
48
+ # Add to wiki data
49
+ self.wiki_data[self.current_title] = {
50
+ 'title': self.current_title,
51
+ 'text': self.current_text,
52
+ 'links': links
53
+ }
54
+
55
+ self.article_count += 1
56
+
57
+ # Print progress
58
+ if self.article_count % 100 == 0:
59
+ print(f"Processed {self.article_count} articles...")
60
+
61
+ # Check if we've reached the maximum number of articles
62
+ if self.max_articles and self.article_count >= self.max_articles:
63
+ raise StopIteration("Reached maximum number of articles")
64
+
65
+ elif name == 'title':
66
+ self.in_title = False
67
+ self.current_title = ''.join(self.buffer)
68
+ elif name == 'ns':
69
+ self.in_ns = False
70
+ self.current_ns = ''.join(self.buffer)
71
+ elif name == 'text':
72
+ self.in_text = False
73
+ self.current_text = ''.join(self.buffer)
74
+
75
+ def characters(self, content):
76
+ if self.in_title:
77
+ self.buffer.append(content)
78
+ elif self.in_ns:
79
+ self.buffer.append(content)
80
+ elif self.in_text:
81
+ self.buffer.append(content)
82
+
83
+ def extract_links(self, text):
84
+ """Extract links from article wikitext"""
85
+ # Pattern to match [[Link]] or [[Link|Text]] format
86
+ links = re.findall(r'\[\[([^|\]]+)(?:\|[^\]]+)?\]\]', text)
87
+
88
+ # Process links
89
+ processed_links = []
90
+ for link in links:
91
+ # Skip non-article links (except categories which might be useful)
92
+ if ':' in link and not link.startswith('Category:'):
93
+ continue
94
+
95
+ # Remove any section links (with #)
96
+ link = link.split('#')[0].strip()
97
+
98
+ # Skip empty links
99
+ if not link:
100
+ continue
101
+
102
+ processed_links.append(link)
103
+
104
+ # Remove duplicates and return
105
+ return list(set(processed_links))
106
 
107
  def parse_wiki_dump(dump_path, output_path, max_articles=None):
108
  """
 
118
  """
119
  print(f"Parsing Wikipedia dump: {dump_path}")
120
 
121
+ # Create SAX parser with custom content handler
122
+ parser = make_parser()
123
+ content_handler = WikiContentHandler(max_articles)
124
+ parser.setContentHandler(content_handler)
125
 
126
+ # Parse the dump
127
+ try:
128
+ parser.parse(bz2.BZ2File(dump_path))
129
+ except StopIteration:
130
+ print("Reached maximum number of articles")
131
+ except Exception as e:
132
+ print(f"Error parsing dump: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
 
134
+ print(f"Extracted {content_handler.article_count} articles with their links.")
135
 
136
  # Save data to JSON file
137
  output_file = os.path.join(output_path, 'wiki_data.json')
138
  with open(output_file, 'w', encoding='utf-8') as f:
139
+ json.dump(content_handler.wiki_data, f)
140
 
141
  print(f"Data saved to {output_file}")
142
  return output_file
143
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  if __name__ == "__main__":
145
  import argparse
146
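The if __name__ == "__main__": block is truncated at new line 146 of this diff, so the actual command-line interface is not visible here. A minimal sketch of how the rewritten, SAX-based parse_wiki_dump could be driven with argparse follows; the flag names, help text, and the wiki_parser import are assumptions for illustration, not taken from the commit.

import argparse

from wiki_parser import parse_wiki_dump  # assumes the module name matches the file above

# Hypothetical driver; the real argparse block is cut off in this view.
arg_parser = argparse.ArgumentParser(description="Parse a Wikipedia XML dump into wiki_data.json")
arg_parser.add_argument("dump_path", help="Path to the *.xml.bz2 dump file")
arg_parser.add_argument("output_path", help="Directory where wiki_data.json is written")
arg_parser.add_argument("--max-articles", type=int, default=None,
                        help="Stop after this many main-namespace articles")
args = arg_parser.parse_args()

parse_wiki_dump(args.dump_path, args.output_path, args.max_articles)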