# wikihop-server / db / wiki_parser_sqlite.py
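"""Parse a bz2-compressed Wikipedia XML dump into a SQLite database of articles
and the wiki links between them."""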
import bz2
import os
import re
import sqlite3
import time
from xml.sax import make_parser, handler


class WikiContentHandler(handler.ContentHandler):
    def __init__(self, db_conn, batch_size=1000, max_articles=None):
        self.db_conn = db_conn
        self.cursor = db_conn.cursor()
        self.batch_size = batch_size
        self.article_count = 0
        self.max_articles = max_articles
        self.article_batch = []
        self.links_batch = []

        # Current elements
        self.current_title = None
        self.current_text = None
        self.current_ns = None
        self.in_page = False
        self.in_title = False
        self.in_text = False
        self.in_ns = False
        self.buffer = []

    def startElement(self, name, attrs):
        if name == 'page':
            self.in_page = True
            self.current_title = None
            self.current_text = None
            self.current_ns = None
        elif self.in_page and name == 'title':
            self.in_title = True
            self.buffer = []
        elif self.in_page and name == 'ns':
            self.in_ns = True
            self.buffer = []
        elif self.in_page and name == 'text':
            self.in_text = True
            self.buffer = []

    def endElement(self, name):
        if name == 'page':
            self.in_page = False
            # Only process main namespace articles (ns = 0)
            if self.current_ns == '0' and self.current_title and self.current_text:
                # Extract links
                links = self.extract_links(self.current_text)

                # Add to batch
                self.article_batch.append(
                    (self.current_title, self.current_text)
                )

                # Add links to batch
                for link in links:
                    self.links_batch.append(
                        (self.current_title, link)
                    )

                self.article_count += 1

                # Print progress
                if self.article_count % 100 == 0:
                    print(f"Processed {self.article_count} articles...")

                # Insert batch if reached batch size
                if len(self.article_batch) >= self.batch_size:
                    self._insert_batch()

                # Check if we've reached the maximum number of articles
                if self.max_articles and self.article_count >= self.max_articles:
                    self._insert_batch()  # Insert any remaining items
                    raise StopIteration("Reached maximum number of articles")
        elif name == 'title':
            self.in_title = False
            self.current_title = ''.join(self.buffer)
        elif name == 'ns':
            self.in_ns = False
            self.current_ns = ''.join(self.buffer)
        elif name == 'text':
            self.in_text = False
            self.current_text = ''.join(self.buffer)
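
    # Note: SAX may deliver an element's text in several characters() calls,
    # so content is accumulated in self.buffer and joined in endElement().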
    def characters(self, content):
        if self.in_title or self.in_ns or self.in_text:
            self.buffer.append(content)

    def extract_links(self, text):
        """Extract links from article wikitext"""
        # Pattern to match [[Link]] or [[Link|Text]] format
        links = re.findall(r'\[\[([^|\]]+)(?:\|[^\]]+)?\]\]', text)
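        # e.g. "[[Paris]]" and "[[Paris|the French capital]]" both yield "Paris"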

        # Process links
        processed_links = []
        for link in links:
            # Skip non-article links (except categories which might be useful)
            if ':' in link and not link.startswith('Category:'):
                continue

            # Remove any section links (with #)
            link = link.split('#')[0].strip()

            # Skip empty links
            if not link:
                continue

            processed_links.append(link)

        # Remove duplicates and return
        return list(set(processed_links))

    def _insert_batch(self):
        """Insert batched data into the database"""
        if self.article_batch:
            self.cursor.executemany(
                "INSERT OR IGNORE INTO articles (title, text) VALUES (?, ?)",
                self.article_batch
            )
        if self.links_batch:
            self.cursor.executemany(
                "INSERT OR IGNORE INTO links (source_title, target_title) VALUES (?, ?)",
                self.links_batch
            )

        self.db_conn.commit()
        self.article_batch = []
        self.links_batch = []


def create_db_schema(db_conn):
    """Create the database schema"""
    cursor = db_conn.cursor()

    # Create articles table
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS articles (
            title TEXT PRIMARY KEY,
            text TEXT
        )
    ''')

    # Create links table
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS links (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            source_title TEXT,
            target_title TEXT,
            FOREIGN KEY (source_title) REFERENCES articles (title),
            UNIQUE (source_title, target_title)
        )
    ''')

    # Create indexes on links for faster queries
    cursor.execute('''
        CREATE INDEX IF NOT EXISTS idx_links_source ON links (source_title)
    ''')
    cursor.execute('''
        CREATE INDEX IF NOT EXISTS idx_links_target ON links (target_title)
    ''')

    db_conn.commit()
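

# Illustrative queries against this schema (not used by the script itself):
#   outgoing links: SELECT target_title FROM links WHERE source_title = ?
#   incoming links: SELECT source_title FROM links WHERE target_title = ?
# These are served by idx_links_source and idx_links_target respectively.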


def parse_wiki_dump(dump_path, db_path, batch_size=1000, max_articles=None):
    """
    Parse the Wikipedia XML dump and extract articles with their links into a SQLite database.

    Args:
        dump_path: Path to the bz2 Wikipedia dump
        db_path: Path to save the SQLite database
        batch_size: Number of articles to process before committing to the database
        max_articles: Maximum number of articles to extract (None for all)

    Returns:
        The path to the created SQLite database
    """
    start_time = time.time()
    print(f"Parsing Wikipedia dump: {dump_path}")

    # Create or connect to SQLite database
    db_conn = sqlite3.connect(db_path)

    # Create schema
    create_db_schema(db_conn)

    # Create SAX parser with custom content handler
    parser = make_parser()
    content_handler = WikiContentHandler(db_conn, batch_size, max_articles)
    parser.setContentHandler(content_handler)

    # Parse the dump
    try:
        parser.parse(bz2.BZ2File(dump_path))
        # Insert any remaining items in the batch
        content_handler._insert_batch()
    except StopIteration:
        print("Reached maximum number of articles")
    except Exception as e:
        print(f"Error parsing dump: {e}")
        raise
    finally:
        db_conn.commit()
        db_conn.close()

    duration = time.time() - start_time
    print(f"Extracted {content_handler.article_count} articles in {duration:.2f} seconds.")
    print(f"Data saved to {db_path}")
    return db_path
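

# Programmatic usage (file names below are illustrative):
#   db_path = parse_wiki_dump("enwiki-latest-pages-articles.xml.bz2", "wiki.db",
#                             batch_size=5000, max_articles=100_000)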


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description='Parse Wikipedia XML dump to SQLite')
    parser.add_argument('dump_path', help='Path to the Wikipedia XML dump (bz2 file)')
    parser.add_argument('output_path', help='Path to save the SQLite database')
    parser.add_argument('--batch-size', type=int, default=1000,
                        help='Batch size for database inserts (default: 1000)')
    parser.add_argument('--max-articles', type=int, default=None,
                        help='Maximum number of articles to extract (default: all)')
    args = parser.parse_args()

    # Create output directory if it doesn't exist
    output_dir = os.path.dirname(args.output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Parse the dump
    parse_wiki_dump(args.dump_path, args.output_path, args.batch_size, args.max_articles)
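
# Example invocation (file names are illustrative):
#   python wiki_parser_sqlite.py enwiki-latest-pages-articles.xml.bz2 data/wiki.db --max-articles 10000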