stillerman (HF Staff) committed
Commit e91ced9 · 1 Parent(s): 2c2bab6

sqlite backend

README.md CHANGED
@@ -1 +1,4 @@
 wget https://dumps.wikimedia.org/simplewiki/20250420/simplewiki-20250420-pages-articles-multistream.xml.bz2
+
+
+python db/wiki_parser_sqlite.py simplewiki-20250420-pages-articles-multistream.xml.bz2 db/data/wikihop.db --batch-size 10000
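A quick sanity check of the resulting database (a minimal sketch, assuming the db/data/wikihop.db path from the command above and the articles/links schema created by the parser):

import sqlite3

# Count rows in the two tables built by db/wiki_parser_sqlite.py
conn = sqlite3.connect("db/data/wikihop.db")
print(conn.execute("SELECT COUNT(*) FROM articles").fetchone()[0], "articles")
print(conn.execute("SELECT COUNT(*) FROM links").fetchone()[0], "links")
conn.close()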
db/wiki_db_sqlite.py ADDED
@@ -0,0 +1,79 @@
+import sqlite3
+
+class WikiDBSqlite:
+    def __init__(self, db_path):
+        """Initialize the database with path to SQLite database"""
+        self.db_path = db_path
+        self.conn = sqlite3.connect(db_path)
+        self.conn.row_factory = sqlite3.Row
+        self.cursor = self.conn.cursor()
+        self._article_count = self._get_article_count()
+        print(f"Connected to SQLite database with {self._article_count} articles")
+
+    def __del__(self):
+        """Close database connection when object is destroyed"""
+        if hasattr(self, 'conn') and self.conn:
+            self.conn.close()
+
+    def _get_article_count(self):
+        """Get the number of articles in the database"""
+        self.cursor.execute("SELECT COUNT(*) FROM articles")
+        return self.cursor.fetchone()[0]
+
+    def get_article_count(self):
+        """Return the number of articles in the database"""
+        return self._article_count
+
+    def get_all_article_titles(self):
+        """Return a list of all article titles"""
+        self.cursor.execute("SELECT title FROM articles")
+        return [row[0] for row in self.cursor.fetchall()]
+
+    def get_article(self, title):
+        """Get article data by title"""
+        self.cursor.execute(
+            "SELECT title, text FROM articles WHERE title = ?",
+            (title,)
+        )
+        article = self.cursor.fetchone()
+
+        if not article:
+            return {}
+
+        # Get links for this article
+        self.cursor.execute(
+            "SELECT target_title FROM links WHERE source_title = ?",
+            (title,)
+        )
+        links = [row[0] for row in self.cursor.fetchall()]
+
+        return {
+            'title': article['title'],
+            'text': article['text'],
+            'links': links
+        }
+
+    def article_exists(self, title):
+        """Check if an article exists in the database"""
+        self.cursor.execute(
+            "SELECT 1 FROM articles WHERE title = ? LIMIT 1",
+            (title,)
+        )
+        return bool(self.cursor.fetchone())
+
+    def get_article_text(self, title):
+        """Get the text of an article"""
+        self.cursor.execute(
+            "SELECT text FROM articles WHERE title = ?",
+            (title,)
+        )
+        result = self.cursor.fetchone()
+        return result['text'] if result else ''
+
+    def get_article_links(self, title):
+        """Get the links of an article"""
+        self.cursor.execute(
+            "SELECT target_title FROM links WHERE source_title = ?",
+            (title,)
+        )
+        return [row[0] for row in self.cursor.fetchall()]
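A minimal usage sketch of the new read-side class (the database path and the article title "Python (programming language)" are assumptions and may not exist in every dump):

from db.wiki_db_sqlite import WikiDBSqlite

db = WikiDBSqlite("db/data/wikihop.db")   # path from the README; use your own .db file
print(db.get_article_count())             # total articles loaded from the dump
if db.article_exists("Python (programming language)"):
    article = db.get_article("Python (programming language)")
    print(article['title'], len(article['links']), "outgoing links")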
db/{wiki_parser.py → wiki_parser_json.py} RENAMED
File without changes
db/wiki_parser_sqlite.py ADDED
@@ -0,0 +1,238 @@
+import bz2
+import re
+import os
+import sqlite3
+from pathlib import Path
+from xml.sax import make_parser, handler
+import time
+
+class WikiContentHandler(handler.ContentHandler):
+    def __init__(self, db_conn, batch_size=1000, max_articles=None):
+        self.db_conn = db_conn
+        self.cursor = db_conn.cursor()
+        self.batch_size = batch_size
+        self.article_count = 0
+        self.max_articles = max_articles
+        self.article_batch = []
+        self.links_batch = []
+
+        # Current elements
+        self.current_title = None
+        self.current_text = None
+        self.current_ns = None
+        self.in_page = False
+        self.in_title = False
+        self.in_text = False
+        self.in_ns = False
+        self.buffer = []
+
+    def startElement(self, name, attrs):
+        if name == 'page':
+            self.in_page = True
+            self.current_title = None
+            self.current_text = None
+            self.current_ns = None
+        elif self.in_page and name == 'title':
+            self.in_title = True
+            self.buffer = []
+        elif self.in_page and name == 'ns':
+            self.in_ns = True
+            self.buffer = []
+        elif self.in_page and name == 'text':
+            self.in_text = True
+            self.buffer = []
+
+    def endElement(self, name):
+        if name == 'page':
+            self.in_page = False
+            # Only process main namespace articles (ns = 0)
+            if self.current_ns == '0' and self.current_title and self.current_text:
+                # Extract links
+                links = self.extract_links(self.current_text)
+
+                # Add to batch
+                self.article_batch.append(
+                    (self.current_title, self.current_text)
+                )
+
+                # Add links to batch
+                for link in links:
+                    self.links_batch.append(
+                        (self.current_title, link)
+                    )
+
+                self.article_count += 1
+
+                # Print progress
+                if self.article_count % 100 == 0:
+                    print(f"Processed {self.article_count} articles...")
+
+                # Insert batch if reached batch size
+                if len(self.article_batch) >= self.batch_size:
+                    self._insert_batch()
+
+                # Check if we've reached the maximum number of articles
+                if self.max_articles and self.article_count >= self.max_articles:
+                    self._insert_batch()  # Insert any remaining items
+                    raise StopIteration("Reached maximum number of articles")
+
+        elif name == 'title':
+            self.in_title = False
+            self.current_title = ''.join(self.buffer)
+        elif name == 'ns':
+            self.in_ns = False
+            self.current_ns = ''.join(self.buffer)
+        elif name == 'text':
+            self.in_text = False
+            self.current_text = ''.join(self.buffer)
+
+    def characters(self, content):
+        if self.in_title:
+            self.buffer.append(content)
+        elif self.in_ns:
+            self.buffer.append(content)
+        elif self.in_text:
+            self.buffer.append(content)
+
+    def extract_links(self, text):
+        """Extract links from article wikitext"""
+        # Pattern to match [[Link]] or [[Link|Text]] format
+        links = re.findall(r'\[\[([^|\]]+)(?:\|[^\]]+)?\]\]', text)
+
+        # Process links
+        processed_links = []
+        for link in links:
+            # Skip non-article links (except categories which might be useful)
+            if ':' in link and not link.startswith('Category:'):
+                continue
+
+            # Remove any section links (with #)
+            link = link.split('#')[0].strip()
+
+            # Skip empty links
+            if not link:
+                continue
+
+            processed_links.append(link)
+
+        # Remove duplicates and return
+        return list(set(processed_links))
+
+    def _insert_batch(self):
+        """Insert batched data into the database"""
+        if self.article_batch:
+            self.cursor.executemany(
+                "INSERT OR IGNORE INTO articles (title, text) VALUES (?, ?)",
+                self.article_batch
+            )
+
+        if self.links_batch:
+            self.cursor.executemany(
+                "INSERT OR IGNORE INTO links (source_title, target_title) VALUES (?, ?)",
+                self.links_batch
+            )
+
+        self.db_conn.commit()
+        self.article_batch = []
+        self.links_batch = []
+
+def create_db_schema(db_conn):
+    """Create the database schema"""
+    cursor = db_conn.cursor()
+
+    # Create articles table
+    cursor.execute('''
+    CREATE TABLE IF NOT EXISTS articles (
+        title TEXT PRIMARY KEY,
+        text TEXT
+    )
+    ''')
+
+    # Create links table
+    cursor.execute('''
+    CREATE TABLE IF NOT EXISTS links (
+        id INTEGER PRIMARY KEY AUTOINCREMENT,
+        source_title TEXT,
+        target_title TEXT,
+        FOREIGN KEY (source_title) REFERENCES articles (title),
+        UNIQUE (source_title, target_title)
+    )
+    ''')
+
+    # Create index on links for faster queries
+    cursor.execute('''
+    CREATE INDEX IF NOT EXISTS idx_links_source ON links (source_title)
+    ''')
+
+    cursor.execute('''
+    CREATE INDEX IF NOT EXISTS idx_links_target ON links (target_title)
+    ''')
+
+    db_conn.commit()
+
+def parse_wiki_dump(dump_path, db_path, batch_size=1000, max_articles=None):
+    """
+    Parse the Wikipedia XML dump and extract articles with their links into SQLite database.
+
+    Args:
+        dump_path: Path to the bz2 Wikipedia dump
+        db_path: Path to save the SQLite database
+        batch_size: Number of articles to process before committing to the database
+        max_articles: Maximum number of articles to extract (None for all)
+
+    Returns:
+        The path to the created SQLite database
+    """
+    start_time = time.time()
+    print(f"Parsing Wikipedia dump: {dump_path}")
+
+    # Create or connect to SQLite database
+    db_conn = sqlite3.connect(db_path)
+
+    # Create schema
+    create_db_schema(db_conn)
+
+    # Create SAX parser with custom content handler
+    parser = make_parser()
+    content_handler = WikiContentHandler(db_conn, batch_size, max_articles)
+    parser.setContentHandler(content_handler)
+
+    # Parse the dump
+    try:
+        parser.parse(bz2.BZ2File(dump_path))
+        # Insert any remaining items in the batch
+        content_handler._insert_batch()
+    except StopIteration:
+        print("Reached maximum number of articles")
+    except Exception as e:
+        print(f"Error parsing dump: {e}")
+        raise
+    finally:
+        db_conn.commit()
+        db_conn.close()
+
+    duration = time.time() - start_time
+    print(f"Extracted {content_handler.article_count} articles in {duration:.2f} seconds.")
+    print(f"Data saved to {db_path}")
+    return db_path
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(description='Parse Wikipedia XML dump to SQLite')
+    parser.add_argument('dump_path', help='Path to the Wikipedia XML dump (bz2 file)')
+    parser.add_argument('output_path', help='Path to save the SQLite database')
+    parser.add_argument('--batch-size', type=int, default=1000,
+                        help='Batch size for database inserts (default: 1000)')
+    parser.add_argument('--max-articles', type=int, default=None,
+                        help='Maximum number of articles to extract (default: all)')
+
+    args = parser.parse_args()
+
+    # Create output directory if it doesn't exist
+    output_dir = os.path.dirname(args.output_path)
+    if output_dir:
+        os.makedirs(output_dir, exist_ok=True)
+
+    # Parse the dump
+    parse_wiki_dump(args.dump_path, args.output_path, args.batch_size, args.max_articles)
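For a quick smoke test before a full parse, the same entry point can be driven from Python with max_articles capped (a sketch; the paths mirror the README and the output path is hypothetical):

from db.wiki_parser_sqlite import parse_wiki_dump

# Build a small sample database from the first 500 main-namespace articles.
parse_wiki_dump(
    "simplewiki-20250420-pages-articles-multistream.xml.bz2",
    "db/data/wikihop-sample.db",   # hypothetical output path
    batch_size=100,
    max_articles=500,
)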
engine.py CHANGED
@@ -4,12 +4,17 @@
 # 3. Navigation between articles

 import random
+from db.wiki_db_sqlite import WikiDBSqlite
 from db.wiki_db_json import WikiDBJson

 class WikiRunEnvironment:
     def __init__(self, wiki_data_path):
         """Initialize with path to Wikipedia data"""
-        self.db = WikiDBJson(wiki_data_path)
+        if wiki_data_path.endswith('.json'):
+            self.db = WikiDBJson(wiki_data_path)
+        else:
+            self.db = WikiDBSqlite(wiki_data_path)
+
         self.current_article = None
         self.target_article = None
         self.path_taken = []
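With this change the environment picks its backend from the file extension, so both invocations below should work (the paths are assumptions; a .json path keeps the old behaviour):

from engine import WikiRunEnvironment

env_sqlite = WikiRunEnvironment("db/data/wikihop.db")     # any non-.json path -> WikiDBSqlite
env_json = WikiRunEnvironment("db/data/wikihop.json")     # .json path -> WikiDBJson (hypothetical file)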