stillerman (HF Staff) committed
Commit f64fe29 · 1 Parent(s): fa70ae5

fixed parser

Files changed (2)
  1. .gitignore +1 -0
  2. wiki_parser.py +114 -86
.gitignore CHANGED
@@ -1,2 +1,3 @@
 *.bz2
 data
+__pycache__
wiki_parser.py CHANGED
@@ -1,10 +1,108 @@
1
  import bz2
2
  import json
3
  import re
4
- import xml.etree.ElementTree as ET
5
- from collections import defaultdict
6
- from pathlib import Path
7
  import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
  def parse_wiki_dump(dump_path, output_path, max_articles=None):
10
  """
@@ -20,99 +118,29 @@ def parse_wiki_dump(dump_path, output_path, max_articles=None):
20
  """
21
  print(f"Parsing Wikipedia dump: {dump_path}")
22
 
23
- # Dictionary to store articles and their links
24
- wiki_data = {}
25
- article_count = 0
 
26
 
27
- # Namespace mapping in Wikipedia XML
28
- ns = {
29
- 'xml': 'http://www.w3.org/XML/1998/namespace'
30
- }
31
-
32
- # Initialize XML parser
33
- context = ET.iterparse(bz2.open(dump_path, 'rt', encoding='utf-8'), events=('end',))
34
-
35
- # Process XML elements
36
- for event, elem in context:
37
- if elem.tag == '{http://www.mediawiki.org/xml/export-0.10/}page':
38
- # Extract title
39
- title_elem = elem.find('.//{http://www.mediawiki.org/xml/export-0.10/}title')
40
- if title_elem is None:
41
- continue
42
-
43
- title = title_elem.text
44
-
45
- # Skip non-main namespace
46
- ns_elem = elem.find('.//{http://www.mediawiki.org/xml/export-0.10/}ns')
47
- if ns_elem is not None and ns_elem.text != '0':
48
- elem.clear()
49
- continue
50
-
51
- # Extract content
52
- text_elem = elem.find('.//{http://www.mediawiki.org/xml/export-0.10/}text')
53
- if text_elem is None or text_elem.text is None:
54
- elem.clear()
55
- continue
56
-
57
- content = text_elem.text
58
-
59
- # Extract links from content
60
- links = extract_links(content)
61
-
62
- # Add to wiki data
63
- wiki_data[title] = {
64
- 'title': title,
65
- 'text': content,
66
- 'links': links
67
- }
68
-
69
- article_count += 1
70
-
71
- # Print progress
72
- if article_count % 1000 == 0:
73
- print(f"Processed {article_count} articles...")
74
-
75
- # Check if we've reached the maximum number of articles
76
- if max_articles and article_count >= max_articles:
77
- break
78
-
79
- # Clear element to save memory
80
- elem.clear()
81
 
82
- print(f"Extracted {article_count} articles with their links.")
83
 
84
  # Save data to JSON file
85
  output_file = os.path.join(output_path, 'wiki_data.json')
86
  with open(output_file, 'w', encoding='utf-8') as f:
87
- json.dump(wiki_data, f)
88
 
89
  print(f"Data saved to {output_file}")
90
  return output_file
91
 
92
- def extract_links(text):
93
- """Extract links from article wikitext"""
94
- # Pattern to match [[Link]] or [[Link|Text]] format
95
- links = re.findall(r'\[\[([^|\]]+)(?:\|[^\]]+)?\]\]', text)
96
-
97
- # Process links
98
- processed_links = []
99
- for link in links:
100
- # Skip non-article links
101
- if ':' in link and not link.startswith('Category:'):
102
- continue
103
-
104
- # Remove any section links (with #)
105
- link = link.split('#')[0].strip()
106
-
107
- # Skip empty links
108
- if not link:
109
- continue
110
-
111
- processed_links.append(link)
112
-
113
- # Remove duplicates and return
114
- return list(set(processed_links))
115
-
116
  if __name__ == "__main__":
117
  import argparse
118
 
 
1
  import bz2
2
  import json
3
  import re
 
 
 
4
  import os
5
+ from pathlib import Path
6
+ from xml.sax import make_parser, handler
7
+
8
+ class WikiContentHandler(handler.ContentHandler):
9
+ def __init__(self, max_articles=None):
10
+ self.wiki_data = {}
11
+ self.article_count = 0
12
+ self.max_articles = max_articles
13
+
14
+ # Current elements
15
+ self.current_title = None
16
+ self.current_text = None
17
+ self.current_ns = None
18
+ self.in_page = False
19
+ self.in_title = False
20
+ self.in_text = False
21
+ self.in_ns = False
22
+ self.buffer = []
23
+
24
+ def startElement(self, name, attrs):
25
+ if name == 'page':
26
+ self.in_page = True
27
+ self.current_title = None
28
+ self.current_text = None
29
+ self.current_ns = None
30
+ elif self.in_page and name == 'title':
31
+ self.in_title = True
32
+ self.buffer = []
33
+ elif self.in_page and name == 'ns':
34
+ self.in_ns = True
35
+ self.buffer = []
36
+ elif self.in_page and name == 'text':
37
+ self.in_text = True
38
+ self.buffer = []
39
+
40
+ def endElement(self, name):
41
+ if name == 'page':
42
+ self.in_page = False
43
+ # Only process main namespace articles (ns = 0)
44
+ if self.current_ns == '0' and self.current_title and self.current_text:
45
+ # Extract links
46
+ links = self.extract_links(self.current_text)
47
+
48
+ # Add to wiki data
49
+ self.wiki_data[self.current_title] = {
50
+ 'title': self.current_title,
51
+ 'text': self.current_text,
52
+ 'links': links
53
+ }
54
+
55
+ self.article_count += 1
56
+
57
+ # Print progress
58
+ if self.article_count % 100 == 0:
59
+ print(f"Processed {self.article_count} articles...")
60
+
61
+ # Check if we've reached the maximum number of articles
62
+ if self.max_articles and self.article_count >= self.max_articles:
63
+ raise StopIteration("Reached maximum number of articles")
64
+
65
+ elif name == 'title':
66
+ self.in_title = False
67
+ self.current_title = ''.join(self.buffer)
68
+ elif name == 'ns':
69
+ self.in_ns = False
70
+ self.current_ns = ''.join(self.buffer)
71
+ elif name == 'text':
72
+ self.in_text = False
73
+ self.current_text = ''.join(self.buffer)
74
+
75
+ def characters(self, content):
76
+ if self.in_title:
77
+ self.buffer.append(content)
78
+ elif self.in_ns:
79
+ self.buffer.append(content)
80
+ elif self.in_text:
81
+ self.buffer.append(content)
82
+
83
+ def extract_links(self, text):
84
+ """Extract links from article wikitext"""
85
+ # Pattern to match [[Link]] or [[Link|Text]] format
86
+ links = re.findall(r'\[\[([^|\]]+)(?:\|[^\]]+)?\]\]', text)
87
+
88
+ # Process links
89
+ processed_links = []
90
+ for link in links:
91
+ # Skip non-article links (except categories which might be useful)
92
+ if ':' in link and not link.startswith('Category:'):
93
+ continue
94
+
95
+ # Remove any section links (with #)
96
+ link = link.split('#')[0].strip()
97
+
98
+ # Skip empty links
99
+ if not link:
100
+ continue
101
+
102
+ processed_links.append(link)
103
+
104
+ # Remove duplicates and return
105
+ return list(set(processed_links))
106
 
107
  def parse_wiki_dump(dump_path, output_path, max_articles=None):
108
  """
 
118
  """
119
  print(f"Parsing Wikipedia dump: {dump_path}")
120
 
121
+ # Create SAX parser with custom content handler
122
+ parser = make_parser()
123
+ content_handler = WikiContentHandler(max_articles)
124
+ parser.setContentHandler(content_handler)
125
 
126
+ # Parse the dump
127
+ try:
128
+ parser.parse(bz2.BZ2File(dump_path))
129
+ except StopIteration:
130
+ print("Reached maximum number of articles")
131
+ except Exception as e:
132
+ print(f"Error parsing dump: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
 
134
+ print(f"Extracted {content_handler.article_count} articles with their links.")
135
 
136
  # Save data to JSON file
137
  output_file = os.path.join(output_path, 'wiki_data.json')
138
  with open(output_file, 'w', encoding='utf-8') as f:
139
+ json.dump(content_handler.wiki_data, f)
140
 
141
  print(f"Data saved to {output_file}")
142
  return output_file
143
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  if __name__ == "__main__":
145
  import argparse
146
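The if __name__ == "__main__": block is truncated at new line 146 of this diff, so the actual command-line interface is not visible here. A minimal sketch of how the rewritten, SAX-based parse_wiki_dump could be driven with argparse follows; the flag names, help text, and the wiki_parser import are assumptions for illustration, not taken from the commit.

import argparse

from wiki_parser import parse_wiki_dump  # assumes the module name matches the file above

# Hypothetical driver; the real argparse block is cut off in this view.
arg_parser = argparse.ArgumentParser(description="Parse a Wikipedia XML dump into wiki_data.json")
arg_parser.add_argument("dump_path", help="Path to the *.xml.bz2 dump file")
arg_parser.add_argument("output_path", help="Directory where wiki_data.json is written")
arg_parser.add_argument("--max-articles", type=int, default=None,
                        help="Stop after this many main-namespace articles")
args = arg_parser.parse_args()

parse_wiki_dump(args.dump_path, args.output_path, args.max_articles)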