Marc-Antoine Rondeau committed
Commit 2f25f03 · 1 Parent(s): 1f22b14

Trying to refactor the parsing

Files changed (2)
  1. buster/docparser.py +8 -7
  2. buster/parser.py +97 -121
buster/docparser.py CHANGED
@@ -1,5 +1,6 @@
 import glob
 import os
+from typing import Type

 import numpy as np
 import pandas as pd
@@ -42,7 +43,7 @@ supported_docs = {


 def get_all_documents(
-    root_dir: str, base_url: str, parser: Parser, min_section_length: int = 100, max_section_length: int = 2000
+    root_dir: str, base_url: str, parser_cls: Type[Parser], min_section_length: int = 100, max_section_length: int = 2000
 ) -> pd.DataFrame:
     """Parse all HTML files in `root_dir`, and extract all sections.

@@ -60,12 +61,12 @@ def get_all_documents(
             source = f.read()

         soup = BeautifulSoup(source, "html.parser")
-        soup_parser = parser(soup, base_url, file, min_section_length, max_section_length)
-        sections_file, urls_file, names_file = soup_parser.parse()
-
-        sections.extend(sections_file)
-        urls.extend(urls_file)
-        names.extend(names_file)
+        parser = parser_cls(soup, base_url, file, min_section_length, max_section_length)
+        # sections_file, urls_file, names_file =
+        for section in parser.parse():
+            sections.append(section.text)
+            urls.append(section.url)
+            names.append(section.name)

     documents_df = pd.DataFrame.from_dict({"title": names, "url": urls, "content": sections})
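With this change, `get_all_documents` receives the parser class rather than an already-constructed parser, instantiates it once per HTML file, and collects `Section` objects from `parser.parse()`. A minimal usage sketch; the root directory and base URL below are placeholder assumptions, not values from the commit:

    from buster.docparser import get_all_documents
    from buster.parser import SphinxParser

    # Hypothetical inputs, for illustration only.
    documents_df = get_all_documents(
        root_dir="docs/_build/html",                # directory of rendered Sphinx HTML pages (assumed)
        base_url="https://example.org/en/latest/",  # prefix consumed by Parser.build_url (assumed)
        parser_cls=SphinxParser,                    # the class itself, not an instance
    )
    print(documents_df[["title", "url", "content"]].head())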
buster/parser.py CHANGED
@@ -1,151 +1,127 @@
+from abc import ABC, abstractmethod
+from dataclasses import InitVar, dataclass, field
+from itertools import takewhile, zip_longest
 import math
 import os
+from typing import Generator

 import bs4
 import pandas as pd
 from bs4 import BeautifulSoup


-def parse_section(nodes: list[bs4.element.NavigableString]) -> str:
-    section = []
-    for node in nodes:
-        if node.name == "table":
-            node_text = pd.read_html(node.prettify())[0].to_markdown(index=False, tablefmt="github")
-        elif node.name == "script":
-            continue
-        else:
-            node_text = node.text
-        section.append(node_text)
-    section = "".join(section)
-
-    return section
-
-
-class Parser:
-    def __init__(
-        self,
-        soup: BeautifulSoup,
-        base_url: str,
-        filename: str,
-        min_section_length: int = 100,
-        max_section_length: int = 2000,
-    ):
-        self.soup = soup
-        self.base_url = base_url
-        self.filename = filename
-        self.min_section_length = min_section_length
-        self.max_section_length = max_section_length
-
-    def parse(self) -> tuple[list[str], list[str], list[str]]:
+@dataclass
+class Section:
+    url: str
+    name: str
+    nodes: InitVar[list[bs4.element.NavigableString]]
+    text: str = field(init=False)
+
+    def __post_init__(self, nodes: list[bs4.element.NavigableString]):
+        section = []
+        for node in nodes:
+            if node.name == "table":
+                node_text = pd.read_html(node.prettify())[0].to_markdown(index=False, tablefmt="github")
+            elif node.name == "script":
+                continue
+            else:
+                node_text = node.text
+            section.append(node_text)
+        self.text = "".join(section).strip()
+
+    def __len__(self) -> int:
+        return len(self.text)
+
+    @classmethod
+    def from_text(cls, text: str, url: str, name: str) -> 'Section':
+        """Alternate constructor, without parsing."""
+        section = cls.__new__(cls)  # Allocate memory, does not call __init__
+        # Does the init here.
+        section.text = text
+        section.url = url
+        section.name = name
+
+        return section
+
+    def get_chunks(self, min_length: int, max_length: int) -> Generator['Section', None, None]:
+        """Split a section into chunks."""
+        if len(self) > max_length:
+            # Get the number of chunks, by dividing and rounding up.
+            # Then, split the section into equal length chunks.
+            # This could result in chunks below the minimum length,
+            # and will truncate the end of the section.
+            n_chunks = (len(self) + max_length - 1) // max_length
+            length = len(self) // n_chunks
+            for chunk in range(n_chunks):
+                start = chunk * length
+                yield Section.from_text(
+                    self.text[start: start + length],
+                    self.url, self.name)
+        elif len(self) > min_length:
+            yield self
+        return
+
+
+@dataclass
+class Parser(ABC):
+    soup: BeautifulSoup
+    base_url: str
+    filename: str
+    min_section_length: int = 100
+    max_section_length: int = 2000
+
+    @abstractmethod
+    def build_url(self, suffix: str) -> str:
         ...

-    def find_sections(self) -> bs4.element.ResultSet:
+    @abstractmethod
+    def find_sections(self) -> Generator[Section, None, None]:
         ...

-    def build_url(self, suffix: str) -> str:
-        ...
+    def parse(self) -> list[Section]:
+        """Parse the documents into sections, respecting the length constraints."""
+        sections = []
+        for section in self.find_sections():
+            sections.extend(section.get_chunks(self.min_section_length, self.max_section_length))
+        return sections


 class SphinxParser(Parser):
-    def parse(self) -> tuple[list[str], list[str], list[str]]:
-        found = self.find_sections()
-
-        sections = []
-        urls = []
-        names = []
-        for i in range(len(found)):
-            section_found = found[i]
-
-            section_soup = section_found.parent.parent
-            section_href = section_soup.find_all("a", href=True, class_="headerlink")
+
+    def find_sections(self) -> Generator[Section, None, None]:
+        for section in self.soup.find_all("a", href=True, class_="headerlink"):
+            container = section.parent.parent
+            section_href = container.find_all("a", href=True, class_="headerlink")
+
+            url = self.build_url(section["href"].strip().replace("\n", ""))
+            name = section.parent.text

             # If sections has subsections, keep only the part before the first subsection
-            if len(section_href) > 1 and section_soup.section is not None:
-                section_siblings = list(section_soup.section.previous_siblings)[::-1]
-                section = parse_section(section_siblings)
+            if len(section_href) > 1 and container.section is not None:
+                siblings = list(container.section.previous_siblings)[::-1]
+                section = Section(url, name, siblings)
             else:
-                section = parse_section(section_soup.children)
-
-            # Remove special characters, plus newlines in some url and section names.
-            section = section.strip()
-            url = section_found["href"].strip().replace("\n", "")
-            name = section_found.parent.text.strip()[:-1].replace("\n", "")
-
-            url = self.build_url(url)
-
-            # If text is too long, split into chunks of equal sizes
-            if len(section) > self.max_section_length:
-                n_chunks = math.ceil(len(section) / float(self.max_section_length))
-                separator_index = math.floor(len(section) / n_chunks)
-
-                section_chunks = [section[separator_index * i : separator_index * (i + 1)] for i in range(n_chunks)]
-                url_chunks = [url] * n_chunks
-                name_chunks = [name] * n_chunks
-
-                sections.extend(section_chunks)
-                urls.extend(url_chunks)
-                names.extend(name_chunks)
-            # If text is not too short, add in 1 chunk
-            elif len(section) > self.min_section_length:
-                sections.append(section)
-                urls.append(url)
-                names.append(name)
-
-        return sections, urls, names
-
-    def find_sections(self) -> bs4.element.ResultSet:
-        return self.soup.find_all("a", href=True, class_="headerlink")
+                section = Section(url, name, container.children)
+            yield section
+        return

     def build_url(self, suffix: str) -> str:
         return self.base_url + self.filename + suffix


 class HuggingfaceParser(Parser):
-    def parse(self) -> tuple[list[str], list[str], list[str]]:
-        found = self.find_sections()
-
-        sections = []
-        urls = []
-        names = []
-        for i in range(len(found)):
-            section_href = found[i].find("a", href=True, class_="header-link")
-
-            section_nodes = []
-            for element in found[i].find_next_siblings():
-                if i + 1 < len(found) and element == found[i + 1]:
-                    break
-                section_nodes.append(element)
-            section = parse_section(section_nodes)
-
-            # Remove special characters, plus newlines in some url and section names.
-            section = section.strip()
-            url = section_href["href"].strip().replace("\n", "")
-            name = found[i].text.strip().replace("\n", "")
-
-            url = self.build_url(url)
-
-            # If text is too long, split into chunks of equal sizes
-            if len(section) > self.max_section_length:
-                n_chunks = math.ceil(len(section) / float(self.max_section_length))
-                separator_index = math.floor(len(section) / n_chunks)
-
-                section_chunks = [section[separator_index * i : separator_index * (i + 1)] for i in range(n_chunks)]
-                url_chunks = [url] * n_chunks
-                name_chunks = [name] * n_chunks
-
-                sections.extend(section_chunks)
-                urls.extend(url_chunks)
-                names.extend(name_chunks)
-            # If text is not too short, add in 1 chunk
-            elif len(section) > self.min_section_length:
-                sections.append(section)
-                urls.append(url)
-                names.append(name)
-
-        return sections, urls, names
-
-    def find_sections(self) -> bs4.element.ResultSet:
-        return self.soup.find_all(["h1", "h2", "h3"], class_="relative group")
+
+    def find_sections(self) -> Generator[Section, None, None]:
+        sections = self.soup.find_all(["h1", "h2", "h3"], class_="relative group")
+        for section, next_section in zip_longest(sections, sections[1:]):
+            href = section.find("a", href=True, class_="header-link")
+            nodes = list(takewhile(lambda sibling: sibling != next_section, section.find_next_siblings()))
+
+            url = self.build_url(href["href"].strip().replace("\n", ""))
+            name = section.text.strip().replace("\n", "")
+            yield Section(url, name, nodes)
+
+        return

     def build_url(self, suffix: str) -> str:
         # The splitext is to remove the .html extension
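The refactor moves the chunking logic that was duplicated in `SphinxParser.parse` and `HuggingfaceParser.parse` into `Section.get_chunks`, so concrete parsers now only implement `find_sections` and `build_url`, while the abstract `Parser.parse` enforces the length constraints. A rough sketch of how the new chunking behaves, using made-up text and limits so the split is easy to see:

    from buster.parser import Section

    # 60 characters of text, with illustrative url/name values (not from the commit).
    section = Section.from_text("x" * 60, "https://example.org/page.html#anchor", "Example section")

    # 60 > max_length, so n_chunks = ceil(60 / 25) = 3 and each chunk gets 60 // 3 = 20 characters.
    for chunk in section.get_chunks(min_length=10, max_length=25):
        print(len(chunk), chunk.url, chunk.name)

Sections at or below `min_length` yield nothing and are dropped, matching the old `elif len(section) > self.min_section_length` behaviour.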