Spaces:
Runtime error
Runtime error
Marc-Antoine Rondeau
committed on
Commit
·
909ae3f
1
Parent(s):
2f25f03
Linting and replace Generator by Iterator
Browse files
- buster/docparser.py +5 -1
- buster/parser.py +7 -11
buster/docparser.py
CHANGED
@@ -43,7 +43,11 @@ supported_docs = {
|
|
43 |
|
44 |
|
45 |
def get_all_documents(
|
46 |
-
root_dir: str,
|
|
|
|
|
|
|
|
|
47 |
) -> pd.DataFrame:
|
48 |
"""Parse all HTML files in `root_dir`, and extract all sections.
|
49 |
|
|
|
43 |
|
44 |
|
45 |
def get_all_documents(
|
46 |
+
root_dir: str,
|
47 |
+
base_url: str,
|
48 |
+
parser_cls: Type[Parser],
|
49 |
+
min_section_length: int = 100,
|
50 |
+
max_section_length: int = 2000,
|
51 |
) -> pd.DataFrame:
|
52 |
"""Parse all HTML files in `root_dir`, and extract all sections.
|
53 |
|
buster/parser.py
CHANGED
@@ -3,7 +3,7 @@ from dataclasses import InitVar, dataclass, field
|
|
3 |
from itertools import takewhile, zip_longest
|
4 |
import math
|
5 |
import os
|
6 |
-
from typing import
|
7 |
|
8 |
import bs4
|
9 |
import pandas as pd
|
@@ -33,7 +33,7 @@ class Section:
|
|
33 |
return len(self.text)
|
34 |
|
35 |
@classmethod
|
36 |
-
def from_text(cls, text: str, url: str, name: str) ->
|
37 |
"""Alternate constructor, without parsing."""
|
38 |
section = cls.__new__(cls) # Allocate memory, does not call __init__
|
39 |
# Does the init here.
|
@@ -43,7 +43,7 @@ class Section:
|
|
43 |
|
44 |
return section
|
45 |
|
46 |
-
def get_chunks(self, min_length: int, max_length: int) ->
|
47 |
"""Split a section into chunks."""
|
48 |
if len(self) > max_length:
|
49 |
# Get the number of chunks by dividing and rounding up.
|
@@ -54,9 +54,7 @@ class Section:
|
|
54 |
length = len(self) // n_chunks
|
55 |
for chunk in range(n_chunks):
|
56 |
start = chunk * length
|
57 |
-
yield Section.from_text(
|
58 |
-
self.text[start: start + length],
|
59 |
-
self.url, self.name)
|
60 |
elif len(self) > min_length:
|
61 |
yield self
|
62 |
return
|
@@ -75,7 +73,7 @@ class Parser(ABC):
|
|
75 |
...
|
76 |
|
77 |
@abstractmethod
|
78 |
-
def find_sections(self) ->
|
79 |
...
|
80 |
|
81 |
def parse(self) -> list[Section]:
|
@@ -87,8 +85,7 @@ class Parser(ABC):
|
|
87 |
|
88 |
|
89 |
class SphinxParser(Parser):
|
90 |
-
|
91 |
-
def find_sections(self) -> Generator[Section, None, None]:
|
92 |
for section in self.soup.find_all("a", href=True, class_="headerlink"):
|
93 |
container = section.parent.parent
|
94 |
section_href = container.find_all("a", href=True, class_="headerlink")
|
@@ -110,8 +107,7 @@ class SphinxParser(Parser):
|
|
110 |
|
111 |
|
112 |
class HuggingfaceParser(Parser):
|
113 |
-
|
114 |
-
def find_sections(self) -> Generator[Section, None, None]:
|
115 |
sections = self.soup.find_all(["h1", "h2", "h3"], class_="relative group")
|
116 |
for section, next_section in zip_longest(sections, sections[1:]):
|
117 |
href = section.find("a", href=True, class_="header-link")
|
|
|
3 |
from itertools import takewhile, zip_longest
|
4 |
import math
|
5 |
import os
|
6 |
+
from typing import Iterator
|
7 |
|
8 |
import bs4
|
9 |
import pandas as pd
|
|
|
33 |
return len(self.text)
|
34 |
|
35 |
@classmethod
|
36 |
+
def from_text(cls, text: str, url: str, name: str) -> "Section":
|
37 |
"""Alternate constructor, without parsing."""
|
38 |
section = cls.__new__(cls) # Allocate memory, does not call __init__
|
39 |
# Does the init here.
|
|
|
43 |
|
44 |
return section
|
45 |
|
46 |
+
def get_chunks(self, min_length: int, max_length: int) -> Iterator["Section"]:
|
47 |
"""Split a section into chunks."""
|
48 |
if len(self) > max_length:
|
49 |
# Get the number of chunks by dividing and rounding up.
|
|
|
54 |
length = len(self) // n_chunks
|
55 |
for chunk in range(n_chunks):
|
56 |
start = chunk * length
|
57 |
+
yield Section.from_text(self.text[start : start + length], self.url, self.name)
|
|
|
|
|
58 |
elif len(self) > min_length:
|
59 |
yield self
|
60 |
return
|
|
|
73 |
...
|
74 |
|
75 |
@abstractmethod
|
76 |
+
def find_sections(self) -> Iterator[Section]:
|
77 |
...
|
78 |
|
79 |
def parse(self) -> list[Section]:
|
|
|
85 |
|
86 |
|
87 |
class SphinxParser(Parser):
|
88 |
+
def find_sections(self) -> Iterator[Section]:
|
|
|
89 |
for section in self.soup.find_all("a", href=True, class_="headerlink"):
|
90 |
container = section.parent.parent
|
91 |
section_href = container.find_all("a", href=True, class_="headerlink")
|
|
|
107 |
|
108 |
|
109 |
class HuggingfaceParser(Parser):
|
110 |
+
def find_sections(self) -> Iterator[Section]:
|
|
|
111 |
sections = self.soup.find_all(["h1", "h2", "h3"], class_="relative group")
|
112 |
for section, next_section in zip_longest(sections, sections[1:]):
|
113 |
href = section.find("a", href=True, class_="header-link")
|