Marc-Antoine Rondeau committed on
Commit
909ae3f
·
1 Parent(s): 2f25f03

Linting and replace Generator by Iterator

Browse files
Files changed (2) hide show
  1. buster/docparser.py +5 -1
  2. buster/parser.py +7 -11
buster/docparser.py CHANGED
@@ -43,7 +43,11 @@ supported_docs = {
43
 
44
 
45
  def get_all_documents(
46
- root_dir: str, base_url: str, parser_cls: Type[Parser], min_section_length: int = 100, max_section_length: int = 2000
 
 
 
 
47
  ) -> pd.DataFrame:
48
  """Parse all HTML files in `root_dir`, and extract all sections.
49
 
 
43
 
44
 
45
  def get_all_documents(
46
+ root_dir: str,
47
+ base_url: str,
48
+ parser_cls: Type[Parser],
49
+ min_section_length: int = 100,
50
+ max_section_length: int = 2000,
51
  ) -> pd.DataFrame:
52
  """Parse all HTML files in `root_dir`, and extract all sections.
53
 
buster/parser.py CHANGED
@@ -3,7 +3,7 @@ from dataclasses import InitVar, dataclass, field
3
  from itertools import takewhile, zip_longest
4
  import math
5
  import os
6
- from typing import Generator
7
 
8
  import bs4
9
  import pandas as pd
@@ -33,7 +33,7 @@ class Section:
33
  return len(self.text)
34
 
35
  @classmethod
36
- def from_text(cls, text: str, url: str, name: str) -> 'Section':
37
  """Alternate constructor, without parsing."""
38
  section = cls.__new__(cls) # Allocate memory, does not call __init__
39
  # Does the init here.
@@ -43,7 +43,7 @@ class Section:
43
 
44
  return section
45
 
46
- def get_chunks(self, min_length: int, max_length: int) -> Generator['Section', None, None]:
47
  """Split a section into chunks."""
48
  if len(self) > max_length:
49
  # Get the number of chunk, by dividing and rounding up.
@@ -54,9 +54,7 @@ class Section:
54
  length = len(self) // n_chunks
55
  for chunk in range(n_chunks):
56
  start = chunk * length
57
- yield Section.from_text(
58
- self.text[start: start + length],
59
- self.url, self.name)
60
  elif len(self) > min_length:
61
  yield self
62
  return
@@ -75,7 +73,7 @@ class Parser(ABC):
75
  ...
76
 
77
  @abstractmethod
78
- def find_sections(self) -> Generator[Section, None, None]:
79
  ...
80
 
81
  def parse(self) -> list[Section]:
@@ -87,8 +85,7 @@ class Parser(ABC):
87
 
88
 
89
  class SphinxParser(Parser):
90
-
91
- def find_sections(self) -> Generator[Section, None, None]:
92
  for section in self.soup.find_all("a", href=True, class_="headerlink"):
93
  container = section.parent.parent
94
  section_href = container.find_all("a", href=True, class_="headerlink")
@@ -110,8 +107,7 @@ class SphinxParser(Parser):
110
 
111
 
112
  class HuggingfaceParser(Parser):
113
-
114
- def find_sections(self) -> Generator[Section, None, None]:
115
  sections = self.soup.find_all(["h1", "h2", "h3"], class_="relative group")
116
  for section, next_section in zip_longest(sections, sections[1:]):
117
  href = section.find("a", href=True, class_="header-link")
 
3
  from itertools import takewhile, zip_longest
4
  import math
5
  import os
6
+ from typing import Iterator
7
 
8
  import bs4
9
  import pandas as pd
 
33
  return len(self.text)
34
 
35
  @classmethod
36
+ def from_text(cls, text: str, url: str, name: str) -> "Section":
37
  """Alternate constructor, without parsing."""
38
  section = cls.__new__(cls) # Allocate memory, does not call __init__
39
  # Does the init here.
 
43
 
44
  return section
45
 
46
+ def get_chunks(self, min_length: int, max_length: int) -> Iterator["Section"]:
47
  """Split a section into chunks."""
48
  if len(self) > max_length:
49
  # Get the number of chunk, by dividing and rounding up.
 
54
  length = len(self) // n_chunks
55
  for chunk in range(n_chunks):
56
  start = chunk * length
57
+ yield Section.from_text(self.text[start : start + length], self.url, self.name)
 
 
58
  elif len(self) > min_length:
59
  yield self
60
  return
 
73
  ...
74
 
75
  @abstractmethod
76
+ def find_sections(self) -> Iterator[Section]:
77
  ...
78
 
79
  def parse(self) -> list[Section]:
 
85
 
86
 
87
  class SphinxParser(Parser):
88
+ def find_sections(self) -> Iterator[Section]:
 
89
  for section in self.soup.find_all("a", href=True, class_="headerlink"):
90
  container = section.parent.parent
91
  section_href = container.find_all("a", href=True, class_="headerlink")
 
107
 
108
 
109
  class HuggingfaceParser(Parser):
110
+ def find_sections(self) -> Iterator[Section]:
 
111
  sections = self.soup.find_all(["h1", "h2", "h3"], class_="relative group")
112
  for section, next_section in zip_longest(sections, sections[1:]):
113
  href = section.find("a", href=True, class_="header-link")