hbertrand committed
Commit 90ae9dd · 1 Parent(s): e112463

parsing urls

Files changed (2):
  1. buster/docparser.py +27 -11
  2. requirements.txt +6 -2
buster/docparser.py CHANGED
@@ -11,7 +11,10 @@ EMBEDDING_MODEL = "text-embedding-ada-002"
 EMBEDDING_ENCODING = "cl100k_base"  # this the encoding for text-embedding-ada-002
 
 
-def get_all_sections(root_dir: str, max_section_length: int = 3000) -> list[str]:
+BASE_URL = "https://docs.mila.quebec/"
+
+
+def get_all_sections(root_dir: str, max_section_length: int = 3000) -> tuple[list[str], list[str]]:
     """Parse all HTML files in `root_dir`, and extract all sections.
 
     Sections are broken into subsections if they are longer than `max_section_length`.
@@ -19,32 +22,45 @@ def get_all_sections(root_dir: str, max_section_length: int = 3000) -> list[str]
     """
     files = glob.glob("*.html", root_dir=root_dir)
 
-    selector = "section > section"
-
     # Recurse until sections are small enough
-    def get_all_subsections(soup, selector: str) -> list[str]:
-        found = soup.select(selector)
-        data = [x.text.split(";")[-1].strip() for x in found]
+    def get_all_subsections(soup: BeautifulSoup, level: int) -> tuple[list[str], list[str]]:
+        if level >= 5:
+            return [], []
+
+        found = soup.find_all('a', href=True, class_="headerlink")
 
         sections = []
-        for i, section in enumerate(data):
+        urls = []
+        for section_found in found:
+            section_soup = section_found.parent.parent
+            section = section_soup.text
+            url = section_found['href']
+
             if len(section) > max_section_length:
-                sections.extend(get_all_subsections(found[i], selector + " > section"))
+                s, u = get_all_subsections(section_soup, level + 1)
+                sections.extend(s)
+                urls.extend(u)
             else:
                 sections.append(section)
+                urls.append(url)
 
-        return sections
+        return sections, urls
 
     sections = []
+    urls = []
     for file in files:
         filepath = os.path.join(root_dir, file)
         with open(filepath, "r") as file:
             source = file.read()
 
         soup = BeautifulSoup(source, "html.parser")
-        sections.extend(get_all_subsections(soup, selector))
+        sections_file, urls_file = get_all_subsections(soup, 2)
+        sections.extend(sections_file)
 
-    return sections
+        urls_file = [BASE_URL + os.path.basename(file.name) + url for url in urls_file]
+        urls.extend(urls_file)
+
+    return sections, urls
 
 
 def write_sections(filepath: str, sections: list[str]):
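
For context, a minimal sketch (not part of this commit) of the headerlink-based extraction the new get_all_subsections relies on: Sphinx-generated pages put an <a class="headerlink"> inside each section heading, so the anchor's href identifies the section and the anchor's grandparent element is the section container whose text gets collected. The sample HTML and the Userguide.html filename below are assumptions for illustration only.

from bs4 import BeautifulSoup

BASE_URL = "https://docs.mila.quebec/"  # same constant as in the diff

# Invented Sphinx-style snippet: the headerlink anchor sits inside the <h2>,
# whose parent is the <section> holding the section's text.
html = """
<section id="usage">
  <h2>Usage<a class="headerlink" href="#usage">¶</a></h2>
  <p>Short body text.</p>
</section>
"""

soup = BeautifulSoup(html, "html.parser")
for link in soup.find_all("a", href=True, class_="headerlink"):
    section_soup = link.parent.parent               # <a> -> <h2> -> <section>
    section_text = section_soup.text                # full text of the section
    url = BASE_URL + "Userguide.html" + link["href"]  # filename assumed for the sketch
    print(url, len(section_text))
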
requirements.txt CHANGED
@@ -1,4 +1,8 @@
-pandas
-openai
+bs4
 numpy
+<<<<<<< HEAD
 tiktoken
+=======
+openai
+pandas
+>>>>>>> fe2ece9 (parsing urls)