parsing urls

Files changed:
- buster/docparser.py (+27 -11)
- requirements.txt (+6 -2)
buster/docparser.py
CHANGED
@@ -11,7 +11,10 @@ EMBEDDING_MODEL = "text-embedding-ada-002"
 EMBEDDING_ENCODING = "cl100k_base"  # this the encoding for text-embedding-ada-002


-def get_all_sections(root_dir: str, max_section_length: int = 3000) -> list[str]:
+BASE_URL = "https://docs.mila.quebec/"
+
+
+def get_all_sections(root_dir: str, max_section_length: int = 3000) -> tuple[list[str], list[str]]:
     """Parse all HTML files in `root_dir`, and extract all sections.

     Sections are broken into subsections if they are longer than `max_section_length`.
@@ -19,32 +22,45 @@ def get_all_sections(root_dir: str, max_section_length: int = 3000) -> list[str]
     """
     files = glob.glob("*.html", root_dir=root_dir)

-    selector = "section > section"
-
     # Recurse until sections are small enough
-    def get_all_subsections(soup,
-
-
+    def get_all_subsections(soup: BeautifulSoup, level: int) -> tuple[list[str], list[str]]:
+        if level >= 5:
+            return [], []
+
+        found = soup.find_all('a', href=True, class_="headerlink")

         sections = []
-
+        urls = []
+        for section_found in found:
+            section_soup = section_found.parent.parent
+            section = section_soup.text
+            url = section_found['href']
+
             if len(section) > max_section_length:
-
+                s, u = get_all_subsections(section_soup, level + 1)
+                sections.extend(s)
+                urls.extend(u)
             else:
                 sections.append(section)
+                urls.append(url)

-        return sections
+        return sections, urls

     sections = []
+    urls = []
     for file in files:
         filepath = os.path.join(root_dir, file)
         with open(filepath, "r") as file:
             source = file.read()

         soup = BeautifulSoup(source, "html.parser")
-
+        sections_file, urls_file = get_all_subsections(soup, 2)
+        sections.extend(sections_file)

-
+        urls_file = [BASE_URL + os.path.basename(file.name) + url for url in urls_file]
+        urls.extend(urls_file)
+
+    return sections, urls


 def write_sections(filepath: str, sections: list[str]):
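For context, the new parsing logic keys off the permalink anchors (<a class="headerlink" ...>) that Sphinx-style documentation adds next to each heading; section_found.parent.parent then walks up from that anchor to the enclosing section element. A minimal sketch of the markup shape this assumes (the HTML fragment below is an illustration, not taken from the commit):

from bs4 import BeautifulSoup

# Hypothetical fragment in the style of Sphinx-rendered docs: each heading
# carries an <a class="headerlink" href="#anchor"> permalink.
html = """
<section>
  <h2>Installation<a class="headerlink" href="#installation">¶</a></h2>
  <p>Install the package with pip ...</p>
</section>
"""

soup = BeautifulSoup(html, "html.parser")
link = soup.find_all("a", href=True, class_="headerlink")[0]
print(link["href"])             # "#installation"
print(link.parent.parent.text)  # text of the whole <section> (heading + body)

And a sketch of how the updated entry point might be called now that it returns URLs alongside sections; the "outputs/" directory is a placeholder, not a path defined by this commit:

from buster.docparser import get_all_sections

# "outputs/" is assumed to contain the rendered HTML pages of the docs.
sections, urls = get_all_sections("outputs/", max_section_length=3000)

# Each extracted section is paired with the page + anchor it was parsed from,
# e.g. "https://docs.mila.quebec/<page>.html#<anchor>".
for section, url in zip(sections, urls):
    print(url, len(section))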
requirements.txt
CHANGED
@@ -1,4 +1,8 @@
-
-openai
+bs4
 numpy
+<<<<<<< HEAD
 tiktoken
+=======
+openai
+pandas
+>>>>>>> fe2ece9 (parsing urls)
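Note that the committed requirements.txt still contains unresolved merge-conflict markers (<<<<<<< / ======= / >>>>>>>), which pip cannot parse; a resolved file would presumably keep both sides of the conflict, something like (an assumption, not part of the commit):

bs4
numpy
tiktoken
openai
pandas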