remove special chars (#15)
buster/docparser.py  (+7 -5)  CHANGED
@@ -54,8 +54,10 @@ def get_all_documents(root_dir: str, base_url: str, max_section_length: int = 20
         else:
             section = parse_section(section_soup.children)

-        url
-
+        # Remove special characters, plus newlines in some url and section names.
+        section = section.strip()
+        url = section_found["href"].strip().replace("\n", "")
+        name = section_found.parent.text.strip()[:-1].replace("\n", "")

         # If text is too long, split into chunks of equal sizes
         if len(section) > max_section_length:
@@ -81,14 +83,14 @@ def get_all_documents(root_dir: str, base_url: str, max_section_length: int = 20
     names = []
     for file in files:
         filepath = os.path.join(root_dir, file)
-        with open(filepath, "r") as
-        source =
+        with open(filepath, "r") as f:
+            source = f.read()

         soup = BeautifulSoup(source, "html.parser")
         sections_file, urls_file, names_file = get_all_subsections(soup)
         sections.extend(sections_file)

-        urls_file = [base_url +
+        urls_file = [base_url + file + url for url in urls_file]
         urls.extend(urls_file)

         names.extend(names_file)
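For context, a minimal sketch of what the added cleanup lines do. The href, heading text, base_url, and file values below are invented for illustration and are not taken from the repo:

# Illustration only: sample values are made up.
href = "#installation\n"
heading_text = "Installation¶\n"

url = href.strip().replace("\n", "")                 # "#installation"
name = heading_text.strip()[:-1].replace("\n", "")   # "Installation" (drops the trailing headerlink "¶")

base_url = "https://example.org/docs/"
file = "install.html"
full_url = base_url + file + url                     # "https://example.org/docs/install.html#installation"
print(name, full_url)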