hbertrand committed
Commit 6e7e500 · unverified · 1 Parent(s): fa9ac7e

remove special chars (#15)

Files changed (1)
buster/docparser.py +7 -5
buster/docparser.py CHANGED
```diff
@@ -54,8 +54,10 @@ def get_all_documents(root_dir: str, base_url: str, max_section_length: int = 20
         else:
             section = parse_section(section_soup.children)
 
-        url = section_found["href"]
-        name = section_found.parent.text[:-1]
+        # Remove special characters, plus newlines in some url and section names.
+        section = section.strip()
+        url = section_found["href"].strip().replace("\n", "")
+        name = section_found.parent.text.strip()[:-1].replace("\n", "")
 
         # If text is too long, split into chunks of equal sizes
         if len(section) > max_section_length:
```
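The cleanup is easy to check in isolation. Here is a minimal sketch with made-up inputs (the href, the title, and the trailing "¶" headerlink character are assumptions for illustration, not values taken from the repo), showing why the new code strips whitespace before slicing off the last character:

```python
# Minimal sketch of the cleanup above, using hypothetical sample values.
href = "#installation\n"    # assumed: an anchor href with a stray newline
title = "Installation¶\n"   # assumed: a section title ending in a headerlink char

url = href.strip().replace("\n", "")
print(url)  # -> "#installation"

# Stripping first matters: on the raw string, [:-1] would only drop the
# trailing "\n" and leave the "¶" in place. After .strip(), [:-1] removes
# the trailing headerlink character as intended.
name = title.strip()[:-1].replace("\n", "")
print(name)  # -> "Installation"
```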
```diff
@@ -81,14 +83,14 @@ def get_all_documents(root_dir: str, base_url: str, max_section_length: int = 20
     names = []
     for file in files:
         filepath = os.path.join(root_dir, file)
-        with open(filepath, "r") as file:
-            source = file.read()
+        with open(filepath, "r") as f:
+            source = f.read()
 
         soup = BeautifulSoup(source, "html.parser")
         sections_file, urls_file, names_file = get_all_subsections(soup)
         sections.extend(sections_file)
 
-        urls_file = [base_url + os.path.basename(file.name) + url for url in urls_file]
+        urls_file = [base_url + file + url for url in urls_file]
         urls.extend(urls_file)
 
         names.extend(names_file)
```
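The other two changes go together: renaming the file handle to `f` stops it from shadowing the loop variable `file`, which in turn lets the list comprehension use the filename directly instead of recovering it via `os.path.basename(file.name)`. A self-contained sketch of that fix follows; the directory, filename, and base_url are made up for illustration:

```python
import os
import tempfile

# Hypothetical setup standing in for the parsed docs directory.
root_dir = tempfile.mkdtemp()
base_url = "https://example.org/docs/"  # assumed base URL, not from the repo
with open(os.path.join(root_dir, "index.html"), "w") as f:
    f.write("<html></html>")

for file in os.listdir(root_dir):
    filepath = os.path.join(root_dir, file)
    # Before the fix, `with open(filepath, "r") as file:` rebound `file` to
    # the file handle, shadowing the loop variable; the filename then had to
    # be recovered via os.path.basename(file.name). Naming the handle `f`
    # keeps `file` bound to the plain filename.
    with open(filepath, "r") as f:
        source = f.read()
    # With the shadow gone, the URL is built from the loop variable directly.
    urls_file = [base_url + file + url for url in ["#section-1"]]
    print(urls_file)  # ['https://example.org/docs/index.html#section-1']
```

One behavioral difference worth noting: if `files` ever contained nested relative paths, `os.path.basename` would have dropped the subdirectory component, whereas `base_url + file` preserves it.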
 