Kevin Hu
committed on
Commit
·
ef2a724
1
Parent(s):
8f1a7d6
add sql to naive parser (#1908)
Browse files
### What problem does this PR solve?
### Type of change
- [ ] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
- api/utils/file_utils.py +1 -1
- deepdoc/parser/txt_parser.py +9 -5
- rag/app/naive.py +4 -2
api/utils/file_utils.py
CHANGED
|
@@ -156,7 +156,7 @@ def filename_type(filename):
|
|
| 156 |
return FileType.PDF.value
|
| 157 |
|
| 158 |
if re.match(
|
| 159 |
-
r".*\.(eml|doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html)$", filename):
|
| 160 |
return FileType.DOC.value
|
| 161 |
|
| 162 |
if re.match(
|
|
|
|
| 156 |
return FileType.PDF.value
|
| 157 |
|
| 158 |
if re.match(
|
| 159 |
+
r".*\.(eml|doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html|sql)$", filename):
|
| 160 |
return FileType.DOC.value
|
| 161 |
|
| 162 |
if re.match(
|
deepdoc/parser/txt_parser.py
CHANGED
|
@@ -12,6 +12,7 @@
|
|
| 12 |
#
|
| 13 |
|
| 14 |
from rag.nlp import find_codec,num_tokens_from_string
|
|
|
|
| 15 |
|
| 16 |
class RAGFlowTxtParser:
|
| 17 |
def __call__(self, fnm, binary=None, chunk_token_num=128):
|
|
@@ -29,14 +30,17 @@ class RAGFlowTxtParser:
|
|
| 29 |
return self.parser_txt(txt, chunk_token_num)
|
| 30 |
|
| 31 |
@classmethod
|
| 32 |
-
def parser_txt(cls, txt, chunk_token_num=128):
|
| 33 |
if type(txt) != str:
|
| 34 |
raise TypeError("txt type should be str!")
|
| 35 |
sections = []
|
| 36 |
-
for sec in
|
|
|
|
|
|
|
|
|
|
| 37 |
if num_tokens_from_string(sec) > 10 * int(chunk_token_num):
|
| 38 |
-
sections.append(
|
| 39 |
-
sections.append(
|
| 40 |
else:
|
| 41 |
-
sections.append(
|
| 42 |
return sections
|
|
|
|
| 12 |
#
|
| 13 |
|
| 14 |
from rag.nlp import find_codec,num_tokens_from_string
|
| 15 |
+
import re
|
| 16 |
|
| 17 |
class RAGFlowTxtParser:
|
| 18 |
def __call__(self, fnm, binary=None, chunk_token_num=128):
|
|
|
|
| 30 |
return self.parser_txt(txt, chunk_token_num)
|
| 31 |
|
| 32 |
@classmethod
|
| 33 |
+
def parser_txt(cls, txt, chunk_token_num=128, delimiter="\n!?;。;!?"):
|
| 34 |
if type(txt) != str:
|
| 35 |
raise TypeError("txt type should be str!")
|
| 36 |
sections = []
|
| 37 |
+
for sec in re.split(r"[%s]+"%delimiter, txt):
|
| 38 |
+
if sections and sec in delimiter:
|
| 39 |
+
sections[-1][0] += sec
|
| 40 |
+
continue
|
| 41 |
if num_tokens_from_string(sec) > 10 * int(chunk_token_num):
|
| 42 |
+
sections.append([sec[: int(len(sec) / 2)], ""])
|
| 43 |
+
sections.append([sec[int(len(sec) / 2) :], ""])
|
| 44 |
else:
|
| 45 |
+
sections.append([sec, ""])
|
| 46 |
return sections
|
rag/app/naive.py
CHANGED
|
@@ -224,9 +224,11 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|
| 224 |
excel_parser = ExcelParser()
|
| 225 |
sections = [(l, "") for l in excel_parser.html(binary) if l]
|
| 226 |
|
| 227 |
-
elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt)$", filename, re.IGNORECASE):
|
| 228 |
callback(0.1, "Start to parse.")
|
| 229 |
-
sections = TxtParser()(filename,binary,
|
|
|
|
|
|
|
| 230 |
callback(0.8, "Finish parsing.")
|
| 231 |
|
| 232 |
elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
|
|
|
|
| 224 |
excel_parser = ExcelParser()
|
| 225 |
sections = [(l, "") for l in excel_parser.html(binary) if l]
|
| 226 |
|
| 227 |
+
elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE):
|
| 228 |
callback(0.1, "Start to parse.")
|
| 229 |
+
sections = TxtParser()(filename,binary,
|
| 230 |
+
parser_config.get("chunk_token_num", 128),
|
| 231 |
+
parser_config.get("delimiter", "\n!?;。;!?"))
|
| 232 |
callback(0.8, "Finish parsing.")
|
| 233 |
|
| 234 |
elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
|