Update app.py
app.py CHANGED
@@ -35,16 +35,22 @@ def extract_text_from_docx(docx_path):
     return ""
 
 # --- Chunking & Retrieval ---
-def chunk_text(text, tokenizer, chunk_size=
+def chunk_text(text, tokenizer, chunk_size=128, chunk_overlap=32, max_tokens=512):
     tokens = tokenizer.tokenize(text)
-    chunks
+    chunks = []
+    start = 0
     while start < len(tokens):
         end = min(start + chunk_size, len(tokens))
         chunk_tokens = tokens[start:end]
-
+        # Drop chunk if it's too long after detokenization
+        chunk_text = tokenizer.convert_tokens_to_string(chunk_tokens)
+        # Double-check token count with tokenizer to be safe
+        if len(tokenizer.encode(chunk_text)) <= max_tokens:
+            chunks.append(chunk_text.strip())
         start += chunk_size - chunk_overlap
     return chunks
 
+
 def retrieve_chunks(question, index, embed_model, text_chunks, k=3):
     question_embedding = embed_model.encode(question)
     D, I = index.search(np.array([question_embedding]), k)