Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -90,18 +90,21 @@ def load_json_data(json_path):
|
|
90 |
return ""
|
91 |
|
92 |
# ---------------- Chunking ----------------
|
93 |
-
def chunk_text(text, tokenizer, chunk_size=
|
94 |
-
|
95 |
chunks = []
|
96 |
start = 0
|
97 |
-
while start < len(
|
98 |
-
end = min(start + chunk_size, len(
|
99 |
-
|
100 |
-
|
101 |
-
|
|
|
|
|
102 |
start += chunk_size - chunk_overlap
|
103 |
return chunks
|
104 |
|
|
|
105 |
def retrieve_chunks(question, index, embed_model, text_chunks, k=3):
|
106 |
q_embedding = embed_model.encode(question)
|
107 |
D, I = index.search(np.array([q_embedding]), k)
|
|
|
90 |
return ""
|
91 |
|
92 |
# ---------------- Chunking ----------------
|
93 |
+
def chunk_text(text, tokenizer, chunk_size=512, chunk_overlap=50):
    """Split *text* into overlapping chunks measured in tokenizer tokens.

    The text is encoded once (without special tokens), sliced into windows
    of at most ``chunk_size`` token ids, and each window is decoded back
    to a string.

    Args:
        text: The string to split.
        tokenizer: Object exposing ``encode(text, add_special_tokens=False)``
            returning a sliceable sequence of token ids, and ``decode(ids)``
            returning a string.
        chunk_size: Maximum number of tokens per chunk; must be positive.
        chunk_overlap: Number of tokens shared between consecutive chunks;
            must be smaller than ``chunk_size``.

    Returns:
        A list of decoded chunk strings, in document order. Empty when the
        tokenizer produces no tokens for *text*.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``chunk_overlap``
            is not smaller than ``chunk_size``.
    """
    # A non-positive stride would keep the window from ever advancing and
    # the loop below would never finish, so reject it up front.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if chunk_overlap >= chunk_size:
        raise ValueError(
            f"chunk_overlap ({chunk_overlap}) must be smaller than "
            f"chunk_size ({chunk_size})"
        )

    input_ids = tokenizer.encode(text, add_special_tokens=False)
    total = len(input_ids)
    stride = chunk_size - chunk_overlap

    chunks = []
    for start in range(0, total, stride):
        end = min(start + chunk_size, total)
        chunks.append(tokenizer.decode(input_ids[start:end]))
        # Stop once a window reaches the end of the text; otherwise the
        # next window would only repeat the overlapping tail.
        if end == total:
            break
    return chunks
|
106 |
|
107 |
+
|
108 |
def retrieve_chunks(question, index, embed_model, text_chunks, k=3):
|
109 |
q_embedding = embed_model.encode(question)
|
110 |
D, I = index.search(np.array([q_embedding]), k)
|