Update app.py
app.py CHANGED
@@ -90,16 +90,16 @@ def load_json_data(json_path):
     return ""
 
 # ---------------- Chunking ----------------
-def chunk_text(text, tokenizer, chunk_size=
-
+def chunk_text(text, tokenizer, chunk_size=128, chunk_overlap=32, max_tokens=512):
+    tokens = tokenizer.tokenize(text)
     chunks = []
     start = 0
-    while start < len(
-        end = min(start + chunk_size, len(
-
-        chunk_text = tokenizer.
+    while start < len(tokens):
+        end = min(start + chunk_size, len(tokens))
+        chunk_tokens = tokens[start:end]
+        chunk_text = tokenizer.convert_tokens_to_string(chunk_tokens)
         chunks.append(chunk_text)
-        if end == len(
+        if end == len(tokens):
             break
         start += chunk_size - chunk_overlap
     return chunks
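In the rewritten chunk_text, the window start advances by chunk_size - chunk_overlap = 96 tokens per iteration, so consecutive 128-token chunks share 32 tokens of overlap. A minimal sketch of the same windowing arithmetic, using a plain list of strings in place of BERT tokens (window_chunks is an illustrative name, not from app.py):

# Same sliding-window arithmetic as the new chunk_text, but over a plain
# list of strings so it runs without transformers installed.
def window_chunks(tokens, chunk_size=128, chunk_overlap=32):
    chunks, start = [], 0
    while start < len(tokens):
        end = min(start + chunk_size, len(tokens))
        chunks.append(tokens[start:end])
        if end == len(tokens):
            break
        start += chunk_size - chunk_overlap  # stride of 96 tokens per step
    return chunks

words = [f"w{i}" for i in range(300)]
for i, c in enumerate(window_chunks(words)):
    print(i, c[0], c[-1], len(c))
# 0 w0 w127 128
# 1 w96 w223 128
# 2 w192 w299 108

Note that max_tokens=512 is accepted by the new signature but never referenced in the loop body shown in this hunk.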
@@ -201,7 +201,7 @@ def setup_knowledge_base():
     except Exception as e:
         print(f"CSV read error: {e}")
 
-    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
+    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', model_max_length=512)
     chunks = chunk_text(all_text, tokenizer)
     model = SentenceTransformer('all-mpnet-base-v2')
     embeddings = model.encode(chunks, show_progress_bar=False)
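The second hunk pins model_max_length=512 on the tokenizer, matching the 512-token limit of bert-base-uncased. Since chunk_text emits windows of at most 128 tokens, every chunk stays well under that cap before embedding. A hedged end-to-end sketch of the pipeline these two hunks set up, assuming chunk_text from the first hunk is in scope (the sample text and the assert are illustrative additions, not part of app.py):

from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', model_max_length=512)
model = SentenceTransformer('all-mpnet-base-v2')

text = "example document text " * 500   # illustrative stand-in for all_text
chunks = chunk_text(text, tokenizer)    # 128-token windows, 32-token overlap
# Each chunk should re-tokenize to far fewer than 512 tokens.
assert all(len(tokenizer.tokenize(c)) <= 512 for c in chunks)

embeddings = model.encode(chunks, show_progress_bar=False)
print(len(chunks), embeddings.shape)    # embedding dim is 768 for all-mpnet-base-v2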