masadonline committed on
Commit
aac755e
·
verified ·
1 Parent(s): 7a5db40

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -7
app.py CHANGED
@@ -90,18 +90,21 @@ def load_json_data(json_path):
90
  return ""
91
 
92
  # ---------------- Chunking ----------------
93
def chunk_text(text, tokenizer, chunk_size=128, chunk_overlap=32):
    """Split *text* into overlapping chunks measured in tokens.

    The text is tokenized with ``tokenizer.tokenize``; consecutive windows of
    at most ``chunk_size`` tokens are turned back into strings with
    ``tokenizer.convert_tokens_to_string``. Each window starts
    ``chunk_size - chunk_overlap`` tokens after the previous one, so
    neighbouring chunks share ``chunk_overlap`` tokens.

    Args:
        text: The string to split.
        tokenizer: Object exposing ``tokenize(text)`` and
            ``convert_tokens_to_string(tokens)`` (presumably a Hugging Face
            tokenizer -- confirm against the caller).
        chunk_size: Maximum number of tokens per chunk; must be positive.
        chunk_overlap: Number of tokens shared by consecutive chunks; must be
            in ``[0, chunk_size)``.

    Returns:
        A list of chunk strings, empty when the text yields no tokens.

    Raises:
        ValueError: If ``chunk_size`` or ``chunk_overlap`` would produce a
            window that never advances.
    """
    # Without this guard a non-positive step makes the loop below spin forever
    # on any text longer than one chunk.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= chunk_overlap < chunk_size:
        raise ValueError("chunk_overlap must satisfy 0 <= chunk_overlap < chunk_size")

    tokens = tokenizer.tokenize(text)
    total = len(tokens)
    step = chunk_size - chunk_overlap

    chunks = []
    start = 0
    while start < total:
        end = min(start + chunk_size, total)
        chunks.append(tokenizer.convert_tokens_to_string(tokens[start:end]))
        if end == total:
            # The final window already reached the last token; another
            # iteration would only emit a redundant, fully-overlapping tail.
            break
        start += step
    return chunks
104
 
 
105
  def retrieve_chunks(question, index, embed_model, text_chunks, k=3):
106
  q_embedding = embed_model.encode(question)
107
  D, I = index.search(np.array([q_embedding]), k)
 
90
  return ""
91
 
92
  # ---------------- Chunking ----------------
93
def chunk_text(text, tokenizer, chunk_size=512, chunk_overlap=50):
    """Split *text* into overlapping chunks measured in tokenizer ids.

    The text is encoded once with ``tokenizer.encode(text,
    add_special_tokens=False)``; consecutive windows of at most ``chunk_size``
    ids are decoded back to strings with ``tokenizer.decode``. Each window
    starts ``chunk_size - chunk_overlap`` ids after the previous one, so
    neighbouring chunks share ``chunk_overlap`` ids.

    Args:
        text: The string to split.
        tokenizer: Object exposing ``encode(text, add_special_tokens=...)``
            and ``decode(ids)`` (presumably a Hugging Face tokenizer --
            confirm against the caller).
        chunk_size: Maximum number of ids per chunk; must be positive.
        chunk_overlap: Number of ids shared by consecutive chunks; must be in
            ``[0, chunk_size)``.

    Returns:
        A list of decoded chunk strings, empty when the text encodes to no ids.

    Raises:
        ValueError: If ``chunk_size`` or ``chunk_overlap`` would produce a
            window that never advances.
    """
    # A non-positive step would never advance the window, looping forever on
    # any text longer than one chunk, so reject it up front.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= chunk_overlap < chunk_size:
        raise ValueError("chunk_overlap must satisfy 0 <= chunk_overlap < chunk_size")

    input_ids = tokenizer.encode(text, add_special_tokens=False)
    total = len(input_ids)
    step = chunk_size - chunk_overlap

    chunks = []
    for start in range(0, total, step):
        end = min(start + chunk_size, total)
        # Named ``decoded`` rather than ``chunk_text`` so the local does not
        # shadow this function's own name.
        decoded = tokenizer.decode(input_ids[start:end])
        chunks.append(decoded)
        if end == total:
            # The window already covers the last id; any further start offset
            # would only yield a redundant, fully-overlapping tail.
            break
    return chunks
106
 
107
+
108
  def retrieve_chunks(question, index, embed_model, text_chunks, k=3):
109
  q_embedding = embed_model.encode(question)
110
  D, I = index.search(np.array([q_embedding]), k)