masadonline committed
Commit 021a9d3 · verified · 1 parent: 4c9633a

Update app.py

Files changed (1)
  1. app.py +9 -3
app.py CHANGED
@@ -35,16 +35,22 @@ def extract_text_from_docx(docx_path):
     return ""
 
 # --- Chunking & Retrieval ---
-def chunk_text(text, tokenizer, chunk_size=150, chunk_overlap=30):
+def chunk_text(text, tokenizer, chunk_size=128, chunk_overlap=32, max_tokens=512):
     tokens = tokenizer.tokenize(text)
-    chunks, start = [], 0
+    chunks = []
+    start = 0
     while start < len(tokens):
         end = min(start + chunk_size, len(tokens))
         chunk_tokens = tokens[start:end]
-        chunks.append(tokenizer.convert_tokens_to_string(chunk_tokens))
+        # Drop chunk if it's too long after detokenization
+        chunk_text = tokenizer.convert_tokens_to_string(chunk_tokens)
+        # Double-check token count with tokenizer to be safe
+        if len(tokenizer.encode(chunk_text)) <= max_tokens:
+            chunks.append(chunk_text.strip())
         start += chunk_size - chunk_overlap
     return chunks
 
+
 def retrieve_chunks(question, index, embed_model, text_chunks, k=3):
     question_embedding = embed_model.encode(question)
     D, I = index.search(np.array([question_embedding]), k)
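
For reference, below is a minimal standalone sketch of how the patched chunk_text feeds the retrieve_chunks signature shown in the context lines. The tokenizer name ("bert-base-uncased"), the embedding model ("all-MiniLM-L6-v2"), the FAISS index construction, and retrieve_chunks' return statement are illustrative assumptions, not something this commit specifies.

import numpy as np
import faiss
from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer

def chunk_text(text, tokenizer, chunk_size=128, chunk_overlap=32, max_tokens=512):
    tokens = tokenizer.tokenize(text)
    chunks = []
    start = 0
    while start < len(tokens):
        end = min(start + chunk_size, len(tokens))
        chunk_tokens = tokens[start:end]
        # Detokenize, then re-encode to verify the chunk fits the model limit.
        # (Local renamed from chunk_text to avoid shadowing the function name.)
        chunk = tokenizer.convert_tokens_to_string(chunk_tokens)
        if len(tokenizer.encode(chunk)) <= max_tokens:
            chunks.append(chunk.strip())
        start += chunk_size - chunk_overlap
    return chunks

def retrieve_chunks(question, index, embed_model, text_chunks, k=3):
    question_embedding = embed_model.encode(question)
    D, I = index.search(np.array([question_embedding]), k)
    # Assumed return: map FAISS row indices back to the chunk strings.
    return [text_chunks[i] for i in I[0]]

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")   # assumed model
embed_model = SentenceTransformer("all-MiniLM-L6-v2")            # assumed model
chunks = chunk_text("some long document text " * 200, tokenizer)
embeddings = embed_model.encode(chunks)                          # (n_chunks, dim) float32
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)
print(retrieve_chunks("What does the document say?", index, embed_model, chunks))

One detail worth noting: tokenizer.encode() adds special tokens by default ([CLS]/[SEP] for BERT-style tokenizers), so the max_tokens check in the patch is slightly conservative relative to the raw chunk length, which is the safe direction for a downstream model with a hard 512-token limit.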