masadonline committed on
Commit
322de72
·
verified ·
1 Parent(s): aac755e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -8
app.py CHANGED
@@ -90,16 +90,16 @@ def load_json_data(json_path):
90
  return ""
91
 
92
  # ---------------- Chunking ----------------
93
def chunk_text(text, tokenizer, chunk_size=512, chunk_overlap=50):
    """Split *text* into overlapping chunks of at most *chunk_size* tokens.

    The text is encoded once with *tokenizer*, sliced into windows that
    advance by ``chunk_size - chunk_overlap`` token ids, and each window
    is decoded back to a string.

    Args:
        text: Raw input string to split.
        tokenizer: Object exposing ``encode(text, add_special_tokens=...)``
            and ``decode(ids)`` (HuggingFace-style interface).
        chunk_size: Maximum number of token ids per chunk.
        chunk_overlap: Number of token ids shared between consecutive chunks.

    Returns:
        List of decoded chunk strings (empty list for empty input).
    """
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    total = len(token_ids)
    step = chunk_size - chunk_overlap
    pieces = []
    window_start = 0
    while window_start < total:
        window_end = min(window_start + chunk_size, total)
        pieces.append(tokenizer.decode(token_ids[window_start:window_end]))
        # Last window reaches the end of the ids — stop here.
        if window_end == total:
            break
        window_start += step
    return pieces
@@ -201,7 +201,7 @@ def setup_knowledge_base():
201
  except Exception as e:
202
  print(f"CSV read error: {e}")
203
 
204
- tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
205
  chunks = chunk_text(all_text, tokenizer)
206
  model = SentenceTransformer('all-mpnet-base-v2')
207
  embeddings = model.encode(chunks, show_progress_bar=False)
 
90
  return ""
91
 
92
  # ---------------- Chunking ----------------
93
def chunk_text(text, tokenizer, chunk_size=128, chunk_overlap=32, max_tokens=512):
    """Split *text* into overlapping chunks of at most *chunk_size* tokens.

    The text is tokenized once, sliced into windows that advance by
    ``chunk_size - chunk_overlap`` tokens, and each window is converted
    back to a string with the tokenizer.

    Args:
        text: Raw input string to split.
        tokenizer: Object exposing ``tokenize(text)`` and
            ``convert_tokens_to_string(tokens)`` (HuggingFace-style).
        chunk_size: Desired number of tokens per chunk.
        chunk_overlap: Number of tokens shared between consecutive chunks.
        max_tokens: Hard cap on tokens per chunk (e.g. the downstream
            model's context limit); the effective window is
            ``min(chunk_size, max_tokens)``.

    Returns:
        List of chunk strings (empty list for empty input).
    """
    tokens = tokenizer.tokenize(text)
    total = len(tokens)
    # Bug fix: max_tokens was previously accepted but never applied, so a
    # caller-supplied chunk_size could exceed the model's context window.
    window = min(chunk_size, max_tokens)
    # Bug fix: guarantee forward progress — with chunk_overlap >= chunk_size
    # the original loop never advanced `start` and spun forever.
    step = max(1, window - chunk_overlap)
    chunks = []
    start = 0
    while start < total:
        end = min(start + window, total)
        # Renamed from `chunk_text` to avoid shadowing this function's name.
        piece = tokenizer.convert_tokens_to_string(tokens[start:end])
        chunks.append(piece)
        if end == total:
            break
        start += step
    return chunks
 
201
  except Exception as e:
202
  print(f"CSV read error: {e}")
203
 
204
+ tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', model_max_length=512)
205
  chunks = chunk_text(all_text, tokenizer)
206
  model = SentenceTransformer('all-mpnet-base-v2')
207
  embeddings = model.encode(chunks, show_progress_bar=False)