DurgaDeepak committed on
Commit
e9706de
·
verified ·
1 Parent(s): 9b1fba6

Update ingestion.py

Browse files
Files changed (1) hide show
  1. ingestion.py +26 -11
ingestion.py CHANGED
@@ -1,20 +1,29 @@
1
  import os
2
  import glob
 
 
3
  from datasets import Dataset
4
  from unstructured.partition.pdf import partition_pdf
5
  from transformers import RagTokenizer
 
 
 
 
 
 
 
 
 
 
6
 
7
- def ingest_and_push(dataset_name="username/mealplan-chunks"):
8
- # Initialize tokenizer for token-aware splitting
9
- tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
10
  texts, sources, pages = [], [], []
11
 
 
12
  for pdf_path in glob.glob("pdfs/*.pdf"):
13
  book = os.path.basename(pdf_path)
14
  pages_data = partition_pdf(filename=pdf_path)
15
- for page_num, page in enumerate(pages_data, start=1):
16
- # Encode page text into token windows
17
- enc = tokenizer(
18
  page.text,
19
  max_length=800,
20
  truncation=True,
@@ -22,14 +31,13 @@ def ingest_and_push(dataset_name="username/mealplan-chunks"):
22
  stride=50,
23
  return_tensors="pt"
24
  )
25
- # Decode each token window back to text chunk
26
  for token_ids in enc["input_ids"]:
27
- chunk = tokenizer.decode(token_ids, skip_special_tokens=True)
28
  texts.append(chunk)
29
  sources.append(book)
30
- pages.append(page_num)
31
 
32
- # Build HF Dataset
33
  ds = Dataset.from_dict({
34
  "text": texts,
35
  "source": sources,
@@ -37,5 +45,12 @@ def ingest_and_push(dataset_name="username/mealplan-chunks"):
37
  })
38
  ds.push_to_hub(dataset_name, token=True)
39
 
 
 
 
 
 
 
 
40
  if __name__ == "__main__":
41
- ingest_and_push()
 
1
  import os
2
  import glob
3
+ import faiss
4
+ import numpy as np
5
  from datasets import Dataset
6
  from unstructured.partition.pdf import partition_pdf
7
  from transformers import RagTokenizer
8
+ from sentence_transformers import SentenceTransformer
9
+
10
+ def ingest_and_push(
11
+ dataset_name="username/mealplan-chunks",
12
+ index_path="mealplan.index"
13
+ ):
14
+ # 1) Tokenizer for chunking
15
+ rag_tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
16
+ # 2) Embedder for FAISS
17
+ embedder = SentenceTransformer("all-MiniLM-L6-v2")
18
 
 
 
 
19
  texts, sources, pages = [], [], []
20
 
21
+ # 3) Chunk each PDF
22
  for pdf_path in glob.glob("pdfs/*.pdf"):
23
  book = os.path.basename(pdf_path)
24
  pages_data = partition_pdf(filename=pdf_path)
25
+ for pg_num, page in enumerate(pages_data, start=1):
26
+ enc = rag_tokenizer(
 
27
  page.text,
28
  max_length=800,
29
  truncation=True,
 
31
  stride=50,
32
  return_tensors="pt"
33
  )
 
34
  for token_ids in enc["input_ids"]:
35
+ chunk = rag_tokenizer.decode(token_ids, skip_special_tokens=True)
36
  texts.append(chunk)
37
  sources.append(book)
38
+ pages.append(pg_num)
39
 
40
+ # 4) Build HF Dataset
41
  ds = Dataset.from_dict({
42
  "text": texts,
43
  "source": sources,
 
45
  })
46
  ds.push_to_hub(dataset_name, token=True)
47
 
48
+ # 5) Build FAISS index
49
+ embeddings = embedder.encode(texts, convert_to_numpy=True)
50
+ dim = embeddings.shape[1]
51
+ index = faiss.IndexFlatL2(dim) # CPU index
52
+ index.add(embeddings)
53
+ faiss.write_index(index, index_path)
54
+
55
  if __name__ == "__main__":
56
+ ingest_and_push()