Caden Shokat committed on
Commit
f0943d0
·
1 Parent(s): 14c03a7

chunking + processing

Browse files
src/chunking/chunk.py CHANGED
@@ -1,17 +1,21 @@
1
  import argparse
2
  import os
3
  import json
4
- from typing import List
5
  from chunker import Chunker
6
 
7
def chunk_file(file_path: str, chunker: Chunker) -> List[str]:
    """Load the file at *file_path* and return the pieces produced by *chunker*.

    The file is read as UTF-8 in one pass and handed to ``chunker.split_text``.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        contents = handle.read()
    return chunker.split_text(contents)
11
 
12
- def save_chunks(original_file: str, chunks: List[str], input_root: str, output_dir: str):
 
 
 
13
  rel_path = os.path.relpath(original_file, input_root)
14
- base_name = os.path.splitext(rel_path)[0] + "_chunks.json"
15
  out_path = os.path.join(output_dir, base_name)
16
 
17
  os.makedirs(os.path.dirname(out_path), exist_ok=True)
 
1
  import argparse
2
  import os
3
  import json
4
+ from typing import List, Dict
5
  from chunker import Chunker
6
 
7
def chunk_file(file_path: str, chunker: "Chunker") -> List[Dict[str, str]]:
    """Split one text file into chunk records with provenance metadata.

    Reads *file_path* as UTF-8, splits its contents with *chunker*, and wraps
    each resulting chunk in a dict identifying its source document and position.

    Args:
        file_path: Path to the UTF-8 text file to chunk.
        chunker: Object exposing ``split_text(text)`` returning a list of strings.

    Returns:
        One dict per chunk: ``{"doc_id": <file stem>, "chunk_id": "<index>",
        "text": <chunk text>}``.  (The previous ``List[str]`` annotation was
        wrong — the body returns dicts.)
    """
    # doc_id is the bare filename without directories or extension,
    # e.g. "paper" for "corpus/a/paper.txt".
    doc_id = os.path.splitext(os.path.basename(file_path))[0]

    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

    raw_chunks = chunker.split_text(text)
    # chunk_id is the chunk's 0-based position within the document, as a string.
    return [
        {"doc_id": doc_id, "chunk_id": str(i), "text": chunk}
        for i, chunk in enumerate(raw_chunks)
    ]
15
+
16
+ def save_chunks(original_file: str, chunks: List[Dict], input_root: str, output_dir: str):
17
  rel_path = os.path.relpath(original_file, input_root)
18
+ base_name = os.path.splitext(rel_path)[0] + ".json"
19
  out_path = os.path.join(output_dir, base_name)
20
 
21
  os.makedirs(os.path.dirname(out_path), exist_ok=True)
src/processing/generate_qas.py CHANGED
@@ -25,12 +25,17 @@ def generate(model: str, prompt: str) -> List[Dict]:
25
  messages=[
26
  { "role": "system", "content":
27
  """
28
- You are an expert at generating reading-comprehension questions in **strict JSON** form.
29
- Given the user’s chunk, you will output **only** a JSON array of objects—no commentary, no extra text.
30
- Each object must have:
 
 
 
 
 
 
31
  - question : the question text
32
  - answer_span : the exact sentence from the chunk that answers this question
33
- Output exactly 4 questions.
34
  """
35
  },
36
  { "role": "user", "content": prompt }
@@ -39,7 +44,6 @@ def generate(model: str, prompt: str) -> List[Dict]:
39
  max_tokens=NUM_QUESTIONS * 100
40
  )
41
  text = resp.choices[0].message.content.strip()
42
- print(text)
43
  raw = text
44
  raw = re.sub(r"^```(?:json)?\s*", "", raw)
45
  raw = re.sub(r"```$", "", raw).strip()
@@ -50,7 +54,6 @@ def generate(model: str, prompt: str) -> List[Dict]:
50
 
51
  arr = json.loads(raw)
52
 
53
- print(arr)
54
  return arr
55
  except json.JSONDecodeError as e:
56
  print("Failed to parse JSON, retrying...", e)
@@ -95,6 +98,7 @@ def main():
95
  }
96
  out_f.write(json.dumps(out, ensure_ascii=False) + "\n")
97
  total += 1
 
98
  time.sleep(args.sleep)
99
 
100
  print(f"Done — generated {total} questions across {len(chunks)} chunks into '{args.output}'.")
 
25
  messages=[
26
  { "role": "system", "content":
27
  """
28
+ Given the chunk, you will output **only** a JSON array of objects—no extra text.
29
+ You will generate exactly **4** question-answer items in strict JSON.
30
+
31
+ Rules:
32
+ - Questions must be answerable using ONLY the chunk text below (no outside knowledge).
33
+ - Prefer the chunk’s exact terminology; minimal paraphrase for grammar only.
34
+ - The answer_span must be an exact contiguous substring from the chunk.
35
+
36
+ Output:
37
  - question : the question text
38
  - answer_span : the exact sentence from the chunk that answers this question
 
39
  """
40
  },
41
  { "role": "user", "content": prompt }
 
44
  max_tokens=NUM_QUESTIONS * 100
45
  )
46
  text = resp.choices[0].message.content.strip()
 
47
  raw = text
48
  raw = re.sub(r"^```(?:json)?\s*", "", raw)
49
  raw = re.sub(r"```$", "", raw).strip()
 
54
 
55
  arr = json.loads(raw)
56
 
 
57
  return arr
58
  except json.JSONDecodeError as e:
59
  print("Failed to parse JSON, retrying...", e)
 
98
  }
99
  out_f.write(json.dumps(out, ensure_ascii=False) + "\n")
100
  total += 1
101
+ print(f"Chunk {total} done.")
102
  time.sleep(args.sleep)
103
 
104
  print(f"Done — generated {total} questions across {len(chunks)} chunks into '{args.output}'.")
src/processing/output.jsonl CHANGED
The diff for this file is too large to render. See raw diff