Caden Shokat committed
Commit f0943d0 · Parent(s): 14c03a7
chunking + processing

Files changed:
- src/chunking/chunk.py +8 -4
- src/processing/generate_qas.py +10 -6
- src/processing/output.jsonl +0 -0
src/chunking/chunk.py
CHANGED
@@ -1,17 +1,21 @@
 import argparse
 import os
 import json
-from typing import List
+from typing import List, Dict
 from chunker import Chunker
 
 def chunk_file(file_path: str, chunker: Chunker) -> List[str]:
+    doc_id = os.path.splitext(os.path.basename(file_path))[0]
+
     with open(file_path, 'r', encoding='utf-8') as f:
         text = f.read()
-    return chunker.split_text(text)
 
-
+    raw_chunks = chunker.split_text(text)
+    return [{ "doc_id": doc_id, "chunk_id": f"{i}", "text": chunk } for i, chunk in enumerate(raw_chunks)]
+
+def save_chunks(original_file: str, chunks: List[Dict], input_root: str, output_dir: str):
     rel_path = os.path.relpath(original_file, input_root)
-    base_name = os.path.splitext(rel_path)[0] + "
+    base_name = os.path.splitext(rel_path)[0] + ".json"
     out_path = os.path.join(output_dir, base_name)
 
     os.makedirs(os.path.dirname(out_path), exist_ok=True)
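For orientation, below is a minimal driver sketch of how the reworked chunk_file/save_chunks pair might be wired together. It is not part of this commit: the directory walk, the .txt filter, the no-argument Chunker() constructor, and the `from chunk import ...` module path are all assumptions; only split_text() and the function bodies shown in the diff come from the source.

# Hypothetical driver for the new chunk_file/save_chunks flow; not part of this commit.
# Assumes Chunker() takes no constructor arguments, the inputs are .txt files,
# and chunk.py is importable as the module `chunk`.
import os

from chunker import Chunker
from chunk import chunk_file, save_chunks


def run(input_root: str, output_dir: str) -> None:
    chunker = Chunker()
    for dirpath, _, filenames in os.walk(input_root):
        for name in filenames:
            if not name.endswith(".txt"):
                continue
            path = os.path.join(dirpath, name)
            # chunk_file now returns a list of dicts: {"doc_id", "chunk_id", "text"}
            chunks = chunk_file(path, chunker)
            # save_chunks mirrors the input tree under output_dir and targets <relpath>.json
            save_chunks(path, chunks, input_root, output_dir)


if __name__ == "__main__":
    run("data/raw", "data/chunks")

Under these assumptions each input file maps to one <relpath>.json output under output_dir, which is what the rel_path/base_name handling in save_chunks sets up.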
src/processing/generate_qas.py
CHANGED
@@ -25,12 +25,17 @@ def generate(model: str, prompt: str) -> List[Dict]:
         messages=[
             { "role": "system", "content":
                 """
-
-
-
+                Given the chunk, you will output **only** a JSON array of objects—no extra text.
+                You will generate exactly **4** question-answer items in strict JSON.
+
+                Rules:
+                - Questions must be answerable using ONLY the chunk text below (no outside knowledge).
+                - Prefer the chunk’s exact terminology; minimal paraphrase for grammar only.
+                - The answer_span must be an exact contiguous substring from the chunk.
+
+                Output:
                 - question : the question text
                 - answer_span : the exact sentence from the chunk that answers this question
-                Output exactly 4 questions.
                 """
             },
             { "role": "user", "content": prompt }
@@ -39,7 +44,6 @@ def generate(model: str, prompt: str) -> List[Dict]:
         max_tokens=NUM_QUESTIONS * 100
     )
     text = resp.choices[0].message.content.strip()
-    print(text)
     raw = text
     raw = re.sub(r"^```(?:json)?\s*", "", raw)
     raw = re.sub(r"```$", "", raw).strip()
@@ -50,7 +54,6 @@ def generate(model: str, prompt: str) -> List[Dict]:
 
         arr = json.loads(raw)
 
-        print(arr)
         return arr
     except json.JSONDecodeError as e:
         print("Failed to parse JSON, retrying...", e)
@@ -95,6 +98,7 @@ def main():
            }
            out_f.write(json.dumps(out, ensure_ascii=False) + "\n")
            total += 1
+           print(f"Chunk {total} done.")
            time.sleep(args.sleep)
 
    print(f"Done — generated {total} questions across {len(chunks)} chunks into '{args.output}'.")
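For reference, the fence-stripping and JSON-parsing step that generate() keeps (the two re.sub calls followed by json.loads) can be exercised in isolation. The sketch below is not from this commit: parse_qa_array is a hypothetical helper name for logic the script runs inline, and the example string is made up.

# Standalone sketch of the fence-stripping + JSON parse step kept in generate().
# parse_qa_array is a hypothetical name; in the commit this logic is inline.
import json
import re
from typing import Dict, List


def parse_qa_array(text: str) -> List[Dict]:
    raw = text.strip()
    raw = re.sub(r"^```(?:json)?\s*", "", raw)  # drop a leading ``` or ```json fence
    raw = re.sub(r"```$", "", raw).strip()      # drop a trailing ``` fence
    return json.loads(raw)                      # expects a JSON array of {"question", "answer_span"} objects


if __name__ == "__main__":
    example = '```json\n[{"question": "What does chunk_file return?", "answer_span": "a list of chunk dicts"}]\n```'
    print(parse_qa_array(example))

If the model returns anything other than a bare array, json.loads raises JSONDecodeError, which the script already catches in order to print "Failed to parse JSON, retrying..." and try again.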
src/processing/output.jsonl
CHANGED
The diff for this file is too large to render.
See raw diff