Caden Shokat committed on
Commit
f0943d0
·
1 Parent(s): 14c03a7

chunking + processing

Browse files
src/chunking/chunk.py CHANGED
@@ -1,17 +1,21 @@
1
  import argparse
2
  import os
3
  import json
4
- from typing import List
5
  from chunker import Chunker
6
 
7
def chunk_file(file_path: str, chunker: Chunker) -> List[str]:
    """Load the file at *file_path* and return the pieces produced by *chunker*.

    The file is read as UTF-8 in one pass and handed to ``chunker.split_text``.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        contents = handle.read()
    return chunker.split_text(contents)
11
 
12
- def save_chunks(original_file: str, chunks: List[str], input_root: str, output_dir: str):
 
 
 
13
  rel_path = os.path.relpath(original_file, input_root)
14
- base_name = os.path.splitext(rel_path)[0] + "_chunks.json"
15
  out_path = os.path.join(output_dir, base_name)
16
 
17
  os.makedirs(os.path.dirname(out_path), exist_ok=True)
 
1
  import argparse
2
  import os
3
  import json
4
+ from typing import List, Dict
5
  from chunker import Chunker
6
 
7
def chunk_file(file_path: str, chunker: "Chunker") -> List[Dict[str, str]]:
    """Split one text file into chunk records with provenance metadata.

    Reads *file_path* as UTF-8, splits its contents with *chunker*, and wraps
    each resulting chunk in a dict identifying its source document and position.

    Args:
        file_path: Path to the UTF-8 text file to chunk.
        chunker: Object exposing ``split_text(text)`` returning a list of strings.

    Returns:
        One dict per chunk: ``{"doc_id": <file stem>, "chunk_id": "<index>",
        "text": <chunk text>}``.  (The previous ``List[str]`` annotation was
        wrong — the body returns dicts.)
    """
    # doc_id is the bare filename without directories or extension,
    # e.g. "paper" for "corpus/a/paper.txt".
    doc_id = os.path.splitext(os.path.basename(file_path))[0]

    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

    raw_chunks = chunker.split_text(text)
    # chunk_id is the chunk's 0-based position within the document, as a string.
    return [
        {"doc_id": doc_id, "chunk_id": str(i), "text": chunk}
        for i, chunk in enumerate(raw_chunks)
    ]
15
+
16
+ def save_chunks(original_file: str, chunks: List[Dict], input_root: str, output_dir: str):
17
  rel_path = os.path.relpath(original_file, input_root)
18
+ base_name = os.path.splitext(rel_path)[0] + ".json"
19
  out_path = os.path.join(output_dir, base_name)
20
 
21
  os.makedirs(os.path.dirname(out_path), exist_ok=True)
src/processing/generate_qas.py CHANGED
@@ -25,12 +25,17 @@ def generate(model: str, prompt: str) -> List[Dict]:
25
  messages=[
26
  { "role": "system", "content":
27
  """
28
- You are an expert at generating reading-comprehension questions in **strict JSON** form.
29
- Given the user’s chunk, you will output **only** a JSON array of objects—no commentary, no extra text.
30
- Each object must have:
 
 
 
 
 
 
31
  - question : the question text
32
  - answer_span : the exact sentence from the chunk that answers this question
33
- Output exactly 4 questions.
34
  """
35
  },
36
  { "role": "user", "content": prompt }
@@ -39,7 +44,6 @@ def generate(model: str, prompt: str) -> List[Dict]:
39
  max_tokens=NUM_QUESTIONS * 100
40
  )
41
  text = resp.choices[0].message.content.strip()
42
- print(text)
43
  raw = text
44
  raw = re.sub(r"^```(?:json)?\s*", "", raw)
45
  raw = re.sub(r"```$", "", raw).strip()
@@ -50,7 +54,6 @@ def generate(model: str, prompt: str) -> List[Dict]:
50
 
51
  arr = json.loads(raw)
52
 
53
- print(arr)
54
  return arr
55
  except json.JSONDecodeError as e:
56
  print("Failed to parse JSON, retrying...", e)
@@ -95,6 +98,7 @@ def main():
95
  }
96
  out_f.write(json.dumps(out, ensure_ascii=False) + "\n")
97
  total += 1
 
98
  time.sleep(args.sleep)
99
 
100
  print(f"Done — generated {total} questions across {len(chunks)} chunks into '{args.output}'.")
 
25
  messages=[
26
  { "role": "system", "content":
27
  """
28
+ Given the chunk, you will output **only** a JSON array of objects—no extra text.
29
+ You will generate exactly **4** question-answer items in strict JSON.
30
+
31
+ Rules:
32
+ - Questions must be answerable using ONLY the chunk text below (no outside knowledge).
33
+ - Prefer the chunk’s exact terminology; minimal paraphrase for grammar only.
34
+ - The answer_span must be an exact contiguous substring from the chunk.
35
+
36
+ Output:
37
  - question : the question text
38
  - answer_span : the exact sentence from the chunk that answers this question
 
39
  """
40
  },
41
  { "role": "user", "content": prompt }
 
44
  max_tokens=NUM_QUESTIONS * 100
45
  )
46
  text = resp.choices[0].message.content.strip()
 
47
  raw = text
48
  raw = re.sub(r"^```(?:json)?\s*", "", raw)
49
  raw = re.sub(r"```$", "", raw).strip()
 
54
 
55
  arr = json.loads(raw)
56
 
 
57
  return arr
58
  except json.JSONDecodeError as e:
59
  print("Failed to parse JSON, retrying...", e)
 
98
  }
99
  out_f.write(json.dumps(out, ensure_ascii=False) + "\n")
100
  total += 1
101
+ print(f"Chunk {total} done.")
102
  time.sleep(args.sleep)
103
 
104
  print(f"Done — generated {total} questions across {len(chunks)} chunks into '{args.output}'.")
src/processing/output.jsonl CHANGED
The diff for this file is too large to render. See raw diff