Allanatrix's picture
Upload 50 files
ef4c8c3 verified
from typing import Dict, List
from Tokenization.preprocessing.Clean_text import clean_text
from Tokenization.preprocessing.Segment_paragraphs import segment_paragraphs
def preprocess_sample(paper: Dict) -> List[Dict]:
    """
    Turn one paper record into cleaned samples for LLM ingestion.

    The "title", "abstract" and "full_text" entries of ``paper`` are read
    (a missing key falls back to an empty string) and each one is passed
    through ``clean_text``. The returned list contains, in this order:

    * one sample with "section" set to "abstract", carrying the cleaned
      title and abstract and an empty "full_text"; it is produced only
      when the cleaned title or the cleaned abstract is truthy;
    * one sample with "section" set to "paragraph" for every item that
      ``segment_paragraphs`` returns for the cleaned full text, carrying
      the cleaned title, an empty "abstract" and that item as "full_text".

    Every sample is a shallow copy of ``paper`` with those four keys
    overwritten, so any other keys of the record are carried along.
    """
    cleaned_title = clean_text(paper.get("title", ""))
    cleaned_abstract = clean_text(paper.get("abstract", ""))
    cleaned_body = clean_text(paper.get("full_text", ""))

    # Segmentation is skipped entirely when there is no body text.
    chunks = []
    if cleaned_body:
        chunks = segment_paragraphs(cleaned_body)

    results = []

    # Leading sample: title + abstract only, body blanked out.
    if cleaned_title or cleaned_abstract:
        results.append(
            {
                **paper,
                "title": cleaned_title,
                "abstract": cleaned_abstract,
                "full_text": "",
                "section": "abstract",
            }
        )

    # One sample per paragraph, each tagged with the cleaned title.
    results.extend(
        [
            {
                **paper,
                "title": cleaned_title,
                "abstract": "",
                "full_text": chunk,
                "section": "paragraph",
            }
            for chunk in chunks
        ]
    )
    return results