Spaces:

Allanatrix
/

Nexa_Data_Studio

Sleeping

File size: 1,172 Bytes

ef4c8c3

from typing import Dict, List
from Tokenization.preprocessing.Clean_text import clean_text
from Tokenization.preprocessing.Segment_paragraphs import segment_paragraphs

def preprocess_sample(paper: Dict) -> List[Dict]:
    """
    Clean and segment a paper into samples for LLM ingestion.

    Every sample is a shallow copy of ``paper`` with the "title",
    "abstract", "full_text" and "section" entries overwritten; any other
    keys are carried over unchanged.

    Args:
        paper: Record with optional "title", "abstract" and "full_text"
            entries. A missing key or a falsy value (such as ``None``) is
            treated as an empty string.

    Returns:
        A list of dicts: first one sample with section "abstract" holding
        the cleaned title and abstract (emitted only when at least one of
        them is non-empty), then one sample with section "paragraph" per
        paragraph of the cleaned full text.
    """
    # dict.get() only falls back to its default when the key is absent, so a
    # key that is present with a None value (e.g. a JSON null) is coalesced
    # to "" here instead of being handed to clean_text as None.
    title = clean_text(paper.get("title") or "")
    abstract = clean_text(paper.get("abstract") or "")
    full_text = clean_text(paper.get("full_text") or "")

    # Skip segmentation entirely when there is no body text.
    paragraphs = segment_paragraphs(full_text) if full_text else []

    samples: List[Dict] = []

    # Title + abstract sample
    if title or abstract:
        samples.append(
            {
                **paper,
                "title": title,
                "abstract": abstract,
                "full_text": "",
                "section": "abstract",
            }
        )

    # Paragraph samples: the title is repeated on each one, the abstract is
    # blanked so it only appears in the "abstract" sample.
    samples.extend(
        {
            **paper,
            "title": title,
            "abstract": "",
            "full_text": para,
            "section": "paragraph",
        }
        for para in paragraphs
    )
    return samples