File size: 1,172 Bytes
ef4c8c3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
from typing import Dict, List
from Tokenization.preprocessing.Clean_text import clean_text
from Tokenization.preprocessing.Segment_paragraphs import segment_paragraphs

def preprocess_sample(paper: Dict) -> List[Dict]:
    """Clean and segment a paper into samples for LLM ingestion.

    The ``title``, ``abstract`` and ``full_text`` entries of ``paper`` are
    passed through ``clean_text``; the cleaned full text, when non-empty, is
    split with ``segment_paragraphs``.

    Args:
        paper: Mapping describing one paper. Missing keys and keys whose value
            is ``None`` are both treated as empty text. All other keys are
            carried over unchanged into every sample.

    Returns:
        A list of shallow copies of ``paper`` with ``title``, ``abstract``,
        ``full_text`` and ``section`` overwritten:

        * first, one sample with ``section == "abstract"`` holding the cleaned
          title and abstract (emitted only if at least one of them is
          non-empty after cleaning);
        * then one sample with ``section == "paragraph"`` per segmented
          paragraph, holding the cleaned title and that paragraph as
          ``full_text``.
    """

    def _cleaned(key):
        # ``dict.get(key, "")`` only falls back when the key is absent; a
        # record with an explicit None (e.g. a JSON null) would otherwise
        # hand None to clean_text. Normalise it to "" first.
        raw = paper.get(key)
        return clean_text("" if raw is None else raw)

    title = _cleaned("title")
    abstract = _cleaned("abstract")
    full_text = _cleaned("full_text")

    # Skip segmentation entirely when there is no body text.
    paragraphs = segment_paragraphs(full_text) if full_text else []

    samples: List[Dict] = []

    # Title + abstract sample.
    if title or abstract:
        samples.append(
            {
                **paper,
                "title": title,
                "abstract": abstract,
                "full_text": "",
                "section": "abstract",
            }
        )

    # Paragraph samples: each repeats the title so it keeps its context.
    samples.extend(
        {
            **paper,
            "title": title,
            "abstract": "",
            "full_text": para,
            "section": "paragraph",
        }
        for para in paragraphs
    )
    return samples