File size: 642 Bytes
ef4c8c3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# Tokenization/pretraining/instruction_formatter.py

class InstructionFormatter:
    """Normalize instruction-style samples into a fixed three-field dict."""

    @staticmethod
    def format_sample(sample):
        """Return a new dict holding the stripped text fields of ``sample``.

        This is a placeholder; customize as needed for your data.

        Args:
            sample: Mapping that may contain the keys ``"instruction"``,
                ``"input"`` and ``"output"``. Any other keys are ignored.

        Returns:
            A dict with exactly the keys ``"instruction"``, ``"input"`` and
            ``"output"``. Each value is a string with leading and trailing
            whitespace removed. A key that is missing, or whose value is
            ``None``, yields ``""``; a non-string value is converted with
            ``str()`` before stripping.
        """
        formatted = {}
        for field in ("instruction", "input", "output"):
            value = sample.get(field)
            if value is None:
                # dict.get's default only covers *absent* keys; an explicit
                # None (e.g. JSON null) must be handled here or .strip()
                # would raise AttributeError.
                text = ""
            elif isinstance(value, str):
                text = value
            else:
                # Non-string payloads (numbers, bools, ...) are rendered as
                # text so the result is always str -> str.
                text = str(value)
            formatted[field] = text.strip()
        return formatted