Nexa_Data_Studio / Tokenization /pretraining /Instruction_formatter.py
Allanatrix's picture
Upload 50 files
ef4c8c3 verified
raw
history blame contribute delete
642 Bytes
# Tokenization/pretraining/instruction_formatter.py
class InstructionFormatter:
    """Normalize instruction-tuning samples into a fixed three-field dict."""

    # Fields copied into the formatted sample, in output order.
    _FIELDS = ("instruction", "input", "output")

    @staticmethod
    def format_sample(sample):
        """
        Format a sample dict with 'instruction', 'input', and 'output' fields.

        This is a placeholder; customize as needed for your data.

        Args:
            sample: Mapping-like object exposing ``.get(key)``. Keys other
                than 'instruction', 'input', and 'output' are ignored.

        Returns:
            A new dict with exactly the keys 'instruction', 'input', and
            'output', each mapped to a string with leading and trailing
            whitespace removed. A field that is missing or ``None`` becomes
            an empty string; any other non-string value is converted with
            ``str()`` before stripping.
        """
        formatted = {}
        for field in InstructionFormatter._FIELDS:
            value = sample.get(field)
            if value is None:
                # A key that is present but null would otherwise bypass a
                # .get() default and fail on .strip().
                value = ""
            elif not isinstance(value, str):
                # e.g. a numeric answer stored in 'output'.
                value = str(value)
            formatted[field] = value.strip()
        return formatted