File size: 2,832 Bytes
5301c48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# Complete prompts: every value is a full template that needs no additional
# header/footer text wrapped around it.
#
# The template below uses Jinja-style markup: {{ name }} substitutions and
# {% if name %} ... {% endif %} sections. `user_instruction` is the only
# variable referenced unconditionally; `good_examples`, `bad_examples`,
# `duplicate_examples` and `topic` each sit inside their own {% if %} guard.
# NOTE(review): presumably rendered with a Jinja-compatible engine -- confirm
# against the caller.
_DATA_GEN_TEMPLATE = """
You are a data generation expert. Your primary objective is to create
high-quality synthetic data that strictly adheres to the provided guidelines.

The user has provided specific instructions for data generation.
    - Carefully analyze the given instructions.
    - Ensure the generated data aligns with the specified requirements.
    - Maintain accuracy, coherence, and logical consistency.
user_instruction: {{user_instruction}}

{% if good_examples %}
The user has provided high-quality reference examples.
    - Identify patterns, structures, and key characteristics from these examples.
    - Generate data that maintains a similar style, quality, and relevance.
    - Ensure variations while preserving meaningful consistency.
    good_examples: {{good_examples}}
{% endif %}

{% if bad_examples %}
The following examples represent poor-quality data.
    - Avoid replicating errors, inconsistencies, or undesirable patterns.
    - Ensure generated data is free from the flaws present in these examples.
    bad_examples: {{bad_examples}}
{% endif %}

{% if duplicate_examples %}
The user has specified examples that should not be duplicated.
    - Ensure the generated data remains unique and does not replicate these examples.
    - Introduce meaningful variations while maintaining quality and consistency.
    duplicate_examples: {{duplicate_examples}}
{% endif %}

{% if topic %}
The generated data should be contextually relevant to the given topic: '{{topic}}'.
    - Maintain thematic consistency.
    - Ensure factual accuracy where applicable.
{% endif %}

Generate unique and high-quality data points.
- Ensure diversity in the dataset while maintaining coherence.
- Avoid redundant or repetitive entries.
"""

# Lookup table keyed by prompt name.
COMPLETE_PROMPTS = {"data_gen": _DATA_GEN_TEMPLATE}

# Partial prompts: fixed opening and closing fragments that are meant to be
# combined with user-provided content.
#
# The header has no leading or trailing newline; the footer starts with a
# newline and keeps the indentation it has in this file.
# NOTE(review): the footer's first line has 7 leading spaces, the following
# lines have 8, and it ends with 8 trailing spaces -- all of it is part of the
# runtime text; confirm whether that is intentional before normalising it.
_DATA_GEN_HEADER = """You are a data generation expert. Your primary objective is to create
high-quality synthetic data that strictly adheres to the provided guidelines."""

_DATA_GEN_FOOTER = """
       Generate unique and high-quality data points.
        - Ensure diversity in the dataset while maintaining coherence.
        - Avoid redundant or repetitive entries.
        """

# Lookup table keyed by prompt name, then by fragment ("header" / "footer").
PARTIAL_PROMPTS = {
    "data_gen": {
        "header": _DATA_GEN_HEADER,
        "footer": _DATA_GEN_FOOTER,
    },
}

# QA pair generation prompt.
#
# Placeholders: {num_pairs} and {text} are single-brace fields; the JSON
# example in the middle doubles its braces ({{ and }}).
# NOTE(review): presumably filled in with
# str.format(num_pairs=..., text=...), where the doubled braces would render
# as literal { and } -- confirm against the caller.
#
# The text is indented inside the literal and some visually blank lines hold
# spaces; that whitespace is part of the runtime string, so avoid letting an
# editor or formatter strip it unless the change is intended.
qa_generation = """
    Create {num_pairs} question-answer pairs from this text for LLM training.
    
    Rules:
    1. Questions must be about important facts in the text
    2. Answers must be directly supported by the text
    3. Return JSON format only:
    
    [
      {{
        "question": "Question 1?",
        "answer": "Answer 1."
      }},
      {{
        "question": "Question 2?",
        "answer": "Answer 2."
      }}
    ]
    
    Text:
    {text}

"""