Commit ed6b901 · Mac Huang committed
1 parent: 43864c1

Push full project including dataset, code, and training scripts

Files changed (47)
  1. PROJECT_STRUCTURE.md +228 -0
  2. dataset/linux_bugfix_100k.jsonl +3 -0
  3. dataset/linux_bugfix_prompt_completion.jsonl +3 -0
  4. dataset_builder/build_dataset.py +160 -0
  5. dataset_builder/build_dataset_demo.py +157 -0
  6. dataset_builder/convert_to_prompt_completion.py +41 -0
  7. evaluate/__pycache__/evaluate.cpython-312.pyc +0 -0
  8. evaluate/eval.jsonl +3 -0
  9. evaluate/evaluate.py +87 -0
  10. evaluate/output/.eval_results.csv.swp +0 -0
  11. evaluate/output/eval_results.csv +3 -0
  12. evaluate/output/eval_results.json +3 -0
  13. requirements.txt +182 -0
  14. train/download_model.py +6 -0
  15. train/output/qlora-codellama-bugfix/README.md +207 -0
  16. train/output/qlora-codellama-bugfix/adapter_config.json +3 -0
  17. train/output/qlora-codellama-bugfix/adapter_model.safetensors +3 -0
  18. train/output/qlora-codellama-bugfix/chat_template.jinja +1 -0
  19. train/output/qlora-codellama-bugfix/checkpoint-1000/README.md +207 -0
  20. train/output/qlora-codellama-bugfix/checkpoint-1000/adapter_config.json +3 -0
  21. train/output/qlora-codellama-bugfix/checkpoint-1000/adapter_model.safetensors +3 -0
  22. train/output/qlora-codellama-bugfix/checkpoint-1000/chat_template.jinja +1 -0
  23. train/output/qlora-codellama-bugfix/checkpoint-1000/optimizer.pt +3 -0
  24. train/output/qlora-codellama-bugfix/checkpoint-1000/rng_state.pth +3 -0
  25. train/output/qlora-codellama-bugfix/checkpoint-1000/scheduler.pt +3 -0
  26. train/output/qlora-codellama-bugfix/checkpoint-1000/special_tokens_map.json +3 -0
  27. train/output/qlora-codellama-bugfix/checkpoint-1000/tokenizer.json +3 -0
  28. train/output/qlora-codellama-bugfix/checkpoint-1000/tokenizer_config.json +3 -0
  29. train/output/qlora-codellama-bugfix/checkpoint-1000/trainer_state.json +3 -0
  30. train/output/qlora-codellama-bugfix/checkpoint-1000/training_args.bin +3 -0
  31. train/output/qlora-codellama-bugfix/checkpoint-500/README.md +207 -0
  32. train/output/qlora-codellama-bugfix/checkpoint-500/adapter_config.json +3 -0
  33. train/output/qlora-codellama-bugfix/checkpoint-500/adapter_model.safetensors +3 -0
  34. train/output/qlora-codellama-bugfix/checkpoint-500/chat_template.jinja +1 -0
  35. train/output/qlora-codellama-bugfix/checkpoint-500/optimizer.pt +3 -0
  36. train/output/qlora-codellama-bugfix/checkpoint-500/rng_state.pth +3 -0
  37. train/output/qlora-codellama-bugfix/checkpoint-500/scheduler.pt +3 -0
  38. train/output/qlora-codellama-bugfix/checkpoint-500/special_tokens_map.json +3 -0
  39. train/output/qlora-codellama-bugfix/checkpoint-500/tokenizer.json +3 -0
  40. train/output/qlora-codellama-bugfix/checkpoint-500/tokenizer_config.json +3 -0
  41. train/output/qlora-codellama-bugfix/checkpoint-500/trainer_state.json +3 -0
  42. train/output/qlora-codellama-bugfix/checkpoint-500/training_args.bin +3 -0
  43. train/output/qlora-codellama-bugfix/special_tokens_map.json +3 -0
  44. train/output/qlora-codellama-bugfix/tokenizer.json +3 -0
  45. train/output/qlora-codellama-bugfix/tokenizer_config.json +3 -0
  46. train/train.py +143 -0
  47. train/train_codellama_qlora.py +96 -0
PROJECT_STRUCTURE.md ADDED
@@ -0,0 +1,228 @@
+ # Linux Kernel Anti-Pattern Detector - Project Structure
+
+ ## Overview
+
+ This project is organized into a clear, maintainable structure that separates concerns and makes it easy to find, modify, and extend functionality.
+
+ ## Directory Structure
+
+ ```
+ Linux Kernel Anti-Pattern Detector/
+ ├── 📁 data/                                # Analysis data and results
+ │   ├── results.json                        # Main analysis results
+ │   ├── concurrency_analysis_report.json
+ │   └── kernel_analysis.log                 # Analysis logs
+
+ ├── 📁 docs/                                # Documentation
+ │   ├── kernel-analysis-guide.md            # Kernel analysis documentation
+ │   └── [additional documentation]
+
+ ├── 📁 examples/                            # Example code and usage
+ │   └── [example files]
+
+ ├── 📁 reports/                             # Generated analysis reports
+ │   ├── Linux_Kernel_Anti_Pattern_Analysis_Report.md
+ │   ├── Executive_Summary.md
+ │   └── 📁 concurrency/                     # Concurrency-specific reports
+ │       └── Concurrency_Analysis_Report.md
+
+ ├── 📁 scripts/                             # Analysis and utility scripts
+ │   ├── 📁 analysis/                        # Core analysis scripts
+ │   │   ├── concurrency_analyzer.py         # Concurrency issue analyzer
+ │   │   └── analyze_kernel_structure.py
+ │   ├── 📁 reporting/                       # Report generation scripts
+ │   │   └── view_results.py                 # Results viewer
+ │   └── 📁 utils/                           # Utility scripts
+ │       └── quick_summary.py                # Quick summary generator
+
+ ├── 📁 src/                                 # Source code (main project)
+ │   ├── __init__.py
+ │   ├── 📁 detectors/                       # Anti-pattern detection modules
+ │   ├── 📁 rules/                           # Detection rules and patterns
+ │   └── 📁 utils/                           # Utility functions
+
+ ├── 📁 tests/                               # Test files
+ │   └── [test files]
+
+ ├── 📁 tools/                               # Analysis tools and detectors
+ │   ├── 📁 detectors/                       # Main detection tools
+ │   │   ├── detector.py                     # Main anti-pattern detector
+ │   │   └── config.yaml                     # Detection configuration
+ │   ├── 📁 visualizers/                     # Data visualization tools
+ │   └── 📁 exporters/                       # Data export tools
+
+ ├── 📁 linux/                               # Linux kernel source (cloned)
+ │   └── [kernel source files]
+
+ ├── 📄 README.md                            # Main project documentation
+ ├── 📄 requirements.txt                     # Main project dependencies
+ ├── 📄 requirements-kernel-analysis.txt
+ ├── 📄 requirements-simple.txt
+ ├── 📄 .gitignore                           # Git ignore rules
+ └── 📄 PROJECT_STRUCTURE.md                 # This file
+ ```
+
+ ## Directory Descriptions
+
+ ### 📁 data/
+ Contains all analysis results, logs, and generated data files.
+ - **results.json**: Complete analysis results from the main detector
+ - **concurrency_analysis_report.json**: Detailed concurrency analysis
+ - **kernel_analysis.log**: Analysis execution logs
+
+ ### 📁 docs/
+ Project documentation and guides.
+ - **kernel-analysis-guide.md**: Comprehensive guide for kernel analysis
+ - Additional documentation for specific features
+
+ ### 📁 examples/
+ Example code, usage patterns, and sample data.
+ - Example kernel modules for testing
+ - Sample configuration files
+ - Usage examples
+
+ ### 📁 reports/
+ Generated analysis reports in various formats.
+ - **Linux_Kernel_Anti_Pattern_Analysis_Report.md**: Complete technical report
+ - **Executive_Summary.md**: High-level summary for stakeholders
+ - **concurrency/**: Specialized reports for specific issue types
+
+ ### 📁 scripts/
+ Analysis and utility scripts organized by function.
+
+ #### 📁 analysis/
+ Core analysis scripts for different types of anti-patterns.
+ - **concurrency_analyzer.py**: Specialized concurrency issue analysis
+ - **analyze_kernel_structure.py**: Kernel structure analysis
+
+ #### 📁 reporting/
+ Scripts for generating and viewing reports.
+ - **view_results.py**: Interactive results viewer and reporter
+
+ #### 📁 utils/
+ Utility scripts for common tasks.
+ - **quick_summary.py**: Quick summary generation
+
+ ### 📁 src/
+ Main project source code (core framework).
+ - **detectors/**: Anti-pattern detection modules
+ - **rules/**: Detection rules and pattern definitions
+ - **utils/**: Utility functions and helpers
+
+ ### 📁 tests/
+ Test files and test data.
+ - Unit tests for detection modules
+ - Integration tests
+ - Test data and fixtures
+
+ ### 📁 tools/
+ Analysis tools and detectors.
+
+ #### 📁 detectors/
+ Main detection tools and configurations.
+ - **detector.py**: Primary anti-pattern detection engine
+ - **config.yaml**: Detection configuration and rules
+
+ #### 📁 visualizers/
+ Data visualization and charting tools.
+ - Interactive dashboards
+ - Chart generators
+ - Data plotting utilities
+
+ #### 📁 exporters/
+ Data export and format conversion tools.
+ - JSON to other formats
+ - Report generation
+ - Data transformation
+
+ ### 📁 linux/
+ Cloned Linux kernel source code for analysis.
+ - Complete kernel source tree
+ - Used for code snippet extraction
+ - Reference for pattern validation
+
+ ## File Descriptions
+
+ ### Core Files
+ - **README.md**: Main project documentation and getting started guide
+ - **requirements.txt**: Main project Python dependencies
+ - **requirements-kernel-analysis.txt**: Kernel analysis specific dependencies
+ - **requirements-simple.txt**: Simplified dependencies for basic usage
+ - **.gitignore**: Git ignore patterns for the project
+
+ ### Configuration Files
+ - **tools/detectors/config.yaml**: Main detection configuration
+ - **tools/detectors/detector.py**: Primary detection engine
+
+ ## Usage Patterns
+
+ ### Running Analysis
+ ```bash
+ # Main analysis
+ python tools/detectors/detector.py --clone --output data/results.json
+
+ # Concurrency analysis
+ python scripts/analysis/concurrency_analyzer.py
+
+ # View results
+ python scripts/reporting/view_results.py data/results.json
+ ```
+
+ ### Generating Reports
+ ```bash
+ # Quick summary
+ python scripts/utils/quick_summary.py
+
+ # Interactive viewer
+ python scripts/reporting/view_results.py --interactive
+ ```
+
+ ### Development
+ ```bash
+ # Install dependencies
+ pip install -r requirements.txt
+ pip install -r requirements-kernel-analysis.txt
+
+ # Run tests
+ python -m pytest tests/
+
+ # Development setup
+ conda activate linux-kernel-anti-pattern-detector
+ ```
+
+ ## Best Practices
+
+ ### Adding New Features
+ 1. **Analysis scripts**: Add to `scripts/analysis/`
+ 2. **Reporting tools**: Add to `scripts/reporting/`
+ 3. **Utilities**: Add to `scripts/utils/`
+ 4. **Core detection**: Add to `src/detectors/`
+ 5. **Configuration**: Update `tools/detectors/config.yaml`
+
+ ### File Naming Conventions
+ - **Python files**: snake_case (e.g., `concurrency_analyzer.py`)
+ - **Configuration files**: kebab-case (e.g., `kernel-analysis-guide.md`)
+ - **Reports**: Pascal_Case (e.g., `Concurrency_Analysis_Report.md`)
+
+ ### Data Management
+ - **Raw data**: Store in `data/`
+ - **Processed results**: Store in `data/`
+ - **Reports**: Generate in `reports/`
+ - **Logs**: Store in `data/`
+
+ ## Maintenance
+
+ ### Regular Tasks
+ 1. **Update dependencies**: Review and update requirements files
+ 2. **Clean data**: Remove old analysis results periodically
+ 3. **Update kernel**: Refresh the Linux kernel source
+ 4. **Backup reports**: Archive important analysis reports
+
+ ### Version Control
+ - **Track**: Source code, configuration, documentation
+ - **Ignore**: Analysis results, logs, kernel source (large files)
+ - **Archive**: Important reports and findings
+
+ ---
+
+ *This structure is designed to be scalable, maintainable, and easy to navigate. Each directory has a clear purpose and the organization supports both development and research workflows.*
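Since `tools/detectors/config.yaml` is the single configuration surface named above, a minimal loading sketch may help orient new contributors. The actual keys inside config.yaml are not shown in this commit, so the `rules` layout here is purely hypothetical:

```python
import yaml  # PyYAML, pinned in requirements.txt

# Hypothetical layout: the real keys in tools/detectors/config.yaml are not
# part of this commit, so treat this strictly as an illustration.
with open("tools/detectors/config.yaml", encoding="utf-8") as f:
    config = yaml.safe_load(f)

for rule in config.get("rules", []):  # assumed top-level key
    print(rule.get("name"), rule.get("severity"))
```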
dataset/linux_bugfix_100k.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ef9110bf645ceb42b2a6de21a952f82448ed7cfa31ac383c1c62fb9a840b9574
+ size 241507688
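The file above is a Git LFS pointer; the ~240 MB JSONL payload only arrives after `git lfs pull`. Assuming it shares the schema produced by `dataset_builder/build_dataset.py` below, a minimal streaming sketch:

```python
import json

# Stream dataset/linux_bugfix_100k.jsonl one record per line (fetch it via
# `git lfs pull` first). Field names follow build_dataset.py; whether the
# 100k file uses exactly this schema is an assumption.
with open("dataset/linux_bugfix_100k.jsonl", encoding="utf-8") as f:
    for i, line in enumerate(f):
        record = json.loads(line)
        code = record["input"]["original code"]       # focused C context
        instruction = record["input"]["instruction"]  # commit-message summary
        diff = record["output"]["diff codes"]         # bug-fixing diff hunk
        if i >= 2:  # peek at the first few records only
            break
```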
dataset/linux_bugfix_prompt_completion.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:621b0b7e04c5b359ea28eec9bf165aad2eaea77b07b834b0c1b7de37202e7992
+ size 2184324531
dataset_builder/build_dataset.py ADDED
@@ -0,0 +1,160 @@
+ from pydriller import Repository
+ import os
+ import json
+ from tqdm import tqdm
+ import re
+
+ REPO_PATH = '../linux'
+ OUTPUT_FILE = './output/linux_bugfix_dataset.jsonl'
+
+ TEST_MODE = False  # Set to False to process the full repository
+ MAX_COMMITS_TEST = 50  # Commit limit applied when TEST_MODE is True
+
+ BUGFIX_KEYWORDS = [
+     'fix', 'bug', 'leak', 'null', 'overflow', 'error', 'failure',
+     'crash', 'panic', 'memory', 'race', 'deadlock', 'corruption',
+     'security', 'vulnerability', 'exploit', 'buffer', 'stack'
+ ]
+
+ def is_bugfix_commit(msg):
+     msg_lower = msg.lower()
+     return any(keyword in msg_lower for keyword in BUGFIX_KEYWORDS)
+
+ def extract_instruction_from_commit_msg(msg):
+     lines = msg.strip().splitlines()
+     for line in lines:
+         line = line.strip()
+         if len(line) < 5 or not any(c.isalpha() for c in line):
+             continue
+         if line.lower().startswith((
+             '[patch]', 'signed-off-by', 'reviewed-by', 'tested-by', 'ack',
+             'reported-by', 'cc:', 'co-authored-by', 'patchwork-id',
+             'suggested-by', 'fixes:', 'link:', 'cherry picked from commit'
+         )):
+             continue
+         return line
+     return msg.strip().splitlines()[0] if msg.strip() else "fix"
+
+ def extract_code_context(code, line_number, context_lines=10):
+     if not code:
+         return ""
+     lines = code.split('\n')
+     start = max(0, line_number - context_lines)
+     end = min(len(lines), line_number + context_lines)
+     return '\n'.join(lines[start:end])
+
+ def extract_diff_context(diff_text, context_lines=5):
+     if not diff_text:
+         return ""
+     lines = diff_text.split('\n')
+     change_lines = [i for i, line in enumerate(lines) if line.startswith('+') or line.startswith('-')]
+     if not change_lines:
+         return diff_text
+     start = max(0, change_lines[0] - context_lines)
+     end = min(len(lines), change_lines[-1] + context_lines + 1)
+     return '\n'.join(lines[start:end])
+
+ def create_dataset_entry(original_code, commit_msg, diff_code):
+     return {
+         "input": {
+             "original code": original_code.strip(),
+             "instruction": extract_instruction_from_commit_msg(commit_msg)
+         },
+         "output": {
+             "diff codes": diff_code.strip()
+         }
+     }
+
+ def process_commit(commit):
+     entries = []
+     if not is_bugfix_commit(commit.msg):
+         return entries
+
+     for mod in commit.modified_files:
+         if not mod.new_path or not mod.new_path.endswith(('.c', '.h')):
+             continue
+         if mod.change_type.name != "MODIFY":
+             continue
+         if not mod.diff or not mod.source_code_before:
+             continue
+
+         focused_diff = extract_diff_context(mod.diff)
+
+         diff_lines = mod.diff.split('\n')
+         line_numbers = []
+         for line in diff_lines:
+             if line.startswith('@@'):
+                 match = re.search(r'@@ -(\d+),?\d* \+\d+,?\d* @@', line)
+                 if match:
+                     line_numbers.append(int(match.group(1)))
+
+         if line_numbers:
+             focused_code = extract_code_context(mod.source_code_before, line_numbers[0])
+         else:
+             focused_code = '\n'.join(mod.source_code_before.split('\n')[:50])
+
+         entry = create_dataset_entry(
+             original_code=focused_code,
+             commit_msg=commit.msg,
+             diff_code=focused_diff
+         )
+         entries.append(entry)
+
+     return entries
+
+ def main():
+     if not os.path.exists(REPO_PATH):
+         print(f"❌ Repository not found at: {REPO_PATH}")
+         return
+
+     os.makedirs('./output', exist_ok=True)
+
+     print("🔍 Building Linux kernel bug-fix dataset...")
+     print(f"📁 Repository: {REPO_PATH}")
+     print(f"📎 Output: {OUTPUT_FILE}")
+
+     output_file = OUTPUT_FILE.replace('.jsonl', '_test.jsonl') if TEST_MODE else OUTPUT_FILE
+
+     repo = Repository(REPO_PATH)
+     dataset_entries = []
+     processed_commits = 0
+     total_commits = 0
+     bugfix_commits = 0
+
+     for commit in tqdm(repo.traverse_commits(), desc="Processing commits"):
+         total_commits += 1
+         if TEST_MODE and MAX_COMMITS_TEST and total_commits > MAX_COMMITS_TEST:
+             break
+         if is_bugfix_commit(commit.msg):
+             bugfix_commits += 1
+             entries = process_commit(commit)
+             if entries:
+                 dataset_entries.extend(entries)
+                 processed_commits += 1
+                 if TEST_MODE:
+                     print(f"\n🔍 Bug-fix commit {processed_commits}: {commit.hash[:8]}")
+                     print(f"📝 Message: {extract_instruction_from_commit_msg(commit.msg)}")
+                     print(f"📊 Files: {len(entries)} entries extracted")
+                     print(f"📁 Files: {[mod.new_path for mod in commit.modified_files if mod.new_path and mod.new_path.endswith(('.c', '.h'))]}")
+
+     with open(output_file, 'w', encoding='utf-8') as f:
+         for entry in dataset_entries:
+             f.write(json.dumps(entry, ensure_ascii=False) + '\n')
+
+     print(f"\n✅ Dataset creation completed!")
+     print(f"📊 Total commits processed: {total_commits}")
+     print(f"🐛 Bug-fix commits found: {bugfix_commits}")
+     print(f"📝 Commits with valid entries: {processed_commits}")
+     print(f"📝 Total dataset entries: {len(dataset_entries)}")
+     print(f"📎 Saved to: {output_file}")
+
+     if dataset_entries:
+         print(f"\n📋 Sample dataset entry:")
+         sample = dataset_entries[0]
+         print(json.dumps(sample, indent=2, ensure_ascii=False)[:800] + "...")
+         print(f"\n📁 Dataset structure:")
+         print(f" - Input: original code + instruction")
+         print(f" - Output: diff codes")
+         print(f" - Format: JSONL (one JSON object per line)")
+
+ if __name__ == "__main__":
+     main()
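As a quick sanity check of the hunk parsing in `process_commit`, here is the same regex applied to a toy diff (the C snippet is invented for illustration):

```python
import re

toy_diff = (
    "@@ -42,7 +42,7 @@ static int foo(void)\n"
    " \tif (!ptr)\n"
    "-\t\treturn 0;\n"
    "+\t\treturn -ENOMEM;\n"
    " \treturn ptr->val;"
)

# Same pattern as process_commit(): recover the old-file start line from the
# hunk header so extract_code_context() knows where to slice.
match = re.search(r'@@ -(\d+),?\d* \+\d+,?\d* @@', toy_diff.split('\n')[0])
print(match.group(1))  # -> "42"
```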
dataset_builder/build_dataset_demo.py ADDED
@@ -0,0 +1,157 @@
+ from pydriller import Repository
+ import os
+ import json
+ from tqdm import tqdm
+ import re
+ from multiprocessing import Pool
+
+ REPO_PATH = '../linux'
+ OUTPUT_FILE = './output/linux_bugfix_dataset.jsonl'
+
+ TEST_MODE = False  # Set to False to process the full repository
+ MAX_COMMITS_TEST = 50  # Set a limit if TEST_MODE is True
+ NUM_WORKERS = 16  # Adjust to your actual core count
+
+ BUGFIX_KEYWORDS = [
+     'fix', 'bug', 'leak', 'null', 'overflow', 'error', 'failure',
+     'crash', 'panic', 'memory', 'race', 'deadlock', 'corruption',
+     'security', 'vulnerability', 'exploit', 'buffer', 'stack'
+ ]
+
+ def is_bugfix_commit(msg):
+     msg_lower = msg.lower()
+     return any(keyword in msg_lower for keyword in BUGFIX_KEYWORDS)
+
+ def extract_instruction_from_commit_msg(msg):
+     lines = msg.strip().splitlines()
+     for line in lines:
+         line = line.strip()
+         if len(line) < 5 or not any(c.isalpha() for c in line):
+             continue
+         if line.lower().startswith((
+             '[patch]', 'signed-off-by', 'reviewed-by', 'tested-by', 'ack',
+             'reported-by', 'cc:', 'co-authored-by', 'patchwork-id',
+             'suggested-by', 'fixes:', 'link:', 'cherry picked from commit'
+         )):
+             continue
+         return line
+     return msg.strip().splitlines()[0] if msg.strip() else "fix"
+
+ def extract_code_context(code, line_number, context_lines=10):
+     if not code:
+         return ""
+     lines = code.split('\n')
+     start = max(0, line_number - context_lines)
+     end = min(len(lines), line_number + context_lines)
+     return '\n'.join(lines[start:end])
+
+ def extract_diff_context(diff_text, context_lines=5):
+     if not diff_text:
+         return ""
+     lines = diff_text.split('\n')
+     change_lines = [i for i, line in enumerate(lines) if line.startswith('+') or line.startswith('-')]
+     if not change_lines:
+         return diff_text
+     start = max(0, change_lines[0] - context_lines)
+     end = min(len(lines), change_lines[-1] + context_lines + 1)
+     return '\n'.join(lines[start:end])
+
+ def create_dataset_entry(original_code, commit_msg, diff_code):
+     return {
+         "input": {
+             "original code": original_code.strip(),
+             "instruction": extract_instruction_from_commit_msg(commit_msg)
+         },
+         "output": {
+             "diff codes": diff_code.strip()
+         }
+     }
+
+ def process_commit(commit):
+     entries = []
+     if not is_bugfix_commit(commit.msg):
+         return entries
+
+     for mod in commit.modified_files:
+         if not mod.new_path or not mod.new_path.endswith(('.c', '.h')):
+             continue
+         if mod.change_type.name != "MODIFY":
+             continue
+         if not mod.diff or not mod.source_code_before:
+             continue
+
+         focused_diff = extract_diff_context(mod.diff)
+
+         diff_lines = mod.diff.split('\n')
+         line_numbers = []
+         for line in diff_lines:
+             if line.startswith('@@'):
+                 match = re.search(r'@@ -(\d+),?\d* \+\d+,?\d* @@', line)
+                 if match:
+                     line_numbers.append(int(match.group(1)))
+
+         if line_numbers:
+             focused_code = extract_code_context(mod.source_code_before, line_numbers[0])
+         else:
+             focused_code = '\n'.join(mod.source_code_before.split('\n')[:50])
+
+         entry = create_dataset_entry(
+             original_code=focused_code,
+             commit_msg=commit.msg,
+             diff_code=focused_diff
+         )
+         entries.append(entry)
+
+     return entries
+
+ def collect_entries_from_hash(commit_hash):
+     try:
+         commit = next(Repository(REPO_PATH, only_commits=[commit_hash]).traverse_commits())
+         return process_commit(commit)
+     except Exception:
+         return []
+
+ def main():
+     if not os.path.exists(REPO_PATH):
+         print("[ERROR] Repository not found at:", REPO_PATH)
+         return
+
+     os.makedirs('./output', exist_ok=True)
+
+     print("[INFO] Building Linux kernel bug-fix dataset...")
+     print("[INFO] Repository:", REPO_PATH)
+     print("[INFO] Output file:", OUTPUT_FILE)
+
+     output_file = OUTPUT_FILE.replace('.jsonl', '_test.jsonl') if TEST_MODE else OUTPUT_FILE
+
+     all_hashes = [c.hash for c in Repository(REPO_PATH).traverse_commits()]
+     if TEST_MODE and MAX_COMMITS_TEST:
+         all_hashes = all_hashes[:MAX_COMMITS_TEST]
+
+     dataset_entries = []
+     with Pool(NUM_WORKERS) as pool:
+         results = list(tqdm(pool.imap_unordered(collect_entries_from_hash, all_hashes), total=len(all_hashes)))
+
+     for entries in results:
+         dataset_entries.extend(entries)
+
+     with open(output_file, 'w', encoding='utf-8') as f:
+         for entry in dataset_entries:
+             f.write(json.dumps(entry, ensure_ascii=False) + '\n')
+
+     print("[DONE] Dataset creation completed!")
+     print("[INFO] Total commits processed:", len(all_hashes))
+     print("[INFO] Total dataset entries:", len(dataset_entries))
+     print("[INFO] Saved to:", output_file)
+
+     if dataset_entries:
+         print("[INFO] Sample dataset entry:")
+         sample = dataset_entries[0]
+         print(json.dumps(sample, indent=2, ensure_ascii=False)[:800] + "...")
+         print("[INFO] Dataset structure:")
+         print(" - Input: original code + instruction")
+         print(" - Output: diff codes")
+         print(" - Format: JSONL (one JSON object per line)")
+
+ if __name__ == "__main__":
+     main()
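One design note on this parallel variant: `collect_entries_from_hash` opens a fresh `Repository` per commit, which keeps workers fully independent at the cost of repeated repository setup. If that overhead dominates, a hedged alternative (same pydriller API; the batch size of 200 is an arbitrary choice) is to hand each worker a slice of hashes:

```python
# Sketch: one Repository per batch instead of one per commit. Reuses
# REPO_PATH and process_commit() from the script above.
def collect_entries_from_batch(hash_batch):
    entries = []
    for commit in Repository(REPO_PATH, only_commits=list(hash_batch)).traverse_commits():
        entries.extend(process_commit(commit))
    return entries

# with Pool(NUM_WORKERS) as pool:
#     batches = [all_hashes[i:i + 200] for i in range(0, len(all_hashes), 200)]
#     results = list(pool.imap_unordered(collect_entries_from_batch, batches))
```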
dataset_builder/convert_to_prompt_completion.py ADDED
@@ -0,0 +1,41 @@
+ import json
+
+ INPUT_FILE = './output/linux_bugfix_dataset.jsonl'
+ OUTPUT_FILE = './output/linux_bugfix_prompt_completion.jsonl'
+
+ def format_prompt(original_code, instruction):
+     return (
+         "Given the following original C code:\n"
+         f"{original_code.strip()}\n\n"
+         "Instruction:\n"
+         f"{instruction.strip()}\n\n"
+         "Return the diff that fixes it:\n"
+     )
+
+ def format_completion(diff_code):
+     return diff_code.strip()
+
+ def convert_dataset(input_path, output_path):
+     with open(input_path, 'r', encoding='utf-8') as fin, \
+          open(output_path, 'w', encoding='utf-8') as fout:
+
+         for line in fin:
+             data = json.loads(line)
+             original_code = data["input"]["original code"]
+             instruction = data["input"]["instruction"]
+             diff_code = data["output"]["diff codes"]
+
+             prompt = format_prompt(original_code, instruction)
+             completion = format_completion(diff_code)
+
+             new_entry = {
+                 "prompt": prompt,
+                 "completion": completion
+             }
+
+             fout.write(json.dumps(new_entry, ensure_ascii=False) + '\n')
+
+     print(f"[DONE] Converted dataset saved to: {output_path}")
+
+ if __name__ == "__main__":
+     convert_dataset(INPUT_FILE, OUTPUT_FILE)
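To make the resulting format concrete, here is `format_prompt` applied to a tiny invented record (run from `dataset_builder/` so the import resolves):

```python
from convert_to_prompt_completion import format_prompt

# Invented example values, matching the builder's schema.
prompt = format_prompt("if (!buf)\n    return;", "fix NULL check to propagate -ENOMEM")
print(prompt)
# Given the following original C code:
# if (!buf)
#     return;
#
# Instruction:
# fix NULL check to propagate -ENOMEM
#
# Return the diff that fixes it:
```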
evaluate/__pycache__/evaluate.cpython-312.pyc ADDED
Binary file (3.93 kB).
evaluate/eval.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0775fb198ac76efb0313376b80d43db691701b1ed52a524520bd38490c123242
+ size 1758795
evaluate/evaluate.py ADDED
@@ -0,0 +1,87 @@
+ import sys
+ import os
+
+ # This script shares its filename with the Hugging Face `evaluate` library;
+ # drop the script's own directory from sys.path so `import evaluate` below
+ # resolves to the installed package rather than to this file.
+ _here = os.path.dirname(os.path.abspath(__file__))
+ sys.path = [p for p in sys.path if os.path.abspath(p or os.getcwd()) != _here]
+
+ import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ from datasets import load_dataset
+ from tqdm import tqdm
+ import json
+ import csv
+ import evaluate
+
+ # ==== CONFIG ====
+ MODEL_PATH = "../train/output/qlora-codellama-bugfix"
+ EVAL_FILE = "eval.jsonl"
+ OUTPUT_JSON = "./output/eval_results.json"
+ OUTPUT_CSV = "./output/eval_results.csv"
+ MAX_INPUT_LEN = 1024
+ MAX_NEW_TOKENS = 256
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+ # ==== Ensure output folder exists ====
+ os.makedirs(os.path.dirname(OUTPUT_JSON), exist_ok=True)
+
+ # ==== Load model ====
+ print("🔄 Loading model...")
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
+ if tokenizer.pad_token_id is None:
+     # LLaMA-family tokenizers ship without a pad token; fall back to EOS.
+     tokenizer.pad_token_id = tokenizer.eos_token_id
+ model = AutoModelForCausalLM.from_pretrained(
+     MODEL_PATH,
+     torch_dtype=torch.bfloat16,
+     device_map="auto"
+ )
+ model.eval()
+
+ # ==== Load eval data ====
+ print("📂 Loading evaluation data...")
+ eval_data = load_dataset("json", data_files=EVAL_FILE, split="train")
+
+ # ==== Inference ====
+ results = []
+ print("⚙️ Running inference...")
+ for example in tqdm(eval_data):
+     prompt = example["prompt"]
+     reference = example["completion"]
+
+     inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=MAX_INPUT_LEN).to(DEVICE)
+
+     with torch.no_grad():
+         outputs = model.generate(
+             **inputs,
+             max_new_tokens=MAX_NEW_TOKENS,
+             do_sample=False,
+             num_beams=4,
+             pad_token_id=tokenizer.pad_token_id,
+             eos_token_id=tokenizer.eos_token_id
+         )
+
+     # Decode only the newly generated tokens; decoding outputs[0] wholesale
+     # would echo the prompt into the prediction and skew BLEU/ROUGE.
+     new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
+     prediction = tokenizer.decode(new_tokens, skip_special_tokens=True)
+
+     results.append({
+         "prompt": prompt,
+         "reference": reference.strip(),
+         "prediction": prediction.strip()
+     })
+
+ # ==== Save results ====
+ with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
+     json.dump(results, f, indent=2)
+ print(f"✅ Saved JSON to {OUTPUT_JSON}")
+
+ with open(OUTPUT_CSV, "w", encoding="utf-8", newline='') as f:
+     writer = csv.DictWriter(f, fieldnames=["prompt", "reference", "prediction"])
+     writer.writeheader()
+     writer.writerows(results)
+ print(f"✅ Saved CSV to {OUTPUT_CSV}")
+
+ # ==== Compute Metrics ====
+ print("📊 Computing BLEU and ROUGE...")
+ bleu = evaluate.load("bleu")
+ rouge = evaluate.load("rouge")
+
+ predictions = [r["prediction"] for r in results]
+ references = [r["reference"] for r in results]
+
+ bleu_score = bleu.compute(predictions=predictions, references=[[ref] for ref in references])
+ rouge_score = rouge.compute(predictions=predictions, references=references)
+
+ print("\n📈 Evaluation Results:")
+ print("BLEU:", bleu_score)
+ print("ROUGE:", json.dumps(rouge_score, indent=2))
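Loading `MODEL_PATH` directly works here because recent transformers releases (with peft installed) resolve an adapter directory against the `base_model` recorded in its `adapter_config.json`. An equivalent, more explicit sketch using the PEFT API (base id as in `train/download_model.py`):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

BASE = "codellama/CodeLlama-7b-Instruct-hf"
ADAPTER = "../train/output/qlora-codellama-bugfix"

# Load the frozen base model first, then attach the LoRA weights
# (adapter_model.safetensors) on top of it.
tokenizer = AutoTokenizer.from_pretrained(ADAPTER)
base = AutoModelForCausalLM.from_pretrained(BASE, torch_dtype=torch.bfloat16, device_map="auto")
model = PeftModel.from_pretrained(base, ADAPTER)
model.eval()
```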
evaluate/output/.eval_results.csv.swp ADDED
Binary file (45.1 kB).
evaluate/output/eval_results.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:20a66ee1f8b006ceb278d3363c9337f23a4ffa7e49e54c379e409342cda874fd
+ size 2836184
evaluate/output/eval_results.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7cdf1b2c1317da8ff11b303547cedd6642d5762984e1952134a11cff4ff8120e
+ size 3078110
requirements.txt ADDED
@@ -0,0 +1,182 @@
+ absl-py==2.3.0
+ accelerate==1.8.1
+ aiohappyeyeballs==2.6.1
+ aiohttp==3.12.13
+ aiosignal==1.4.0
+ anyio==4.9.0
+ argon2-cffi==25.1.0
+ argon2-cffi-bindings==21.2.0
+ arrow==1.3.0
+ asttokens==3.0.0
+ async-lru==2.0.5
+ attrs==25.3.0
+ babel==2.17.0
+ bash_kernel==0.10.0
+ beautifulsoup4==4.13.4
+ bitsandbytes==0.46.1
+ bleach==6.2.0
+ blinker==1.7.0
+ certifi==2025.6.15
+ cffi==1.17.1
+ charset-normalizer==3.4.2
+ comm==0.2.2
+ conda-pack==0.8.1
+ cryptography==41.0.7
+ datasets==3.6.0
+ dbus-python==1.3.2
+ debugpy==1.8.14
+ decorator==5.2.1
+ defusedxml==0.7.1
+ dill==0.3.8
+ distro==1.9.0
+ evaluate==0.4.4
+ executing==2.2.0
+ fastjsonschema==2.21.1
+ filelock==3.13.1
+ filetype==1.2.0
+ fqdn==1.5.1
+ frozenlist==1.7.0
+ fsspec==2024.6.1
+ grpcio==1.73.1
+ h11==0.16.0
+ hf-xet==1.1.5
+ httpcore==1.0.9
+ httplib2==0.20.4
+ httpx==0.28.1
+ huggingface-hub==0.33.2
+ idna==3.10
+ iniconfig==2.1.0
+ iotop==0.6
+ ipykernel==6.29.5
+ ipython==9.3.0
+ ipython_pygments_lexers==1.1.1
+ ipywidgets==8.1.7
+ isoduration==20.11.0
+ jedi==0.19.2
+ Jinja2==3.1.6
+ json5==0.12.0
+ jsonpointer==3.0.0
+ jsonschema==4.24.0
+ jsonschema-specifications==2025.4.1
+ jupyter==1.1.1
+ jupyter-archive==3.4.0
+ jupyter-console==6.6.3
+ jupyter-events==0.12.0
+ jupyter-http-over-ws==0.0.8
+ jupyter-lsp==2.2.5
+ jupyter_client==8.6.3
+ jupyter_core==5.8.1
+ jupyter_server==2.16.0
+ jupyter_server_terminals==0.5.3
+ jupyterlab==4.4.4
+ jupyterlab_pygments==0.3.0
+ jupyterlab_server==2.27.3
+ jupyterlab_widgets==3.0.15
+ launchpadlib==1.11.0
+ lazr.restfulclient==0.14.6
+ lazr.uri==1.0.6
+ Markdown==3.8.2
+ MarkupSafe==3.0.2
+ matplotlib-inline==0.1.7
+ mistune==3.1.3
+ mpmath==1.3.0
+ multidict==6.6.3
+ multiprocess==0.70.16
+ nbclient==0.10.2
+ nbconvert==7.16.6
+ nbformat==5.10.4
+ nbzip==0.1.0
+ nest-asyncio==1.6.0
+ networkx==3.3
+ notebook==7.4.3
+ notebook_shim==0.2.4
+ numpy==2.3.1
+ nvidia-cublas-cu12==12.8.3.14
+ nvidia-cuda-cupti-cu12==12.8.57
+ nvidia-cuda-nvrtc-cu12==12.8.61
+ nvidia-cuda-runtime-cu12==12.8.57
+ nvidia-cudnn-cu12==9.7.1.26
+ nvidia-cufft-cu12==11.3.3.41
+ nvidia-cufile-cu12==1.13.0.11
+ nvidia-curand-cu12==10.3.9.55
+ nvidia-cusolver-cu12==11.7.2.55
+ nvidia-cusparse-cu12==12.5.7.53
+ nvidia-cusparselt-cu12==0.6.3
+ nvidia-nccl-cu12==2.26.2
+ nvidia-nvjitlink-cu12==12.8.61
+ nvidia-nvtx-cu12==12.8.55
+ oauthlib==3.2.2
+ overrides==7.7.0
+ packaging==25.0
+ pandas==2.3.1
+ pandocfilters==1.5.1
+ parso==0.8.4
+ peft==0.16.0
+ pexpect==4.9.0
+ pillow==11.0.0
+ platformdirs==4.3.8
+ pluggy==1.6.0
+ prometheus_client==0.22.1
+ prompt_toolkit==3.0.51
+ propcache==0.3.2
+ protobuf==6.31.1
+ psutil==7.0.0
+ ptyprocess==0.7.0
+ pure_eval==0.2.3
+ pyarrow==20.0.0
+ pycparser==2.22
+ Pygments==2.19.2
+ PyGObject==3.48.2
+ PyJWT==2.7.0
+ pyparsing==3.1.1
+ pytest==8.4.1
+ python-apt==2.7.7+ubuntu4
+ python-dateutil==2.9.0.post0
+ python-json-logger==3.3.0
+ pytz==2025.2
+ PyYAML==6.0.2
+ pyzmq==27.0.0
+ referencing==0.36.2
+ regex==2024.11.6
+ requests==2.32.4
+ rfc3339-validator==0.1.4
+ rfc3986-validator==0.1.1
+ rpds-py==0.25.1
+ safetensors==0.5.3
+ Send2Trash==1.8.3
+ setuptools==68.1.2
+ six==1.16.0
+ sniffio==1.3.1
+ soupsieve==2.7
+ stack-data==0.6.3
+ supervisor==4.2.5
+ sympy==1.13.3
+ tensorboard==2.19.0
+ tensorboard-data-server==0.7.2
+ terminado==0.18.1
+ tinycss2==1.4.0
+ tokenizers==0.21.2
+ torch==2.7.1+cu128
+ torchaudio==2.7.1+cu128
+ torchvision==0.22.1+cu128
+ tornado==6.5.1
+ tqdm==4.67.1
+ traitlets==5.14.3
+ transformers==4.53.1
+ triton==3.3.1
+ types-python-dateutil==2.9.0.20250516
+ typing_extensions==4.14.0
+ tzdata==2025.2
+ uri-template==1.3.0
+ urllib3==2.5.0
+ uv==0.7.16
+ wadllib==1.3.6
+ wcwidth==0.2.13
+ webcolors==24.11.1
+ webencodings==0.5.1
+ websocket-client==1.8.0
+ Werkzeug==3.1.3
+ wheel==0.42.0
+ widgetsnbextension==4.0.14
+ xxhash==3.5.0
+ yarl==1.20.1
train/download_model.py ADDED
@@ -0,0 +1,6 @@
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+
+ model = AutoModelForCausalLM.from_pretrained("codellama/CodeLlama-7b-Instruct-hf")
+ tokenizer = AutoTokenizer.from_pretrained("codellama/CodeLlama-7b-Instruct-hf")
+
+ print("✅ Download complete.")
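As written, the script only warms the local Hugging Face cache (the 7B checkpoint is on the order of 13 GB). If a fixed on-disk copy is preferred over the cache, a hedged variant; `./base_model` is an arbitrary path, not something the training scripts depend on:

```python
from transformers import AutoTokenizer, AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("codellama/CodeLlama-7b-Instruct-hf")
tokenizer = AutoTokenizer.from_pretrained("codellama/CodeLlama-7b-Instruct-hf")

# Persist an explicit snapshot; "./base_model" is an arbitrary local path.
model.save_pretrained("./base_model")
tokenizer.save_pretrained("./base_model")

print("✅ Download complete.")
```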
train/output/qlora-codellama-bugfix/README.md ADDED
@@ -0,0 +1,207 @@
+ ---
+ base_model: codellama/CodeLlama-7b-Instruct-hf
+ library_name: peft
+ pipeline_tag: text-generation
+ tags:
+ - base_model:adapter:codellama/CodeLlama-7b-Instruct-hf
+ - lora
+ - transformers
+ ---
+
+ # Model Card for Model ID
+
+ <!-- Provide a quick summary of what the model is/does. -->
+
+
+
+ ## Model Details
+
+ ### Model Description
+
+ <!-- Provide a longer summary of what this model is. -->
+
+
+
+ - **Developed by:** [More Information Needed]
+ - **Funded by [optional]:** [More Information Needed]
+ - **Shared by [optional]:** [More Information Needed]
+ - **Model type:** [More Information Needed]
+ - **Language(s) (NLP):** [More Information Needed]
+ - **License:** [More Information Needed]
+ - **Finetuned from model [optional]:** [More Information Needed]
+
+ ### Model Sources [optional]
+
+ <!-- Provide the basic links for the model. -->
+
+ - **Repository:** [More Information Needed]
+ - **Paper [optional]:** [More Information Needed]
+ - **Demo [optional]:** [More Information Needed]
+
+ ## Uses
+
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+ ### Direct Use
+
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+ [More Information Needed]
+
+ ### Downstream Use [optional]
+
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+ [More Information Needed]
+
+ ### Out-of-Scope Use
+
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+ [More Information Needed]
+
+ ## Bias, Risks, and Limitations
+
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+ [More Information Needed]
+
+ ### Recommendations
+
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+ ## How to Get Started with the Model
+
+ Use the code below to get started with the model.
+
+ [More Information Needed]
+
+ ## Training Details
+
+ ### Training Data
+
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+ [More Information Needed]
+
+ ### Training Procedure
+
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+ #### Preprocessing [optional]
+
+ [More Information Needed]
+
+
+ #### Training Hyperparameters
+
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+ #### Speeds, Sizes, Times [optional]
+
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+ [More Information Needed]
+
+ ## Evaluation
+
+ <!-- This section describes the evaluation protocols and provides the results. -->
+
+ ### Testing Data, Factors & Metrics
+
+ #### Testing Data
+
+ <!-- This should link to a Dataset Card if possible. -->
+
+ [More Information Needed]
+
+ #### Factors
+
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+ [More Information Needed]
+
+ #### Metrics
+
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+ [More Information Needed]
+
+ ### Results
+
+ [More Information Needed]
+
+ #### Summary
+
+
+
+ ## Model Examination [optional]
+
+ <!-- Relevant interpretability work for the model goes here -->
+
+ [More Information Needed]
+
+ ## Environmental Impact
+
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+ - **Hardware Type:** [More Information Needed]
+ - **Hours used:** [More Information Needed]
+ - **Cloud Provider:** [More Information Needed]
+ - **Compute Region:** [More Information Needed]
+ - **Carbon Emitted:** [More Information Needed]
+
+ ## Technical Specifications [optional]
+
+ ### Model Architecture and Objective
+
+ [More Information Needed]
+
+ ### Compute Infrastructure
+
+ [More Information Needed]
+
+ #### Hardware
+
+ [More Information Needed]
+
+ #### Software
+
+ [More Information Needed]
+
+ ## Citation [optional]
+
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+ **BibTeX:**
+
+ [More Information Needed]
+
+ **APA:**
+
+ [More Information Needed]
+
+ ## Glossary [optional]
+
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+ [More Information Needed]
+
+ ## More Information [optional]
+
+ [More Information Needed]
+
+ ## Model Card Authors [optional]
+
+ [More Information Needed]
+
+ ## Model Card Contact
+
+ [More Information Needed]
+
+ ### Framework versions
+
+ - PEFT 0.16.0
train/output/qlora-codellama-bugfix/adapter_config.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:df755384ffd5359513ac7a725ba0c4d3104e6b5c131f98aee0b1cb7d160c17c0
+ size 839
train/output/qlora-codellama-bugfix/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:73618dfb9cd2422a1a6600a5cec7fc1178bfe579e0c77d827761f26eeaae7f09
+ size 134235048
train/output/qlora-codellama-bugfix/chat_template.jinja ADDED
@@ -0,0 +1 @@
+ {% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\n' + system_message + '\n<</SYS>>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}
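This template implements the Llama-2-style `[INST] ... [/INST]` turn format, folding an optional system message into `<<SYS>> ... <</SYS>>` markers at the start of the first user turn. A small rendering sketch (the messages are invented; the tokenizer path is this adapter directory):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("train/output/qlora-codellama-bugfix")

messages = [
    {"role": "system", "content": "You fix Linux kernel bugs."},
    {"role": "user", "content": "Patch the NULL dereference."},
]

# apply_chat_template executes the Jinja template above.
text = tokenizer.apply_chat_template(messages, tokenize=False)
print(text)  # <s>[INST] <<SYS>>\n...\n<</SYS>>\n\n... [/INST]
```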
train/output/qlora-codellama-bugfix/checkpoint-1000/README.md ADDED
@@ -0,0 +1,207 @@
+ ---
+ base_model: codellama/CodeLlama-7b-Instruct-hf
+ library_name: peft
+ pipeline_tag: text-generation
+ tags:
+ - base_model:adapter:codellama/CodeLlama-7b-Instruct-hf
+ - lora
+ - transformers
+ ---
+
+ # Model Card for Model ID
+
+ <!-- Provide a quick summary of what the model is/does. -->
+
+
+
+ ## Model Details
+
+ ### Model Description
+
+ <!-- Provide a longer summary of what this model is. -->
+
+
+
+ - **Developed by:** [More Information Needed]
+ - **Funded by [optional]:** [More Information Needed]
+ - **Shared by [optional]:** [More Information Needed]
+ - **Model type:** [More Information Needed]
+ - **Language(s) (NLP):** [More Information Needed]
+ - **License:** [More Information Needed]
+ - **Finetuned from model [optional]:** [More Information Needed]
+
+ ### Model Sources [optional]
+
+ <!-- Provide the basic links for the model. -->
+
+ - **Repository:** [More Information Needed]
+ - **Paper [optional]:** [More Information Needed]
+ - **Demo [optional]:** [More Information Needed]
+
+ ## Uses
+
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+ ### Direct Use
+
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+ [More Information Needed]
+
+ ### Downstream Use [optional]
+
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+ [More Information Needed]
+
+ ### Out-of-Scope Use
+
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+ [More Information Needed]
+
+ ## Bias, Risks, and Limitations
+
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+ [More Information Needed]
+
+ ### Recommendations
+
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+ ## How to Get Started with the Model
+
+ Use the code below to get started with the model.
+
+ [More Information Needed]
+
+ ## Training Details
+
+ ### Training Data
+
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+ [More Information Needed]
+
+ ### Training Procedure
+
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+ #### Preprocessing [optional]
+
+ [More Information Needed]
+
+
+ #### Training Hyperparameters
+
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+ #### Speeds, Sizes, Times [optional]
+
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+ [More Information Needed]
+
+ ## Evaluation
+
+ <!-- This section describes the evaluation protocols and provides the results. -->
+
+ ### Testing Data, Factors & Metrics
+
+ #### Testing Data
+
+ <!-- This should link to a Dataset Card if possible. -->
+
+ [More Information Needed]
+
+ #### Factors
+
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+ [More Information Needed]
+
+ #### Metrics
+
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+ [More Information Needed]
+
+ ### Results
+
+ [More Information Needed]
+
+ #### Summary
+
+
+
+ ## Model Examination [optional]
+
+ <!-- Relevant interpretability work for the model goes here -->
+
+ [More Information Needed]
+
+ ## Environmental Impact
+
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+ - **Hardware Type:** [More Information Needed]
+ - **Hours used:** [More Information Needed]
+ - **Cloud Provider:** [More Information Needed]
+ - **Compute Region:** [More Information Needed]
+ - **Carbon Emitted:** [More Information Needed]
+
+ ## Technical Specifications [optional]
+
+ ### Model Architecture and Objective
+
+ [More Information Needed]
+
+ ### Compute Infrastructure
+
+ [More Information Needed]
+
+ #### Hardware
+
+ [More Information Needed]
+
+ #### Software
+
+ [More Information Needed]
+
+ ## Citation [optional]
+
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+ **BibTeX:**
+
+ [More Information Needed]
+
+ **APA:**
+
+ [More Information Needed]
+
+ ## Glossary [optional]
+
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+ [More Information Needed]
+
+ ## More Information [optional]
+
+ [More Information Needed]
+
+ ## Model Card Authors [optional]
+
+ [More Information Needed]
+
+ ## Model Card Contact
+
+ [More Information Needed]
+
+ ### Framework versions
+
+ - PEFT 0.16.0
train/output/qlora-codellama-bugfix/checkpoint-1000/adapter_config.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1964c3b05fe32e3371262e1acac8e947ef8f8a52431e37716f780afe50727087
+ size 839
train/output/qlora-codellama-bugfix/checkpoint-1000/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:84064653aacb1aa638242fa35d96415845184e3f97cec4da4bc9412385f7aff9
+ size 134235048
train/output/qlora-codellama-bugfix/checkpoint-1000/chat_template.jinja ADDED
@@ -0,0 +1 @@
+ {% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\n' + system_message + '\n<</SYS>>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}
train/output/qlora-codellama-bugfix/checkpoint-1000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3e69ea177eb91af1890ec93a594282aef22499e4743b53c958f27820bc33d28e
+ size 268544075
train/output/qlora-codellama-bugfix/checkpoint-1000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a8ecd469a8b4c7959941512dc268004dd3111ea9235d256aa9638ce00699c0f2
+ size 14645
train/output/qlora-codellama-bugfix/checkpoint-1000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cee0966af6e7296ad683b3f293ade83fe035fb7007b575377ad7775d13ec8b46
+ size 1465
train/output/qlora-codellama-bugfix/checkpoint-1000/special_tokens_map.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8f32ca2dd65336aceb9e359886f34f214330137c9e801e8f41cf542287fc2630
+ size 538
train/output/qlora-codellama-bugfix/checkpoint-1000/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ef304ea6a0a92c4275c959dd6e1a896bca7e36af0adb2ce25563e7c11bf3927f
+ size 3620829
train/output/qlora-codellama-bugfix/checkpoint-1000/tokenizer_config.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0ee02c4a9abf3f4b4fb350d0991f5313bd354c5dd76cabedaacc9e3e1c088e1a
+ size 1869
train/output/qlora-codellama-bugfix/checkpoint-1000/trainer_state.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e448dea2040f629a632432c297857131d1b3ac2ccdb25548b1cb4fe4ec5d779f
+ size 4295
train/output/qlora-codellama-bugfix/checkpoint-1000/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7cca9708d68b5d07e91505e97d7945c08ed5c660e02fce750706e45d57565144
+ size 5777
train/output/qlora-codellama-bugfix/checkpoint-500/README.md ADDED
@@ -0,0 +1,207 @@
+ ---
+ base_model: codellama/CodeLlama-7b-Instruct-hf
+ library_name: peft
+ pipeline_tag: text-generation
+ tags:
+ - base_model:adapter:codellama/CodeLlama-7b-Instruct-hf
+ - lora
+ - transformers
+ ---
+
+ # Model Card for Model ID
+
+ <!-- Provide a quick summary of what the model is/does. -->
+
+
+
+ ## Model Details
+
+ ### Model Description
+
+ <!-- Provide a longer summary of what this model is. -->
+
+
+
+ - **Developed by:** [More Information Needed]
+ - **Funded by [optional]:** [More Information Needed]
+ - **Shared by [optional]:** [More Information Needed]
+ - **Model type:** [More Information Needed]
+ - **Language(s) (NLP):** [More Information Needed]
+ - **License:** [More Information Needed]
+ - **Finetuned from model [optional]:** [More Information Needed]
+
+ ### Model Sources [optional]
+
+ <!-- Provide the basic links for the model. -->
+
+ - **Repository:** [More Information Needed]
+ - **Paper [optional]:** [More Information Needed]
+ - **Demo [optional]:** [More Information Needed]
+
+ ## Uses
+
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+ ### Direct Use
+
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+ [More Information Needed]
+
+ ### Downstream Use [optional]
+
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+ [More Information Needed]
+
+ ### Out-of-Scope Use
+
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+ [More Information Needed]
+
+ ## Bias, Risks, and Limitations
+
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+ [More Information Needed]
+
+ ### Recommendations
+
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+ ## How to Get Started with the Model
+
+ Use the code below to get started with the model.
+
+ [More Information Needed]
+
+ ## Training Details
+
+ ### Training Data
+
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+ [More Information Needed]
+
+ ### Training Procedure
+
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+ #### Preprocessing [optional]
+
+ [More Information Needed]
+
+
+ #### Training Hyperparameters
+
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+ #### Speeds, Sizes, Times [optional]
+
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+ [More Information Needed]
+
+ ## Evaluation
+
+ <!-- This section describes the evaluation protocols and provides the results. -->
+
+ ### Testing Data, Factors & Metrics
+
+ #### Testing Data
+
+ <!-- This should link to a Dataset Card if possible. -->
+
+ [More Information Needed]
+
+ #### Factors
+
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+ [More Information Needed]
+
+ #### Metrics
+
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+ [More Information Needed]
+
+ ### Results
+
+ [More Information Needed]
+
+ #### Summary
+
+
+
+ ## Model Examination [optional]
+
+ <!-- Relevant interpretability work for the model goes here -->
+
+ [More Information Needed]
+
+ ## Environmental Impact
+
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+ - **Hardware Type:** [More Information Needed]
+ - **Hours used:** [More Information Needed]
+ - **Cloud Provider:** [More Information Needed]
+ - **Compute Region:** [More Information Needed]
+ - **Carbon Emitted:** [More Information Needed]
+
+ ## Technical Specifications [optional]
+
+ ### Model Architecture and Objective
+
+ [More Information Needed]
+
+ ### Compute Infrastructure
+
+ [More Information Needed]
+
+ #### Hardware
+
+ [More Information Needed]
+
+ #### Software
+
+ [More Information Needed]
+
+ ## Citation [optional]
+
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+ **BibTeX:**
+
+ [More Information Needed]
+
+ **APA:**
+
+ [More Information Needed]
+
+ ## Glossary [optional]
+
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+ [More Information Needed]
+
+ ## More Information [optional]
+
+ [More Information Needed]
+
+ ## Model Card Authors [optional]
+
+ [More Information Needed]
+
+ ## Model Card Contact
+
+ [More Information Needed]
+
+ ### Framework versions
+
+ - PEFT 0.16.0
train/output/qlora-codellama-bugfix/checkpoint-500/adapter_config.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1964c3b05fe32e3371262e1acac8e947ef8f8a52431e37716f780afe50727087
+ size 839
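
The three-line bodies in this and the following entries are Git LFS pointer files (version/oid/size), not the binaries themselves. To work with the actual artifacts, clone with `git lfs` installed, or fetch individual files from the Hub. A minimal sketch (editor's addition; the repo id below is a placeholder, substitute this repository's actual id):

```python
from huggingface_hub import hf_hub_download

# Download one LFS-backed file; "<user>/<repo>" is a hypothetical placeholder.
path = hf_hub_download(
    repo_id="<user>/<repo>",
    filename="train/output/qlora-codellama-bugfix/checkpoint-500/adapter_model.safetensors",
)
print(path)
```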
train/output/qlora-codellama-bugfix/checkpoint-500/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ddc3978f5ac8ec8f6c879f16e4deab2f4955796fc45daaf9fd7063d6b270b027
+ size 134235048
train/output/qlora-codellama-bugfix/checkpoint-500/chat_template.jinja ADDED
@@ -0,0 +1 @@
+ {% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\n' + system_message + '\n<</SYS>>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}
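
The single-line Jinja template above is the standard Llama-2 chat format: an optional system message is folded into the first user turn inside `<<SYS>> ... <</SYS>>`, and each user turn is wrapped in `[INST] ... [/INST]`. A short sketch of how it renders through `tokenizer.apply_chat_template` (editor's addition; the local path and messages are illustrative):

```python
from transformers import AutoTokenizer

# Point at a directory containing this chat_template.jinja and tokenizer files.
tokenizer = AutoTokenizer.from_pretrained("train/output/qlora-codellama-bugfix/checkpoint-500")

messages = [
    {"role": "system", "content": "You fix Linux kernel bugs."},
    {"role": "user", "content": "Patch this null-pointer dereference: ..."},
]
text = tokenizer.apply_chat_template(messages, tokenize=False)
# -> "<s>[INST] <<SYS>>\nYou fix Linux kernel bugs.\n<</SYS>>\n\nPatch this ... [/INST]"
print(text)
```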
train/output/qlora-codellama-bugfix/checkpoint-500/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1e0ede750ff79b1bb5285ab51d0e4b03eeec5f4673ac149f1e973b6d20ad8e91
+ size 268544075
train/output/qlora-codellama-bugfix/checkpoint-500/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:873492f033797d17bd144ec6ef43f5efb66933cf27e0c60ddd8d799f5d1f12c1
+ size 14645
train/output/qlora-codellama-bugfix/checkpoint-500/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b1492973eb5cd9c3e10c1c7ee7c4d19bcd22481640c90b6278d1a21419300ba9
+ size 1465
train/output/qlora-codellama-bugfix/checkpoint-500/special_tokens_map.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8f32ca2dd65336aceb9e359886f34f214330137c9e801e8f41cf542287fc2630
+ size 538
train/output/qlora-codellama-bugfix/checkpoint-500/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ef304ea6a0a92c4275c959dd6e1a896bca7e36af0adb2ce25563e7c11bf3927f
+ size 3620829
train/output/qlora-codellama-bugfix/checkpoint-500/tokenizer_config.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0ee02c4a9abf3f4b4fb350d0991f5313bd354c5dd76cabedaacc9e3e1c088e1a
+ size 1869
train/output/qlora-codellama-bugfix/checkpoint-500/trainer_state.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:22d7fb0a27cf19e9d2091a0f3a62b86790ee0448da788f1599a6b1e4e2a83e84
+ size 2543
train/output/qlora-codellama-bugfix/checkpoint-500/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7cca9708d68b5d07e91505e97d7945c08ed5c660e02fce750706e45d57565144
+ size 5777
train/output/qlora-codellama-bugfix/special_tokens_map.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8f32ca2dd65336aceb9e359886f34f214330137c9e801e8f41cf542287fc2630
+ size 538
train/output/qlora-codellama-bugfix/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ef304ea6a0a92c4275c959dd6e1a896bca7e36af0adb2ce25563e7c11bf3927f
+ size 3620829
train/output/qlora-codellama-bugfix/tokenizer_config.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0ee02c4a9abf3f4b4fb350d0991f5313bd354c5dd76cabedaacc9e3e1c088e1a
+ size 1869
train/train.py ADDED
@@ -0,0 +1,143 @@
+ # QLoRA fine-tuning for CodeLlama-7B-Instruct on 1x H200
+ # Requirements: transformers, peft, accelerate, bitsandbytes, datasets
+ from transformers import (
+     AutoTokenizer,
+     AutoModelForCausalLM,
+     TrainingArguments,
+     Trainer,
+     BitsAndBytesConfig,
+     DataCollatorForSeq2Seq
+ )
+ from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
+ from datasets import load_dataset
+ import torch
+ import os
+ import wandb
+
+ os.environ["WANDB_PROJECT"] = "codellama-7b-instruct-qlora-linux-bugfix"
+ os.environ["WANDB_NAME"] = "run-v1"
+ # Paths and model (the canonical Hub id uses "CodeLlama" casing)
+ BASE_MODEL = "codellama/CodeLlama-7b-Instruct-hf"
+ DATA_PATH = "../dataset/linux_bugfix_100k.jsonl"
+ OUTPUT_DIR = "./output/qlora-codellama-bugfix"
+
+ # Load dataset (prompt-completion format)
+ dataset = load_dataset("json", data_files=DATA_PATH, split="train")
+
+ # BitsAndBytes config for QLoRA
+ bnb_config = BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_use_double_quant=True,
+     bnb_4bit_quant_type="nf4",
+     bnb_4bit_compute_dtype=torch.bfloat16  # optimized for H100/H200
+ )
+
+ # Load tokenizer and model
+ tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
+ tokenizer.pad_token = tokenizer.eos_token  # LLaMA has no pad token; reuse EOS
+ tokenizer.padding_side = "right"
+
+ model = AutoModelForCausalLM.from_pretrained(
+     BASE_MODEL,
+     quantization_config=bnb_config,
+     device_map="auto"
+ )
+ model = prepare_model_for_kbit_training(model)
+ model.gradient_checkpointing_enable()
+ torch.backends.cuda.matmul.allow_tf32 = True
+
+ # Apply QLoRA (LoRA config)
+ lora_config = LoraConfig(
+     r=64,
+     lora_alpha=16,
+     lora_dropout=0.1,
+     bias="none",
+     task_type="CAUSAL_LM"
+ )
+ model = get_peft_model(model, lora_config)
+ model.config.use_cache = False  # KV cache is incompatible with gradient checkpointing
+ model.config.return_dict = True
+ model.config.pad_token_id = tokenizer.pad_token_id
+ model.print_trainable_parameters()
+
+ # Format and tokenize the dataset
+ model_max_len = tokenizer.model_max_length
+
+ def format(example):
+     prompt_ids = tokenizer(example["prompt"], truncation=True, max_length=1024)["input_ids"]
+     completion_ids = tokenizer(example["completion"], truncation=True, max_length=512)["input_ids"]
+
+     input_ids = prompt_ids + completion_ids
+     labels = [-100] * len(prompt_ids) + completion_ids  # -100: no loss on prompt tokens
+
+     # truncate input_ids and labels to the same length; the collator handles padding
+     max_len = min(len(input_ids), model_max_len)
+     input_ids = input_ids[:max_len]
+     labels = labels[:max_len]
+
+     return {
+         "input_ids": input_ids,
+         "labels": labels,
+     }
+
+
+ # Sanity check
+ print("[Sanity] Checking one example...")
+ sample = format(dataset[0])
+ test_input = torch.tensor(sample["input_ids"]).unsqueeze(0).to(model.device)
+ test_labels = torch.tensor(sample["labels"]).unsqueeze(0).to(model.device)
+ model.train()
+ out = model(input_ids=test_input, labels=test_labels)
+ assert out.loss.requires_grad, "Sanity check failed: Loss does not require grad."
+ print("[Sanity] Check passed. Proceeding to map()...")
+
+ # Apply formatting to entire dataset
+ dataset = dataset.map(format, remove_columns=["prompt", "completion"])
+ collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, return_tensors="pt", pad_to_multiple_of=8)
+
+ # Training arguments
+ training_args = TrainingArguments(
+     report_to="wandb",
+     run_name="codellama-7b-instruct-qlora-linux-bugfix",
+     logging_dir=f"{OUTPUT_DIR}/logs",
+
+     output_dir=OUTPUT_DIR,
+     num_train_epochs=3,
+     per_device_train_batch_size=64,
+     gradient_accumulation_steps=4,
+     learning_rate=2e-4,
+     lr_scheduler_type="cosine",
+     warmup_ratio=0.03,
+     gradient_checkpointing=True,
+     bf16=True,  # important for H200
+     fp16=False,
+     max_grad_norm=1.0,
+     save_strategy="steps",
+     save_steps=500,
+     save_total_limit=2,
+     logging_steps=50,
+     push_to_hub=False,
+     label_names=["labels"],
+     remove_unused_columns=False,  # critical to prevent data loss
+ )
+
+ # Trainer setup
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=dataset,
+     tokenizer=tokenizer,
+     data_collator=collator
+ )
+
+
+ # Begin training; resume only if a checkpoint already exists
+ # (a bare resume_from_checkpoint=True raises on the very first run)
+ has_checkpoint = os.path.isdir(OUTPUT_DIR) and any(n.startswith("checkpoint") for n in os.listdir(OUTPUT_DIR))
+ print(f"Tracking W&B run '{os.environ['WANDB_NAME']}' in project '{os.environ['WANDB_PROJECT']}'")
+ trainer.train(resume_from_checkpoint=has_checkpoint)
+
+
+ # Save final model
+ model.save_pretrained(OUTPUT_DIR, safe_serialization=True)
+ tokenizer.save_pretrained(OUTPUT_DIR)
+ print(f"[DONE] Model saved to {OUTPUT_DIR}")
train/train_codellama_qlora.py ADDED
@@ -0,0 +1,96 @@
+ # QLoRA fine-tuning for CodeLlama-7B-Instruct
+ # Requirements: transformers, peft, accelerate, bitsandbytes, datasets
+
+ from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, BitsAndBytesConfig
+ from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
+ from datasets import load_dataset
+ import torch
+ import os
+
+ # Paths and parameters
+ BASE_MODEL = "codellama/CodeLlama-7b-Instruct-hf"
+ DATA_PATH = "../dataset_builder/output/linux_bugfix_prompt_completion.jsonl"
+ OUTPUT_DIR = "./output/qlora-codellama-bugfix"
+
+ # Load dataset (prompt, completion)
+ dataset = load_dataset("json", data_files=DATA_PATH, split="train")
+
+ # Apply formatting for supervised fine-tuning
+ # (uses the tokenizer loaded below; map() only runs after it exists)
+ def format(example):
+     prompt = tokenizer(
+         example["prompt"],
+         truncation=True,
+         max_length=512  # no padding needed: per_device_train_batch_size is 1
+     )
+     completion = tokenizer(
+         example["completion"],
+         truncation=True,
+         max_length=512,
+         add_special_tokens=False  # avoid a stray BOS between prompt and completion
+     )
+     input_ids = prompt["input_ids"] + completion["input_ids"]
+     labels = [-100] * len(prompt["input_ids"]) + completion["input_ids"]  # loss on completion only
+
+     return {
+         "input_ids": input_ids[:1024],
+         "labels": labels[:1024]
+     }
+
+ # Load tokenizer and base model
+ bnb_config = BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_use_double_quant=True,
+     bnb_4bit_quant_type="nf4",
+     bnb_4bit_compute_dtype=torch.float16
+ )
+
+ tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
+ tokenizer.pad_token = tokenizer.eos_token  # LLaMA has no pad token; reuse EOS
+
+ model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, quantization_config=bnb_config, device_map="auto")
+ model = prepare_model_for_kbit_training(model)  # stabilizes training on a 4-bit base
+
+ # Apply QLoRA
+ lora_config = LoraConfig(
+     r=64,
+     lora_alpha=16,
+     lora_dropout=0.1,
+     bias="none",
+     task_type="CAUSAL_LM"
+ )
+
+ model = get_peft_model(model, lora_config)
+
+ # Tokenize dataset
+ dataset = dataset.map(format, remove_columns=["prompt", "completion"])
+
+ # Training args
+ training_args = TrainingArguments(
+     output_dir=OUTPUT_DIR,
+     num_train_epochs=3,
+     per_device_train_batch_size=1,
+     gradient_accumulation_steps=4,
+     learning_rate=2e-4,
+     logging_dir=f"{OUTPUT_DIR}/logs",
+     logging_steps=10,
+     save_strategy="epoch",
+     bf16=False,
+     fp16=True,
+     save_total_limit=2,
+     report_to="none",
+     push_to_hub=False
+ )
+
+ # Trainer
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=dataset,
+     tokenizer=tokenizer
+ )
+
+ trainer.train()
+
+ model.save_pretrained(OUTPUT_DIR)
+ tokenizer.save_pretrained(OUTPUT_DIR)
+ print(f"[DONE] Model saved to {OUTPUT_DIR}")