Mac Huang
committed on
Commit
·
ed6b901
1
Parent(s):
43864c1
Push full project including dataset, code, and training scripts
Browse files- PROJECT_STRUCTURE.md +228 -0
- dataset/linux_bugfix_100k.jsonl +3 -0
- dataset/linux_bugfix_prompt_completion.jsonl +3 -0
- dataset_builder/build_dataset.py +160 -0
- dataset_builder/build_dataset_demo.py +157 -0
- dataset_builder/convert_to_prompt_completion.py +41 -0
- evaluate/__pycache__/evaluate.cpython-312.pyc +0 -0
- evaluate/eval.jsonl +3 -0
- evaluate/evaluate.py +87 -0
- evaluate/output/.eval_results.csv.swp +0 -0
- evaluate/output/eval_results.csv +3 -0
- evaluate/output/eval_results.json +3 -0
- requirements.txt +182 -0
- train/download_model.py +6 -0
- train/output/qlora-codellama-bugfix/README.md +207 -0
- train/output/qlora-codellama-bugfix/adapter_config.json +3 -0
- train/output/qlora-codellama-bugfix/adapter_model.safetensors +3 -0
- train/output/qlora-codellama-bugfix/chat_template.jinja +1 -0
- train/output/qlora-codellama-bugfix/checkpoint-1000/README.md +207 -0
- train/output/qlora-codellama-bugfix/checkpoint-1000/adapter_config.json +3 -0
- train/output/qlora-codellama-bugfix/checkpoint-1000/adapter_model.safetensors +3 -0
- train/output/qlora-codellama-bugfix/checkpoint-1000/chat_template.jinja +1 -0
- train/output/qlora-codellama-bugfix/checkpoint-1000/optimizer.pt +3 -0
- train/output/qlora-codellama-bugfix/checkpoint-1000/rng_state.pth +3 -0
- train/output/qlora-codellama-bugfix/checkpoint-1000/scheduler.pt +3 -0
- train/output/qlora-codellama-bugfix/checkpoint-1000/special_tokens_map.json +3 -0
- train/output/qlora-codellama-bugfix/checkpoint-1000/tokenizer.json +3 -0
- train/output/qlora-codellama-bugfix/checkpoint-1000/tokenizer_config.json +3 -0
- train/output/qlora-codellama-bugfix/checkpoint-1000/trainer_state.json +3 -0
- train/output/qlora-codellama-bugfix/checkpoint-1000/training_args.bin +3 -0
- train/output/qlora-codellama-bugfix/checkpoint-500/README.md +207 -0
- train/output/qlora-codellama-bugfix/checkpoint-500/adapter_config.json +3 -0
- train/output/qlora-codellama-bugfix/checkpoint-500/adapter_model.safetensors +3 -0
- train/output/qlora-codellama-bugfix/checkpoint-500/chat_template.jinja +1 -0
- train/output/qlora-codellama-bugfix/checkpoint-500/optimizer.pt +3 -0
- train/output/qlora-codellama-bugfix/checkpoint-500/rng_state.pth +3 -0
- train/output/qlora-codellama-bugfix/checkpoint-500/scheduler.pt +3 -0
- train/output/qlora-codellama-bugfix/checkpoint-500/special_tokens_map.json +3 -0
- train/output/qlora-codellama-bugfix/checkpoint-500/tokenizer.json +3 -0
- train/output/qlora-codellama-bugfix/checkpoint-500/tokenizer_config.json +3 -0
- train/output/qlora-codellama-bugfix/checkpoint-500/trainer_state.json +3 -0
- train/output/qlora-codellama-bugfix/checkpoint-500/training_args.bin +3 -0
- train/output/qlora-codellama-bugfix/special_tokens_map.json +3 -0
- train/output/qlora-codellama-bugfix/tokenizer.json +3 -0
- train/output/qlora-codellama-bugfix/tokenizer_config.json +3 -0
- train/train.py +143 -0
- train/train_codellama_qlora.py +96 -0
PROJECT_STRUCTURE.md
ADDED
@@ -0,0 +1,228 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Linux Kernel Anti-Pattern Detector - Project Structure
|
2 |
+
|
3 |
+
## Overview
|
4 |
+
|
5 |
+
This project is organized into a clear, maintainable structure that separates concerns and makes it easy to find, modify, and extend functionality.
|
6 |
+
|
7 |
+
## Directory Structure
|
8 |
+
|
9 |
+
```
|
10 |
+
Linux Kernel Anti-Pattern Detector/
|
11 |
+
├── 📁 data/ # Analysis data and results
|
12 |
+
│ ├── results.json # Main analysis results
|
13 |
+
│ ├── concurrency_analysis_report.json
|
14 |
+
│ └── kernel_analysis.log # Analysis logs
|
15 |
+
│
|
16 |
+
├── 📁 docs/ # Documentation
|
17 |
+
│ ├── kernel-analysis-guide.md # Kernel analysis documentation
|
18 |
+
│ └── [additional documentation]
|
19 |
+
│
|
20 |
+
├── 📁 examples/ # Example code and usage
|
21 |
+
│ └── [example files]
|
22 |
+
│
|
23 |
+
├── 📁 reports/ # Generated analysis reports
|
24 |
+
│ ├── Linux_Kernel_Anti_Pattern_Analysis_Report.md
|
25 |
+
│ ├── Executive_Summary.md
|
26 |
+
│ └── 📁 concurrency/ # Concurrency-specific reports
|
27 |
+
│ └── Concurrency_Analysis_Report.md
|
28 |
+
│
|
29 |
+
├── 📁 scripts/ # Analysis and utility scripts
|
30 |
+
│ ├── 📁 analysis/ # Core analysis scripts
|
31 |
+
│ │ ├── concurrency_analyzer.py # Concurrency issue analyzer
|
32 |
+
│ │ └── analyze_kernel_structure.py
|
33 |
+
│ ├── 📁 reporting/ # Report generation scripts
|
34 |
+
│ │ └── view_results.py # Results viewer
|
35 |
+
│ └── 📁 utils/ # Utility scripts
|
36 |
+
│ └── quick_summary.py # Quick summary generator
|
37 |
+
│
|
38 |
+
├── 📁 src/ # Source code (main project)
|
39 |
+
│ ├── __init__.py
|
40 |
+
│ ├── 📁 detectors/ # Anti-pattern detection modules
|
41 |
+
│ ├── 📁 rules/ # Detection rules and patterns
|
42 |
+
│ └── 📁 utils/ # Utility functions
|
43 |
+
│
|
44 |
+
├── 📁 tests/ # Test files
|
45 |
+
│ └── [test files]
|
46 |
+
│
|
47 |
+
├── 📁 tools/ # Analysis tools and detectors
|
48 |
+
│ ├── 📁 detectors/ # Main detection tools
|
49 |
+
│ │ ├── detector.py # Main anti-pattern detector
|
50 |
+
│ │ └── config.yaml # Detection configuration
|
51 |
+
│ ├── 📁 visualizers/ # Data visualization tools
|
52 |
+
│ └── 📁 exporters/ # Data export tools
|
53 |
+
│
|
54 |
+
├── 📁 linux/ # Linux kernel source (cloned)
|
55 |
+
│ └── [kernel source files]
|
56 |
+
│
|
57 |
+
├── 📄 README.md # Main project documentation
|
58 |
+
├── 📄 requirements.txt # Main project dependencies
|
59 |
+
├── 📄 requirements-kernel-analysis.txt
|
60 |
+
├── 📄 requirements-simple.txt
|
61 |
+
├── 📄 .gitignore # Git ignore rules
|
62 |
+
└── 📄 PROJECT_STRUCTURE.md # This file
|
63 |
+
```
|
64 |
+
|
65 |
+
## Directory Descriptions
|
66 |
+
|
67 |
+
### 📁 data/
|
68 |
+
Contains all analysis results, logs, and generated data files.
|
69 |
+
- **results.json**: Complete analysis results from the main detector
|
70 |
+
- **concurrency_analysis_report.json**: Detailed concurrency analysis
|
71 |
+
- **kernel_analysis.log**: Analysis execution logs
|
72 |
+
|
73 |
+
### 📁 docs/
|
74 |
+
Project documentation and guides.
|
75 |
+
- **kernel-analysis-guide.md**: Comprehensive guide for kernel analysis
|
76 |
+
- Additional documentation for specific features
|
77 |
+
|
78 |
+
### 📁 examples/
|
79 |
+
Example code, usage patterns, and sample data.
|
80 |
+
- Example kernel modules for testing
|
81 |
+
- Sample configuration files
|
82 |
+
- Usage examples
|
83 |
+
|
84 |
+
### 📁 reports/
|
85 |
+
Generated analysis reports in various formats.
|
86 |
+
- **Linux_Kernel_Anti_Pattern_Analysis_Report.md**: Complete technical report
|
87 |
+
- **Executive_Summary.md**: High-level summary for stakeholders
|
88 |
+
- **concurrency/**: Specialized reports for specific issue types
|
89 |
+
|
90 |
+
### 📁 scripts/
|
91 |
+
Analysis and utility scripts organized by function.
|
92 |
+
|
93 |
+
#### 📁 analysis/
|
94 |
+
Core analysis scripts for different types of anti-patterns.
|
95 |
+
- **concurrency_analyzer.py**: Specialized concurrency issue analysis
|
96 |
+
- **analyze_kernel_structure.py**: Kernel structure analysis
|
97 |
+
|
98 |
+
#### 📁 reporting/
|
99 |
+
Scripts for generating and viewing reports.
|
100 |
+
- **view_results.py**: Interactive results viewer and reporter
|
101 |
+
|
102 |
+
#### 📁 utils/
|
103 |
+
Utility scripts for common tasks.
|
104 |
+
- **quick_summary.py**: Quick summary generation
|
105 |
+
|
106 |
+
### 📁 src/
|
107 |
+
Main project source code (core framework).
|
108 |
+
- **detectors/**: Anti-pattern detection modules
|
109 |
+
- **rules/**: Detection rules and pattern definitions
|
110 |
+
- **utils/**: Utility functions and helpers
|
111 |
+
|
112 |
+
### 📁 tests/
|
113 |
+
Test files and test data.
|
114 |
+
- Unit tests for detection modules
|
115 |
+
- Integration tests
|
116 |
+
- Test data and fixtures
|
117 |
+
|
118 |
+
### 📁 tools/
|
119 |
+
Analysis tools and detectors.
|
120 |
+
|
121 |
+
#### 📁 detectors/
|
122 |
+
Main detection tools and configurations.
|
123 |
+
- **detector.py**: Primary anti-pattern detection engine
|
124 |
+
- **config.yaml**: Detection configuration and rules
|
125 |
+
|
126 |
+
#### 📁 visualizers/
|
127 |
+
Data visualization and charting tools.
|
128 |
+
- Interactive dashboards
|
129 |
+
- Chart generators
|
130 |
+
- Data plotting utilities
|
131 |
+
|
132 |
+
#### 📁 exporters/
|
133 |
+
Data export and format conversion tools.
|
134 |
+
- JSON to other formats
|
135 |
+
- Report generation
|
136 |
+
- Data transformation
|
137 |
+
|
138 |
+
### 📁 linux/
|
139 |
+
Cloned Linux kernel source code for analysis.
|
140 |
+
- Complete kernel source tree
|
141 |
+
- Used for code snippet extraction
|
142 |
+
- Reference for pattern validation
|
143 |
+
|
144 |
+
## File Descriptions
|
145 |
+
|
146 |
+
### Core Files
|
147 |
+
- **README.md**: Main project documentation and getting started guide
|
148 |
+
- **requirements.txt**: Main project Python dependencies
|
149 |
+
- **requirements-kernel-analysis.txt**: Kernel analysis specific dependencies
|
150 |
+
- **requirements-simple.txt**: Simplified dependencies for basic usage
|
151 |
+
- **.gitignore**: Git ignore patterns for the project
|
152 |
+
|
153 |
+
### Configuration Files
|
154 |
+
- **tools/detectors/config.yaml**: Main detection configuration
|
155 |
+
- **tools/detectors/detector.py**: Primary detection engine
|
156 |
+
|
157 |
+
## Usage Patterns
|
158 |
+
|
159 |
+
### Running Analysis
|
160 |
+
```bash
|
161 |
+
# Main analysis
|
162 |
+
python tools/detectors/detector.py --clone --output data/results.json
|
163 |
+
|
164 |
+
# Concurrency analysis
|
165 |
+
python scripts/analysis/concurrency_analyzer.py
|
166 |
+
|
167 |
+
# View results
|
168 |
+
python scripts/reporting/view_results.py data/results.json
|
169 |
+
```
|
170 |
+
|
171 |
+
### Generating Reports
|
172 |
+
```bash
|
173 |
+
# Quick summary
|
174 |
+
python scripts/utils/quick_summary.py
|
175 |
+
|
176 |
+
# Interactive viewer
|
177 |
+
python scripts/reporting/view_results.py --interactive
|
178 |
+
```
|
179 |
+
|
180 |
+
### Development
|
181 |
+
```bash
|
182 |
+
# Install dependencies
|
183 |
+
pip install -r requirements.txt
|
184 |
+
pip install -r requirements-kernel-analysis.txt
|
185 |
+
|
186 |
+
# Run tests
|
187 |
+
python -m pytest tests/
|
188 |
+
|
189 |
+
# Development setup
|
190 |
+
conda activate linux-kernel-anti-pattern-detector
|
191 |
+
```
|
192 |
+
|
193 |
+
## Best Practices
|
194 |
+
|
195 |
+
### Adding New Features
|
196 |
+
1. **Analysis scripts**: Add to `scripts/analysis/`
|
197 |
+
2. **Reporting tools**: Add to `scripts/reporting/`
|
198 |
+
3. **Utilities**: Add to `scripts/utils/`
|
199 |
+
4. **Core detection**: Add to `src/detectors/`
|
200 |
+
5. **Configuration**: Update `tools/detectors/config.yaml`
|
201 |
+
|
202 |
+
### File Naming Conventions
|
203 |
+
- **Python files**: snake_case (e.g., `concurrency_analyzer.py`)
|
204 |
+
- **Configuration files**: kebab-case (e.g., `kernel-analysis-guide.md`)
|
205 |
+
- **Reports**: Pascal_Case (e.g., `Concurrency_Analysis_Report.md`)
|
206 |
+
|
207 |
+
### Data Management
|
208 |
+
- **Raw data**: Store in `data/`
|
209 |
+
- **Processed results**: Store in `data/`
|
210 |
+
- **Reports**: Generate in `reports/`
|
211 |
+
- **Logs**: Store in `data/`
|
212 |
+
|
213 |
+
## Maintenance
|
214 |
+
|
215 |
+
### Regular Tasks
|
216 |
+
1. **Update dependencies**: Review and update requirements files
|
217 |
+
2. **Clean data**: Remove old analysis results periodically
|
218 |
+
3. **Update kernel**: Refresh the Linux kernel source
|
219 |
+
4. **Backup reports**: Archive important analysis reports
|
220 |
+
|
221 |
+
### Version Control
|
222 |
+
- **Track**: Source code, configuration, documentation
|
223 |
+
- **Ignore**: Analysis results, logs, kernel source (large files)
|
224 |
+
- **Archive**: Important reports and findings
|
225 |
+
|
226 |
+
---
|
227 |
+
|
228 |
+
*This structure is designed to be scalable, maintainable, and easy to navigate. Each directory has a clear purpose and the organization supports both development and research workflows.*
|
dataset/linux_bugfix_100k.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ef9110bf645ceb42b2a6de21a952f82448ed7cfa31ac383c1c62fb9a840b9574
|
3 |
+
size 241507688
|
dataset/linux_bugfix_prompt_completion.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:621b0b7e04c5b359ea28eec9bf165aad2eaea77b07b834b0c1b7de37202e7992
|
3 |
+
size 2184324531
|
dataset_builder/build_dataset.py
ADDED
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pydriller import Repository
|
2 |
+
import os
|
3 |
+
import json
|
4 |
+
from tqdm import tqdm
|
5 |
+
import re
|
6 |
+
|
7 |
+
REPO_PATH = '../linux'
|
8 |
+
OUTPUT_FILE = './output/linux_bugfix_dataset.jsonl'
|
9 |
+
|
10 |
+
TEST_MODE = False # Set to False to process the full repository
|
11 |
+
|
12 |
+
BUGFIX_KEYWORDS = [
|
13 |
+
'fix', 'bug', 'leak', 'null', 'overflow', 'error', 'failure',
|
14 |
+
'crash', 'panic', 'memory', 'race', 'deadlock', 'corruption',
|
15 |
+
'security', 'vulnerability', 'exploit', 'buffer', 'stack'
|
16 |
+
]
|
17 |
+
|
18 |
+
def is_bugfix_commit(msg):
    """Return True when the commit message mentions any bug-fix keyword."""
    lowered = msg.lower()
    for keyword in BUGFIX_KEYWORDS:
        if keyword in lowered:
            return True
    return False
|
21 |
+
|
22 |
+
def extract_instruction_from_commit_msg(msg):
    """Pick the first human-readable summary line from a commit message.

    Lines that are shorter than 5 characters, contain no letters, or start
    with a common commit-trailer prefix (Signed-off-by, Fixes:, ...) are
    skipped.  Falls back to the message's first line, or "fix" when empty.
    """
    skip_prefixes = (
        '[patch]', 'signed-off-by', 'reviewed-by', 'tested-by', 'ack',
        'reported-by', 'cc:', 'co-authored-by', 'patchwork-id',
        'suggested-by', 'fixes:', 'link:', 'cherry picked from commit'
    )
    stripped_msg = msg.strip()
    for raw in stripped_msg.splitlines():
        candidate = raw.strip()
        if len(candidate) < 5:
            continue
        if not any(ch.isalpha() for ch in candidate):
            continue
        if candidate.lower().startswith(skip_prefixes):
            continue
        return candidate
    # No usable line found: fall back to the first line, or a generic "fix".
    return stripped_msg.splitlines()[0] if stripped_msg else "fix"
|
36 |
+
|
37 |
+
def extract_code_context(code, line_number, context_lines=10):
    """Return a window of up to 2*context_lines lines around line_number.

    NOTE(review): line_number comes from a 1-based diff hunk header but is
    used as a 0-based index here, so the window is shifted by one line —
    presumably acceptable for fuzzy context extraction; confirm if exact
    alignment matters.
    """
    if not code:
        return ""
    all_lines = code.split('\n')
    window_start = max(line_number - context_lines, 0)
    window_end = min(line_number + context_lines, len(all_lines))
    return '\n'.join(all_lines[window_start:window_end])
|
44 |
+
|
45 |
+
def extract_diff_context(diff_text, context_lines=5):
    """Trim a unified diff down to its changed lines plus surrounding context.

    Keeps everything from context_lines before the first '+'/'-' line to
    context_lines after the last one.  Returns the diff unchanged when it
    contains no '+'/'-' lines, and "" for empty input.
    """
    if not diff_text:
        return ""
    rows = diff_text.split('\n')
    changed = [idx for idx, row in enumerate(rows) if row.startswith(('+', '-'))]
    if not changed:
        return diff_text
    first, last = changed[0], changed[-1]
    lo = max(first - context_lines, 0)
    hi = min(last + context_lines + 1, len(rows))
    return '\n'.join(rows[lo:hi])
|
55 |
+
|
56 |
+
def create_dataset_entry(original_code, commit_msg, diff_code):
    """Build one JSONL record pairing buggy code + instruction with its fix diff."""
    instruction = extract_instruction_from_commit_msg(commit_msg)
    return {
        "input": {
            "original code": original_code.strip(),
            "instruction": instruction,
        },
        "output": {
            "diff codes": diff_code.strip(),
        },
    }
|
66 |
+
|
67 |
+
def process_commit(commit):
    """Extract dataset entries from one bug-fix commit.

    Only modified .c/.h files that have both a diff and their pre-change
    source are considered.  For each such file, the code around the first
    diff hunk is paired with the trimmed diff.  Returns [] for commits whose
    message does not look like a bug fix.
    """
    if not is_bugfix_commit(commit.msg):
        return []

    entries = []
    hunk_re = re.compile(r'@@ -(\d+),?\d* \+\d+,?\d* @@')
    for mod in commit.modified_files:
        path = mod.new_path
        if not path or not path.endswith(('.c', '.h')):
            continue
        if mod.change_type.name != "MODIFY":
            continue
        if not mod.diff or not mod.source_code_before:
            continue

        focused_diff = extract_diff_context(mod.diff)

        # Old-file start line of every @@ hunk header in the diff.
        hunk_starts = []
        for diff_line in mod.diff.split('\n'):
            if diff_line.startswith('@@'):
                found = hunk_re.search(diff_line)
                if found:
                    hunk_starts.append(int(found.group(1)))

        if hunk_starts:
            focused_code = extract_code_context(mod.source_code_before, hunk_starts[0])
        else:
            # No parseable hunk header: fall back to the file's first 50 lines.
            focused_code = '\n'.join(mod.source_code_before.split('\n')[:50])

        entries.append(create_dataset_entry(
            original_code=focused_code,
            commit_msg=commit.msg,
            diff_code=focused_diff,
        ))

    return entries
|
103 |
+
|
104 |
+
def main():
    """Traverse the kernel repo, extract bug-fix entries, write JSONL + stats.

    Side effects: creates ./output, writes the dataset file, prints progress
    and summary statistics to stdout.
    """
    if not os.path.exists(REPO_PATH):
        print(f"\u274c Repository not found at: {REPO_PATH}")
        return

    os.makedirs('./output', exist_ok=True)

    print("\ud83d\udd0d Building Linux kernel bug-fix dataset...")
    print(f"\ud83d\udcc1 Repository: {REPO_PATH}")
    print(f"\ud83d\udcce Output: {OUTPUT_FILE}")

    output_file = OUTPUT_FILE.replace('.jsonl', '_test.jsonl') if TEST_MODE else OUTPUT_FILE

    # BUG FIX: MAX_COMMITS_TEST is referenced below but was never defined in
    # this module (it only exists in build_dataset_demo.py), so TEST_MODE=True
    # crashed with a NameError.  Use the module constant when present,
    # otherwise fall back to the demo script's default of 50.
    max_commits_test = globals().get('MAX_COMMITS_TEST', 50)

    repo = Repository(REPO_PATH)
    dataset_entries = []
    processed_commits = 0
    total_commits = 0
    bugfix_commits = 0

    for commit in tqdm(repo.traverse_commits(), desc="Processing commits"):
        total_commits += 1
        if TEST_MODE and max_commits_test and total_commits > max_commits_test:
            break
        if is_bugfix_commit(commit.msg):
            bugfix_commits += 1
        entries = process_commit(commit)
        if entries:
            dataset_entries.extend(entries)
            processed_commits += 1
            if TEST_MODE:
                print(f"\n\ud83d\udd0d Bug-fix commit {processed_commits}: {commit.hash[:8]}")
                print(f"\ud83d\udcdd Message: {extract_instruction_from_commit_msg(commit.msg)}")
                print(f"\ud83d\udcca Files: {len(entries)} entries extracted")
                print(f"\ud83d\udcc1 Files: {[mod.new_path for mod in commit.modified_files if mod.new_path and mod.new_path.endswith(('.c', '.h'))]}")

    with open(output_file, 'w', encoding='utf-8') as f:
        for entry in dataset_entries:
            f.write(json.dumps(entry, ensure_ascii=False) + '\n')

    print(f"\n\u2705 Dataset creation completed!")
    print(f"\ud83d\udcca Total commits processed: {total_commits}")
    print(f"\ud83d\udc1b Bug-fix commits found: {bugfix_commits}")
    print(f"\ud83d\udcdd Commits with valid entries: {processed_commits}")
    print(f"\ud83d\udcdd Total dataset entries: {len(dataset_entries)}")
    print(f"\ud83d\udcce Saved to: {output_file}")

    if dataset_entries:
        print(f"\n\ud83d\udccb Sample dataset entry:")
        sample = dataset_entries[0]
        print(json.dumps(sample, indent=2, ensure_ascii=False)[:800] + "...")
        print(f"\n\ud83d\udcc1 Dataset structure:")
        print(f"   - Input: original code + instruction")
        print(f"   - Output: diff codes")
        print(f"   - Format: JSONL (one JSON object per line)")

if __name__ == "__main__":
    main()
|
dataset_builder/build_dataset_demo.py
ADDED
@@ -0,0 +1,157 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pydriller import Repository
|
2 |
+
import os
|
3 |
+
import json
|
4 |
+
from tqdm import tqdm
|
5 |
+
import re
|
6 |
+
from multiprocessing import Pool
|
7 |
+
|
8 |
+
REPO_PATH = '../linux'
|
9 |
+
OUTPUT_FILE = './output/linux_bugfix_dataset.jsonl'
|
10 |
+
|
11 |
+
TEST_MODE = False # Set to False to process the full repository
|
12 |
+
MAX_COMMITS_TEST = 50 # Set a limit if TEST_MODE is True
|
13 |
+
NUM_WORKERS = 16 # Adjust to your actual core count
|
14 |
+
|
15 |
+
BUGFIX_KEYWORDS = [
|
16 |
+
'fix', 'bug', 'leak', 'null', 'overflow', 'error', 'failure',
|
17 |
+
'crash', 'panic', 'memory', 'race', 'deadlock', 'corruption',
|
18 |
+
'security', 'vulnerability', 'exploit', 'buffer', 'stack'
|
19 |
+
]
|
20 |
+
|
21 |
+
def is_bugfix_commit(msg):
    """Check whether a commit message looks like a bug fix (keyword match)."""
    haystack = msg.lower()
    matches = (kw for kw in BUGFIX_KEYWORDS if kw in haystack)
    return next(matches, None) is not None
|
24 |
+
|
25 |
+
def extract_instruction_from_commit_msg(msg):
    """Return the first meaningful summary line of a commit message.

    A line is meaningful when it is at least 5 characters, contains letters,
    and does not start with a commit-trailer prefix.  Falls back to the
    message's first line, or "fix" when the message is empty.
    """
    trailer_prefixes = (
        '[patch]', 'signed-off-by', 'reviewed-by', 'tested-by', 'ack',
        'reported-by', 'cc:', 'co-authored-by', 'patchwork-id',
        'suggested-by', 'fixes:', 'link:', 'cherry picked from commit'
    )
    body = msg.strip()
    for candidate in (line.strip() for line in body.splitlines()):
        has_letters = any(c.isalpha() for c in candidate)
        if (len(candidate) >= 5
                and has_letters
                and not candidate.lower().startswith(trailer_prefixes)):
            return candidate
    return body.splitlines()[0] if body else "fix"
|
39 |
+
|
40 |
+
def extract_code_context(code, line_number, context_lines=10):
    """Slice out up to context_lines lines on each side of line_number.

    Returns "" for empty input; the window is clamped to the file bounds.
    """
    if not code:
        return ""
    source_lines = code.split('\n')
    lo = max(line_number - context_lines, 0)
    hi = min(line_number + context_lines, len(source_lines))
    return '\n'.join(source_lines[lo:hi])
|
47 |
+
|
48 |
+
def extract_diff_context(diff_text, context_lines=5):
    """Narrow a diff to its '+'/'-' lines plus context_lines of surrounding text.

    The whole diff is returned unchanged when it has no changed lines;
    empty input yields "".
    """
    if not diff_text:
        return ""
    rows = diff_text.split('\n')
    marks = [i for i, row in enumerate(rows) if row[:1] in ('+', '-')]
    if not marks:
        return diff_text
    begin = max(marks[0] - context_lines, 0)
    stop = min(marks[-1] + context_lines + 1, len(rows))
    return '\n'.join(rows[begin:stop])
|
58 |
+
|
59 |
+
def create_dataset_entry(original_code, commit_msg, diff_code):
    """Assemble a single input/output dataset record."""
    record = {"input": {}, "output": {}}
    record["input"]["original code"] = original_code.strip()
    record["input"]["instruction"] = extract_instruction_from_commit_msg(commit_msg)
    record["output"]["diff codes"] = diff_code.strip()
    return record
|
69 |
+
|
70 |
+
def process_commit(commit):
    """Turn one bug-fix commit into zero or more dataset entries.

    Skips non-bug-fix commits and any modified file that is not a .c/.h
    MODIFY with both a diff and pre-change source available.
    """
    if not is_bugfix_commit(commit.msg):
        return []

    collected = []
    for mod in commit.modified_files:
        if not mod.new_path or not mod.new_path.endswith(('.c', '.h')):
            continue
        if mod.change_type.name != "MODIFY":
            continue
        if not (mod.diff and mod.source_code_before):
            continue

        trimmed_diff = extract_diff_context(mod.diff)

        # Old-file start lines parsed from every @@ hunk header.
        starts = []
        for row in mod.diff.split('\n'):
            if not row.startswith('@@'):
                continue
            m = re.search(r'@@ -(\d+),?\d* \+\d+,?\d* @@', row)
            if m:
                starts.append(int(m.group(1)))

        if starts:
            snippet = extract_code_context(mod.source_code_before, starts[0])
        else:
            snippet = '\n'.join(mod.source_code_before.split('\n')[:50])

        collected.append(create_dataset_entry(
            original_code=snippet,
            commit_msg=commit.msg,
            diff_code=trimmed_diff,
        ))

    return collected
|
106 |
+
|
107 |
+
def collect_entries_from_hash(commit_hash):
    """Load a single commit by hash and run process_commit on it.

    Runs inside a multiprocessing worker; any failure (bad hash, git error,
    extraction error) is deliberately swallowed and reported as "no entries"
    so that one bad commit cannot kill the pool.
    """
    try:
        repo = Repository(REPO_PATH, only_commits=[commit_hash])
        commit = next(repo.traverse_commits())
        return process_commit(commit)
    except Exception:
        return []
|
113 |
+
|
114 |
+
def main():
    """Enumerate all commit hashes, fan extraction out to a process pool,
    and write the collected entries as JSONL.

    Side effects: creates ./output, writes the dataset file, prints progress
    and summary statistics to stdout.
    """
    if not os.path.exists(REPO_PATH):
        print("[ERROR] Repository not found at:", REPO_PATH)
        return

    os.makedirs('./output', exist_ok=True)

    print("[INFO] Building Linux kernel bug-fix dataset...")
    print("[INFO] Repository:", REPO_PATH)
    print("[INFO] Output file:", OUTPUT_FILE)

    output_file = OUTPUT_FILE.replace('.jsonl', '_test.jsonl') if TEST_MODE else OUTPUT_FILE

    # First pass: gather every commit hash (optionally capped in test mode).
    all_hashes = [c.hash for c in Repository(REPO_PATH).traverse_commits()]
    if TEST_MODE and MAX_COMMITS_TEST:
        all_hashes = all_hashes[:MAX_COMMITS_TEST]

    # Second pass: each worker re-opens the repo for one commit hash.
    dataset_entries = []
    with Pool(NUM_WORKERS) as pool:
        worker_results = pool.imap_unordered(collect_entries_from_hash, all_hashes)
        for batch in tqdm(worker_results, total=len(all_hashes)):
            dataset_entries.extend(batch)

    with open(output_file, 'w', encoding='utf-8') as f:
        for entry in dataset_entries:
            f.write(json.dumps(entry, ensure_ascii=False) + '\n')

    print("[DONE] Dataset creation completed!")
    print("[INFO] Total commits processed:", len(all_hashes))
    print("[INFO] Total dataset entries:", len(dataset_entries))
    print("[INFO] Saved to:", output_file)

    if dataset_entries:
        print("[INFO] Sample dataset entry:")
        sample = dataset_entries[0]
        print(json.dumps(sample, indent=2, ensure_ascii=False)[:800] + "...")
        print("[INFO] Dataset structure:")
        print(" - Input: original code + instruction")
        print(" - Output: diff codes")
        print(" - Format: JSONL (one JSON object per line)")

if __name__ == "__main__":
    main()
|
dataset_builder/convert_to_prompt_completion.py
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
|
3 |
+
INPUT_FILE = './output/linux_bugfix_dataset.jsonl'
|
4 |
+
OUTPUT_FILE = './output/linux_bugfix_prompt_completion.jsonl'
|
5 |
+
|
6 |
+
def format_prompt(original_code, instruction):
    """Render one training prompt from buggy code and a fix instruction."""
    parts = [
        "Given the following original C code:\n",
        f"{original_code.strip()}\n\n",
        "Instruction:\n",
        f"{instruction.strip()}\n\n",
        "Return the diff that fixes it:\n",
    ]
    return "".join(parts)
|
14 |
+
|
15 |
+
def format_completion(diff_code):
    """Normalize the target diff: surrounding whitespace is stripped."""
    return diff_code.strip()
|
17 |
+
|
18 |
+
def convert_dataset(input_path, output_path):
    """Rewrite the nested input/output JSONL into prompt/completion JSONL."""
    with open(input_path, 'r', encoding='utf-8') as fin, \
         open(output_path, 'w', encoding='utf-8') as fout:
        for raw in fin:
            record = json.loads(raw)
            prompt = format_prompt(
                record["input"]["original code"],
                record["input"]["instruction"],
            )
            completion = format_completion(record["output"]["diff codes"])
            fout.write(json.dumps(
                {"prompt": prompt, "completion": completion},
                ensure_ascii=False,
            ) + '\n')

    print(f"[DONE] Converted dataset saved to: {output_path}")

if __name__ == "__main__":
    convert_dataset(INPUT_FILE, OUTPUT_FILE)
|
evaluate/__pycache__/evaluate.cpython-312.pyc
ADDED
Binary file (3.93 kB). View file
|
|
evaluate/eval.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0775fb198ac76efb0313376b80d43db691701b1ed52a524520bd38490c123242
|
3 |
+
size 1758795
|
evaluate/evaluate.py
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from tqdm import tqdm
import json
import csv
import os
import sys

# BUG FIX: this script is named evaluate.py, so a bare `import evaluate` run
# from this directory resolves to the script itself (note the committed
# __pycache__/evaluate.cpython-312.pyc) instead of the HuggingFace `evaluate`
# library, re-executing the whole pipeline recursively.  Import the real
# library with the script's own directory removed from sys.path.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
_saved_path = list(sys.path)
sys.path = [p for p in sys.path if os.path.abspath(p or os.getcwd()) != _SCRIPT_DIR]
try:
    import evaluate  # HuggingFace evaluation-metrics library
finally:
    sys.path = _saved_path

# ==== CONFIG ====
MODEL_PATH = "../train/output/qlora-codellama-bugfix"
EVAL_FILE = "eval.jsonl"
OUTPUT_JSON = "./output/eval_results.json"
OUTPUT_CSV = "./output/eval_results.csv"
MAX_INPUT_LEN = 1024      # prompt truncation length (tokens)
MAX_NEW_TOKENS = 256      # generation budget per example
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# ==== Ensure output folder exists ====
os.makedirs(os.path.dirname(OUTPUT_JSON), exist_ok=True)

# ==== Load model ====
print("🔄 Loading model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)
model.eval()

# ==== Load eval data ====
print("📂 Loading evaluation data...")
eval_data = load_dataset("json", data_files=EVAL_FILE, split="train")

# ==== Inference ====
results = []
print("⚙️ Running inference...")
for example in tqdm(eval_data):
    prompt = example["prompt"]
    reference = example["completion"]

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=MAX_INPUT_LEN).to(DEVICE)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=False,
            num_beams=4,
            # BUG FIX: Llama-family tokenizers often define no pad token;
            # fall back to EOS so beam-search generation does not fail.
            pad_token_id=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    # BUG FIX: decode only the newly generated tokens.  Decoding outputs[0]
    # wholesale prepends the prompt to every prediction, which distorts
    # BLEU/ROUGE against references that contain only the completion.
    prompt_len = inputs["input_ids"].shape[1]
    prediction = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

    results.append({
        "prompt": prompt,
        "reference": reference.strip(),
        "prediction": prediction.strip()
    })

# ==== Save results ====
with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2)
print(f"✅ Saved JSON to {OUTPUT_JSON}")

with open(OUTPUT_CSV, "w", encoding="utf-8", newline='') as f:
    writer = csv.DictWriter(f, fieldnames=["prompt", "reference", "prediction"])
    writer.writeheader()
    writer.writerows(results)
print(f"✅ Saved CSV to {OUTPUT_CSV}")

# ==== Compute Metrics ====
print("📊 Computing BLEU and ROUGE...")
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")

predictions = [r["prediction"] for r in results]
references = [r["reference"] for r in results]

bleu_score = bleu.compute(predictions=predictions, references=[[ref] for ref in references])
rouge_score = rouge.compute(predictions=predictions, references=references)

print("\n📈 Evaluation Results:")
print("BLEU:", bleu_score)
print("ROUGE:", json.dumps(rouge_score, indent=2))
|
evaluate/output/.eval_results.csv.swp
ADDED
Binary file (45.1 kB). View file
|
|
evaluate/output/eval_results.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:20a66ee1f8b006ceb278d3363c9337f23a4ffa7e49e54c379e409342cda874fd
|
3 |
+
size 2836184
|
evaluate/output/eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7cdf1b2c1317da8ff11b303547cedd6642d5762984e1952134a11cff4ff8120e
|
3 |
+
size 3078110
|
requirements.txt
ADDED
@@ -0,0 +1,182 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
absl-py==2.3.0
|
2 |
+
accelerate==1.8.1
|
3 |
+
aiohappyeyeballs==2.6.1
|
4 |
+
aiohttp==3.12.13
|
5 |
+
aiosignal==1.4.0
|
6 |
+
anyio==4.9.0
|
7 |
+
argon2-cffi==25.1.0
|
8 |
+
argon2-cffi-bindings==21.2.0
|
9 |
+
arrow==1.3.0
|
10 |
+
asttokens==3.0.0
|
11 |
+
async-lru==2.0.5
|
12 |
+
attrs==25.3.0
|
13 |
+
babel==2.17.0
|
14 |
+
bash_kernel==0.10.0
|
15 |
+
beautifulsoup4==4.13.4
|
16 |
+
bitsandbytes==0.46.1
|
17 |
+
bleach==6.2.0
|
18 |
+
blinker==1.7.0
|
19 |
+
certifi==2025.6.15
|
20 |
+
cffi==1.17.1
|
21 |
+
charset-normalizer==3.4.2
|
22 |
+
comm==0.2.2
|
23 |
+
conda-pack==0.8.1
|
24 |
+
cryptography==41.0.7
|
25 |
+
datasets==3.6.0
|
26 |
+
dbus-python==1.3.2
|
27 |
+
debugpy==1.8.14
|
28 |
+
decorator==5.2.1
|
29 |
+
defusedxml==0.7.1
|
30 |
+
dill==0.3.8
|
31 |
+
distro==1.9.0
|
32 |
+
evaluate==0.4.4
|
33 |
+
executing==2.2.0
|
34 |
+
fastjsonschema==2.21.1
|
35 |
+
filelock==3.13.1
|
36 |
+
filetype==1.2.0
|
37 |
+
fqdn==1.5.1
|
38 |
+
frozenlist==1.7.0
|
39 |
+
fsspec==2024.6.1
|
40 |
+
grpcio==1.73.1
|
41 |
+
h11==0.16.0
|
42 |
+
hf-xet==1.1.5
|
43 |
+
httpcore==1.0.9
|
44 |
+
httplib2==0.20.4
|
45 |
+
httpx==0.28.1
|
46 |
+
huggingface-hub==0.33.2
|
47 |
+
idna==3.10
|
48 |
+
iniconfig==2.1.0
|
49 |
+
iotop==0.6
|
50 |
+
ipykernel==6.29.5
|
51 |
+
ipython==9.3.0
|
52 |
+
ipython_pygments_lexers==1.1.1
|
53 |
+
ipywidgets==8.1.7
|
54 |
+
isoduration==20.11.0
|
55 |
+
jedi==0.19.2
|
56 |
+
Jinja2==3.1.6
|
57 |
+
json5==0.12.0
|
58 |
+
jsonpointer==3.0.0
|
59 |
+
jsonschema==4.24.0
|
60 |
+
jsonschema-specifications==2025.4.1
|
61 |
+
jupyter==1.1.1
|
62 |
+
jupyter-archive==3.4.0
|
63 |
+
jupyter-console==6.6.3
|
64 |
+
jupyter-events==0.12.0
|
65 |
+
jupyter-http-over-ws==0.0.8
|
66 |
+
jupyter-lsp==2.2.5
|
67 |
+
jupyter_client==8.6.3
|
68 |
+
jupyter_core==5.8.1
|
69 |
+
jupyter_server==2.16.0
|
70 |
+
jupyter_server_terminals==0.5.3
|
71 |
+
jupyterlab==4.4.4
|
72 |
+
jupyterlab_pygments==0.3.0
|
73 |
+
jupyterlab_server==2.27.3
|
74 |
+
jupyterlab_widgets==3.0.15
|
75 |
+
launchpadlib==1.11.0
|
76 |
+
lazr.restfulclient==0.14.6
|
77 |
+
lazr.uri==1.0.6
|
78 |
+
Markdown==3.8.2
|
79 |
+
MarkupSafe==3.0.2
|
80 |
+
matplotlib-inline==0.1.7
|
81 |
+
mistune==3.1.3
|
82 |
+
mpmath==1.3.0
|
83 |
+
multidict==6.6.3
|
84 |
+
multiprocess==0.70.16
|
85 |
+
nbclient==0.10.2
|
86 |
+
nbconvert==7.16.6
|
87 |
+
nbformat==5.10.4
|
88 |
+
nbzip==0.1.0
|
89 |
+
nest-asyncio==1.6.0
|
90 |
+
networkx==3.3
|
91 |
+
notebook==7.4.3
|
92 |
+
notebook_shim==0.2.4
|
93 |
+
numpy==2.3.1
|
94 |
+
nvidia-cublas-cu12==12.8.3.14
|
95 |
+
nvidia-cuda-cupti-cu12==12.8.57
|
96 |
+
nvidia-cuda-nvrtc-cu12==12.8.61
|
97 |
+
nvidia-cuda-runtime-cu12==12.8.57
|
98 |
+
nvidia-cudnn-cu12==9.7.1.26
|
99 |
+
nvidia-cufft-cu12==11.3.3.41
|
100 |
+
nvidia-cufile-cu12==1.13.0.11
|
101 |
+
nvidia-curand-cu12==10.3.9.55
|
102 |
+
nvidia-cusolver-cu12==11.7.2.55
|
103 |
+
nvidia-cusparse-cu12==12.5.7.53
|
104 |
+
nvidia-cusparselt-cu12==0.6.3
|
105 |
+
nvidia-nccl-cu12==2.26.2
|
106 |
+
nvidia-nvjitlink-cu12==12.8.61
|
107 |
+
nvidia-nvtx-cu12==12.8.55
|
108 |
+
oauthlib==3.2.2
|
109 |
+
overrides==7.7.0
|
110 |
+
packaging==25.0
|
111 |
+
pandas==2.3.1
|
112 |
+
pandocfilters==1.5.1
|
113 |
+
parso==0.8.4
|
114 |
+
peft==0.16.0
|
115 |
+
pexpect==4.9.0
|
116 |
+
pillow==11.0.0
|
117 |
+
platformdirs==4.3.8
|
118 |
+
pluggy==1.6.0
|
119 |
+
prometheus_client==0.22.1
|
120 |
+
prompt_toolkit==3.0.51
|
121 |
+
propcache==0.3.2
|
122 |
+
protobuf==6.31.1
|
123 |
+
psutil==7.0.0
|
124 |
+
ptyprocess==0.7.0
|
125 |
+
pure_eval==0.2.3
|
126 |
+
pyarrow==20.0.0
|
127 |
+
pycparser==2.22
|
128 |
+
Pygments==2.19.2
|
129 |
+
PyGObject==3.48.2
|
130 |
+
PyJWT==2.7.0
|
131 |
+
pyparsing==3.1.1
|
132 |
+
pytest==8.4.1
|
133 |
+
python-apt==2.7.7+ubuntu4
|
134 |
+
python-dateutil==2.9.0.post0
|
135 |
+
python-json-logger==3.3.0
|
136 |
+
pytz==2025.2
|
137 |
+
PyYAML==6.0.2
|
138 |
+
pyzmq==27.0.0
|
139 |
+
referencing==0.36.2
|
140 |
+
regex==2024.11.6
|
141 |
+
requests==2.32.4
|
142 |
+
rfc3339-validator==0.1.4
|
143 |
+
rfc3986-validator==0.1.1
|
144 |
+
rpds-py==0.25.1
|
145 |
+
safetensors==0.5.3
|
146 |
+
Send2Trash==1.8.3
|
147 |
+
setuptools==68.1.2
|
148 |
+
six==1.16.0
|
149 |
+
sniffio==1.3.1
|
150 |
+
soupsieve==2.7
|
151 |
+
stack-data==0.6.3
|
152 |
+
supervisor==4.2.5
|
153 |
+
sympy==1.13.3
|
154 |
+
tensorboard==2.19.0
|
155 |
+
tensorboard-data-server==0.7.2
|
156 |
+
terminado==0.18.1
|
157 |
+
tinycss2==1.4.0
|
158 |
+
tokenizers==0.21.2
|
159 |
+
torch==2.7.1+cu128
|
160 |
+
torchaudio==2.7.1+cu128
|
161 |
+
torchvision==0.22.1+cu128
|
162 |
+
tornado==6.5.1
|
163 |
+
tqdm==4.67.1
|
164 |
+
traitlets==5.14.3
|
165 |
+
transformers==4.53.1
|
166 |
+
triton==3.3.1
|
167 |
+
types-python-dateutil==2.9.0.20250516
|
168 |
+
typing_extensions==4.14.0
|
169 |
+
tzdata==2025.2
|
170 |
+
uri-template==1.3.0
|
171 |
+
urllib3==2.5.0
|
172 |
+
uv==0.7.16
|
173 |
+
wadllib==1.3.6
|
174 |
+
wcwidth==0.2.13
|
175 |
+
webcolors==24.11.1
|
176 |
+
webencodings==0.5.1
|
177 |
+
websocket-client==1.8.0
|
178 |
+
Werkzeug==3.1.3
|
179 |
+
wheel==0.42.0
|
180 |
+
widgetsnbextension==4.0.14
|
181 |
+
xxhash==3.5.0
|
182 |
+
yarl==1.20.1
|
train/download_model.py
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Pre-fetch the CodeLlama-7B-Instruct weights and tokenizer into the local HF cache."""
from transformers import AutoTokenizer, AutoModelForCausalLM

# Single source of truth for the checkpoint identifier on the Hugging Face Hub.
MODEL_ID = "codellama/CodeLlama-7b-Instruct-hf"

# Fetch the model first, then the tokenizer (same order as the original script);
# both calls simply populate the cache as a side effect.
model = AutoModelForCausalLM.from_pretrained(MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

print("✅ Download complete.")
|
train/output/qlora-codellama-bugfix/README.md
ADDED
@@ -0,0 +1,207 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
base_model: codellama/CodeLlama-7b-Instruct-hf
|
3 |
+
library_name: peft
|
4 |
+
pipeline_tag: text-generation
|
5 |
+
tags:
|
6 |
+
- base_model:adapter:codellama/CodeLlama-7b-Instruct-hf
|
7 |
+
- lora
|
8 |
+
- transformers
|
9 |
+
---
|
10 |
+
|
11 |
+
# Model Card for Model ID
|
12 |
+
|
13 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
14 |
+
|
15 |
+
|
16 |
+
|
17 |
+
## Model Details
|
18 |
+
|
19 |
+
### Model Description
|
20 |
+
|
21 |
+
<!-- Provide a longer summary of what this model is. -->
|
22 |
+
|
23 |
+
|
24 |
+
|
25 |
+
- **Developed by:** [More Information Needed]
|
26 |
+
- **Funded by [optional]:** [More Information Needed]
|
27 |
+
- **Shared by [optional]:** [More Information Needed]
|
28 |
+
- **Model type:** [More Information Needed]
|
29 |
+
- **Language(s) (NLP):** [More Information Needed]
|
30 |
+
- **License:** [More Information Needed]
|
31 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
32 |
+
|
33 |
+
### Model Sources [optional]
|
34 |
+
|
35 |
+
<!-- Provide the basic links for the model. -->
|
36 |
+
|
37 |
+
- **Repository:** [More Information Needed]
|
38 |
+
- **Paper [optional]:** [More Information Needed]
|
39 |
+
- **Demo [optional]:** [More Information Needed]
|
40 |
+
|
41 |
+
## Uses
|
42 |
+
|
43 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
44 |
+
|
45 |
+
### Direct Use
|
46 |
+
|
47 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
48 |
+
|
49 |
+
[More Information Needed]
|
50 |
+
|
51 |
+
### Downstream Use [optional]
|
52 |
+
|
53 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
54 |
+
|
55 |
+
[More Information Needed]
|
56 |
+
|
57 |
+
### Out-of-Scope Use
|
58 |
+
|
59 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
60 |
+
|
61 |
+
[More Information Needed]
|
62 |
+
|
63 |
+
## Bias, Risks, and Limitations
|
64 |
+
|
65 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
66 |
+
|
67 |
+
[More Information Needed]
|
68 |
+
|
69 |
+
### Recommendations
|
70 |
+
|
71 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
72 |
+
|
73 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
74 |
+
|
75 |
+
## How to Get Started with the Model
|
76 |
+
|
77 |
+
Use the code below to get started with the model.
|
78 |
+
|
79 |
+
[More Information Needed]
|
80 |
+
|
81 |
+
## Training Details
|
82 |
+
|
83 |
+
### Training Data
|
84 |
+
|
85 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
86 |
+
|
87 |
+
[More Information Needed]
|
88 |
+
|
89 |
+
### Training Procedure
|
90 |
+
|
91 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
92 |
+
|
93 |
+
#### Preprocessing [optional]
|
94 |
+
|
95 |
+
[More Information Needed]
|
96 |
+
|
97 |
+
|
98 |
+
#### Training Hyperparameters
|
99 |
+
|
100 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
101 |
+
|
102 |
+
#### Speeds, Sizes, Times [optional]
|
103 |
+
|
104 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
105 |
+
|
106 |
+
[More Information Needed]
|
107 |
+
|
108 |
+
## Evaluation
|
109 |
+
|
110 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
111 |
+
|
112 |
+
### Testing Data, Factors & Metrics
|
113 |
+
|
114 |
+
#### Testing Data
|
115 |
+
|
116 |
+
<!-- This should link to a Dataset Card if possible. -->
|
117 |
+
|
118 |
+
[More Information Needed]
|
119 |
+
|
120 |
+
#### Factors
|
121 |
+
|
122 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
123 |
+
|
124 |
+
[More Information Needed]
|
125 |
+
|
126 |
+
#### Metrics
|
127 |
+
|
128 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
129 |
+
|
130 |
+
[More Information Needed]
|
131 |
+
|
132 |
+
### Results
|
133 |
+
|
134 |
+
[More Information Needed]
|
135 |
+
|
136 |
+
#### Summary
|
137 |
+
|
138 |
+
|
139 |
+
|
140 |
+
## Model Examination [optional]
|
141 |
+
|
142 |
+
<!-- Relevant interpretability work for the model goes here -->
|
143 |
+
|
144 |
+
[More Information Needed]
|
145 |
+
|
146 |
+
## Environmental Impact
|
147 |
+
|
148 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
149 |
+
|
150 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
151 |
+
|
152 |
+
- **Hardware Type:** [More Information Needed]
|
153 |
+
- **Hours used:** [More Information Needed]
|
154 |
+
- **Cloud Provider:** [More Information Needed]
|
155 |
+
- **Compute Region:** [More Information Needed]
|
156 |
+
- **Carbon Emitted:** [More Information Needed]
|
157 |
+
|
158 |
+
## Technical Specifications [optional]
|
159 |
+
|
160 |
+
### Model Architecture and Objective
|
161 |
+
|
162 |
+
[More Information Needed]
|
163 |
+
|
164 |
+
### Compute Infrastructure
|
165 |
+
|
166 |
+
[More Information Needed]
|
167 |
+
|
168 |
+
#### Hardware
|
169 |
+
|
170 |
+
[More Information Needed]
|
171 |
+
|
172 |
+
#### Software
|
173 |
+
|
174 |
+
[More Information Needed]
|
175 |
+
|
176 |
+
## Citation [optional]
|
177 |
+
|
178 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
179 |
+
|
180 |
+
**BibTeX:**
|
181 |
+
|
182 |
+
[More Information Needed]
|
183 |
+
|
184 |
+
**APA:**
|
185 |
+
|
186 |
+
[More Information Needed]
|
187 |
+
|
188 |
+
## Glossary [optional]
|
189 |
+
|
190 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
191 |
+
|
192 |
+
[More Information Needed]
|
193 |
+
|
194 |
+
## More Information [optional]
|
195 |
+
|
196 |
+
[More Information Needed]
|
197 |
+
|
198 |
+
## Model Card Authors [optional]
|
199 |
+
|
200 |
+
[More Information Needed]
|
201 |
+
|
202 |
+
## Model Card Contact
|
203 |
+
|
204 |
+
[More Information Needed]
|
205 |
+
### Framework versions
|
206 |
+
|
207 |
+
- PEFT 0.16.0
|
train/output/qlora-codellama-bugfix/adapter_config.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:df755384ffd5359513ac7a725ba0c4d3104e6b5c131f98aee0b1cb7d160c17c0
|
3 |
+
size 839
|
train/output/qlora-codellama-bugfix/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:73618dfb9cd2422a1a6600a5cec7fc1178bfe579e0c77d827761f26eeaae7f09
|
3 |
+
size 134235048
|
train/output/qlora-codellama-bugfix/chat_template.jinja
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\n' + system_message + '\n<</SYS>>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}
|
train/output/qlora-codellama-bugfix/checkpoint-1000/README.md
ADDED
@@ -0,0 +1,207 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
base_model: codellama/CodeLLaMA-7b-Instruct-hf
|
3 |
+
library_name: peft
|
4 |
+
pipeline_tag: text-generation
|
5 |
+
tags:
|
6 |
+
- base_model:adapter:codellama/CodeLLaMA-7b-Instruct-hf
|
7 |
+
- lora
|
8 |
+
- transformers
|
9 |
+
---
|
10 |
+
|
11 |
+
# Model Card for Model ID
|
12 |
+
|
13 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
14 |
+
|
15 |
+
|
16 |
+
|
17 |
+
## Model Details
|
18 |
+
|
19 |
+
### Model Description
|
20 |
+
|
21 |
+
<!-- Provide a longer summary of what this model is. -->
|
22 |
+
|
23 |
+
|
24 |
+
|
25 |
+
- **Developed by:** [More Information Needed]
|
26 |
+
- **Funded by [optional]:** [More Information Needed]
|
27 |
+
- **Shared by [optional]:** [More Information Needed]
|
28 |
+
- **Model type:** [More Information Needed]
|
29 |
+
- **Language(s) (NLP):** [More Information Needed]
|
30 |
+
- **License:** [More Information Needed]
|
31 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
32 |
+
|
33 |
+
### Model Sources [optional]
|
34 |
+
|
35 |
+
<!-- Provide the basic links for the model. -->
|
36 |
+
|
37 |
+
- **Repository:** [More Information Needed]
|
38 |
+
- **Paper [optional]:** [More Information Needed]
|
39 |
+
- **Demo [optional]:** [More Information Needed]
|
40 |
+
|
41 |
+
## Uses
|
42 |
+
|
43 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
44 |
+
|
45 |
+
### Direct Use
|
46 |
+
|
47 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
48 |
+
|
49 |
+
[More Information Needed]
|
50 |
+
|
51 |
+
### Downstream Use [optional]
|
52 |
+
|
53 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
54 |
+
|
55 |
+
[More Information Needed]
|
56 |
+
|
57 |
+
### Out-of-Scope Use
|
58 |
+
|
59 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
60 |
+
|
61 |
+
[More Information Needed]
|
62 |
+
|
63 |
+
## Bias, Risks, and Limitations
|
64 |
+
|
65 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
66 |
+
|
67 |
+
[More Information Needed]
|
68 |
+
|
69 |
+
### Recommendations
|
70 |
+
|
71 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
72 |
+
|
73 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
74 |
+
|
75 |
+
## How to Get Started with the Model
|
76 |
+
|
77 |
+
Use the code below to get started with the model.
|
78 |
+
|
79 |
+
[More Information Needed]
|
80 |
+
|
81 |
+
## Training Details
|
82 |
+
|
83 |
+
### Training Data
|
84 |
+
|
85 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
86 |
+
|
87 |
+
[More Information Needed]
|
88 |
+
|
89 |
+
### Training Procedure
|
90 |
+
|
91 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
92 |
+
|
93 |
+
#### Preprocessing [optional]
|
94 |
+
|
95 |
+
[More Information Needed]
|
96 |
+
|
97 |
+
|
98 |
+
#### Training Hyperparameters
|
99 |
+
|
100 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
101 |
+
|
102 |
+
#### Speeds, Sizes, Times [optional]
|
103 |
+
|
104 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
105 |
+
|
106 |
+
[More Information Needed]
|
107 |
+
|
108 |
+
## Evaluation
|
109 |
+
|
110 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
111 |
+
|
112 |
+
### Testing Data, Factors & Metrics
|
113 |
+
|
114 |
+
#### Testing Data
|
115 |
+
|
116 |
+
<!-- This should link to a Dataset Card if possible. -->
|
117 |
+
|
118 |
+
[More Information Needed]
|
119 |
+
|
120 |
+
#### Factors
|
121 |
+
|
122 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
123 |
+
|
124 |
+
[More Information Needed]
|
125 |
+
|
126 |
+
#### Metrics
|
127 |
+
|
128 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
129 |
+
|
130 |
+
[More Information Needed]
|
131 |
+
|
132 |
+
### Results
|
133 |
+
|
134 |
+
[More Information Needed]
|
135 |
+
|
136 |
+
#### Summary
|
137 |
+
|
138 |
+
|
139 |
+
|
140 |
+
## Model Examination [optional]
|
141 |
+
|
142 |
+
<!-- Relevant interpretability work for the model goes here -->
|
143 |
+
|
144 |
+
[More Information Needed]
|
145 |
+
|
146 |
+
## Environmental Impact
|
147 |
+
|
148 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
149 |
+
|
150 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
151 |
+
|
152 |
+
- **Hardware Type:** [More Information Needed]
|
153 |
+
- **Hours used:** [More Information Needed]
|
154 |
+
- **Cloud Provider:** [More Information Needed]
|
155 |
+
- **Compute Region:** [More Information Needed]
|
156 |
+
- **Carbon Emitted:** [More Information Needed]
|
157 |
+
|
158 |
+
## Technical Specifications [optional]
|
159 |
+
|
160 |
+
### Model Architecture and Objective
|
161 |
+
|
162 |
+
[More Information Needed]
|
163 |
+
|
164 |
+
### Compute Infrastructure
|
165 |
+
|
166 |
+
[More Information Needed]
|
167 |
+
|
168 |
+
#### Hardware
|
169 |
+
|
170 |
+
[More Information Needed]
|
171 |
+
|
172 |
+
#### Software
|
173 |
+
|
174 |
+
[More Information Needed]
|
175 |
+
|
176 |
+
## Citation [optional]
|
177 |
+
|
178 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
179 |
+
|
180 |
+
**BibTeX:**
|
181 |
+
|
182 |
+
[More Information Needed]
|
183 |
+
|
184 |
+
**APA:**
|
185 |
+
|
186 |
+
[More Information Needed]
|
187 |
+
|
188 |
+
## Glossary [optional]
|
189 |
+
|
190 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
191 |
+
|
192 |
+
[More Information Needed]
|
193 |
+
|
194 |
+
## More Information [optional]
|
195 |
+
|
196 |
+
[More Information Needed]
|
197 |
+
|
198 |
+
## Model Card Authors [optional]
|
199 |
+
|
200 |
+
[More Information Needed]
|
201 |
+
|
202 |
+
## Model Card Contact
|
203 |
+
|
204 |
+
[More Information Needed]
|
205 |
+
### Framework versions
|
206 |
+
|
207 |
+
- PEFT 0.16.0
|
train/output/qlora-codellama-bugfix/checkpoint-1000/adapter_config.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1964c3b05fe32e3371262e1acac8e947ef8f8a52431e37716f780afe50727087
|
3 |
+
size 839
|
train/output/qlora-codellama-bugfix/checkpoint-1000/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:84064653aacb1aa638242fa35d96415845184e3f97cec4da4bc9412385f7aff9
|
3 |
+
size 134235048
|
train/output/qlora-codellama-bugfix/checkpoint-1000/chat_template.jinja
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\n' + system_message + '\n<</SYS>>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}
|
train/output/qlora-codellama-bugfix/checkpoint-1000/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3e69ea177eb91af1890ec93a594282aef22499e4743b53c958f27820bc33d28e
|
3 |
+
size 268544075
|
train/output/qlora-codellama-bugfix/checkpoint-1000/rng_state.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a8ecd469a8b4c7959941512dc268004dd3111ea9235d256aa9638ce00699c0f2
|
3 |
+
size 14645
|
train/output/qlora-codellama-bugfix/checkpoint-1000/scheduler.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:cee0966af6e7296ad683b3f293ade83fe035fb7007b575377ad7775d13ec8b46
|
3 |
+
size 1465
|
train/output/qlora-codellama-bugfix/checkpoint-1000/special_tokens_map.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8f32ca2dd65336aceb9e359886f34f214330137c9e801e8f41cf542287fc2630
|
3 |
+
size 538
|
train/output/qlora-codellama-bugfix/checkpoint-1000/tokenizer.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ef304ea6a0a92c4275c959dd6e1a896bca7e36af0adb2ce25563e7c11bf3927f
|
3 |
+
size 3620829
|
train/output/qlora-codellama-bugfix/checkpoint-1000/tokenizer_config.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0ee02c4a9abf3f4b4fb350d0991f5313bd354c5dd76cabedaacc9e3e1c088e1a
|
3 |
+
size 1869
|
train/output/qlora-codellama-bugfix/checkpoint-1000/trainer_state.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e448dea2040f629a632432c297857131d1b3ac2ccdb25548b1cb4fe4ec5d779f
|
3 |
+
size 4295
|
train/output/qlora-codellama-bugfix/checkpoint-1000/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7cca9708d68b5d07e91505e97d7945c08ed5c660e02fce750706e45d57565144
|
3 |
+
size 5777
|
train/output/qlora-codellama-bugfix/checkpoint-500/README.md
ADDED
@@ -0,0 +1,207 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
base_model: codellama/CodeLLaMA-7b-Instruct-hf
|
3 |
+
library_name: peft
|
4 |
+
pipeline_tag: text-generation
|
5 |
+
tags:
|
6 |
+
- base_model:adapter:codellama/CodeLLaMA-7b-Instruct-hf
|
7 |
+
- lora
|
8 |
+
- transformers
|
9 |
+
---
|
10 |
+
|
11 |
+
# Model Card for Model ID
|
12 |
+
|
13 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
14 |
+
|
15 |
+
|
16 |
+
|
17 |
+
## Model Details
|
18 |
+
|
19 |
+
### Model Description
|
20 |
+
|
21 |
+
<!-- Provide a longer summary of what this model is. -->
|
22 |
+
|
23 |
+
|
24 |
+
|
25 |
+
- **Developed by:** [More Information Needed]
|
26 |
+
- **Funded by [optional]:** [More Information Needed]
|
27 |
+
- **Shared by [optional]:** [More Information Needed]
|
28 |
+
- **Model type:** [More Information Needed]
|
29 |
+
- **Language(s) (NLP):** [More Information Needed]
|
30 |
+
- **License:** [More Information Needed]
|
31 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
32 |
+
|
33 |
+
### Model Sources [optional]
|
34 |
+
|
35 |
+
<!-- Provide the basic links for the model. -->
|
36 |
+
|
37 |
+
- **Repository:** [More Information Needed]
|
38 |
+
- **Paper [optional]:** [More Information Needed]
|
39 |
+
- **Demo [optional]:** [More Information Needed]
|
40 |
+
|
41 |
+
## Uses
|
42 |
+
|
43 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
44 |
+
|
45 |
+
### Direct Use
|
46 |
+
|
47 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
48 |
+
|
49 |
+
[More Information Needed]
|
50 |
+
|
51 |
+
### Downstream Use [optional]
|
52 |
+
|
53 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
54 |
+
|
55 |
+
[More Information Needed]
|
56 |
+
|
57 |
+
### Out-of-Scope Use
|
58 |
+
|
59 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
60 |
+
|
61 |
+
[More Information Needed]
|
62 |
+
|
63 |
+
## Bias, Risks, and Limitations
|
64 |
+
|
65 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
66 |
+
|
67 |
+
[More Information Needed]
|
68 |
+
|
69 |
+
### Recommendations
|
70 |
+
|
71 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
72 |
+
|
73 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
74 |
+
|
75 |
+
## How to Get Started with the Model
|
76 |
+
|
77 |
+
Use the code below to get started with the model.
|
78 |
+
|
79 |
+
[More Information Needed]
|
80 |
+
|
81 |
+
## Training Details
|
82 |
+
|
83 |
+
### Training Data
|
84 |
+
|
85 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
86 |
+
|
87 |
+
[More Information Needed]
|
88 |
+
|
89 |
+
### Training Procedure
|
90 |
+
|
91 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
92 |
+
|
93 |
+
#### Preprocessing [optional]
|
94 |
+
|
95 |
+
[More Information Needed]
|
96 |
+
|
97 |
+
|
98 |
+
#### Training Hyperparameters
|
99 |
+
|
100 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
101 |
+
|
102 |
+
#### Speeds, Sizes, Times [optional]
|
103 |
+
|
104 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
105 |
+
|
106 |
+
[More Information Needed]
|
107 |
+
|
108 |
+
## Evaluation
|
109 |
+
|
110 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
111 |
+
|
112 |
+
### Testing Data, Factors & Metrics
|
113 |
+
|
114 |
+
#### Testing Data
|
115 |
+
|
116 |
+
<!-- This should link to a Dataset Card if possible. -->
|
117 |
+
|
118 |
+
[More Information Needed]
|
119 |
+
|
120 |
+
#### Factors
|
121 |
+
|
122 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
123 |
+
|
124 |
+
[More Information Needed]
|
125 |
+
|
126 |
+
#### Metrics
|
127 |
+
|
128 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
129 |
+
|
130 |
+
[More Information Needed]
|
131 |
+
|
132 |
+
### Results
|
133 |
+
|
134 |
+
[More Information Needed]
|
135 |
+
|
136 |
+
#### Summary
|
137 |
+
|
138 |
+
|
139 |
+
|
140 |
+
## Model Examination [optional]
|
141 |
+
|
142 |
+
<!-- Relevant interpretability work for the model goes here -->
|
143 |
+
|
144 |
+
[More Information Needed]
|
145 |
+
|
146 |
+
## Environmental Impact
|
147 |
+
|
148 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
149 |
+
|
150 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
151 |
+
|
152 |
+
- **Hardware Type:** [More Information Needed]
|
153 |
+
- **Hours used:** [More Information Needed]
|
154 |
+
- **Cloud Provider:** [More Information Needed]
|
155 |
+
- **Compute Region:** [More Information Needed]
|
156 |
+
- **Carbon Emitted:** [More Information Needed]
|
157 |
+
|
158 |
+
## Technical Specifications [optional]
|
159 |
+
|
160 |
+
### Model Architecture and Objective
|
161 |
+
|
162 |
+
[More Information Needed]
|
163 |
+
|
164 |
+
### Compute Infrastructure
|
165 |
+
|
166 |
+
[More Information Needed]
|
167 |
+
|
168 |
+
#### Hardware
|
169 |
+
|
170 |
+
[More Information Needed]
|
171 |
+
|
172 |
+
#### Software
|
173 |
+
|
174 |
+
[More Information Needed]
|
175 |
+
|
176 |
+
## Citation [optional]
|
177 |
+
|
178 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
179 |
+
|
180 |
+
**BibTeX:**
|
181 |
+
|
182 |
+
[More Information Needed]
|
183 |
+
|
184 |
+
**APA:**
|
185 |
+
|
186 |
+
[More Information Needed]
|
187 |
+
|
188 |
+
## Glossary [optional]
|
189 |
+
|
190 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
191 |
+
|
192 |
+
[More Information Needed]
|
193 |
+
|
194 |
+
## More Information [optional]
|
195 |
+
|
196 |
+
[More Information Needed]
|
197 |
+
|
198 |
+
## Model Card Authors [optional]
|
199 |
+
|
200 |
+
[More Information Needed]
|
201 |
+
|
202 |
+
## Model Card Contact
|
203 |
+
|
204 |
+
[More Information Needed]
|
205 |
+
### Framework versions
|
206 |
+
|
207 |
+
- PEFT 0.16.0
|
train/output/qlora-codellama-bugfix/checkpoint-500/adapter_config.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1964c3b05fe32e3371262e1acac8e947ef8f8a52431e37716f780afe50727087
|
3 |
+
size 839
|
train/output/qlora-codellama-bugfix/checkpoint-500/adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ddc3978f5ac8ec8f6c879f16e4deab2f4955796fc45daaf9fd7063d6b270b027
|
3 |
+
size 134235048
|
train/output/qlora-codellama-bugfix/checkpoint-500/chat_template.jinja
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\n' + system_message + '\n<</SYS>>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}
|
train/output/qlora-codellama-bugfix/checkpoint-500/optimizer.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1e0ede750ff79b1bb5285ab51d0e4b03eeec5f4673ac149f1e973b6d20ad8e91
|
3 |
+
size 268544075
|
train/output/qlora-codellama-bugfix/checkpoint-500/rng_state.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:873492f033797d17bd144ec6ef43f5efb66933cf27e0c60ddd8d799f5d1f12c1
|
3 |
+
size 14645
|
train/output/qlora-codellama-bugfix/checkpoint-500/scheduler.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b1492973eb5cd9c3e10c1c7ee7c4d19bcd22481640c90b6278d1a21419300ba9
|
3 |
+
size 1465
|
train/output/qlora-codellama-bugfix/checkpoint-500/special_tokens_map.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8f32ca2dd65336aceb9e359886f34f214330137c9e801e8f41cf542287fc2630
|
3 |
+
size 538
|
train/output/qlora-codellama-bugfix/checkpoint-500/tokenizer.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ef304ea6a0a92c4275c959dd6e1a896bca7e36af0adb2ce25563e7c11bf3927f
|
3 |
+
size 3620829
|
train/output/qlora-codellama-bugfix/checkpoint-500/tokenizer_config.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0ee02c4a9abf3f4b4fb350d0991f5313bd354c5dd76cabedaacc9e3e1c088e1a
|
3 |
+
size 1869
|
train/output/qlora-codellama-bugfix/checkpoint-500/trainer_state.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:22d7fb0a27cf19e9d2091a0f3a62b86790ee0448da788f1599a6b1e4e2a83e84
|
3 |
+
size 2543
|
train/output/qlora-codellama-bugfix/checkpoint-500/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7cca9708d68b5d07e91505e97d7945c08ed5c660e02fce750706e45d57565144
|
3 |
+
size 5777
|
train/output/qlora-codellama-bugfix/special_tokens_map.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8f32ca2dd65336aceb9e359886f34f214330137c9e801e8f41cf542287fc2630
|
3 |
+
size 538
|
train/output/qlora-codellama-bugfix/tokenizer.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ef304ea6a0a92c4275c959dd6e1a896bca7e36af0adb2ce25563e7c11bf3927f
|
3 |
+
size 3620829
|
train/output/qlora-codellama-bugfix/tokenizer_config.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0ee02c4a9abf3f4b4fb350d0991f5313bd354c5dd76cabedaacc9e3e1c088e1a
|
3 |
+
size 1869
|
train/train.py
ADDED
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# QLoRA fine-tuning for CodeLlama-7B-Instruct on 1x H200.
# Requirements: transformers, peft, accelerate, bitsandbytes, datasets
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
    DataCollatorForSeq2Seq,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset
import glob
import torch
import os
import wandb

os.environ["WANDB_PROJECT"] = "codellama-7b-instruct-qlora-linux-bugfix"
os.environ["WANDB_NAME"] = "run-v1"

# Paths and model.
# NOTE: HF Hub repo ids are case-sensitive; the canonical id is
# "codellama/CodeLlama-7b-Instruct-hf" (the previous "CodeLLaMA" casing
# does not resolve).  This also matches train_codellama_qlora.py.
BASE_MODEL = "codellama/CodeLlama-7b-Instruct-hf"
DATA_PATH = "../dataset/linux_bugfix_100k.jsonl"
OUTPUT_DIR = "./output/qlora-codellama-bugfix"

# Load dataset (prompt-completion format).
dataset = load_dataset("json", data_files=DATA_PATH, split="train")

# BitsAndBytes config for QLoRA: 4-bit NF4 with double quantization.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,  # optimized for H100/H200
)

# Load tokenizer and model.  LLaMA tokenizers ship without a pad token,
# so reuse EOS; right padding keeps causal-LM label alignment intact.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=bnb_config,
    device_map="auto",
)
model = prepare_model_for_kbit_training(model)
model.gradient_checkpointing_enable()
torch.backends.cuda.matmul.allow_tf32 = True

# Apply QLoRA (LoRA config).
lora_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.config.use_cache = False  # incompatible with gradient checkpointing
model.config.return_dict = True
model.config.pad_token_id = tokenizer.pad_token_id
model.print_trainable_parameters()


def format_example(example):
    """Tokenize one prompt/completion pair for supervised fine-tuning.

    Prompt tokens are masked with -100 so loss is computed only on the
    completion.  Renamed from ``format`` to avoid shadowing the builtin.

    Args:
        example: dict with string fields "prompt" and "completion".

    Returns:
        dict with "input_ids" and "labels", truncated to the tokenizer's
        maximum context length.
    """
    prompt_ids = tokenizer(example["prompt"], truncation=True, max_length=1024)["input_ids"]
    completion_ids = tokenizer(example["completion"], truncation=True, max_length=512)["input_ids"]

    input_ids = prompt_ids + completion_ids
    labels = [-100] * len(prompt_ids) + completion_ids

    # Truncate both sequences identically so they stay the same length.
    max_len = min(len(input_ids), tokenizer.model_max_length)
    return {
        "input_ids": input_ids[:max_len],
        "labels": labels[:max_len],
    }


# Sanity check: run one example through the model and confirm the loss
# participates in autograd before committing to a full map() pass.
print("__ Sanity checking one example...")
sample = format_example(dataset[0])
test_input = torch.tensor(sample["input_ids"]).unsqueeze(0).to(model.device)
test_labels = torch.tensor(sample["labels"]).unsqueeze(0).to(model.device)
model.train()
out = model(input_ids=test_input, labels=test_labels)
assert out.loss.requires_grad, "Sanity check failed: Loss does not require grad."
print("__ Sanity check passed. Proceeding to map()...")

# Apply formatting to the entire dataset.
dataset = dataset.map(format_example, remove_columns=["prompt", "completion"])
collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, return_tensors="pt", pad_to_multiple_of=8)

# Training arguments.
training_args = TrainingArguments(
    report_to="wandb",
    run_name="codellama-7b-instruct-qlora-linux-bugfix",
    logging_dir=f"{OUTPUT_DIR}/logs",

    output_dir=OUTPUT_DIR,
    num_train_epochs=3,
    per_device_train_batch_size=64,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    gradient_checkpointing=True,
    bf16=True,  # Important for H200
    fp16=False,
    max_grad_norm=1.0,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    logging_steps=50,
    push_to_hub=False,
    label_names=["labels"],
    remove_unused_columns=False,  # Critical to prevent data loss
)

# Trainer setup.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
    data_collator=collator,
)


# Begin training.  resume_from_checkpoint=True raises ValueError when no
# checkpoint exists yet, so only resume if one is actually present.
print(f"Track this run in Weights & Biases: https://wandb.ai/{os.environ['WANDB_PROJECT']}/{os.environ['WANDB_NAME']}")
has_checkpoint = bool(glob.glob(os.path.join(OUTPUT_DIR, "checkpoint-*")))
trainer.train(resume_from_checkpoint=has_checkpoint)


# Save the final adapter and tokenizer.
model.save_pretrained(OUTPUT_DIR, safe_serialization=True)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"[DONE] Model saved to {OUTPUT_DIR}")
|
train/train_codellama_qlora.py
ADDED
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# QLoRA fine-tuning for CodeLlama-7B-Instruct
# Requirements: transformers, peft, accelerate, bitsandbytes, datasets

from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset
import torch
import os

# Paths and parameters
BASE_MODEL = "codellama/CodeLlama-7b-Instruct-hf"
DATA_PATH = "../dataset_builder/output/linux_bugfix_prompt_completion.jsonl"
OUTPUT_DIR = "./output/qlora-codellama-bugfix"

# Load dataset (prompt, completion)
dataset = load_dataset("json", data_files=DATA_PATH, split="train")


def format_example(example):
    """Tokenize one (prompt, completion) pair for supervised fine-tuning.

    Fixes vs. the previous version:
      * No ``padding="max_length"``: padding both halves inserted pad
        (=EOS) tokens *between* prompt and completion, and the unmasked
        completion padding was trained on as if it were real text.
      * Prompt tokens are masked with -100 so loss covers the completion
        only; labels and input_ids are truncated identically.
    With per_device_train_batch_size=1 no cross-example padding is
    required, so the default collator can batch variable lengths.
    """
    prompt_ids = tokenizer(example["prompt"], truncation=True, max_length=512)["input_ids"]
    completion_ids = tokenizer(example["completion"], truncation=True, max_length=512)["input_ids"]

    input_ids = prompt_ids + completion_ids
    labels = [-100] * len(prompt_ids) + completion_ids

    return {
        "input_ids": input_ids[:1024],
        "labels": labels[:1024],
    }


# Load tokenizer and base model
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token  # Required for padding

model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, quantization_config=bnb_config, device_map="auto")
# Cast/normalize layers for stable 4-bit training (matches train.py).
model = prepare_model_for_kbit_training(model)

# Apply QLoRA
lora_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, lora_config)

# Tokenize dataset
dataset = dataset.map(format_example, remove_columns=["prompt", "completion"])

# Training args
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    logging_dir=f"{OUTPUT_DIR}/logs",
    logging_steps=10,
    save_strategy="epoch",
    bf16=False,
    fp16=True,
    save_total_limit=2,
    report_to="none",
    push_to_hub=False,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
)

trainer.train()

# Save the final adapter and tokenizer.
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"[DONE] Model saved to {OUTPUT_DIR}")
|