Commit ed6b901 · Mac Huang committed
1 parent: 43864c1

Push full project including dataset, code, and training scripts

Files changed (47)
  1. PROJECT_STRUCTURE.md +228 -0
  2. dataset/linux_bugfix_100k.jsonl +3 -0
  3. dataset/linux_bugfix_prompt_completion.jsonl +3 -0
  4. dataset_builder/build_dataset.py +160 -0
  5. dataset_builder/build_dataset_demo.py +157 -0
  6. dataset_builder/convert_to_prompt_completion.py +41 -0
  7. evaluate/__pycache__/evaluate.cpython-312.pyc +0 -0
  8. evaluate/eval.jsonl +3 -0
  9. evaluate/evaluate.py +87 -0
  10. evaluate/output/.eval_results.csv.swp +0 -0
  11. evaluate/output/eval_results.csv +3 -0
  12. evaluate/output/eval_results.json +3 -0
  13. requirements.txt +182 -0
  14. train/download_model.py +6 -0
  15. train/output/qlora-codellama-bugfix/README.md +207 -0
  16. train/output/qlora-codellama-bugfix/adapter_config.json +3 -0
  17. train/output/qlora-codellama-bugfix/adapter_model.safetensors +3 -0
  18. train/output/qlora-codellama-bugfix/chat_template.jinja +1 -0
  19. train/output/qlora-codellama-bugfix/checkpoint-1000/README.md +207 -0
  20. train/output/qlora-codellama-bugfix/checkpoint-1000/adapter_config.json +3 -0
  21. train/output/qlora-codellama-bugfix/checkpoint-1000/adapter_model.safetensors +3 -0
  22. train/output/qlora-codellama-bugfix/checkpoint-1000/chat_template.jinja +1 -0
  23. train/output/qlora-codellama-bugfix/checkpoint-1000/optimizer.pt +3 -0
  24. train/output/qlora-codellama-bugfix/checkpoint-1000/rng_state.pth +3 -0
  25. train/output/qlora-codellama-bugfix/checkpoint-1000/scheduler.pt +3 -0
  26. train/output/qlora-codellama-bugfix/checkpoint-1000/special_tokens_map.json +3 -0
  27. train/output/qlora-codellama-bugfix/checkpoint-1000/tokenizer.json +3 -0
  28. train/output/qlora-codellama-bugfix/checkpoint-1000/tokenizer_config.json +3 -0
  29. train/output/qlora-codellama-bugfix/checkpoint-1000/trainer_state.json +3 -0
  30. train/output/qlora-codellama-bugfix/checkpoint-1000/training_args.bin +3 -0
  31. train/output/qlora-codellama-bugfix/checkpoint-500/README.md +207 -0
  32. train/output/qlora-codellama-bugfix/checkpoint-500/adapter_config.json +3 -0
  33. train/output/qlora-codellama-bugfix/checkpoint-500/adapter_model.safetensors +3 -0
  34. train/output/qlora-codellama-bugfix/checkpoint-500/chat_template.jinja +1 -0
  35. train/output/qlora-codellama-bugfix/checkpoint-500/optimizer.pt +3 -0
  36. train/output/qlora-codellama-bugfix/checkpoint-500/rng_state.pth +3 -0
  37. train/output/qlora-codellama-bugfix/checkpoint-500/scheduler.pt +3 -0
  38. train/output/qlora-codellama-bugfix/checkpoint-500/special_tokens_map.json +3 -0
  39. train/output/qlora-codellama-bugfix/checkpoint-500/tokenizer.json +3 -0
  40. train/output/qlora-codellama-bugfix/checkpoint-500/tokenizer_config.json +3 -0
  41. train/output/qlora-codellama-bugfix/checkpoint-500/trainer_state.json +3 -0
  42. train/output/qlora-codellama-bugfix/checkpoint-500/training_args.bin +3 -0
  43. train/output/qlora-codellama-bugfix/special_tokens_map.json +3 -0
  44. train/output/qlora-codellama-bugfix/tokenizer.json +3 -0
  45. train/output/qlora-codellama-bugfix/tokenizer_config.json +3 -0
  46. train/train.py +143 -0
  47. train/train_codellama_qlora.py +96 -0
PROJECT_STRUCTURE.md ADDED
@@ -0,0 +1,228 @@
+ # Linux Kernel Anti-Pattern Detector - Project Structure
+
+ ## Overview
+
+ This project is organized into a clear, maintainable structure that separates concerns and makes it easy to find, modify, and extend functionality.
+
+ ## Directory Structure
+
+ ```
+ Linux Kernel Anti-Pattern Detector/
+ ├── 📁 data/                                # Analysis data and results
+ │   ├── results.json                        # Main analysis results
+ │   ├── concurrency_analysis_report.json
+ │   └── kernel_analysis.log                 # Analysis logs
+
+ ├── 📁 docs/                                # Documentation
+ │   ├── kernel-analysis-guide.md            # Kernel analysis documentation
+ │   └── [additional documentation]
+
+ ├── 📁 examples/                            # Example code and usage
+ │   └── [example files]
+
+ ├── 📁 reports/                             # Generated analysis reports
+ │   ├── Linux_Kernel_Anti_Pattern_Analysis_Report.md
+ │   ├── Executive_Summary.md
+ │   └── 📁 concurrency/                     # Concurrency-specific reports
+ │       └── Concurrency_Analysis_Report.md
+
+ ├── 📁 scripts/                             # Analysis and utility scripts
+ │   ├── 📁 analysis/                        # Core analysis scripts
+ │   │   ├── concurrency_analyzer.py         # Concurrency issue analyzer
+ │   │   └── analyze_kernel_structure.py
+ │   ├── 📁 reporting/                       # Report generation scripts
+ │   │   └── view_results.py                 # Results viewer
+ │   └── 📁 utils/                           # Utility scripts
+ │       └── quick_summary.py                # Quick summary generator
+
+ ├── 📁 src/                                 # Source code (main project)
+ │   ├── __init__.py
+ │   ├── 📁 detectors/                       # Anti-pattern detection modules
+ │   ├── 📁 rules/                           # Detection rules and patterns
+ │   └── 📁 utils/                           # Utility functions
+
+ ├── 📁 tests/                               # Test files
+ │   └── [test files]
+
+ ├── 📁 tools/                               # Analysis tools and detectors
+ │   ├── 📁 detectors/                       # Main detection tools
+ │   │   ├── detector.py                     # Main anti-pattern detector
+ │   │   └── config.yaml                     # Detection configuration
+ │   ├── 📁 visualizers/                     # Data visualization tools
+ │   └── 📁 exporters/                       # Data export tools
+
+ ├── 📁 linux/                               # Linux kernel source (cloned)
+ │   └── [kernel source files]
+
+ ├── 📄 README.md                            # Main project documentation
+ ├── 📄 requirements.txt                     # Main project dependencies
+ ├── 📄 requirements-kernel-analysis.txt
+ ├── 📄 requirements-simple.txt
+ ├── 📄 .gitignore                           # Git ignore rules
+ └── 📄 PROJECT_STRUCTURE.md                 # This file
+ ```
+
+ ## Directory Descriptions
+
+ ### 📁 data/
+ Contains all analysis results, logs, and generated data files.
+ - **results.json**: Complete analysis results from the main detector
+ - **concurrency_analysis_report.json**: Detailed concurrency analysis
+ - **kernel_analysis.log**: Analysis execution logs
+
+ ### 📁 docs/
+ Project documentation and guides.
+ - **kernel-analysis-guide.md**: Comprehensive guide for kernel analysis
+ - Additional documentation for specific features
+
+ ### 📁 examples/
+ Example code, usage patterns, and sample data.
+ - Example kernel modules for testing
+ - Sample configuration files
+ - Usage examples
+
+ ### 📁 reports/
+ Generated analysis reports in various formats.
+ - **Linux_Kernel_Anti_Pattern_Analysis_Report.md**: Complete technical report
+ - **Executive_Summary.md**: High-level summary for stakeholders
+ - **concurrency/**: Specialized reports for specific issue types
+
+ ### 📁 scripts/
+ Analysis and utility scripts organized by function.
+
+ #### 📁 analysis/
+ Core analysis scripts for different types of anti-patterns.
+ - **concurrency_analyzer.py**: Specialized concurrency issue analysis
+ - **analyze_kernel_structure.py**: Kernel structure analysis
+
+ #### 📁 reporting/
+ Scripts for generating and viewing reports.
+ - **view_results.py**: Interactive results viewer and reporter
+
+ #### 📁 utils/
+ Utility scripts for common tasks.
+ - **quick_summary.py**: Quick summary generation
+
+ ### 📁 src/
+ Main project source code (core framework).
+ - **detectors/**: Anti-pattern detection modules
+ - **rules/**: Detection rules and pattern definitions
+ - **utils/**: Utility functions and helpers
+
+ ### 📁 tests/
+ Test files and test data.
+ - Unit tests for detection modules
+ - Integration tests
+ - Test data and fixtures
+
+ ### 📁 tools/
+ Analysis tools and detectors.
+
+ #### 📁 detectors/
+ Main detection tools and configurations.
+ - **detector.py**: Primary anti-pattern detection engine
+ - **config.yaml**: Detection configuration and rules
+
+ #### 📁 visualizers/
+ Data visualization and charting tools.
+ - Interactive dashboards
+ - Chart generators
+ - Data plotting utilities
+
+ #### 📁 exporters/
+ Data export and format conversion tools.
+ - JSON to other formats
+ - Report generation
+ - Data transformation
+
+ ### 📁 linux/
+ Cloned Linux kernel source code for analysis.
+ - Complete kernel source tree
+ - Used for code snippet extraction
+ - Reference for pattern validation
+
+ ## File Descriptions
+
+ ### Core Files
+ - **README.md**: Main project documentation and getting started guide
+ - **requirements.txt**: Main project Python dependencies
+ - **requirements-kernel-analysis.txt**: Kernel analysis specific dependencies
+ - **requirements-simple.txt**: Simplified dependencies for basic usage
+ - **.gitignore**: Git ignore patterns for the project
+
+ ### Configuration Files
+ - **tools/detectors/config.yaml**: Main detection configuration
+ - **tools/detectors/detector.py**: Primary detection engine
+
+ ## Usage Patterns
+
+ ### Running Analysis
+ ```bash
+ # Main analysis
+ python tools/detectors/detector.py --clone --output data/results.json
+
+ # Concurrency analysis
+ python scripts/analysis/concurrency_analyzer.py
+
+ # View results
+ python scripts/reporting/view_results.py data/results.json
+ ```
+
+ ### Generating Reports
+ ```bash
+ # Quick summary
+ python scripts/utils/quick_summary.py
+
+ # Interactive viewer
+ python scripts/reporting/view_results.py --interactive
+ ```
+
+ ### Development
+ ```bash
+ # Install dependencies
+ pip install -r requirements.txt
+ pip install -r requirements-kernel-analysis.txt
+
+ # Run tests
+ python -m pytest tests/
+
+ # Development setup
+ conda activate linux-kernel-anti-pattern-detector
+ ```
+
+ ## Best Practices
+
+ ### Adding New Features
+ 1. **Analysis scripts**: Add to `scripts/analysis/`
+ 2. **Reporting tools**: Add to `scripts/reporting/`
+ 3. **Utilities**: Add to `scripts/utils/`
+ 4. **Core detection**: Add to `src/detectors/`
+ 5. **Configuration**: Update `tools/detectors/config.yaml`
+
+ ### File Naming Conventions
+ - **Python files**: snake_case (e.g., `concurrency_analyzer.py`)
+ - **Configuration files**: kebab-case (e.g., `kernel-analysis-guide.md`)
+ - **Reports**: Pascal_Case (e.g., `Concurrency_Analysis_Report.md`)
+
+ ### Data Management
+ - **Raw data**: Store in `data/`
+ - **Processed results**: Store in `data/`
+ - **Reports**: Generate in `reports/`
+ - **Logs**: Store in `data/`
+
+ ## Maintenance
+
+ ### Regular Tasks
+ 1. **Update dependencies**: Review and update requirements files
+ 2. **Clean data**: Remove old analysis results periodically
+ 3. **Update kernel**: Refresh the Linux kernel source
+ 4. **Backup reports**: Archive important analysis reports
+
+ ### Version Control
+ - **Track**: Source code, configuration, documentation
+ - **Ignore**: Analysis results, logs, kernel source (large files)
+ - **Archive**: Important reports and findings
+
+ ---
+
+ *This structure is designed to be scalable, maintainable, and easy to navigate. Each directory has a clear purpose and the organization supports both development and research workflows.*
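Since `tools/detectors/config.yaml` is the single configuration surface named above, a minimal loading sketch may help orient new contributors. The actual keys inside config.yaml are not shown in this commit, so the `rules` layout here is purely hypothetical:

```python
import yaml  # PyYAML, pinned in requirements.txt

# Hypothetical layout: the real keys in tools/detectors/config.yaml are not
# part of this commit, so treat this strictly as an illustration.
with open("tools/detectors/config.yaml", encoding="utf-8") as f:
    config = yaml.safe_load(f)

for rule in config.get("rules", []):  # assumed top-level key
    print(rule.get("name"), rule.get("severity"))
```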
dataset/linux_bugfix_100k.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ef9110bf645ceb42b2a6de21a952f82448ed7cfa31ac383c1c62fb9a840b9574
+ size 241507688
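The file above is a Git LFS pointer; the ~240 MB JSONL payload only arrives after `git lfs pull`. Assuming it shares the schema produced by `dataset_builder/build_dataset.py` below, a minimal streaming sketch:

```python
import json

# Stream dataset/linux_bugfix_100k.jsonl one record per line (fetch it via
# `git lfs pull` first). Field names follow build_dataset.py; whether the
# 100k file uses exactly this schema is an assumption.
with open("dataset/linux_bugfix_100k.jsonl", encoding="utf-8") as f:
    for i, line in enumerate(f):
        record = json.loads(line)
        code = record["input"]["original code"]       # focused C context
        instruction = record["input"]["instruction"]  # commit-message summary
        diff = record["output"]["diff codes"]         # bug-fixing diff hunk
        if i >= 2:  # peek at the first few records only
            break
```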
dataset/linux_bugfix_prompt_completion.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:621b0b7e04c5b359ea28eec9bf165aad2eaea77b07b834b0c1b7de37202e7992
+ size 2184324531
dataset_builder/build_dataset.py ADDED
@@ -0,0 +1,160 @@
+ from pydriller import Repository
+ import os
+ import json
+ from tqdm import tqdm
+ import re
+
+ REPO_PATH = '../linux'
+ OUTPUT_FILE = './output/linux_bugfix_dataset.jsonl'
+
+ TEST_MODE = False  # Set to False to process the full repository
+ MAX_COMMITS_TEST = 50  # Commit limit applied when TEST_MODE is True
+
+ BUGFIX_KEYWORDS = [
+     'fix', 'bug', 'leak', 'null', 'overflow', 'error', 'failure',
+     'crash', 'panic', 'memory', 'race', 'deadlock', 'corruption',
+     'security', 'vulnerability', 'exploit', 'buffer', 'stack'
+ ]
+
+ def is_bugfix_commit(msg):
+     msg_lower = msg.lower()
+     return any(keyword in msg_lower for keyword in BUGFIX_KEYWORDS)
+
+ def extract_instruction_from_commit_msg(msg):
+     lines = msg.strip().splitlines()
+     for line in lines:
+         line = line.strip()
+         if len(line) < 5 or not any(c.isalpha() for c in line):
+             continue
+         if line.lower().startswith((
+             '[patch]', 'signed-off-by', 'reviewed-by', 'tested-by', 'ack',
+             'reported-by', 'cc:', 'co-authored-by', 'patchwork-id',
+             'suggested-by', 'fixes:', 'link:', 'cherry picked from commit'
+         )):
+             continue
+         return line
+     return msg.strip().splitlines()[0] if msg.strip() else "fix"
+
+ def extract_code_context(code, line_number, context_lines=10):
+     if not code:
+         return ""
+     lines = code.split('\n')
+     start = max(0, line_number - context_lines)
+     end = min(len(lines), line_number + context_lines)
+     return '\n'.join(lines[start:end])
+
+ def extract_diff_context(diff_text, context_lines=5):
+     if not diff_text:
+         return ""
+     lines = diff_text.split('\n')
+     change_lines = [i for i, line in enumerate(lines) if line.startswith('+') or line.startswith('-')]
+     if not change_lines:
+         return diff_text
+     start = max(0, change_lines[0] - context_lines)
+     end = min(len(lines), change_lines[-1] + context_lines + 1)
+     return '\n'.join(lines[start:end])
+
+ def create_dataset_entry(original_code, commit_msg, diff_code):
+     return {
+         "input": {
+             "original code": original_code.strip(),
+             "instruction": extract_instruction_from_commit_msg(commit_msg)
+         },
+         "output": {
+             "diff codes": diff_code.strip()
+         }
+     }
+
+ def process_commit(commit):
+     entries = []
+     if not is_bugfix_commit(commit.msg):
+         return entries
+
+     for mod in commit.modified_files:
+         if not mod.new_path or not mod.new_path.endswith(('.c', '.h')):
+             continue
+         if mod.change_type.name != "MODIFY":
+             continue
+         if not mod.diff or not mod.source_code_before:
+             continue
+
+         focused_diff = extract_diff_context(mod.diff)
+
+         diff_lines = mod.diff.split('\n')
+         line_numbers = []
+         for line in diff_lines:
+             if line.startswith('@@'):
+                 match = re.search(r'@@ -(\d+),?\d* \+\d+,?\d* @@', line)
+                 if match:
+                     line_numbers.append(int(match.group(1)))
+
+         if line_numbers:
+             focused_code = extract_code_context(mod.source_code_before, line_numbers[0])
+         else:
+             focused_code = '\n'.join(mod.source_code_before.split('\n')[:50])
+
+         entry = create_dataset_entry(
+             original_code=focused_code,
+             commit_msg=commit.msg,
+             diff_code=focused_diff
+         )
+         entries.append(entry)
+
+     return entries
+
+ def main():
+     if not os.path.exists(REPO_PATH):
+         print(f"❌ Repository not found at: {REPO_PATH}")
+         return
+
+     os.makedirs('./output', exist_ok=True)
+
+     print("🔍 Building Linux kernel bug-fix dataset...")
+     print(f"📁 Repository: {REPO_PATH}")
+     print(f"📎 Output: {OUTPUT_FILE}")
+
+     output_file = OUTPUT_FILE.replace('.jsonl', '_test.jsonl') if TEST_MODE else OUTPUT_FILE
+
+     repo = Repository(REPO_PATH)
+     dataset_entries = []
+     processed_commits = 0
+     total_commits = 0
+     bugfix_commits = 0
+
+     for commit in tqdm(repo.traverse_commits(), desc="Processing commits"):
+         total_commits += 1
+         if TEST_MODE and MAX_COMMITS_TEST and total_commits > MAX_COMMITS_TEST:
+             break
+         if is_bugfix_commit(commit.msg):
+             bugfix_commits += 1
+             entries = process_commit(commit)
+             if entries:
+                 dataset_entries.extend(entries)
+                 processed_commits += 1
+                 if TEST_MODE:
+                     print(f"\n🔍 Bug-fix commit {processed_commits}: {commit.hash[:8]}")
+                     print(f"📝 Message: {extract_instruction_from_commit_msg(commit.msg)}")
+                     print(f"📊 Files: {len(entries)} entries extracted")
+                     print(f"📁 Files: {[mod.new_path for mod in commit.modified_files if mod.new_path and mod.new_path.endswith(('.c', '.h'))]}")
+
+     with open(output_file, 'w', encoding='utf-8') as f:
+         for entry in dataset_entries:
+             f.write(json.dumps(entry, ensure_ascii=False) + '\n')
+
+     print(f"\n✅ Dataset creation completed!")
+     print(f"📊 Total commits processed: {total_commits}")
+     print(f"🐛 Bug-fix commits found: {bugfix_commits}")
+     print(f"📝 Commits with valid entries: {processed_commits}")
+     print(f"📝 Total dataset entries: {len(dataset_entries)}")
+     print(f"📎 Saved to: {output_file}")
+
+     if dataset_entries:
+         print(f"\n📋 Sample dataset entry:")
+         sample = dataset_entries[0]
+         print(json.dumps(sample, indent=2, ensure_ascii=False)[:800] + "...")
+         print(f"\n📁 Dataset structure:")
+         print(f" - Input: original code + instruction")
+         print(f" - Output: diff codes")
+         print(f" - Format: JSONL (one JSON object per line)")
+
+ if __name__ == "__main__":
+     main()
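As a quick sanity check of the hunk parsing in `process_commit`, here is the same regex applied to a toy diff (the C snippet is invented for illustration):

```python
import re

toy_diff = (
    "@@ -42,7 +42,7 @@ static int foo(void)\n"
    " \tif (!ptr)\n"
    "-\t\treturn 0;\n"
    "+\t\treturn -ENOMEM;\n"
    " \treturn ptr->val;"
)

# Same pattern as process_commit(): recover the old-file start line from the
# hunk header so extract_code_context() knows where to slice.
match = re.search(r'@@ -(\d+),?\d* \+\d+,?\d* @@', toy_diff.split('\n')[0])
print(match.group(1))  # -> "42"
```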
dataset_builder/build_dataset_demo.py ADDED
@@ -0,0 +1,157 @@
+ from pydriller import Repository
+ import os
+ import json
+ from tqdm import tqdm
+ import re
+ from multiprocessing import Pool
+
+ REPO_PATH = '../linux'
+ OUTPUT_FILE = './output/linux_bugfix_dataset.jsonl'
+
+ TEST_MODE = False  # Set to False to process the full repository
+ MAX_COMMITS_TEST = 50  # Set a limit if TEST_MODE is True
+ NUM_WORKERS = 16  # Adjust to your actual core count
+
+ BUGFIX_KEYWORDS = [
+     'fix', 'bug', 'leak', 'null', 'overflow', 'error', 'failure',
+     'crash', 'panic', 'memory', 'race', 'deadlock', 'corruption',
+     'security', 'vulnerability', 'exploit', 'buffer', 'stack'
+ ]
+
+ def is_bugfix_commit(msg):
+     msg_lower = msg.lower()
+     return any(keyword in msg_lower for keyword in BUGFIX_KEYWORDS)
+
+ def extract_instruction_from_commit_msg(msg):
+     lines = msg.strip().splitlines()
+     for line in lines:
+         line = line.strip()
+         if len(line) < 5 or not any(c.isalpha() for c in line):
+             continue
+         if line.lower().startswith((
+             '[patch]', 'signed-off-by', 'reviewed-by', 'tested-by', 'ack',
+             'reported-by', 'cc:', 'co-authored-by', 'patchwork-id',
+             'suggested-by', 'fixes:', 'link:', 'cherry picked from commit'
+         )):
+             continue
+         return line
+     return msg.strip().splitlines()[0] if msg.strip() else "fix"
+
+ def extract_code_context(code, line_number, context_lines=10):
+     if not code:
+         return ""
+     lines = code.split('\n')
+     start = max(0, line_number - context_lines)
+     end = min(len(lines), line_number + context_lines)
+     return '\n'.join(lines[start:end])
+
+ def extract_diff_context(diff_text, context_lines=5):
+     if not diff_text:
+         return ""
+     lines = diff_text.split('\n')
+     change_lines = [i for i, line in enumerate(lines) if line.startswith('+') or line.startswith('-')]
+     if not change_lines:
+         return diff_text
+     start = max(0, change_lines[0] - context_lines)
+     end = min(len(lines), change_lines[-1] + context_lines + 1)
+     return '\n'.join(lines[start:end])
+
+ def create_dataset_entry(original_code, commit_msg, diff_code):
+     return {
+         "input": {
+             "original code": original_code.strip(),
+             "instruction": extract_instruction_from_commit_msg(commit_msg)
+         },
+         "output": {
+             "diff codes": diff_code.strip()
+         }
+     }
+
+ def process_commit(commit):
+     entries = []
+     if not is_bugfix_commit(commit.msg):
+         return entries
+
+     for mod in commit.modified_files:
+         if not mod.new_path or not mod.new_path.endswith(('.c', '.h')):
+             continue
+         if mod.change_type.name != "MODIFY":
+             continue
+         if not mod.diff or not mod.source_code_before:
+             continue
+
+         focused_diff = extract_diff_context(mod.diff)
+
+         diff_lines = mod.diff.split('\n')
+         line_numbers = []
+         for line in diff_lines:
+             if line.startswith('@@'):
+                 match = re.search(r'@@ -(\d+),?\d* \+\d+,?\d* @@', line)
+                 if match:
+                     line_numbers.append(int(match.group(1)))
+
+         if line_numbers:
+             focused_code = extract_code_context(mod.source_code_before, line_numbers[0])
+         else:
+             focused_code = '\n'.join(mod.source_code_before.split('\n')[:50])
+
+         entry = create_dataset_entry(
+             original_code=focused_code,
+             commit_msg=commit.msg,
+             diff_code=focused_diff
+         )
+         entries.append(entry)
+
+     return entries
+
+ def collect_entries_from_hash(commit_hash):
+     try:
+         commit = next(Repository(REPO_PATH, only_commits=[commit_hash]).traverse_commits())
+         return process_commit(commit)
+     except Exception:
+         return []
+
+ def main():
+     if not os.path.exists(REPO_PATH):
+         print("[ERROR] Repository not found at:", REPO_PATH)
+         return
+
+     os.makedirs('./output', exist_ok=True)
+
+     print("[INFO] Building Linux kernel bug-fix dataset...")
+     print("[INFO] Repository:", REPO_PATH)
+     print("[INFO] Output file:", OUTPUT_FILE)
+
+     output_file = OUTPUT_FILE.replace('.jsonl', '_test.jsonl') if TEST_MODE else OUTPUT_FILE
+
+     all_hashes = [c.hash for c in Repository(REPO_PATH).traverse_commits()]
+     if TEST_MODE and MAX_COMMITS_TEST:
+         all_hashes = all_hashes[:MAX_COMMITS_TEST]
+
+     dataset_entries = []
+     with Pool(NUM_WORKERS) as pool:
+         results = list(tqdm(pool.imap_unordered(collect_entries_from_hash, all_hashes), total=len(all_hashes)))
+
+     for entries in results:
+         dataset_entries.extend(entries)
+
+     with open(output_file, 'w', encoding='utf-8') as f:
+         for entry in dataset_entries:
+             f.write(json.dumps(entry, ensure_ascii=False) + '\n')
+
+     print("[DONE] Dataset creation completed!")
+     print("[INFO] Total commits processed:", len(all_hashes))
+     print("[INFO] Total dataset entries:", len(dataset_entries))
+     print("[INFO] Saved to:", output_file)
+
+     if dataset_entries:
+         print("[INFO] Sample dataset entry:")
+         sample = dataset_entries[0]
+         print(json.dumps(sample, indent=2, ensure_ascii=False)[:800] + "...")
+         print("[INFO] Dataset structure:")
+         print(" - Input: original code + instruction")
+         print(" - Output: diff codes")
+         print(" - Format: JSONL (one JSON object per line)")
+
+ if __name__ == "__main__":
+     main()
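One design note on this parallel variant: `collect_entries_from_hash` opens a fresh `Repository` per commit, which keeps workers fully independent at the cost of repeated repository setup. If that overhead dominates, a hedged alternative (same pydriller API; the batch size of 200 is an arbitrary choice) is to hand each worker a slice of hashes:

```python
# Sketch: one Repository per batch instead of one per commit. Reuses
# REPO_PATH and process_commit() from the script above.
def collect_entries_from_batch(hash_batch):
    entries = []
    for commit in Repository(REPO_PATH, only_commits=list(hash_batch)).traverse_commits():
        entries.extend(process_commit(commit))
    return entries

# with Pool(NUM_WORKERS) as pool:
#     batches = [all_hashes[i:i + 200] for i in range(0, len(all_hashes), 200)]
#     results = list(pool.imap_unordered(collect_entries_from_batch, batches))
```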
dataset_builder/convert_to_prompt_completion.py ADDED
@@ -0,0 +1,41 @@
+ import json
+
+ INPUT_FILE = './output/linux_bugfix_dataset.jsonl'
+ OUTPUT_FILE = './output/linux_bugfix_prompt_completion.jsonl'
+
+ def format_prompt(original_code, instruction):
+     return (
+         "Given the following original C code:\n"
+         f"{original_code.strip()}\n\n"
+         "Instruction:\n"
+         f"{instruction.strip()}\n\n"
+         "Return the diff that fixes it:\n"
+     )
+
+ def format_completion(diff_code):
+     return diff_code.strip()
+
+ def convert_dataset(input_path, output_path):
+     with open(input_path, 'r', encoding='utf-8') as fin, \
+          open(output_path, 'w', encoding='utf-8') as fout:
+
+         for line in fin:
+             data = json.loads(line)
+             original_code = data["input"]["original code"]
+             instruction = data["input"]["instruction"]
+             diff_code = data["output"]["diff codes"]
+
+             prompt = format_prompt(original_code, instruction)
+             completion = format_completion(diff_code)
+
+             new_entry = {
+                 "prompt": prompt,
+                 "completion": completion
+             }
+
+             fout.write(json.dumps(new_entry, ensure_ascii=False) + '\n')
+
+     print(f"[DONE] Converted dataset saved to: {output_path}")
+
+ if __name__ == "__main__":
+     convert_dataset(INPUT_FILE, OUTPUT_FILE)
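To make the resulting format concrete, here is `format_prompt` applied to a tiny invented record (run from `dataset_builder/` so the import resolves):

```python
from convert_to_prompt_completion import format_prompt

# Invented example values, matching the builder's schema.
prompt = format_prompt("if (!buf)\n    return;", "fix NULL check to propagate -ENOMEM")
print(prompt)
# Given the following original C code:
# if (!buf)
#     return;
#
# Instruction:
# fix NULL check to propagate -ENOMEM
#
# Return the diff that fixes it:
```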
evaluate/__pycache__/evaluate.cpython-312.pyc ADDED
Binary file (3.93 kB).
evaluate/eval.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0775fb198ac76efb0313376b80d43db691701b1ed52a524520bd38490c123242
+ size 1758795
evaluate/evaluate.py ADDED
@@ -0,0 +1,87 @@
+ import sys
+ import os
+
+ # This script shares its filename with the Hugging Face `evaluate` library;
+ # drop the script's own directory from sys.path so `import evaluate` below
+ # resolves to the installed package rather than to this file.
+ _here = os.path.dirname(os.path.abspath(__file__))
+ sys.path = [p for p in sys.path if os.path.abspath(p or os.getcwd()) != _here]
+
+ import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ from datasets import load_dataset
+ from tqdm import tqdm
+ import json
+ import csv
+ import evaluate
+
+ # ==== CONFIG ====
+ MODEL_PATH = "../train/output/qlora-codellama-bugfix"
+ EVAL_FILE = "eval.jsonl"
+ OUTPUT_JSON = "./output/eval_results.json"
+ OUTPUT_CSV = "./output/eval_results.csv"
+ MAX_INPUT_LEN = 1024
+ MAX_NEW_TOKENS = 256
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+ # ==== Ensure output folder exists ====
+ os.makedirs(os.path.dirname(OUTPUT_JSON), exist_ok=True)
+
+ # ==== Load model ====
+ print("🔄 Loading model...")
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
+ if tokenizer.pad_token_id is None:
+     # LLaMA-family tokenizers ship without a pad token; fall back to EOS.
+     tokenizer.pad_token_id = tokenizer.eos_token_id
+ model = AutoModelForCausalLM.from_pretrained(
+     MODEL_PATH,
+     torch_dtype=torch.bfloat16,
+     device_map="auto"
+ )
+ model.eval()
+
+ # ==== Load eval data ====
+ print("📂 Loading evaluation data...")
+ eval_data = load_dataset("json", data_files=EVAL_FILE, split="train")
+
+ # ==== Inference ====
+ results = []
+ print("⚙️ Running inference...")
+ for example in tqdm(eval_data):
+     prompt = example["prompt"]
+     reference = example["completion"]
+
+     inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=MAX_INPUT_LEN).to(DEVICE)
+
+     with torch.no_grad():
+         outputs = model.generate(
+             **inputs,
+             max_new_tokens=MAX_NEW_TOKENS,
+             do_sample=False,
+             num_beams=4,
+             pad_token_id=tokenizer.pad_token_id,
+             eos_token_id=tokenizer.eos_token_id
+         )
+
+     # Decode only the newly generated tokens; decoding outputs[0] wholesale
+     # would echo the prompt into the prediction and skew BLEU/ROUGE.
+     new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
+     prediction = tokenizer.decode(new_tokens, skip_special_tokens=True)
+
+     results.append({
+         "prompt": prompt,
+         "reference": reference.strip(),
+         "prediction": prediction.strip()
+     })
+
+ # ==== Save results ====
+ with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
+     json.dump(results, f, indent=2)
+ print(f"✅ Saved JSON to {OUTPUT_JSON}")
+
+ with open(OUTPUT_CSV, "w", encoding="utf-8", newline='') as f:
+     writer = csv.DictWriter(f, fieldnames=["prompt", "reference", "prediction"])
+     writer.writeheader()
+     writer.writerows(results)
+ print(f"✅ Saved CSV to {OUTPUT_CSV}")
+
+ # ==== Compute Metrics ====
+ print("📊 Computing BLEU and ROUGE...")
+ bleu = evaluate.load("bleu")
+ rouge = evaluate.load("rouge")
+
+ predictions = [r["prediction"] for r in results]
+ references = [r["reference"] for r in results]
+
+ bleu_score = bleu.compute(predictions=predictions, references=[[ref] for ref in references])
+ rouge_score = rouge.compute(predictions=predictions, references=references)
+
+ print("\n📈 Evaluation Results:")
+ print("BLEU:", bleu_score)
+ print("ROUGE:", json.dumps(rouge_score, indent=2))
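Loading `MODEL_PATH` directly works here because recent transformers releases (with peft installed) resolve an adapter directory against the `base_model` recorded in its `adapter_config.json`. An equivalent, more explicit sketch using the PEFT API (base id as in `train/download_model.py`):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

BASE = "codellama/CodeLlama-7b-Instruct-hf"
ADAPTER = "../train/output/qlora-codellama-bugfix"

# Load the frozen base model first, then attach the LoRA weights
# (adapter_model.safetensors) on top of it.
tokenizer = AutoTokenizer.from_pretrained(ADAPTER)
base = AutoModelForCausalLM.from_pretrained(BASE, torch_dtype=torch.bfloat16, device_map="auto")
model = PeftModel.from_pretrained(base, ADAPTER)
model.eval()
```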
evaluate/output/.eval_results.csv.swp ADDED
Binary file (45.1 kB).
evaluate/output/eval_results.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:20a66ee1f8b006ceb278d3363c9337f23a4ffa7e49e54c379e409342cda874fd
+ size 2836184
evaluate/output/eval_results.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7cdf1b2c1317da8ff11b303547cedd6642d5762984e1952134a11cff4ff8120e
+ size 3078110
requirements.txt ADDED
@@ -0,0 +1,182 @@
+ absl-py==2.3.0
+ accelerate==1.8.1
+ aiohappyeyeballs==2.6.1
+ aiohttp==3.12.13
+ aiosignal==1.4.0
+ anyio==4.9.0
+ argon2-cffi==25.1.0
+ argon2-cffi-bindings==21.2.0
+ arrow==1.3.0
+ asttokens==3.0.0
+ async-lru==2.0.5
+ attrs==25.3.0
+ babel==2.17.0
+ bash_kernel==0.10.0
+ beautifulsoup4==4.13.4
+ bitsandbytes==0.46.1
+ bleach==6.2.0
+ blinker==1.7.0
+ certifi==2025.6.15
+ cffi==1.17.1
+ charset-normalizer==3.4.2
+ comm==0.2.2
+ conda-pack==0.8.1
+ cryptography==41.0.7
+ datasets==3.6.0
+ dbus-python==1.3.2
+ debugpy==1.8.14
+ decorator==5.2.1
+ defusedxml==0.7.1
+ dill==0.3.8
+ distro==1.9.0
+ evaluate==0.4.4
+ executing==2.2.0
+ fastjsonschema==2.21.1
+ filelock==3.13.1
+ filetype==1.2.0
+ fqdn==1.5.1
+ frozenlist==1.7.0
+ fsspec==2024.6.1
+ grpcio==1.73.1
+ h11==0.16.0
+ hf-xet==1.1.5
+ httpcore==1.0.9
+ httplib2==0.20.4
+ httpx==0.28.1
+ huggingface-hub==0.33.2
+ idna==3.10
+ iniconfig==2.1.0
+ iotop==0.6
+ ipykernel==6.29.5
+ ipython==9.3.0
+ ipython_pygments_lexers==1.1.1
+ ipywidgets==8.1.7
+ isoduration==20.11.0
+ jedi==0.19.2
+ Jinja2==3.1.6
+ json5==0.12.0
+ jsonpointer==3.0.0
+ jsonschema==4.24.0
+ jsonschema-specifications==2025.4.1
+ jupyter==1.1.1
+ jupyter-archive==3.4.0
+ jupyter-console==6.6.3
+ jupyter-events==0.12.0
+ jupyter-http-over-ws==0.0.8
+ jupyter-lsp==2.2.5
+ jupyter_client==8.6.3
+ jupyter_core==5.8.1
+ jupyter_server==2.16.0
+ jupyter_server_terminals==0.5.3
+ jupyterlab==4.4.4
+ jupyterlab_pygments==0.3.0
+ jupyterlab_server==2.27.3
+ jupyterlab_widgets==3.0.15
+ launchpadlib==1.11.0
+ lazr.restfulclient==0.14.6
+ lazr.uri==1.0.6
+ Markdown==3.8.2
+ MarkupSafe==3.0.2
+ matplotlib-inline==0.1.7
+ mistune==3.1.3
+ mpmath==1.3.0
+ multidict==6.6.3
+ multiprocess==0.70.16
+ nbclient==0.10.2
+ nbconvert==7.16.6
+ nbformat==5.10.4
+ nbzip==0.1.0
+ nest-asyncio==1.6.0
+ networkx==3.3
+ notebook==7.4.3
+ notebook_shim==0.2.4
+ numpy==2.3.1
+ nvidia-cublas-cu12==12.8.3.14
+ nvidia-cuda-cupti-cu12==12.8.57
+ nvidia-cuda-nvrtc-cu12==12.8.61
+ nvidia-cuda-runtime-cu12==12.8.57
+ nvidia-cudnn-cu12==9.7.1.26
+ nvidia-cufft-cu12==11.3.3.41
+ nvidia-cufile-cu12==1.13.0.11
+ nvidia-curand-cu12==10.3.9.55
+ nvidia-cusolver-cu12==11.7.2.55
+ nvidia-cusparse-cu12==12.5.7.53
+ nvidia-cusparselt-cu12==0.6.3
+ nvidia-nccl-cu12==2.26.2
+ nvidia-nvjitlink-cu12==12.8.61
+ nvidia-nvtx-cu12==12.8.55
+ oauthlib==3.2.2
+ overrides==7.7.0
+ packaging==25.0
+ pandas==2.3.1
+ pandocfilters==1.5.1
+ parso==0.8.4
+ peft==0.16.0
+ pexpect==4.9.0
+ pillow==11.0.0
+ platformdirs==4.3.8
+ pluggy==1.6.0
+ prometheus_client==0.22.1
+ prompt_toolkit==3.0.51
+ propcache==0.3.2
+ protobuf==6.31.1
+ psutil==7.0.0
+ ptyprocess==0.7.0
+ pure_eval==0.2.3
+ pyarrow==20.0.0
+ pycparser==2.22
+ Pygments==2.19.2
+ PyGObject==3.48.2
+ PyJWT==2.7.0
+ pyparsing==3.1.1
+ pytest==8.4.1
+ python-apt==2.7.7+ubuntu4
+ python-dateutil==2.9.0.post0
+ python-json-logger==3.3.0
+ pytz==2025.2
+ PyYAML==6.0.2
+ pyzmq==27.0.0
+ referencing==0.36.2
+ regex==2024.11.6
+ requests==2.32.4
+ rfc3339-validator==0.1.4
+ rfc3986-validator==0.1.1
+ rpds-py==0.25.1
+ safetensors==0.5.3
+ Send2Trash==1.8.3
+ setuptools==68.1.2
+ six==1.16.0
+ sniffio==1.3.1
+ soupsieve==2.7
+ stack-data==0.6.3
+ supervisor==4.2.5
+ sympy==1.13.3
+ tensorboard==2.19.0
+ tensorboard-data-server==0.7.2
+ terminado==0.18.1
+ tinycss2==1.4.0
+ tokenizers==0.21.2
+ torch==2.7.1+cu128
+ torchaudio==2.7.1+cu128
+ torchvision==0.22.1+cu128
+ tornado==6.5.1
+ tqdm==4.67.1
+ traitlets==5.14.3
+ transformers==4.53.1
+ triton==3.3.1
+ types-python-dateutil==2.9.0.20250516
+ typing_extensions==4.14.0
+ tzdata==2025.2
+ uri-template==1.3.0
+ urllib3==2.5.0
+ uv==0.7.16
+ wadllib==1.3.6
+ wcwidth==0.2.13
+ webcolors==24.11.1
+ webencodings==0.5.1
+ websocket-client==1.8.0
+ Werkzeug==3.1.3
+ wheel==0.42.0
+ widgetsnbextension==4.0.14
+ xxhash==3.5.0
+ yarl==1.20.1
train/download_model.py ADDED
@@ -0,0 +1,6 @@
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+
+ model = AutoModelForCausalLM.from_pretrained("codellama/CodeLlama-7b-Instruct-hf")
+ tokenizer = AutoTokenizer.from_pretrained("codellama/CodeLlama-7b-Instruct-hf")
+
+ print("✅ Download complete.")
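As written, the script only warms the local Hugging Face cache (the 7B checkpoint is on the order of 13 GB). If a fixed on-disk copy is preferred over the cache, a hedged variant; `./base_model` is an arbitrary path, not something the training scripts depend on:

```python
from transformers import AutoTokenizer, AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("codellama/CodeLlama-7b-Instruct-hf")
tokenizer = AutoTokenizer.from_pretrained("codellama/CodeLlama-7b-Instruct-hf")

# Persist an explicit snapshot; "./base_model" is an arbitrary local path.
model.save_pretrained("./base_model")
tokenizer.save_pretrained("./base_model")

print("✅ Download complete.")
```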
train/output/qlora-codellama-bugfix/README.md ADDED
@@ -0,0 +1,207 @@
+ ---
+ base_model: codellama/CodeLlama-7b-Instruct-hf
+ library_name: peft
+ pipeline_tag: text-generation
+ tags:
+ - base_model:adapter:codellama/CodeLlama-7b-Instruct-hf
+ - lora
+ - transformers
+ ---
+
+ # Model Card for Model ID
+
+ <!-- Provide a quick summary of what the model is/does. -->
+
+
+
+ ## Model Details
+
+ ### Model Description
+
+ <!-- Provide a longer summary of what this model is. -->
+
+
+
+ - **Developed by:** [More Information Needed]
+ - **Funded by [optional]:** [More Information Needed]
+ - **Shared by [optional]:** [More Information Needed]
+ - **Model type:** [More Information Needed]
+ - **Language(s) (NLP):** [More Information Needed]
+ - **License:** [More Information Needed]
+ - **Finetuned from model [optional]:** [More Information Needed]
+
+ ### Model Sources [optional]
+
+ <!-- Provide the basic links for the model. -->
+
+ - **Repository:** [More Information Needed]
+ - **Paper [optional]:** [More Information Needed]
+ - **Demo [optional]:** [More Information Needed]
+
+ ## Uses
+
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+ ### Direct Use
+
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+ [More Information Needed]
+
+ ### Downstream Use [optional]
+
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+ [More Information Needed]
+
+ ### Out-of-Scope Use
+
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+ [More Information Needed]
+
+ ## Bias, Risks, and Limitations
+
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+ [More Information Needed]
+
+ ### Recommendations
+
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+ ## How to Get Started with the Model
+
+ Use the code below to get started with the model.
+
+ [More Information Needed]
+
+ ## Training Details
+
+ ### Training Data
+
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+ [More Information Needed]
+
+ ### Training Procedure
+
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+ #### Preprocessing [optional]
+
+ [More Information Needed]
+
+
+ #### Training Hyperparameters
+
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+ #### Speeds, Sizes, Times [optional]
+
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+ [More Information Needed]
+
+ ## Evaluation
+
+ <!-- This section describes the evaluation protocols and provides the results. -->
+
+ ### Testing Data, Factors & Metrics
+
+ #### Testing Data
+
+ <!-- This should link to a Dataset Card if possible. -->
+
+ [More Information Needed]
+
+ #### Factors
+
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+ [More Information Needed]
+
+ #### Metrics
+
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+ [More Information Needed]
+
+ ### Results
+
+ [More Information Needed]
+
+ #### Summary
+
+
+
+ ## Model Examination [optional]
+
+ <!-- Relevant interpretability work for the model goes here -->
+
+ [More Information Needed]
+
+ ## Environmental Impact
+
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+ - **Hardware Type:** [More Information Needed]
+ - **Hours used:** [More Information Needed]
+ - **Cloud Provider:** [More Information Needed]
+ - **Compute Region:** [More Information Needed]
+ - **Carbon Emitted:** [More Information Needed]
+
+ ## Technical Specifications [optional]
+
+ ### Model Architecture and Objective
+
+ [More Information Needed]
+
+ ### Compute Infrastructure
+
+ [More Information Needed]
+
+ #### Hardware
+
+ [More Information Needed]
+
+ #### Software
+
+ [More Information Needed]
+
+ ## Citation [optional]
+
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+ **BibTeX:**
+
+ [More Information Needed]
+
+ **APA:**
+
+ [More Information Needed]
+
+ ## Glossary [optional]
+
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+ [More Information Needed]
+
+ ## More Information [optional]
+
+ [More Information Needed]
+
+ ## Model Card Authors [optional]
+
+ [More Information Needed]
+
+ ## Model Card Contact
+
+ [More Information Needed]
+
+ ### Framework versions
+
+ - PEFT 0.16.0
train/output/qlora-codellama-bugfix/adapter_config.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:df755384ffd5359513ac7a725ba0c4d3104e6b5c131f98aee0b1cb7d160c17c0
+ size 839
train/output/qlora-codellama-bugfix/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:73618dfb9cd2422a1a6600a5cec7fc1178bfe579e0c77d827761f26eeaae7f09
+ size 134235048
train/output/qlora-codellama-bugfix/chat_template.jinja ADDED
@@ -0,0 +1 @@
+ {% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\n' + system_message + '\n<</SYS>>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}
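This template implements the Llama-2-style `[INST] ... [/INST]` turn format, folding an optional system message into `<<SYS>> ... <</SYS>>` markers at the start of the first user turn. A small rendering sketch (the messages are invented; the tokenizer path is this adapter directory):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("train/output/qlora-codellama-bugfix")

messages = [
    {"role": "system", "content": "You fix Linux kernel bugs."},
    {"role": "user", "content": "Patch the NULL dereference."},
]

# apply_chat_template executes the Jinja template above.
text = tokenizer.apply_chat_template(messages, tokenize=False)
print(text)  # <s>[INST] <<SYS>>\n...\n<</SYS>>\n\n... [/INST]
```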
train/output/qlora-codellama-bugfix/checkpoint-1000/README.md ADDED
@@ -0,0 +1,207 @@
+ ---
+ base_model: codellama/CodeLlama-7b-Instruct-hf
+ library_name: peft
+ pipeline_tag: text-generation
+ tags:
+ - base_model:adapter:codellama/CodeLlama-7b-Instruct-hf
+ - lora
+ - transformers
+ ---
+
+ # Model Card for Model ID
+
+ <!-- Provide a quick summary of what the model is/does. -->
+
+
+
+ ## Model Details
+
+ ### Model Description
+
+ <!-- Provide a longer summary of what this model is. -->
+
+
+
+ - **Developed by:** [More Information Needed]
+ - **Funded by [optional]:** [More Information Needed]
+ - **Shared by [optional]:** [More Information Needed]
+ - **Model type:** [More Information Needed]
+ - **Language(s) (NLP):** [More Information Needed]
+ - **License:** [More Information Needed]
+ - **Finetuned from model [optional]:** [More Information Needed]
+
+ ### Model Sources [optional]
+
+ <!-- Provide the basic links for the model. -->
+
+ - **Repository:** [More Information Needed]
+ - **Paper [optional]:** [More Information Needed]
+ - **Demo [optional]:** [More Information Needed]
+
+ ## Uses
+
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+ ### Direct Use
+
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+ [More Information Needed]
+
+ ### Downstream Use [optional]
+
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+ [More Information Needed]
+
+ ### Out-of-Scope Use
+
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+ [More Information Needed]
+
+ ## Bias, Risks, and Limitations
+
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+ [More Information Needed]
+
+ ### Recommendations
+
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+ ## How to Get Started with the Model
+
+ Use the code below to get started with the model.
+
+ [More Information Needed]
+
+ ## Training Details
+
+ ### Training Data
+
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+ [More Information Needed]
+
+ ### Training Procedure
+
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+ #### Preprocessing [optional]
+
+ [More Information Needed]
+
+
+ #### Training Hyperparameters
+
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+ #### Speeds, Sizes, Times [optional]
+
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+ [More Information Needed]
+
+ ## Evaluation
+
+ <!-- This section describes the evaluation protocols and provides the results. -->
+
+ ### Testing Data, Factors & Metrics
+
+ #### Testing Data
+
+ <!-- This should link to a Dataset Card if possible. -->
+
+ [More Information Needed]
+
+ #### Factors
+
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+ [More Information Needed]
+
+ #### Metrics
+
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+ [More Information Needed]
+
+ ### Results
+
+ [More Information Needed]
+
+ #### Summary
+
+
+
+ ## Model Examination [optional]
+
+ <!-- Relevant interpretability work for the model goes here -->
+
+ [More Information Needed]
+
+ ## Environmental Impact
+
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+ - **Hardware Type:** [More Information Needed]
+ - **Hours used:** [More Information Needed]
+ - **Cloud Provider:** [More Information Needed]
+ - **Compute Region:** [More Information Needed]
+ - **Carbon Emitted:** [More Information Needed]
+
+ ## Technical Specifications [optional]
+
+ ### Model Architecture and Objective
+
+ [More Information Needed]
+
+ ### Compute Infrastructure
+
+ [More Information Needed]
+
+ #### Hardware
+
+ [More Information Needed]
+
+ #### Software
+
+ [More Information Needed]
+
+ ## Citation [optional]
+
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+ **BibTeX:**
+
+ [More Information Needed]
+
+ **APA:**
+
+ [More Information Needed]
+
+ ## Glossary [optional]
+
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+ [More Information Needed]
+
+ ## More Information [optional]
+
+ [More Information Needed]
+
+ ## Model Card Authors [optional]
+
+ [More Information Needed]
+
+ ## Model Card Contact
+
+ [More Information Needed]
+
+ ### Framework versions
+
+ - PEFT 0.16.0
train/output/qlora-codellama-bugfix/checkpoint-1000/adapter_config.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1964c3b05fe32e3371262e1acac8e947ef8f8a52431e37716f780afe50727087
+ size 839
train/output/qlora-codellama-bugfix/checkpoint-1000/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:84064653aacb1aa638242fa35d96415845184e3f97cec4da4bc9412385f7aff9
+ size 134235048
train/output/qlora-codellama-bugfix/checkpoint-1000/chat_template.jinja ADDED
@@ -0,0 +1 @@
+ {% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\n' + system_message + '\n<</SYS>>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}
train/output/qlora-codellama-bugfix/checkpoint-1000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3e69ea177eb91af1890ec93a594282aef22499e4743b53c958f27820bc33d28e
+ size 268544075
train/output/qlora-codellama-bugfix/checkpoint-1000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a8ecd469a8b4c7959941512dc268004dd3111ea9235d256aa9638ce00699c0f2
+ size 14645
train/output/qlora-codellama-bugfix/checkpoint-1000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cee0966af6e7296ad683b3f293ade83fe035fb7007b575377ad7775d13ec8b46
+ size 1465
train/output/qlora-codellama-bugfix/checkpoint-1000/special_tokens_map.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8f32ca2dd65336aceb9e359886f34f214330137c9e801e8f41cf542287fc2630
+ size 538
train/output/qlora-codellama-bugfix/checkpoint-1000/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ef304ea6a0a92c4275c959dd6e1a896bca7e36af0adb2ce25563e7c11bf3927f
+ size 3620829
train/output/qlora-codellama-bugfix/checkpoint-1000/tokenizer_config.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0ee02c4a9abf3f4b4fb350d0991f5313bd354c5dd76cabedaacc9e3e1c088e1a
+ size 1869
train/output/qlora-codellama-bugfix/checkpoint-1000/trainer_state.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e448dea2040f629a632432c297857131d1b3ac2ccdb25548b1cb4fe4ec5d779f
+ size 4295
train/output/qlora-codellama-bugfix/checkpoint-1000/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7cca9708d68b5d07e91505e97d7945c08ed5c660e02fce750706e45d57565144
+ size 5777
train/output/qlora-codellama-bugfix/checkpoint-500/README.md ADDED
@@ -0,0 +1,207 @@
+ ---
+ base_model: codellama/CodeLlama-7b-Instruct-hf
+ library_name: peft
+ pipeline_tag: text-generation
+ tags:
+ - base_model:adapter:codellama/CodeLlama-7b-Instruct-hf
+ - lora
+ - transformers
+ ---
+
+ # Model Card for Model ID
+
+ <!-- Provide a quick summary of what the model is/does. -->
+
+
+
+ ## Model Details
+
+ ### Model Description
+
+ <!-- Provide a longer summary of what this model is. -->
+
+
+
+ - **Developed by:** [More Information Needed]
+ - **Funded by [optional]:** [More Information Needed]
+ - **Shared by [optional]:** [More Information Needed]
+ - **Model type:** [More Information Needed]
+ - **Language(s) (NLP):** [More Information Needed]
+ - **License:** [More Information Needed]
+ - **Finetuned from model [optional]:** [More Information Needed]
+
+ ### Model Sources [optional]
+
+ <!-- Provide the basic links for the model. -->
+
+ - **Repository:** [More Information Needed]
+ - **Paper [optional]:** [More Information Needed]
+ - **Demo [optional]:** [More Information Needed]
+
+ ## Uses
+
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+ ### Direct Use
+
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+ [More Information Needed]
+
+ ### Downstream Use [optional]
+
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+ [More Information Needed]
+
+ ### Out-of-Scope Use
+
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+ [More Information Needed]
+
+ ## Bias, Risks, and Limitations
+
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+ [More Information Needed]
+
+ ### Recommendations
+
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+ ## How to Get Started with the Model
+
+ Use the code below to get started with the model.
+
+ [More Information Needed]
+
+ ## Training Details
+
+ ### Training Data
+
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+ [More Information Needed]
+
+ ### Training Procedure
+
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+ #### Preprocessing [optional]
+
+ [More Information Needed]
+
+
+ #### Training Hyperparameters
+
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+ #### Speeds, Sizes, Times [optional]
+
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+ [More Information Needed]
+
+ ## Evaluation
+
+ <!-- This section describes the evaluation protocols and provides the results. -->
+
+ ### Testing Data, Factors & Metrics
+
+ #### Testing Data
+
+ <!-- This should link to a Dataset Card if possible. -->
+
+ [More Information Needed]
+
+ #### Factors
+
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+ [More Information Needed]
+
+ #### Metrics
+
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+ [More Information Needed]
+
+ ### Results
+
+ [More Information Needed]
+
+ #### Summary
+
+
+
+ ## Model Examination [optional]
+
+ <!-- Relevant interpretability work for the model goes here -->
+
+ [More Information Needed]
+
+ ## Environmental Impact
+
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+ - **Hardware Type:** [More Information Needed]
+ - **Hours used:** [More Information Needed]
+ - **Cloud Provider:** [More Information Needed]
+ - **Compute Region:** [More Information Needed]
+ - **Carbon Emitted:** [More Information Needed]
+
+ ## Technical Specifications [optional]
+
+ ### Model Architecture and Objective
+
+ [More Information Needed]
+
+ ### Compute Infrastructure
+
+ [More Information Needed]
+
+ #### Hardware
+
+ [More Information Needed]
+
+ #### Software
+
+ [More Information Needed]
+
+ ## Citation [optional]
+
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+ **BibTeX:**
+
+ [More Information Needed]
+
+ **APA:**
+
+ [More Information Needed]
+
+ ## Glossary [optional]
+
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+ [More Information Needed]
+
+ ## More Information [optional]
+
+ [More Information Needed]
+
+ ## Model Card Authors [optional]
+
+ [More Information Needed]
+
+ ## Model Card Contact
+
+ [More Information Needed]
+
+ ### Framework versions
+
+ - PEFT 0.16.0
train/output/qlora-codellama-bugfix/checkpoint-500/adapter_config.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1964c3b05fe32e3371262e1acac8e947ef8f8a52431e37716f780afe50727087
+ size 839
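
The three-line bodies in this and the following entries are Git LFS pointer files (version/oid/size), not the binaries themselves. To work with the actual artifacts, clone with `git lfs` installed, or fetch individual files from the Hub. A minimal sketch (editor's addition; the repo id below is a placeholder, substitute this repository's actual id):

```python
from huggingface_hub import hf_hub_download

# Download one LFS-backed file; "<user>/<repo>" is a hypothetical placeholder.
path = hf_hub_download(
    repo_id="<user>/<repo>",
    filename="train/output/qlora-codellama-bugfix/checkpoint-500/adapter_model.safetensors",
)
print(path)
```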
train/output/qlora-codellama-bugfix/checkpoint-500/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ddc3978f5ac8ec8f6c879f16e4deab2f4955796fc45daaf9fd7063d6b270b027
+ size 134235048
train/output/qlora-codellama-bugfix/checkpoint-500/chat_template.jinja ADDED
@@ -0,0 +1 @@
+ {% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\n' + system_message + '\n<</SYS>>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content | trim + ' ' + eos_token }}{% endif %}{% endfor %}
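
The single-line Jinja template above is the standard Llama-2 chat format: an optional system message is folded into the first user turn inside `<<SYS>> ... <</SYS>>`, and each user turn is wrapped in `[INST] ... [/INST]`. A short sketch of how it renders through `tokenizer.apply_chat_template` (editor's addition; the local path and messages are illustrative):

```python
from transformers import AutoTokenizer

# Point at a directory containing this chat_template.jinja and tokenizer files.
tokenizer = AutoTokenizer.from_pretrained("train/output/qlora-codellama-bugfix/checkpoint-500")

messages = [
    {"role": "system", "content": "You fix Linux kernel bugs."},
    {"role": "user", "content": "Patch this null-pointer dereference: ..."},
]
text = tokenizer.apply_chat_template(messages, tokenize=False)
# -> "<s>[INST] <<SYS>>\nYou fix Linux kernel bugs.\n<</SYS>>\n\nPatch this ... [/INST]"
print(text)
```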
train/output/qlora-codellama-bugfix/checkpoint-500/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1e0ede750ff79b1bb5285ab51d0e4b03eeec5f4673ac149f1e973b6d20ad8e91
+ size 268544075
train/output/qlora-codellama-bugfix/checkpoint-500/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:873492f033797d17bd144ec6ef43f5efb66933cf27e0c60ddd8d799f5d1f12c1
+ size 14645
train/output/qlora-codellama-bugfix/checkpoint-500/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b1492973eb5cd9c3e10c1c7ee7c4d19bcd22481640c90b6278d1a21419300ba9
+ size 1465
train/output/qlora-codellama-bugfix/checkpoint-500/special_tokens_map.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8f32ca2dd65336aceb9e359886f34f214330137c9e801e8f41cf542287fc2630
+ size 538
train/output/qlora-codellama-bugfix/checkpoint-500/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ef304ea6a0a92c4275c959dd6e1a896bca7e36af0adb2ce25563e7c11bf3927f
+ size 3620829
train/output/qlora-codellama-bugfix/checkpoint-500/tokenizer_config.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0ee02c4a9abf3f4b4fb350d0991f5313bd354c5dd76cabedaacc9e3e1c088e1a
+ size 1869
train/output/qlora-codellama-bugfix/checkpoint-500/trainer_state.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:22d7fb0a27cf19e9d2091a0f3a62b86790ee0448da788f1599a6b1e4e2a83e84
+ size 2543
train/output/qlora-codellama-bugfix/checkpoint-500/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7cca9708d68b5d07e91505e97d7945c08ed5c660e02fce750706e45d57565144
+ size 5777
train/output/qlora-codellama-bugfix/special_tokens_map.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8f32ca2dd65336aceb9e359886f34f214330137c9e801e8f41cf542287fc2630
+ size 538
train/output/qlora-codellama-bugfix/tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ef304ea6a0a92c4275c959dd6e1a896bca7e36af0adb2ce25563e7c11bf3927f
+ size 3620829
train/output/qlora-codellama-bugfix/tokenizer_config.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0ee02c4a9abf3f4b4fb350d0991f5313bd354c5dd76cabedaacc9e3e1c088e1a
+ size 1869
train/train.py ADDED
@@ -0,0 +1,143 @@
+ # QLoRA fine-tuning for CodeLlama-7B-Instruct on 1x H200
+ # Requirements: transformers, peft, accelerate, bitsandbytes, datasets
+ from transformers import (
+     AutoTokenizer,
+     AutoModelForCausalLM,
+     TrainingArguments,
+     Trainer,
+     BitsAndBytesConfig,
+     DataCollatorForSeq2Seq
+ )
+ from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
+ from datasets import load_dataset
+ import torch
+ import os
+ import wandb
+
+ os.environ["WANDB_PROJECT"] = "codellama-7b-instruct-qlora-linux-bugfix"
+ os.environ["WANDB_NAME"] = "run-v1"
+ # Paths and model (the canonical Hub id uses "CodeLlama" casing)
+ BASE_MODEL = "codellama/CodeLlama-7b-Instruct-hf"
+ DATA_PATH = "../dataset/linux_bugfix_100k.jsonl"
+ OUTPUT_DIR = "./output/qlora-codellama-bugfix"
+
+ # Load dataset (prompt-completion format)
+ dataset = load_dataset("json", data_files=DATA_PATH, split="train")
+
+ # BitsAndBytes config for QLoRA
+ bnb_config = BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_use_double_quant=True,
+     bnb_4bit_quant_type="nf4",
+     bnb_4bit_compute_dtype=torch.bfloat16  # optimized for H100/H200
+ )
+
+ # Load tokenizer and model
+ tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
+ tokenizer.pad_token = tokenizer.eos_token  # LLaMA has no pad token; reuse EOS
+ tokenizer.padding_side = "right"
+
+ model = AutoModelForCausalLM.from_pretrained(
+     BASE_MODEL,
+     quantization_config=bnb_config,
+     device_map="auto"
+ )
+ model = prepare_model_for_kbit_training(model)
+ model.gradient_checkpointing_enable()
+ torch.backends.cuda.matmul.allow_tf32 = True
+
+ # Apply QLoRA (LoRA config)
+ lora_config = LoraConfig(
+     r=64,
+     lora_alpha=16,
+     lora_dropout=0.1,
+     bias="none",
+     task_type="CAUSAL_LM"
+ )
+ model = get_peft_model(model, lora_config)
+ model.config.use_cache = False  # KV cache is incompatible with gradient checkpointing
+ model.config.return_dict = True
+ model.config.pad_token_id = tokenizer.pad_token_id
+ model.print_trainable_parameters()
+
+ # Format and tokenize the dataset
+ model_max_len = tokenizer.model_max_length
+
+ def format(example):
+     prompt_ids = tokenizer(example["prompt"], truncation=True, max_length=1024)["input_ids"]
+     completion_ids = tokenizer(example["completion"], truncation=True, max_length=512)["input_ids"]
+
+     input_ids = prompt_ids + completion_ids
+     labels = [-100] * len(prompt_ids) + completion_ids  # -100: no loss on prompt tokens
+
+     # truncate input_ids and labels to the same length; the collator handles padding
+     max_len = min(len(input_ids), model_max_len)
+     input_ids = input_ids[:max_len]
+     labels = labels[:max_len]
+
+     return {
+         "input_ids": input_ids,
+         "labels": labels,
+     }
+
+
+ # Sanity check
+ print("[Sanity] Checking one example...")
+ sample = format(dataset[0])
+ test_input = torch.tensor(sample["input_ids"]).unsqueeze(0).to(model.device)
+ test_labels = torch.tensor(sample["labels"]).unsqueeze(0).to(model.device)
+ model.train()
+ out = model(input_ids=test_input, labels=test_labels)
+ assert out.loss.requires_grad, "Sanity check failed: Loss does not require grad."
+ print("[Sanity] Check passed. Proceeding to map()...")
+
+ # Apply formatting to entire dataset
+ dataset = dataset.map(format, remove_columns=["prompt", "completion"])
+ collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, return_tensors="pt", pad_to_multiple_of=8)
+
+ # Training arguments
+ training_args = TrainingArguments(
+     report_to="wandb",
+     run_name="codellama-7b-instruct-qlora-linux-bugfix",
+     logging_dir=f"{OUTPUT_DIR}/logs",
+
+     output_dir=OUTPUT_DIR,
+     num_train_epochs=3,
+     per_device_train_batch_size=64,
+     gradient_accumulation_steps=4,
+     learning_rate=2e-4,
+     lr_scheduler_type="cosine",
+     warmup_ratio=0.03,
+     gradient_checkpointing=True,
+     bf16=True,  # important for H200
+     fp16=False,
+     max_grad_norm=1.0,
+     save_strategy="steps",
+     save_steps=500,
+     save_total_limit=2,
+     logging_steps=50,
+     push_to_hub=False,
+     label_names=["labels"],
+     remove_unused_columns=False,  # critical to prevent data loss
+ )
+
+ # Trainer setup
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=dataset,
+     tokenizer=tokenizer,
+     data_collator=collator
+ )
+
+
+ # Begin training; resume only if a checkpoint already exists
+ # (a bare resume_from_checkpoint=True raises on the very first run)
+ has_checkpoint = os.path.isdir(OUTPUT_DIR) and any(n.startswith("checkpoint") for n in os.listdir(OUTPUT_DIR))
+ print(f"Tracking W&B run '{os.environ['WANDB_NAME']}' in project '{os.environ['WANDB_PROJECT']}'")
+ trainer.train(resume_from_checkpoint=has_checkpoint)
+
+
+ # Save final model
+ model.save_pretrained(OUTPUT_DIR, safe_serialization=True)
+ tokenizer.save_pretrained(OUTPUT_DIR)
+ print(f"[DONE] Model saved to {OUTPUT_DIR}")
train/train_codellama_qlora.py ADDED
@@ -0,0 +1,96 @@
+ # QLoRA fine-tuning for CodeLlama-7B-Instruct
+ # Requirements: transformers, peft, accelerate, bitsandbytes, datasets
+
+ from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, BitsAndBytesConfig
+ from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
+ from datasets import load_dataset
+ import torch
+ import os
+
+ # Paths and parameters
+ BASE_MODEL = "codellama/CodeLlama-7b-Instruct-hf"
+ DATA_PATH = "../dataset_builder/output/linux_bugfix_prompt_completion.jsonl"
+ OUTPUT_DIR = "./output/qlora-codellama-bugfix"
+
+ # Load dataset (prompt, completion)
+ dataset = load_dataset("json", data_files=DATA_PATH, split="train")
+
+ # Apply formatting for supervised fine-tuning
+ # (uses the tokenizer loaded below; map() only runs after it exists)
+ def format(example):
+     prompt = tokenizer(
+         example["prompt"],
+         truncation=True,
+         max_length=512  # no padding needed: per_device_train_batch_size is 1
+     )
+     completion = tokenizer(
+         example["completion"],
+         truncation=True,
+         max_length=512,
+         add_special_tokens=False  # avoid a stray BOS between prompt and completion
+     )
+     input_ids = prompt["input_ids"] + completion["input_ids"]
+     labels = [-100] * len(prompt["input_ids"]) + completion["input_ids"]  # loss on completion only
+
+     return {
+         "input_ids": input_ids[:1024],
+         "labels": labels[:1024]
+     }
+
+ # Load tokenizer and base model
+ bnb_config = BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_use_double_quant=True,
+     bnb_4bit_quant_type="nf4",
+     bnb_4bit_compute_dtype=torch.float16
+ )
+
+ tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
+ tokenizer.pad_token = tokenizer.eos_token  # LLaMA has no pad token; reuse EOS
+
+ model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, quantization_config=bnb_config, device_map="auto")
+ model = prepare_model_for_kbit_training(model)  # stabilizes training on a 4-bit base
+
+ # Apply QLoRA
+ lora_config = LoraConfig(
+     r=64,
+     lora_alpha=16,
+     lora_dropout=0.1,
+     bias="none",
+     task_type="CAUSAL_LM"
+ )
+
+ model = get_peft_model(model, lora_config)
+
+ # Tokenize dataset
+ dataset = dataset.map(format, remove_columns=["prompt", "completion"])
+
+ # Training args
+ training_args = TrainingArguments(
+     output_dir=OUTPUT_DIR,
+     num_train_epochs=3,
+     per_device_train_batch_size=1,
+     gradient_accumulation_steps=4,
+     learning_rate=2e-4,
+     logging_dir=f"{OUTPUT_DIR}/logs",
+     logging_steps=10,
+     save_strategy="epoch",
+     bf16=False,
+     fp16=True,
+     save_total_limit=2,
+     report_to="none",
+     push_to_hub=False
+ )
+
+ # Trainer
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=dataset,
+     tokenizer=tokenizer
+ )
+
+ trainer.train()
+
+ model.save_pretrained(OUTPUT_DIR)
+ tokenizer.save_pretrained(OUTPUT_DIR)
+ print(f"[DONE] Model saved to {OUTPUT_DIR}")