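"""Build an instruction-tuning dataset of Linux kernel bug fixes.

Walks a local clone of the kernel repository with PyDriller, keeps commits
whose messages look like bug fixes, and writes one JSONL record per modified
.c/.h file, pairing the pre-fix code context with the fix diff.
"""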
import json
import os
import re

from pydriller import Repository
from tqdm import tqdm
REPO_PATH = '../linux'
OUTPUT_FILE = './output/linux_bugfix_dataset.jsonl'
TEST_MODE = False  # Set to True to process only the first MAX_COMMITS_TEST commits
MAX_COMMITS_TEST = 100  # Commit cap used in test mode (value is an assumption; tune as needed)

# Keywords whose presence in a commit message marks it as a likely bug fix.
BUGFIX_KEYWORDS = [
    'fix', 'bug', 'leak', 'null', 'overflow', 'error', 'failure',
    'crash', 'panic', 'memory', 'race', 'deadlock', 'corruption',
    'security', 'vulnerability', 'exploit', 'buffer', 'stack'
]
def is_bugfix_commit(msg):
    """Return True if the commit message contains any bug-fix keyword."""
    msg_lower = msg.lower()
    return any(keyword in msg_lower for keyword in BUGFIX_KEYWORDS)
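# Example: is_bugfix_commit("mm/slub: fix object leak in kmem_cache_destroy")
# returns True, since the lowercased message contains 'fix' (and 'leak').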
def extract_instruction_from_commit_msg(msg):
    """Pick the first substantive line of the commit message as the
    instruction, skipping patch tags and sign-off/review trailers."""
    lines = msg.strip().splitlines()
    for line in lines:
        line = line.strip()
        if len(line) < 5 or not any(c.isalpha() for c in line):
            continue
        if line.lower().startswith((
            '[patch]', 'signed-off-by', 'reviewed-by', 'tested-by', 'ack',
            'reported-by', 'cc:', 'co-authored-by', 'patchwork-id',
            'suggested-by', 'fixes:', 'link:', 'cherry picked from commit'
        )):
            continue
        return line
    return msg.strip().splitlines()[0] if msg.strip() else "fix"
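# Example (illustrative message): for a kernel-style body such as
#     mm/slub: fix object leak on cache teardown
#
#     Fixes: 1234abcd ("mm/slub: rework teardown")
#     Signed-off-by: A Developer <a@example.org>
# the summary line "mm/slub: fix object leak on cache teardown" is returned;
# 'Fixes:' and 'Signed-off-by' lines would be skipped if encountered first.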
def extract_code_context(code, line_number, context_lines=10):
    """Return a window of roughly context_lines lines on either side of
    line_number (1-based, as reported in diff hunk headers)."""
    if not code:
        return ""
    lines = code.split('\n')
    start = max(0, line_number - context_lines)
    end = min(len(lines), line_number + context_lines)
    return '\n'.join(lines[start:end])
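# Example: with a 100-line file and line_number=50, this returns the slice
# lines[40:60], i.e. source lines 41-60 around the start of the changed hunk.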
def extract_diff_context(diff_text, context_lines=5):
    """Trim a diff to the window spanning its first and last changed lines,
    padded by context_lines lines on each side."""
    if not diff_text:
        return ""
    lines = diff_text.split('\n')
    change_lines = [i for i, line in enumerate(lines) if line.startswith('+') or line.startswith('-')]
    if not change_lines:
        return diff_text
    start = max(0, change_lines[0] - context_lines)
    end = min(len(lines), change_lines[-1] + context_lines + 1)
    return '\n'.join(lines[start:end])
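# Note: '---'/'+++' file-header lines, if present in the diff text, also start
# with '-'/'+' and therefore count as change lines, widening the window.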
def create_dataset_entry(original_code, commit_msg, diff_code):
    """Assemble one training record: buggy code plus instruction in, diff out."""
    return {
        "input": {
            "original code": original_code.strip(),
            "instruction": extract_instruction_from_commit_msg(commit_msg)
        },
        "output": {
            "diff codes": diff_code.strip()
        }
    }
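# Each record serializes to one JSONL line shaped like:
# {"input": {"original code": "...", "instruction": "..."},
#  "output": {"diff codes": "..."}}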
def process_commit(commit):
    """Extract dataset entries from a bug-fix commit's modified C sources."""
    entries = []
    if not is_bugfix_commit(commit.msg):
        return entries
    for mod in commit.modified_files:
        # Only modified .c/.h files with both a diff and a pre-image are usable.
        if not mod.new_path or not mod.new_path.endswith(('.c', '.h')):
            continue
        if mod.change_type.name != "MODIFY":
            continue
        if not mod.diff or not mod.source_code_before:
            continue
        focused_diff = extract_diff_context(mod.diff)
        # Pull the pre-image start line of each hunk from headers such as
        # "@@ -123,7 +123,8 @@" (captures 123).
        diff_lines = mod.diff.split('\n')
        line_numbers = []
        for line in diff_lines:
            if line.startswith('@@'):
                match = re.search(r'@@ -(\d+),?\d* \+\d+,?\d* @@', line)
                if match:
                    line_numbers.append(int(match.group(1)))
        if line_numbers:
            focused_code = extract_code_context(mod.source_code_before, line_numbers[0])
        else:
            # Fallback: no hunk header parsed; take the top of the file.
            focused_code = '\n'.join(mod.source_code_before.split('\n')[:50])
        entry = create_dataset_entry(
            original_code=focused_code,
            commit_msg=commit.msg,
            diff_code=focused_diff
        )
        entries.append(entry)
    return entries
def main():
    if not os.path.exists(REPO_PATH):
        print(f"❌ Repository not found at: {REPO_PATH}")
        return
    os.makedirs('./output', exist_ok=True)

    print("🔍 Building Linux kernel bug-fix dataset...")
    print(f"📁 Repository: {REPO_PATH}")
    print(f"📎 Output: {OUTPUT_FILE}")

    output_file = OUTPUT_FILE.replace('.jsonl', '_test.jsonl') if TEST_MODE else OUTPUT_FILE
    repo = Repository(REPO_PATH)

    dataset_entries = []
    processed_commits = 0
    total_commits = 0
    bugfix_commits = 0
    for commit in tqdm(repo.traverse_commits(), desc="Processing commits"):
        total_commits += 1
        if TEST_MODE and MAX_COMMITS_TEST and total_commits > MAX_COMMITS_TEST:
            break
        if is_bugfix_commit(commit.msg):
            bugfix_commits += 1
        entries = process_commit(commit)
        if entries:
            dataset_entries.extend(entries)
            processed_commits += 1
            if TEST_MODE:
                print(f"\n🔍 Bug-fix commit {processed_commits}: {commit.hash[:8]}")
                print(f"📝 Message: {extract_instruction_from_commit_msg(commit.msg)}")
                print(f"📊 Entries: {len(entries)} extracted")
                print(f"📁 Files: {[mod.new_path for mod in commit.modified_files if mod.new_path and mod.new_path.endswith(('.c', '.h'))]}")
    with open(output_file, 'w', encoding='utf-8') as f:
        for entry in dataset_entries:
            f.write(json.dumps(entry, ensure_ascii=False) + '\n')

    print("\n✅ Dataset creation completed!")
    print(f"📊 Total commits processed: {total_commits}")
    print(f"🐛 Bug-fix commits found: {bugfix_commits}")
    print(f"📝 Commits with valid entries: {processed_commits}")
    print(f"📝 Total dataset entries: {len(dataset_entries)}")
    print(f"📎 Saved to: {output_file}")

    if dataset_entries:
        print("\n📋 Sample dataset entry:")
        sample = dataset_entries[0]
        print(json.dumps(sample, indent=2, ensure_ascii=False)[:800] + "...")

    print("\n📁 Dataset structure:")
    print("  - Input: original code + instruction")
    print("  - Output: diff codes")
    print("  - Format: JSONL (one JSON object per line)")


if __name__ == "__main__":
    main()
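
# To inspect the resulting dataset afterwards (illustrative sketch, not part
# of this script's flow), the JSONL file can be read back line by line:
#
#     with open(OUTPUT_FILE, encoding='utf-8') as f:
#         entries = [json.loads(line) for line in f]
#     print(entries[0]["input"]["instruction"])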