Upload 5 files
Browse files- README.md +14 -0
- app.py +679 -0
- bug_reports.csv +4 -0
- evaluate_system.py +440 -0
- requirements.txt +10 -0
README.md
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: Bug Report Agent
|
3 |
+
emoji: ๐
|
4 |
+
colorFrom: pink
|
5 |
+
colorTo: red
|
6 |
+
sdk: gradio
|
7 |
+
sdk_version: 5.32.0
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
license: apache-2.0
|
11 |
+
short_description: Bug Report Analysis Agent Using RAG
|
12 |
+
---
|
13 |
+
|
14 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
@@ -0,0 +1,679 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
๐ Enhanced Bug Report Analysis Agent
|
3 |
+
=====================================
|
4 |
+
A comprehensive RAG-based system for analyzing bug reports, finding similar issues,
|
5 |
+
and suggesting fixes with evaluation metrics for retrieval relevance and usefulness.
|
6 |
+
"""
|
7 |
+
|
8 |
+
import os
|
9 |
+
import pandas as pd
|
10 |
+
import numpy as np
|
11 |
+
import gradio as gr
|
12 |
+
import sqlite3
|
13 |
+
import json
|
14 |
+
import ast
|
15 |
+
import re
|
16 |
+
from datetime import datetime, timedelta
|
17 |
+
from typing import List, Dict, Tuple, Optional
|
18 |
+
import logging
|
19 |
+
|
20 |
+
# Core RAG and ML imports
|
21 |
+
from sentence_transformers import SentenceTransformer
|
22 |
+
import faiss
|
23 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
24 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
25 |
+
import nltk
|
26 |
+
from fuzzywuzzy import fuzz, process
|
27 |
+
|
28 |
+
# LangChain imports
|
29 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
30 |
+
from langchain.docstore.document import Document
|
31 |
+
|
32 |
+
# Evaluation metrics
|
33 |
+
from rouge_score import rouge_scorer
|
34 |
+
import difflib
|
35 |
+
|
36 |
+
# Configure logging first so a failed NLTK download below is actually reported.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Download required NLTK data. This stays best-effort (startup continues on
# failure), but the failure is logged instead of being silently discarded, and
# KeyboardInterrupt/SystemExit are no longer swallowed.
try:
    nltk.download('punkt', quiet=True)
    nltk.download('stopwords', quiet=True)
except Exception as exc:
    logger.warning("Could not download NLTK data: %s", exc)
|
46 |
+
|
47 |
+
class BugReportRAG:
    """Enhanced RAG system for bug report analysis.

    Loads bug reports from ``bug_reports.csv`` and source files from the
    ``codebase`` directory, embeds both with a SentenceTransformer model and
    stores the L2-normalized embeddings in FAISS inner-product indices, so the
    returned scores are cosine similarities.
    """

    # Columns copied from each CSV row into the per-bug metadata dict.
    _METADATA_FIELDS = (
        'id', 'title', 'description', 'severity', 'status', 'component',
        'fix_description', 'related_files', 'created_date', 'resolved_date',
    )
    # File extensions that are indexed as code.
    _CODE_EXTENSIONS = ('.py', '.js', '.html', '.css')

    def __init__(self):
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
        # Not used by any method visible in this class; kept for callers.
        self.tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
        self.bug_index = None
        self.code_index = None
        self.bug_data = None
        self.code_data = None
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=50,
            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
        )

    def load_and_index_data(self):
        """Load bug reports and code files, then build both FAISS indices."""
        logger.info("Loading and indexing data...")

        self._load_bug_reports()
        self._load_code_files()
        self._create_faiss_indices()

        logger.info("Data loading and indexing completed")

    @staticmethod
    def _clean_field(row, name):
        """Return ``row[name]``, or ``''`` if the column is absent or the cell is empty.

        pandas represents an empty CSV cell as NaN, which is truthy and
        formats as ``"nan"``; mapping it to ``''`` keeps it out of the
        embedded text and the displayed metadata.
        """
        value = row.get(name, '')
        return '' if pd.isna(value) else value

    def _load_bug_reports(self):
        """Read ``bug_reports.csv`` into ``self.bug_data``.

        ``self.bug_data`` becomes ``{'texts': [...], 'metadata': [...]}`` with
        one entry per CSV row. On any error the failure is logged and both
        lists are left empty.
        """
        try:
            df = pd.read_csv("bug_reports.csv")

            bug_texts = []
            bug_metadata = []

            for _, row in df.iterrows():
                metadata = {
                    name: self._clean_field(row, name)
                    for name in self._METADATA_FIELDS
                }

                # Combine relevant fields for better semantic search
                text_parts = [
                    f"Title: {metadata['title']}",
                    f"Description: {metadata['description']}",
                    f"Component: {metadata['component']}",
                    f"Severity: {metadata['severity']}",
                    f"Status: {metadata['status']}",
                ]
                if metadata['fix_description'] != '':
                    text_parts.append(f"Fix: {metadata['fix_description']}")

                bug_texts.append(" | ".join(text_parts))
                bug_metadata.append(metadata)

            self.bug_data = {
                'texts': bug_texts,
                'metadata': bug_metadata
            }

        except Exception as e:
            logger.error(f"Error loading bug reports: {e}")
            self.bug_data = {'texts': [], 'metadata': []}

    def _load_code_files(self):
        """Walk ``codebase`` and store code chunks in ``self.code_data``.

        Files longer than 1000 characters are split with ``self.text_splitter``;
        shorter files are stored as a single chunk. Unreadable files are
        logged and skipped.
        """
        code_texts = []
        code_metadata = []

        for root, dirs, files in os.walk("codebase"):
            for file in files:
                if not file.endswith(self._CODE_EXTENSIONS):
                    continue
                file_path = os.path.join(root, file)
                try:
                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                        content = f.read()

                    # Split large files into chunks
                    if len(content) > 1000:
                        chunks = self.text_splitter.split_text(content)
                    else:
                        chunks = [content]

                    for i, chunk in enumerate(chunks):
                        code_texts.append(f"File: {file} | {chunk}")
                        code_metadata.append({
                            'file_path': file_path,
                            'file_name': file,
                            'chunk_index': i,
                            'total_chunks': len(chunks)
                        })

                except Exception as e:
                    logger.warning(f"Error reading {file_path}: {e}")

        self.code_data = {
            'texts': code_texts,
            'metadata': code_metadata
        }

    def _embed(self, texts):
        """Encode *texts* and return L2-normalized, contiguous float32 vectors."""
        # Convert to float32 *before* normalizing: normalize_L2 works in place,
        # so it has to run on the exact array that is handed to FAISS.
        vectors = np.ascontiguousarray(self.embedding_model.encode(texts), dtype='float32')
        faiss.normalize_L2(vectors)
        return vectors

    def _build_index(self, texts):
        """Return an inner-product FAISS index over the embeddings of *texts*."""
        vectors = self._embed(texts)
        index = faiss.IndexFlatIP(vectors.shape[1])
        index.add(vectors)
        return index

    def _create_faiss_indices(self):
        """Create FAISS indices for bug reports and code (skipped when empty)."""
        if self.bug_data['texts']:
            self.bug_index = self._build_index(self.bug_data['texts'])

        if self.code_data['texts']:
            self.code_index = self._build_index(self.code_data['texts'])

    @staticmethod
    def _collect_hits(scores, indices, metadata, texts=None):
        """Pair search scores with copies of their metadata entries.

        Args:
            scores: similarity scores for one query.
            indices: positions into *metadata*, parallel to *scores*.
            metadata: list of metadata dicts.
            texts: optional list parallel to *metadata*; when given, the
                matching entry is attached as ``'code_text'``.

        Returns:
            List of dict copies with ``'similarity_score'`` added.
        """
        hits = []
        for score, idx in zip(scores, indices):
            idx = int(idx)
            # FAISS pads with index -1 when fewer than k vectors are stored;
            # a negative index would silently wrap around to the last item.
            if not 0 <= idx < len(metadata):
                continue
            hit = metadata[idx].copy()
            hit['similarity_score'] = float(score)
            if texts is not None:
                hit['code_text'] = texts[idx]
            hits.append(hit)
        return hits

    def search_similar_bugs(self, query: str, k: int = 5) -> List[Dict]:
        """Return up to *k* bug metadata dicts most similar to *query*.

        Each result is a copy of the stored metadata with a
        ``'similarity_score'`` float added. Returns ``[]`` when no bug index
        has been built.
        """
        if self.bug_index is None or not self.bug_data['texts']:
            return []

        scores, indices = self.bug_index.search(self._embed([query]), k)
        return self._collect_hits(scores[0], indices[0], self.bug_data['metadata'])

    def search_relevant_code(self, query: str, k: int = 5) -> List[Dict]:
        """Return up to *k* code chunks most similar to *query*.

        Each result is a copy of the chunk metadata with
        ``'similarity_score'`` and ``'code_text'`` added. Returns ``[]`` when
        no code index has been built.
        """
        if self.code_index is None or not self.code_data['texts']:
            return []

        scores, indices = self.code_index.search(self._embed([query]), k)
        return self._collect_hits(
            scores[0], indices[0], self.code_data['metadata'], texts=self.code_data['texts']
        )
|
227 |
+
|
228 |
+
class BugAnalysisEvaluator:
    """Evaluate the quality and relevance of bug analysis results."""

    # Patterns whose matches count towards the specificity score. They are
    # applied to the lower-cased suggestion text.
    _SPECIFICITY_PATTERNS = tuple(re.compile(pattern) for pattern in (
        r'\b\w+\.py\b',                                # Python files
        r'\bdef \w+\b',                                # Function definitions
        r'\bclass \w+\b',                              # Class definitions
        r'\b\w+\(\)',                                  # Function calls
        r'\bfix\b|\bupdate\b|\bchange\b|\bmodify\b',   # Action words
    ))

    # Words whose presence counts towards the actionability score.
    # NOTE(review): matched as substrings, so 'add' also hits "address" and
    # 'test' hits "latest" -- confirm whether whole-word matching is wanted.
    _ACTIONABLE_PHRASES = (
        'check', 'verify', 'update', 'modify', 'fix', 'add', 'remove',
        'ensure', 'validate', 'test', 'debug', 'implement', 'configure',
    )

    def __init__(self):
        self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    def evaluate_retrieval_relevance(self, query: str, results: List[Dict]) -> Dict:
        """Score how relevant retrieved results are to the query.

        Args:
            query: the user's bug description.
            results: retrieved dicts; ``similarity_score``, ``title`` and
                ``description`` are read when present.

        Returns:
            Dict with ``average_similarity`` (mean of the similarity scores),
            ``relevance_score`` (mean fuzzy partial ratio between the query
            and each result's title + description, scaled to 0-1),
            ``result_count`` and ``individual_scores``. The same keys are
            returned for an empty result list.
        """
        if not results:
            return {
                'average_similarity': 0.0,
                'relevance_score': 0.0,
                'result_count': 0,
                # Present so callers can rely on one schema in both cases.
                'individual_scores': []
            }

        similarity_scores = [r.get('similarity_score', 0.0) for r in results]
        average_similarity = np.mean(similarity_scores)

        query_lower = query.lower()
        relevance_scores = []
        for result in results:
            # Combine title and description for relevance calculation
            result_text = f"{result.get('title', '')} {result.get('description', '')}"
            relevance_scores.append(
                fuzz.partial_ratio(query_lower, result_text.lower()) / 100.0
            )
        relevance_score = np.mean(relevance_scores)

        return {
            'average_similarity': float(average_similarity),
            'relevance_score': float(relevance_score),
            'result_count': len(results),
            'individual_scores': similarity_scores
        }

    def evaluate_suggestion_usefulness(self, query: str, suggestions: str) -> Dict:
        """Score generated suggestions for completeness, specificity and actionability.

        Returns:
            Dict with ``completeness_score`` (ROUGE-L F-measure between query
            and suggestions), ``specificity_score`` and
            ``actionability_score`` (match counts divided by 5, capped at 1)
            and ``overall_usefulness`` (0.3 / 0.4 / 0.3 weighted sum). All
            zeros when either argument is empty.
        """
        if not suggestions or not query:
            return {
                'completeness_score': 0.0,
                'specificity_score': 0.0,
                'actionability_score': 0.0,
                'overall_usefulness': 0.0
            }

        suggestions_lower = suggestions.lower()

        # Completeness: How well suggestions address the query
        rouge_scores = self.rouge_scorer.score(query.lower(), suggestions_lower)
        completeness_score = rouge_scores['rougeL'].fmeasure

        # Specificity: Presence of specific technical terms, file names, functions
        specificity_count = sum(
            len(pattern.findall(suggestions_lower))
            for pattern in self._SPECIFICITY_PATTERNS
        )
        specificity_score = min(specificity_count / 5.0, 1.0)  # Normalize to 0-1

        # Actionability: Presence of actionable steps
        actionability_count = sum(
            1 for phrase in self._ACTIONABLE_PHRASES if phrase in suggestions_lower
        )
        actionability_score = min(actionability_count / 5.0, 1.0)

        # Overall usefulness (weighted average)
        overall_usefulness = (
            0.3 * completeness_score +
            0.4 * specificity_score +
            0.3 * actionability_score
        )

        return {
            'completeness_score': float(completeness_score),
            'specificity_score': float(specificity_score),
            'actionability_score': float(actionability_score),
            'overall_usefulness': float(overall_usefulness)
        }
|
316 |
+
|
317 |
+
class FixSuggestionEngine:
    """Generate fix suggestions from canned per-component advice and retrieved context."""

    # (component, trigger keywords) pairs, checked in this order against the
    # lower-cased query. Keywords are matched as substrings.
    _KEYWORD_GROUPS = (
        ('authentication', ('login', 'auth', 'password', 'session')),
        ('database', ('database', 'db', 'query', 'connection')),
        ('email', ('email', 'smtp', 'mail', 'notification')),
        ('ui', ('button', 'form', 'ui', 'interface', 'display')),
    )

    def __init__(self):
        self.common_fixes = {
            'authentication': [
                "Check password validation regex patterns",
                "Verify session management configuration",
                "Ensure proper error handling in login flow",
                "Review authentication middleware setup"
            ],
            'database': [
                "Check database connection pooling settings",
                "Review query optimization and indexing",
                "Verify transaction handling and rollbacks",
                "Check for connection timeout configurations"
            ],
            'email': [
                "Verify SMTP server configuration",
                "Check email template rendering",
                "Ensure email credentials are properly set",
                "Review email queue processing"
            ],
            'ui': [
                "Check JavaScript event listeners",
                "Verify CSS styling and responsive design",
                "Review form validation logic",
                "Ensure proper DOM element targeting"
            ]
        }

    @staticmethod
    def _is_present(value) -> bool:
        """Return True for non-empty values; a float NaN counts as absent.

        NaN is truthy, so a plain truthiness test would print "nan" for
        fields that pandas loaded from empty CSV cells.
        """
        if isinstance(value, float) and value != value:
            return False
        return bool(value)

    def generate_suggestions(self, query: str, similar_bugs: List[Dict],
                             relevant_code: List[Dict]) -> str:
        """Build a Markdown report of fix suggestions.

        Args:
            query: the user's bug description.
            similar_bugs: retrieved bug dicts; the first three are listed.
            relevant_code: retrieved code dicts; the first three are listed.

        Returns:
            Markdown text with an analysis summary, optional similar-issue,
            code and suggested-action sections, and general debugging steps.
        """
        suggestions = []

        # Add context-based suggestions
        suggestions.append("## ๐ Analysis Summary")
        suggestions.append(f"Based on the query: '{query}'")
        suggestions.append("")

        # Add similar bug insights
        if similar_bugs:
            suggestions.append("## ๐ชฒ Similar Issues Found")
            for i, bug in enumerate(similar_bugs[:3], 1):
                status = bug.get('status', 'Unknown')
                severity = bug.get('severity', 'Unknown')
                suggestions.append(f"{i}. **{bug.get('title', 'Untitled')}** (Status: {status}, Severity: {severity})")

                if self._is_present(bug.get('fix_description')):
                    suggestions.append(f"   - Previous fix: {bug['fix_description']}")
            suggestions.append("")

        # Add code analysis
        if relevant_code:
            suggestions.append("## ๐ป Relevant Code Sections")
            for i, code in enumerate(relevant_code[:3], 1):
                file_name = code.get('file_name', 'Unknown file')
                suggestions.append(f"{i}. **{file_name}** (Similarity: {code.get('similarity_score', 0):.2f})")
            suggestions.append("")

        # Add specific fix suggestions based on component analysis
        component_suggestions = self._get_component_suggestions(query, similar_bugs)
        if component_suggestions:
            suggestions.append("## ๐ ๏ธ Suggested Actions")
            for suggestion in component_suggestions:
                suggestions.append(f"- {suggestion}")
            suggestions.append("")

        # Add general debugging steps
        suggestions.append("## ๐ง General Debugging Steps")
        suggestions.extend([
            "- Review error logs and stack traces",
            "- Test in different environments (dev/staging/prod)",
            "- Check recent code changes in related files",
            "- Verify configuration settings",
            "- Run relevant test suites",
            "- Consider rollback if issue is critical"
        ])

        return "\n".join(suggestions)

    def _get_component_suggestions(self, query: str, similar_bugs: List[Dict]) -> List[str]:
        """Return de-duplicated canned fixes for the components a query touches.

        Components are detected from keywords in *query* and from the
        ``component`` field of *similar_bugs*. The result keeps first-seen
        order, so the same input always yields the same list.
        """
        suggestions = []
        query_lower = query.lower()

        # Keyword-based component detection
        for component, keywords in self._KEYWORD_GROUPS:
            if any(keyword in query_lower for keyword in keywords):
                suggestions.extend(self.common_fixes.get(component, []))

        # Component-based suggestions from similar bugs. Non-string values
        # (e.g. NaN from an empty CSV cell) have no .lower() and are skipped.
        for bug in similar_bugs:
            component = bug.get('component', '')
            if not isinstance(component, str):
                continue
            component = component.lower()
            if component and component in self.common_fixes:
                suggestions.extend(self.common_fixes[component])

        # dict.fromkeys removes duplicates while preserving insertion order;
        # set() would reorder the list differently between runs.
        return list(dict.fromkeys(suggestions))
|
426 |
+
|
427 |
+
# Initialize the RAG system and other components.
# These module-level singletons are what analyze_bug_report() uses, and they
# are created as a side effect of importing this module.
rag_system = BugReportRAG()
evaluator = BugAnalysisEvaluator()
suggestion_engine = FixSuggestionEngine()

# Load and index data on startup (runs at import time, before any query is served)
rag_system.load_and_index_data()
|
434 |
+
|
435 |
+
def analyze_bug_report(query: str) -> Tuple[str, str, str, str]:
    """Analyze a bug description and return the four Markdown panels.

    Args:
        query: free-text bug description. ``None`` and blank strings are
            treated as missing input.

    Returns:
        ``(similar_bugs, relevant_code, suggestions, evaluation)`` as Markdown
        strings. For missing input the first element is a prompt and the rest
        are empty; if the analysis raises, the first element is
        ``"Error: <message>"`` and the rest are empty.
    """
    # Checked before the try block so None yields the prompt rather than an
    # "Error: 'NoneType' object has no attribute 'strip'" message.
    if not query or not query.strip():
        return "Please enter a bug description", "", "", ""

    try:
        logger.info(f"Analyzing query: {query}")

        # Search for similar bugs and relevant code
        similar_bugs = rag_system.search_similar_bugs(query, k=5)
        relevant_code = rag_system.search_relevant_code(query, k=5)

        # Generate suggestions
        suggestions = suggestion_engine.generate_suggestions(query, similar_bugs, relevant_code)

        # Evaluate results
        bug_evaluation = evaluator.evaluate_retrieval_relevance(query, similar_bugs)
        suggestion_evaluation = evaluator.evaluate_suggestion_usefulness(query, suggestions)

        # Format the three derived panels
        similar_bugs_output = format_similar_bugs(similar_bugs, bug_evaluation)
        relevant_code_output = format_relevant_code(relevant_code)
        evaluation_output = format_evaluation_metrics(bug_evaluation, suggestion_evaluation)

        return similar_bugs_output, relevant_code_output, suggestions, evaluation_output

    except Exception as e:
        # Top-level UI boundary: report the error in the first panel instead of
        # raising. logger.exception records the traceback as well.
        logger.exception(f"Error analyzing bug report: {e}")
        return f"Error: {str(e)}", "", "", ""
|
468 |
+
|
469 |
+
def format_similar_bugs(bugs: List[Dict], evaluation: Dict) -> str:
    """Render retrieved bug reports as a Markdown section.

    Args:
        bugs: retrieved bug dicts.
        evaluation: must contain ``relevance_score`` and
            ``average_similarity`` when *bugs* is non-empty.

    Returns:
        Markdown text, or a fixed "not found" sentence when *bugs* is empty.
    """
    if not bugs:
        return "No similar bugs found in the database."

    # Section header with the aggregate scores
    lines = [
        f"## ๐ Found {len(bugs)} Similar Bug Reports",
        f"**Relevance Score: {evaluation['relevance_score']:.2f}/1.0**",
        f"**Average Similarity: {evaluation['average_similarity']:.2f}/1.0**",
        "",
    ]

    for position, bug in enumerate(bugs, start=1):
        lines += [
            f"### {position}. {bug.get('title', 'Untitled Bug')}",
            f"**ID:** {bug.get('id', 'N/A')} | **Severity:** {bug.get('severity', 'N/A')} | **Status:** {bug.get('status', 'N/A')}",
            f"**Similarity:** {bug.get('similarity_score', 0):.3f}",
            f"**Component:** {bug.get('component', 'N/A')}",
            "",
            f"**Description:** {bug.get('description', 'No description available')}",
        ]

        # Optional fields are printed only when they carry a value
        previous_fix = bug.get('fix_description')
        if previous_fix:
            lines.append(f"**Previous Fix:** {previous_fix}")

        related = bug.get('related_files')
        if related:
            lines.append(f"**Related Files:** {related}")

        lines.append("---")

    return "\n".join(lines)
|
496 |
+
|
497 |
+
def format_relevant_code(code_results: List[Dict]) -> str:
    """Render retrieved code chunks as a Markdown section.

    Args:
        code_results: retrieved code dicts; ``file_name``, ``file_path``,
            ``similarity_score``, ``chunk_index``, ``total_chunks`` and
            ``code_text`` are read when present.

    Returns:
        Markdown text with one fenced snippet (at most 500 characters) per
        result, or a fixed "not found" sentence when the list is empty.
    """
    if not code_results:
        return "No relevant code sections found."

    # Fence language per indexed extension; anything else keeps the
    # previous hard-coded "python" label.
    fence_languages = {'.py': 'python', '.js': 'javascript', '.html': 'html', '.css': 'css'}

    output = [f"## ๐ป Found {len(code_results)} Relevant Code Sections"]
    output.append("")

    for i, code in enumerate(code_results, 1):
        file_name = code.get('file_name', 'Unknown file')
        similarity = code.get('similarity_score', 0)

        output.append(f"### {i}. {file_name}")
        output.append(f"**Similarity:** {similarity:.3f} | **Path:** {code.get('file_path', 'N/A')}")

        # Show the position for every chunk of a split file, including the
        # first one (a chunk_index > 0 test would hide "1/N").
        total_chunks = code.get('total_chunks', 1)
        if total_chunks > 1:
            output.append(f"**Chunk:** {code.get('chunk_index', 0) + 1}/{total_chunks}")

        output.append("")

        # Strip the "File: <name> |" prefix added at indexing time. partition()
        # never raises, unlike unpacking split('|', 1) when no '|' is present.
        code_text = code.get('code_text', '')
        _, separator, remainder = code_text.partition('|')
        if 'File:' in code_text and separator:
            code_content = remainder.strip()
        else:
            code_content = code_text

        # Limit code display length
        if len(code_content) > 500:
            code_content = code_content[:500] + "\n... (truncated)"

        extension = os.path.splitext(file_name)[1].lower()
        output.append("```" + fence_languages.get(extension, 'python'))
        output.append(code_content)
        output.append("```")
        output.append("---")

    return "\n".join(output)
|
536 |
+
|
537 |
+
def format_evaluation_metrics(bug_eval: Dict, suggestion_eval: Dict) -> str:
    """Render retrieval and suggestion metrics as a Markdown report.

    Args:
        bug_eval: must contain ``average_similarity``, ``relevance_score``
            and ``result_count``.
        suggestion_eval: must contain ``completeness_score``,
            ``specificity_score``, ``actionability_score`` and
            ``overall_usefulness``.

    Returns:
        Markdown text ending with a quality rating derived from the mean of
        the relevance score and the overall usefulness.
    """
    # Mean of the two headline scores decides the rating band
    overall_quality = (bug_eval['relevance_score'] + suggestion_eval['overall_usefulness']) / 2

    # Bands are checked from the highest floor down; below every floor is "Poor"
    rating_bands = (
        (0.8, "๐ข Excellent"),
        (0.6, "๐ก Good"),
        (0.4, "๐ Fair"),
    )
    quality_label = next(
        (label for floor, label in rating_bands if overall_quality >= floor),
        "๐ด Poor",
    )

    report = [
        "## ๐ Analysis Quality Metrics",
        "",
        # Bug retrieval metrics
        "### ๐ Retrieval Relevance",
        f"- **Average Similarity Score:** {bug_eval['average_similarity']:.3f}/1.0",
        f"- **Semantic Relevance:** {bug_eval['relevance_score']:.3f}/1.0",
        f"- **Results Retrieved:** {bug_eval['result_count']}",
        "",
        # Suggestion quality metrics
        "### ๐ ๏ธ Suggestion Quality",
        f"- **Completeness:** {suggestion_eval['completeness_score']:.3f}/1.0",
        f"- **Specificity:** {suggestion_eval['specificity_score']:.3f}/1.0",
        f"- **Actionability:** {suggestion_eval['actionability_score']:.3f}/1.0",
        f"- **Overall Usefulness:** {suggestion_eval['overall_usefulness']:.3f}/1.0",
        "",
        # Quality assessment
        "### โญ Overall Analysis Quality",
        f"**Quality Rating:** {quality_label} ({overall_quality:.3f}/1.0)",
    ]
    return "\n".join(report)
|
573 |
+
|
574 |
+
# Create Gradio interface
def create_interface():
    """Create the Gradio interface for the Bug Report Analysis Agent.

    Builds a ``gr.Blocks`` app with one input textbox, "Analyze" and "Clear"
    buttons and four Markdown output panels, wires the buttons to
    ``analyze_bug_report`` and to a reset callback, and returns the Blocks
    object without launching it.
    """
    # NOTE(review): the nesting of the `with` blocks below is reconstructed;
    # in particular, whether the button row sits inside the input column
    # should be confirmed against the original file.

    with gr.Blocks(
        title="๐ Bug Report Analysis Agent",
        theme=gr.themes.Soft(),
        css="""
        .gradio-container {
            max-width: 1200px !important;
        }
        .tab-nav {
            font-weight: bold;
        }
        """
    ) as demo:

        # Page header
        gr.Markdown("""
        # ๐ Bug Report Analysis Agent

        **Advanced RAG-powered system for intelligent bug analysis**

        This system analyzes bug reports using Retrieval-Augmented Generation (RAG) to:
        - ๐ Find similar past issues in the bug database
        - ๐ป Identify relevant code sections that might be related
        - ๐ ๏ธ Suggest potential causes and fixes
        - ๐ Evaluate retrieval relevance and suggestion usefulness

        ---
        """)

        # Input area: bug description plus the two action buttons
        with gr.Row():
            with gr.Column(scale=1):
                input_box = gr.Textbox(
                    lines=6,
                    label="๐ Bug Description",
                    placeholder="Describe the bug you're experiencing...\n\nExample: 'Login form redirects back to login page after entering correct credentials'",
                    info="Provide as much detail as possible for better analysis"
                )

                with gr.Row():
                    analyze_btn = gr.Button("๐ Analyze Bug", variant="primary", size="lg")
                    clear_btn = gr.Button("๐๏ธ Clear", variant="secondary")

        # First output row: similar bugs and relevant code
        with gr.Row():
            with gr.Column(scale=1):
                similar_bugs_output = gr.Markdown(
                    label="๐ชฒ Similar Bug Reports",
                    value="Enter a bug description and click 'Analyze Bug' to see similar issues..."
                )

            with gr.Column(scale=1):
                relevant_code_output = gr.Markdown(
                    label="๐ป Relevant Code Sections",
                    value="Code analysis will appear here..."
                )

        # Second output row: suggestions and quality metrics
        with gr.Row():
            with gr.Column(scale=1):
                suggestions_output = gr.Markdown(
                    label="๐ ๏ธ Fix Suggestions",
                    value="Intelligent fix suggestions will be generated here..."
                )

            with gr.Column(scale=1):
                evaluation_output = gr.Markdown(
                    label="๐ Quality Metrics",
                    value="Analysis quality metrics will be shown here..."
                )

        # Event handlers
        # The four values returned by analyze_bug_report map onto the four
        # panels in the order listed in `outputs`.
        analyze_btn.click(
            fn=analyze_bug_report,
            inputs=[input_box],
            outputs=[similar_bugs_output, relevant_code_output, suggestions_output, evaluation_output],
            api_name="analyze_bug"
        )

        # Clear empties the textbox and restores each panel's initial
        # placeholder text (the strings repeat the `value=` arguments above).
        clear_btn.click(
            fn=lambda: ("", "Enter a bug description and click 'Analyze Bug' to see similar issues...",
                        "Code analysis will appear here...",
                        "Intelligent fix suggestions will be generated here...",
                        "Analysis quality metrics will be shown here..."),
            inputs=[],
            outputs=[input_box, similar_bugs_output, relevant_code_output, suggestions_output, evaluation_output]
        )

        # Footer
        gr.Markdown("""
        ---
        **๐ Built with:** LangChain โข Sentence Transformers โข FAISS โข Gradio

        **๐ Features:** Semantic Search โข Similarity Scoring โข Code Analysis โข Fix Suggestions โข Quality Evaluation
        """)

    return demo
|
670 |
+
|
671 |
+
if __name__ == "__main__":
    # Build the UI once, then serve it with the launch settings gathered below.
    demo = create_interface()
    launch_options = {
        "share": True,
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
    }
    demo.launch(**launch_options)
|
bug_reports.csv
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
description
|
2 |
+
Login fails with correct credentials
|
3 |
+
Submit button does not respond
|
4 |
+
Error message is not displayed when login fails
|
evaluate_system.py
ADDED
@@ -0,0 +1,440 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
Bug Report Analysis Agent - Comprehensive Evaluation Script
|
4 |
+
============================================================
|
5 |
+
This script demonstrates and evaluates the RAG system's performance
|
6 |
+
on various types of bug reports and provides detailed analysis.
|
7 |
+
"""
|
8 |
+
|
9 |
+
import sys
|
10 |
+
import time
|
11 |
+
import json
|
12 |
+
from typing import Dict, List, Tuple
|
13 |
+
import pandas as pd
|
14 |
+
|
15 |
+
# Import the main system components
|
16 |
+
from app import (
|
17 |
+
rag_system, evaluator, suggestion_engine,
|
18 |
+
analyze_bug_report, format_similar_bugs,
|
19 |
+
format_relevant_code, format_evaluation_metrics
|
20 |
+
)
|
21 |
+
|
22 |
+
class SystemEvaluator:
    """Comprehensive evaluation of the Bug Report Analysis system.

    Runs a fixed suite of sample bug-report queries through the analysis
    pipeline, aggregates timing / retrieval / suggestion scores, prints a
    report to stdout and can dump the raw results to a JSON file.
    """

    def __init__(self):
        # Each test case carries the query text, a category label (used as
        # the grouping key in the per-category analyses), the keywords that
        # are expected to appear in retrieved bugs, and a short description.
        self.test_queries = [
            {
                "query": "Login form redirects back to login page after entering correct credentials",
                "category": "Authentication",
                "expected_components": ["login", "auth", "session"],
                "description": "Classic authentication redirect issue"
            },
            {
                "query": "Database connection times out during high traffic periods",
                "category": "Database",
                "expected_components": ["database", "connection", "timeout"],
                "description": "Performance issue under load"
            },
            {
                "query": "Email notifications for password reset are not being sent to users",
                "category": "Email",
                "expected_components": ["email", "smtp", "password"],
                "description": "Email service functionality problem"
            },
            {
                "query": "Submit button on contact form doesn't respond when clicked",
                "category": "UI/Frontend",
                "expected_components": ["button", "form", "javascript"],
                "description": "Frontend interaction issue"
            },
            {
                "query": "API returns 500 internal server error for user profile updates",
                "category": "API",
                "expected_components": ["api", "profile", "server"],
                "description": "Backend API error"
            },
            {
                "query": "Memory usage increases continuously when uploading large files",
                "category": "Performance",
                "expected_components": ["memory", "upload", "file"],
                "description": "Memory leak in file handling"
            },
            {
                "query": "Dashboard charts show incorrect data for monthly revenue reports",
                "category": "Data/Analytics",
                "expected_components": ["dashboard", "chart", "data"],
                "description": "Data visualization accuracy issue"
            },
            {
                "query": "User session expires too quickly causing frequent re-authentication",
                "category": "Session Management",
                "expected_components": ["session", "timeout", "authentication"],
                "description": "Session timeout configuration issue"
            }
        ]

    @staticmethod
    def _mean(values):
        """Return the arithmetic mean of a non-empty sequence of numbers."""
        return sum(values) / len(values)

    @staticmethod
    def _bug_text(bug) -> str:
        """Return the lower-cased component, description and title of a bug.

        Missing or ``None`` fields are treated as empty strings and
        non-string values are stringified, so a sparse record cannot crash
        the keyword matching.
        """
        parts = []
        for field in ("component", "description", "title"):
            value = bug.get(field, "")
            parts.append("" if value is None else str(value).lower())
        return " ".join(parts)

    def run_comprehensive_evaluation(self) -> Dict:
        """Run every test query, aggregate the results and print the report.

        Returns:
            A dict with the per-test results (``test_results``), the
            aggregated ``performance_metrics``, ``quality_analysis``,
            ``component_coverage``, the generated ``recommendations`` and
            the total wall-clock ``evaluation_time`` in seconds.
        """
        print("๐ Starting Comprehensive Bug Report Analysis Evaluation")
        print("=" * 70)

        start_time = time.time()
        results = {
            "test_results": [],
            "performance_metrics": {},
            "quality_analysis": {},
            "component_coverage": {},
            "recommendations": []
        }

        # Test each query
        for i, test_case in enumerate(self.test_queries, 1):
            print(f"\n๐ Test Case {i}/{len(self.test_queries)}: {test_case['category']}")
            print(f"Query: {test_case['query']}")
            print("-" * 50)

            # Run analysis
            test_result = self.evaluate_single_query(test_case)
            results["test_results"].append(test_result)

            # Print summary
            self.print_test_summary(test_result)

            time.sleep(0.5)  # Brief pause between tests

        # Calculate overall metrics
        results["performance_metrics"] = self.calculate_performance_metrics(results["test_results"])
        results["quality_analysis"] = self.analyze_quality_patterns(results["test_results"])
        results["component_coverage"] = self.analyze_component_coverage(results["test_results"])
        results["recommendations"] = self.generate_recommendations(results)

        total_time = time.time() - start_time
        results["evaluation_time"] = total_time

        # Print final report
        self.print_final_report(results)

        return results

    def evaluate_single_query(self, test_case: Dict) -> Dict:
        """Evaluate a single test query.

        Args:
            test_case: One entry of ``self.test_queries``; only ``"query"``
                is read here, the whole dict is echoed back in the result.

        Returns:
            On success, a dict with ``success=True``, the elapsed
            ``processing_time``, the raw retrieval results, the suggestions,
            both evaluation dicts and the formatted outputs. On any
            exception, a dict with ``success=False``, the elapsed time and
            the stringified ``error``.
        """
        query = test_case["query"]
        start_time = time.time()

        # Run the analysis
        try:
            similar_bugs_output, relevant_code_output, suggestions, evaluation_output = analyze_bug_report(query)

            # Get raw data for analysis
            similar_bugs = rag_system.search_similar_bugs(query, k=5)
            relevant_code = rag_system.search_relevant_code(query, k=5)

            # Evaluate results
            bug_evaluation = evaluator.evaluate_retrieval_relevance(query, similar_bugs)
            suggestion_evaluation = evaluator.evaluate_suggestion_usefulness(query, suggestions)

            processing_time = time.time() - start_time

            return {
                "test_case": test_case,
                "processing_time": processing_time,
                "similar_bugs": similar_bugs,
                "relevant_code": relevant_code,
                "suggestions": suggestions,
                "bug_evaluation": bug_evaluation,
                "suggestion_evaluation": suggestion_evaluation,
                "outputs": {
                    "similar_bugs_output": similar_bugs_output,
                    "relevant_code_output": relevant_code_output,
                    "evaluation_output": evaluation_output
                },
                "success": True
            }

        except Exception as e:
            # Deliberate catch-all: one failing query must not abort the
            # whole suite; the failure is recorded and reported instead.
            return {
                "test_case": test_case,
                "processing_time": time.time() - start_time,
                "error": str(e),
                "success": False
            }

    def print_test_summary(self, result: Dict):
        """Print a short summary for a single test result."""
        if not result["success"]:
            print(f"โ Error: {result['error']}")
            return

        bug_eval = result["bug_evaluation"]
        suggestion_eval = result["suggestion_evaluation"]

        print(f"โฑ๏ธ Processing Time: {result['processing_time']:.2f}s")
        print(f"๐ Similar Bugs Found: {bug_eval['result_count']}")
        print(f"๐ Retrieval Relevance: {bug_eval['relevance_score']:.3f}/1.0")
        print(f"๐ ๏ธ Suggestion Quality: {suggestion_eval['overall_usefulness']:.3f}/1.0")

        # Quality indicator: plain average of the two scores, bucketed.
        overall_quality = (bug_eval['relevance_score'] + suggestion_eval['overall_usefulness']) / 2
        if overall_quality >= 0.8:
            quality_icon = "๐ข"
        elif overall_quality >= 0.6:
            quality_icon = "๐ก"
        elif overall_quality >= 0.4:
            quality_icon = "๐ "
        else:
            quality_icon = "๐ด"

        print(f"{quality_icon} Overall Quality: {overall_quality:.3f}/1.0")

    def calculate_performance_metrics(self, test_results: List[Dict]) -> Dict:
        """Calculate overall performance metrics.

        Returns:
            ``{"error": ...}`` when no test succeeded; otherwise counts, the
            success rate, min/avg/max processing time, average scores,
            average number of bugs found and the sample standard deviation
            of both score series (as computed by ``pandas.Series.std``).
        """
        successful_tests = [r for r in test_results if r["success"]]

        if not successful_tests:
            return {"error": "No successful tests to analyze"}

        processing_times = [r["processing_time"] for r in successful_tests]
        retrieval_scores = [r["bug_evaluation"]["relevance_score"] for r in successful_tests]
        suggestion_scores = [r["suggestion_evaluation"]["overall_usefulness"] for r in successful_tests]
        bug_counts = [r["bug_evaluation"]["result_count"] for r in successful_tests]

        return {
            "total_tests": len(test_results),
            "successful_tests": len(successful_tests),
            "success_rate": len(successful_tests) / len(test_results),
            "average_processing_time": self._mean(processing_times),
            "min_processing_time": min(processing_times),
            "max_processing_time": max(processing_times),
            "average_retrieval_score": self._mean(retrieval_scores),
            "average_suggestion_score": self._mean(suggestion_scores),
            "average_bugs_found": self._mean(bug_counts),
            "retrieval_score_std": pd.Series(retrieval_scores).std(),
            "suggestion_score_std": pd.Series(suggestion_scores).std()
        }

    def analyze_quality_patterns(self, test_results: List[Dict]) -> Dict:
        """Group successful results by category and average their scores.

        Returns:
            A dict keyed by category; each value holds the ``count``, the
            raw score / time lists and their ``avg_retrieval``,
            ``avg_suggestion`` and ``avg_processing_time``.
        """
        category_analysis = {}
        for result in test_results:
            if not result["success"]:
                continue

            category = result["test_case"]["category"]
            bucket = category_analysis.setdefault(category, {
                "count": 0,
                "retrieval_scores": [],
                "suggestion_scores": [],
                "processing_times": []
            })

            bucket["count"] += 1
            bucket["retrieval_scores"].append(result["bug_evaluation"]["relevance_score"])
            bucket["suggestion_scores"].append(result["suggestion_evaluation"]["overall_usefulness"])
            bucket["processing_times"].append(result["processing_time"])

        # Calculate averages for each category (every bucket has >= 1 entry).
        for data in category_analysis.values():
            data["avg_retrieval"] = self._mean(data["retrieval_scores"])
            data["avg_suggestion"] = self._mean(data["suggestion_scores"])
            data["avg_processing_time"] = self._mean(data["processing_times"])

        return category_analysis

    def analyze_component_coverage(self, test_results: List[Dict]) -> Dict:
        """Check which expected keywords show up in the retrieved bugs.

        For every successful result, each expected component keyword is
        searched (case-insensitive substring match) in the component,
        description and title of the retrieved bugs.

        Returns:
            A dict keyed by test-case category with the ``expected``
            keywords, the ``found`` keywords and their ``coverage_ratio``
            (0 when nothing was expected).
        """
        component_coverage = {}

        for result in test_results:
            if not result["success"]:
                continue

            test_case = result["test_case"]
            expected_components = test_case.get("expected_components", [])

            # Check if similar bugs contain expected components
            found_components = set()
            for bug in result["similar_bugs"]:
                haystack = self._bug_text(bug)
                for expected in expected_components:
                    if expected.lower() in haystack:
                        found_components.add(expected)

            component_coverage[test_case["category"]] = {
                "expected": expected_components,
                "found": list(found_components),
                "coverage_ratio": len(found_components) / len(expected_components) if expected_components else 0
            }

        return component_coverage

    def generate_recommendations(self, results: Dict) -> List[str]:
        """Generate recommendations based on evaluation results.

        Args:
            results: Must contain ``performance_metrics`` and
                ``quality_analysis`` as produced by this class.

        Returns:
            A non-empty list of human-readable recommendation strings.
        """
        recommendations = []
        performance = results["performance_metrics"]
        quality = results["quality_analysis"]

        # Performance recommendations
        if performance.get("average_processing_time", 0) > 3.0:
            recommendations.append("Consider optimizing query processing time (currently > 3s average)")

        if performance.get("success_rate", 1.0) < 0.95:
            recommendations.append("Improve error handling and system reliability")

        # Quality recommendations
        avg_retrieval = performance.get("average_retrieval_score", 0)
        avg_suggestion = performance.get("average_suggestion_score", 0)

        if avg_retrieval < 0.7:
            recommendations.append("Improve bug retrieval relevance (add more diverse training data)")

        if avg_suggestion < 0.7:
            recommendations.append("Enhance suggestion generation quality (refine fix templates)")

        # Category-specific recommendations
        for category, data in quality.items():
            if data["avg_retrieval"] < 0.6:
                recommendations.append(f"Improve {category} category retrieval performance")

            if data["avg_suggestion"] < 0.6:
                recommendations.append(f"Enhance {category} category suggestion quality")

        if not recommendations:
            recommendations.append("System performance is excellent across all metrics!")

        return recommendations

    def print_final_report(self, results: Dict):
        """Print the comprehensive final evaluation report.

        When no test succeeded, ``performance_metrics`` only holds an
        ``"error"`` entry; in that case the numeric summary and the overall
        rating are skipped instead of failing on the missing keys.
        """
        print("\n" + "=" * 70)
        print("๐ COMPREHENSIVE EVALUATION REPORT")
        print("=" * 70)

        # Performance Summary
        perf = results["performance_metrics"]
        has_metrics = "error" not in perf
        print("\n๐ PERFORMANCE SUMMARY")
        if has_metrics:
            print(f"{'Total Tests:':<25} {perf['total_tests']}")
            print(f"{'Success Rate:':<25} {perf['success_rate']:.1%}")
            print(f"{'Avg Processing Time:':<25} {perf['average_processing_time']:.2f}s")
            print(f"{'Avg Retrieval Score:':<25} {perf['average_retrieval_score']:.3f}/1.0")
            print(f"{'Avg Suggestion Score:':<25} {perf['average_suggestion_score']:.3f}/1.0")
            print(f"{'Avg Bugs Found:':<25} {perf['average_bugs_found']:.1f}")
        else:
            print(f"No metrics available: {perf['error']}")

        # Quality Analysis by Category
        print("\n๐ QUALITY ANALYSIS BY CATEGORY")
        quality = results["quality_analysis"]
        for category, data in quality.items():
            print(f"\n{category}:")
            print(f" Retrieval: {data['avg_retrieval']:.3f} | Suggestions: {data['avg_suggestion']:.3f}")

        # Component Coverage
        print("\n๐ฏ COMPONENT COVERAGE ANALYSIS")
        coverage = results["component_coverage"]
        for category, data in coverage.items():
            coverage_pct = data['coverage_ratio'] * 100
            print(f"{category}: {coverage_pct:.0f}% coverage ({len(data['found'])}/{len(data['expected'])} components)")

        # Recommendations
        print("\n๐ก RECOMMENDATIONS")
        for i, rec in enumerate(results["recommendations"], 1):
            print(f"{i}. {rec}")

        # Overall Rating (needs the averaged scores, so only with metrics)
        if has_metrics:
            overall_score = (perf['average_retrieval_score'] + perf['average_suggestion_score']) / 2
            if overall_score >= 0.8:
                rating = "๐ข EXCELLENT"
            elif overall_score >= 0.7:
                rating = "๐ก GOOD"
            elif overall_score >= 0.6:
                rating = "๐ FAIR"
            else:
                rating = "๐ด NEEDS IMPROVEMENT"

            print(f"\nโญ OVERALL SYSTEM RATING: {rating} ({overall_score:.3f}/1.0)")

        print(f"๐ Evaluation completed in {results['evaluation_time']:.1f} seconds")
        print("=" * 70)

    def save_results(self, results: Dict, filename: str = "evaluation_results.json"):
        """Save evaluation results to a JSON file.

        Array-like and scalar objects exposing ``tolist()`` / ``item()``
        are converted to native Python values first; anything still not
        serializable is written via ``str``. Failures are reported on
        stdout rather than raised (best-effort save).
        """
        def convert_types(obj):
            # Containers first, so nested values are converted recursively.
            if isinstance(obj, dict):
                return {k: convert_types(v) for k, v in obj.items()}
            if isinstance(obj, (list, tuple)):
                return [convert_types(item) for item in obj]
            # tolist() works for both scalars and multi-element arrays,
            # whereas item() raises on arrays with more than one element.
            if hasattr(obj, 'tolist'):
                return obj.tolist()
            if hasattr(obj, 'item'):
                return obj.item()
            return obj

        try:
            serializable_results = convert_types(results)

            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(serializable_results, f, indent=2, default=str)

            print(f"๐ Results saved to {filename}")

        except Exception as e:
            print(f"โ Error saving results: {e}")
|
392 |
+
|
393 |
+
def run_interactive_demo():
    """Run an interactive demonstration of the system.

    Repeatedly prompts for a bug description, runs ``analyze_bug_report``
    on it and prints the (truncated) similar-bugs output followed by the
    evaluation output. The loop ends when the user types ``quit``,
    ``exit`` or ``q``, presses Ctrl-C, or when stdin reaches end-of-file.
    """
    print("๐ฎ Interactive Bug Report Analysis Demo")
    print("Enter bug descriptions to see real-time analysis")
    print("Type 'quit' to exit\n")

    while True:
        try:
            query = input("๐ Describe a bug: ").strip()

            if query.lower() in ['quit', 'exit', 'q']:
                print("๐ Thanks for trying the Bug Report Analysis Agent!")
                break

            if not query:
                continue

            print("\n๐ Analyzing...")
            start_time = time.time()

            similar_bugs_output, relevant_code_output, suggestions, evaluation_output = analyze_bug_report(query)

            processing_time = time.time() - start_time

            print(f"โฑ๏ธ Analysis completed in {processing_time:.2f} seconds\n")
            print("๐ RESULTS:")
            print("-" * 50)
            # Only the first 500 characters of the similar-bugs text are shown.
            print(similar_bugs_output[:500] + "..." if len(similar_bugs_output) > 500 else similar_bugs_output)
            print("\n" + evaluation_output)
            print("\n" + "="*50 + "\n")

        except (KeyboardInterrupt, EOFError):
            # EOFError: stdin was closed or exhausted (e.g. piped input).
            # Retrying input() would fail again immediately, so leave the
            # loop instead of spinning on the generic handler below.
            print("\n๐ Goodbye!")
            break
        except Exception as e:
            # Keep the demo alive when a single analysis fails.
            print(f"โ Error: {e}")
|
429 |
+
|
430 |
+
if __name__ == "__main__":
    # The evaluator is built up front regardless of the selected mode.
    evaluator_instance = SystemEvaluator()

    # "--demo" as the first command-line argument selects the interactive
    # loop; anything else (or no argument) runs the full evaluation suite.
    demo_requested = sys.argv[1:2] == ["--demo"]

    if not demo_requested:
        # Run comprehensive evaluation and persist the results
        results = evaluator_instance.run_comprehensive_evaluation()
        evaluator_instance.save_results(results)

        print("\n๐ฏ To run interactive demo: python evaluate_system.py --demo")
    else:
        run_interactive_demo()
|
requirements.txt
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gradio>=4.0.0
|
2 |
+
pandas>=2.0.0
|
3 |
+
transformers>=4.36.0
|
4 |
+
sentence-transformers>=2.2.2
|
5 |
+
langchain>=0.1.0
|
6 |
+
faiss-cpu>=1.7.4
|
7 |
+
nltk>=3.8.1
|
8 |
+
fuzzywuzzy
|
9 |
+
python-Levenshtein
|
10 |
+
rouge-score
|