Devaharibabu committed
Commit 83d51a6 · verified · 1 Parent(s): 6a11deb

Upload 5 files

Files changed (5):
  1. README.md +14 -0
  2. app.py +679 -0
  3. bug_reports.csv +4 -0
  4. evaluate_system.py +440 -0
  5. requirements.txt +12 -0
README.md ADDED
@@ -0,0 +1,14 @@
---
title: Bug Report Agent
emoji: 👁
colorFrom: pink
colorTo: red
sdk: gradio
sdk_version: 5.32.0
app_file: app.py
pinned: false
license: apache-2.0
short_description: Bug Report Analysis Agent Using RAG
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,679 @@
"""
🐞 Enhanced Bug Report Analysis Agent
=====================================
A comprehensive RAG-based system for analyzing bug reports, finding similar issues,
and suggesting fixes with evaluation metrics for retrieval relevance and usefulness.
"""

import os
import pandas as pd
import numpy as np
import gradio as gr
import sqlite3
import json
import ast
import re
from datetime import datetime, timedelta
from typing import List, Dict, Tuple, Optional
import logging

# Core RAG and ML imports
from sentence_transformers import SentenceTransformer
import faiss
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from fuzzywuzzy import fuzz, process

# LangChain imports
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

# Evaluation metrics
from rouge_score import rouge_scorer
import difflib

# Download required NLTK data
try:
    nltk.download('punkt', quiet=True)
    nltk.download('stopwords', quiet=True)
except Exception:
    pass

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class BugReportRAG:
    """Enhanced RAG system for bug report analysis"""

    def __init__(self):
        self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
        self.tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
        self.bug_index = None
        self.code_index = None
        self.bug_data = None
        self.code_data = None
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=50,
            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
        )

    def load_and_index_data(self):
        """Load and index bug reports and code files"""
        logger.info("Loading and indexing data...")

        # Load bug reports
        self._load_bug_reports()

        # Load and process code files
        self._load_code_files()

        # Create FAISS indices
        self._create_faiss_indices()

        logger.info("Data loading and indexing completed")

    def _load_bug_reports(self):
        """Load and process bug reports from CSV"""
        try:
            df = pd.read_csv("bug_reports.csv")

            # Create comprehensive text representation for each bug
            bug_texts = []
            bug_metadata = []

            for _, row in df.iterrows():
                # Combine relevant fields for better semantic search
                text_parts = [
                    f"Title: {row.get('title', '')}",
                    f"Description: {row.get('description', '')}",
                    f"Component: {row.get('component', '')}",
                    f"Severity: {row.get('severity', '')}",
                    f"Status: {row.get('status', '')}",
                ]

                if pd.notna(row.get('fix_description')):
                    text_parts.append(f"Fix: {row['fix_description']}")

                bug_text = " | ".join(text_parts)
                bug_texts.append(bug_text)

                # Store metadata
                metadata = {
                    'id': row.get('id', ''),
                    'title': row.get('title', ''),
                    'description': row.get('description', ''),
                    'severity': row.get('severity', ''),
                    'status': row.get('status', ''),
                    'component': row.get('component', ''),
                    'fix_description': row.get('fix_description', ''),
                    'related_files': row.get('related_files', ''),
                    'created_date': row.get('created_date', ''),
                    'resolved_date': row.get('resolved_date', ''),
                }
                bug_metadata.append(metadata)

            self.bug_data = {
                'texts': bug_texts,
                'metadata': bug_metadata
            }

        except Exception as e:
            logger.error(f"Error loading bug reports: {e}")
            self.bug_data = {'texts': [], 'metadata': []}

    def _load_code_files(self):
        """Load and process code files"""
        code_texts = []
        code_metadata = []

        for root, dirs, files in os.walk("codebase"):
            for file in files:
                if file.endswith(('.py', '.js', '.html', '.css')):
                    file_path = os.path.join(root, file)
                    try:
                        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                            content = f.read()

                        # Split large files into chunks
                        if len(content) > 1000:
                            chunks = self.text_splitter.split_text(content)
                            for i, chunk in enumerate(chunks):
                                code_texts.append(f"File: {file} | {chunk}")
                                code_metadata.append({
                                    'file_path': file_path,
                                    'file_name': file,
                                    'chunk_index': i,
                                    'total_chunks': len(chunks)
                                })
                        else:
                            code_texts.append(f"File: {file} | {content}")
                            code_metadata.append({
                                'file_path': file_path,
                                'file_name': file,
                                'chunk_index': 0,
                                'total_chunks': 1
                            })

                    except Exception as e:
                        logger.warning(f"Error reading {file_path}: {e}")

        self.code_data = {
            'texts': code_texts,
            'metadata': code_metadata
        }

    def _create_faiss_indices(self):
        """Create FAISS indices for efficient similarity search"""
        # Create bug report index
        if self.bug_data['texts']:
            bug_embeddings = self.embedding_model.encode(self.bug_data['texts'])
            self.bug_index = faiss.IndexFlatIP(bug_embeddings.shape[1])
            # Normalize embeddings for cosine similarity
            faiss.normalize_L2(bug_embeddings)
            self.bug_index.add(bug_embeddings.astype('float32'))

        # Create code index
        if self.code_data['texts']:
            code_embeddings = self.embedding_model.encode(self.code_data['texts'])
            self.code_index = faiss.IndexFlatIP(code_embeddings.shape[1])
            faiss.normalize_L2(code_embeddings)
            self.code_index.add(code_embeddings.astype('float32'))

    def search_similar_bugs(self, query: str, k: int = 5) -> List[Dict]:
        """Search for similar bug reports"""
        if not self.bug_index or not self.bug_data['texts']:
            return []

        # Encode query
        query_embedding = self.embedding_model.encode([query])
        faiss.normalize_L2(query_embedding)

        # Search
        scores, indices = self.bug_index.search(query_embedding.astype('float32'), k)

        results = []
        for score, idx in zip(scores[0], indices[0]):
            if idx < len(self.bug_data['metadata']):
                result = self.bug_data['metadata'][idx].copy()
                result['similarity_score'] = float(score)
                results.append(result)

        return results

    def search_relevant_code(self, query: str, k: int = 5) -> List[Dict]:
        """Search for relevant code sections"""
        if not self.code_index or not self.code_data['texts']:
            return []

        # Encode query
        query_embedding = self.embedding_model.encode([query])
        faiss.normalize_L2(query_embedding)

        # Search
        scores, indices = self.code_index.search(query_embedding.astype('float32'), k)

        results = []
        for score, idx in zip(scores[0], indices[0]):
            if idx < len(self.code_data['metadata']):
                result = self.code_data['metadata'][idx].copy()
                result['similarity_score'] = float(score)
                result['code_text'] = self.code_data['texts'][idx]
                results.append(result)

        return results

class BugAnalysisEvaluator:
    """Evaluate the quality and relevance of bug analysis results"""

    def __init__(self):
        self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    def evaluate_retrieval_relevance(self, query: str, results: List[Dict]) -> Dict:
        """Evaluate how relevant retrieved results are to the query"""
        if not results:
            return {
                'average_similarity': 0.0,
                'relevance_score': 0.0,
                'result_count': 0
            }

        # Calculate average similarity score
        similarity_scores = [r.get('similarity_score', 0.0) for r in results]
        average_similarity = np.mean(similarity_scores) if similarity_scores else 0.0

        # Calculate semantic relevance using text similarity
        query_lower = query.lower()
        relevance_scores = []

        for result in results:
            # Combine title and description for relevance calculation
            result_text = f"{result.get('title', '')} {result.get('description', '')}"
            relevance_score = fuzz.partial_ratio(query_lower, result_text.lower()) / 100.0
            relevance_scores.append(relevance_score)

        relevance_score = np.mean(relevance_scores) if relevance_scores else 0.0

        return {
            'average_similarity': float(average_similarity),
            'relevance_score': float(relevance_score),
            'result_count': len(results),
            'individual_scores': similarity_scores
        }

    def evaluate_suggestion_usefulness(self, query: str, suggestions: str) -> Dict:
        """Evaluate the usefulness of generated suggestions"""
        if not suggestions or not query:
            return {
                'completeness_score': 0.0,
                'specificity_score': 0.0,
                'actionability_score': 0.0,
                'overall_usefulness': 0.0
            }

        # Completeness: How well suggestions address the query
        rouge_scores = self.rouge_scorer.score(query.lower(), suggestions.lower())
        completeness_score = rouge_scores['rougeL'].fmeasure

        # Specificity: Presence of specific technical terms, file names, functions
        specificity_indicators = [
            r'\b\w+\.py\b',  # Python files
            r'\bdef \w+\b',  # Function definitions
            r'\bclass \w+\b',  # Class definitions
            r'\b\w+\(\)',  # Function calls
            r'\bfix\b|\bupdate\b|\bchange\b|\bmodify\b',  # Action words
        ]

        specificity_count = sum(len(re.findall(pattern, suggestions.lower()))
                                for pattern in specificity_indicators)
        specificity_score = min(specificity_count / 5.0, 1.0)  # Normalize to 0-1

        # Actionability: Presence of actionable steps
        actionable_phrases = [
            'check', 'verify', 'update', 'modify', 'fix', 'add', 'remove',
            'ensure', 'validate', 'test', 'debug', 'implement', 'configure'
        ]

        actionability_count = sum(1 for phrase in actionable_phrases
                                  if phrase in suggestions.lower())
        actionability_score = min(actionability_count / 5.0, 1.0)

        # Overall usefulness (weighted average)
        overall_usefulness = (
            0.3 * completeness_score +
            0.4 * specificity_score +
            0.3 * actionability_score
        )

        return {
            'completeness_score': float(completeness_score),
            'specificity_score': float(specificity_score),
            'actionability_score': float(actionability_score),
            'overall_usefulness': float(overall_usefulness)
        }

class FixSuggestionEngine:
    """Generate intelligent fix suggestions based on analysis"""

    def __init__(self):
        self.common_fixes = {
            'authentication': [
                "Check password validation regex patterns",
                "Verify session management configuration",
                "Ensure proper error handling in login flow",
                "Review authentication middleware setup"
            ],
            'database': [
                "Check database connection pooling settings",
                "Review query optimization and indexing",
                "Verify transaction handling and rollbacks",
                "Check for connection timeout configurations"
            ],
            'email': [
                "Verify SMTP server configuration",
                "Check email template rendering",
                "Ensure email credentials are properly set",
                "Review email queue processing"
            ],
            'ui': [
                "Check JavaScript event listeners",
                "Verify CSS styling and responsive design",
                "Review form validation logic",
                "Ensure proper DOM element targeting"
            ]
        }

    def generate_suggestions(self, query: str, similar_bugs: List[Dict],
                             relevant_code: List[Dict]) -> str:
        """Generate fix suggestions based on analysis"""
        suggestions = []

        # Add context-based suggestions
        suggestions.append("## 🔍 Analysis Summary")
        suggestions.append(f"Based on the query: '{query}'")
        suggestions.append("")

        # Add similar bug insights
        if similar_bugs:
            suggestions.append("## 🪲 Similar Issues Found")
            for i, bug in enumerate(similar_bugs[:3], 1):
                status = bug.get('status', 'Unknown')
                severity = bug.get('severity', 'Unknown')
                suggestions.append(f"{i}. **{bug.get('title', 'Untitled')}** (Status: {status}, Severity: {severity})")

                if bug.get('fix_description'):
                    suggestions.append(f"   - Previous fix: {bug['fix_description']}")
            suggestions.append("")

        # Add code analysis
        if relevant_code:
            suggestions.append("## 💻 Relevant Code Sections")
            for i, code in enumerate(relevant_code[:3], 1):
                file_name = code.get('file_name', 'Unknown file')
                suggestions.append(f"{i}. **{file_name}** (Similarity: {code.get('similarity_score', 0):.2f})")
            suggestions.append("")

        # Add specific fix suggestions based on component analysis
        component_suggestions = self._get_component_suggestions(query, similar_bugs)
        if component_suggestions:
            suggestions.append("## 🛠️ Suggested Actions")
            for suggestion in component_suggestions:
                suggestions.append(f"- {suggestion}")
            suggestions.append("")

        # Add general debugging steps
        suggestions.append("## 🔧 General Debugging Steps")
        suggestions.extend([
            "- Review error logs and stack traces",
            "- Test in different environments (dev/staging/prod)",
            "- Check recent code changes in related files",
            "- Verify configuration settings",
            "- Run relevant test suites",
            "- Consider rollback if issue is critical"
        ])

        return "\n".join(suggestions)

    def _get_component_suggestions(self, query: str, similar_bugs: List[Dict]) -> List[str]:
        """Get component-specific suggestions"""
        suggestions = []
        query_lower = query.lower()

        # Identify likely component based on keywords and similar bugs
        components = [bug.get('component', '').lower() for bug in similar_bugs]

        # Keyword-based component detection
        if any(keyword in query_lower for keyword in ['login', 'auth', 'password', 'session']):
            suggestions.extend(self.common_fixes.get('authentication', []))

        if any(keyword in query_lower for keyword in ['database', 'db', 'query', 'connection']):
            suggestions.extend(self.common_fixes.get('database', []))

        if any(keyword in query_lower for keyword in ['email', 'smtp', 'mail', 'notification']):
            suggestions.extend(self.common_fixes.get('email', []))

        if any(keyword in query_lower for keyword in ['button', 'form', 'ui', 'interface', 'display']):
            suggestions.extend(self.common_fixes.get('ui', []))

        # Component-based suggestions from similar bugs
        for component in components:
            if component and component in self.common_fixes:
                suggestions.extend(self.common_fixes[component])

        return list(set(suggestions))  # Remove duplicates

# Initialize the RAG system and other components
rag_system = BugReportRAG()
evaluator = BugAnalysisEvaluator()
suggestion_engine = FixSuggestionEngine()

# Load and index data on startup
rag_system.load_and_index_data()

def analyze_bug_report(query: str) -> Tuple[str, str, str, str]:
    """Main function to analyze bug reports"""
    try:
        if not query.strip():
            return "Please enter a bug description", "", "", ""

        logger.info(f"Analyzing query: {query}")

        # Search for similar bugs and relevant code
        similar_bugs = rag_system.search_similar_bugs(query, k=5)
        relevant_code = rag_system.search_relevant_code(query, k=5)

        # Generate suggestions
        suggestions = suggestion_engine.generate_suggestions(query, similar_bugs, relevant_code)

        # Evaluate results
        bug_evaluation = evaluator.evaluate_retrieval_relevance(query, similar_bugs)
        suggestion_evaluation = evaluator.evaluate_suggestion_usefulness(query, suggestions)

        # Format similar bugs output
        similar_bugs_output = format_similar_bugs(similar_bugs, bug_evaluation)

        # Format relevant code output
        relevant_code_output = format_relevant_code(relevant_code)

        # Format evaluation metrics
        evaluation_output = format_evaluation_metrics(bug_evaluation, suggestion_evaluation)

        return similar_bugs_output, relevant_code_output, suggestions, evaluation_output

    except Exception as e:
        logger.error(f"Error analyzing bug report: {e}")
        return f"Error: {str(e)}", "", "", ""

def format_similar_bugs(bugs: List[Dict], evaluation: Dict) -> str:
    """Format similar bugs for display"""
    if not bugs:
        return "No similar bugs found in the database."

    output = [f"## 🔍 Found {len(bugs)} Similar Bug Reports"]
    output.append(f"**Relevance Score: {evaluation['relevance_score']:.2f}/1.0**")
    output.append(f"**Average Similarity: {evaluation['average_similarity']:.2f}/1.0**")
    output.append("")

    for i, bug in enumerate(bugs, 1):
        output.append(f"### {i}. {bug.get('title', 'Untitled Bug')}")
        output.append(f"**ID:** {bug.get('id', 'N/A')} | **Severity:** {bug.get('severity', 'N/A')} | **Status:** {bug.get('status', 'N/A')}")
        output.append(f"**Similarity:** {bug.get('similarity_score', 0):.3f}")
        output.append(f"**Component:** {bug.get('component', 'N/A')}")
        output.append("")
        output.append(f"**Description:** {bug.get('description', 'No description available')}")

        if bug.get('fix_description'):
            output.append(f"**Previous Fix:** {bug['fix_description']}")

        if bug.get('related_files'):
            output.append(f"**Related Files:** {bug['related_files']}")

        output.append("---")

    return "\n".join(output)

def format_relevant_code(code_results: List[Dict]) -> str:
    """Format relevant code sections for display"""
    if not code_results:
        return "No relevant code sections found."

    output = [f"## 💻 Found {len(code_results)} Relevant Code Sections"]
    output.append("")

    for i, code in enumerate(code_results, 1):
        file_name = code.get('file_name', 'Unknown file')
        similarity = code.get('similarity_score', 0)

        output.append(f"### {i}. {file_name}")
        output.append(f"**Similarity:** {similarity:.3f} | **Path:** {code.get('file_path', 'N/A')}")

        if code.get('chunk_index', 0) > 0:
            total_chunks = code.get('total_chunks', 1)
            output.append(f"**Chunk:** {code['chunk_index'] + 1}/{total_chunks}")

        output.append("")

        # Extract and display code snippet
        code_text = code.get('code_text', '')
        if 'File:' in code_text:
            _, code_content = code_text.split('|', 1)
            code_content = code_content.strip()
        else:
            code_content = code_text

        # Limit code display length
        if len(code_content) > 500:
            code_content = code_content[:500] + "\n... (truncated)"

        output.append("```python")
        output.append(code_content)
        output.append("```")
        output.append("---")

    return "\n".join(output)

def format_evaluation_metrics(bug_eval: Dict, suggestion_eval: Dict) -> str:
    """Format evaluation metrics for display"""
    output = ["## 📊 Analysis Quality Metrics"]
    output.append("")

    # Bug retrieval metrics
    output.append("### 🔍 Retrieval Relevance")
    output.append(f"- **Average Similarity Score:** {bug_eval['average_similarity']:.3f}/1.0")
    output.append(f"- **Semantic Relevance:** {bug_eval['relevance_score']:.3f}/1.0")
    output.append(f"- **Results Retrieved:** {bug_eval['result_count']}")

    # Suggestion quality metrics
    output.append("")
    output.append("### 🛠️ Suggestion Quality")
    output.append(f"- **Completeness:** {suggestion_eval['completeness_score']:.3f}/1.0")
    output.append(f"- **Specificity:** {suggestion_eval['specificity_score']:.3f}/1.0")
    output.append(f"- **Actionability:** {suggestion_eval['actionability_score']:.3f}/1.0")
    output.append(f"- **Overall Usefulness:** {suggestion_eval['overall_usefulness']:.3f}/1.0")

    # Quality assessment
    overall_quality = (bug_eval['relevance_score'] + suggestion_eval['overall_usefulness']) / 2
    output.append("")
    output.append("### ⭐ Overall Analysis Quality")

    if overall_quality >= 0.8:
        quality_label = "🟢 Excellent"
    elif overall_quality >= 0.6:
        quality_label = "🟡 Good"
    elif overall_quality >= 0.4:
        quality_label = "🟠 Fair"
    else:
        quality_label = "🔴 Poor"

    output.append(f"**Quality Rating:** {quality_label} ({overall_quality:.3f}/1.0)")

    return "\n".join(output)

# Create Gradio interface
def create_interface():
    """Create the Gradio interface for the Bug Report Analysis Agent"""

    with gr.Blocks(
        title="🐞 Bug Report Analysis Agent",
        theme=gr.themes.Soft(),
        css="""
        .gradio-container {
            max-width: 1200px !important;
        }
        .tab-nav {
            font-weight: bold;
        }
        """
    ) as demo:

        gr.Markdown("""
        # 🐞 Bug Report Analysis Agent

        **Advanced RAG-powered system for intelligent bug analysis**

        This system analyzes bug reports using Retrieval-Augmented Generation (RAG) to:
        - 🔍 Find similar past issues in the bug database
        - 💻 Identify relevant code sections that might be related
        - 🛠️ Suggest potential causes and fixes
        - 📊 Evaluate retrieval relevance and suggestion usefulness

        ---
        """)

        with gr.Row():
            with gr.Column(scale=1):
                input_box = gr.Textbox(
                    lines=6,
                    label="🔍 Bug Description",
                    placeholder="Describe the bug you're experiencing...\n\nExample: 'Login form redirects back to login page after entering correct credentials'",
                    info="Provide as much detail as possible for better analysis"
                )

                with gr.Row():
                    analyze_btn = gr.Button("🔍 Analyze Bug", variant="primary", size="lg")
                    clear_btn = gr.Button("🗑️ Clear", variant="secondary")

        with gr.Row():
            with gr.Column(scale=1):
                similar_bugs_output = gr.Markdown(
                    label="🪲 Similar Bug Reports",
                    value="Enter a bug description and click 'Analyze Bug' to see similar issues..."
                )

            with gr.Column(scale=1):
                relevant_code_output = gr.Markdown(
                    label="💻 Relevant Code Sections",
                    value="Code analysis will appear here..."
                )

        with gr.Row():
            with gr.Column(scale=1):
                suggestions_output = gr.Markdown(
                    label="🛠️ Fix Suggestions",
                    value="Intelligent fix suggestions will be generated here..."
                )

            with gr.Column(scale=1):
                evaluation_output = gr.Markdown(
                    label="📊 Quality Metrics",
                    value="Analysis quality metrics will be shown here..."
                )

        # Event handlers
        analyze_btn.click(
            fn=analyze_bug_report,
            inputs=[input_box],
            outputs=[similar_bugs_output, relevant_code_output, suggestions_output, evaluation_output],
            api_name="analyze_bug"
        )

        clear_btn.click(
            fn=lambda: ("", "Enter a bug description and click 'Analyze Bug' to see similar issues...",
                        "Code analysis will appear here...",
                        "Intelligent fix suggestions will be generated here...",
                        "Analysis quality metrics will be shown here..."),
            inputs=[],
            outputs=[input_box, similar_bugs_output, relevant_code_output, suggestions_output, evaluation_output]
        )

        # Footer
        gr.Markdown("""
        ---
        **🚀 Built with:** LangChain • Sentence Transformers • FAISS • Gradio

        **📈 Features:** Semantic Search • Similarity Scoring • Code Analysis • Fix Suggestions • Quality Evaluation
        """)

    return demo

if __name__ == "__main__":
    # Create and launch the interface
    demo = create_interface()
    demo.launch(
        share=True,
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True
    )
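For a quick smoke test outside the UI, the pipeline in app.py can also be driven directly. A minimal sketch (the query string is illustrative; note that importing app triggers index building as a side effect):

```python
# Minimal programmatic use of the pipeline defined in app.py.
# Importing app runs rag_system.load_and_index_data() at import time,
# so bug_reports.csv (and optionally a codebase/ folder) must be present.
from app import analyze_bug_report

bugs_md, code_md, suggestions_md, metrics_md = analyze_bug_report(
    "Login form redirects back to login page after entering correct credentials"
)
print(metrics_md)  # the Markdown quality-metrics panel shown in the UI
```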
bug_reports.csv ADDED
@@ -0,0 +1,4 @@
description
Login fails with correct credentials
Submit button does not respond
Error message is not displayed when login fails
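Note that app.py's `_load_bug_reports()` also looks up `id`, `title`, `severity`, `status`, `component`, `fix_description`, `related_files`, `created_date`, and `resolved_date`, which this CSV does not provide (the `row.get(...)` calls then fall back to empty strings). A sketch of a row with the fuller schema, with purely illustrative values:

```python
# Hypothetical richer bug_reports.csv covering every column app.py reads;
# all field values below are made up for illustration.
import pandas as pd

pd.DataFrame([{
    "id": "BUG-001",
    "title": "Login fails with correct credentials",
    "description": "Login form redirects back to the login page after valid credentials.",
    "severity": "High",
    "status": "Resolved",
    "component": "authentication",
    "fix_description": "Set the session cookie on successful login.",
    "related_files": "auth/login.py",
    "created_date": "2024-01-10",
    "resolved_date": "2024-01-12",
}]).to_csv("bug_reports.csv", index=False)
```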
evaluate_system.py ADDED
@@ -0,0 +1,440 @@
#!/usr/bin/env python3
"""
Bug Report Analysis Agent - Comprehensive Evaluation Script
============================================================
This script demonstrates and evaluates the RAG system's performance
on various types of bug reports and provides detailed analysis.
"""

import sys
import time
import json
from typing import Dict, List, Tuple
import pandas as pd

# Import the main system components
from app import (
    rag_system, evaluator, suggestion_engine,
    analyze_bug_report, format_similar_bugs,
    format_relevant_code, format_evaluation_metrics
)

class SystemEvaluator:
    """Comprehensive evaluation of the Bug Report Analysis system"""

    def __init__(self):
        self.test_queries = [
            {
                "query": "Login form redirects back to login page after entering correct credentials",
                "category": "Authentication",
                "expected_components": ["login", "auth", "session"],
                "description": "Classic authentication redirect issue"
            },
            {
                "query": "Database connection times out during high traffic periods",
                "category": "Database",
                "expected_components": ["database", "connection", "timeout"],
                "description": "Performance issue under load"
            },
            {
                "query": "Email notifications for password reset are not being sent to users",
                "category": "Email",
                "expected_components": ["email", "smtp", "password"],
                "description": "Email service functionality problem"
            },
            {
                "query": "Submit button on contact form doesn't respond when clicked",
                "category": "UI/Frontend",
                "expected_components": ["button", "form", "javascript"],
                "description": "Frontend interaction issue"
            },
            {
                "query": "API returns 500 internal server error for user profile updates",
                "category": "API",
                "expected_components": ["api", "profile", "server"],
                "description": "Backend API error"
            },
            {
                "query": "Memory usage increases continuously when uploading large files",
                "category": "Performance",
                "expected_components": ["memory", "upload", "file"],
                "description": "Memory leak in file handling"
            },
            {
                "query": "Dashboard charts show incorrect data for monthly revenue reports",
                "category": "Data/Analytics",
                "expected_components": ["dashboard", "chart", "data"],
                "description": "Data visualization accuracy issue"
            },
            {
                "query": "User session expires too quickly causing frequent re-authentication",
                "category": "Session Management",
                "expected_components": ["session", "timeout", "authentication"],
                "description": "Session timeout configuration issue"
            }
        ]

    def run_comprehensive_evaluation(self) -> Dict:
        """Run comprehensive evaluation of the system"""
        print("🚀 Starting Comprehensive Bug Report Analysis Evaluation")
        print("=" * 70)

        start_time = time.time()
        results = {
            "test_results": [],
            "performance_metrics": {},
            "quality_analysis": {},
            "component_coverage": {},
            "recommendations": []
        }

        # Test each query
        for i, test_case in enumerate(self.test_queries, 1):
            print(f"\n📋 Test Case {i}/{len(self.test_queries)}: {test_case['category']}")
            print(f"Query: {test_case['query']}")
            print("-" * 50)

            # Run analysis
            test_result = self.evaluate_single_query(test_case)
            results["test_results"].append(test_result)

            # Print summary
            self.print_test_summary(test_result)

            time.sleep(0.5)  # Brief pause between tests

        # Calculate overall metrics
        results["performance_metrics"] = self.calculate_performance_metrics(results["test_results"])
        results["quality_analysis"] = self.analyze_quality_patterns(results["test_results"])
        results["component_coverage"] = self.analyze_component_coverage(results["test_results"])
        results["recommendations"] = self.generate_recommendations(results)

        total_time = time.time() - start_time
        results["evaluation_time"] = total_time

        # Print final report
        self.print_final_report(results)

        return results

    def evaluate_single_query(self, test_case: Dict) -> Dict:
        """Evaluate a single test query"""
        query = test_case["query"]
        start_time = time.time()

        # Run the analysis
        try:
            similar_bugs_output, relevant_code_output, suggestions, evaluation_output = analyze_bug_report(query)

            # Get raw data for analysis
            similar_bugs = rag_system.search_similar_bugs(query, k=5)
            relevant_code = rag_system.search_relevant_code(query, k=5)

            # Evaluate results
            bug_evaluation = evaluator.evaluate_retrieval_relevance(query, similar_bugs)
            suggestion_evaluation = evaluator.evaluate_suggestion_usefulness(query, suggestions)

            processing_time = time.time() - start_time

            return {
                "test_case": test_case,
                "processing_time": processing_time,
                "similar_bugs": similar_bugs,
                "relevant_code": relevant_code,
                "suggestions": suggestions,
                "bug_evaluation": bug_evaluation,
                "suggestion_evaluation": suggestion_evaluation,
                "outputs": {
                    "similar_bugs_output": similar_bugs_output,
                    "relevant_code_output": relevant_code_output,
                    "evaluation_output": evaluation_output
                },
                "success": True
            }

        except Exception as e:
            return {
                "test_case": test_case,
                "processing_time": time.time() - start_time,
                "error": str(e),
                "success": False
            }

    def print_test_summary(self, result: Dict):
        """Print summary for a single test"""
        if not result["success"]:
            print(f"❌ Error: {result['error']}")
            return

        bug_eval = result["bug_evaluation"]
        suggestion_eval = result["suggestion_evaluation"]

        print(f"⏱️ Processing Time: {result['processing_time']:.2f}s")
        print(f"🔍 Similar Bugs Found: {bug_eval['result_count']}")
        print(f"📊 Retrieval Relevance: {bug_eval['relevance_score']:.3f}/1.0")
        print(f"🛠️ Suggestion Quality: {suggestion_eval['overall_usefulness']:.3f}/1.0")

        # Quality indicator
        overall_quality = (bug_eval['relevance_score'] + suggestion_eval['overall_usefulness']) / 2
        if overall_quality >= 0.8:
            quality_icon = "🟢"
        elif overall_quality >= 0.6:
            quality_icon = "🟡"
        elif overall_quality >= 0.4:
            quality_icon = "🟠"
        else:
            quality_icon = "🔴"

        print(f"{quality_icon} Overall Quality: {overall_quality:.3f}/1.0")

    def calculate_performance_metrics(self, test_results: List[Dict]) -> Dict:
        """Calculate overall performance metrics"""
        successful_tests = [r for r in test_results if r["success"]]

        if not successful_tests:
            return {"error": "No successful tests to analyze"}

        processing_times = [r["processing_time"] for r in successful_tests]
        retrieval_scores = [r["bug_evaluation"]["relevance_score"] for r in successful_tests]
        suggestion_scores = [r["suggestion_evaluation"]["overall_usefulness"] for r in successful_tests]
        bug_counts = [r["bug_evaluation"]["result_count"] for r in successful_tests]

        return {
            "total_tests": len(test_results),
            "successful_tests": len(successful_tests),
            "success_rate": len(successful_tests) / len(test_results),
            "average_processing_time": sum(processing_times) / len(processing_times),
            "min_processing_time": min(processing_times),
            "max_processing_time": max(processing_times),
            "average_retrieval_score": sum(retrieval_scores) / len(retrieval_scores),
            "average_suggestion_score": sum(suggestion_scores) / len(suggestion_scores),
            "average_bugs_found": sum(bug_counts) / len(bug_counts),
            "retrieval_score_std": pd.Series(retrieval_scores).std(),
            "suggestion_score_std": pd.Series(suggestion_scores).std()
        }

    def analyze_quality_patterns(self, test_results: List[Dict]) -> Dict:
        """Analyze quality patterns across different categories"""
        successful_tests = [r for r in test_results if r["success"]]

        category_analysis = {}
        for result in successful_tests:
            category = result["test_case"]["category"]

            if category not in category_analysis:
                category_analysis[category] = {
                    "count": 0,
                    "retrieval_scores": [],
                    "suggestion_scores": [],
                    "processing_times": []
                }

            category_analysis[category]["count"] += 1
            category_analysis[category]["retrieval_scores"].append(
                result["bug_evaluation"]["relevance_score"]
            )
            category_analysis[category]["suggestion_scores"].append(
                result["suggestion_evaluation"]["overall_usefulness"]
            )
            category_analysis[category]["processing_times"].append(
                result["processing_time"]
            )

        # Calculate averages for each category
        for category, data in category_analysis.items():
            data["avg_retrieval"] = sum(data["retrieval_scores"]) / len(data["retrieval_scores"])
            data["avg_suggestion"] = sum(data["suggestion_scores"]) / len(data["suggestion_scores"])
            data["avg_processing_time"] = sum(data["processing_times"]) / len(data["processing_times"])

        return category_analysis

    def analyze_component_coverage(self, test_results: List[Dict]) -> Dict:
        """Analyze how well the system covers different components"""
        component_coverage = {}

        for result in test_results:
            if not result["success"]:
                continue

            test_case = result["test_case"]
            expected_components = test_case.get("expected_components", [])

            # Check if similar bugs contain expected components
            similar_bugs = result["similar_bugs"]
            found_components = set()

            for bug in similar_bugs:
                component = bug.get("component", "").lower()
                description = bug.get("description", "").lower()
                title = bug.get("title", "").lower()

                for expected in expected_components:
                    if expected.lower() in f"{component} {description} {title}":
                        found_components.add(expected)

            component_coverage[test_case["category"]] = {
                "expected": expected_components,
                "found": list(found_components),
                "coverage_ratio": len(found_components) / len(expected_components) if expected_components else 0
            }

        return component_coverage

    def generate_recommendations(self, results: Dict) -> List[str]:
        """Generate recommendations based on evaluation results"""
        recommendations = []
        performance = results["performance_metrics"]
        quality = results["quality_analysis"]

        # Performance recommendations
        if performance.get("average_processing_time", 0) > 3.0:
            recommendations.append("Consider optimizing query processing time (currently > 3s average)")

        if performance.get("success_rate", 1.0) < 0.95:
            recommendations.append("Improve error handling and system reliability")

        # Quality recommendations
        avg_retrieval = performance.get("average_retrieval_score", 0)
        avg_suggestion = performance.get("average_suggestion_score", 0)

        if avg_retrieval < 0.7:
            recommendations.append("Improve bug retrieval relevance (add more diverse training data)")

        if avg_suggestion < 0.7:
            recommendations.append("Enhance suggestion generation quality (refine fix templates)")

        # Category-specific recommendations
        for category, data in quality.items():
            if data["avg_retrieval"] < 0.6:
                recommendations.append(f"Improve {category} category retrieval performance")

            if data["avg_suggestion"] < 0.6:
                recommendations.append(f"Enhance {category} category suggestion quality")

        if not recommendations:
            recommendations.append("System performance is excellent across all metrics!")

        return recommendations

    def print_final_report(self, results: Dict):
        """Print comprehensive final evaluation report"""
        print("\n" + "=" * 70)
        print("📊 COMPREHENSIVE EVALUATION REPORT")
        print("=" * 70)

        # Performance Summary
        perf = results["performance_metrics"]
        print(f"\n🚀 PERFORMANCE SUMMARY")
        print(f"{'Total Tests:':<25} {perf['total_tests']}")
        print(f"{'Success Rate:':<25} {perf['success_rate']:.1%}")
        print(f"{'Avg Processing Time:':<25} {perf['average_processing_time']:.2f}s")
        print(f"{'Avg Retrieval Score:':<25} {perf['average_retrieval_score']:.3f}/1.0")
        print(f"{'Avg Suggestion Score:':<25} {perf['average_suggestion_score']:.3f}/1.0")
        print(f"{'Avg Bugs Found:':<25} {perf['average_bugs_found']:.1f}")

        # Quality Analysis by Category
        print(f"\n📈 QUALITY ANALYSIS BY CATEGORY")
        quality = results["quality_analysis"]
        for category, data in quality.items():
            print(f"\n{category}:")
            print(f"  Retrieval: {data['avg_retrieval']:.3f} | Suggestions: {data['avg_suggestion']:.3f}")

        # Component Coverage
        print(f"\n🎯 COMPONENT COVERAGE ANALYSIS")
        coverage = results["component_coverage"]
        for category, data in coverage.items():
            coverage_pct = data['coverage_ratio'] * 100
            print(f"{category}: {coverage_pct:.0f}% coverage ({len(data['found'])}/{len(data['expected'])} components)")

        # Recommendations
        print(f"\n💡 RECOMMENDATIONS")
        for i, rec in enumerate(results["recommendations"], 1):
            print(f"{i}. {rec}")

        # Overall Rating
        overall_score = (perf['average_retrieval_score'] + perf['average_suggestion_score']) / 2
        if overall_score >= 0.8:
            rating = "🟢 EXCELLENT"
        elif overall_score >= 0.7:
            rating = "🟡 GOOD"
        elif overall_score >= 0.6:
            rating = "🟠 FAIR"
        else:
            rating = "🔴 NEEDS IMPROVEMENT"

        print(f"\n⭐ OVERALL SYSTEM RATING: {rating} ({overall_score:.3f}/1.0)")
        print(f"📅 Evaluation completed in {results['evaluation_time']:.1f} seconds")
        print("=" * 70)

    def save_results(self, results: Dict, filename: str = "evaluation_results.json"):
        """Save evaluation results to file"""
        try:
            # Convert numpy types to native Python types for JSON serialization
            def convert_types(obj):
                if hasattr(obj, 'item'):  # numpy scalar
                    return obj.item()
                elif isinstance(obj, dict):
                    return {k: convert_types(v) for k, v in obj.items()}
                elif isinstance(obj, list):
                    return [convert_types(item) for item in obj]
                else:
                    return obj

            serializable_results = convert_types(results)

            with open(filename, 'w') as f:
                json.dump(serializable_results, f, indent=2, default=str)

            print(f"📁 Results saved to {filename}")

        except Exception as e:
            print(f"❌ Error saving results: {e}")

def run_interactive_demo():
    """Run an interactive demonstration of the system"""
    print("🎮 Interactive Bug Report Analysis Demo")
    print("Enter bug descriptions to see real-time analysis")
    print("Type 'quit' to exit\n")

    while True:
        try:
            query = input("🐞 Describe a bug: ").strip()

            if query.lower() in ['quit', 'exit', 'q']:
                print("👋 Thanks for trying the Bug Report Analysis Agent!")
                break

            if not query:
                continue

            print("\n🔍 Analyzing...")
            start_time = time.time()

            similar_bugs_output, relevant_code_output, suggestions, evaluation_output = analyze_bug_report(query)

            processing_time = time.time() - start_time

            print(f"⏱️ Analysis completed in {processing_time:.2f} seconds\n")
            print("📋 RESULTS:")
            print("-" * 50)
            print(similar_bugs_output[:500] + "..." if len(similar_bugs_output) > 500 else similar_bugs_output)
            print("\n" + evaluation_output)
            print("\n" + "=" * 50 + "\n")

        except KeyboardInterrupt:
            print("\n👋 Goodbye!")
            break
        except Exception as e:
            print(f"❌ Error: {e}")

if __name__ == "__main__":
    evaluator_instance = SystemEvaluator()

    if len(sys.argv) > 1 and sys.argv[1] == "--demo":
        run_interactive_demo()
    else:
        # Run comprehensive evaluation
        results = evaluator_instance.run_comprehensive_evaluation()
        evaluator_instance.save_results(results)

        print("\n🎯 To run interactive demo: python evaluate_system.py --demo")
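The suite can also be extended and run programmatically; a minimal sketch (the appended test case is hypothetical, and importing evaluate_system imports app, which builds the indices first):

```python
# Extending SystemEvaluator's built-in suite with a custom case.
from evaluate_system import SystemEvaluator

ev = SystemEvaluator()
ev.test_queries.append({
    "query": "CSV export downloads an empty file",   # hypothetical case
    "category": "Export",
    "expected_components": ["export", "csv", "file"],
    "description": "Illustrative extra test case",
})
results = ev.run_comprehensive_evaluation()
ev.save_results(results, "evaluation_results.json")
```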
requirements.txt ADDED
@@ -0,0 +1,12 @@
gradio>=4.0.0
pandas>=2.0.0
transformers>=4.36.0
sentence-transformers>=2.2.2
langchain>=0.1.0
faiss-cpu>=1.7.4
nltk>=3.8.1
fuzzywuzzy
python-Levenshtein
rouge-score
numpy  # imported directly by app.py
scikit-learn  # app.py uses sklearn.metrics.pairwise and sklearn.feature_extraction.text