Phoenix21 commited on
Commit
a1cb144
·
verified ·
1 Parent(s): 5980642

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +345 -0
app.py ADDED
@@ -0,0 +1,345 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import time
3
+ import os
4
+ from urllib.parse import urlparse
5
+ from treelib import Tree
6
+ from typing import Dict, List, Optional, Tuple
7
+ import logging
8
+ from dataclasses import dataclass
9
+ from concurrent.futures import ThreadPoolExecutor, as_completed
10
+ from groq import Groq, GroqError
11
+ import gradio as gr
12
+ from tqdm.auto import tqdm
13
+
14
# --- Basic Configuration ---
# Module-wide logging: timestamped INFO-level messages for the whole app.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
17
+
18
# --- Data Structures ---
@dataclass
class FileInfo:
    """Data class to store file information"""
    path: str         # repository-relative path, e.g. "src/app.py"
    name: str         # base filename
    content: str      # raw file text fetched from raw.githubusercontent.com
    explanation: str  # LLM-generated analysis, or a skip/error message
    size: int         # file size in bytes as reported by the GitHub API
    file_type: str    # extension including the dot, e.g. ".py" (may be "")
28
+
29
# --- Core Application Logic ---
class GitHubRepositoryAnalyzer:
    """
    Analyzes a GitHub repository: fetches the file structure via the GitHub
    API, downloads raw file contents, and uses a Groq-hosted LLM to explain
    each file.

    Results accumulate in ``self.file_contents`` (path -> FileInfo) and are
    rendered as a directory tree, per-file explanations, and a summary.
    """

    def __init__(self, github_token: Optional[str] = None, groq_api_key: Optional[str] = None):
        """
        Args:
            github_token: optional GitHub personal access token; raises the
                API quota from 60 to 5000 requests/hour.
            groq_api_key: optional Groq API key; without it, per-file LLM
                analysis is skipped (files are still listed).
        """
        self.github_token = github_token
        self.session = requests.Session()
        self.file_contents: Dict[str, FileInfo] = {}

        # Configure GitHub API access. The limiter budgets slightly below
        # each documented cap to leave headroom for retries.
        if self.github_token:
            logger.info("Using provided GitHub token for higher rate limits.")
            self.session.headers.update({'Authorization': f'token {self.github_token}'})
            # Authenticated GitHub API: 5000 requests/hour
            self.rate_limiter = RateLimiter(max_calls=4500, time_window=3600)
        else:
            logger.warning("No GitHub token. Using unauthenticated access with lower rate limits.")
            # Unauthenticated: 60 requests/hour
            self.rate_limiter = RateLimiter(max_calls=50, time_window=3600)

        # Configure Groq client; analysis degrades gracefully without a key.
        if groq_api_key:
            self.groq_client = Groq(api_key=groq_api_key)
            logger.info("Groq client initialized.")
        else:
            self.groq_client = None
            logger.warning("Groq API key not provided. Code analysis will be skipped.")

        # Extensions (and a few exact filenames, e.g. "requirements.txt")
        # considered worth analyzing.
        self.analyzable_extensions = {
            '.py', '.js', '.ts', '.java', '.cpp', '.c', '.h', '.cs', '.php',
            '.rb', '.go', '.rs', '.swift', '.kt', '.scala', '.sh', '.bash',
            '.sql', '.html', '.css', '.scss', '.less', '.json', '.xml', '.yaml', '.yml',
            '.md', '.txt', '.cfg', '.conf', '.ini', '.env', '.dockerfile', 'requirements.txt'
        }

    def extract_repo_info(self, repo_url: str) -> Tuple[str, str]:
        """Extract (owner, repository) from a GitHub URL.

        Raises:
            ValueError: if the URL path does not contain at least owner/repo.
        """
        try:
            parsed_url = urlparse(repo_url)
            path = parsed_url.path.strip('/')
            # Strip only a TRAILING ".git" suffix. The previous
            # path.replace('.git', '') removed the substring anywhere,
            # corrupting names such as "my.github-tools".
            if path.endswith('.git'):
                path = path[:-len('.git')]
            parts = path.split('/')
            if len(parts) >= 2:
                return parts[0], parts[1]
            raise ValueError("Invalid repository URL format")
        except Exception as e:
            logger.error(f"Error parsing repository URL: {e}")
            raise

    def get_repository_structure(self, owner: str, repo: str, path: str = "") -> List[Dict]:
        """Recursively fetch the entire file structure of the repository.

        Errors on a subdirectory are logged and skipped, so a partial
        listing may be returned rather than failing the whole walk.
        """
        all_files = []
        try:
            contents = self._fetch_contents(owner, repo, path)
            for item in contents:
                if item['type'] == 'dir':
                    all_files.extend(self.get_repository_structure(owner, repo, item['path']))
                else:
                    all_files.append(item)
        except Exception as e:
            logger.error(f"Failed to get repository structure for {path}: {e}")
        return all_files

    def _fetch_contents(self, owner: str, repo: str, path: str) -> List[Dict]:
        """Fetch contents of one directory, following RFC 5988 pagination links."""
        url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}"
        items = []
        while url:
            self.rate_limiter.wait_if_needed()
            response = self.session.get(url)
            response.raise_for_status()
            items.extend(response.json())
            # requests parses the Link header; absent 'next' ends the loop.
            url = response.links.get('next', {}).get('url')
        return items

    def map_reduce_analysis(self, owner: str, repo: str, files: List[Dict], progress: "gr.Progress"):
        """
        Analyzes files in parallel (Map phase) and aggregates results
        (Reduce phase).

        Uses a ThreadPoolExecutor because the per-file work is I/O-bound
        (raw-content download + LLM API call).

        Returns:
            (tree_markdown, details_markdown, summary_markdown)
        """
        # --- MAP PHASE ---
        # Each file is processed independently in a separate thread.
        logger.info(f"Starting Map phase: Analyzing {len(files)} files in parallel.")
        with ThreadPoolExecutor(max_workers=10) as executor:
            future_to_file = {executor.submit(self._process_single_file, owner, repo, file_item): file_item for file_item in files}

            # tqdm progress tracking integrated with Gradio.
            pbar = tqdm(as_completed(future_to_file), total=len(files), desc="Analyzing Files")
            for future in pbar:
                try:
                    file_info = future.result()
                    if file_info:
                        # Store the result of the map phase.
                        self.file_contents[file_info.path] = file_info
                        pbar.set_description(f"Analyzed {file_info.name}")
                        # Update the Gradio progress bar; guard against a
                        # zero total to avoid ZeroDivisionError.
                        if pbar.total:
                            progress(pbar.n / pbar.total, desc=pbar.desc)
                except Exception as e:
                    file_item = future_to_file[future]
                    logger.error(f"Error processing {file_item['path']}: {e}")

        # --- REDUCE PHASE ---
        # Aggregate and structure the mapped results into the three outputs.
        logger.info("Reduce phase: Aggregating results.")
        tree = self._create_directory_tree(owner, repo)
        details = self._format_detailed_explanations()
        summary = self._format_summary(owner, repo)

        return tree, details, summary

    def _process_single_file(self, owner: str, repo: str, file_item: Dict) -> Optional["FileInfo"]:
        """Process one file: filter, download, analyze. Returns None when skipped."""
        file_path = file_item['path']
        file_size = file_item.get('size', 0)

        if not self._should_analyze_file(file_path, file_size):
            return None

        content = self._get_raw_file(owner, repo, file_path)
        if content is None:
            return None

        explanation = self._analyze_code_with_llm(content, file_path)

        return FileInfo(
            path=file_path, name=file_item['name'], content=content,
            explanation=explanation, size=file_size, file_type=os.path.splitext(file_path)[1]
        )

    def _should_analyze_file(self, file_path: str, file_size: int) -> bool:
        """Decide whether a file is worth analyzing (extension/name + size)."""
        if file_size > 1024 * 1024:
            return False  # Skip files > 1MB
        file_name = os.path.basename(file_path)
        _, file_ext = os.path.splitext(file_name)
        # Exact-filename match is case-insensitive so e.g. "Requirements.txt"
        # is also picked up (superset of the old case-sensitive check).
        return file_ext.lower() in self.analyzable_extensions or file_name.lower() in self.analyzable_extensions

    def _get_raw_file(self, owner: str, repo: str, file_path: str) -> Optional[str]:
        """Fetch raw file content, trying the 'main' then 'master' branch.

        Returns None for unreachable files or likely-binary content.
        """
        for branch in ['main', 'master']:
            url = f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}/{file_path}"
            try:
                response = self.session.get(url, timeout=10)
                if response.status_code == 200:
                    # Simple binary check: a NUL byte means not text.
                    return response.text if '\x00' not in response.text else None
            except (requests.RequestException, UnicodeDecodeError) as e:
                logger.warning(f"Could not fetch or decode {file_path} from branch {branch}: {e}")
        return None

    def _analyze_code_with_llm(self, code: str, file_path: str) -> str:
        """Analyze code using the Groq LLM API; returns a Markdown explanation."""
        if not self.groq_client:
            return "Analysis skipped: Groq API key not provided."

        # Truncate oversized files so the prompt fits the model context.
        max_code_length = 8000
        if len(code) > max_code_length:
            code = code[:max_code_length] + "\n... (truncated)"

        prompt = f"""Analyze the following code from file '{file_path}'. Provide a concise explanation of its functionality, purpose, and key components.
```
{code}
```
Structure your analysis with these points:
1. **Main Purpose**: What is the primary goal of this file?
2. **Key Functions/Classes**: What are the main components and what do they do?
3. **Overall Role**: How does this file fit into the larger project?
"""
        try:
            chat_completion = self.groq_client.chat.completions.create(
                messages=[{"role": "user", "content": prompt}],
                model="llama3-8b-8192",
                temperature=0.2, max_tokens=1024
            )
            return chat_completion.choices[0].message.content.strip()
        except GroqError as e:
            logger.error(f"Groq API error for {file_path}: {e}")
            # Use str(e): not every GroqError subclass has a .message attribute.
            return f"Error: Groq API request failed - {e}"
        except Exception as e:
            logger.error(f"Error calling Groq API for {file_path}: {e}")
            return f"Error: {e}"

    def _create_directory_tree(self, owner: str, repo: str) -> str:
        """Render the analyzed files as an ASCII directory tree in a code fence."""
        tree = Tree()
        root_id = f"{owner}/{repo}"
        tree.create_node(f"🌳 {root_id}", root_id)
        created_nodes = {root_id}

        for file_path in sorted(self.file_contents.keys()):
            path_parts = file_path.split('/')
            parent_id = root_id
            # Create any missing intermediate directory nodes.
            for i, part in enumerate(path_parts[:-1]):
                node_id = f"{root_id}/{'/'.join(path_parts[:i+1])}"
                if node_id not in created_nodes:
                    tree.create_node(f"📁 {part}", node_id, parent=parent_id)
                    created_nodes.add(node_id)
                parent_id = node_id

            file_name = path_parts[-1]
            file_type = self.file_contents[file_path].file_type
            emoji = self.get_file_emoji(file_type)
            tree.create_node(f"{emoji} {file_name}", f"{root_id}/{file_path}", parent=parent_id)

        return f"```\n{tree.show(line_type='ascii-ex')}\n```"

    def _format_detailed_explanations(self) -> str:
        """Format all file explanations into a single Markdown string."""
        if not self.file_contents:
            return "No files were analyzed."

        output = []
        for path, info in sorted(self.file_contents.items()):
            output.append(f"### 📄 `{path}`")
            output.append(f"**Size**: {info.size:,} bytes")
            output.append("---")
            output.append(info.explanation)
            output.append("\n---\n")
        return "\n".join(output)

    def _format_summary(self, owner: str, repo: str) -> str:
        """Create a short Markdown summary of the analysis run."""
        total_files = len(self.file_contents)
        total_size = sum(info.size for info in self.file_contents.values())
        return (
            f"## Analysis Summary for `{owner}/{repo}`\n"
            f"- **Total Files Analyzed**: {total_files}\n"
            f"- **Total Code Size Analyzed**: {total_size:,} bytes"
        )

    @staticmethod
    def get_file_emoji(file_type: str) -> str:
        """Return an emoji for a given file type; 📄 for unknown types."""
        emoji_map = {
            '.py': '🐍', '.js': '🟨', '.ts': '🔷', '.java': '☕', '.html': '🌐',
            '.css': '🎨', '.json': '📋', '.md': '📝', '.sh': '🐚', '.yml': '⚙️',
            '.yaml': '⚙️', '.dockerfile': '🐳', '.sql': '🗄️', 'requirements.txt': '📦'
        }
        return emoji_map.get(file_type.lower(), '📄')
269
+
270
class RateLimiter:
    """Sliding-window rate limiter.

    Tracks call timestamps within a rolling window and blocks (sleeps)
    when the per-window quota would otherwise be exceeded.
    """

    def __init__(self, max_calls: int, time_window: int):
        self.max_calls = max_calls      # calls allowed per window
        self.time_window = time_window  # window length, in seconds
        self.calls = []                 # ascending timestamps of recent calls

    def wait_if_needed(self):
        """Record one call, sleeping first if the window quota is exhausted."""
        current = time.time()
        # Timestamps are appended in order, so expired entries are always
        # at the front; drop them from the left.
        while self.calls and current - self.calls[0] >= self.time_window:
            self.calls.pop(0)
        if len(self.calls) >= self.max_calls:
            # Sleep until the oldest remaining call ages out of the window.
            sleep_time = self.time_window - (current - self.calls[0])
            if sleep_time > 0:
                logger.info(f"Rate limit reached. Sleeping for {sleep_time:.2f} seconds.")
                time.sleep(sleep_time)
        self.calls.append(time.time())
286
+
287
# --- Gradio Interface ---
def analyze_repo_gradio(repo_url: str, github_token: str, groq_key: str, progress=gr.Progress(track_tqdm=True)):
    """The main function executed by the Gradio interface.

    Returns:
        (tree_markdown, details_markdown, summary_markdown) — must match the
        click wiring outputs=[tree_output, details_output, summary_output].
    """
    # FIX: validation/error messages are returned in the THIRD slot so they
    # render on the "Summary" tab (the tab shown by default). Previously they
    # were returned first and landed on the hidden "File Tree" tab.
    if not repo_url:
        return "", "", "Please enter a GitHub repository URL."
    if not groq_key:
        return "", "", "Please enter your Groq API Key."

    try:
        analyzer = GitHubRepositoryAnalyzer(github_token=github_token, groq_api_key=groq_key)

        progress(0, desc="Extracting repo info...")
        owner, repo = analyzer.extract_repo_info(repo_url)

        progress(0.1, desc="Fetching repository file structure...")
        all_files = analyzer.get_repository_structure(owner, repo)
        if not all_files:
            return "", "", "Could not retrieve repository structure. Check URL or token."

        tree, details, summary = analyzer.map_reduce_analysis(owner, repo, all_files, progress)

        return tree, details, summary
    except Exception as e:
        logger.error(f"A critical error occurred: {e}", exc_info=True)
        return "", "", f"An error occurred: {e}"
312
+
313
def create_gradio_interface():
    """Builds and returns the Gradio web interface.

    Returns:
        The assembled (not yet launched) gr.Blocks app.
    """
    with gr.Blocks(theme=gr.themes.Soft(), title="GitHub Repo Analyzer") as demo:
        gr.Markdown("# 🤖 AI-Powered GitHub Repository Analyzer")
        gr.Markdown("Enter a GitHub repository URL to generate a file tree, get AI-powered explanations for each file, and see a summary.")

        with gr.Row():
            with gr.Column(scale=2):
                repo_url = gr.Textbox(label="GitHub Repository URL", placeholder="e.g., https://github.com/google/generative-ai-python")
                # Both keys are optional: the GitHub token raises API rate
                # limits; the Groq key enables per-file LLM analysis.
                with gr.Accordion("API Keys (Optional but Recommended)", open=False):
                    github_token = gr.Textbox(label="GitHub Token", placeholder="Enter your GitHub token for a higher rate limit", type="password")
                    groq_key = gr.Textbox(label="Groq API Key", placeholder="Enter your Groq API key for code analysis", type="password")

        analyze_btn = gr.Button("Analyze Repository", variant="primary")

        with gr.Tabs():
            with gr.TabItem("📊 Summary"):
                summary_output = gr.Markdown()
            with gr.TabItem("🌳 File Tree"):
                tree_output = gr.Markdown()
            with gr.TabItem("📄 Detailed Analysis"):
                details_output = gr.Markdown()

        # Output order must match the tuple returned by analyze_repo_gradio.
        analyze_btn.click(
            fn=analyze_repo_gradio,
            inputs=[repo_url, github_token, groq_key],
            outputs=[tree_output, details_output, summary_output]
        )
    return demo
342
+
343
# Script entry point: build the UI and serve it with debug logging enabled.
if __name__ == "__main__":
    create_gradio_interface().launch(debug=True)