File size: 15,686 Bytes
a1cb144
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
import requests
import time
import os
from urllib.parse import urlparse
from treelib import Tree
from typing import Dict, List, Optional, Tuple
import logging
from dataclasses import dataclass
from concurrent.futures import ThreadPoolExecutor, as_completed
from groq import Groq, GroqError
import gradio as gr
from tqdm.auto import tqdm

# --- Basic Configuration ---
# Module-wide logging: timestamped INFO-level output; all classes below log
# through this single module-level logger.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- Data Structures ---
@dataclass
class FileInfo:
    """Immutable record of one analyzed repository file.

    Instances are produced by GitHubRepositoryAnalyzer._process_single_file
    and stored in its ``file_contents`` dict keyed by ``path``.
    """
    path: str         # repo-relative path, e.g. "src/main.py"
    name: str         # base file name, e.g. "main.py"
    content: str      # raw (possibly truncated by the LLM step) file text
    explanation: str  # LLM-generated explanation, or an error/skip message
    size: int         # file size in bytes as reported by the GitHub API
    file_type: str    # file extension including the dot, e.g. ".py"

# --- Core Application Logic ---
class GitHubRepositoryAnalyzer:
    """
    Analyze a GitHub repository: recursively fetch its file structure via the
    GitHub REST API, download raw file contents, and (optionally) use a
    Groq-hosted LLM to explain each file.

    Analyzed files accumulate in ``self.file_contents``, keyed by the
    repo-relative file path.
    """
    def __init__(self, github_token: Optional[str] = None, groq_api_key: Optional[str] = None):
        """Set up the HTTP session, rate limiter, and optional Groq client.

        Args:
            github_token: Personal access token; raises the GitHub API
                rate limit from 60 to 5000 requests/hour.
            groq_api_key: Groq API key; without it, LLM analysis is skipped.
        """
        self.github_token = github_token
        self.session = requests.Session()
        self.file_contents: Dict[str, FileInfo] = {}

        # Configure GitHub API access. Limits are set slightly below the
        # documented ceilings (5000/h authenticated, 60/h anonymous) to
        # leave a safety margin for other clients sharing the quota.
        if self.github_token:
            logger.info("Using provided GitHub token for higher rate limits.")
            self.session.headers.update({'Authorization': f'token {self.github_token}'})
            # Authenticated GitHub API: 5000 requests/hour
            self.rate_limiter = RateLimiter(max_calls=4500, time_window=3600)
        else:
            logger.warning("No GitHub token. Using unauthenticated access with lower rate limits.")
            # Unauthenticated: 60 requests/hour
            self.rate_limiter = RateLimiter(max_calls=50, time_window=3600)

        # Configure Groq client (None => analysis is skipped per file).
        if groq_api_key:
            self.groq_client = Groq(api_key=groq_api_key)
            logger.info("Groq client initialized.")
        else:
            self.groq_client = None
            logger.warning("Groq API key not provided. Code analysis will be skipped.")

        # Extensions (and a few exact file names, e.g. "requirements.txt")
        # considered worth analyzing; everything else is skipped.
        self.analyzable_extensions = {
            '.py', '.js', '.ts', '.java', '.cpp', '.c', '.h', '.cs', '.php',
            '.rb', '.go', '.rs', '.swift', '.kt', '.scala', '.sh', '.bash',
            '.sql', '.html', '.css', '.scss', '.less', '.json', '.xml', '.yaml', '.yml',
            '.md', '.txt', '.cfg', '.conf', '.ini', '.env', '.dockerfile', 'requirements.txt'
        }

    def extract_repo_info(self, repo_url: str) -> Tuple[str, str]:
        """Extract (owner, repository) from a GitHub URL.

        Raises:
            ValueError: if the URL path does not contain owner/repo segments.
        """
        try:
            parsed_url = urlparse(repo_url)
            path = parsed_url.path.strip('/')
            # Strip only a trailing ".git" suffix. A blanket replace() would
            # corrupt names that contain ".git" elsewhere, e.g.
            # "user/my.github.io" would become "user/myhub.io".
            if path.endswith('.git'):
                path = path[:-4]
            parts = path.split('/')
            if len(parts) >= 2:
                return parts[0], parts[1]
            raise ValueError("Invalid repository URL format")
        except Exception as e:
            logger.error(f"Error parsing repository URL: {e}")
            raise

    def get_repository_structure(self, owner: str, repo: str, path: str = "") -> List[Dict]:
        """Recursively fetch the entire file structure of the repository.

        Returns a flat list of GitHub content items of type 'file';
        directories are descended into. Errors are logged, not raised, so a
        single unreadable subtree does not abort the whole walk.
        """
        all_files = []
        try:
            contents = self._fetch_contents(owner, repo, path)
            for item in contents:
                if item['type'] == 'dir':
                    all_files.extend(self.get_repository_structure(owner, repo, item['path']))
                else:
                    all_files.append(item)
        except Exception as e:
            logger.error(f"Failed to get repository structure for {path}: {e}")
        return all_files

    def _fetch_contents(self, owner: str, repo: str, path: str) -> List[Dict]:
        """Fetch one directory's contents, following Link-header pagination.

        Each HTTP request first checks in with the rate limiter.
        """
        url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}"
        items = []
        while url:
            self.rate_limiter.wait_if_needed()
            response = self.session.get(url)
            response.raise_for_status()
            items.extend(response.json())
            # requests parses the RFC 5988 Link header; absent 'next' ends loop.
            url = response.links.get('next', {}).get('url')
        return items

    def map_reduce_analysis(self, owner: str, repo: str, files: List[Dict], progress: gr.Progress):
        """
        Analyze files in parallel (map phase) and aggregate results (reduce phase).

        Uses a ThreadPoolExecutor because the per-file work is I/O-bound
        (HTTP downloads and LLM API calls release the GIL while waiting).

        Returns:
            (tree_markdown, details_markdown, summary_markdown) strings.
        """
        # --- MAP PHASE ---
        # Each file is processed independently in a separate thread.
        logger.info(f"Starting Map phase: Analyzing {len(files)} files in parallel.")
        with ThreadPoolExecutor(max_workers=10) as executor:
            future_to_file = {executor.submit(self._process_single_file, owner, repo, file_item): file_item for file_item in files}

            # tqdm progress tracking integrated with Gradio
            pbar = tqdm(as_completed(future_to_file), total=len(files), desc="Analyzing Files")
            for future in pbar:
                try:
                    file_info = future.result()
                    if file_info:
                        # Store the result of the map phase
                        self.file_contents[file_info.path] = file_info
                        pbar.set_description(f"Analyzed {file_info.name}")
                    # Update Gradio progress bar; guard against a zero total
                    # (empty file list) to avoid ZeroDivisionError.
                    if pbar.total:
                        progress(pbar.n / pbar.total, desc=pbar.desc)
                except Exception as e:
                    file_item = future_to_file[future]
                    logger.error(f"Error processing {file_item['path']}: {e}")

        # --- REDUCE PHASE ---
        # Aggregate the mapped results into the tree, details, and summary.
        logger.info("Reduce phase: Aggregating results.")
        tree = self._create_directory_tree(owner, repo)
        details = self._format_detailed_explanations()
        summary = self._format_summary(owner, repo)

        return tree, details, summary

    def _process_single_file(self, owner: str, repo: str, file_item: Dict) -> Optional[FileInfo]:
        """Download, filter, and analyze one file.

        Returns None when the file is skipped (type/size filter) or could not
        be fetched/decoded.
        """
        file_path = file_item['path']
        file_size = file_item.get('size', 0)

        if not self._should_analyze_file(file_path, file_size):
            return None

        content = self._get_raw_file(owner, repo, file_path)
        if content is None:
            return None

        explanation = self._analyze_code_with_llm(content, file_path)

        return FileInfo(
            path=file_path, name=file_item['name'], content=content,
            explanation=explanation, size=file_size, file_type=os.path.splitext(file_path)[1]
        )

    def _should_analyze_file(self, file_path: str, file_size: int) -> bool:
        """Return True if the file passes the size cap and extension/name filter."""
        if file_size > 1024 * 1024: return False  # Skip files > 1MB
        file_name = os.path.basename(file_path)
        _, file_ext = os.path.splitext(file_name)
        # Exact-name match covers entries like 'requirements.txt' in the set.
        return file_ext.lower() in self.analyzable_extensions or file_name in self.analyzable_extensions

    def _get_raw_file(self, owner: str, repo: str, file_path: str) -> Optional[str]:
        """Fetch raw file content, trying the 'main' then 'master' branch.

        Returns None for binary-looking content (NUL byte present) or when
        the file cannot be fetched/decoded from either branch.
        """
        for branch in ('main', 'master'):
            url = f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}/{file_path}"
            try:
                response = self.session.get(url, timeout=10)
                if response.status_code == 200:
                    # Decode once; response.text re-decodes the body on each access.
                    text = response.text
                    # Simple heuristic: a NUL byte marks binary content.
                    return text if '\x00' not in text else None
            except (requests.RequestException, UnicodeDecodeError) as e:
                logger.warning(f"Could not fetch or decode {file_path} from branch {branch}: {e}")
        return None

    def _analyze_code_with_llm(self, code: str, file_path: str) -> str:
        """Ask the Groq LLM to explain the given code; never raises.

        Returns the explanation text, or a human-readable error/skip message.
        """
        if not self.groq_client:
            return "Analysis skipped: Groq API key not provided."

        # Truncate to keep the prompt within the model's context window.
        max_code_length = 8000
        if len(code) > max_code_length: code = code[:max_code_length] + "\n... (truncated)"

        prompt = f"""Analyze the following code from file '{file_path}'. Provide a concise explanation of its functionality, purpose, and key components.
        ```
        {code}
        ```
        Structure your analysis with these points:
        1. **Main Purpose**: What is the primary goal of this file?
        2. **Key Functions/Classes**: What are the main components and what do they do?
        3. **Overall Role**: How does this file fit into the larger project?
        """
        try:
            chat_completion = self.groq_client.chat.completions.create(
                messages=[{"role": "user", "content": prompt}],
                model="llama3-8b-8192",
                temperature=0.2, max_tokens=1024
            )
            return chat_completion.choices[0].message.content.strip()
        except GroqError as e:
            logger.error(f"Groq API error for {file_path}: {e}")
            # Not every GroqError subclass exposes a `.message` attribute;
            # formatting the exception itself is always safe.
            return f"Error: Groq API request failed - {e}"
        except Exception as e:
            logger.error(f"Error calling Groq API for {file_path}: {e}")
            return f"Error: {e}"

    def _create_directory_tree(self, owner: str, repo: str) -> str:
        """Render the analyzed files as an ASCII tree inside a Markdown fence.

        Node IDs are full paths prefixed with "owner/repo" so names can
        repeat across directories without colliding.
        """
        tree = Tree()
        root_id = f"{owner}/{repo}"
        tree.create_node(f"🌳 {root_id}", root_id)
        created_nodes = {root_id}

        for file_path in sorted(self.file_contents.keys()):
            path_parts = file_path.split('/')
            parent_id = root_id
            # Create any missing intermediate directory nodes.
            for i, part in enumerate(path_parts[:-1]):
                node_id = f"{root_id}/{'/'.join(path_parts[:i+1])}"
                if node_id not in created_nodes:
                    tree.create_node(f"πŸ“ {part}", node_id, parent=parent_id)
                    created_nodes.add(node_id)
                parent_id = node_id

            file_name = path_parts[-1]
            file_type = self.file_contents[file_path].file_type
            emoji = self.get_file_emoji(file_type)
            tree.create_node(f"{emoji} {file_name}", f"{root_id}/{file_path}", parent=parent_id)

        return f"```\n{tree.show(line_type='ascii-ex')}\n```"

    def _format_detailed_explanations(self) -> str:
        """Format all per-file explanations into one Markdown document."""
        if not self.file_contents: return "No files were analyzed."

        output = []
        for path, info in sorted(self.file_contents.items()):
            output.append(f"### πŸ“„ `{path}`")
            output.append(f"**Size**: {info.size:,} bytes")
            output.append("---")
            output.append(info.explanation)
            output.append("\n---\n")
        return "\n".join(output)

    def _format_summary(self, owner: str, repo: str) -> str:
        """Create a short Markdown summary (file count and total bytes)."""
        total_files = len(self.file_contents)
        total_size = sum(info.size for info in self.file_contents.values())
        return (
            f"## Analysis Summary for `{owner}/{repo}`\n"
            f"- **Total Files Analyzed**: {total_files}\n"
            f"- **Total Code Size Analyzed**: {total_size:,} bytes"
        )

    @staticmethod
    def get_file_emoji(file_type: str) -> str:
        """Return a display emoji for a file extension (πŸ“„ as the fallback)."""
        emoji_map = {
            '.py': '🐍', '.js': '🟨', '.ts': 'πŸ”·', '.java': 'β˜•', '.html': '🌐',
            '.css': '🎨', '.json': 'πŸ“‹', '.md': 'πŸ“', '.sh': '🐚', '.yml': 'βš™οΈ',
            '.yaml': 'βš™οΈ', '.dockerfile': '🐳', '.sql': 'πŸ—„οΈ', 'requirements.txt': 'πŸ“¦'
        }
        return emoji_map.get(file_type.lower(), 'πŸ“„')

class RateLimiter:
    """Sliding-window rate limiter for outbound API calls.

    Permits at most ``max_calls`` calls within any ``time_window``-second
    window; ``wait_if_needed`` blocks (sleeps) until a call is allowed.
    """
    def __init__(self, max_calls: int, time_window: int):
        self.max_calls = max_calls
        self.time_window = time_window
        self.calls = []

    def wait_if_needed(self):
        """Block until a call is permitted, then record its timestamp."""
        current = time.time()
        cutoff = current - self.time_window
        # Drop timestamps that have fallen out of the sliding window.
        self.calls = [stamp for stamp in self.calls if stamp > cutoff]
        if len(self.calls) >= self.max_calls:
            # Sleep until the oldest recorded call ages out of the window.
            pause = self.time_window - (current - self.calls[0])
            if pause > 0:
                logger.info(f"Rate limit reached. Sleeping for {pause:.2f} seconds.")
                time.sleep(pause)
        self.calls.append(time.time())

# --- Gradio Interface ---
def analyze_repo_gradio(repo_url: str, github_token: str, groq_key: str, progress=gr.Progress(track_tqdm=True)):
    """Entry point wired to the Gradio 'Analyze' button.

    Validates inputs, runs the full analysis pipeline, and returns the
    three Markdown strings (tree, details, summary) for the output tabs.
    On any failure a message is returned in the first output slot.
    """
    # Input validation: surface a prompt in the first output slot.
    validation_error = None
    if not repo_url:
        validation_error = "Please enter a GitHub repository URL."
    elif not groq_key:
        validation_error = "Please enter your Groq API Key."
    if validation_error is not None:
        return validation_error, "", ""

    try:
        analyzer = GitHubRepositoryAnalyzer(github_token=github_token, groq_api_key=groq_key)

        progress(0, desc="Extracting repo info...")
        owner, repo = analyzer.extract_repo_info(repo_url)

        progress(0.1, desc="Fetching repository file structure...")
        repo_files = analyzer.get_repository_structure(owner, repo)
        if not repo_files:
            return "Could not retrieve repository structure. Check URL or token.", "", ""

        return analyzer.map_reduce_analysis(owner, repo, repo_files, progress)
    except Exception as e:
        logger.error(f"A critical error occurred: {e}", exc_info=True)
        return f"An error occurred: {e}", "", ""

def create_gradio_interface():
    """Builds and returns the Gradio web interface.

    Layout: one input column (URL + collapsible API-key accordion + button)
    above three output tabs (summary, file tree, detailed analysis). The
    button is wired to `analyze_repo_gradio`.
    """
    with gr.Blocks(theme=gr.themes.Soft(), title="GitHub Repo Analyzer") as demo:
        gr.Markdown("# πŸ€– AI-Powered GitHub Repository Analyzer")
        gr.Markdown("Enter a GitHub repository URL to generate a file tree, get AI-powered explanations for each file, and see a summary.")

        with gr.Row():
            with gr.Column(scale=2):
                repo_url = gr.Textbox(label="GitHub Repository URL", placeholder="e.g., https://github.com/google/generative-ai-python")
                # Keys are optional but strongly recommended: the GitHub token
                # raises the API rate limit and the Groq key enables analysis.
                with gr.Accordion("API Keys (Optional but Recommended)", open=False):
                    github_token = gr.Textbox(label="GitHub Token", placeholder="Enter your GitHub token for a higher rate limit", type="password")
                    groq_key = gr.Textbox(label="Groq API Key", placeholder="Enter your Groq API key for code analysis", type="password")

                analyze_btn = gr.Button("Analyze Repository", variant="primary")

        with gr.Tabs():
            with gr.TabItem("πŸ“Š Summary"):
                summary_output = gr.Markdown()
            with gr.TabItem("🌳 File Tree"):
                tree_output = gr.Markdown()
            with gr.TabItem("πŸ“„ Detailed Analysis"):
                details_output = gr.Markdown()

        # NOTE: the handler returns (tree, details, summary), so the outputs
        # list is deliberately ordered tree -> details -> summary even though
        # the tabs render Summary first.
        analyze_btn.click(
            fn=analyze_repo_gradio,
            inputs=[repo_url, github_token, groq_key],
            outputs=[tree_output, details_output, summary_output]
        )
    return demo

# Script entry point: build the UI and serve it with debug logging enabled.
if __name__ == "__main__":
    demo = create_gradio_interface()
    demo.launch(debug=True)