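"""AI-powered GitHub repository analyzer.

Walks a repository's file tree via the GitHub API, downloads text files,
asks a Groq-hosted LLM to explain each one, and serves the results
through a Gradio interface.
"""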
import requests
import time
import os
from urllib.parse import urlparse
from treelib import Tree
from typing import Dict, List, Optional, Tuple
import logging
from dataclasses import dataclass
from concurrent.futures import ThreadPoolExecutor, as_completed
from groq import Groq, GroqError
import gradio as gr
from tqdm.auto import tqdm
# --- Basic Configuration ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# --- Data Structures ---
@dataclass
class FileInfo:
"""Data class to store file information"""
path: str
name: str
content: str
explanation: str
size: int
file_type: str
# --- Core Application Logic ---
class GitHubRepositoryAnalyzer:
"""
A class to analyze GitHub repositories by fetching file structures,
downloading content, and using an LLM to explain the code.
"""
def __init__(self, github_token: Optional[str] = None, groq_api_key: Optional[str] = None):
self.github_token = github_token
self.session = requests.Session()
self.file_contents: Dict[str, FileInfo] = {}
# Configure GitHub API access
if self.github_token:
logger.info("Using provided GitHub token for higher rate limits.")
self.session.headers.update({'Authorization': f'token {self.github_token}'})
# Authenticated GitHub API: 5000 requests/hour
self.rate_limiter = RateLimiter(max_calls=4500, time_window=3600)
else:
logger.warning("No GitHub token. Using unauthenticated access with lower rate limits.")
# Unauthenticated: 60 requests/hour
self.rate_limiter = RateLimiter(max_calls=50, time_window=3600)
# Configure Groq client
if groq_api_key:
self.groq_client = Groq(api_key=groq_api_key)
logger.info("Groq client initialized.")
else:
self.groq_client = None
logger.warning("Groq API key not provided. Code analysis will be skipped.")
        # File types to analyze: extensions plus a few exact filenames
        # (e.g. Dockerfile, requirements.txt); both are checked in _should_analyze_file.
        self.analyzable_extensions = {
            '.py', '.js', '.ts', '.java', '.cpp', '.c', '.h', '.cs', '.php',
            '.rb', '.go', '.rs', '.swift', '.kt', '.scala', '.sh', '.bash',
            '.sql', '.html', '.css', '.scss', '.less', '.json', '.xml', '.yaml', '.yml',
            '.md', '.txt', '.cfg', '.conf', '.ini', '.env', '.dockerfile',
            'Dockerfile', 'requirements.txt'
        }
def extract_repo_info(self, repo_url: str) -> Tuple[str, str]:
"""Extract owner and repository name from a GitHub URL."""
try:
parsed_url = urlparse(repo_url)
            path = parsed_url.path.strip('/')
            if path.endswith('.git'):  # strip the suffix only; .replace() would mangle names containing ".git"
                path = path[:-4]
parts = path.split('/')
if len(parts) >= 2:
return parts[0], parts[1]
raise ValueError("Invalid repository URL format")
except Exception as e:
logger.error(f"Error parsing repository URL: {e}")
raise
def get_repository_structure(self, owner: str, repo: str, path: str = "") -> List[Dict]:
"""Recursively fetch the entire file structure of the repository."""
all_files = []
try:
contents = self._fetch_contents(owner, repo, path)
for item in contents:
if item['type'] == 'dir':
all_files.extend(self.get_repository_structure(owner, repo, item['path']))
else:
all_files.append(item)
except Exception as e:
logger.error(f"Failed to get repository structure for {path}: {e}")
return all_files
def _fetch_contents(self, owner: str, repo: str, path: str) -> List[Dict]:
"""Helper to fetch contents of a specific directory with pagination."""
url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}"
items = []
while url:
self.rate_limiter.wait_if_needed()
response = self.session.get(url)
response.raise_for_status()
            data = response.json()
            # A directory returns a list; a single file returns one object.
            items.extend(data if isinstance(data, list) else [data])
url = response.links.get('next', {}).get('url')
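            # requests exposes the parsed Link header as response.links, e.g.
            # {'next': {'url': 'https://api.github.com/...?page=2', 'rel': 'next'}};
            # once no 'next' relation remains, url is None and the loop ends.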
return items
def map_reduce_analysis(self, owner: str, repo: str, files: List[Dict], progress: gr.Progress):
"""
Analyzes files in parallel (Map phase) and aggregates results (Reduce phase).
This method uses a ThreadPoolExecutor to perform I/O-bound tasks concurrently.
"""
# --- MAP PHASE ---
# Each file is processed independently in a separate thread.
# This is efficient for tasks that wait for network responses (API calls).
logger.info(f"Starting Map phase: Analyzing {len(files)} files in parallel.")
with ThreadPoolExecutor(max_workers=10) as executor:
future_to_file = {executor.submit(self._process_single_file, owner, repo, file_item): file_item for file_item in files}
# tqdm progress tracking integrated with Gradio
pbar = tqdm(as_completed(future_to_file), total=len(files), desc="Analyzing Files")
for future in pbar:
try:
file_info = future.result()
if file_info:
# Store the result of the map phase
self.file_contents[file_info.path] = file_info
pbar.set_description(f"Analyzed {file_info.name}")
# Update Gradio progress bar
progress(pbar.n / pbar.total, desc=pbar.desc)
except Exception as e:
file_item = future_to_file[future]
logger.error(f"Error processing {file_item['path']}: {e}")
# --- REDUCE PHASE ---
# The reduce phase is the aggregation and structuring of the mapped results,
# which happens after the loop when creating the tree and summary.
logger.info("Reduce phase: Aggregating results.")
tree = self._create_directory_tree(owner, repo)
details = self._format_detailed_explanations()
summary = self._format_summary(owner, repo)
return tree, details, summary
def _process_single_file(self, owner: str, repo: str, file_item: Dict) -> Optional[FileInfo]:
"""Processes a single file: download, check, and analyze."""
file_path = file_item['path']
file_size = file_item.get('size', 0)
if not self._should_analyze_file(file_path, file_size):
return None
content = self._get_raw_file(owner, repo, file_path)
if content is None:
return None
explanation = self._analyze_code_with_llm(content, file_path)
return FileInfo(
path=file_path, name=file_item['name'], content=content,
explanation=explanation, size=file_size, file_type=os.path.splitext(file_path)[1]
)
def _should_analyze_file(self, file_path: str, file_size: int) -> bool:
"""Determine if a file should be analyzed based on extension and size."""
        if file_size > 1024 * 1024:  # Skip files larger than 1 MB
            return False
file_name = os.path.basename(file_path)
_, file_ext = os.path.splitext(file_name)
return file_ext.lower() in self.analyzable_extensions or file_name in self.analyzable_extensions
def _get_raw_file(self, owner: str, repo: str, file_path: str) -> Optional[str]:
"""Fetch raw file content with fallback branches."""
for branch in ['main', 'master']:
url = f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}/{file_path}"
try:
response = self.session.get(url, timeout=10)
if response.status_code == 200:
# Simple check for binary content
return response.text if '\x00' not in response.text else None
except (requests.RequestException, UnicodeDecodeError) as e:
logger.warning(f"Could not fetch or decode {file_path} from branch {branch}: {e}")
return None
def _analyze_code_with_llm(self, code: str, file_path: str) -> str:
"""Analyze code using Groq LLM API."""
if not self.groq_client:
return "Analysis skipped: Groq API key not provided."
        max_code_length = 8000  # rough guard against exceeding the model's context window
        if len(code) > max_code_length:
            code = code[:max_code_length] + "\n... (truncated)"
prompt = f"""Analyze the following code from file '{file_path}'. Provide a concise explanation of its functionality, purpose, and key components.
```
{code}
```
Structure your analysis with these points:
1. **Main Purpose**: What is the primary goal of this file?
2. **Key Functions/Classes**: What are the main components and what do they do?
3. **Overall Role**: How does this file fit into the larger project?
"""
try:
chat_completion = self.groq_client.chat.completions.create(
messages=[{"role": "user", "content": prompt}],
model="llama3-8b-8192",
temperature=0.2, max_tokens=1024
)
return chat_completion.choices[0].message.content.strip()
except GroqError as e:
logger.error(f"Groq API error for {file_path}: {e}")
return f"Error: Groq API request failed - {e.message}"
except Exception as e:
logger.error(f"Error calling Groq API for {file_path}: {e}")
return f"Error: {e}"
def _create_directory_tree(self, owner: str, repo: str) -> str:
"""Creates a string representation of the directory tree."""
tree = Tree()
root_id = f"{owner}/{repo}"
tree.create_node(f"🌳 {root_id}", root_id)
created_nodes = {root_id}
for file_path in sorted(self.file_contents.keys()):
path_parts = file_path.split('/')
parent_id = root_id
for i, part in enumerate(path_parts[:-1]):
node_id = f"{root_id}/{'/'.join(path_parts[:i+1])}"
if node_id not in created_nodes:
tree.create_node(f"πŸ“ {part}", node_id, parent=parent_id)
created_nodes.add(node_id)
parent_id = node_id
            file_name = path_parts[-1]
            file_type = self.file_contents[file_path].file_type
            # Try the exact filename first (covers requirements.txt, Dockerfile), then the extension.
            emoji = self.get_file_emoji(file_name)
            if emoji == '📄':
                emoji = self.get_file_emoji(file_type)
            tree.create_node(f"{emoji} {file_name}", f"{root_id}/{file_path}", parent=parent_id)
        # treelib >= 1.7 returns the rendered tree from show(); older versions print it and return None.
        return f"```\n{tree.show(line_type='ascii-ex')}\n```"
def _format_detailed_explanations(self) -> str:
"""Formats all file explanations into a single Markdown string."""
        if not self.file_contents:
            return "No files were analyzed."
output = []
for path, info in sorted(self.file_contents.items()):
output.append(f"### πŸ“„ `{path}`")
output.append(f"**Size**: {info.size:,} bytes")
output.append("---")
output.append(info.explanation)
output.append("\n---\n")
return "\n".join(output)
def _format_summary(self, owner: str, repo: str) -> str:
"""Creates a summary of the analysis."""
total_files = len(self.file_contents)
total_size = sum(info.size for info in self.file_contents.values())
return (
f"## Analysis Summary for `{owner}/{repo}`\n"
f"- **Total Files Analyzed**: {total_files}\n"
f"- **Total Code Size Analyzed**: {total_size:,} bytes"
)
    @staticmethod
    def get_file_emoji(file_type: str) -> str:
        """Returns an emoji for a given file extension or exact filename."""
        emoji_map = {
            '.py': '🐍', '.js': '🟨', '.ts': '🔷', '.java': '☕', '.html': '🌐',
            '.css': '🎨', '.json': '📋', '.md': '📝', '.sh': '🐚', '.yml': '⚙️',
            '.yaml': '⚙️', '.dockerfile': '🐳', 'dockerfile': '🐳', '.sql': '🗄️',
            'requirements.txt': '📦'
        }
        return emoji_map.get(file_type.lower(), '📄')
class RateLimiter:
"""Simple rate limiter to avoid exceeding API limits."""
def __init__(self, max_calls: int, time_window: int):
self.max_calls = max_calls
self.time_window = time_window
self.calls = []
def wait_if_needed(self):
now = time.time()
self.calls = [t for t in self.calls if now - t < self.time_window]
if len(self.calls) >= self.max_calls:
sleep_time = self.time_window - (now - self.calls[0])
if sleep_time > 0:
logger.info(f"Rate limit reached. Sleeping for {sleep_time:.2f} seconds.")
time.sleep(sleep_time)
self.calls.append(time.time())
# --- Gradio Interface ---
def analyze_repo_gradio(repo_url: str, github_token: str, groq_key: str, progress=gr.Progress(track_tqdm=True)):
"""The main function executed by the Gradio interface."""
    # Error/help messages go in the third slot so they appear in the default
    # Summary tab (outputs are [tree, details, summary]).
    if not repo_url:
        return "", "", "Please enter a GitHub repository URL."
    if not groq_key:
        return "", "", "Please enter your Groq API Key."
try:
analyzer = GitHubRepositoryAnalyzer(github_token=github_token, groq_api_key=groq_key)
progress(0, desc="Extracting repo info...")
owner, repo = analyzer.extract_repo_info(repo_url)
progress(0.1, desc="Fetching repository file structure...")
all_files = analyzer.get_repository_structure(owner, repo)
        if not all_files:
            return "", "", "Could not retrieve repository structure. Check URL or token."
tree, details, summary = analyzer.map_reduce_analysis(owner, repo, all_files, progress)
return tree, details, summary
except Exception as e:
logger.error(f"A critical error occurred: {e}", exc_info=True)
return f"An error occurred: {e}", "", ""
def create_gradio_interface():
"""Builds and returns the Gradio web interface."""
with gr.Blocks(theme=gr.themes.Soft(), title="GitHub Repo Analyzer") as demo:
gr.Markdown("# πŸ€– AI-Powered GitHub Repository Analyzer")
gr.Markdown("Enter a GitHub repository URL to generate a file tree, get AI-powered explanations for each file, and see a summary.")
with gr.Row():
with gr.Column(scale=2):
repo_url = gr.Textbox(label="GitHub Repository URL", placeholder="e.g., https://github.com/google/generative-ai-python")
with gr.Accordion("API Keys (Optional but Recommended)", open=False):
github_token = gr.Textbox(label="GitHub Token", placeholder="Enter your GitHub token for a higher rate limit", type="password")
groq_key = gr.Textbox(label="Groq API Key", placeholder="Enter your Groq API key for code analysis", type="password")
analyze_btn = gr.Button("Analyze Repository", variant="primary")
with gr.Tabs():
with gr.TabItem("πŸ“Š Summary"):
summary_output = gr.Markdown()
with gr.TabItem("🌳 File Tree"):
tree_output = gr.Markdown()
with gr.TabItem("πŸ“„ Detailed Analysis"):
details_output = gr.Markdown()
analyze_btn.click(
fn=analyze_repo_gradio,
inputs=[repo_url, github_token, groq_key],
outputs=[tree_output, details_output, summary_output]
)
return demo
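# Local usage (assumed environment, matching the imports above):
#   pip install requests treelib groq gradio tqdm
#   python app.py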
if __name__ == "__main__":
app = create_gradio_interface()
app.launch(debug=True)