File size: 15,686 Bytes
a1cb144
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
import requests
import time
import os
from urllib.parse import urlparse
from treelib import Tree
from typing import Dict, List, Optional, Tuple
import logging
from dataclasses import dataclass
from concurrent.futures import ThreadPoolExecutor, as_completed
from groq import Groq, GroqError
import gradio as gr
from tqdm.auto import tqdm

# --- Basic Configuration ---
# Module-wide logging: timestamped INFO-level output; all classes below log
# through this single module-level logger.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- Data Structures ---
@dataclass
class FileInfo:
    """Immutable record of one analyzed repository file.

    Instances are produced by GitHubRepositoryAnalyzer._process_single_file
    and stored in its ``file_contents`` dict keyed by ``path``.
    """
    path: str         # repo-relative path, e.g. "src/main.py"
    name: str         # base file name, e.g. "main.py"
    content: str      # raw (possibly truncated by the LLM step) file text
    explanation: str  # LLM-generated explanation, or an error/skip message
    size: int         # file size in bytes as reported by the GitHub API
    file_type: str    # file extension including the dot, e.g. ".py"

# --- Core Application Logic ---
class GitHubRepositoryAnalyzer:
    """
    Analyze a GitHub repository: recursively fetch its file structure via the
    GitHub REST API, download raw file contents, and (optionally) use a
    Groq-hosted LLM to explain each file.

    Analyzed files accumulate in ``self.file_contents``, keyed by the
    repo-relative file path.
    """
    def __init__(self, github_token: Optional[str] = None, groq_api_key: Optional[str] = None):
        """Set up the HTTP session, rate limiter, and optional Groq client.

        Args:
            github_token: Personal access token; raises the GitHub API
                rate limit from 60 to 5000 requests/hour.
            groq_api_key: Groq API key; without it, LLM analysis is skipped.
        """
        self.github_token = github_token
        self.session = requests.Session()
        self.file_contents: Dict[str, FileInfo] = {}

        # Configure GitHub API access. Limits are set slightly below the
        # documented ceilings (5000/h authenticated, 60/h anonymous) to
        # leave a safety margin for other clients sharing the quota.
        if self.github_token:
            logger.info("Using provided GitHub token for higher rate limits.")
            self.session.headers.update({'Authorization': f'token {self.github_token}'})
            # Authenticated GitHub API: 5000 requests/hour
            self.rate_limiter = RateLimiter(max_calls=4500, time_window=3600)
        else:
            logger.warning("No GitHub token. Using unauthenticated access with lower rate limits.")
            # Unauthenticated: 60 requests/hour
            self.rate_limiter = RateLimiter(max_calls=50, time_window=3600)

        # Configure Groq client (None => analysis is skipped per file).
        if groq_api_key:
            self.groq_client = Groq(api_key=groq_api_key)
            logger.info("Groq client initialized.")
        else:
            self.groq_client = None
            logger.warning("Groq API key not provided. Code analysis will be skipped.")

        # Extensions (and a few exact file names, e.g. "requirements.txt")
        # considered worth analyzing; everything else is skipped.
        self.analyzable_extensions = {
            '.py', '.js', '.ts', '.java', '.cpp', '.c', '.h', '.cs', '.php',
            '.rb', '.go', '.rs', '.swift', '.kt', '.scala', '.sh', '.bash',
            '.sql', '.html', '.css', '.scss', '.less', '.json', '.xml', '.yaml', '.yml',
            '.md', '.txt', '.cfg', '.conf', '.ini', '.env', '.dockerfile', 'requirements.txt'
        }

    def extract_repo_info(self, repo_url: str) -> Tuple[str, str]:
        """Extract (owner, repository) from a GitHub URL.

        Raises:
            ValueError: if the URL path does not contain owner/repo segments.
        """
        try:
            parsed_url = urlparse(repo_url)
            path = parsed_url.path.strip('/')
            # Strip only a trailing ".git" suffix. A blanket replace() would
            # corrupt names that contain ".git" elsewhere, e.g.
            # "user/my.github.io" would become "user/myhub.io".
            if path.endswith('.git'):
                path = path[:-4]
            parts = path.split('/')
            if len(parts) >= 2:
                return parts[0], parts[1]
            raise ValueError("Invalid repository URL format")
        except Exception as e:
            logger.error(f"Error parsing repository URL: {e}")
            raise

    def get_repository_structure(self, owner: str, repo: str, path: str = "") -> List[Dict]:
        """Recursively fetch the entire file structure of the repository.

        Returns a flat list of GitHub content items of type 'file';
        directories are descended into. Errors are logged, not raised, so a
        single unreadable subtree does not abort the whole walk.
        """
        all_files = []
        try:
            contents = self._fetch_contents(owner, repo, path)
            for item in contents:
                if item['type'] == 'dir':
                    all_files.extend(self.get_repository_structure(owner, repo, item['path']))
                else:
                    all_files.append(item)
        except Exception as e:
            logger.error(f"Failed to get repository structure for {path}: {e}")
        return all_files

    def _fetch_contents(self, owner: str, repo: str, path: str) -> List[Dict]:
        """Fetch one directory's contents, following Link-header pagination.

        Each HTTP request first checks in with the rate limiter.
        """
        url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}"
        items = []
        while url:
            self.rate_limiter.wait_if_needed()
            response = self.session.get(url)
            response.raise_for_status()
            items.extend(response.json())
            # requests parses the RFC 5988 Link header; absent 'next' ends loop.
            url = response.links.get('next', {}).get('url')
        return items

    def map_reduce_analysis(self, owner: str, repo: str, files: List[Dict], progress: gr.Progress):
        """
        Analyze files in parallel (map phase) and aggregate results (reduce phase).

        Uses a ThreadPoolExecutor because the per-file work is I/O-bound
        (HTTP downloads and LLM API calls release the GIL while waiting).

        Returns:
            (tree_markdown, details_markdown, summary_markdown) strings.
        """
        # --- MAP PHASE ---
        # Each file is processed independently in a separate thread.
        logger.info(f"Starting Map phase: Analyzing {len(files)} files in parallel.")
        with ThreadPoolExecutor(max_workers=10) as executor:
            future_to_file = {executor.submit(self._process_single_file, owner, repo, file_item): file_item for file_item in files}

            # tqdm progress tracking integrated with Gradio
            pbar = tqdm(as_completed(future_to_file), total=len(files), desc="Analyzing Files")
            for future in pbar:
                try:
                    file_info = future.result()
                    if file_info:
                        # Store the result of the map phase
                        self.file_contents[file_info.path] = file_info
                        pbar.set_description(f"Analyzed {file_info.name}")
                    # Update Gradio progress bar; guard against a zero total
                    # (empty file list) to avoid ZeroDivisionError.
                    if pbar.total:
                        progress(pbar.n / pbar.total, desc=pbar.desc)
                except Exception as e:
                    file_item = future_to_file[future]
                    logger.error(f"Error processing {file_item['path']}: {e}")

        # --- REDUCE PHASE ---
        # Aggregate the mapped results into the tree, details, and summary.
        logger.info("Reduce phase: Aggregating results.")
        tree = self._create_directory_tree(owner, repo)
        details = self._format_detailed_explanations()
        summary = self._format_summary(owner, repo)

        return tree, details, summary

    def _process_single_file(self, owner: str, repo: str, file_item: Dict) -> Optional[FileInfo]:
        """Download, filter, and analyze one file.

        Returns None when the file is skipped (type/size filter) or could not
        be fetched/decoded.
        """
        file_path = file_item['path']
        file_size = file_item.get('size', 0)

        if not self._should_analyze_file(file_path, file_size):
            return None

        content = self._get_raw_file(owner, repo, file_path)
        if content is None:
            return None

        explanation = self._analyze_code_with_llm(content, file_path)

        return FileInfo(
            path=file_path, name=file_item['name'], content=content,
            explanation=explanation, size=file_size, file_type=os.path.splitext(file_path)[1]
        )

    def _should_analyze_file(self, file_path: str, file_size: int) -> bool:
        """Return True if the file passes the size cap and extension/name filter."""
        if file_size > 1024 * 1024: return False  # Skip files > 1MB
        file_name = os.path.basename(file_path)
        _, file_ext = os.path.splitext(file_name)
        # Exact-name match covers entries like 'requirements.txt' in the set.
        return file_ext.lower() in self.analyzable_extensions or file_name in self.analyzable_extensions

    def _get_raw_file(self, owner: str, repo: str, file_path: str) -> Optional[str]:
        """Fetch raw file content, trying the 'main' then 'master' branch.

        Returns None for binary-looking content (NUL byte present) or when
        the file cannot be fetched/decoded from either branch.
        """
        for branch in ('main', 'master'):
            url = f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}/{file_path}"
            try:
                response = self.session.get(url, timeout=10)
                if response.status_code == 200:
                    # Decode once; response.text re-decodes the body on each access.
                    text = response.text
                    # Simple heuristic: a NUL byte marks binary content.
                    return text if '\x00' not in text else None
            except (requests.RequestException, UnicodeDecodeError) as e:
                logger.warning(f"Could not fetch or decode {file_path} from branch {branch}: {e}")
        return None

    def _analyze_code_with_llm(self, code: str, file_path: str) -> str:
        """Ask the Groq LLM to explain the given code; never raises.

        Returns the explanation text, or a human-readable error/skip message.
        """
        if not self.groq_client:
            return "Analysis skipped: Groq API key not provided."

        # Truncate to keep the prompt within the model's context window.
        max_code_length = 8000
        if len(code) > max_code_length: code = code[:max_code_length] + "\n... (truncated)"

        prompt = f"""Analyze the following code from file '{file_path}'. Provide a concise explanation of its functionality, purpose, and key components.
        ```
        {code}
        ```
        Structure your analysis with these points:
        1. **Main Purpose**: What is the primary goal of this file?
        2. **Key Functions/Classes**: What are the main components and what do they do?
        3. **Overall Role**: How does this file fit into the larger project?
        """
        try:
            chat_completion = self.groq_client.chat.completions.create(
                messages=[{"role": "user", "content": prompt}],
                model="llama3-8b-8192",
                temperature=0.2, max_tokens=1024
            )
            return chat_completion.choices[0].message.content.strip()
        except GroqError as e:
            logger.error(f"Groq API error for {file_path}: {e}")
            # Not every GroqError subclass exposes a `.message` attribute;
            # formatting the exception itself is always safe.
            return f"Error: Groq API request failed - {e}"
        except Exception as e:
            logger.error(f"Error calling Groq API for {file_path}: {e}")
            return f"Error: {e}"

    def _create_directory_tree(self, owner: str, repo: str) -> str:
        """Render the analyzed files as an ASCII tree inside a Markdown fence.

        Node IDs are full paths prefixed with "owner/repo" so names can
        repeat across directories without colliding.
        """
        tree = Tree()
        root_id = f"{owner}/{repo}"
        tree.create_node(f"🌳 {root_id}", root_id)
        created_nodes = {root_id}

        for file_path in sorted(self.file_contents.keys()):
            path_parts = file_path.split('/')
            parent_id = root_id
            # Create any missing intermediate directory nodes.
            for i, part in enumerate(path_parts[:-1]):
                node_id = f"{root_id}/{'/'.join(path_parts[:i+1])}"
                if node_id not in created_nodes:
                    tree.create_node(f"πŸ“ {part}", node_id, parent=parent_id)
                    created_nodes.add(node_id)
                parent_id = node_id

            file_name = path_parts[-1]
            file_type = self.file_contents[file_path].file_type
            emoji = self.get_file_emoji(file_type)
            tree.create_node(f"{emoji} {file_name}", f"{root_id}/{file_path}", parent=parent_id)

        return f"```\n{tree.show(line_type='ascii-ex')}\n```"

    def _format_detailed_explanations(self) -> str:
        """Format all per-file explanations into one Markdown document."""
        if not self.file_contents: return "No files were analyzed."

        output = []
        for path, info in sorted(self.file_contents.items()):
            output.append(f"### πŸ“„ `{path}`")
            output.append(f"**Size**: {info.size:,} bytes")
            output.append("---")
            output.append(info.explanation)
            output.append("\n---\n")
        return "\n".join(output)

    def _format_summary(self, owner: str, repo: str) -> str:
        """Create a short Markdown summary (file count and total bytes)."""
        total_files = len(self.file_contents)
        total_size = sum(info.size for info in self.file_contents.values())
        return (
            f"## Analysis Summary for `{owner}/{repo}`\n"
            f"- **Total Files Analyzed**: {total_files}\n"
            f"- **Total Code Size Analyzed**: {total_size:,} bytes"
        )

    @staticmethod
    def get_file_emoji(file_type: str) -> str:
        """Return a display emoji for a file extension (πŸ“„ as the fallback)."""
        emoji_map = {
            '.py': '🐍', '.js': '🟨', '.ts': 'πŸ”·', '.java': 'β˜•', '.html': '🌐',
            '.css': '🎨', '.json': 'πŸ“‹', '.md': 'πŸ“', '.sh': '🐚', '.yml': 'βš™οΈ',
            '.yaml': 'βš™οΈ', '.dockerfile': '🐳', '.sql': 'πŸ—„οΈ', 'requirements.txt': 'πŸ“¦'
        }
        return emoji_map.get(file_type.lower(), 'πŸ“„')

class RateLimiter:
    """Sliding-window rate limiter for outbound API calls.

    Permits at most ``max_calls`` calls within any ``time_window``-second
    window; ``wait_if_needed`` blocks (sleeps) until a call is allowed.
    """
    def __init__(self, max_calls: int, time_window: int):
        self.max_calls = max_calls
        self.time_window = time_window
        self.calls = []

    def wait_if_needed(self):
        """Block until a call is permitted, then record its timestamp."""
        current = time.time()
        cutoff = current - self.time_window
        # Drop timestamps that have fallen out of the sliding window.
        self.calls = [stamp for stamp in self.calls if stamp > cutoff]
        if len(self.calls) >= self.max_calls:
            # Sleep until the oldest recorded call ages out of the window.
            pause = self.time_window - (current - self.calls[0])
            if pause > 0:
                logger.info(f"Rate limit reached. Sleeping for {pause:.2f} seconds.")
                time.sleep(pause)
        self.calls.append(time.time())

# --- Gradio Interface ---
def analyze_repo_gradio(repo_url: str, github_token: str, groq_key: str, progress=gr.Progress(track_tqdm=True)):
    """Entry point wired to the Gradio 'Analyze' button.

    Validates inputs, runs the full analysis pipeline, and returns the
    three Markdown strings (tree, details, summary) for the output tabs.
    On any failure a message is returned in the first output slot.
    """
    # Input validation: surface a prompt in the first output slot.
    validation_error = None
    if not repo_url:
        validation_error = "Please enter a GitHub repository URL."
    elif not groq_key:
        validation_error = "Please enter your Groq API Key."
    if validation_error is not None:
        return validation_error, "", ""

    try:
        analyzer = GitHubRepositoryAnalyzer(github_token=github_token, groq_api_key=groq_key)

        progress(0, desc="Extracting repo info...")
        owner, repo = analyzer.extract_repo_info(repo_url)

        progress(0.1, desc="Fetching repository file structure...")
        repo_files = analyzer.get_repository_structure(owner, repo)
        if not repo_files:
            return "Could not retrieve repository structure. Check URL or token.", "", ""

        return analyzer.map_reduce_analysis(owner, repo, repo_files, progress)
    except Exception as e:
        logger.error(f"A critical error occurred: {e}", exc_info=True)
        return f"An error occurred: {e}", "", ""

def create_gradio_interface():
    """Builds and returns the Gradio web interface.

    Layout: one input column (URL + collapsible API-key accordion + button)
    above three output tabs (summary, file tree, detailed analysis). The
    button is wired to `analyze_repo_gradio`.
    """
    with gr.Blocks(theme=gr.themes.Soft(), title="GitHub Repo Analyzer") as demo:
        gr.Markdown("# πŸ€– AI-Powered GitHub Repository Analyzer")
        gr.Markdown("Enter a GitHub repository URL to generate a file tree, get AI-powered explanations for each file, and see a summary.")

        with gr.Row():
            with gr.Column(scale=2):
                repo_url = gr.Textbox(label="GitHub Repository URL", placeholder="e.g., https://github.com/google/generative-ai-python")
                # Keys are optional but strongly recommended: the GitHub token
                # raises the API rate limit and the Groq key enables analysis.
                with gr.Accordion("API Keys (Optional but Recommended)", open=False):
                    github_token = gr.Textbox(label="GitHub Token", placeholder="Enter your GitHub token for a higher rate limit", type="password")
                    groq_key = gr.Textbox(label="Groq API Key", placeholder="Enter your Groq API key for code analysis", type="password")

                analyze_btn = gr.Button("Analyze Repository", variant="primary")

        with gr.Tabs():
            with gr.TabItem("πŸ“Š Summary"):
                summary_output = gr.Markdown()
            with gr.TabItem("🌳 File Tree"):
                tree_output = gr.Markdown()
            with gr.TabItem("πŸ“„ Detailed Analysis"):
                details_output = gr.Markdown()

        # NOTE: the handler returns (tree, details, summary), so the outputs
        # list is deliberately ordered tree -> details -> summary even though
        # the tabs render Summary first.
        analyze_btn.click(
            fn=analyze_repo_gradio,
            inputs=[repo_url, github_token, groq_key],
            outputs=[tree_output, details_output, summary_output]
        )
    return demo

# Script entry point: build the UI and serve it with debug logging enabled.
if __name__ == "__main__":
    demo = create_gradio_interface()
    demo.launch(debug=True)