Phoenix21 commited on
Commit
a1cb144
·
verified ·
1 Parent(s): 5980642

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +345 -0
app.py ADDED
@@ -0,0 +1,345 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import time
3
+ import os
4
+ from urllib.parse import urlparse
5
+ from treelib import Tree
6
+ from typing import Dict, List, Optional, Tuple
7
+ import logging
8
+ from dataclasses import dataclass
9
+ from concurrent.futures import ThreadPoolExecutor, as_completed
10
+ from groq import Groq, GroqError
11
+ import gradio as gr
12
+ from tqdm.auto import tqdm
13
+
14
# --- Basic Configuration ---
# Module-wide logging: timestamped INFO-level messages for the whole app.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
17
+
18
# --- Data Structures ---
@dataclass
class FileInfo:
    """Data class to store file information"""
    path: str         # repository-relative path, e.g. "src/app.py"
    name: str         # base filename
    content: str      # raw file text fetched from raw.githubusercontent.com
    explanation: str  # LLM-generated analysis, or a skip/error message
    size: int         # file size in bytes as reported by the GitHub API
    file_type: str    # extension including the dot, e.g. ".py" (may be "")
28
+
29
# --- Core Application Logic ---
class GitHubRepositoryAnalyzer:
    """
    Analyzes a GitHub repository: fetches the file structure via the GitHub
    API, downloads raw file contents, and uses a Groq-hosted LLM to explain
    each file.

    Results accumulate in ``self.file_contents`` (path -> FileInfo) and are
    rendered as a directory tree, per-file explanations, and a summary.
    """

    def __init__(self, github_token: Optional[str] = None, groq_api_key: Optional[str] = None):
        """
        Args:
            github_token: optional GitHub personal access token; raises the
                API quota from 60 to 5000 requests/hour.
            groq_api_key: optional Groq API key; without it, per-file LLM
                analysis is skipped (files are still listed).
        """
        self.github_token = github_token
        self.session = requests.Session()
        self.file_contents: Dict[str, FileInfo] = {}

        # Configure GitHub API access. The limiter budgets slightly below
        # each documented cap to leave headroom for retries.
        if self.github_token:
            logger.info("Using provided GitHub token for higher rate limits.")
            self.session.headers.update({'Authorization': f'token {self.github_token}'})
            # Authenticated GitHub API: 5000 requests/hour
            self.rate_limiter = RateLimiter(max_calls=4500, time_window=3600)
        else:
            logger.warning("No GitHub token. Using unauthenticated access with lower rate limits.")
            # Unauthenticated: 60 requests/hour
            self.rate_limiter = RateLimiter(max_calls=50, time_window=3600)

        # Configure Groq client; analysis degrades gracefully without a key.
        if groq_api_key:
            self.groq_client = Groq(api_key=groq_api_key)
            logger.info("Groq client initialized.")
        else:
            self.groq_client = None
            logger.warning("Groq API key not provided. Code analysis will be skipped.")

        # Extensions (and a few exact filenames, e.g. "requirements.txt")
        # considered worth analyzing.
        self.analyzable_extensions = {
            '.py', '.js', '.ts', '.java', '.cpp', '.c', '.h', '.cs', '.php',
            '.rb', '.go', '.rs', '.swift', '.kt', '.scala', '.sh', '.bash',
            '.sql', '.html', '.css', '.scss', '.less', '.json', '.xml', '.yaml', '.yml',
            '.md', '.txt', '.cfg', '.conf', '.ini', '.env', '.dockerfile', 'requirements.txt'
        }

    def extract_repo_info(self, repo_url: str) -> Tuple[str, str]:
        """Extract (owner, repository) from a GitHub URL.

        Raises:
            ValueError: if the URL path does not contain at least owner/repo.
        """
        try:
            parsed_url = urlparse(repo_url)
            path = parsed_url.path.strip('/')
            # Strip only a TRAILING ".git" suffix. The previous
            # path.replace('.git', '') removed the substring anywhere,
            # corrupting names such as "my.github-tools".
            if path.endswith('.git'):
                path = path[:-len('.git')]
            parts = path.split('/')
            if len(parts) >= 2:
                return parts[0], parts[1]
            raise ValueError("Invalid repository URL format")
        except Exception as e:
            logger.error(f"Error parsing repository URL: {e}")
            raise

    def get_repository_structure(self, owner: str, repo: str, path: str = "") -> List[Dict]:
        """Recursively fetch the entire file structure of the repository.

        Errors on a subdirectory are logged and skipped, so a partial
        listing may be returned rather than failing the whole walk.
        """
        all_files = []
        try:
            contents = self._fetch_contents(owner, repo, path)
            for item in contents:
                if item['type'] == 'dir':
                    all_files.extend(self.get_repository_structure(owner, repo, item['path']))
                else:
                    all_files.append(item)
        except Exception as e:
            logger.error(f"Failed to get repository structure for {path}: {e}")
        return all_files

    def _fetch_contents(self, owner: str, repo: str, path: str) -> List[Dict]:
        """Fetch contents of one directory, following RFC 5988 pagination links."""
        url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}"
        items = []
        while url:
            self.rate_limiter.wait_if_needed()
            response = self.session.get(url)
            response.raise_for_status()
            items.extend(response.json())
            # requests parses the Link header; absent 'next' ends the loop.
            url = response.links.get('next', {}).get('url')
        return items

    def map_reduce_analysis(self, owner: str, repo: str, files: List[Dict], progress: "gr.Progress"):
        """
        Analyzes files in parallel (Map phase) and aggregates results
        (Reduce phase).

        Uses a ThreadPoolExecutor because the per-file work is I/O-bound
        (raw-content download + LLM API call).

        Returns:
            (tree_markdown, details_markdown, summary_markdown)
        """
        # --- MAP PHASE ---
        # Each file is processed independently in a separate thread.
        logger.info(f"Starting Map phase: Analyzing {len(files)} files in parallel.")
        with ThreadPoolExecutor(max_workers=10) as executor:
            future_to_file = {executor.submit(self._process_single_file, owner, repo, file_item): file_item for file_item in files}

            # tqdm progress tracking integrated with Gradio.
            pbar = tqdm(as_completed(future_to_file), total=len(files), desc="Analyzing Files")
            for future in pbar:
                try:
                    file_info = future.result()
                    if file_info:
                        # Store the result of the map phase.
                        self.file_contents[file_info.path] = file_info
                        pbar.set_description(f"Analyzed {file_info.name}")
                        # Update the Gradio progress bar; guard against a
                        # zero total to avoid ZeroDivisionError.
                        if pbar.total:
                            progress(pbar.n / pbar.total, desc=pbar.desc)
                except Exception as e:
                    file_item = future_to_file[future]
                    logger.error(f"Error processing {file_item['path']}: {e}")

        # --- REDUCE PHASE ---
        # Aggregate and structure the mapped results into the three outputs.
        logger.info("Reduce phase: Aggregating results.")
        tree = self._create_directory_tree(owner, repo)
        details = self._format_detailed_explanations()
        summary = self._format_summary(owner, repo)

        return tree, details, summary

    def _process_single_file(self, owner: str, repo: str, file_item: Dict) -> Optional["FileInfo"]:
        """Process one file: filter, download, analyze. Returns None when skipped."""
        file_path = file_item['path']
        file_size = file_item.get('size', 0)

        if not self._should_analyze_file(file_path, file_size):
            return None

        content = self._get_raw_file(owner, repo, file_path)
        if content is None:
            return None

        explanation = self._analyze_code_with_llm(content, file_path)

        return FileInfo(
            path=file_path, name=file_item['name'], content=content,
            explanation=explanation, size=file_size, file_type=os.path.splitext(file_path)[1]
        )

    def _should_analyze_file(self, file_path: str, file_size: int) -> bool:
        """Decide whether a file is worth analyzing (extension/name + size)."""
        if file_size > 1024 * 1024:
            return False  # Skip files > 1MB
        file_name = os.path.basename(file_path)
        _, file_ext = os.path.splitext(file_name)
        # Exact-filename match is case-insensitive so e.g. "Requirements.txt"
        # is also picked up (superset of the old case-sensitive check).
        return file_ext.lower() in self.analyzable_extensions or file_name.lower() in self.analyzable_extensions

    def _get_raw_file(self, owner: str, repo: str, file_path: str) -> Optional[str]:
        """Fetch raw file content, trying the 'main' then 'master' branch.

        Returns None for unreachable files or likely-binary content.
        """
        for branch in ['main', 'master']:
            url = f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}/{file_path}"
            try:
                response = self.session.get(url, timeout=10)
                if response.status_code == 200:
                    # Simple binary check: a NUL byte means not text.
                    return response.text if '\x00' not in response.text else None
            except (requests.RequestException, UnicodeDecodeError) as e:
                logger.warning(f"Could not fetch or decode {file_path} from branch {branch}: {e}")
        return None

    def _analyze_code_with_llm(self, code: str, file_path: str) -> str:
        """Analyze code using the Groq LLM API; returns a Markdown explanation."""
        if not self.groq_client:
            return "Analysis skipped: Groq API key not provided."

        # Truncate oversized files so the prompt fits the model context.
        max_code_length = 8000
        if len(code) > max_code_length:
            code = code[:max_code_length] + "\n... (truncated)"

        prompt = f"""Analyze the following code from file '{file_path}'. Provide a concise explanation of its functionality, purpose, and key components.
```
{code}
```
Structure your analysis with these points:
1. **Main Purpose**: What is the primary goal of this file?
2. **Key Functions/Classes**: What are the main components and what do they do?
3. **Overall Role**: How does this file fit into the larger project?
"""
        try:
            chat_completion = self.groq_client.chat.completions.create(
                messages=[{"role": "user", "content": prompt}],
                model="llama3-8b-8192",
                temperature=0.2, max_tokens=1024
            )
            return chat_completion.choices[0].message.content.strip()
        except GroqError as e:
            logger.error(f"Groq API error for {file_path}: {e}")
            # Use str(e): not every GroqError subclass has a .message attribute.
            return f"Error: Groq API request failed - {e}"
        except Exception as e:
            logger.error(f"Error calling Groq API for {file_path}: {e}")
            return f"Error: {e}"

    def _create_directory_tree(self, owner: str, repo: str) -> str:
        """Render the analyzed files as an ASCII directory tree in a code fence."""
        tree = Tree()
        root_id = f"{owner}/{repo}"
        tree.create_node(f"🌳 {root_id}", root_id)
        created_nodes = {root_id}

        for file_path in sorted(self.file_contents.keys()):
            path_parts = file_path.split('/')
            parent_id = root_id
            # Create any missing intermediate directory nodes.
            for i, part in enumerate(path_parts[:-1]):
                node_id = f"{root_id}/{'/'.join(path_parts[:i+1])}"
                if node_id not in created_nodes:
                    tree.create_node(f"📁 {part}", node_id, parent=parent_id)
                    created_nodes.add(node_id)
                parent_id = node_id

            file_name = path_parts[-1]
            file_type = self.file_contents[file_path].file_type
            emoji = self.get_file_emoji(file_type)
            tree.create_node(f"{emoji} {file_name}", f"{root_id}/{file_path}", parent=parent_id)

        return f"```\n{tree.show(line_type='ascii-ex')}\n```"

    def _format_detailed_explanations(self) -> str:
        """Format all file explanations into a single Markdown string."""
        if not self.file_contents:
            return "No files were analyzed."

        output = []
        for path, info in sorted(self.file_contents.items()):
            output.append(f"### 📄 `{path}`")
            output.append(f"**Size**: {info.size:,} bytes")
            output.append("---")
            output.append(info.explanation)
            output.append("\n---\n")
        return "\n".join(output)

    def _format_summary(self, owner: str, repo: str) -> str:
        """Create a short Markdown summary of the analysis run."""
        total_files = len(self.file_contents)
        total_size = sum(info.size for info in self.file_contents.values())
        return (
            f"## Analysis Summary for `{owner}/{repo}`\n"
            f"- **Total Files Analyzed**: {total_files}\n"
            f"- **Total Code Size Analyzed**: {total_size:,} bytes"
        )

    @staticmethod
    def get_file_emoji(file_type: str) -> str:
        """Return an emoji for a given file type; 📄 for unknown types."""
        emoji_map = {
            '.py': '🐍', '.js': '🟨', '.ts': '🔷', '.java': '☕', '.html': '🌐',
            '.css': '🎨', '.json': '📋', '.md': '📝', '.sh': '🐚', '.yml': '⚙️',
            '.yaml': '⚙️', '.dockerfile': '🐳', '.sql': '🗄️', 'requirements.txt': '📦'
        }
        return emoji_map.get(file_type.lower(), '📄')
269
+
270
class RateLimiter:
    """Sliding-window rate limiter.

    Tracks call timestamps within a rolling window and blocks (sleeps)
    when the per-window quota would otherwise be exceeded.
    """

    def __init__(self, max_calls: int, time_window: int):
        self.max_calls = max_calls      # calls allowed per window
        self.time_window = time_window  # window length, in seconds
        self.calls = []                 # ascending timestamps of recent calls

    def wait_if_needed(self):
        """Record one call, sleeping first if the window quota is exhausted."""
        current = time.time()
        # Timestamps are appended in order, so expired entries are always
        # at the front; drop them from the left.
        while self.calls and current - self.calls[0] >= self.time_window:
            self.calls.pop(0)
        if len(self.calls) >= self.max_calls:
            # Sleep until the oldest remaining call ages out of the window.
            sleep_time = self.time_window - (current - self.calls[0])
            if sleep_time > 0:
                logger.info(f"Rate limit reached. Sleeping for {sleep_time:.2f} seconds.")
                time.sleep(sleep_time)
        self.calls.append(time.time())
286
+
287
# --- Gradio Interface ---
def analyze_repo_gradio(repo_url: str, github_token: str, groq_key: str, progress=gr.Progress(track_tqdm=True)):
    """The main function executed by the Gradio interface.

    Returns:
        (tree_markdown, details_markdown, summary_markdown) — must match the
        click wiring outputs=[tree_output, details_output, summary_output].
    """
    # FIX: validation/error messages are returned in the THIRD slot so they
    # render on the "Summary" tab (the tab shown by default). Previously they
    # were returned first and landed on the hidden "File Tree" tab.
    if not repo_url:
        return "", "", "Please enter a GitHub repository URL."
    if not groq_key:
        return "", "", "Please enter your Groq API Key."

    try:
        analyzer = GitHubRepositoryAnalyzer(github_token=github_token, groq_api_key=groq_key)

        progress(0, desc="Extracting repo info...")
        owner, repo = analyzer.extract_repo_info(repo_url)

        progress(0.1, desc="Fetching repository file structure...")
        all_files = analyzer.get_repository_structure(owner, repo)
        if not all_files:
            return "", "", "Could not retrieve repository structure. Check URL or token."

        tree, details, summary = analyzer.map_reduce_analysis(owner, repo, all_files, progress)

        return tree, details, summary
    except Exception as e:
        logger.error(f"A critical error occurred: {e}", exc_info=True)
        return "", "", f"An error occurred: {e}"
312
+
313
def create_gradio_interface():
    """Builds and returns the Gradio web interface.

    Returns:
        The assembled (not yet launched) gr.Blocks app.
    """
    with gr.Blocks(theme=gr.themes.Soft(), title="GitHub Repo Analyzer") as demo:
        gr.Markdown("# 🤖 AI-Powered GitHub Repository Analyzer")
        gr.Markdown("Enter a GitHub repository URL to generate a file tree, get AI-powered explanations for each file, and see a summary.")

        with gr.Row():
            with gr.Column(scale=2):
                repo_url = gr.Textbox(label="GitHub Repository URL", placeholder="e.g., https://github.com/google/generative-ai-python")
                # Both keys are optional: the GitHub token raises API rate
                # limits; the Groq key enables per-file LLM analysis.
                with gr.Accordion("API Keys (Optional but Recommended)", open=False):
                    github_token = gr.Textbox(label="GitHub Token", placeholder="Enter your GitHub token for a higher rate limit", type="password")
                    groq_key = gr.Textbox(label="Groq API Key", placeholder="Enter your Groq API key for code analysis", type="password")

        analyze_btn = gr.Button("Analyze Repository", variant="primary")

        with gr.Tabs():
            with gr.TabItem("📊 Summary"):
                summary_output = gr.Markdown()
            with gr.TabItem("🌳 File Tree"):
                tree_output = gr.Markdown()
            with gr.TabItem("📄 Detailed Analysis"):
                details_output = gr.Markdown()

        # Output order must match the tuple returned by analyze_repo_gradio.
        analyze_btn.click(
            fn=analyze_repo_gradio,
            inputs=[repo_url, github_token, groq_key],
            outputs=[tree_output, details_output, summary_output]
        )
    return demo
342
+
343
# Script entry point: build the UI and serve it with debug logging enabled.
if __name__ == "__main__":
    create_gradio_interface().launch(debug=True)