diff --git "a/app2.py" "b/app2.py"
--- "a/app2.py"
+++ "b/app2.py"
@@ -1,3798 +1,1358 @@
-import gradio as gr
-import os
-import aiohttp
-import asyncio
-from git import Repo, GitCommandError, InvalidGitRepositoryError, NoSuchPathError
-from pathlib import Path
-from datetime import datetime, timedelta, timezone
-import shutil
import json
-import logging
+import os
import re
-from typing import Dict, List, Optional, Tuple, Any
-import subprocess
-import plotly.express as px
-import plotly.graph_objects as go
import time
-import random
-import pandas as pd
-from collections import Counter
-import string
-from concurrent.futures import ThreadPoolExecutor
-from hdbscan import HDBSCAN
-# --- Required Imports ---
-import hashlib
-from functools import lru_cache
-import threading
-from http.server import HTTPServer, BaseHTTPRequestHandler
-import markdown2
-import websockets
-from websockets.server import WebSocketServerProtocol
-from websockets.exceptions import ConnectionClosed, ConnectionClosedOK, ConnectionAbortedError, ConnectionResetError, WebSocketException
-import signal # For graceful shutdown
-# ---------------------
-
-# Assuming code_editor is available, e.g., installed via pip or included locally
+import logging
+import mimetypes
+import zipfile
+import tempfile
+import chardet
+import io  # For in-memory text streams (e.g. parsing CSV data from strings)
+import csv  # For CSV parsing
+import xml.etree.ElementTree as ET  # For XML parsing
+from datetime import datetime
+from typing import List, Dict, Optional, Union, Tuple, Any # Added Any for extracted_data
+from pathlib import Path
+from urllib.parse import urlparse, urljoin
+import requests
+import validators
+import gradio as gr
+from diskcache import Cache
+from bs4 import BeautifulSoup
+from fake_useragent import UserAgent
+from cleantext import clean
+import qrcode
+from PIL import Image, ImageDraw, ImageFont
+import numpy as np
+import tarfile
+import gzip
+import math
+
+# Setup enhanced logging with more detailed formatting
+logging.basicConfig(
+ level=logging.INFO,
+ format='%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s',
+ handlers=[
+ logging.StreamHandler(),
+ logging.FileHandler('app.log', encoding='utf-8')
+ ])
+logger = logging.getLogger(__name__) # logger is now defined here
+
+# Conditional imports for document processing
try:
- from code_editor import code_editor
+ from PyPDF2 import PdfReader
+ PDF_SUPPORT = True
except ImportError:
- logging.error("The 'code_editor' Gradio component is not installed or available.")
- logging.error("Please install it, e.g., 'pip install gradio_code_editor'")
- def code_editor(*args, **kwargs):
- logging.warning("Using dummy code_editor. Code editing and collaboration will not function.")
- # Create a dummy component that looks like a Textbox but is non-interactive
- return gr.Textbox(label=kwargs.get('label', 'Code Editor (Unavailable)'), interactive=False, value="Error: Code editor component not found. Install 'gradio_code_editor'.", lines=10)
-
-
-# ========== Configuration ==========
-WORKSPACE = Path("./issue_workspace")
-WORKSPACE.mkdir(exist_ok=True)
-GITHUB_API = "https://api.github.com/repos"
-HF_INFERENCE_API = "https://api-inference.huggingface.co/models"
-WEBHOOK_PORT = int(os.environ.get("WEBHOOK_PORT", 8000))
-WS_PORT = int(os.environ.get("WS_PORT", 8001))
-GRADIO_PORT = int(os.environ.get("GRADIO_PORT", 7860))
-
-# Configure logging
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
-logger = logging.getLogger(__name__)
-# Set a higher logging level for libraries that are too verbose, e.g.:
-logging.getLogger("websockets").setLevel(logging.WARNING)
-logging.getLogger("aiohttp").setLevel(logging.WARNING)
-logging.getLogger("urllib3").setLevel(logging.WARNING) # Might be used by git or other libs
-logging.getLogger("git").setLevel(logging.WARNING)
-logging.getLogger("hdbscan").setLevel(logging.WARNING) # HDBSCAN can be chatty
-
-
-# Use ThreadPoolExecutor for any synchronous blocking operations if needed,
-# but most heavy lifting (API calls, git) is now async.
-executor = ThreadPoolExecutor(max_workers=4)
-
-
-# Example HF models (replace with your actual models)
-# Ensure these models are suitable for the tasks (text generation, embeddings)
-HF_MODELS = {
- "Mistral-8x7B": "mistralai/Mixtral-8x7B-Instruct-v0.1",
- "Llama-2-7B-chat": "huggingface/llama-2-7b-chat-hf",
- "CodeLlama-34B": "codellama/CodeLlama-34b-Instruct-hf",
- "Gemma-7B-it": "google/gemma-7b-it", # Added another option
-}
-# Embedding model ID - fixed, not user selectable
-HF_EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"
-
-# Select default models defensively
-DEFAULT_MODEL_KEY = "Mistral-8x7B" if "Mistral-8x7B" in HF_MODELS else (list(HF_MODELS.keys())[0] if HF_MODELS else None)
-DEFAULT_MODEL_ID = HF_MODELS.get(DEFAULT_MODEL_KEY, None)
-
-DEFAULT_IDLE_MODEL_KEY = "Gemma-7B-it" if "Gemma-7B-it" in HF_MODELS else DEFAULT_MODEL_KEY # Prefer a smaller one if available
-DEFAULT_IDLE_MODEL_ID = HF_MODELS.get(DEFAULT_IDLE_MODEL_KEY, DEFAULT_MODEL_ID)
-
-if not HF_MODELS:
- logger.critical("No HF models configured! AI features will be disabled.")
-elif DEFAULT_MODEL_ID is None:
- logger.critical(f"Default model key '{DEFAULT_MODEL_KEY}' not found in configured models. AI features may be limited.")
-if DEFAULT_IDLE_MODEL_ID is None:
- logger.warning(f"Idle model key '{DEFAULT_IDLE_MODEL_KEY}' not found or no models configured. Idle tasks may be disabled or use the default model if available.")
-
-
-# --- Idle State Configuration ---
-STALE_ISSUE_THRESHOLD_DAYS = 30
-MAX_SUMMARY_COMPUTATIONS_PER_CYCLE = 2
-MAX_CONTEXT_COMPUTATIONS_PER_CYCLE = 3
-MAX_MISSING_INFO_COMPUTATIONS_PER_CYCLE = 1
-MAX_ANALYSIS_COMPUTATIONS_PER_CYCLE = 1
-RECLUSTER_THRESHOLD = 5 # Number of significant webhook changes before re-clustering is flagged
-IDLE_PROCESSING_INTERVAL_SECONDS = 60.0 # How often the idle task loop runs
-
-# ========== Placeholder OTCodeEditor Class ==========
-# WARNING: This is a placeholder and DOES NOT implement Operational Transformation.
-# Concurrent edits WILL lead to data loss or inconsistencies.
-class OTCodeEditor:
- """
- Placeholder for an Operational Transformation (OT) enabled code editor backend.
- This implementation is NOT thread-safe and does NOT handle concurrent edits correctly.
- It merely logs received deltas and maintains a basic revision counter.
- The actual document state is held client-side by the Gradio code_editor component.
- A real collaborative editor would require a robust OT backend to manage
- the authoritative document state and transform operations.
- """
- def __init__(self, initial_value: Dict[str, str]):
- # In a real OT system, this would initialize the document state
- # For this placeholder, we just store the initial files dict
- self.files: Dict[str, str] = initial_value.copy()
- self.revision = 0 # Basic revision counter, not used for OT logic
- logger.debug(f"OTCodeEditor initialized with files: {list(self.files.keys())}")
-
- def apply_delta(self, delta: Dict[str, Any]):
- # VERY basic placeholder: This logs the delta but does NOT perform OT.
- # It does NOT handle concurrent edits safely.
- # In a real OT system, this would transform the delta against the current state
- # and apply it, incrementing the revision based on successful application.
- logger.warning(f"Placeholder apply_delta called. Delta: {str(delta)[:200]}. "
- "WARNING: Full Operational Transformation is NOT implemented. Concurrent edits are UNSAFE.")
- # The Gradio component holds the actual text state client-side.
- # A real OT backend would use the delta to update its authoritative state.
- # We only increment revision for basic tracking visibility if needed.
- self.revision += 1
-
- def get_content(self) -> Dict[str, str]:
- # This is not used by the current Gradio code_editor integration,
- # as the component holds the state client-side.
- # In a real OT system, this would return the current authoritative document state.
- return self.files.copy()
-
-# ========== Modern Theme ==========
+ PDF_SUPPORT = False
+ logger.warning("PyPDF2 not installed. PDF file processing will be limited.") # logger is available
+
try:
- theme = gr.themes.Soft(
- primary_hue="violet",
- secondary_hue="emerald",
- radius_size="lg",
- font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui"]
- ).set(
- button_primary_background_fill="linear-gradient(90deg, #8B5CF6 0%, #EC4899 100%)",
- button_primary_text_color="white",
- block_label_text_size="lg",
- block_label_text_weight="600",
- block_title_text_size="lg",
- block_title_text_weight="800",
- panel_background_fill="white",
- block_shadow="*shadow_drop_lg",
- # Add some more modern touches
- input_background_fill="#f9fafb",
- input_border_color="#e5e7eb",
- input_border_radius="md",
- button_secondary_background_fill="#f3f4f6",
- button_secondary_text_color="#374151",
- button_secondary_border_color="#d1d5db",
- table_border_color="#e5e7eb",
- table_row_background_even="#f9fafb",
- table_row_background_odd="#ffffff",
- # Use slightly softer colors for plots if default is too bright
- # (This might need specific Plotly config instead of theme)
- )
-except AttributeError as e:
- logger.warning(f"Could not apply all theme settings (might be Gradio version difference): {e}. Using default Soft theme.")
- theme = gr.themes.Soft()
-
-
-# ========== Enhanced Webhook Handler ==========
-class WebhookHandler(BaseHTTPRequestHandler):
- manager_instance: Optional['IssueManager'] = None
- main_loop: Optional[asyncio.AbstractEventLoop] = None # Store reference to the main asyncio loop
-
- def do_POST(self):
- content_length = int(self.headers.get('Content-Length', 0))
- if content_length == 0:
- self.send_response(400)
- self.send_header("Content-type", "text/plain")
- self.end_headers()
- self.wfile.write(b"Empty payload")
- logger.warning("Received empty webhook payload.")
- return
+ from docx import Document
+ DOCX_SUPPORT = True
+except ImportError:
+ DOCX_SUPPORT = False
+ logger.warning("python-docx not installed. DOCX file processing will be limited.") # logger is available
- try:
- payload_bytes = self.rfile.read(content_length)
- payload = json.loads(payload_bytes.decode('utf-8'))
- except json.JSONDecodeError:
- logger.error(f"Invalid JSON payload received: {payload_bytes[:500]}")
- self.send_response(400)
- self.send_header("Content-type", "text/plain")
- self.end_headers()
- self.wfile.write(b"Invalid JSON payload")
- return
- except Exception as e:
- logger.error(f"Error reading webhook payload: {e}")
- self.send_response(500)
- self.end_headers()
- return
-
- event = self.headers.get('X-GitHub-Event')
- delivery_id = self.headers.get('X-GitHub-Delivery')
- logger.info(f"Received GitHub webhook event: {event} (Delivery ID: {delivery_id})")
-
- if event == 'issues' and WebhookHandler.manager_instance and WebhookHandler.main_loop:
- action = payload.get('action')
- logger.info(f"Issue action: {action}")
- # Handle common actions that affect issue state or content
- if action in ['opened', 'reopened', 'closed', 'assigned', 'unassigned', 'edited', 'labeled', 'unlabeled', 'milestoned', 'demilestoned']:
- # Check if the loop is running before scheduling
- if WebhookHandler.main_loop.is_running():
- # Schedule the async handler to run in the main event loop
- # Use run_coroutine_threadsafe as this is called from a different thread
- asyncio.run_coroutine_threadsafe(
- WebhookHandler.manager_instance.handle_webhook_event(event, action, payload),
- WebhookHandler.main_loop
- )
- logger.debug(f"Scheduled webhook processing for action '{action}' in main loop.")
- else:
- logger.error("Asyncio event loop is not running in the target thread for webhook. Cannot process event.")
- # Respond with an error as processing couldn't be scheduled
- self.send_response(500)
- self.send_header("Content-type", "text/plain")
- self.end_headers()
- self.wfile.write(b"Async processing loop not available.")
- return
- else:
- logger.info(f"Webhook action '{action}' received but not actively handled by current logic.")
- elif event == 'ping':
- logger.info("Received GitHub webhook ping.")
- else:
- logger.warning(f"Unhandled event type: {event} or manager/loop not initialized.")
+try:
+ from pyth.plugins.rtf15.reader import Rtf15Reader
+ from pyth.plugins.plaintext.writer import PlaintextWriter
+ RTF_SUPPORT = True
+except ImportError:
+ RTF_SUPPORT = False
+ logger.warning("pyth not installed. RTF file processing will be limited.")
+
+try:
+ from odf.opendocument import OpenDocumentText
+ from odf import text as odftext
+ ODT_SUPPORT = True
+except ImportError:
+ ODT_SUPPORT = False
+ logger.warning("odfpy not installed. ODT file processing will be limited.")
+
+
- # Always respond 200 OK for successful receipt, even if action is ignored
- self.send_response(200)
- self.send_header("Content-type", "text/plain")
- self.end_headers()
- self.wfile.write(b"OK")
+# Ensure output directories exist with modern structure
+OUTPUTS_DIR = Path('output')
+QR_CODES_DIR = OUTPUTS_DIR / 'qr_codes'
+TEMP_DIR = OUTPUTS_DIR / 'temp'
+for directory in [OUTPUTS_DIR, QR_CODES_DIR, TEMP_DIR]:
+ directory.mkdir(parents=True, exist_ok=True)
-# ========== AI-Powered Issue Manager ==========
-class IssueManager:
+class EnhancedURLProcessor:
+ """Advanced URL processing with enhanced content extraction"""
def __init__(self):
- self.issues: Dict[int, dict] = {}
- self.repo_url: Optional[str] = None
- self.repo_owner: Optional[str] = None
- self.repo_name: Optional[str] = None
- self.repo_local_path: Optional[Path] = None
- self.repo: Optional[Repo] = None
- self.github_token: Optional[str] = None
- self.hf_token: Optional[str] = None
- self.collaborators: Dict[str, dict] = {} # {client_id: {name: str, status: str}}
- self.points: int = 0 # Placeholder for potential gamification
- self.severity_rules: Dict[str, List[str]] = {
- "Critical": ["critical", "urgent", "security", "crash", "blocker", "p0", "s0"],
- "High": ["high", "important", "error", "regression", "major", "p1", "s1"],
- "Medium": ["medium", "bug", "performance", "minor", "p2", "s2"],
- "Low": ["low", "documentation", "enhancement", "trivial", "feature", "p3", "s3", "chore", "refactor", "question", "help wanted"]
- }
- self.issue_clusters: Dict[int, List[int]] = {} # {cluster_id: [issue_index_in_list, ...]}
- self.issue_list_for_clustering: List[dict] = [] # List of issue dicts used for the last clustering run
- self.ws_clients: List[WebSocketServerProtocol] = []
- self.code_editors: Dict[int, OTCodeEditor] = {} # {issue_id: OTCodeEditor_instance}
- # Get or create the loop in the thread where the manager is initialized (main thread)
+ self.session = requests.Session()
+ self.timeout = 15 # Extended timeout for larger content
+ self.max_retries = 3
+ self.user_agent = UserAgent()
+
+ # Enhanced headers for better site compatibility
+ self.session.headers.update({
+ 'User-Agent': self.user_agent.random,
+ 'Accept': 'text/html, application/json, application/xml, text/plain, */*', # Request common types
+ 'Accept-Language': 'en-US,en;q=0.9',
+ 'Accept-Encoding': 'gzip, deflate, br',
+ 'Connection': 'keep-alive',
+ 'Upgrade-Insecure-Requests': '1', # May be ignored for non-HTML
+ 'Sec-Fetch-Dest': 'document',
+ 'Sec-Fetch-Mode': 'navigate',
+ 'Sec-Fetch-Site': 'none',
+ 'Sec-Fetch-User': '?1',
+ 'DNT': '1'
+ })
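+        # Note: the random User-Agent chosen above is only the session default;
+        # fetch_content() re-randomizes it on every request.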
+
+ def validate_url(self, url: str) -> Dict:
+ """Enhanced URL validation with detailed feedback"""
try:
- self.main_loop = asyncio.get_running_loop()
- logger.debug(f"IssueManager found running asyncio loop: {id(self.main_loop)}")
- except RuntimeError:
- self.main_loop = asyncio.new_event_loop()
- asyncio.set_event_loop(self.main_loop)
- logger.debug(f"IssueManager created and set new asyncio loop: {id(self.main_loop)}")
-
- self.broadcast_task: Optional[asyncio.Task] = None
- self.idle_task: Optional[asyncio.Task] = None
-
- # --- State for Idle Processing Results ---
- self.precomputed_context: Dict[int, Dict[str, Any]] = {} # {issue_id: {content: str, files: list, error: str, timestamp: float}}
- self.precomputed_summaries: Dict[int, Dict[str, Any]] = {} # {issue_id: {summary: str, error: str, timestamp: float}}
- self.precomputed_missing_info: Dict[int, Dict[str, Any]] = {} # {issue_id: {info_needed: str, error: str, timestamp: float}}
- self.precomputed_analysis: Dict[int, Dict[str, Any]] = {} # {issue_id: {hypothesis: str, error: str, timestamp: float}}
- # self.code_embeddings: Dict[str, List[float]] = {} # Not currently used after clustering
- self.potential_duplicates: Dict[int, List[int]] = {} # {issue_id: [duplicate_issue_id, ...]}
- self.stale_issues: List[int] = [] # [issue_id, ...]
- self.high_priority_candidates: List[int] = [] # [issue_id, ...]
- self.last_webhook_time: float = time.time() # Track last webhook for potential future use
- self.needs_recluster: bool = False
- self._webhook_change_count = 0
-
- # --- Configuration for Idle Tasks ---
- self.idle_processing_interval = IDLE_PROCESSING_INTERVAL_SECONDS
- self.max_context_computations_per_cycle = MAX_CONTEXT_COMPUTATIONS_PER_CYCLE
- self.max_summary_computations_per_cycle = MAX_SUMMARY_COMPUTATIONS_PER_CYCLE
- self.max_missing_info_computations_per_cycle = MAX_MISSING_INFO_COMPUTATIONS_PER_CYCLE
- self.max_analysis_computations_per_cycle = MAX_ANALYSIS_COMPUTATIONS_PER_CYCLE
- self.stale_issue_threshold_days = STALE_ISSUE_THRESHOLD_DAYS
- self.recluster_threshold = RECLUSTER_THRESHOLD
-
- # Shutdown signals (placeholders, set by main execution block)
- self.stop_ws_server = None
- self.stop_webhook_server = None
-
- def start_broadcast_loop(self):
- """Starts the periodic broadcast task."""
- # Ensure task is created in the correct loop
- if not self.main_loop.is_running():
- logger.error("Cannot start broadcast loop: Main event loop is not running.")
- return
-
- if not self.broadcast_task or self.broadcast_task.done():
- self.broadcast_task = self.main_loop.create_task(self.broadcast_collaboration_status())
- logger.info("Started collaboration status broadcast loop.")
- else:
- logger.debug("Broadcast loop already running.")
-
-
- def stop_broadcast_loop(self):
- """Stops the periodic broadcast task."""
- if self.broadcast_task and not self.broadcast_task.done():
- logger.info("Stopping collaboration status broadcast loop...")
- self.broadcast_task.cancel()
- # Await the task to finish cancellation in an async context if needed,
- # but cancelling is sufficient to signal it to stop.
- self.broadcast_task = None # Clear the reference
-
-
- def _get_issue_hash(self, issue_data: Optional[dict]) -> str:
- """Generates a hash based on key issue content for caching AI suggestions."""
- if not issue_data:
- return "empty_issue_hash" # Handle cases where issue_data is None
-
- content = f"{issue_data.get('title', '')}{issue_data.get('body', '')}{','.join(issue_data.get('labels',[]))}"
- return hashlib.md5(content.encode('utf-8', errors='ignore')).hexdigest() # Use utf-8 and ignore errors
-
-
- @lru_cache(maxsize=100)
- async def cached_suggestion(self, issue_hash: str, model_key: str) -> str:
- """Retrieves or generates an AI suggestion, using an LRU cache based on issue content hash."""
- logger.debug(f"Checking cache for suggestion: hash={issue_hash}, model={model_key}")
- # The cache decorator handles the cache hit/miss logic.
- # If it's a miss, the decorated function body is executed.
-
- # Find the issue data corresponding to the hash
- found_issue = None
- # This linear scan is inefficient for many issues, but hashes are only for cache keys.
- # A dict mapping hashes to issue IDs could be more efficient if this becomes a bottleneck.
- # However, issues dict is not huge usually, so this is likely fine.
- for issue in self.issues.values():
- if self._get_issue_hash(issue) == issue_hash:
- found_issue = issue
- break
-
- if not found_issue:
- logger.error(f"Could not find issue data for hash {issue_hash} in current state. Suggestion cannot be generated.")
- # Corrected: Return error message here if issue data is missing
- return "Error: Issue data for this suggestion request (hash) not found in current state. The issue might have been updated or closed. Please re-select the issue."
-
- if model_key not in HF_MODELS or HF_MODELS.get(model_key) is None:
- logger.error(f"Invalid or unconfigured model key requested: {model_key}")
- return f"Error: Invalid or unconfigured model key: {model_key}"
-
- logger.info(f"Cache miss or first request for issue hash {issue_hash}. Requesting suggestion from {model_key}.")
- # Call the actual suggestion generation function
- return await self.suggest_resolution(found_issue, model_key)
-
- async def handle_webhook_event(self, event: str, action: str, payload: dict):
- """Processes incoming webhook events to update the issue state."""
- logger.info(f"Processing webhook event: {event}, action: {action}")
- issue_data = payload.get('issue')
- repo_data = payload.get('repository')
-
- if not issue_data or not repo_data:
- logger.warning("Webhook payload missing 'issue' or 'repository' data.")
- return
-
- event_repo_url = repo_data.get('html_url')
- # Only process events for the currently loaded repository
- # Use .rstrip("/") on both sides for robust comparison
- if self.repo_url is None or event_repo_url is None or event_repo_url.rstrip("/") != self.repo_url.rstrip("/"):
- logger.info(f"Ignoring webhook event for different repository: {event_repo_url} (Current: {self.repo_url})")
- return
-
- issue_number = issue_data.get('number')
- if issue_number is None: # Check explicitly for None
- logger.warning("Webhook issue data missing 'number'.")
- return
-
- needs_ui_update = False
- significant_change = False # Flag for changes affecting clustering/content/AI caches
-
- if action == 'closed':
- logger.info(f"Webhook: Removing closed issue {issue_number} from active list.")
- if issue_number in self.issues:
- self.issues.pop(issue_number)
- needs_ui_update = True
- significant_change = True # Closing is a significant change
- # Clean up associated cached/computed data
- self.precomputed_context.pop(issue_number, None)
- self.precomputed_summaries.pop(issue_number, None)
- self.precomputed_missing_info.pop(issue_number, None)
- self.precomputed_analysis.pop(issue_number, None)
- self.potential_duplicates.pop(issue_number, None)
- # Remove from lists if present (use list comprehension for safe removal)
- self.stale_issues = [i for i in self.stale_issues if i != issue_number]
- self.high_priority_candidates = [i for i in self.high_priority_candidates if i != issue_number]
- # Remove the code editor instance for the closed issue
- self.code_editors.pop(issue_number, None)
- logger.debug(f"Cleaned up state for closed issue {issue_number}.")
- else:
- logger.debug(f"Webhook: Issue {issue_number} closed, but not found in current active list. No state change needed.")
-
-
- elif action in ['opened', 'reopened', 'edited', 'assigned', 'unassigned', 'labeled', 'unlabeled', 'milestoned', 'demilestoned']:
- logger.info(f"Webhook: Adding/Updating issue {issue_number} (action: {action}).")
- processed_data = self._process_issue_data(issue_data)
-
- old_issue = self.issues.get(issue_number)
- # Check for changes that impact AI suggestions or clustering
- if not old_issue or \
- old_issue.get('body') != processed_data.get('body') or \
- old_issue.get('title') != processed_data.get('title') or \
- set(old_issue.get('labels', [])) != set(processed_data.get('labels', [])):
- significant_change = True
- logger.info(f"Significant change detected for issue {issue_number} (content/labels).")
- # Invalidate ALL precomputed AI state on significant edit
- self.precomputed_context.pop(issue_number, None)
- self.precomputed_summaries.pop(issue_number, None)
- self.precomputed_missing_info.pop(issue_number, None)
- self.precomputed_analysis.pop(issue_number, None)
- # Clear the entire suggestion cache on significant change
- self.cached_suggestion.cache_clear()
- logger.debug("Cleared suggestion cache due to significant issue change.")
-
- # Check if state-related fields changed (affecting idle processing lists)
- # This check is for logging/debugging, the idle loop re-evaluates lists anyway
- if not old_issue or \
- old_issue.get('updated_at') != processed_data.get('updated_at') or \
- old_issue.get('assignee') != processed_data.get('assignee') or \
- set(old_issue.get('labels', [])) != set(processed_data.get('labels', [])) or \
- old_issue.get('state') != processed_data.get('state'): # State change (open/reopened)
- logger.debug(f"State-related change detected for issue {issue_number} (update time, assignee, labels, state). Idle loop will re-evaluate.")
-
- self.issues[issue_number] = processed_data
- needs_ui_update = True
- else:
- logger.info(f"Ignoring webhook action '{action}' for issue {issue_number} (already filtered).")
-
- # --- Track changes for idle processing ---
- if needs_ui_update:
- self.last_webhook_time = time.time()
- if significant_change:
- self._increment_change_counter()
- # Rebuild the list used for clustering immediately if a significant change occurred
- # This list is a snapshot used by the async clustering task
- self.issue_list_for_clustering = list(self.issues.values())
- logger.info("Issue list for clustering updated due to significant webhook change.")
- # Broadcast UI update notification
- # Schedule this in the main loop using call_soon_threadsafe
- if self.main_loop.is_running():
- self.main_loop.call_soon_threadsafe(asyncio.create_task, self.broadcast_issue_update())
- logger.debug("Scheduled issue update broadcast.")
- else:
- logger.warning("Main loop not running, cannot broadcast issue update.")
-
-
- def _increment_change_counter(self):
- """Increments change counter and sets recluster flag if threshold reached."""
- self._webhook_change_count += 1
- logger.debug(f"Significant change detected. Change count: {self._webhook_change_count}/{self.recluster_threshold}")
- if self._webhook_change_count >= self.recluster_threshold:
- self.needs_recluster = True
- logger.info(f"Change threshold ({self.recluster_threshold}) reached. Flagging for re-clustering.")
-
- def _process_issue_data(self, issue_data: dict) -> dict:
- """Helper to structure issue data consistently."""
- return {
- "id": issue_data.get('number'), # Use .get for safety
- "title": issue_data.get('title', 'No Title Provided'),
- "body": issue_data.get('body', ''),
- "state": issue_data.get('state', 'unknown'),
- "labels": sorted([label.get('name', '') for label in issue_data.get('labels', []) if isinstance(label, dict) and label.get('name')]), # Ensure labels are dicts and have name
- "assignee": issue_data.get('assignee', {}).get('login') if issue_data.get('assignee') and isinstance(issue_data.get('assignee'), dict) else None,
- "url": issue_data.get('html_url', '#'),
- "created_at": issue_data.get('created_at'),
- "updated_at": issue_data.get('updated_at'),
- }
-
- async def crawl_issues(self, repo_url: str, github_token: Optional[str], hf_token: Optional[str]) -> Tuple[List[List], go.Figure, str, go.Figure]:
- """
- Crawls issues, resets state, clones repo, clusters, starts background tasks.
- Returns dataframe data, stats plot, status message, and analytics plot.
- """
- # Strip whitespace from inputs
- repo_url = repo_url.strip() if repo_url else None
- github_token = github_token.strip() if github_token else None
- hf_token = hf_token.strip() if hf_token else None
-
- # Define a default empty plot for consistent return type
- def get_empty_plot(title="Plot"):
- fig = go.Figure()
- fig.update_layout(title=title, xaxis={"visible": False}, yaxis={"visible": False},
- annotations=[{"text": "Scan needed.", "xref": "paper", "yref": "paper", "showarrow": False, "font": {"size": 16}}],
- plot_bgcolor='rgba(0,0,0,0)', paper_bgcolor='rgba(0,0,0,0)')
- return fig
-
- if not repo_url or not hf_token:
- logger.error("Repository URL and Hugging Face Token are required.")
- empty_fig = get_empty_plot("Issue Severity Distribution")
- return [], empty_fig, "Error: Repository URL and Hugging Face Token are required.", empty_fig
-
- logger.info(f"Starting new issue crawl and setup for {repo_url}")
-
- # --- Reset Manager State ---
- # Stop background tasks first
- self.stop_idle_processing()
- self.stop_broadcast_loop()
-
- self.issues = {}
- # Clear code_editors instances
- self.code_editors = {}
- self.issue_clusters = {}
- self.issue_list_for_clustering = []
- self.cached_suggestion.cache_clear() # Clear AI suggestion cache
- self.precomputed_context = {}
- self.precomputed_summaries = {}
- self.precomputed_missing_info = {}
- self.precomputed_analysis = {}
- # self.code_embeddings = {} # Not used
- self.potential_duplicates = {}
- self.stale_issues = []
- self.high_priority_candidates = []
- self.needs_recluster = False
- self._webhook_change_count = 0
- self.last_webhook_time = time.time()
- self.repo = None # Clear the repo object
- self.repo_url = repo_url
- self.github_token = github_token
- self.hf_token = hf_token
- logger.info("Internal state reset for new crawl.")
-
- # --- Repository Cloning/Updating ---
- match = re.match(r"https?://github\.com/([^/]+)/([^/]+)", self.repo_url)
- if not match:
- logger.error(f"Invalid GitHub URL format: {self.repo_url}")
- empty_fig = get_empty_plot("Issue Severity Distribution")
- return [], empty_fig, "Error: Invalid GitHub URL format. Use https://github.com/owner/repo", empty_fig
- self.repo_owner, self.repo_name = match.groups()
- self.repo_local_path = WORKSPACE / f"{self.repo_owner}_{self.repo_name}"
+ if not validators.url(url):
+ return {'is_valid': False, 'message': 'Invalid URL format', 'details': 'URL must begin with http:// or https://'}
+ parsed = urlparse(url)
+ if not all([parsed.scheme, parsed.netloc]):
+ return {'is_valid': False, 'message': 'Incomplete URL', 'details': 'Missing scheme or domain'}
+            # Try a HEAD request first to check accessibility; fall back to GET,
+            # since some servers don't support HEAD. Either way, head_response ends
+            # up holding the successful response, so the header lookups below are safe.
+            try:
+                head_response = self.session.head(url, timeout=5)
+                head_response.raise_for_status()
+            except requests.exceptions.RequestException:
+                head_response = self.session.get(url, timeout=self.timeout)
+                head_response.raise_for_status()
+            final_url = head_response.url  # Capture potential redirects
+
+ return {
+ 'is_valid': True,
+ 'message': 'URL is valid and accessible',
+ 'details': {
+ 'final_url': final_url,
+ 'content_type': head_response.headers.get('Content-Type', 'unknown'),
+ 'server': head_response.headers.get('Server', 'unknown'),
+ 'size': head_response.headers.get('Content-Length', 'unknown')
+ }
+ }
+ except Exception as e:
+ return {'is_valid': False, 'message': f'URL validation failed: {str(e)}', 'details': str(e)}
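+
+    # Illustrative usage of validate_url (sketch only; the URL is a placeholder):
+    #   result = EnhancedURLProcessor().validate_url("https://example.com")
+    #   if result['is_valid']:
+    #       print(result['details']['final_url'], result['details']['content_type'])
+    #   else:
+    #       print(result['message'])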
+ def fetch_content(self, url: str, retry_count: int = 0) -> Optional[Dict]:
+ """Enhanced content fetcher with retry mechanism and complete character extraction"""
try:
- if self.repo_local_path.exists():
- logger.info(f"Attempting to update existing repository clone at {self.repo_local_path}")
- try:
- self.repo = Repo(self.repo_local_path)
- # Ensure the origin remote matches the requested URL
- if not self.repo.remotes or 'origin' not in self.repo.remotes:
- logger.warning(f"Existing repo at {self.repo_local_path} has no 'origin' remote. Re-cloning.")
- if self.repo_local_path.exists(): shutil.rmtree(self.repo_local_path)
- self.repo = Repo.clone_from(self.repo_url, self.repo_local_path, progress=lambda op, cur, tot, msg: logger.debug(f"Clone progress: {msg}"))
- else:
- origin = self.repo.remotes.origin
- remote_url = next((u for u in origin.urls), None) # Get first URL
- expected_urls = {self.repo_url, self.repo_url + ".git"}
- if remote_url not in expected_urls:
- logger.warning(f"Existing repo path {self.repo_local_path} has different remote URL ('{remote_url}' vs '{self.repo_url}'). Removing and re-cloning.")
- # Remove the directory entirely before re-cloning
- if self.repo_local_path.exists(): shutil.rmtree(self.repo_local_path)
- self.repo = Repo.clone_from(self.repo_url, self.repo_local_path, progress=lambda op, cur, tot, msg: logger.debug(f"Clone progress: {msg}"))
- else:
- logger.info("Pulling latest changes...")
- # Use a timeout for pull operations
- try:
- # Fetch first to get latest refs
- origin.fetch(progress=lambda op, cur, tot, msg: logger.debug(f"Fetch progress: {msg}"), timeout=120)
- # Then pull
- origin.pull(progress=lambda op, cur, tot, msg: logger.debug(f"Pull progress: {msg}"), timeout=120)
- # Unshallow if necessary
- if self.repo.git.rev_parse('--is-shallow-repository').strip() == 'true':
- logger.info("Repository is shallow, unshallowing...")
- # Use a timeout for unshallow
- self.repo.git.fetch('--unshallow', timeout=300)
- except GitCommandError as pull_err:
- logger.error(f"Git pull/fetch error: {pull_err}. Proceeding with potentially stale local copy.")
- except Exception as pull_err:
- logger.exception(f"Unexpected error during git pull/fetch: {pull_err}. Proceeding with potentially stale local copy.")
-
-
- except (InvalidGitRepositoryError, NoSuchPathError):
- logger.warning(f"Invalid or missing Git repository at {self.repo_local_path}. Re-cloning.")
- # Ensure directory is clean before re-cloning
- if self.repo_local_path.exists(): shutil.rmtree(self.repo_local_path)
- self.repo = Repo.clone_from(self.repo_url, self.repo_local_path, progress=lambda op, cur, tot, msg: logger.debug(f"Clone progress: {msg}"))
- except GitCommandError as git_err:
- logger.error(f"Git operation error during update: {git_err}. Trying to proceed with existing copy, but it might be stale.")
- if not self.repo: # If repo object wasn't successfully created before the error
- try: self.repo = Repo(self.repo_local_path)
- except Exception: logger.error("Failed to even load existing repo after git error.")
- except Exception as e:
- logger.exception(f"An unexpected error occurred during repository update check: {e}")
- # If repo object wasn't successfully created before the error
- if not self.repo:
- try: self.repo = Repo(self.repo_local_path)
- except Exception: logger.error("Failed to even load existing repo after update error.")
+ logger.info(f"Fetching content from URL: {url} (Attempt {retry_count + 1}/{self.max_retries})")
+
+ # Update User-Agent randomly for each request
+ self.session.headers.update({'User-Agent': self.user_agent.random})
+ response = self.session.get(url, timeout=self.timeout)
+ response.raise_for_status()
+ final_url = response.url # Capture potential redirects
+
+            # Detect encoding: requests falls back to ISO-8859-1 for text responses
+            # without a declared charset, in which case chardet usually guesses better.
+            if response.encoding is None or response.encoding == 'ISO-8859-1':
+ encoding_detection = chardet.detect(response.content)
+ encoding = encoding_detection['encoding'] or 'utf-8'
+ logger.debug(f"Detected encoding '{encoding}' with confidence {encoding_detection['confidence']:.2f} for {url}")
else:
- logger.info(f"Cloning repository {self.repo_url} to {self.repo_local_path}")
- # Use a timeout for the initial clone
- self.repo = Repo.clone_from(self.repo_url, self.repo_local_path, progress=lambda op, cur, tot, msg: logger.debug(f"Clone progress: {msg}"), timeout=300)
+ encoding = response.encoding
+ logger.debug(f"Using response.encoding '{encoding}' for {url}")
+            # Decode content; errors='replace' avoids hard decode failures, so the
+            # realistic exception here is a LookupError for an unrecognized codec
+            # name, in which case fall back to UTF-8.
+            try:
+                raw_content = response.content.decode(encoding, errors='replace')
+            except (UnicodeDecodeError, LookupError):
+                raw_content = response.content.decode('utf-8', errors='replace')
+                encoding = 'utf-8 (fallback)'
+                logger.warning(f"Decoding with {encoding} fallback for {url}")
+
+
+ # Extract metadata
+ metadata = {
+ 'original_url': url,
+ 'final_url': final_url,
+ 'timestamp': datetime.now().isoformat(),
+ 'detected_encoding': encoding,
+ 'content_type': response.headers.get('Content-Type', ''),
+ 'content_length': len(response.content),
+ 'headers': dict(response.headers),
+ 'status_code': response.status_code
+ }
- logger.info("Repository clone/update process finished.")
- if not self.repo:
- raise Exception("Repository object could not be initialized after cloning/update.")
+ # Process based on content type
+ processed_extraction = self._process_web_content(raw_content, metadata['content_type'], final_url)
- except GitCommandError as e:
- logger.error(f"Failed to clone/update repository: {e}")
- empty_fig = get_empty_plot("Issue Severity Distribution")
- empty_fig.update_layout(annotations=[{"text": "Repo Error.", "xref": "paper", "yref": "paper", "showarrow": False, "font": {"size": 16}}])
- return [], empty_fig, f"Error cloning/updating repository: {e}. Check URL, permissions, and network.", empty_fig
+ return {
+ 'source': 'url',
+ 'url': url, # Keep original URL as identifier
+ 'raw_content': raw_content,
+ 'metadata': metadata,
+ 'extracted_data': processed_extraction['data'],
+ 'processing_notes': processed_extraction['notes']
+ }
+ except requests.exceptions.RequestException as e:
+ if retry_count < self.max_retries - 1:
+ logger.warning(f"Retry {retry_count + 1}/{self.max_retries} for URL: {url}")
+ time.sleep(2 ** retry_count) # Exponential backoff
+ return self.fetch_content(url, retry_count + 1)
+ logger.error(f"Failed to fetch content after {self.max_retries} attempts from {url}: {e}")
+ return {
+ 'source': 'url',
+ 'url': url,
+ 'raw_content': None,
+ 'metadata': {'original_url': url, 'timestamp': datetime.now().isoformat()},
+ 'extracted_data': None,
+ 'processing_notes': f"Failed to fetch content: {str(e)}"
+ }
except Exception as e:
- logger.exception(f"An unexpected error occurred during repository handling: {e}")
- empty_fig = get_empty_plot("Issue Severity Distribution")
- empty_fig.update_layout(annotations=[{"text": "Repo Error.", "xref": "paper", "yref": "paper", "showarrow": False, "font": {"size": 16}}])
- return [], empty_fig, f"An unexpected error occurred during repo setup: {e}", empty_fig
-
+ logger.error(f"Unexpected error while fetching or processing URL {url}: {e}")
+ return {
+ 'source': 'url',
+ 'url': url,
+ 'raw_content': raw_content if 'raw_content' in locals() else None,
+ 'metadata': metadata if 'metadata' in locals() else {'original_url': url, 'timestamp': datetime.now().isoformat()},
+ 'extracted_data': None,
+ 'processing_notes': f"Unexpected processing error: {str(e)}"
+ }
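+
+    # For reference, fetch_content() always returns a dict with the keys 'source',
+    # 'url', 'raw_content', 'metadata', 'extracted_data' and 'processing_notes'.
+    # On failure, raw_content/extracted_data may be None and processing_notes holds
+    # an error message instead of the list of notes produced on success.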
- # --- Issue Fetching ---
- api_url = f"{GITHUB_API}/{self.repo_owner}/{self.repo_name}/issues?state=open&per_page=100"
- headers = {"Accept": "application/vnd.github.v3+json"}
- if github_token:
- headers["Authorization"] = f"token {github_token}"
+ def _process_web_content(self, content: str, content_type: str, base_url: str) -> Dict[str, Any]:
+ """Process content based on detected content type"""
+ lower_content_type = content_type.lower()
+ notes = []
+ extracted_data: Any = None # Use Any to allow different types
try:
- all_issues_data = []
- page = 1
- logger.info(f"Fetching open issues from GitHub API (repo: {self.repo_owner}/{self.repo_name})...")
- async with aiohttp.ClientSession(headers=headers) as session:
- while True:
- paginated_url = api_url
- logger.debug(f"Fetching URL: {paginated_url}")
- # Use a timeout for API requests
- try:
- async with session.get(paginated_url, timeout=30) as response:
- rate_limit_remaining = response.headers.get('X-RateLimit-Remaining')
- rate_limit_reset = response.headers.get('X-RateLimit-Reset')
- logger.debug(f"GitHub API Response Status: {response.status}, RateLimit Remaining: {rate_limit_remaining}, Reset: {rate_limit_reset}")
-
- if response.status == 403 and rate_limit_remaining == '0':
- reset_time = int(rate_limit_reset) if rate_limit_reset else time.time() + 60
- wait_time = max(reset_time - time.time() + 5, 0) # Wait until reset time + a buffer
- logger.warning(f"GitHub API rate limit exceeded. Waiting until {datetime.fromtimestamp(reset_time).strftime('%H:%M:%S')} ({wait_time:.0f}s)")
- await asyncio.sleep(wait_time)
- continue # Retry the same page
-
- response.raise_for_status() # Raise for other 4xx/5xx errors
-
- issues_page_data = await response.json()
- if not issues_page_data: break # No more issues on this page
-
- logger.info(f"Fetched page {page} with {len(issues_page_data)} items.")
- all_issues_data.extend(issues_page_data)
-
- link_header = response.headers.get('Link')
- if link_header and 'rel="next"' in link_header:
- # Simple parsing for the next link, more robust parsing might be needed for complex headers
- next_url_match = re.search(r'<([^>]+)>;\s*rel="next"', link_header)
- if next_url_match:
- # The next URL is provided directly, use it
- api_url = next_url_match.group(1)
- page += 1 # Increment page counter for logging, though not strictly needed for the loop logic now
- logger.debug(f"Found next page link: {api_url}")
- else:
- logger.warning("Link header contains 'rel=\"next\"' but could not parse the URL. Stopping pagination.")
- break
- else:
- logger.debug("No 'next' link found in Link header. Assuming last page.")
- break
- await asyncio.sleep(0.1) # Small delay between requests
-
- except asyncio.TimeoutError:
- logger.warning(f"GitHub API request timed out for page {page}. Stopping pagination early.")
- break # Stop pagination on timeout
- except aiohttp.ClientResponseError as e:
- # Re-raise client response errors so the outer handler catches them
- raise e
- except Exception as e:
- logger.exception(f"An unexpected error occurred during GitHub API pagination for page {page}. Stopping pagination.")
- break # Stop pagination on unexpected error
+ if 'text/html' in lower_content_type:
+ logger.debug(f"Processing HTML content from {base_url}")
+ extracted_data = self._process_html_content_enhanced(content, base_url)
+ notes.append("Processed as HTML")
+ elif 'application/json' in lower_content_type or 'text/json' in lower_content_type:
+ logger.debug(f"Processing JSON content from {base_url}")
+ try:
+ extracted_data = json.loads(content)
+ notes.append("Parsed as JSON")
+ except json.JSONDecodeError as e:
+ extracted_data = content # Keep raw text if invalid JSON
+ notes.append(f"Failed to parse as JSON: {e}")
+ logger.warning(f"Failed to parse JSON from {base_url}: {e}")
+ except Exception as e:
+ extracted_data = content
+ notes.append(f"Error processing JSON: {e}")
+ logger.error(f"Error processing JSON from {base_url}: {e}")
+ elif 'application/xml' in lower_content_type or 'text/xml' in lower_content_type or lower_content_type.endswith('+xml'):
+ logger.debug(f"Processing XML content from {base_url}")
+ try:
+ # Try parsing XML. Convert to a string or a dict representation if needed.
+ # For simplicity, we'll convert to a readable string representation of the tree.
+ root = ET.fromstring(content)
+ # A simple way to represent XML as text
+ xml_text = ET.tostring(root, encoding='unicode', method='xml')
+ extracted_data = xml_text # Store as string for now
+ notes.append("Parsed as XML (text representation)")
+ except ET.ParseError as e:
+ extracted_data = content
+ notes.append(f"Failed to parse as XML: {e}")
+ logger.warning(f"Failed to parse XML from {base_url}: {e}")
+ except Exception as e:
+ extracted_data = content
+ notes.append(f"Error processing XML: {e}")
+ logger.error(f"Error processing XML from {base_url}: {e}")
+ elif 'text/plain' in lower_content_type or 'text/' in lower_content_type: # Catch other text types
+ logger.debug(f"Processing Plain Text content from {base_url}")
+ extracted_data = content
+ notes.append("Processed as Plain Text")
+ else:
+ logger.debug(f"Unknown content type '{content_type}' from {base_url}. Storing raw content.")
+ extracted_data = content # Store raw content for unknown types
+ notes.append(f"Unknown content type '{content_type}'. Stored raw text.")
+ except Exception as e:
+ logger.error(f"Unexpected error in _process_web_content for {base_url} ({content_type}): {e}")
+ extracted_data = content # Fallback to raw content on error
+ notes.append(f"Unexpected processing error: {e}. Stored raw text.")
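+        # Note: CSV responses ('text/csv') currently fall into the generic 'text/'
+        # branch above and are kept as plain text. If structured rows were needed,
+        # a possible extension (sketch only, not wired in) could reuse the csv and
+        # io modules imported at the top of this file:
+        #   rows = list(csv.reader(io.StringIO(content)))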
- logger.info(f"Total items fetched (including potential PRs): {len(all_issues_data)}")
- # Filter out pull requests (issues with 'pull_request' key)
- self.issues = {
- issue_data['number']: self._process_issue_data(issue_data)
- for issue_data in all_issues_data
- if 'pull_request' not in issue_data and issue_data.get('number') is not None # Ensure number exists
- }
+ return {'data': extracted_data, 'notes': notes}
- logger.info(f"Filtered out pull requests, {len(self.issues)} actual open issues remaining.")
-
- empty_fig = get_empty_plot("Issue Severity Distribution")
- if not self.issues:
- logger.warning("No open issues found for this repository.")
- empty_fig.update_layout(annotations=[{"text": "No issues found.", "xref": "paper", "yref": "paper", "showarrow": False, "font": {"size": 16}}])
- return [], empty_fig, "No open issues found in the repository.", empty_fig
-
- # --- Clustering and UI Data Prep ---
- self.issue_list_for_clustering = list(self.issues.values())
- logger.info("Clustering issues...")
- await self._cluster_similar_issues()
-
- # --- Initial Idle Task Prep (Run synchronously after load) ---
- logger.info("Identifying potential duplicates based on initial clusters...")
- self._identify_potential_duplicates()
- logger.info("Identifying potentially stale issues...")
- self._identify_stale_issues()
- logger.info("Identifying high priority candidates...")
- self._identify_high_priority_candidates()
-
- # --- Prepare Dataframe Output & Stats ---
- dataframe_data = []
- severity_counts = {"Critical": 0, "High": 0, "Medium": 0, "Low": 0, "Unknown": 0}
- index_to_cluster_id: Dict[int, int] = {}
- # Map issue index in the clustering list back to its cluster ID
- for cluster_id, indices in self.issue_clusters.items():
- for index in indices:
- if 0 <= index < len(self.issue_list_for_clustering):
- index_to_cluster_id[index] = cluster_id
- else:
- logger.warning(f"Clustering returned invalid index {index} for list of length {len(self.issue_list_for_clustering)}")
-
- for i, issue in enumerate(self.issue_list_for_clustering):
- severity = self._determine_severity(issue.get('labels', [])) # Use get for safety
- severity_counts[severity] += 1
- # Get cluster ID using the index from the clustering list
- cluster_id = index_to_cluster_id.get(i, -1)
- dataframe_data.append([
- issue.get('id', 'N/A'), # Use get for safety
- issue.get('title', 'No Title'), # Use get for safety
- severity,
- cluster_id if cluster_id != -1 else "N/A" # Display "N/A" for noise (-1)
- ])
-
- logger.info("Generating statistics plot...")
- stats_fig = self._generate_stats_plot(severity_counts)
-
- # --- Start Background Tasks ---
- # Ensure tasks are created in the manager's loop
- self.start_broadcast_loop()
- self.start_idle_processing()
-
- success_msg = f"Found {len(self.issues)} open issues. Clustered into {len(self.issue_clusters)} groups. Repo ready. Background analysis started."
- logger.info(success_msg)
- # Return both plots (stats and analytics severity are the same initially)
- return dataframe_data, stats_fig, success_msg, stats_fig
-
- except aiohttp.ClientResponseError as e:
- logger.error(f"GitHub API request failed: Status={e.status}, Message='{e.message}', URL='{e.request_info.url}'")
- error_msg = f"Error fetching issues: {e.status} - {e.message}. Check token/URL."
- if e.status == 404: error_msg = f"Error: Repository not found at {self.repo_url}."
- elif e.status == 401: error_msg = "Error: Invalid GitHub token or insufficient permissions for this repository."
- elif e.status == 403:
- rate_limit_remaining = e.headers.get('X-RateLimit-Remaining') # FIX: Access rate_limit_remaining from error headers
- rate_limit_reset = e.headers.get('X-RateLimit-Reset')
- reset_time_str = "unknown"
- if rate_limit_reset:
- try: reset_time_str = datetime.fromtimestamp(int(rate_limit_reset), timezone.utc).strftime('%Y-%m-%d %H:%M:%S %Z')
- except ValueError: pass
- error_msg = f"Error: GitHub API rate limit likely exceeded or access forbidden (Remaining: {rate_limit_remaining}). Reset time: {reset_time_str}. Check token or wait."
- self.stop_idle_processing()
- self.stop_broadcast_loop()
- empty_fig = get_empty_plot("Issue Severity Distribution")
- empty_fig.update_layout(annotations=[{"text": "API Error.", "xref": "paper", "yref": "paper", "showarrow": False, "font": {"size": 16}}])
- return [], empty_fig, error_msg, empty_fig
- except asyncio.TimeoutError:
- logger.error("GitHub API request timed out.")
- self.stop_idle_processing()
- self.stop_broadcast_loop()
- empty_fig = get_empty_plot("Issue Severity Distribution")
- empty_fig.update_layout(annotations=[{"text": "API Timeout.", "xref": "paper", "yref": "paper", "showarrow": False, "font": {"size": 16}}])
- return [], empty_fig, "Error: GitHub API request timed out.", empty_fig
- except Exception as e:
- self.stop_idle_processing()
- self.stop_broadcast_loop()
- logger.exception(f"An unexpected error occurred during issue crawl: {e}")
- empty_fig = get_empty_plot("Issue Severity Distribution")
- empty_fig.update_layout(annotations=[{"text": "Unexpected Error.", "xref": "paper", "yref": "paper", "showarrow": False, "font": {"size": 16}}])
- return [], empty_fig, f"An unexpected error occurred: {e}", empty_fig
-
- def _determine_severity(self, labels: List[str]) -> str:
- """Determines issue severity based on labels using predefined rules."""
- labels_lower = {label.lower().strip() for label in labels}
- for severity, keywords in self.severity_rules.items():
- if any(keyword in label for label in labels_lower for keyword in keywords):
- return severity
- return "Unknown"
-
- def _generate_stats_plot(self, severity_counts: Dict[str, int]) -> go.Figure:
- """Generates a Plotly bar chart for issue severity distribution."""
- filtered_counts = {k: v for k, v in severity_counts.items() if v > 0}
- if not filtered_counts:
- fig = go.Figure()
- fig.update_layout(title="Issue Severity Distribution", xaxis={"visible": False}, yaxis={"visible": False},
- # FIX: Corrected dictionary syntax for font size
- annotations=[{"text": "No issues to display.", "xref": "paper", "yref": "paper", "showarrow": False, "font": {"size": 16}}],
- plot_bgcolor='rgba(0,0,0,0)', paper_bgcolor='rgba(0,0,0,0)')
- return fig # Return the empty figure here
-
- severities = list(filtered_counts.keys())
- counts = list(filtered_counts.values())
- order = ['Critical', 'High', 'Medium', 'Low', 'Unknown']
- # Sort severities based on the predefined order
- severities_sorted = sorted(severities, key=lambda x: order.index(x) if x in order else len(order))
- counts_sorted = [filtered_counts[s] for s in severities_sorted]
-
- fig = px.bar(x=severities_sorted, y=counts_sorted, title="Issue Severity Distribution",
- labels={'x': 'Severity', 'y': 'Number of Issues'}, color=severities_sorted,
- color_discrete_map={'Critical': '#DC2626', 'High': '#F97316', 'Medium': '#FACC15', 'Low': '#84CC16', 'Unknown': '#6B7280'},
- text=counts_sorted)
- fig.update_layout(xaxis_title=None, yaxis_title="Number of Issues", plot_bgcolor='rgba(0,0,0,0)',
- paper_bgcolor='rgba(0,0,0,0)', showlegend=False,
- xaxis={'categoryorder':'array', 'categoryarray': order},
- yaxis={'rangemode': 'tozero'}) # Ensure y-axis starts at 0
- fig.update_traces(textposition='outside')
- return fig
-
- async def _cluster_similar_issues(self):
- """Generates embeddings and clusters issues using HDBSCAN. Uses self.issue_list_for_clustering."""
- if not self.issue_list_for_clustering:
- logger.warning("Cannot cluster issues: No issues loaded or list is empty.")
- self.issue_clusters = {}
- self._webhook_change_count = 0 # Reset on empty list
- self.needs_recluster = False
- return
- if not self.hf_token or HF_EMBEDDING_MODEL is None:
- logger.error("Cannot cluster issues: Hugging Face token or embedding model missing.")
- self.issue_clusters = {}
- self._webhook_change_count = 0 # Reset on missing token/model
- self.needs_recluster = False
- return
-
- num_issues = len(self.issue_list_for_clustering)
- logger.info(f"Generating embeddings for {num_issues} issues for clustering using {HF_EMBEDDING_MODEL}...")
+
+ def _process_html_content_enhanced(self, content: str, base_url: str) -> Dict[str, Any]:
+ """Process HTML content, preserving text, and extracting metadata."""
+        extracted: Dict[str, Any] = {
+            'title': None,
+            'meta_description': None,
+            'full_text': "",
+            'links': []  # Filled with {'text', 'url'} dicts; relative hrefs resolved against base_url
+        }
try:
- # Use title + a snippet of the body for embedding
- texts_to_embed = [
- f"Title: {i.get('title','')} Body: {i.get('body','')[:500]}" # Limit body length
- for i in self.issue_list_for_clustering
- ]
- embeddings = await self._generate_embeddings(texts_to_embed)
-
- if embeddings is None or not isinstance(embeddings, list) or len(embeddings) != num_issues:
- logger.error(f"Failed to generate valid embeddings for clustering. Expected {num_issues}, got {type(embeddings)} len {len(embeddings) if embeddings else 'N/A'}.")
- self.issue_clusters = {}
- self._webhook_change_count = 0 # Reset on embedding failure
- self.needs_recluster = False
- return
-
- logger.info(f"Generated {len(embeddings)} embeddings. Running HDBSCAN clustering...")
- # Adjust min_cluster_size dynamically based on issue count?
- min_cluster_size = max(2, min(5, num_issues // 10)) # Example: min 2, max 5, or 10% of issues
- clusterer = HDBSCAN(min_cluster_size=min_cluster_size, metric='cosine', allow_single_cluster=True, gen_min_span_tree=True)
- clusters = clusterer.fit_predict(embeddings)
-
- new_issue_clusters: Dict[int, List[int]] = {}
- noise_count = 0
- for i, cluster_id in enumerate(clusters):
- cluster_id_int = int(cluster_id)
- if cluster_id_int == -1:
- noise_count += 1
- continue
- if cluster_id_int not in new_issue_clusters:
- new_issue_clusters[cluster_id_int] = []
- new_issue_clusters[cluster_id_int].append(i)
-
- self.issue_clusters = new_issue_clusters
- logger.info(f"Clustering complete. Found {len(self.issue_clusters)} clusters (min size {min_cluster_size}) with {noise_count} noise points.")
-
- # Reset the change counter and flag after successful clustering
- self._webhook_change_count = 0
- self.needs_recluster = False
- logger.debug("Reset webhook change counter and recluster flag after clustering.")
+ soup = BeautifulSoup(content, 'html.parser')
- except Exception as e:
- logger.exception(f"Error during issue clustering: {e}")
- self.issue_clusters = {}
- self._webhook_change_count = 0 # Reset on clustering failure
- self.needs_recluster = False
+ # Extract Title
+ if soup.title and soup.title.string:
+ extracted['title'] = soup.title.string.strip()
+ # Extract Meta Description
+ meta_desc = soup.find('meta', attrs={'name': 'description'})
+ if meta_desc and meta_desc.get('content'):
+ extracted['meta_description'] = meta_desc['content'].strip()
- def _identify_potential_duplicates(self):
- """Populates self.potential_duplicates based on self.issue_clusters and self.issue_list_for_clustering."""
- self.potential_duplicates = {}
- if not self.issue_clusters or not self.issue_list_for_clustering:
- logger.debug("Skipping duplicate identification: No clusters or issue list.")
- return
+ # Extract and process links (convert relative to absolute)
+ for a_tag in soup.find_all('a', href=True):
+ href = a_tag['href']
+ text = a_tag.get_text().strip()
+ try:
+ absolute_url = urljoin(base_url, href)
+ extracted['links'].append({'text': text, 'url': absolute_url})
+ except Exception:
+ extracted['links'].append({'text': text, 'url': href}) # Keep relative if join fails
- index_to_id = {}
- try:
- for i, issue in enumerate(self.issue_list_for_clustering):
- issue_id = issue.get('id')
- if issue_id is None:
- logger.warning(f"Issue at index {i} in clustering list is missing an ID.")
- continue
- index_to_id[i] = issue_id
- except Exception as e:
- logger.error(f"Error creating index-to-ID map for duplicate check: {e}. Issue list might be inconsistent.")
- return
-
- for cluster_id, indices in self.issue_clusters.items():
- if len(indices) > 1:
- # Get issue IDs for indices in this cluster, skipping any invalid indices
- cluster_issue_ids = [index_to_id[i] for i in indices if i in index_to_id]
- if len(cluster_issue_ids) > 1: # Ensure there's more than one valid issue ID in the cluster
- for issue_id in cluster_issue_ids:
- # For each issue in the cluster, list all *other* issues in the same cluster as potential duplicates
- self.potential_duplicates[issue_id] = [other_id for other_id in cluster_issue_ids if other_id != issue_id]
-
- logger.info(f"Identified potential duplicates for {len(self.potential_duplicates)} issues based on clustering.")
-
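# A compact sketch of the duplicate-candidate mapping described above: given the cluster
# output ({cluster_id: [list indices]}) and the issue list used for clustering, every
# issue in a multi-member cluster points at the other members of that cluster. The sample
# issues are invented.
from typing import Dict, List

def find_potential_duplicates(clusters: Dict[int, List[int]],
                              issues: List[dict]) -> Dict[int, List[int]]:
    index_to_id = {i: issue['id'] for i, issue in enumerate(issues) if 'id' in issue}
    duplicates: Dict[int, List[int]] = {}
    for indices in clusters.values():
        ids = [index_to_id[i] for i in indices if i in index_to_id]
        if len(ids) > 1:
            for issue_id in ids:
                duplicates[issue_id] = [other for other in ids if other != issue_id]
    return duplicates

print(find_potential_duplicates({0: [0, 2], 1: [1]},
                                [{'id': 101}, {'id': 102}, {'id': 103}]))
# {101: [103], 103: [101]}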
- async def _generate_embeddings(self, texts: List[str]):
- """Generates sentence embeddings using Hugging Face Inference API."""
- if not self.hf_token:
- logger.error("Hugging Face token is not set. Cannot generate embeddings.")
- return None
- if not texts:
- logger.warning("Embedding generation requested with empty text list.")
- return []
- if HF_EMBEDDING_MODEL is None:
- logger.error("HF Embedding model is not configured.")
- return None
-
- model_id = HF_EMBEDDING_MODEL # Use the fixed embedding model
- api_url = f"{HF_INFERENCE_API}/{model_id}"
- headers = {"Authorization": f"Bearer {self.hf_token}"}
- timeout = aiohttp.ClientTimeout(total=180) # Increased timeout for embedding large batches
-
- logger.info(f"Requesting embeddings from {api_url} for {len(texts)} texts.")
- # HF Inference API has a limit on the number of inputs per request (often 512 or 1024)
- # Batching is recommended for large lists of texts.
- batch_size = 500 # Example batch size, adjust based on model limits if known
- all_embeddings = []
-
- for i in range(0, len(texts), batch_size):
- batch_texts = texts[i:i + batch_size]
- payload = {"inputs": batch_texts, "options": {"wait_for_model": True}}
- logger.debug(f"Processing embedding batch {i//batch_size + 1}/{(len(texts)-1)//batch_size + 1} ({len(batch_texts)} texts)")
-
- # Implement retry logic for batches
- retries = 3
- for attempt in range(retries):
- try:
- async with aiohttp.ClientSession(headers=headers, timeout=timeout) as session:
- async with session.post(api_url, json=payload) as response:
- rate_limit_remaining = response.headers.get('X-Ratelimit-Remaining')
- logger.debug(f"HF Embedding API Response Status: {response.status}, RateLimit Remaining: {rate_limit_remaining}")
-
- if response.status == 429: # Too Many Requests
- retry_after = int(response.headers.get('Retry-After', 10))
- logger.warning(f"HF Embedding API rate limited. Waiting for {retry_after} seconds before retry {attempt + 1}/{retries}.")
- await asyncio.sleep(retry_after)
- continue # Retry the same batch
- response.raise_for_status() # Raise for other 4xx/5xx errors
-
- result = await response.json()
-
- if isinstance(result, list) and all(isinstance(emb, list) and all(isinstance(f, float) for f in emb) for emb in result):
- if len(result) == len(batch_texts):
- all_embeddings.extend(result)
- logger.debug(f"Successfully received {len(result)} embeddings for batch.")
- break # Batch successful, move to next batch
- else:
- logger.error(f"HF Embedding API returned wrong number of embeddings for batch: Got {len(result)}, expected {len(batch_texts)}.")
- return None # Indicate failure
- elif isinstance(result, dict) and 'error' in result:
- error_msg = result['error']
- estimated_time = result.get('estimated_time')
- logger.error(f"HF Inference API embedding error on batch: {error_msg}" + (f" (Estimated time: {estimated_time}s)" if estimated_time else ""))
- return None # Indicate failure
- else:
- logger.error(f"Unexpected embedding format received on batch: Type={type(result)}. Response: {str(result)[:500]}")
- return None # Indicate failure
- except asyncio.TimeoutError:
- logger.warning(f"HF Inference API embedding request timed out after {timeout.total} seconds for batch. Retry {attempt + 1}/{retries}.")
- if attempt < retries - 1:
- await asyncio.sleep(5) # Wait a bit before retrying timeout
- continue
- else:
- logger.error("Max retries reached for embedding batch timeout.")
- return None # Indicate failure
- except aiohttp.ClientResponseError as e:
- error_body = await e.text()
- logger.error(f"HF Inference API embedding request failed on batch: Status={e.status}, Message='{e.message}'. Body: {error_body[:500]}. Retry {attempt + 1}/{retries}.")
- if attempt < retries - 1 and e.status in [500, 502, 503, 504]: # Retry on server errors
- await asyncio.sleep(5)
- continue
- else:
- logger.error("Max retries reached or non-retryable error for embedding batch.")
- return None # Indicate failure
- except Exception as e:
- logger.exception(f"Unexpected error during embedding generation on batch: {e}. Retry {attempt + 1}/{retries}.")
- if attempt < retries - 1:
- await asyncio.sleep(5)
- continue
- else:
- logger.error("Max retries reached for unexpected error during embedding batch.")
- return None # Indicate failure
- else: # This else block executes if the inner loop completes without a 'break' (i.e., all retries failed)
- logger.error(f"Failed to process embedding batch after {retries} retries.")
- return None # Indicate failure
+ # Extract visible text content in document order
+ # Remove script and style tags first so their contents are not included in the output
+ for script_or_style in soup(["script", "style"]):
+ script_or_style.extract()
+ text = soup.get_text(separator='\n') # Get the remaining text with newline separators
- await asyncio.sleep(0.1) # Small delay between batches
+ # Clean up whitespace and empty lines
+ lines = text.splitlines()
+ cleaned_lines = [line.strip() for line in lines if line.strip()]
+ extracted['full_text'] = '\n'.join(cleaned_lines)
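# A minimal demonstration of the visible-text extraction added above: script/style tags
# are stripped before get_text() so only user-visible text remains, then blank lines and
# stray whitespace are dropped. The HTML snippet is made up.
from bs4 import BeautifulSoup

html = ("<body><style>p{color:red}</style><p> Hello </p>"
        "<script>alert(1)</script><p>World</p></body>")
soup = BeautifulSoup(html, 'html.parser')
for tag in soup(["script", "style"]):
    tag.extract()                      # drop non-visible content in place
lines = soup.get_text(separator='\n').splitlines()
print('\n'.join(line.strip() for line in lines if line.strip()))
# Hello
# World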
- if len(all_embeddings) == len(texts):
- logger.info(f"Successfully generated embeddings for all {len(all_embeddings)} texts.")
- return all_embeddings
- else:
- logger.error(f"Embedding generation failed partway through. Expected {len(texts)}, got {len(all_embeddings)}.")
- return None # Indicate overall failure
-
-
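# A simplified, synchronous sketch of the batch-plus-retry embedding call removed above,
# written with requests instead of aiohttp. The endpoint shape, payload and 429/Retry-After
# handling mirror that code; the model id and token in the usage note are placeholders.
import time
import requests

HF_INFERENCE_API = "https://api-inference.huggingface.co/models"

def embed_texts(texts, model_id, hf_token, batch_size=500, retries=3):
    url = f"{HF_INFERENCE_API}/{model_id}"
    headers = {"Authorization": f"Bearer {hf_token}"}
    embeddings = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        payload = {"inputs": batch, "options": {"wait_for_model": True}}
        for attempt in range(retries):
            resp = requests.post(url, json=payload, headers=headers, timeout=180)
            if resp.status_code == 429:                      # rate limited: honour Retry-After
                time.sleep(int(resp.headers.get("Retry-After", 10)))
                continue
            resp.raise_for_status()
            result = resp.json()
            if isinstance(result, list) and len(result) == len(batch):
                embeddings.extend(result)
                break
            raise RuntimeError(f"Unexpected embedding response: {str(result)[:200]}")
        else:
            raise RuntimeError("Embedding batch failed after all retries")
        time.sleep(0.1)                                      # small pause between batches
    return embeddings

# Usage (hypothetical model and token):
# vectors = embed_texts(["first issue text", "second issue text"],
#                       "sentence-transformers/all-MiniLM-L6-v2", "hf_xxx")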
- async def generate_code_patch(self, issue_number: int, model_key: str) -> dict:
- """Generates a code patch suggestion using a selected AI model."""
- if issue_number not in self.issues:
- return {"error": f"Issue {issue_number} not found."}
- if not self.hf_token:
- return {"error": "Hugging Face token not set."}
- if model_key not in HF_MODELS or HF_MODELS.get(model_key) is None:
- return {"error": f"Invalid or unconfigured model key: {model_key}"}
- if not self.repo_local_path or not self.repo:
- return {"error": "Repository not cloned/available locally. Please scan the repository first."}
-
- issue = self.issues[issue_number]
- model_id = HF_MODELS[model_key]
- logger.info(f"Generating patch for issue {issue_number} ('{issue.get('title', 'N/A')[:50]}...') using model {model_id}")
-
- # --- Context Gathering ---
- context_str = "Context gathering failed or not available."
- context_source = "Error"
- start_time_context = time.time()
- context_data = self.precomputed_context.get(issue_number) # Use .get for safety
-
- if context_data:
- timestamp = context_data.get('timestamp', 0)
- timestamp_str = datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')
- if context_data.get("error"):
- context_str = f"Pre-computed context retrieval failed: {context_data['error']}"
- context_source = f"Pre-computed (Failed @ {timestamp_str})"
- elif context_data.get("content"):
- context_str = context_data["content"]
- num_files = len(context_data.get('files',[]))
- context_source = f"Pre-computed ({num_files} files @ {timestamp_str})"
- else:
- context_str = "Pre-computed context was empty or unavailable."
- context_source = f"Pre-computed (Empty @ {timestamp_str})"
- logger.info(f"Using pre-computed context for issue {issue_number} (Source: {context_source})")
- else:
- logger.info(f"No pre-computed context found for issue {issue_number}, computing now.")
- context_source = "Computed On-Demand"
- # Compute context on demand and store it
- context_result = await self._get_code_context(issue)
- self.precomputed_context[issue_number] = {
- "content": context_result.get("content"),
- "files": context_result.get("files", []),
- "error": context_result.get("error"),
- "timestamp": time.time()
- }
- if "error" in context_result and context_result["error"]:
- context_str = f"Error retrieving context: {context_result['error']}"
- context_source += " (Error)"
- else:
- context_str = context_result.get("content", "No specific context found.")
- context_source += f" ({len(context_result.get('files',[]))} files)"
-
- context_load_duration = time.time() - start_time_context
- logger.info(f"Computed context on-demand in {context_load_duration:.2f}s. Source: {context_source}")
-
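# A stripped-down sketch of the timestamped context cache pattern used above: a result is
# reused when already present, otherwise computed once, stamped, and stored so later patch
# requests skip the expensive lookup. compute_context() is a hypothetical stand-in for the
# real _get_code_context call.
import time

_context_cache: dict = {}

def compute_context(issue_number: int) -> dict:
    # placeholder for gathering relevant files/snippets for the issue
    return {"content": f"context for issue {issue_number}", "files": [], "error": None}

def get_context(issue_number: int) -> dict:
    entry = _context_cache.get(issue_number)
    if entry is None:                       # cache miss: compute once and remember when
        result = compute_context(issue_number)
        entry = {**result, "timestamp": time.time()}
        _context_cache[issue_number] = entry
    return entry

print(get_context(42)["content"])                                     # computed on demand
print(get_context(42)["timestamp"] == get_context(42)["timestamp"])   # served from cache -> True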
- # --- Get Pre-computed Info ---
- summary_text = self._get_precomputed_text(issue_number, self.precomputed_summaries, "summary", "Summary")
- missing_info_text = self._get_precomputed_text(issue_number, self.precomputed_missing_info, "info_needed", "Missing Info Analysis")
- analysis_text = self._get_precomputed_text(issue_number, self.precomputed_analysis, "hypothesis", "Preliminary Analysis")
- duplicate_info = self._get_duplicate_info_text(issue_number)
-
- # --- Enhanced Prompt ---
- # Added clear delimiters and instructions for the AI
- prompt = f"""You are an expert software engineer AI assistant generating a minimal `diff` code patch to fix a GitHub issue.
-
-
Select an issue from the 'Issue Board' tab.
" + return str(output_path) + except Exception as e: + logger.error(f"QR generation error: {e}") + return "" - try: - issue = manager.issues[issue_num] - # Use markdown2 for rendering issue body - html_body = markdown2.markdown( - issue.get('body', '*No description provided.*') or '*No description provided.*', - extras=["fenced-code-blocks", "tables", "strike", "task_list", "code-friendly", "html-classes", "nofollow", "spoiler"] - ) +def generate_qr_codes(data: Union[str, Dict, List], combined: bool = True) -> List[str]: + """Generate QR codes with enhanced visual appeal and metadata""" + # Assume 'data' here is the list of dictionaries produced by process_inputs + if not isinstance(data, list): + logger.error("generate_qr_codes received data that is not a list.") + return [] - # Generate HTML for labels - labels_html = ' '.join(f'{gr.Textbox.sanitize_html(l)}' for l in issue.get('labels', [])) or 'None' - - # Generate HTML for status indicators - status_indicators = [] - if issue_num in manager.stale_issues: - status_indicators.append(f"[Stale]") - if issue_num in manager.high_priority_candidates: - severity = manager._determine_severity(issue.get('labels', [])) - if severity == "Critical": color, bgcolor = "#ef4444", "#fee2e2" - elif severity == "High": color, bgcolor = "#f97316", "#ffedd5" - else: color, bgcolor = "#c2410c", "#fffbeb" # Should ideally not happen if logic works - status_indicators.append(f"[{severity}]") - status_html = " ".join(status_indicators) - - # --- Get Precomputed Data with Helper --- - summary_text = manager._get_precomputed_text(issue_num, manager.precomputed_summaries, "summary", "Summary") - missing_info_text = manager._get_precomputed_text(issue_num, manager.precomputed_missing_info, "info_needed", "Missing Info") - analysis_text = manager._get_precomputed_text(issue_num, manager.precomputed_analysis, "hypothesis", "Analysis") - duplicate_text = manager._get_duplicate_info_text(issue_num) # This already includes formatting and links - - # --- Format AI Sections --- - ai_sections = [] - # Only show sections if they have content or an error, not just pending state - # Use the presence of icons (❌, ⏳, ℹ️) to determine if it's just a status message - if not summary_text.startswith("⏳") and not summary_text.startswith("(ℹ️"): - color = "#f0e6ff" if not summary_text.startswith("❌") else "#fee2e2" - border_color = "#ddd6fe" if not summary_text.startswith("❌") else "#fecaca" - ai_sections.append(f""" -Error generating preview for issue {issue_num}. Check logs.
" - - async def get_ai_suggestion_wrapper(issue_num: Optional[int], model_key: str, progress=gr.Progress()) -> str: - """UI wrapper for getting AI suggestions, handles state and progress.""" - progress(0, desc="Preparing request...") - if issue_num is None or issue_num not in manager.issues: - return "⚠️ Error: Please select a valid issue first." - if not manager.hf_token: - return "🔒 Error: Hugging Face Token is not configured." - if model_key not in HF_MODELS or HF_MODELS.get(model_key) is None: - return f"⚠️ Error: Invalid or unconfigured model key selected: {model_key}" - - issue = manager.issues[issue_num] - issue_hash = manager._get_issue_hash(issue) - logger.info(f"Requesting suggestion for issue {issue_num} (hash: {issue_hash}) using model {model_key}.") - - try: - progress(0.3, desc=f"Querying {model_key}...") - # The cached_suggestion method handles the cache lookup and actual generation - suggestion = await manager.cached_suggestion(issue_hash, model_key) - progress(1, desc="Suggestion received.") - - if suggestion.lower().startswith("error:"): - return f"⚠️ {suggestion}" - else: - # Format the output clearly - return f"**💡 Suggestion based on {model_key}:**\n\n---\n{suggestion}" - except Exception as e: - logger.exception(f"Error in get_ai_suggestion_wrapper for issue {issue_num}: {e}") - return f"❌ An unexpected error occurred while getting the suggestion: {e}" - - async def get_ai_patch_wrapper(issue_num: Optional[int], model_key: str, progress=gr.Progress()) -> str: - """UI wrapper for getting AI patches, handles state and progress.""" - progress(0, desc="Preparing request...") - if issue_num is None or issue_num not in manager.issues: - return "⚠️ Error: Please select a valid issue first." - if not manager.hf_token: - return "🔒 Error: Hugging Face Token is not configured." - if not manager.repo: - return "❌ Error: Repository not loaded. Please scan the repository first." - if model_key not in HF_MODELS or HF_MODELS.get(model_key) is None: - return f"⚠️ Error: Invalid or unconfigured model key selected: {model_key}" - - logger.info(f"Requesting patch for issue {issue_num} using model {model_key}.") - progress(0.1, desc="Gathering code context (using cache if available)...") - try: - # Context is gathered inside generate_code_patch, which uses the precomputed_context cache - progress(0.4, desc=f"Querying {model_key} for patch...") - result = await manager.generate_code_patch(issue_num, model_key) - progress(1, desc="Patch result received.") - - if "error" in result: - logger.error(f"Patch generation failed for issue {issue_num}: {result['error']}") - return f"**❌ Error generating patch:**\n\n{result['error']}" - else: - model_used = result.get('model_used', model_key) - explanation = result.get('explanation', '(No explanation provided)') - patch_content = result.get('patch') - status_msg = result.get('status') # Get specific status if available - - header = f"**🩹 Patch Suggestion from {model_used}:**" - if status_msg: - header = f"**🩹 Patch Generation Result from {model_used}:** ({status_msg})" - - if patch_content: - # Escape backticks in patch content for markdown code block - patch_content_sanitized = patch_content.replace('`', '\\`') - logger.info(f"Successfully generated patch for issue {issue_num} using {model_used}.") - return f"""{header} -**Explanation:** -{explanation} ---- -**Patch:** -```diff -{patch_content_sanitized} -```""" - else: - logger.warning(f"AI provided explanation but no patch for issue {issue_num}. 
Explanation: {explanation}") - return f"""{header} -**Explanation:** -{explanation} ---- -**(No valid diff block generated)**""" - - except Exception as e: - logger.exception(f"Error in get_ai_patch_wrapper for issue {issue_num}: {e}") - return f"❌ An unexpected error occurred while generating the patch: {e}" - - async def handle_issue_select(evt: gr.SelectData): - """ - Handles issue selection in the Dataframe: updates preview, loads code context into editor. - Reads the selected issue ID from the event data. - Returns updates for multiple components. - """ - # Default state for deselection or error - default_updates = { - # Update the hidden state element first - "selected_issue_id_hidden": gr.update(value=""), - "issue_preview_html": gr.update(value="Select an issue from the 'Issue Board' tab.
"), - # Clear the code editor - "code_edit_component": gr.update(value={"placeholder.txt": "# Select an issue to load relevant code context."}, interactive=True, language="text"), - "ai_output_display": gr.update(value="*AI suggestions and patches will appear here after selecting an issue and action.*"), - "copy_patch_btn": gr.update(visible=False), # Hide copy button on selection change - } + try: + file_processor = EnhancedFileProcessor() # Use the enhanced processor for chunking + paths = [] + + if combined: + # Process combined data + chunks = file_processor.chunk_data(data) # chunk_data works on the list of dicts + if not chunks: + logger.warning("No chunks generated for combined data.") + return [] + for i, chunk in enumerate(chunks): + filename = f'combined_qr_{int(time.time())}_{i+1}_of_{len(chunks)}.png' + qr_path = generate_stylish_qr( + data=chunk, # Pass the chunk dictionary + filename=filename, + fill_color="#1a365d", # Deep blue + back_color="#ffffff" + ) + if qr_path: + paths.append(qr_path) + else: + logger.warning(f"Failed to generate QR for chunk {i+1}/{len(chunks)}.") + else: + # Process individual items (each dictionary in the list) + if data: # Ensure data is not empty + for idx, item in enumerate(data): + chunks = file_processor.chunk_data(item) # chunk_data works on individual dict + if not chunks: + logger.warning(f"No chunks generated for item {idx+1}.") + continue + for chunk_idx, chunk in enumerate(chunks): + filename = f'item_{idx+1}_chunk_{chunk_idx+1}_of_{len(chunks)}_{int(time.time())}.png' + qr_path = generate_stylish_qr( + data=chunk, # Pass the chunk dictionary + filename=filename, + fill_color="#1a365d", # Deep blue + back_color="#ffffff" + ) + if qr_path: + paths.append(qr_path) + else: + logger.warning(f"Failed to generate QR for item {idx+1} chunk {chunk_idx+1}/{len(chunks)}.") + else: + logger.warning("No items in data list to process individually.") - # Check if a row was actually selected and has a value - if evt.index is None or not hasattr(evt, 'value') or not evt.value or evt.value[0] is None: - logger.info("Issue deselected or invalid selection event.") - # Return default updates to clear the UI - return default_updates + logger.info(f"Generated {len(paths)} QR codes.") + return paths - try: - # The first column (index 0) is the Issue ID - selected_id = int(evt.value[0]) - logger.info(f"Issue selected via Dataframe: ID {selected_id}") - - if selected_id not in manager.issues: - logger.error(f"Selected issue ID {selected_id} not found in manager's issue list.") - return { - **default_updates, - "issue_preview_html": gr.update(value=f"Error: Issue {selected_id} not found in the current list. Try re-scanning.
"), - "selected_issue_id_hidden": gr.update(value=""), # Ensure hidden state is cleared + except Exception as e: + logger.error(f"QR code generation error: {e}") + return [] + +# Keep the Gradio UI definition and main function as they are, +# as the changes are internal to the processing classes and the +# process_inputs function already handles calling them and getting +# the combined list of results. + +def create_modern_interface(): + """Create a modern and visually appealing Gradio interface""" + + # Modern CSS styling + css = """ + /* Modern color scheme */ + :root { + --primary-color: #1a365d; + --secondary-color: #2d3748; + --accent-color: #4299e1; + --background-color: #f7fafc; + --success-color: #48bb78; + --error-color: #f56565; + --warning-color: #ed8936; + } + /* Container styling */ + .container { + max-width: 1200px; + margin: auto; + padding: 2rem; + background-color: var(--background-color); + border-radius: 1rem; + box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); + } + /* Component styling */ + .input-container { + background-color: white; + padding: 1.5rem; + border-radius: 0.5rem; + border: 1px solid #e2e8f0; + margin-bottom: 1rem; + } + /* Button styling */ + .primary-button { + background-color: var(--primary-color); + color: white; + padding: 0.75rem 1.5rem; + border-radius: 0.375rem; + border: none; + cursor: pointer; + transition: all 0.2s; + } + .primary-button:hover { + background-color: var(--accent-color); + transform: translateY(-1px); + } + /* Status messages */ + .status { + padding: 1rem; + border-radius: 0.375rem; + margin: 1rem 0; + } + .status.success { background-color: #f0fff4; color: var(--success-color); } + .status.error { background-color: #fff5f5; color: var(--error-color); } + .status.warning { background-color: #fffaf0; color: var(--warning-color); } + /* Gallery styling */ + .gallery { + display: grid; + grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); + gap: 1rem; + padding: 1rem; + background-color: white; + border-radius: 0.5rem; + border: 1px solid #e2e8f0; + } + .gallery img { + width: 100%; + height: auto; + border-radius: 0.375rem; + transition: transform 0.2s; + } + .gallery img:hover { + transform: scale(1.05); + } + /* QR Code Viewport Styling */ + .viewport-container { + display: grid; + gap: 0.5rem; + padding: 1rem; + background-color: white; + border-radius: 0.5rem; + border: 1px solid #e2e8f0; + margin-top: 1rem; + } + .viewport-item { + display: flex; + flex-direction: column; + align-items: center; + } + .viewport-item img { + width: 100%; + height: auto; + border-radius: 0.375rem; + transition: transform 0.2s; + max-width: 150px; /* Adjust as needed */ + max-height: 150px; /* Adjust as needed */ + } + """ + # Create interface with modern design + with gr.Blocks(css=css, title="Advanced Data Processor & QR Generator") as interface: + interface.head += """ + + """ + qr_code_paths = gr.State([]) gr.Markdown(""" -Collaborative Issue Resolution Powered by AI
-Select an issue from the 'Issue Board' tab.
", elem_id="issue_preview_div") - - with gr.Accordion("🛠️ AI Assistance Tools", open=True, elem_id="ai_tools_accordion"): - suggest_btn = gr.Button("🧠 Suggest Resolution Steps", icon="💡", elem_id="suggest_btn", interactive=bool(HF_MODELS)) - patch_btn = gr.Button("📝 Generate Code Patch", icon="🩹", elem_id="patch_btn", interactive=bool(HF_MODELS)) - - gr.Markdown("### AI Output") - # Use a Markdown component with scrollability - ai_output_display = gr.Markdown(value="*AI suggestions and patches will appear here...*", elem_id="ai_output_md") - with gr.Row(): - copy_patch_btn = gr.Button("📋 Copy Patch", elem_id="copy_patch_btn", visible=False) # Initially hidden - clear_ai_output_btn = gr.Button("🧹 Clear Output", elem_id="clear_ai_output_btn") - - - with gr.Column(scale=2, min_width=600): - gr.Markdown("### Collaborative Code Editor (Context-Aware)") - gr.Markdown("⚠️ Warning: Real-time collaborative editing is experimental and may lose data with simultaneous edits. Use with caution and save work frequently!
") - # Initialize with placeholder content - code_edit_component = code_editor( - label="Code Context / Editor", - language="python", # Default language, can be changed based on file extension in JS - interactive=True, - elem_id="code_editor_component", - value={"placeholder.txt": "# Select an issue to load relevant code context."} - ) - - with gr.Tab("📈 Analytics", id="analytics", elem_id="tab-analytics"): - gr.Markdown("### Repository Analytics") - with gr.Row(equal_height=False): - with gr.Column(scale=1): - gr.Markdown("#### Issue Severity Distribution") - # Initialize with empty plot, will be updated after crawl - analytics_severity_plot = gr.Plot(label="Severity Distribution (Analytics)", elem_id="analytics_severity_plot", value=manager._generate_stats_plot({})) - with gr.Column(scale=1): - gr.Markdown("#### Issue Cluster Analysis (Top Clusters)") - cluster_info_df = gr.Dataframe( - headers=["Cluster ID", "Issue Count", "Top Keywords (Example)"], - datatype=["number", "number", "str"], - value=[["Scan a repository to see cluster data.", 0, ""]], # Initial placeholder data - label="Issue Clusters", elem_id="cluster_info_df", - interactive=False, # Make this dataframe non-interactive - row_count=(5, "dynamic"), - col_count=(3, "fixed"), - wrap=True - ) - gr.Markdown("*(Analytics update after scanning the repository. More detailed analytics could be added.)*") - - # --- Event Handlers --- - # The crawl button updates the issue list, stats plot, status, and analytics plot - crawl_btn.click( - fn=manager.crawl_issues, - inputs=[repo_url, github_token, hf_token], - outputs=[issue_list, stats_plot, status_output, analytics_severity_plot], - api_name="crawl_issues", - show_progress="full" - ).then( - # After crawl_issues completes, update the cluster analytics dataframe - fn=lambda: update_cluster_analytics(manager), - inputs=[], - outputs=[cluster_info_df] - ) - - # The issue list selection updates the preview, code editor, AI output area, and hidden state - issue_list.select( - fn=handle_issue_select, - # Pass the event data, which contains the selected row's value (including ID) - inputs=[gr.SelectData()], - outputs=[selected_issue_id_hidden, issue_preview_html, code_edit_component, ai_output_display, copy_patch_btn], - show_progress="minimal", - # Trigger the JS function to update the tracked issue ID and editor listeners - # This is handled by the MutationObserver and reading the hidden input value in JS - ) - - # AI Suggestion button - suggest_btn.click( - fn=get_ai_suggestion_wrapper, - inputs=[selected_issue_id_hidden, model_select], # Read selected issue ID from hidden state - outputs=[ai_output_display], - api_name="suggest_resolution", - show_progress="full" - ).then( - # After getting suggestion, hide the copy patch button - fn=lambda: gr.update(visible=False), - inputs=[], - outputs=[copy_patch_btn] - ) - - # AI Patch button - patch_btn.click( - fn=get_ai_patch_wrapper, - inputs=[selected_issue_id_hidden, model_select], # Read selected issue ID from hidden state - outputs=[ai_output_display], - api_name="generate_patch", - show_progress="full" - ).then( - # After getting patch, check if it contains a diff block and show the copy button if so - # Use a lambda to check the output text - fn=lambda output_text: gr.update(visible="```diff" in output_text), - inputs=[ai_output_display], - outputs=[copy_patch_btn] + with gr.Tab("📝 URL Processing"): + url_input = gr.Textbox( + label="Enter URLs (comma or newline separated)", + lines=5, + 
placeholder="https://example1.com\nhttps://example2.com", + value="" + ) + with gr.Tab("📁 File Input"): + file_input = gr.File( + label="Upload Files", + file_types=None, # Accept all file types + file_count="multiple" + ) + with gr.Tab("📋 JSON Input"): + text_input = gr.TextArea( + label="Direct JSON Input", + lines=15, + placeholder="Paste your JSON data here...", + value="" + ) + with gr.Row(): + example_btn = gr.Button("📝 Load Example", variant="secondary") + clear_btn = gr.Button("🗑️ Clear", variant="secondary") + with gr.Row(): + combine_data = gr.Checkbox( + label="Combine all data into sequence", + value=True, + info="Generate sequential QR codes for combined data" + ) + process_btn = gr.Button( + "🔄 Process & Generate QR", + variant="primary" + ) + # Output components + output_json = gr.JSON(label="Processed Data") + output_gallery = gr.Gallery( + label="Generated QR Codes", + columns=3, + height=400, + show_label=True ) - - # Clear AI Output button - clear_ai_output_btn.click( - fn=lambda: ["*AI suggestions and patches will appear here...*", gr.update(visible=False)], - inputs=[], - outputs=[ai_output_display, copy_patch_btn] + output_text = gr.Textbox( + label="Processing Status", + interactive=False ) - # --- JavaScript for WebSocket Communication and UI Interaction --- - def web_socket_js(ws_port, gradio_port): - # Generate a unique client ID on page load - # Note: This JS is generated *once* when the Gradio app is created. - # The actual client ID is generated and logged when the JS runs in the browser. - # The Python-side logging here is just for context during app startup. - temp_client_id_placeholder = f"client_{hashlib.sha1(os.urandom(16)).hexdigest()[:8]}" - logger.info(f"Generating JS with placeholder Client ID: {temp_client_id_placeholder}") - - return f""" - - """ - # The _js parameter injects the JavaScript code into the Gradio page - demo_app.load(_js=web_socket_js(WS_PORT, GRADIO_PORT), fn=None, inputs=None, outputs=None) - - return demo_app - -# ========== WebSocket Server Logic ========== -async def handle_ws_connection(websocket: WebSocketServerProtocol, path: str, manager: IssueManager): - """Handles incoming WebSocket connections and messages for collaboration.""" - # Generate a client ID and attach it to the websocket object - client_id = f"client_{hashlib.sha1(os.urandom(16)).hexdigest()[:8]}" - setattr(websocket, 'client_id', client_id) - remote_addr = websocket.remote_address - logger.info(f"WebSocket client connected: {remote_addr} assigned ID {client_id}") - - # Add the new client to the list of active clients - manager.ws_clients.append(websocket) - logger.info(f"Client list size: {len(manager.ws_clients)}") + # Initialize enabledStates if it's empty (first load) + if not enabled_states and paths: + enabled_states = list(range(num_qr_codes)) # Enable all by default on first view - try: - # Wait for the first message (expected to be 'join') or other messages - async for message in websocket: - try: - # Ensure message is bytes or string before decoding/parsing - if isinstance(message, bytes): - message = message.decode('utf-8') - if not isinstance(message, str): - logger.warning(f"Received non-string/bytes message from {client_id}: {message!r}") - continue - - data = json.loads(message) - msg_type = data.get("type") - # Use the client_id assigned by the server, not one sent by the client, for security - sender_id = client_id - - logger.debug(f"Received WS message type '{msg_type}' from {sender_id} ({remote_addr})") - - if msg_type == "join": - # Store 
collaborator info when they explicitly join - # Use the name provided by the client, default if not provided - client_name = data.get("name", f"User_{sender_id[:4]}") - # Ensure name is a string and not too long - client_name = str(client_name)[:50] if client_name else f"User_{sender_id[:4]}" - - if sender_id in manager.collaborators: - # Update existing entry if client reconnects or sends join again - manager.collaborators[sender_id].update({"name": client_name, "status": "Connected"}) - logger.info(f"Client {sender_id} ({client_name}) updated status to Connected.") - else: - # Add new entry - manager.collaborators[sender_id] = {"name": client_name, "status": "Connected"} - logger.info(f"Client {sender_id} ({client_name}) joined collaboration. Current collaborators: {list(manager.collaborators.keys())}") - - # Broadcast updated status list to all clients - await manager.broadcast_collaboration_status_once() - - elif msg_type == "code_update": - issue_num = data.get("issue_num") - delta_str = data.get("delta") - # Ensure data is valid and sender ID matches - if issue_num is not None and delta_str is not None and sender_id == client_id: - # FIX: Corrected call - handle_ws_connection is already async, just await the async manager method - await manager.handle_code_editor_update(int(issue_num), delta_str, sender_id) - else: - logger.warning(f"Invalid or unauthorized 'code_update' message from {sender_id}: Missing issue_num/delta or sender mismatch. Data: {str(data)[:200]}") - - elif msg_type == "status_update": - status = data.get("status", "Idle") - # Only update status for the client ID that sent the message - if sender_id == client_id: - # Ensure status is a string and not too long - status = str(status)[:100] if status else "Idle" - - if sender_id in manager.collaborators: - manager.collaborators[sender_id]["status"] = status - # Broadcast updated status list - await manager.broadcast_collaboration_status_once() - else: - # This might happen if 'join' wasn't received first, or state is out of sync - logger.warning(f"Received status update from client {sender_id} not in collaborator list. Adding/Updating with default name.") - manager.collaborators[sender_id] = {"name": f"User_{sender_id[:4]} (Re-added)", "status": status} - await manager.broadcast_collaboration_status_once() - else: - logger.warning(f"Unauthorized status update from {sender_id} attempting to update status for another client. Ignoring.") + for i, path in enumerate(paths): + is_enabled = i in enabled_states + border = "border: 2px solid green;" if is_enabled else "border: 2px solid lightgray;" + opacity = "opacity: 1.0;" if is_enabled else "opacity: 0.5;" + viewport_html += f' ' + viewport_html += '' + return viewport_html - else: - logger.warning(f"Unknown WebSocket message type '{msg_type}' received from {sender_id} ({remote_addr}). 
Message: {str(message)[:200]}") + def process_inputs(urls, files, text, combine): + """Process all inputs and generate QR codes""" + results = [] + processing_status_messages = [] - except json.JSONDecodeError: - logger.error(f"Received invalid JSON over WebSocket from {client_id} ({remote_addr}): {str(message)[:200]}...") - except Exception as e: - logger.exception(f"Error processing WebSocket message from {client_id} ({remote_addr}): {e}") + url_processor = EnhancedURLProcessor() + file_processor = EnhancedFileProcessor() - # Catch standard socket exceptions for disconnects - except (ConnectionClosed, ConnectionClosedOK, ConnectionAbortedError, ConnectionResetError, WebSocketException) as e: - logger.info(f"WebSocket client {client_id} ({remote_addr}) disconnected: Type={type(e).__name__}, Code={getattr(e, 'code', 'N/A')}, Reason='{getattr(e, 'reason', 'N/A')}'") - except Exception as e: - logger.exception(f"Unexpected error in WebSocket handler for {client_id} ({remote_addr}): {e}") - finally: - # Ensure cleanup happens regardless of how the loop exits - logger.info(f"Cleaning up connection for client {client_id} ({remote_addr})") - # Pass the websocket object itself for removal - # Schedule this in the main loop if not already in it - if manager.main_loop.is_running(): - manager.main_loop.call_soon_threadsafe(manager.remove_ws_client, websocket) - else: - logger.warning("Main loop not running, cannot schedule final client removal.") + try: + # Process JSON input + if text and text.strip(): + try: + json_data = json.loads(text) + # Wrap direct JSON input in a dictionary for consistency with file/URL output structure + results.append({ + 'source': 'json_input', + 'extracted_data': json_data, + 'timestamp': datetime.now().isoformat(), + 'processing_notes': ['Parsed from direct JSON input.'] + }) + processing_status_messages.append("✅ Successfully parsed direct JSON input.") + except json.JSONDecodeError as e: + processing_status_messages.append(f"❌ Invalid JSON format in text input: {str(e)}") + except Exception as e: + processing_status_messages.append(f"❌ Error processing direct JSON input: {str(e)}") + + + # Process URLs + if urls and urls.strip(): + url_list = re.split(r'[,\n]', urls) + url_list = [url.strip() for url in url_list if url.strip()] + for url in url_list: + validation = url_processor.validate_url(url) + if validation['is_valid']: + processing_status_messages.append(f"🌐 Fetching URL: {url}...") + content_result = url_processor.fetch_content(url) + if content_result: + results.append(content_result) + processing_status_messages.append(f"✅ Fetched and processed URL: {url}") + else: + processing_status_messages.append(f"❌ Failed to fetch/process URL: {url}") + if validation['details'].get('final_url'): + processing_status_messages[-1] += f" (Redirected to {validation['details']['final_url']})" + else: + processing_status_messages.append(f"⚠️ Skipping invalid URL: {url} ({validation['message']})") + + # Process files + if files: + for file in files: + processing_status_messages.append(f"📁 Processing file: {file.name}...") + file_results = file_processor.process_file(file) + if file_results: + results.extend(file_results) + processing_status_messages.append(f"✅ Processed file: {file.name}") + else: + processing_status_messages.append(f"❌ Failed to process file: {file.name}") + # Generate QR codes + qr_paths = [] + final_json_output = None -async def start_websocket_server(manager: IssueManager, port: int): - """Starts the WebSocket server.""" - # The handler needs the manager 
instance - handler_with_manager = lambda ws, path: handle_ws_connection(ws, path, manager) - server = None - # Use a Future to signal when the server should stop - stop_event = asyncio.Future() + if results: + # Use the collected results (list of dicts) for QR code generation + qr_paths = generate_qr_codes(results, combine) + final_json_output = results # Show the structured data in the JSON output box - # Add a method to signal the server to stop from outside the async function - # This is needed for graceful shutdown from the main thread - manager.stop_ws_server = lambda: stop_event.set_result(True) if not stop_event.done() else None + if qr_paths: + processing_status_messages.append(f"✅ Successfully generated {len(qr_paths)} QR codes.") + else: + processing_status_messages.append("❌ Failed to generate QR codes.") - try: - # Start the websockets server - server = await websockets.serve( - handler_with_manager, - "0.0.0.0", # Listen on all interfaces - port, - ping_interval=20, # Send ping every 20 seconds - ping_timeout=20 # Close connection if no pong received within 20 seconds - ) - logger.info(f"WebSocket server started successfully on ws://0.0.0.0:{port}") - - # Wait until the stop_event is set (signaled from main thread or shutdown) - await stop_event - - except OSError as e: - logger.error(f"Failed to start WebSocket server on port {port}: {e}. Is the port already in use?") - # Signal the main loop to stop gracefully if possible, or re-raise - # If this happens during startup, main thread will catch it and trigger shutdown. - # No need to explicitly call shutdown_handler here, the main thread's except/finally will handle it. - raise SystemExit(f"WebSocket Port {port} unavailable. Application cannot start.") from e - except asyncio.CancelledError: - logger.info("WebSocket server task cancelled.") - except Exception as e: - logger.exception(f"An unexpected error occurred starting or running the WebSocket server: {e}") - # Ensure the stop event is set so the await completes - if not stop_event.done(): stop_event.set_result(True) - raise # Re-raise to potentially stop the main loop - - finally: - if server: - logger.info("Attempting to stop WebSocket server...") - server.close() # Signal server to close - await server.wait_closed() # Wait for server to finish closing connections - logger.info("WebSocket server stopped.") - # Ensure the stop event is completed even if there was an error before await - if not stop_event.done(): - stop_event.set_result(True) - - -def run_webhook_server(manager: IssueManager, port: int, main_loop: asyncio.AbstractEventLoop): - """Starts the HTTP webhook server in a separate thread.""" - # Pass the manager instance and the main asyncio loop reference to the handler class - WebhookHandler.manager_instance = manager - WebhookHandler.main_loop = main_loop - httpd = None - - # Add a method to signal the webhook server thread to stop - # This is needed for graceful shutdown from the main thread - # Use a simple flag and check it periodically, or rely on httpd.shutdown() - # httpd.shutdown() is the standard way for BaseHTTPRequestHandler servers - manager.stop_webhook_server = lambda: httpd.shutdown() if httpd else None + else: + processing_status_messages.append("⚠️ No valid content collected from inputs.") - try: - server_address = ("0.0.0.0", port) - # Create the HTTP server instance - httpd = HTTPServer(server_address, WebhookHandler) - logger.info(f"Webhook HTTP server starting on http://0.0.0.0:{port}") - # Start serving requests (this call blocks the 
thread) - httpd.serve_forever() - except OSError as e: - logger.error(f"Failed to start Webhook server on port {port}: {e}. Is the port already in use?") - # If the server fails to start, signal the main loop to stop - if main_loop.is_running(): - # Use call_soon_threadsafe as this is in a different thread - main_loop.call_soon_threadsafe(main_loop.stop) - except Exception as e: - logger.exception(f"Unexpected error in Webhook server thread: {e}") - # If an unexpected error occurs, signal the main loop to stop - if main_loop.is_running(): - main_loop.call_soon_threadsafe(main_loop.stop) - finally: - if httpd: - logger.info("Shutting down Webhook HTTP server...") - # This method stops the serve_forever() loop - # It needs to be called from a different thread than the one running serve_forever() - # The manager.stop_webhook_server lambda is designed for this. - # If this finally block is reached due to an exception *within* serve_forever(), - # httpd.shutdown() might not be needed or might fail, but calling it is safer. - try: - httpd.shutdown() # Signal the server to stop accepting new connections and finish current ones - except Exception as e: - logger.warning(f"Error during httpd.shutdown(): {e}") - try: - httpd.server_close() # Close the server socket except Exception as e: - logger.warning(f"Error during httpd.server_close(): {e}") + logger.error(f"Overall processing error in process_inputs: {e}") + processing_status_messages.append(f"❌ An unexpected error occurred during processing: {str(e)}") - logger.info("Webhook server thread finished.") + return ( + final_json_output, + [str(path) for path in qr_paths], # Gradio Gallery expects list of paths (strings) + "\n".join(processing_status_messages) # Join status messages + ) + def on_qr_generation(qr_paths_list): + # When QR codes are generated, update the state with the list of paths + # and initialize the enabled_qr_codes state with all indices enabled + num_qrs = len(qr_paths_list) + initial_enabled_states = list(range(num_qrs)) + return qr_paths_list, initial_enabled_states # Return paths list and initial enabled state + + + # Link events + example_btn.click(load_example, inputs=[], outputs=text_input) + clear_btn.click(clear_input, inputs=[], outputs=[url_input, file_input, text_input]) # Clear all inputs + + process_btn.click( + process_inputs, + inputs=[url_input, file_input, text_input, combine_data], + outputs=[output_json, output_gallery, output_text] + ).then( # Chain a .then() to update the QR paths state and trigger viewport update + on_qr_generation, + inputs=[output_gallery], # Get the list of paths from the gallery output + outputs=[qr_code_paths, enabled_qr_codes] # Update the state variables + ) -# ========== Main Execution ========== -if __name__ == "__main__": - print("--- Acknowledging potential TensorFlow/CUDA warnings ---") - print("If you see warnings like 'Could not load dynamic library...' 
or related to CUDA/GPU,") - print("they are often harmless if you are not using a local GPU-accelerated model.") - print("Hugging Face Inference API calls run remotely and do not require local GPU setup.") - print("--- Starting Application ---") - - # Get or create the event loop for the main thread - # This loop will run the async tasks (WS server, idle tasks, broadcast) - try: - loop = asyncio.get_event_loop() - logger.info(f"Using existing asyncio loop in main thread: {id(loop)}") - except RuntimeError: - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - logger.info(f"Created and set new asyncio loop in main thread: {id(loop)}") - - manager = IssueManager() - - # Start the webhook server in a separate thread - webhook_thread = threading.Thread( - target=run_webhook_server, - args=(manager, WEBHOOK_PORT, loop), # Pass the main loop to the webhook thread - name="WebhookServerThread", - daemon=True # Allow the main program to exit even if this thread is running - ) - webhook_thread.start() - - # Create the Gradio UI - app = create_ui(manager) - - # Start the WebSocket server as an asyncio task in the main loop - ws_task = loop.create_task(start_websocket_server(manager, WS_PORT)) - - # Start the manager's background tasks (idle processing, broadcast) - # These tasks are created in the manager's loop (which is the main loop here) - manager.start_idle_processing() - manager.start_broadcast_loop() - - - def shutdown_handler(signum, frame): - """Graceful shutdown logic.""" - logger.info(f"Signal {signum} received. Initiating shutdown...") - # Signal the WebSocket server to stop - if hasattr(manager, 'stop_ws_server') and manager.stop_ws_server: - manager.stop_ws_server() - # Signal the webhook server to stop - if hasattr(manager, 'stop_webhook_server') and manager.stop_webhook_server: - manager.stop_webhook_server() - # Stop manager's internal async tasks - manager.stop_idle_processing() - manager.stop_broadcast_loop() - - # Stop the main asyncio loop - if loop.is_running(): - logger.info("Stopping main asyncio loop...") - # Use call_soon_threadsafe because signal handlers run in a different context - loop.call_soon_threadsafe(loop.stop) - else: - logger.warning("Main asyncio loop not running during shutdown handler.") + # The viewport tab's select event will trigger update_viewport to render the grid + viewport_tab.select(update_viewport, inputs=[qr_code_paths, enabled_qr_codes], outputs=[viewport_output]) + # Add helpful documentation + gr.Markdown(""" + ### 🚀 Features + - **Enhanced URL Scraping**: Extracts HTML text, title, meta description, links, and attempts parsing JSON/XML from URLs based on content type. + - **Advanced File Processing**: Reads various text-based files (.txt, .md, .log etc.), HTML, XML, CSV, and attempts text extraction from common documents (.pdf, .docx, .rtf, .odt - *requires extra dependencies*). + - **Smart JSON Handling**: Parses valid JSON from direct input, files (.json or content), or URLs. + - **Archive Support**: Extracts and processes supported files from .zip, .tar, .gz archives. + - **Robust Encoding Detection**: Uses `chardet` for reliable character encoding identification. + - **Structured Output**: Provides a consistent JSON output format containing raw content (if applicable), extracted data, and processing notes for each processed item. + - **Sequential QR Codes**: Maintains data integrity across multiple codes by chunking the combined/individual processed data. 
+ - **QR Code Viewport**: Visualize generated QR codes in a sequenced square grid with options to enable/disable individual codes for selective scanning/sharing. + - **Modern Design**: Clean, responsive interface with visual feedback. + ### 💡 Tips + 1. **URLs**: Enter multiple URLs separated by commas or newlines. The processor will attempt to fetch and structure the content based on its type. + 2. **Files**: Upload any type of file. The processor will attempt to handle supported text-based files, archives (.zip, .tar, .gz), and specific document/structured formats. + 3. **JSON**: Use the "Direct JSON Input" tab for pasting JSON data. The system also tries to detect JSON content in file uploads and URLs. Use the "Load Example" button to see a sample JSON structure. + 4. **Dependencies**: Processing PDF, DOCX, RTF, and ODT files requires installing optional Python libraries. Check the console logs for warnings if a library is missing. + 5. **QR Codes**: Choose whether to "Combine all data into sequence" or generate separate sequences for each input item. + 6. **Processing**: Monitor the "Processing Status" box for real-time updates and notes about errors or processing steps. + 7. **Output**: The "Processed Data" JSON box shows the structured data extracted from your inputs. The "Generated QR Codes" gallery shows the QR code images. + ### 🎨 Output Details + - The "Processed Data" JSON will be a list of dictionaries. Each dictionary represents one processed input (URL or file). + - Each item will have keys like `source`, `filename` (for files), `url` (for URLs), `mime_type`, `raw_content` (if readable), `extracted_data`, and `processing_notes`. + - `extracted_data` will contain the parsed/extracted content, structured according to the input type (e.g., dictionary for JSON, text for documents, list of rows for CSV, dictionary with title/text/links for HTML). + - `processing_notes` will list any issues encountered during extraction. + - Generated QR codes are saved in the `output/qr_codes` directory. + ### ⚙️ QR Code Viewport Instructions + 1. Navigate to the **QR Code Viewport** tab after generating QR codes. + 2. The generated QR codes will be displayed in a grid based on their total count. + 3. Use the checkboxes below each QR code to enable or disable it for visual selection. Enabled codes have a green border and full opacity. + 4. This viewport is currently for visualization and selection *within the UI*; it doesn't change the generated files themselves. You would manually select which physical QR codes to scan based on this view. + """) + return interface - # Add signal handlers for graceful shutdown (e.g., Ctrl+C) - try: - # For Unix-like systems - loop.add_signal_handler(signal.SIGINT, shutdown_handler, signal.SIGINT, None) - loop.add_signal_handler(signal.SIGTERM, shutdown_handler, signal.SIGTERM, None) - logger.info("Added signal handlers for SIGINT and SIGTERM.") - except NotImplementedError: - # Signal handlers are not available on Windows - logger.warning("Signal handlers not available on this platform (likely Windows). Ctrl+C may not be graceful.") - # On Windows, KeyboardInterrupt is usually caught directly in the main thread - - - # Launch Gradio app - # Use prevent_thread_lock=True to run Gradio server in a separate thread, - # freeing the main thread to run the asyncio loop. 
+def main(): + """Initialize and launch the application""" try: - logger.info(f"Launching Gradio app on port {GRADIO_PORT}...") - app.launch( - server_name="0.0.0.0", - server_port=GRADIO_PORT, - share=False, # Set to True to share publicly (requires auth token usually) - debug=True, # Keep debug=True for development logs - inbrowser=True, - prevent_thread_lock=True # Run Gradio server in a separate thread + # Configure system settings + mimetypes.init() + + # Create and launch interface + interface = create_modern_interface() + + # Launch with configuration + interface.launch( + share=False, + debug=False, # Set to True for more verbose Gradio logging + show_error=True, + show_api=False ) - logger.info("Gradio app launched in separate thread.") - - # Run the main asyncio loop indefinitely to keep async tasks running - logger.info("Running main asyncio loop...") - loop.run_forever() # This blocks the main thread until loop.stop() is called - - except KeyboardInterrupt: - logger.info("KeyboardInterrupt received in main thread.") - # If signal handlers are not implemented (e.g., Windows), KeyboardInterrupt - # is caught here. Trigger shutdown manually. - # Check if signal handlers were likely not registered by checking if signal.SIGINT exists - if not hasattr(signal, 'SIGINT'): - shutdown_handler(None, None) # Call shutdown logic - - except SystemExit as e: - logger.error(f"SystemExit received: {e}") - # SystemExit might be raised by port binding errors etc. - # The source of SystemExit might have already triggered loop.stop() or shutdown. - # Ensure cleanup happens if not already started. - # Check if the loop is still running; if so, stop it. - if loop.is_running(): - logger.info("SystemExit caught before loop stopped, triggering shutdown.") - shutdown_handler(None, None) - else: - logger.info("SystemExit caught, but loop already stopped. Proceeding with final cleanup.") - - except Exception as e: - logger.exception(f"An unexpected error occurred in the main thread: {e}") - # Trigger graceful shutdown on unexpected error - shutdown_handler(None, None) - - finally: - logger.info("Main thread exiting finally block.") - # Wait for background tasks/threads to complete shutdown... 
- logger.info("Running loop briefly to complete pending tasks (e.g., cancellations)...") - try: - # Gather remaining tasks (like ws_task cancellation, idle_task cancellation) - # Exclude the current task if inside a task (though we are in the main thread here) - # asyncio.all_tasks() returns tasks for the *current* thread's loop - pending_tasks = [task for task in asyncio.all_tasks(loop=loop) if not task.done()] - if pending_tasks: - # Use a timeout for waiting for tasks to finish - try: - # Wait for pending tasks, ignoring exceptions during shutdown - loop.run_until_complete(asyncio.wait(pending_tasks, timeout=5, return_when=asyncio.ALL_COMPLETED)) - logger.info(f"Completed pending tasks.") - except asyncio.TimeoutError: - logger.warning(f"Timed out waiting for {len(pending_tasks)} pending tasks to complete.") - except Exception as e: - logger.error(f"Error during final loop run_until_complete: {e}") - else: - logger.info("No pending tasks to complete.") - except Exception as e: - logger.error(f"Error checking pending tasks: {e}") + logger.error(f"Application startup error: {e}") + # Optionally print a user-friendly message before exiting + print(f"\nFatal Error: {e}\nCheck the logs for details.") + raise # Re-raise the exception to ensure the process exits if launch fails - - # Wait for the webhook thread to join (if it's not daemon, or if shutdown() needs time) - # Since it's daemon, it will be killed, but explicit shutdown is better. - # Add a small timeout for join in case shutdown() hangs. - # Check if the thread is still alive before joining - if webhook_thread.is_alive(): - logger.info("Waiting for webhook thread to join...") - webhook_thread.join(timeout=2) # Wait up to 2 seconds - - # Close the loop - if not loop.is_closed(): - logger.info("Closing asyncio loop.") - loop.close() - - logger.info("Application shutdown complete.") \ No newline at end of file +if __name__ == "__main__": + main() \ No newline at end of file