import os
import re
import ast
import json
import logging
from typing import Tuple

# The OpenAI SDK is only needed at call time (each function builds its own
# client); the guarded import keeps the pure helpers (parsing, file combining)
# importable even when the SDK is not installed.
try:
    import openai  # noqa: F401  (kept for backward compatibility)
except ImportError:  # pragma: no cover
    openai = None

logger = logging.getLogger(__name__)

# Single model used for every completion request in this module.
MODEL_NAME = "Orion-zhen/Qwen2.5-Coder-7B-Instruct-AWQ"


def _get_client():
    """Build an OpenAI-compatible client from the `modal_api` / `base_url` env vars."""
    from openai import OpenAI

    client = OpenAI(api_key=os.getenv("modal_api"))
    client.base_url = os.getenv("base_url")
    return client


def analyze_code(code: str) -> str:
    """
    Uses qwen2.5-coder-7b-instruct-awq model to analyze the given code.

    Sends the code as the user message with a strict JSON-only system prompt.
    Returns the raw model output as a string (expected to be a JSON object
    with keys 'strength', 'weaknesses', 'speciality', 'relevance rating').
    """
    client = _get_client()
    system_prompt = (
        "You are a highly precise and strict JSON generator. Analyze the code given to you. "
        "Your ONLY output must be a valid JSON object with the following keys: 'strength', 'weaknesses', 'speciality', 'relevance rating'. "
        "For 'relevance rating', you MUST use ONLY one of these exact values: 'very low', 'low', 'high', 'very high'. "
        "Do NOT include any explanation, markdown, or text outside the JSON. Do NOT add any commentary, preamble, or postscript. "
        "If you cannot answer, still return a valid JSON with empty strings for each key. "
        "Example of the ONLY valid output:\n"
        "{\n 'strength': '...', \n 'weaknesses': '...', \n 'speciality': '...', \n 'relevance rating': 'high'\n}"
    )
    response = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": code},
        ],
        max_tokens=512,
        temperature=0.4,
    )
    return response.choices[0].message.content


def parse_llm_json_response(response: str) -> dict:
    """
    Extract and parse the JSON object embedded in an LLM response string.

    Strategy, from most to least strict:
      1. `json.loads` on the outermost ``{...}`` slice — the model followed
         instructions and produced valid JSON.
      2. `ast.literal_eval` — safely handles single-quoted, Python-literal
         pseudo-JSON (e.g. ``{'key': 'value'}``) without corrupting
         apostrophes inside values.
      3. Legacy heuristic: replace single quotes with double quotes, then
         re-escape stray inner quotes. Kept only as a last resort.

    Returns the parsed dict on success, otherwise
    ``{"error": "...", "raw": response}``.
    """
    try:
        logger.debug("Parsing LLM response: %s", response)
        start = response.find("{")
        end = response.rfind("}")
        if start == -1 or end == -1 or end < start:
            raise ValueError("No valid JSON object found in the response.")
        json_str = response[start:end + 1]

        # 1. Strict JSON first — avoids mangling already-valid output.
        try:
            return json.loads(json_str)
        except json.JSONDecodeError:
            pass

        # 2. Python-literal style ({'key': 'value'}); literal_eval parses it
        # safely and preserves apostrophes inside string values.
        try:
            parsed = ast.literal_eval(json_str)
            if isinstance(parsed, dict):
                return parsed
        except (ValueError, SyntaxError):
            pass

        # 3. Legacy fallback: swap every single quote for a double quote,
        # then try to re-escape double quotes that end up inside values.
        json_str = re.sub(r"'", '"', json_str)

        def escape_inner_quotes(match):
            # Protect already-escaped quotes with a placeholder, escape the
            # remaining quotes, then restore the originals.
            inner_content = match.group(1)
            placeholder = "___TEMP_QUOTE___"
            inner_content = inner_content.replace('\\"', placeholder)
            inner_content = inner_content.replace('"', '\\"')
            inner_content = inner_content.replace(placeholder, '\\"')
            return f'"{inner_content}"'

        json_str = re.sub(r'"(.*?)"', escape_inner_quotes, json_str)
        return json.loads(json_str)
    except Exception as e:
        logger.debug("Failed to parse LLM JSON response: %s", e)
        return {"error": f"Failed to parse JSON: {e}", "raw": response}


def combine_repo_files_for_llm(repo_dir="repo_files", output_file="combined_repo.txt"):
    """
    Combines all .py, .md, and .txt files in the given directory (recursively)
    into a single text file.

    Priority files (app.py, README.md, requirements.txt) are placed first so
    the most informative context leads the output; each file appears at most
    once. Returns the path to the combined file.
    """
    combined_content = []
    seen_files = set()

    def _append_file(path, display_name):
        # Best-effort read: an unreadable file is noted inline instead of
        # aborting the whole combine step.
        try:
            with open(path, "r", encoding="utf-8") as f:
                combined_content.append(f"\n# ===== File: {display_name} =====\n")
                combined_content.append(f.read())
            seen_files.add(os.path.abspath(path))
        except Exception as e:
            combined_content.append(f"\n# Could not read {path}: {e}\n")

    # Priority files first.
    for pf in ["app.py", "README.md", "requirements.txt"]:
        pf_path = os.path.join(repo_dir, pf)
        if os.path.isfile(pf_path):
            _append_file(pf_path, pf)

    # All remaining .py, .md, and .txt files, recursively.
    for root, _, files in os.walk(repo_dir):
        for file in files:
            if file.endswith((".py", ".md", ".txt")):
                file_path = os.path.join(root, file)
                if os.path.abspath(file_path) in seen_files:
                    continue
                _append_file(file_path, file)

    with open(output_file, "w", encoding="utf-8") as out_f:
        out_f.write("\n".join(combined_content))
    return output_file


def analyze_code_chunk(code: str, user_requirements: str = "") -> str:
    """
    Analyzes a code chunk and returns a JSON summary for that chunk.

    If `user_requirements` is non-empty it is injected into the system prompt
    so the model can factor it into the relevance rating.
    """
    client = _get_client()

    # Optional user-requirements context steers the relevance rating.
    requirements_section = ""
    if user_requirements.strip():
        requirements_section = (
            f"\n\nUSER REQUIREMENTS:\n{user_requirements}\n\n"
            "When rating relevance, consider how well this code matches the user's stated requirements."
        )

    chunk_prompt = (
        "You are a highly precise and strict JSON generator. Analyze the following code chunk. "
        "Your ONLY output must be a valid JSON object with the following keys: 'strength', 'weaknesses', 'speciality', 'relevance rating'. "
        "All property names and string values MUST use double quotes (\"). Do NOT use single quotes. "
        "For 'relevance rating', you MUST use ONLY one of these exact values: 'very low', 'low', 'high', 'very high'. "
        "Do NOT include any explanation, markdown, or text outside the JSON. Do NOT add any commentary, preamble, or postscript. "
        "If you cannot answer, still return a valid JSON with empty strings for each key. "
        f"{requirements_section}"
        "Example of the ONLY valid output:\n"
        '{\n "strength": "...", \n "weaknesses": "...", \n "speciality": "...", \n "relevance rating": "high"\n}'
    )
    response = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[
            {"role": "system", "content": chunk_prompt},
            {"role": "user", "content": code},
        ],
        max_tokens=512,  # same cap as the other completion calls in this module
        temperature=0.4,
    )
    return response.choices[0].message.content


def aggregate_chunk_analyses(chunk_jsons: list, user_requirements: str = "") -> str:
    """
    Aggregates a list of chunk JSONs into a single JSON summary using the LLM.

    If `user_requirements` is non-empty it is injected into the system prompt
    so the aggregated relevance rating reflects it.
    """
    client = _get_client()

    requirements_section = ""
    if user_requirements.strip():
        requirements_section = (
            f"\n\nUSER REQUIREMENTS:\n{user_requirements}\n\n"
            "When aggregating the relevance rating, consider how well the overall repository matches the user's stated requirements."
        )

    aggregation_prompt = (
        "You are a highly precise and strict, code analyzer and JSON generator. You are given a list of JSON analyses of code chunks. "
        "Aggregate these into a SINGLE overall JSON summary with the same keys: 'strength', 'weaknesses', 'speciality', 'relevance rating'. "
        "All property names and string values MUST use double quotes (\"). Do NOT use single quotes. "
        "For 'relevance rating', you MUST use ONLY one of these exact values: 'very low', 'low', 'high', 'very high'. "
        "Summarize and combine the information from all chunks. Do NOT include any explanation, markdown, or text outside the JSON. "
        "If a key is missing in all chunks, use an empty string. "
        f"{requirements_section}"
        "Example of the ONLY valid output:\n"
        '{\n "strength": "...", \n "weaknesses": "...", \n "speciality": "...", \n "relevance rating": "high"\n}'
    )
    user_content = "Here are the chunk analyses:\n" + "\n".join(chunk_jsons)
    response = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[
            {"role": "system", "content": aggregation_prompt},
            {"role": "user", "content": user_content},
        ],
        max_tokens=512,
        temperature=0.3,
    )
    return response.choices[0].message.content


def analyze_combined_file(output_file="combined_repo.txt", user_requirements: str = ""):
    """
    Reads the combined file, splits it into 1200-line chunks, analyzes each
    chunk, and aggregates the LLM's output into a final summary.

    Includes user requirements for better relevance rating. Returns the chunk
    JSONs (for debugging) and the aggregated analysis as a single string.
    """
    try:
        with open(output_file, "r", encoding="utf-8") as f:
            lines = f.readlines()
        chunk_size = 1200  # lines per chunk
        chunk_jsons = []
        for i in range(0, len(lines), chunk_size):
            chunk = "".join(lines[i:i + chunk_size])
            chunk_jsons.append(analyze_code_chunk(chunk, user_requirements))
        final_summary = aggregate_chunk_analyses(chunk_jsons, user_requirements)
        debug_output = (
            "==== Chunk JSON Outputs ===="
            + "\n\n".join(
                f"Chunk {i + 1} JSON:\n{cj}" for i, cj in enumerate(chunk_jsons)
            )
            + "\n\n==== Final Aggregated Summary ===="
            + f"\n{final_summary}"
        )
        return debug_output
    except Exception as e:
        return f"Error analyzing combined file: {e}"


def analyze_repo_chunk_for_context(chunk: str, repo_id: str) -> str:
    """
    Analyze a repository chunk to create conversational context for the chatbot.
    This creates summaries focused on helping users understand the repository.
    """
    try:
        client = _get_client()
        context_prompt = f"""You are analyzing a chunk of code from the repository '{repo_id}' to create a conversational summary for a chatbot assistant.

Create a concise but informative summary that helps understand:
- What this code section does
- Key functions, classes, or components
- Important features or capabilities
- How it relates to the overall repository purpose
- Any notable patterns or technologies used

Focus on information that would be useful for answering user questions about the repository.

Repository chunk:
{chunk}

Provide a clear, conversational summary in 2-3 paragraphs:"""
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": "You are an expert code analyst creating conversational summaries for a repository assistant chatbot."},
                {"role": "user", "content": context_prompt},
            ],
            max_tokens=600,  # room for detailed analysis of larger chunks
            temperature=0.3,
        )
        return response.choices[0].message.content
    except Exception as e:
        logger.error(f"Error analyzing chunk for context: {e}")
        return f"Code section analysis unavailable: {e}"


def create_repo_context_summary(repo_content: str, repo_id: str) -> str:
    """
    Create a comprehensive context summary by analyzing the repository in chunks.
    Returns a detailed summary that the chatbot can use to answer questions.
    """
    try:
        lines = repo_content.split('\n')
        chunk_size = 1200  # larger chunks: better context, fewer API calls
        chunk_summaries = []
        logger.info(f"Analyzing repository {repo_id} in chunks for chatbot context")
        for i in range(0, len(lines), chunk_size):
            chunk = '\n'.join(lines[i:i + chunk_size])
            if chunk.strip():  # only analyze non-empty chunks
                summary = analyze_repo_chunk_for_context(chunk, repo_id)
                chunk_summaries.append(f"=== Section {len(chunk_summaries) + 1} ===\n{summary}")

        # Create final comprehensive summary from the per-section summaries.
        try:
            client = _get_client()
            final_prompt = f"""Based on the following section summaries of repository '{repo_id}', create a comprehensive overview that a chatbot can use to answer user questions.

Section Summaries:
{chr(10).join(chunk_summaries)}

Create a well-structured overview covering:
1. Repository Purpose & Main Functionality
2. Key Components & Architecture
3. Important Features & Capabilities
4. Technology Stack & Dependencies
5. Usage Patterns & Examples

Make this comprehensive but conversational - it will be used by a chatbot to answer user questions about the repository."""
            response = client.chat.completions.create(
                model=MODEL_NAME,
                messages=[
                    {"role": "system", "content": "You are creating a comprehensive repository summary for a chatbot assistant."},
                    {"role": "user", "content": final_prompt},
                ],
                max_tokens=1500,  # allow a long, comprehensive summary
                temperature=0.3,
            )
            final_summary = response.choices[0].message.content

            # Combine the overview and the section details for the chatbot.
            full_context = f"""=== REPOSITORY ANALYSIS FOR {repo_id.upper()} ===

{final_summary}

=== DETAILED SECTION SUMMARIES ===
{chr(10).join(chunk_summaries)}"""
            logger.info(f"Created comprehensive context summary for {repo_id}")
            return full_context
        except Exception as e:
            logger.error(f"Error creating final summary: {e}")
            # Fallback: section summaries only, no aggregated overview.
            return f"=== REPOSITORY ANALYSIS FOR {repo_id.upper()} ===\n\n" + '\n\n'.join(chunk_summaries)
    except Exception as e:
        logger.error(f"Error creating repo context summary: {e}")
        return f"Repository analysis unavailable: {e}"


def handle_load_repository(repo_id: str) -> Tuple[str, str]:
    """Load a specific repository and prepare it for exploration with chunk-based analysis.

    Returns a (status message, context summary) tuple; on failure the context
    summary is an empty string.
    """
    if not repo_id.strip():
        return "Status: Please enter a repository ID.", ""
    try:
        logger.info(f"Loading repository for exploration: {repo_id}")
        # Download and process the repository.
        try:
            # NOTE: download_filtered_space_files is defined elsewhere in this project.
            download_filtered_space_files(repo_id, local_dir="repo_files", file_extensions=['.py', '.md', '.txt'])
            combined_text_path = combine_repo_files_for_llm()
        except Exception as e:
            logger.error(f"Error downloading repository {repo_id}: {e}")
            return f"āŒ Error downloading repository: {e}", ""

        with open(combined_text_path, "r", encoding="utf-8") as f:
            repo_content = f.read()

        status = f"āœ… Repository '{repo_id}' loaded successfully!\\nšŸ“ Files processed and ready for exploration.\\nšŸ”„ Analyzing repository in chunks for comprehensive context...\\nšŸ’¬ You can now ask questions about this repository."

        # Create comprehensive context summary using chunk analysis.
        logger.info(f"Creating context summary for {repo_id}")
        context_summary = create_repo_context_summary(repo_content, repo_id)
        logger.info(f"Repository {repo_id} loaded and analyzed successfully for exploration")
        return status, context_summary
    except Exception as e:
        logger.error(f"Error loading repository {repo_id}: {e}")
        return f"āŒ Error loading repository: {e}", ""