# HF_RepoSense / analyzer.py
# Provenance (Hugging Face file-viewer residue): uploaded by naman1102,
# commit 7209842 with message "butoon" (presumably "button"), file size 16.6 kB.
import openai
import os
import json
import re
from typing import Tuple
def analyze_code(code: str) -> str:
    """
    Ask the Qwen2.5-Coder-7B-Instruct-AWQ model for a JSON review of ``code``.

    The API key is read from the ``modal_api`` environment variable and the
    endpoint from ``base_url``. The model is instructed to answer with a JSON
    object holding 'strength', 'weaknesses', 'speciality' and
    'relevance rating'; the text of the first completion choice is returned
    as-is, without parsing or validation.
    """
    from openai import OpenAI

    llm = OpenAI(api_key=os.getenv("modal_api"))
    llm.base_url = os.getenv("base_url")

    # The instruction text is assembled from fragments purely for readability.
    instruction_parts = [
        "You are a highly precise and strict JSON generator. Analyze the code given to you. ",
        "Your ONLY output must be a valid JSON object with the following keys: 'strength', 'weaknesses', 'speciality', 'relevance rating'. ",
        "For 'relevance rating', you MUST use ONLY one of these exact values: 'very low', 'low', 'high', 'very high'. ",
        "Do NOT include any explanation, markdown, or text outside the JSON. Do NOT add any commentary, preamble, or postscript. ",
        "If you cannot answer, still return a valid JSON with empty strings for each key. ",
        "Example of the ONLY valid output:\n",
        "{\n 'strength': '...', \n 'weaknesses': '...', \n 'speciality': '...', \n 'relevance rating': 'high'\n}",
    ]
    instructions = "".join(instruction_parts)

    conversation = [
        {"role": "system", "content": instructions},
        {"role": "user", "content": code},
    ]
    completion = llm.chat.completions.create(
        model="Orion-zhen/Qwen2.5-Coder-7B-Instruct-AWQ",
        messages=conversation,
        max_tokens=512,
        temperature=0.4,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
def parse_llm_json_response(response: str):
    """
    Extract and parse the JSON object embedded in an LLM reply.

    The text between the first '{' and the last '}' is taken as the candidate
    object, so surrounding prose or markdown fences are ignored. Parsing is
    attempted in order of strictness:

    1. strict JSON (``json.loads``) - valid JSON is never rewritten, so
       apostrophes inside values such as "It's fast" survive intact;
    2. a Python literal (``ast.literal_eval``) - covers replies written as
       ``{'key': 'value'}``, including mixed quoting;
    3. the legacy fallback of swapping every single quote for a double quote,
       which still handles single-quoted objects containing JSON-only
       literals such as ``null``/``true``/``false``.

    Returns the parsed dict on success. On any failure (including a
    non-string ``response``) returns
    ``{"error": "Failed to parse JSON: ...", "raw": response}`` instead of
    raising.
    """
    import ast

    try:
        print("DEBUGGGGG ::: ", response)
        start = response.find('{')
        end = response.rfind('}')
        if start == -1 or end == -1 or end < start:
            raise ValueError("No valid JSON object found in the response.")
        json_str = response[start:end + 1]

        # 1. Well-formed JSON must be accepted untouched.
        try:
            return json.loads(json_str)
        except json.JSONDecodeError:
            pass

        # 2. Python-dict style output; literal_eval only evaluates literals,
        #    so it is safe to run on untrusted model output.
        try:
            literal = ast.literal_eval(json_str)
            if isinstance(literal, dict):
                return literal
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            pass

        # 3. Last resort: the original blanket quote replacement.
        return json.loads(json_str.replace("'", '"'))
    except Exception as e:
        print("DEBUGGGGG error ::: ", e)
        return {"error": f"Failed to parse JSON: {e}", "raw": response}
def combine_repo_files_for_llm(repo_dir="repo_files", output_file="combined_repo.txt"):
    """
    Concatenate the text files of a downloaded repository into one file.

    ``app.py``, ``README.md`` and ``requirements.txt`` at the top of
    ``repo_dir`` are emitted first (when present); afterwards every remaining
    ``.py``/``.md``/``.txt`` file found by walking ``repo_dir`` is appended.
    Each file is preceded by a ``# ===== File: <name> =====`` banner, and a
    file that cannot be read contributes a ``# Could not read ...`` note
    instead of aborting the run.

    Returns ``output_file``, the path that was written.
    """
    pieces = []
    already_added = set()

    def _ingest(path, label):
        # Banner first, then contents; a path is only marked as handled once
        # it has been read completely.
        try:
            with open(path, "r", encoding="utf-8") as handle:
                pieces.append(f"\n# ===== File: {label} =====\n")
                pieces.append(handle.read())
                already_added.add(os.path.abspath(path))
        except Exception as exc:
            pieces.append(f"\n# Could not read {path}: {exc}\n")

    # Pass 1: well-known entry files go to the front.
    for preferred in ("app.py", "README.md", "requirements.txt"):
        preferred_path = os.path.join(repo_dir, preferred)
        if not os.path.isfile(preferred_path):
            continue
        _ingest(preferred_path, preferred)

    # Pass 2: everything else with a supported extension.
    wanted_suffixes = (".py", ".md", ".txt")
    for folder, _, names in os.walk(repo_dir):
        for name in names:
            if not name.endswith(wanted_suffixes):
                continue
            candidate = os.path.join(folder, name)
            if os.path.abspath(candidate) in already_added:
                continue
            _ingest(candidate, name)

    with open(output_file, "w", encoding="utf-8") as sink:
        sink.write("\n".join(pieces))
    return output_file
def analyze_code_chunk(code: str, user_requirements: str = "") -> str:
    """
    Request a JSON summary for a single chunk of repository text.

    ``code`` is sent as the user message. When ``user_requirements`` contains
    anything other than whitespace, it is embedded in the system prompt so the
    model can take it into account for the relevance rating. The text of the
    first completion choice is returned without parsing.
    """
    from openai import OpenAI

    llm = OpenAI(api_key=os.getenv("modal_api"))
    llm.base_url = os.getenv("base_url")

    # The requirements paragraph is only present when the caller supplied some.
    requirements_section = (
        f"\n\nUSER REQUIREMENTS:\n{user_requirements}\n\nWhen rating relevance, consider how well this code matches the user's stated requirements."
        if user_requirements.strip()
        else ""
    )

    chunk_prompt = "".join([
        "You are a highly precise and strict JSON generator. Analyze the following code chunk. ",
        "Your ONLY output must be a valid JSON object with the following keys: 'strength', 'weaknesses', 'speciality', 'relevance rating'. ",
        "All property names and string values MUST use double quotes (\"). Do NOT use single quotes. ",
        "For 'relevance rating', you MUST use ONLY one of these exact values: 'very low', 'low', 'high', 'very high'. ",
        "Do NOT include any explanation, markdown, or text outside the JSON. Do NOT add any commentary, preamble, or postscript. ",
        "If you cannot answer, still return a valid JSON with empty strings for each key. ",
        requirements_section,
        "Example of the ONLY valid output:\n",
        '{\n "strength": "...", \n "weaknesses": "...", \n "speciality": "...", \n "relevance rating": "high"\n}',
    ])

    conversation = [
        {"role": "system", "content": chunk_prompt},
        {"role": "user", "content": code},
    ]
    completion = llm.chat.completions.create(
        model="Orion-zhen/Qwen2.5-Coder-7B-Instruct-AWQ",
        messages=conversation,
        temperature=0.4,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
def aggregate_chunk_analyses(chunk_jsons: list, user_requirements: str = "") -> str:
    """
    Merge per-chunk JSON analyses into one overall JSON summary via the LLM.

    The entries of ``chunk_jsons`` are joined with newlines and sent as the
    user message. When ``user_requirements`` is not blank, it is added to the
    system prompt so the aggregated relevance rating can reflect it. The text
    of the first completion choice is returned without parsing.
    """
    from openai import OpenAI

    llm = OpenAI(api_key=os.getenv("modal_api"))
    llm.base_url = os.getenv("base_url")

    # The requirements paragraph is only present when the caller supplied some.
    requirements_section = (
        f"\n\nUSER REQUIREMENTS:\n{user_requirements}\n\nWhen aggregating the relevance rating, consider how well the overall repository matches the user's stated requirements."
        if user_requirements.strip()
        else ""
    )

    aggregation_prompt = "".join([
        "You are a highly precise and strict, code analyzer and JSON generator. You are given a list of JSON analyses of code chunks. ",
        "Aggregate these into a SINGLE overall JSON summary with the same keys: 'strength', 'weaknesses', 'speciality', 'relevance rating'. ",
        "All property names and string values MUST use double quotes (\"). Do NOT use single quotes. ",
        "For 'relevance rating', you MUST use ONLY one of these exact values: 'very low', 'low', 'high', 'very high'. ",
        "Summarize and combine the information from all chunks. Do NOT include any explanation, markdown, or text outside the JSON. ",
        "If a key is missing in all chunks, use an empty string. ",
        requirements_section,
        "Example of the ONLY valid output:\n",
        '{\n "strength": "...", \n "weaknesses": "...", \n "speciality": "...", \n "relevance rating": "high"\n}',
    ])

    joined_analyses = "\n".join(chunk_jsons)
    user_content = f"Here are the chunk analyses:\n{joined_analyses}"

    conversation = [
        {"role": "system", "content": aggregation_prompt},
        {"role": "user", "content": user_content},
    ]
    completion = llm.chat.completions.create(
        model="Orion-zhen/Qwen2.5-Coder-7B-Instruct-AWQ",
        messages=conversation,
        max_tokens=512,
        temperature=0.3,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
def analyze_combined_file(output_file="combined_repo.txt", user_requirements: str = "", chunk_size: int = 1200):
    """
    Analyze a combined repository file chunk by chunk and aggregate the result.

    The file is read line by line, split into consecutive chunks of
    ``chunk_size`` lines (1200 by default), and each chunk is passed to
    ``analyze_code_chunk`` together with ``user_requirements``. The per-chunk
    outputs are then merged by ``aggregate_chunk_analyses``.

    Args:
        output_file: Path of the combined text file to analyze.
        user_requirements: Free-text requirements forwarded to both LLM steps.
        chunk_size: Number of lines per chunk.

    Returns:
        A single debug string containing every chunk's raw output followed by
        the aggregated summary. Any exception (missing file, API failure,
        invalid ``chunk_size``) is reported as
        ``"Error analyzing combined file: ..."`` rather than raised.
    """
    try:
        with open(output_file, "r", encoding="utf-8") as f:
            lines = f.readlines()

        chunk_jsons = [
            analyze_code_chunk("".join(lines[i:i + chunk_size]), user_requirements)
            for i in range(0, len(lines), chunk_size)
        ]
        final_summary = aggregate_chunk_analyses(chunk_jsons, user_requirements)

        chunk_report = "\n\n".join(
            f"Chunk {number} JSON:\n{analysis}"
            for number, analysis in enumerate(chunk_jsons, start=1)
        )
        # The header gets its own line; previously it ran straight into
        # "Chunk 1 JSON:" because the separating newline was missing.
        return (
            "==== Chunk JSON Outputs ====\n"
            + chunk_report
            + "\n\n==== Final Aggregated Summary ===="
            + f"\n{final_summary}"
        )
    except Exception as e:
        return f"Error analyzing combined file: {e}"
def analyze_repo_chunk_for_context(chunk: str, repo_id: str) -> str:
    """
    Summarize one repository chunk as conversational context for the chatbot.

    Args:
        chunk: The slice of repository text to summarize.
        repo_id: Repository identifier, interpolated into the prompt.

    Returns:
        The text of the model's first completion choice, or
        ``"Code section analysis unavailable: ..."`` if anything fails
        (missing client library, bad credentials, API error).
    """
    import logging

    # No module-level ``logger`` is visible in this file; binding one here
    # guarantees the error path below cannot itself fail with a NameError.
    logger = logging.getLogger(__name__)

    try:
        from openai import OpenAI
        client = OpenAI(api_key=os.getenv("modal_api"))
        client.base_url = os.getenv("base_url")
        context_prompt = f"""You are analyzing a chunk of code from the repository '{repo_id}' to create a conversational summary for a chatbot assistant.
Create a concise but informative summary that helps understand:
- What this code section does
- Key functions, classes, or components
- Important features or capabilities
- How it relates to the overall repository purpose
- Any notable patterns or technologies used
Focus on information that would be useful for answering user questions about the repository.
Repository chunk:
{chunk}
Provide a clear, conversational summary in 2-3 paragraphs:"""
        response = client.chat.completions.create(
            model="Orion-zhen/Qwen2.5-Coder-7B-Instruct-AWQ",
            messages=[
                {"role": "system", "content": "You are an expert code analyst creating conversational summaries for a repository assistant chatbot."},
                {"role": "user", "content": context_prompt}
            ],
            max_tokens=600,  # Increased for more detailed analysis with larger chunks
            temperature=0.3
        )
        return response.choices[0].message.content
    except Exception as e:
        logger.error("Error analyzing chunk for context: %s", e)
        return f"Code section analysis unavailable: {e}"
def create_repo_context_summary(repo_content: str, repo_id: str) -> str:
    """
    Build the chatbot's context for a repository by summarizing it in chunks.

    ``repo_content`` is split into 1200-line chunks; every non-blank chunk is
    summarized by ``analyze_repo_chunk_for_context``. The section summaries
    are then condensed by one more LLM call into an overview.

    Returns:
        The overview followed by the detailed section summaries. If the final
        LLM call fails, only the section summaries are returned under the
        same banner. If chunking itself fails, returns
        ``"Repository analysis unavailable: ..."``.
    """
    import logging

    # No module-level ``logger`` is visible in this file; binding one here
    # keeps both the progress logging and the error handlers from raising
    # NameError.
    logger = logging.getLogger(__name__)

    try:
        lines = repo_content.split('\n')
        chunk_size = 1200  # Increased for better context and fewer API calls
        chunk_summaries = []
        logger.info("Analyzing repository %s in chunks for chatbot context", repo_id)
        for i in range(0, len(lines), chunk_size):
            chunk = '\n'.join(lines[i:i + chunk_size])
            if chunk.strip():  # Only analyze non-empty chunks
                summary = analyze_repo_chunk_for_context(chunk, repo_id)
                chunk_summaries.append(f"=== Section {len(chunk_summaries) + 1} ===\n{summary}")

        # Create final comprehensive summary
        try:
            from openai import OpenAI
            client = OpenAI(api_key=os.getenv("modal_api"))
            client.base_url = os.getenv("base_url")
            sections_text = "\n".join(chunk_summaries)
            final_prompt = f"""Based on the following section summaries of repository '{repo_id}', create a comprehensive overview that a chatbot can use to answer user questions.
Section Summaries:
{sections_text}
Create a well-structured overview covering:
1. Repository Purpose & Main Functionality
2. Key Components & Architecture
3. Important Features & Capabilities
4. Technology Stack & Dependencies
5. Usage Patterns & Examples
Make this comprehensive but conversational - it will be used by a chatbot to answer user questions about the repository."""
            response = client.chat.completions.create(
                model="Orion-zhen/Qwen2.5-Coder-7B-Instruct-AWQ",
                messages=[
                    {"role": "system", "content": "You are creating a comprehensive repository summary for a chatbot assistant."},
                    {"role": "user", "content": final_prompt}
                ],
                max_tokens=1500,  # Increased for more comprehensive summaries
                temperature=0.3
            )
            final_summary = response.choices[0].message.content
            # Combine everything for the chatbot context
            full_context = f"""=== REPOSITORY ANALYSIS FOR {repo_id.upper()} ===
{final_summary}
=== DETAILED SECTION SUMMARIES ===
{sections_text}"""
            logger.info("Created comprehensive context summary for %s", repo_id)
            return full_context
        except Exception as e:
            logger.error("Error creating final summary: %s", e)
            # Fallback to just section summaries
            return f"=== REPOSITORY ANALYSIS FOR {repo_id.upper()} ===\n\n" + '\n\n'.join(chunk_summaries)
    except Exception as e:
        logger.error("Error creating repo context summary: %s", e)
        return f"Repository analysis unavailable: {e}"
def handle_load_repository(repo_id: str) -> Tuple[str, str]:
    """
    Load a repository and prepare it for exploration with chunk-based analysis.

    The repository's ``.py``/``.md``/``.txt`` files are downloaded into
    ``repo_files``, combined into one text file, and summarized by
    ``create_repo_context_summary``.

    Args:
        repo_id: Identifier of the repository to load.

    Returns:
        A ``(status, context_summary)`` tuple. ``status`` is a user-facing
        message; ``context_summary`` is the generated context, or an empty
        string when the ID is blank or any step fails.
    """
    import logging

    # No module-level ``logger`` is visible in this file; binding one here
    # keeps the logging calls (including those in the error handlers) from
    # raising NameError.
    logger = logging.getLogger(__name__)

    if not repo_id or not repo_id.strip():
        return "Status: Please enter a repository ID.", ""
    try:
        logger.info("Loading repository for exploration: %s", repo_id)
        # Download and process the repository
        try:
            # NOTE(review): download_filtered_space_files is not defined or
            # imported in the visible part of this module - confirm it is
            # provided elsewhere; until then this step reports a download error.
            download_filtered_space_files(repo_id, local_dir="repo_files", file_extensions=['.py', '.md', '.txt'])
            combined_text_path = combine_repo_files_for_llm()
        except Exception as e:
            logger.error("Error downloading repository %s: %s", repo_id, e)
            error_status = f"❌ Error downloading repository: {e}"
            return error_status, ""
        with open(combined_text_path, "r", encoding="utf-8") as f:
            repo_content = f.read()
        # Emoji were mis-encoded and the line breaks were double-escaped
        # (rendered as a literal backslash-n); both are restored here.
        status = (
            f"✅ Repository '{repo_id}' loaded successfully!\n"
            "📁 Files processed and ready for exploration.\n"
            "🔄 Analyzing repository in chunks for comprehensive context...\n"
            "💬 You can now ask questions about this repository."
        )
        # Create comprehensive context summary using chunk analysis
        logger.info("Creating context summary for %s", repo_id)
        context_summary = create_repo_context_summary(repo_content, repo_id)
        logger.info("Repository %s loaded and analyzed successfully for exploration", repo_id)
        return status, context_summary
    except Exception as e:
        logger.error("Error loading repository %s: %s", repo_id, e)
        error_status = f"❌ Error loading repository: {e}"
        return error_status, ""