File size: 16,560 Bytes
e0b6f12 1a943f1 f03a154 7209842 e0b6f12 dbe96a7 e0b6f12 27f4250 9a88164 1a943f1 2c15ffb 3330689 2c15ffb 3330689 1a943f1 27f4250 1d3eed5 e0b6f12 89ff1cb e0b6f12 27f4250 5b7f342 1a943f1 82d6e3b 9f8e537 82d6e3b 9f8e537 82d6e3b 1a943f1 f03a154 1a943f1 5b7f342 48d3c35 5b7f342 631e1ee 48d3c35 631e1ee 48d3c35 5b7f342 48d3c35 5b7f342 631e1ee 5b7f342 791be58 5b7f342 631e1ee 5b7f342 791be58 3330689 1c90111 3330689 1c90111 f03a154 3330689 1c90111 3330689 1c90111 3330689 1c90111 3330689 1c90111 fd7c5f8 89ff1cb 1c90111 3330689 1c90111 3330689 1c90111 f03a154 1c90111 f03a154 3330689 1c90111 3330689 1c90111 3330689 1c90111 3330689 791be58 1c90111 3330689 e9db129 791be58 631e1ee fd7c5f8 1c90111 2c15ffb 3330689 1c90111 3330689 e9db129 791be58 7209842 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 |
import json
import logging
import os
import re
from typing import Tuple

import openai
def analyze_code(code: str) -> str:
    """
    Analyze the given code with the Qwen2.5-Coder model and return its reply.

    The system prompt instructs the model to answer with a strict JSON object
    containing 'strength', 'weaknesses', 'speciality' and 'relevance rating'.
    The raw (hopefully JSON) message content is returned unparsed; use
    parse_llm_json_response() to decode it.

    Parameters:
        code: The source code to analyze.

    Returns:
        The model's raw text reply.

    Raises:
        Whatever the OpenAI client raises on network/auth failures.
    """
    # The module already imports `openai` at the top; no need for a redundant
    # function-local `from openai import OpenAI`.
    client = openai.OpenAI(api_key=os.getenv("modal_api"))
    # NOTE(review): if the "base_url" env var is unset this assigns None —
    # presumably the deployment always sets it; confirm.
    client.base_url = os.getenv("base_url")
    system_prompt = (
        "You are a highly precise and strict JSON generator. Analyze the code given to you. "
        "Your ONLY output must be a valid JSON object with the following keys: 'strength', 'weaknesses', 'speciality', 'relevance rating'. "
        "For 'relevance rating', you MUST use ONLY one of these exact values: 'very low', 'low', 'high', 'very high'. "
        "Do NOT include any explanation, markdown, or text outside the JSON. Do NOT add any commentary, preamble, or postscript. "
        "If you cannot answer, still return a valid JSON with empty strings for each key. "
        "Example of the ONLY valid output:\n"
        "{\n 'strength': '...', \n 'weaknesses': '...', \n 'speciality': '...', \n 'relevance rating': 'high'\n}"
    )
    response = client.chat.completions.create(
        model="Orion-zhen/Qwen2.5-Coder-7B-Instruct-AWQ",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": code}
        ],
        max_tokens=512,
        temperature=0.4
    )
    return response.choices[0].message.content
def parse_llm_json_response(response: str):
    """
    Extract and parse the JSON object embedded in an LLM response string.

    The model is instructed to emit strict JSON, but in practice it may wrap
    the object in prose or use Python-style single quotes. This function:
      1. Slices out the outermost '{...}' span.
      2. Tries strict ``json.loads`` first.
      3. Falls back to ``ast.literal_eval`` for single-quoted pseudo-JSON.

    The previous implementation blindly rewrote every single quote to a
    double quote, which corrupted apostrophes inside values (e.g. "it's"),
    and then tried to re-escape quotes with an ambiguous regex. Strict parse
    plus a literal-eval fallback handles both cases safely.

    Parameters:
        response: Raw LLM output, possibly with surrounding prose.

    Returns:
        The parsed dict on success, or a dict with 'error' and 'raw' keys
        describing the failure (this function never raises).
    """
    try:
        start = response.find('{')
        end = response.rfind('}')
        if start == -1 or end == -1 or end < start:
            raise ValueError("No valid JSON object found in the response.")
        json_str = response[start:end + 1]
        # Strict JSON first: correctly handles escaped double quotes and
        # apostrophes inside string values.
        try:
            return json.loads(json_str)
        except json.JSONDecodeError:
            pass
        # Fallback: the model produced a Python-literal dict such as
        # {'key': 'value'}. literal_eval parses it without executing code.
        import ast
        parsed = ast.literal_eval(json_str)
        if not isinstance(parsed, dict):
            raise ValueError("Extracted text is not a JSON object.")
        return parsed
    except Exception as e:
        return {"error": f"Failed to parse JSON: {e}", "raw": response}
def combine_repo_files_for_llm(repo_dir="repo_files", output_file="combined_repo.txt"):
    """
    Combine all .py, .md, and .txt files under ``repo_dir`` (recursively)
    into a single text file, with priority files (app.py, README.md,
    requirements.txt) placed first so the LLM sees the entry point and
    documentation early.

    Each file is preceded by a "# ===== File: name =====" header; an
    unreadable file is replaced by an inline comment rather than aborting
    the whole combine.

    Parameters:
        repo_dir: Directory to scan for text files.
        output_file: Path of the combined output file to write.

    Returns:
        The path to the combined file (``output_file``).
    """
    text_extensions = (".py", ".md", ".txt")
    combined_content = []
    seen_files = set()

    def _append_file(path, display_name):
        # Best-effort read shared by the priority and recursive passes.
        try:
            with open(path, "r", encoding="utf-8") as f:
                combined_content.append(f"\n# ===== File: {display_name} =====\n")
                combined_content.append(f.read())
            seen_files.add(os.path.abspath(path))
        except Exception as e:
            combined_content.append(f"\n# Could not read {path}: {e}\n")

    # Priority files first.
    for pf in ["app.py", "README.md", "requirements.txt"]:
        pf_path = os.path.join(repo_dir, pf)
        if os.path.isfile(pf_path):
            _append_file(pf_path, pf)

    # All remaining text files, skipping anything already emitted above.
    for root, _, files in os.walk(repo_dir):
        for file in files:
            if file.endswith(text_extensions):
                file_path = os.path.join(root, file)
                if os.path.abspath(file_path) in seen_files:
                    continue
                _append_file(file_path, file)

    with open(output_file, "w", encoding="utf-8") as out_f:
        out_f.write("\n".join(combined_content))
    return output_file
def analyze_code_chunk(code: str, user_requirements: str = "") -> str:
    """
    Run the LLM over a single code chunk and return its raw reply.

    When ``user_requirements`` is non-empty, it is injected into the system
    prompt so the model can weigh relevance against the user's stated needs.

    Parameters:
        code: The code chunk to analyze.
        user_requirements: Optional free-text requirements from the user.

    Returns:
        The model's raw text reply (expected to be a JSON object).
    """
    from openai import OpenAI

    client = OpenAI(api_key=os.getenv("modal_api"))
    client.base_url = os.getenv("base_url")

    # Optional requirements block appended into the system prompt.
    requirements_section = (
        f"\n\nUSER REQUIREMENTS:\n{user_requirements}\n\nWhen rating relevance, consider how well this code matches the user's stated requirements."
        if user_requirements.strip()
        else ""
    )

    chunk_prompt = (
        "You are a highly precise and strict JSON generator. Analyze the following code chunk. "
        "Your ONLY output must be a valid JSON object with the following keys: 'strength', 'weaknesses', 'speciality', 'relevance rating'. "
        "All property names and string values MUST use double quotes (\"). Do NOT use single quotes. "
        "For 'relevance rating', you MUST use ONLY one of these exact values: 'very low', 'low', 'high', 'very high'. "
        "Do NOT include any explanation, markdown, or text outside the JSON. Do NOT add any commentary, preamble, or postscript. "
        "If you cannot answer, still return a valid JSON with empty strings for each key. "
        f"{requirements_section}"
        "Example of the ONLY valid output:\n"
        '{\n "strength": "...", \n "weaknesses": "...", \n "speciality": "...", \n "relevance rating": "high"\n}'
    )

    chat_messages = [
        {"role": "system", "content": chunk_prompt},
        {"role": "user", "content": code},
    ]
    completion = client.chat.completions.create(
        model="Orion-zhen/Qwen2.5-Coder-7B-Instruct-AWQ",
        messages=chat_messages,
        temperature=0.4,
    )
    return completion.choices[0].message.content
def aggregate_chunk_analyses(chunk_jsons: list, user_requirements: str = "") -> str:
    """
    Merge per-chunk JSON analyses into one overall JSON summary via the LLM.

    Parameters:
        chunk_jsons: Raw JSON strings produced by analyze_code_chunk().
        user_requirements: Optional free-text requirements from the user,
            woven into the aggregation prompt when non-empty.

    Returns:
        The model's raw text reply (expected to be a single JSON object).
    """
    from openai import OpenAI

    client = OpenAI(api_key=os.getenv("modal_api"))
    client.base_url = os.getenv("base_url")

    # Optional requirements block appended into the system prompt.
    requirements_section = (
        f"\n\nUSER REQUIREMENTS:\n{user_requirements}\n\nWhen aggregating the relevance rating, consider how well the overall repository matches the user's stated requirements."
        if user_requirements.strip()
        else ""
    )

    aggregation_prompt = (
        "You are a highly precise and strict, code analyzer and JSON generator. You are given a list of JSON analyses of code chunks. "
        "Aggregate these into a SINGLE overall JSON summary with the same keys: 'strength', 'weaknesses', 'speciality', 'relevance rating'. "
        "All property names and string values MUST use double quotes (\"). Do NOT use single quotes. "
        "For 'relevance rating', you MUST use ONLY one of these exact values: 'very low', 'low', 'high', 'very high'. "
        "Summarize and combine the information from all chunks. Do NOT include any explanation, markdown, or text outside the JSON. "
        "If a key is missing in all chunks, use an empty string. "
        f"{requirements_section}"
        "Example of the ONLY valid output:\n"
        '{\n "strength": "...", \n "weaknesses": "...", \n "speciality": "...", \n "relevance rating": "high"\n}'
    )

    user_content = "Here are the chunk analyses:\n" + "\n".join(chunk_jsons)
    chat_messages = [
        {"role": "system", "content": aggregation_prompt},
        {"role": "user", "content": user_content},
    ]
    completion = client.chat.completions.create(
        model="Orion-zhen/Qwen2.5-Coder-7B-Instruct-AWQ",
        messages=chat_messages,
        max_tokens=512,
        temperature=0.3,
    )
    return completion.choices[0].message.content
def analyze_combined_file(output_file="combined_repo.txt", user_requirements: str = ""):
    """
    Read the combined repository file, split it into 1200-line chunks,
    analyze each chunk with the LLM, then aggregate the per-chunk JSON
    outputs into a final summary.

    (Docstring fixed: the previous one claimed 500-line chunks and a
    two-value return, neither of which matched the code.)

    Parameters:
        output_file: Path to the combined repository text file.
        user_requirements: Optional free-text requirements, forwarded to
            both the per-chunk and the aggregation prompts.

    Returns:
        A single debug string containing every chunk JSON followed by the
        aggregated summary, or an error message string on failure.
    """
    try:
        with open(output_file, "r", encoding="utf-8") as f:
            lines = f.readlines()
        chunk_size = 1200  # lines per LLM call; balances context size vs. call count
        chunk_jsons = []
        for i in range(0, len(lines), chunk_size):
            chunk = "".join(lines[i:i + chunk_size])
            chunk_jsons.append(analyze_code_chunk(chunk, user_requirements))
        final_summary = aggregate_chunk_analyses(chunk_jsons, user_requirements)
        # Keep the original layout: header immediately followed by the first
        # chunk report (no separator), then the aggregated summary.
        chunk_report = "\n\n".join(
            f"Chunk {idx} JSON:\n{cj}" for idx, cj in enumerate(chunk_jsons, start=1)
        )
        debug_output = (
            "==== Chunk JSON Outputs ===="
            + chunk_report
            + "\n\n==== Final Aggregated Summary ===="
            + f"\n{final_summary}"
        )
        return debug_output
    except Exception as e:
        return f"Error analyzing combined file: {e}"
def analyze_repo_chunk_for_context(chunk: str, repo_id: str) -> str:
    """
    Summarize one repository chunk into conversational context for the chatbot.

    Unlike the JSON-oriented analyzers, this asks the model for a prose
    summary (2-3 paragraphs) useful for answering user questions.

    Parameters:
        chunk: The raw text of one repository chunk.
        repo_id: Identifier of the repository (interpolated into the prompt).

    Returns:
        The model's summary, or a fallback message string on any failure.
    """
    try:
        from openai import OpenAI
        client = OpenAI(api_key=os.getenv("modal_api"))
        client.base_url = os.getenv("base_url")
        context_prompt = f"""You are analyzing a chunk of code from the repository '{repo_id}' to create a conversational summary for a chatbot assistant.
Create a concise but informative summary that helps understand:
- What this code section does
- Key functions, classes, or components
- Important features or capabilities
- How it relates to the overall repository purpose
- Any notable patterns or technologies used
Focus on information that would be useful for answering user questions about the repository.
Repository chunk:
{chunk}
Provide a clear, conversational summary in 2-3 paragraphs:"""
        response = client.chat.completions.create(
            model="Orion-zhen/Qwen2.5-Coder-7B-Instruct-AWQ",
            messages=[
                {"role": "system", "content": "You are an expert code analyst creating conversational summaries for a repository assistant chatbot."},
                {"role": "user", "content": context_prompt}
            ],
            max_tokens=600,  # Increased for more detailed analysis with larger chunks
            temperature=0.3
        )
        return response.choices[0].message.content
    except Exception as e:
        # Fix: `logger` was never defined in this module, so the original
        # except path raised NameError instead of logging. Use the logging
        # module directly (lazy %-args per logging best practice).
        logging.getLogger(__name__).error("Error analyzing chunk for context: %s", e)
        return f"Code section analysis unavailable: {e}"
def create_repo_context_summary(repo_content: str, repo_id: str) -> str:
    """
    Create a comprehensive chatbot context by analyzing the repository in chunks.

    Each 1200-line chunk is summarized via analyze_repo_chunk_for_context(),
    then a final LLM pass merges the section summaries into a structured
    overview. If the final pass fails, the section summaries alone are
    returned as a fallback.

    Parameters:
        repo_content: Full combined repository text.
        repo_id: Identifier of the repository (used in prompts and headers).

    Returns:
        A context string for the chatbot, or an error message on total failure.
    """
    # Fix: `logger` was never defined in this module (NameError at runtime);
    # use the logging module directly.
    logger = logging.getLogger(__name__)
    try:
        lines = repo_content.split('\n')
        chunk_size = 1200  # Increased for better context and fewer API calls
        chunk_summaries = []
        logger.info("Analyzing repository %s in chunks for chatbot context", repo_id)
        for i in range(0, len(lines), chunk_size):
            chunk = '\n'.join(lines[i:i+chunk_size])
            if chunk.strip():  # Only analyze non-empty chunks
                summary = analyze_repo_chunk_for_context(chunk, repo_id)
                chunk_summaries.append(f"=== Section {len(chunk_summaries) + 1} ===\n{summary}")
        # Create final comprehensive summary
        try:
            from openai import OpenAI
            client = OpenAI(api_key=os.getenv("modal_api"))
            client.base_url = os.getenv("base_url")
            # chr(10) is '\n': backslashes are not allowed inside f-string
            # expressions before Python 3.12.
            final_prompt = f"""Based on the following section summaries of repository '{repo_id}', create a comprehensive overview that a chatbot can use to answer user questions.
Section Summaries:
{chr(10).join(chunk_summaries)}
Create a well-structured overview covering:
1. Repository Purpose & Main Functionality
2. Key Components & Architecture
3. Important Features & Capabilities
4. Technology Stack & Dependencies
5. Usage Patterns & Examples
Make this comprehensive but conversational - it will be used by a chatbot to answer user questions about the repository."""
            response = client.chat.completions.create(
                model="Orion-zhen/Qwen2.5-Coder-7B-Instruct-AWQ",
                messages=[
                    {"role": "system", "content": "You are creating a comprehensive repository summary for a chatbot assistant."},
                    {"role": "user", "content": final_prompt}
                ],
                max_tokens=1500,  # Increased for more comprehensive summaries
                temperature=0.3
            )
            final_summary = response.choices[0].message.content
            # Combine everything for the chatbot context
            full_context = f"""=== REPOSITORY ANALYSIS FOR {repo_id.upper()} ===
{final_summary}
=== DETAILED SECTION SUMMARIES ===
{chr(10).join(chunk_summaries)}"""
            logger.info("Created comprehensive context summary for %s", repo_id)
            return full_context
        except Exception as e:
            logger.error("Error creating final summary: %s", e)
            # Fallback to just section summaries
            return f"=== REPOSITORY ANALYSIS FOR {repo_id.upper()} ===\n\n" + '\n\n'.join(chunk_summaries)
    except Exception as e:
        logger.error("Error creating repo context summary: %s", e)
        return f"Repository analysis unavailable: {e}"
def handle_load_repository(repo_id: str) -> Tuple[str, str]:
    """
    Load a repository and prepare it for chatbot exploration.

    Downloads the repository's .py/.md/.txt files, combines them into one
    text file, and builds a chunk-based context summary for the assistant.

    Parameters:
        repo_id: The repository identifier to load; blank input is rejected.

    Returns:
        A (status_message, context_summary) tuple. On any failure the status
        describes the error and the context summary is "".
    """
    # Fix: `logger` was never defined in this module (NameError at runtime);
    # use the logging module directly.
    logger = logging.getLogger(__name__)
    if not repo_id.strip():
        return "Status: Please enter a repository ID.", ""
    try:
        logger.info("Loading repository for exploration: %s", repo_id)
        # Download and flatten the repository into a single text file.
        try:
            download_filtered_space_files(repo_id, local_dir="repo_files", file_extensions=['.py', '.md', '.txt'])
            combined_text_path = combine_repo_files_for_llm()
        except Exception as e:
            logger.error("Error downloading repository %s: %s", repo_id, e)
            error_status = f"β Error downloading repository: {e}"
            return error_status, ""
        with open(combined_text_path, "r", encoding="utf-8") as f:
            repo_content = f.read()
        # NOTE(review): the leading marker characters ("β", "π", ...) look
        # like mojibake of status emoji — confirm intended glyphs and fix the
        # file encoding. The original literal was also split across two
        # source lines (invalid syntax); rejoined here with content kept.
        status = f"β Repository '{repo_id}' loaded successfully!\\nπ Files processed and ready for exploration.\\nπ Analyzing repository in chunks for comprehensive context...\\nπ¬ You can now ask questions about this repository."
        # Build the chunk-based context the chatbot will answer from.
        logger.info("Creating context summary for %s", repo_id)
        context_summary = create_repo_context_summary(repo_content, repo_id)
        logger.info("Repository %s loaded and analyzed successfully for exploration", repo_id)
        return status, context_summary
    except Exception as e:
        logger.error("Error loading repository %s: %s", repo_id, e)
        error_status = f"β Error loading repository: {e}"
        return error_status, ""
|