import ast
import json
import os
import re

try:
    import openai
except ImportError:
    # The SDK is only needed by the functions that actually call the model
    # (they import ``OpenAI`` lazily).  Keeping this import optional lets the
    # pure helpers (JSON parsing, file combining) be used without it.
    openai = None


# Model served by the OpenAI-compatible endpoint configured via ``base_url``.
_MODEL_NAME = "Orion-zhen/Qwen2.5-Coder-7B-Instruct-AWQ"
_MAX_TOKENS = 512

# Files that are placed first in the combined output when present.
_PRIORITY_FILES = ("app.py", "README.md")
_SOURCE_SUFFIXES = (".py", ".md")

_DOUBLE_QUOTE_RULE = (
    "All property names and string values MUST use double quotes (\"). "
    "Do NOT use single quotes. "
)
_JSON_EXAMPLE = (
    '{\n "strength": "...", \n "weaknesses": "...", \n "speciality": "...", '
    '\n "relevance rating": "..."\n}'
)


def _make_client():
    """Build a client for the endpoint described by the environment.

    Reads the API key from ``modal_api`` and the endpoint from ``base_url``.
    The base URL is passed to the constructor (instead of being assigned
    afterwards) so that an unset variable falls back to the SDK default
    rather than setting the client's URL to ``None``.
    """
    from openai import OpenAI

    return OpenAI(api_key=os.getenv("modal_api"), base_url=os.getenv("base_url"))


def _request_completion(system_prompt: str, user_content: str, temperature: float) -> str:
    """Send one system+user exchange and return the reply text.

    Returns an empty string when the API returns no text content, so callers
    always receive a ``str``.
    """
    client = _make_client()
    response = client.chat.completions.create(
        model=_MODEL_NAME,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_content},
        ],
        max_tokens=_MAX_TOKENS,
        temperature=temperature,
    )
    return response.choices[0].message.content or ""


def analyze_code(code: str) -> str:
    """Ask the configured LLM to analyze ``code``.

    Args:
        code: Source text to analyze; sent verbatim as the user message.

    Returns:
        The raw model reply, which is expected (but not guaranteed) to be a
        JSON object with the keys 'strength', 'weaknesses', 'speciality' and
        'relevance rating'.  Use ``parse_llm_json_response`` to decode it.
    """
    system_prompt = (
        "You are a highly precise and strict JSON generator. Analyze the code given to you. "
        "Your ONLY output must be a valid JSON object with the following keys: 'strength', 'weaknesses', 'speciality', 'relevance rating'. "
        + _DOUBLE_QUOTE_RULE
        + "Do NOT include any explanation, markdown, or text outside the JSON. Do NOT add any commentary, preamble, or postscript. "
        "If you cannot answer, still return a valid JSON with empty strings for each key. "
        "Example of the ONLY valid output:\n"
        # The example must itself be valid JSON (double quotes); a
        # single-quoted example invites output that json.loads rejects.
        + _JSON_EXAMPLE
    )
    return _request_completion(system_prompt, code, temperature=0.4)


def _loads_lenient(candidate: str):
    """Decode ``candidate`` as a JSON object, tolerating common LLM quirks.

    Attempts, in order:
      1. strict ``json.loads`` - never rewrites input that is already valid,
         so apostrophes inside string values survive;
      2. ``ast.literal_eval`` - handles Python-style dicts with single-quoted
         keys/values (safe: literals only, no code execution);
      3. blanket ``'`` -> ``"`` replacement followed by ``json.loads`` - the
         legacy behaviour, kept for single-quoted text that uses JSON
         literals such as ``true``/``null``.

    Raises:
        ValueError: if every strategy fails (``json.JSONDecodeError``).
    """
    try:
        return json.loads(candidate)
    except ValueError:
        pass

    try:
        literal = ast.literal_eval(candidate)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        literal = None
    # ``{'a', 'b'}`` evaluates to a set; only accept real mappings.
    if isinstance(literal, dict):
        return literal

    return json.loads(candidate.replace("'", '"'))


def parse_llm_json_response(response: str):
    """Extract and decode the JSON object embedded in an LLM reply.

    The text between the first ``{`` and the last ``}`` is decoded; anything
    outside it (preamble, markdown fences, postscript) is ignored.

    Args:
        response: Raw model output.

    Returns:
        The decoded dict on success.  On any failure, a dict of the form
        ``{"error": "Failed to parse JSON: ...", "raw": response}``; this
        function does not raise.
    """
    print("DEBUGGGGG ::: ", response)
    try:
        if not isinstance(response, str):
            raise ValueError("Response is not a string.")
        start = response.find("{")
        end = response.rfind("}")
        if start == -1 or end == -1 or end < start:
            raise ValueError("No valid JSON object found in the response.")
        return _loads_lenient(response[start:end + 1])
    except (ValueError, SyntaxError, TypeError, RecursionError) as e:
        print("DEBUGGGGG error ::: ", e)
        return {"error": f"Failed to parse JSON: {e}", "raw": response}


def _append_file(parts: list, path: str, label: str) -> None:
    """Append a header and the contents of ``path`` to ``parts``.

    The file is read completely before anything is appended, so an
    unreadable file contributes only a "Could not read" note and never a
    header without a body.
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            text = f.read()
    except (OSError, UnicodeError) as e:
        parts.append(f"\n# Could not read {path}: {e}\n")
        return
    parts.append(f"\n# ===== File: {label} =====\n")
    parts.append(text)


def combine_repo_files_for_llm(repo_dir="repo_files", output_file="combined_repo.txt"):
    """Combine all .py and .md files under ``repo_dir`` into one text file.

    ``app.py`` and ``README.md`` at the top of ``repo_dir`` come first; the
    remaining files follow in a deterministic (sorted) directory walk.  Each
    file is preceded by a ``# ===== File: <name> =====`` header, and files
    that cannot be read are replaced by a ``# Could not read ...`` note.

    Args:
        repo_dir: Directory to scan recursively.
        output_file: Path of the combined file to write (UTF-8).

    Returns:
        ``output_file``.
    """
    combined_content = []
    # Every path that was attempted, readable or not, so nothing is processed
    # twice.  Seeded with the output path so a previous run's output that
    # lives inside ``repo_dir`` is never folded back into itself.
    seen_files = {os.path.abspath(output_file)}

    for name in _PRIORITY_FILES:
        path = os.path.join(repo_dir, name)
        abs_path = os.path.abspath(path)
        if os.path.isfile(path) and abs_path not in seen_files:
            seen_files.add(abs_path)
            _append_file(combined_content, path, name)

    for root, dirs, files in os.walk(repo_dir):
        dirs.sort()  # in-place sort fixes the traversal order of os.walk
        for name in sorted(files):
            if not name.endswith(_SOURCE_SUFFIXES):
                continue
            path = os.path.join(root, name)
            abs_path = os.path.abspath(path)
            if abs_path in seen_files:
                continue
            seen_files.add(abs_path)
            _append_file(combined_content, path, name)

    with open(output_file, "w", encoding="utf-8") as out_f:
        out_f.write("\n".join(combined_content))
    return output_file


def analyze_code_chunk(code: str) -> str:
    """Analyze one chunk of code and return the model's JSON summary text.

    Args:
        code: The chunk to analyze; sent verbatim as the user message.

    Returns:
        The raw model reply (expected to be a JSON object with the keys
        'strength', 'weaknesses', 'speciality', 'relevance rating').
    """
    chunk_prompt = (
        "You are a highly precise and strict JSON generator. Analyze the following code chunk. "
        "Your ONLY output must be a valid JSON object with the following keys: 'strength', 'weaknesses', 'speciality', 'relevance rating'. "
        + _DOUBLE_QUOTE_RULE
        + "Do NOT include any explanation, markdown, or text outside the JSON. Do NOT add any commentary, preamble, or postscript. "
        "If you cannot answer, still return a valid JSON with empty strings for each key. "
        "Example of the ONLY valid output:\n"
        + _JSON_EXAMPLE
    )
    return _request_completion(chunk_prompt, code, temperature=0.4)


def aggregate_chunk_analyses(chunk_jsons: list) -> str:
    """Merge per-chunk analyses into a single JSON summary using the LLM.

    Args:
        chunk_jsons: Per-chunk analysis texts.  ``None`` entries are skipped
            and non-string entries are converted with ``str``.

    Returns:
        The raw model reply containing the aggregated JSON summary.
    """
    aggregation_prompt = (
        "You are a highly precise and strict, code analyzer and JSON generator. You are given a list of JSON analyses of code chunks. "
        "Aggregate these into a SINGLE overall JSON summary with the same keys: 'strength', 'weaknesses', 'speciality', 'relevance rating'. "
        + _DOUBLE_QUOTE_RULE
        + "Summarize and combine the information from all chunks. Do NOT include any explanation, markdown, or text outside the JSON. "
        "If a key is missing in all chunks, use an empty string. "
        "Example of the ONLY valid output:\n"
        + _JSON_EXAMPLE
    )
    analyses = "\n".join(str(item) for item in chunk_jsons if item is not None)
    user_content = "Here are the chunk analyses:\n" + analyses
    return _request_completion(aggregation_prompt, user_content, temperature=0.3)


def analyze_combined_file(output_file="combined_repo.txt", chunk_size=500):
    """Analyze a combined repo file chunk by chunk and aggregate the results.

    The file is split into chunks of ``chunk_size`` lines, each chunk is sent
    to ``analyze_code_chunk``, and the replies are merged with
    ``aggregate_chunk_analyses``.

    Args:
        output_file: Path of the combined file to read (UTF-8).
        chunk_size: Number of lines per chunk; must be positive.

    Returns:
        A report string containing every chunk's reply followed by the final
        aggregated summary.  On any failure (missing or empty file, invalid
        ``chunk_size``, API error) a string starting with
        ``"Error analyzing combined file:"`` is returned instead of raising.
    """
    try:
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        with open(output_file, "r", encoding="utf-8") as f:
            lines = f.readlines()
        if not lines:
            # Nothing to analyze: avoid a pointless aggregation request.
            raise ValueError(f"{output_file} is empty; nothing to analyze")

        chunk_jsons = [
            analyze_code_chunk("".join(lines[start:start + chunk_size]))
            for start in range(0, len(lines), chunk_size)
        ]
        final_summary = aggregate_chunk_analyses(chunk_jsons)
    except Exception as e:
        # Deliberate catch-all: callers expect an error string, not a raise.
        return f"Error analyzing combined file: {e}"

    chunk_report = "\n\n".join(
        f"Chunk {number} JSON:\n{analysis}"
        for number, analysis in enumerate(chunk_jsons, start=1)
    )
    return (
        "==== Chunk JSON Outputs ====\n"
        + chunk_report
        + "\n\n==== Final Aggregated Summary ===="
        + f"\n{final_summary}"
    )