"""Semantic repository indexing and issue-context retrieval.

Builds a LlamaIndex vector index over a GitHub repository's text files
(embedded with Mistral's ``codestral-embed``) and queries it for context
relevant to a given issue description.
"""

import asyncio
import os
from typing import List, Optional

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

from llama_index.core import (
    Document,
    Settings,
    VectorStoreIndex,
    get_response_synthesizer,
)
from llama_index.core.postprocessor import SimilarityPostprocessor
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.embeddings.mistralai import MistralAIEmbedding
from llama_index.llms.mistralai import MistralAI

from config import MISTRAL_API_KEY
from tools.utils import fetch_file_content, fetch_repo_files

# Only files with these extensions are fetched and indexed.
INCLUDE_FILE_EXTENSIONS = {".py", ".js", ".ts", ".json", ".md", ".txt"}


def safe_normalize(vec: np.ndarray) -> Optional[np.ndarray]:
    """Return *vec* scaled to unit length, or ``None`` if that is impossible.

    NaN/inf components are zeroed first. A zero or non-finite norm means the
    vector carries no usable direction, so ``None`` is returned rather than
    dividing by zero. (Annotation fixed: the original claimed a plain
    ``np.ndarray`` return despite returning ``None`` on failure.)
    """
    vec = np.nan_to_num(vec, nan=0.0, posinf=0.0, neginf=0.0)
    norm = np.linalg.norm(vec)
    if norm == 0 or np.isnan(norm) or np.isinf(norm):
        return None
    return vec / norm


def select_relevant_files_semantic(
    issue_description: str, file_paths: List[str]
) -> List[str]:
    """Rank repository file paths by embedding similarity to the issue text.

    Returns the two highest-scoring paths, with ``README.md`` prepended
    whenever the repository has one. NOTE(review): the *path string* itself
    is embedded, not the file contents — presumably intentional to keep this
    cheap; confirm against design intent.

    :param issue_description: Free-text description of the issue.
    :param file_paths: Candidate repository-relative file paths.
    :return: Selected paths, best-first; empty if the issue embedding is invalid.
    """
    embed_model = MistralAIEmbedding(
        model_name="codestral-embed", api_key=MISTRAL_API_KEY
    )

    issue_embedding = np.array(
        embed_model.get_text_embedding(issue_description), dtype=np.float64
    )
    issue_embedding = safe_normalize(issue_embedding)
    if issue_embedding is None:
        print(
            "[Warning] Issue description embedding invalid (zero or NaN norm). "
            "Returning empty list."
        )
        return []

    scored_files = []
    for path in file_paths:
        try:
            file_embedding = np.array(
                embed_model.get_text_embedding(path), dtype=np.float64
            )
            file_embedding = safe_normalize(file_embedding)
            if file_embedding is None:
                print(f"[Warning] Skipping {path} due to zero or invalid embedding norm.")
                continue
            # Both vectors are unit-norm already; errstate silences any
            # residual float warnings from the sklearn call.
            with np.errstate(divide="ignore", invalid="ignore", over="ignore"):
                score = cosine_similarity([issue_embedding], [file_embedding])[0][0]
            if np.isnan(score) or np.isinf(score):
                print(f"[Warning] Skipping {path} due to invalid similarity score.")
                continue
            scored_files.append((path, score))
        except Exception as e:
            # Best-effort: one bad path must not abort the whole selection.
            print(f"[Warning] Skipping {path} due to error: {e}")

    # Keep the two best-scoring paths.
    top_files = [f[0] for f in sorted(scored_files, key=lambda x: x[1], reverse=True)[:2]]
    # README.md is always useful context; force-include it when present.
    if "README.md" in file_paths and "README.md" not in top_files:
        top_files.insert(0, "README.md")
    return top_files


async def async_retry_on_429(func, *args, max_retries=3, delay=1, **kwargs):
    """Await ``func(*args, **kwargs)``, retrying with exponential backoff on HTTP 429.

    Non-429 exceptions propagate immediately. Bug fix: the original fell off
    the loop after ``max_retries`` rate-limit hits and silently returned
    ``None`` (crashing callers later); we now re-raise the last exception.

    :param func: Awaitable callable to invoke.
    :param max_retries: Maximum number of attempts.
    :param delay: Initial backoff in seconds; doubles after each 429.
    :raises Exception: The last 429 error once retries are exhausted, or any
        non-429 error immediately.
    """
    last_exc = None
    for attempt in range(max_retries):
        try:
            return await func(*args, **kwargs)
        except Exception as e:
            response = getattr(e, "response", None)
            status = getattr(response, "status_code", None) if response is not None else None
            if status != 429:
                raise
            last_exc = e
            print(
                f"[Retry] Rate limit hit while calling {func.__name__}. "
                f"Attempt {attempt+1}/{max_retries}. Retrying in {delay} seconds..."
            )
            await asyncio.sleep(delay)
            delay *= 2
    if last_exc is not None:
        raise last_exc


async def build_repo_index(
    owner: str, repo: str, ref: str = "main", issue_description: str = ""
) -> VectorStoreIndex:
    """Fetch a repository's files and build a vector index over them.

    When *issue_description* is given, the file list is first narrowed with
    :func:`select_relevant_files_semantic`. Only extensions in
    ``INCLUDE_FILE_EXTENSIONS`` are indexed; individual fetch failures are
    logged and skipped.

    :raises Exception: If the index itself cannot be built.
    """
    model_name = "codestral-embed"
    embed_model = MistralAIEmbedding(model_name=model_name, api_key=MISTRAL_API_KEY)

    print(f"[Indexing] Starting to index repository: {owner}/{repo} at ref {ref}...")
    file_paths = await async_retry_on_429(fetch_repo_files, owner, repo, ref)

    if issue_description:
        file_paths = select_relevant_files_semantic(issue_description, file_paths)  # stays sync unless heavy

    documents = []
    for path in file_paths:
        _, ext = os.path.splitext(path)
        if ext.lower() not in INCLUDE_FILE_EXTENSIONS:
            continue
        try:
            content = await async_retry_on_429(fetch_file_content, owner, repo, path, ref)
            documents.append(Document(text=content, metadata={"file_path": path}))
            print(f"[Indexing] Added file: {path}")
            # Small pause between fetches to stay under rate limits.
            await asyncio.sleep(0.1)
        except Exception as e:
            print(f"[Warning] Skipping file {path} due to error: {e}")

    try:
        index = await async_retry_on_429(
            VectorStoreIndex.from_documents, documents, embed_model=embed_model
        )
    except Exception as e:
        print(f"[Error] Failed to build index due to: {e}")
        raise

    print(f"[Indexing] Finished indexing {len(documents)} files.")
    return index


async def retrieve_context(
    owner: str, repo: str, ref: str, issue_description: str
) -> List[str]:
    """Index the repository and query it for context relevant to the issue.

    NOTE(review): despite the declared ``List[str]`` return type, this
    returns the llama_index query ``Response`` object directly — annotation
    kept for caller compatibility; confirm which one callers expect.
    """
    index = await build_repo_index(owner, repo, ref, issue_description)

    Settings.llm = MistralAI(model="codestral-latest", api_key=MISTRAL_API_KEY)
    Settings.embed_model = MistralAIEmbedding(
        model_name="codestral-embed", api_key=MISTRAL_API_KEY
    )

    retriever = index.as_retriever(similarity_top_k=3)
    query_engine = RetrieverQueryEngine(
        retriever=retriever,
        response_synthesizer=get_response_synthesizer(),
        node_postprocessors=[
            # NOTE(review): SimilarityPostprocessor documents only
            # `similarity_cutoff`; verify `similarity_top_k` is accepted here.
            SimilarityPostprocessor(similarity_top_k=3, similarity_cutoff=0.75)
        ],
    )

    query = (
        f"Please give relevant information from the codebase that highly matches the keywords of this issue and is useful for solving or understanding this issue: {issue_description}\n"
        "STRICT RULES:\n"
        "- ONLY use information available in the retriever context.\n"
        "- DO NOT generate or assume any information outside the given context.\n"
        f"- ONLY include context that is highly relevant and clearly useful for understanding or solving this issue: {issue_description}\n"
        "- DO NOT include generic, loosely related, or unrelated content.\n"
    )

    # query_engine.query is blocking; run it off the event loop.
    response = await asyncio.to_thread(query_engine.query, query)
    print(response)
    return response