# OpenSorus / tools/code_index.py
# Commit 531ba0a ("Clean up code") by halfacupoftea
import asyncio
import inspect
import os
from typing import Any, List, Optional

from llama_index.core import VectorStoreIndex, Document, Settings, get_response_synthesizer
from llama_index.core.postprocessor import SimilarityPostprocessor
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.embeddings.mistralai import MistralAIEmbedding
from llama_index.llms.mistralai import MistralAI
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

from config import MISTRAL_API_KEY
from tools.utils import fetch_repo_files, fetch_file_content
# File extensions (with leading dot) that are eligible for indexing.
# Entries are lower-case because callers compare against a lower-cased extension.
INCLUDE_FILE_EXTENSIONS = {
    ".py",
    ".js",
    ".ts",
    ".json",
    ".md",
    ".txt",
}
def safe_normalize(vec: np.ndarray) -> Optional[np.ndarray]:
    """Scale ``vec`` to unit L2 length, or return ``None`` if that is impossible.

    NaN and +/-inf entries are replaced with ``0.0`` before the norm is taken.

    Args:
        vec: Vector to normalize.

    Returns:
        The sanitised vector divided by its L2 norm, or ``None`` when that norm
        is zero or not finite. Callers must check for ``None`` before using
        the result.
    """
    vec = np.nan_to_num(vec, nan=0.0, posinf=0.0, neginf=0.0)
    norm = np.linalg.norm(vec)
    # A zero norm would divide by zero; a non-finite norm cannot yield a
    # meaningful unit vector either.
    if norm == 0 or not np.isfinite(norm):
        return None
    return vec / norm
def select_relevant_files_semantic(issue_description: str, file_paths: List[str]) -> List[str]:
    """Pick the file paths that are semantically closest to an issue description.

    The issue text and each path *string* (not the file's content) are embedded
    with the ``codestral-embed`` model, normalized, and compared by cosine
    similarity. The two best-scoring paths are kept, and ``README.md`` is put
    in front whenever it appears in ``file_paths`` but was not already selected.

    Args:
        issue_description: Text of the issue to match against.
        file_paths: Candidate repository paths.

    Returns:
        Up to two paths ordered by descending similarity, optionally preceded
        by ``README.md``. An empty list if the issue embedding cannot be
        normalized.
    """
    embedder = MistralAIEmbedding(model_name="codestral-embed", api_key=MISTRAL_API_KEY)

    query_vec = safe_normalize(
        np.array(embedder.get_text_embedding(issue_description), dtype=np.float64)
    )
    if query_vec is None:
        print("[Warning] Issue description embedding invalid (zero or NaN norm). Returning empty list.")
        return []

    ranked = []
    for path in file_paths:
        # Best effort: a failure on one path must not abort the whole ranking.
        try:
            path_vec = safe_normalize(
                np.array(embedder.get_text_embedding(path), dtype=np.float64)
            )
            if path_vec is None:
                print(f"[Warning] Skipping {path} due to zero or invalid embedding norm.")
                continue

            with np.errstate(divide='ignore', invalid='ignore', over='ignore'):
                similarity = cosine_similarity([query_vec], [path_vec])[0][0]

            if not np.isfinite(similarity):
                print(f"[Warning] Skipping {path} due to invalid similarity score.")
                continue

            ranked.append((path, similarity))
        except Exception as e:
            print(f"[Warning] Skipping {path} due to error: {e}")

    # Highest similarity first; keep only the two best candidates.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    top_files = [path for path, _ in ranked[:2]]

    if "README.md" in file_paths and "README.md" not in top_files:
        top_files.insert(0, "README.md")
    return top_files
async def async_retry_on_429(func, *args, max_retries=3, delay=1, **kwargs):
    """Call ``func`` and retry with exponential backoff on HTTP 429 errors.

    ``func`` may be a coroutine function or a plain callable; an awaitable
    result is awaited, any other result is returned as-is.

    Args:
        func: Callable to invoke as ``func(*args, **kwargs)``.
        *args: Positional arguments forwarded to ``func``.
        max_retries: Total number of attempts. Values below 1 are treated as 1
            so that ``func`` is always tried at least once.
        delay: Seconds to wait before the second attempt; doubled after every
            rate-limited attempt.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        Whatever ``func`` returns (after awaiting it, if awaitable).

    Raises:
        Exception: Any exception from ``func`` that is not a 429, immediately;
            or the last 429 exception once all attempts are used up.
    """
    attempts = max(1, max_retries)
    for attempt in range(1, attempts + 1):
        try:
            result = func(*args, **kwargs)
            if inspect.isawaitable(result):
                result = await result
            return result
        except Exception as e:
            # Look the status up without truth-testing the response object:
            # some HTTP libraries make error responses falsy.
            response = getattr(e, "response", None)
            status = getattr(response, "status_code", None)
            # Propagate non-rate-limit errors at once, and the rate-limit
            # error itself when no attempts remain (instead of returning None).
            if status != 429 or attempt == attempts:
                raise
            func_name = getattr(func, "__name__", repr(func))
            print(f"[Retry] Rate limit hit while calling {func_name}. Attempt {attempt}/{attempts}. Retrying in {delay} seconds...")
            await asyncio.sleep(delay)
            delay *= 2
async def build_repo_index(owner: str, repo: str, ref: str = "main", issue_description: str = "") -> VectorStoreIndex:
    """Fetch repository files and build a vector index over their contents.

    Steps: list the repository's files, keep only those whose extension is in
    ``INCLUDE_FILE_EXTENSIONS``, optionally narrow them to the ones most
    relevant to ``issue_description``, download each remaining file, and embed
    the resulting documents with the ``codestral-embed`` model.

    Args:
        owner: Repository owner passed to the fetch helpers.
        repo: Repository name passed to the fetch helpers.
        ref: Git ref to read files from.
        issue_description: When non-empty, used to pre-select relevant files
            via ``select_relevant_files_semantic``.

    Returns:
        A ``VectorStoreIndex`` built from the successfully fetched files.

    Raises:
        Exception: Whatever listing the files or building the index raises.
            Failures on individual files are logged and skipped instead.
    """
    model_name = "codestral-embed"
    embed_model = MistralAIEmbedding(model_name=model_name, api_key=MISTRAL_API_KEY)

    print(f"[Indexing] Starting to index repository: {owner}/{repo} at ref {ref}...")

    file_paths = await async_retry_on_429(fetch_repo_files, owner, repo, ref)

    # Filter by extension *before* semantic selection: otherwise the few
    # selected slots can be taken by files that would be discarded afterwards,
    # and every non-indexable path would cost an embedding call.
    file_paths = [
        path
        for path in file_paths
        if os.path.splitext(path)[1].lower() in INCLUDE_FILE_EXTENSIONS
    ]

    if issue_description:
        file_paths = select_relevant_files_semantic(issue_description, file_paths)  # stays sync unless heavy

    documents = []
    for path in file_paths:
        # Best effort: one unreadable file must not abort the whole index.
        try:
            content = await async_retry_on_429(fetch_file_content, owner, repo, path, ref)
            documents.append(Document(text=content, metadata={"file_path": path}))
            print(f"[Indexing] Added file: {path}")
            # Small pause between downloads to stay clear of rate limits.
            await asyncio.sleep(0.1)
        except Exception as e:
            print(f"[Warning] Skipping file {path} due to error: {e}")

    async def _index_documents():
        # from_documents is a synchronous call, so it cannot be awaited
        # directly; run it in a worker thread to keep the event loop free.
        return await asyncio.to_thread(
            VectorStoreIndex.from_documents, documents, embed_model=embed_model
        )

    try:
        index = await async_retry_on_429(_index_documents)
    except Exception as e:
        print(f"[Error] Failed to build index due to: {e}")
        raise

    print(f"[Indexing] Finished indexing {len(documents)} files.")
    return index
async def retrieve_context(owner: str, repo: str, ref: str, issue_description: str) -> Any:
    """Build an index for the repository and query it for issue-relevant context.

    Side effect: assigns the global LlamaIndex ``Settings.llm`` and
    ``Settings.embed_model``.

    Args:
        owner: Repository owner.
        repo: Repository name.
        ref: Git ref to index.
        issue_description: Issue text; used both to pre-select files and as
            the subject of the query prompt.

    Returns:
        The object returned by ``query_engine.query`` (not a list of strings).
        NOTE(review): presumably a LlamaIndex response whose ``str()`` is the
        synthesized answer -- confirm against callers before narrowing the type.
    """
    index = await build_repo_index(owner, repo, ref, issue_description)

    Settings.llm = MistralAI(model="codestral-latest", api_key=MISTRAL_API_KEY)
    Settings.embed_model = MistralAIEmbedding(model_name="codestral-embed", api_key=MISTRAL_API_KEY)

    # The retriever already caps results at 3 nodes; the postprocessor then
    # drops any node scoring below the similarity cutoff.
    retriever = index.as_retriever(similarity_top_k=3)
    query_engine = RetrieverQueryEngine(
        retriever=retriever,
        response_synthesizer=get_response_synthesizer(),
        node_postprocessors=[
            SimilarityPostprocessor(similarity_cutoff=0.75)
        ],
    )

    query = (
        f"Please give relevant information from the codebase that highly matches the keywords of this issue and is useful for solving or understanding this issue: {issue_description}\n"
        "STRICT RULES:\n"
        "- ONLY use information available in the retriever context.\n"
        "- DO NOT generate or assume any information outside the given context.\n"
        f"- ONLY include context that is highly relevant and clearly useful for understanding or solving this issue: {issue_description}\n"
        "- DO NOT include generic, loosely related, or unrelated content.\n"
    )

    # query() is synchronous; run it in a worker thread so the event loop
    # is not blocked while it executes.
    response = await asyncio.to_thread(query_engine.query, query)

    print(response)
    return response