# DeepGit-lite / src / deepgit_lite.py
# zamalali
# Refine DeepGit Lite description and improve error handling for GitHub API key and document embeddings
# 1796763
# raw
# history blame
# 11.3 kB
import os
import base64
import requests
import numpy as np
import datetime
from sentence_transformers import SentenceTransformer
import faiss
import math
import logging
from dotenv import load_dotenv
from pathlib import Path
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
# ---------------------------
# Environment and .env Setup
# ---------------------------
# The .env file is expected one directory above the folder holding this module.
dotenv_path = Path(__file__).resolve().parent.parent / ".env"
load_dotenv(dotenv_path=str(dotenv_path))

# Fail fast at import time when the GitHub token is absent from the environment.
if os.environ.get("GITHUB_API_KEY") is None:
    raise EnvironmentError(
        "GITHUB_API_KEY not set in environment. Please set it as an environment variable."
    )
# ---------------------------
# Logging Setup
# ---------------------------
# Root logging configuration: INFO and above, prefixed with timestamp and level.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
# Module-level logger used by the functions in this file.
logger = logging.getLogger(__name__)
# ---------------------------
# ChatGroq Integration Setup (for query enhancement and final justification)
# ---------------------------
# Shared ChatGroq client; enhance_query and justify_candidate both call
# llm_groq.invoke on it.
llm_groq = ChatGroq(
    model="llama-3.1-8b-instant",
    max_tokens=100,
    temperature=0.2,
    max_retries=2,
    timeout=15,
)
def enhance_query(original_query):
    """Ask the Groq chat model to expand a user query into a richer one.

    Args:
        original_query: Query text that is interpolated into the prompt.

    Returns:
        The reply's ``content`` attribute when the reply object has one,
        otherwise the string form of the reply object.
    """
    # NOTE(review): the instruction below always steers the expansion toward
    # Chain of Thought prompting in a Python environment, whatever the query is
    # about -- confirm this is intended.
    instruction = f"""You are an expert research assistant. Given the query: "{original_query}",
please enhance and expand it by adding relevant technical keywords, recent research context,
and details specifically related to the application of Chain of Thought prompting in large language models within a Python environment.
Provide the refined query text."""
    reply = llm_groq.invoke(
        [
            ("system", "You are a helpful research assistant specializing in AI and software research."),
            ("human", instruction),
        ]
    )
    # Prefer the message text; fall back to the raw object's string form.
    return reply.content if hasattr(reply, "content") else str(reply)
def justify_candidate(candidate, query):
    """Ask the Groq chat model for a short reason a repository matches a query.

    Args:
        candidate: Mapping describing a repository. ``"title"`` and ``"stars"``
            are read by key; ``"semantic_similarity"`` is optional and is
            treated as 0 when missing.
        query: Query text the repository is being matched against.

    Returns:
        The reply's ``content`` attribute when the reply object has one,
        otherwise the string form of the reply object.
    """
    instruction = f"""You are a highly knowledgeable AI research assistant. In one to two lines, explain why the repository titled "{candidate['title']}" is a good match for a query on "{query}". Mention key factors such as documentation quality and community validation if relevant.
Repository Details:
- Stars: {candidate['stars']}
- Semantic Similarity: {candidate.get('semantic_similarity', 0):.4f}
Provide a concise justification:"""
    reply = llm_groq.invoke(
        [
            ("system", "You are a highly knowledgeable AI research assistant that can succinctly justify repository matches."),
            ("human", instruction),
        ]
    )
    # Prefer the message text; fall back to the raw object's string form.
    return reply.content if hasattr(reply, "content") else str(reply)
# ---------------------------
# GitHub API Helper Functions
# ---------------------------
def fetch_readme_content(repo_full_name, headers, timeout=15):
    """Fetch and decode a repository's README through the GitHub REST API.

    The lookup is best-effort: any failure yields an empty string so that one
    unreachable or malformed README does not abort the caller's loop.

    Args:
        repo_full_name: Value placed in the ``/repos/{repo_full_name}/readme``
            URL (the search helper passes each result's ``full_name``).
        headers: HTTP headers sent with the request.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The README text, or ``""`` when the request fails, the status code is
        not 200, or the payload cannot be parsed/decoded.
    """
    readme_url = f"https://api.github.com/repos/{repo_full_name}/readme"
    try:
        # Without a timeout a stalled connection would block indefinitely.
        response = requests.get(readme_url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        logger.error(f"Error fetching README for {repo_full_name}: {exc}")
        return ""
    if response.status_code != 200:
        return ""
    try:
        encoded = response.json().get('content', '') or ''
        # errors="replace" keeps READMEs that are not valid UTF-8 usable.
        return base64.b64decode(encoded).decode('utf-8', errors='replace')
    except ValueError as exc:
        # ValueError covers both invalid JSON and invalid base64 input.
        logger.error(f"Error decoding README for {repo_full_name}: {exc}")
        return ""
def fetch_file_content(download_url, timeout=15):
    """Download a file's text from a URL, returning "" on any failure.

    Args:
        download_url: URL to fetch. A missing/empty value is skipped without
            issuing a request.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The response body as text when the status code is 200, otherwise ``""``.
    """
    if not download_url:
        # Nothing to download; skip the request entirely.
        return ""
    try:
        # Without a timeout a stalled connection would block indefinitely.
        response = requests.get(download_url, timeout=timeout)
    except requests.RequestException as e:
        logger.error(f"Error fetching file: {e}")
        return ""
    return response.text if response.status_code == 200 else ""
def fetch_directory_markdown(repo_full_name, path, headers, timeout=15):
    """Concatenate the Markdown files found directly inside a repository folder.

    Only the immediate entries of ``path`` are inspected; sub-folders are not
    descended into.

    Args:
        repo_full_name: Value placed in the ``/repos/{repo_full_name}/contents``
            URL.
        path: Folder path inside the repository, appended to the contents URL.
        headers: HTTP headers sent with the listing request.
        timeout: Seconds to wait for the listing request before giving up.

    Returns:
        One ``"\\n\\n# <file name>\\n<content>"`` section per ``.md`` file, joined
        into a single string; ``""`` when the listing cannot be retrieved.
    """
    url = f"https://api.github.com/repos/{repo_full_name}/contents/{path}"
    try:
        # Without a timeout a stalled connection would block indefinitely.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        logger.error(f"Error listing {path} in {repo_full_name}: {exc}")
        return ""
    if response.status_code != 200:
        return ""
    try:
        items = response.json()
    except ValueError as exc:
        logger.error(f"Invalid listing for {path} in {repo_full_name}: {exc}")
        return ""
    if not isinstance(items, list):
        # The contents endpoint answers with a single object (not a list) when
        # the path is not a directory; there is nothing to iterate then.
        return ""

    sections = []
    for item in items:
        if item["type"] == "file" and item["name"].lower().endswith(".md"):
            content = fetch_file_content(item["download_url"])
            sections.append(f"\n\n# {item['name']}\n" + content)
    return "".join(sections)
def fetch_repo_documentation(repo_full_name, headers, timeout=15):
    """Collect a repository's README plus other top-level documentation.

    The result is built from, in order: the README, every other ``.md`` file in
    the repository root, and the Markdown files inside a root folder named
    ``docs`` or ``documentation`` (case-insensitive).

    Args:
        repo_full_name: Value placed in the ``/repos/{repo_full_name}/contents``
            URL and passed on to the README/folder helpers.
        headers: HTTP headers sent with the GitHub API requests.
        timeout: Seconds to wait for the root listing request before giving up.

    Returns:
        The combined documentation text, or ``"No documentation available."``
        when nothing but whitespace was collected.
    """
    parts = []

    # Fetch README first.
    readme = fetch_readme_content(repo_full_name, headers)
    if readme:
        parts.append("# README\n" + readme)

    # Fetch additional markdown files and documentation directories.
    root_url = f"https://api.github.com/repos/{repo_full_name}/contents"
    items = []
    try:
        # Without a timeout a stalled connection would block indefinitely.
        response = requests.get(root_url, headers=headers, timeout=timeout)
        if response.status_code == 200:
            items = response.json()
    except requests.RequestException as exc:
        logger.error(f"Error listing contents of {repo_full_name}: {exc}")
    except ValueError as exc:
        logger.error(f"Invalid contents listing for {repo_full_name}: {exc}")
    if not isinstance(items, list):
        # A non-list payload cannot be iterated as directory entries.
        items = []

    for item in items:
        name_lower = item["name"].lower()
        if item["type"] == "file" and name_lower.endswith(".md"):
            # The README was already added above; skip it here.
            if name_lower != "readme.md":
                content = fetch_file_content(item["download_url"])
                parts.append(f"\n\n# {item['name']}\n" + content)
        elif item["type"] == "dir" and name_lower in ["docs", "documentation"]:
            parts.append(
                f"\n\n# {item['name']} folder\n"
                + fetch_directory_markdown(repo_full_name, item["name"], headers)
            )

    doc_text = "".join(parts)
    return doc_text if doc_text.strip() else "No documentation available."
def fetch_github_repositories(query, max_results=1000, per_page=100, timeout=15):
    """Search GitHub repositories (sorted by stars, descending) and gather docs.

    Args:
        query: Search string sent as the ``q`` parameter.
        max_results: Upper bound on the number of repositories returned.
        per_page: Page size requested from the search endpoint; clamped to the
            range 1..100 (GitHub does not serve more than 100 items per page).
        timeout: Seconds to wait for each search request before giving up.

    Returns:
        A list of at most ``max_results`` dicts with the keys ``title``,
        ``link``, ``combined_doc``, ``stars``, ``full_name`` and
        ``open_issues_count``. Paging stops early on a request error, a
        non-200 status, or an empty/short page.
    """
    if max_results <= 0:
        return []
    per_page = max(1, min(per_page, 100))

    url = "https://api.github.com/search/repositories"
    headers = {
        "Authorization": f"token {os.getenv('GITHUB_API_KEY')}",
        "Accept": "application/vnd.github.v3+json"
    }
    repositories = []
    # Ceiling division: a final partial page is still requested, and a
    # max_results smaller than per_page yields one page instead of zero.
    num_pages = (max_results + per_page - 1) // per_page
    for page in range(1, num_pages + 1):
        params = {
            "q": query,
            "sort": "stars",
            "order": "desc",
            "per_page": per_page,
            "page": page
        }
        try:
            response = requests.get(url, headers=headers, params=params, timeout=timeout)
        except requests.RequestException as exc:
            logger.error(f"Error fetching search page {page}: {exc}")
            break
        try:
            payload = response.json()
        except ValueError:
            # Error bodies are not guaranteed to be JSON.
            payload = {}
        if response.status_code != 200:
            logger.error(f"Error {response.status_code}: {payload.get('message')}")
            break
        items = payload.get('items', [])
        if not items:
            break
        for repo in items:
            if len(repositories) >= max_results:
                break
            repo_link = repo['html_url']
            full_name = repo.get('full_name', '')
            doc_content = fetch_repo_documentation(full_name, headers)
            star_count = repo.get('stargazers_count', 0)
            repositories.append({
                "title": repo.get('name', 'No title available'),
                "link": repo_link,
                "combined_doc": doc_content,
                "stars": star_count,
                "full_name": full_name,
                "open_issues_count": repo.get('open_issues_count', 0)
            })
        # A short page means the result set is exhausted; skip the extra request.
        if len(repositories) >= max_results or len(items) < per_page:
            break
    logger.info(f"Fetched {len(repositories)} repositories from GitHub.")
    return repositories
# ---------------------------
# Main Lite Workflow Function
# ---------------------------
def _normalize_rows(embeddings):
    """Scale each row of a 2-D array to unit L2 norm (epsilon avoids 0-division)."""
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    return embeddings / (norms + 1e-10)


def _min_max_scale(value, low, high):
    """Map ``value`` from [low, high] onto [0, 1]; return 0.5 for an empty range."""
    if high - low == 0:
        return 0.5
    return (value - low) / (high - low)


def _format_ranked_results(ranked, justifications):
    """Render ranked repositories (and justifications keyed by link) as text."""
    lines = ["\n=== Final Ranked Repositories ===\n"]
    for rank, repo in enumerate(ranked, 1):
        justification = justifications.get(repo['link'], 'No justification available')
        lines.append(f"Final Rank: {rank}\n")
        lines.append(f"Title: {repo['title']}\n")
        lines.append(f"Link: {repo['link']}\n")
        lines.append(f"Stars: {repo['stars']}\n")
        lines.append(f"Semantic Similarity: {repo.get('semantic_similarity', 0):.4f}\n")
        lines.append(f"Final Score: {repo.get('final_score', 0):.4f}\n")
        lines.append(f"Justification: {justification}\n")
        lines.append(f"Combined Doc Snippet: {repo['combined_doc'][:200]}...\n")
        lines.append('-' * 80 + "\n")
    lines.append("\n=== End of Results ===")
    return "".join(lines)


def run_deepgit_lite(user_query):
    """Run the full search pipeline for a query and return a text report.

    Stages:
        0. Expand the query with ``enhance_query`` and append
           ``" language:python"`` for the GitHub search.
        1. Embed every repository's documentation with the
           ``all-mpnet-base-v2`` sentence-transformer and score the (up to) 100
           nearest documents against the query with a FAISS inner-product
           index over unit-normalised vectors.
        2. Keep repositories with at least 50 stars (all of them if none pass).
        3. Rank by ``0.6 * semantic + 0.4 * log(stars + 1)``, each min-max
           scaled over the remaining candidates.
        4. Ask the LLM for a short justification of each of the top 10.

    Args:
        user_query: The query text supplied by the user.

    Returns:
        A formatted multi-line report of the top 10 repositories, or a short
        message when no repositories or no embeddings could be produced.
    """
    # Stage 0: Query Enhancement using ChatGroq
    logger.info("Enhancing query using ChatGroq...")
    original_query = user_query.strip()
    enhanced_query = enhance_query(original_query)
    logger.info(f"Enhanced Query: {enhanced_query}")
    github_query = enhanced_query + " language:python"
    logger.info(f"Using GitHub query: {github_query}")

    # Stage 1: Dense Retrieval with FAISS
    logger.info("Fetching repositories from GitHub...")
    repos = fetch_github_repositories(github_query)
    if not repos:
        logger.error("No repositories found. Please refine your query.")
        return "\nNo repositories found for your query. Please try a different query."

    docs = [repo.get("combined_doc", "") for repo in repos]
    logger.info(f"Encoding {len(docs)} documents for dense retrieval...")
    sem_model = SentenceTransformer("all-mpnet-base-v2")
    doc_embeddings = sem_model.encode(docs, convert_to_numpy=True, show_progress_bar=True, batch_size=16)

    # Check if embeddings array is empty or 1-dimensional
    if doc_embeddings.ndim < 2 or doc_embeddings.shape[0] == 0:
        logger.error("No document embeddings generated. Aborting dense retrieval.")
        return "\nFailed to generate document embeddings. Please try again."

    doc_embeddings = _normalize_rows(doc_embeddings)
    query_embedding = sem_model.encode(user_query, convert_to_numpy=True)
    query_embedding = _normalize_rows(np.expand_dims(query_embedding, axis=0))[0]

    # Inner product of unit vectors, so the scores are cosine similarities.
    index = faiss.IndexFlatIP(doc_embeddings.shape[1])
    index.add(doc_embeddings)
    k = min(100, doc_embeddings.shape[0])
    scores, indices = index.search(np.expand_dims(query_embedding, axis=0), k)
    for idx, score in zip(indices[0], scores[0]):
        repos[idx]["semantic_similarity"] = score
    # Repositories outside the top-k keep no score and sort as 0.
    ranked_by_semantic = sorted(repos, key=lambda x: x.get("semantic_similarity", 0), reverse=True)
    logger.info(f"Stage 1 complete: {len(ranked_by_semantic)} candidates ranked by semantic similarity.")

    # Stage 2: Filtering Low-Star Repositories
    filtered_candidates = [repo for repo in ranked_by_semantic if repo["stars"] >= 50]
    if not filtered_candidates:
        filtered_candidates = ranked_by_semantic  # fallback if filtering is too strict
    logger.info(f"Stage 2 complete: {len(filtered_candidates)} candidates remain after filtering low-star repositories.")

    # Stage 3: Combine Scores for Final Ranking (Using Semantic Similarity and Stars Only)
    semantic_scores = [repo.get("semantic_similarity", 0) for repo in filtered_candidates]
    star_scores = [math.log(repo.get("stars", 0) + 1) for repo in filtered_candidates]
    min_sem, max_sem = min(semantic_scores), max(semantic_scores)
    min_star, max_star = min(star_scores), max(star_scores)
    for repo, sem_score, star_score in zip(filtered_candidates, semantic_scores, star_scores):
        norm_sem = _min_max_scale(sem_score, min_sem, max_sem)
        norm_star = _min_max_scale(star_score, min_star, max_star)
        # Weights: 60% semantic, 40% stars.
        repo["final_score"] = 0.6 * norm_sem + 0.4 * norm_star
    final_ranked = sorted(filtered_candidates, key=lambda x: x["final_score"], reverse=True)
    logger.info(f"Stage 3 complete: Final ranking computed for {len(final_ranked)} candidates.")

    # Stage 4: Final Justification using ChatGroq
    top_candidates = final_ranked[:10]
    # Keyed by link rather than title: different owners can publish
    # repositories with the same name, which would otherwise share one entry.
    justifications = {}
    for repo in top_candidates:
        try:
            justification = justify_candidate(repo, user_query)
        except Exception as exc:
            # Best-effort: one failed LLM call should not discard the whole
            # ranking; the report shows the fallback text for this entry.
            logger.error(f"Justification failed for {repo['title']}: {exc}")
            continue
        justifications[repo['link']] = justification
        logger.info(f"Justification for {repo['title']}: {justification}")

    # Format final results into a text table.
    return _format_ranked_results(top_candidates, justifications)