import os
import base64
import requests
import numpy as np
import datetime
from sentence_transformers import SentenceTransformer
import faiss
import math
import logging
from dotenv import load_dotenv
from pathlib import Path
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate

# ---------------------------
# Environment and .env Setup
# ---------------------------
dotenv_path = Path(__file__).resolve().parent.parent / ".env"
load_dotenv(dotenv_path=str(dotenv_path))

if "GITHUB_API_KEY" not in os.environ:
    raise EnvironmentError("GITHUB_API_KEY is not set. Please export it as an environment variable or add it to the .env file.")
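# Note: the ChatGroq client configured below reads its own credentials as well
# (typically the GROQ_API_KEY environment variable); make sure it is available
# in the environment or the .env file too.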

# ---------------------------
# Logging Setup
# ---------------------------
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# ---------------------------
# ChatGroq Integration Setup (for query enhancement and final justification)
# ---------------------------
llm_groq = ChatGroq(
    model="llama-3.1-8b-instant",
    temperature=0.2,
    max_tokens=100,
    timeout=15,
    max_retries=2
)

def enhance_query(original_query):
    prompt = f"""You are an expert research assistant. Given the query: "{original_query}",
please enhance and expand it by adding relevant technical keywords, recent research context,
and details specifically related to the application of Chain of Thought prompting in large language models within a Python environment.
Provide the refined query text."""
    messages = [
        ("system", "You are a helpful research assistant specializing in AI and software research."),
        ("human", prompt)
    ]
    result = llm_groq.invoke(messages)
    # Extract the text content if the response object exposes it.
    if hasattr(result, "content"):
        return result.content
    return str(result)

def justify_candidate(candidate, query):
    prompt = f"""You are a highly knowledgeable AI research assistant. In one to two lines, explain why the repository titled "{candidate['title']}" is a good match for a query on "{query}". Mention key factors such as documentation quality and community validation if relevant.
Repository Details:
- Stars: {candidate['stars']}
- Semantic Similarity: {candidate.get('semantic_similarity', 0):.4f}
Provide a concise justification:"""
    messages = [
        ("system", "You are a highly knowledgeable AI research assistant that can succinctly justify repository matches."),
        ("human", prompt)
    ]
    result = llm_groq.invoke(messages)
    if hasattr(result, "content"):
        return result.content
    return str(result)

# ---------------------------
# GitHub API Helper Functions
# ---------------------------
def fetch_readme_content(repo_full_name, headers):
    readme_url = f"https://api.github.com/repos/{repo_full_name}/readme"
    response = requests.get(readme_url, headers=headers)
    if response.status_code == 200:
        readme_data = response.json()
        return base64.b64decode(readme_data.get('content', '')).decode('utf-8')
    return ""

def fetch_file_content(download_url):
    try:
        response = requests.get(download_url)
        if response.status_code == 200:
            return response.text
    except Exception as e:
        logger.error(f"Error fetching file: {e}")
    return ""

def fetch_directory_markdown(repo_full_name, path, headers):
    md_content = ""
    url = f"https://api.github.com/repos/{repo_full_name}/contents/{path}"
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        items = response.json()
        for item in items:
            if item["type"] == "file" and item["name"].lower().endswith(".md"):
                content = fetch_file_content(item["download_url"])
                md_content += f"\n\n# {item['name']}\n" + content
    return md_content

def fetch_repo_documentation(repo_full_name, headers):
    doc_text = ""
    # Fetch the README first.
    readme = fetch_readme_content(repo_full_name, headers)
    if readme:
        doc_text += "# README\n" + readme
    # Fetch additional markdown files and documentation directories.
    root_url = f"https://api.github.com/repos/{repo_full_name}/contents"
    response = requests.get(root_url, headers=headers)
    if response.status_code == 200:
        items = response.json()
        for item in items:
            if item["type"] == "file" and item["name"].lower().endswith(".md"):
                if item["name"].lower() != "readme.md":
                    content = fetch_file_content(item["download_url"])
                    doc_text += f"\n\n# {item['name']}\n" + content
            elif item["type"] == "dir" and item["name"].lower() in ["docs", "documentation"]:
                doc_text += f"\n\n# {item['name']} folder\n" + fetch_directory_markdown(repo_full_name, item["name"], headers)
    return doc_text if doc_text.strip() else "No documentation available."

def fetch_github_repositories(query, max_results=1000, per_page=100):
    url = "https://api.github.com/search/repositories"
    headers = {
        "Authorization": f"token {os.getenv('GITHUB_API_KEY')}",
        "Accept": "application/vnd.github.v3+json"
    }
    repositories = []
    num_pages = max_results // per_page
    for page in range(1, num_pages + 1):
        params = {
            "q": query,
            "sort": "stars",
            "order": "desc",
            "per_page": per_page,
            "page": page
        }
        response = requests.get(url, headers=headers, params=params)
        if response.status_code != 200:
            logger.error(f"Error {response.status_code}: {response.json().get('message')}")
            break
        items = response.json().get('items', [])
        if not items:
            break
        for repo in items:
            repo_link = repo['html_url']
            full_name = repo.get('full_name', '')
            doc_content = fetch_repo_documentation(full_name, headers)
            star_count = repo.get('stargazers_count', 0)
            repositories.append({
                "title": repo.get('name', 'No title available'),
                "link": repo_link,
                "combined_doc": doc_content,
                "stars": star_count,
                "full_name": full_name,
                "open_issues_count": repo.get('open_issues_count', 0)
            })
    logger.info(f"Fetched {len(repositories)} repositories from GitHub.")
    return repositories
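
# Note: the GitHub Search API returns at most 1,000 results per query (hence the
# max_results default above) and search requests are rate-limited, so large or
# repeated queries may stop early with a non-200 response.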

# ---------------------------
# Main Lite Workflow Function
# ---------------------------
def run_deepgit_lite(user_query):
    # Stage 0: Query Enhancement using ChatGroq
    logger.info("Enhancing query using ChatGroq...")
    original_query = user_query.strip()
    enhanced_query = enhance_query(original_query)
    logger.info(f"Enhanced Query: {enhanced_query}")
    github_query = enhanced_query + " language:python"
    logger.info(f"Using GitHub query: {github_query}")

    # Stage 1: Dense Retrieval with FAISS
    logger.info("Fetching repositories from GitHub...")
    repos = fetch_github_repositories(github_query)
    if not repos:
        logger.error("No repositories found. Please refine your query.")
        return "\nNo repositories found for your query. Please try a different query."
    docs = [repo.get("combined_doc", "") for repo in repos]
    logger.info(f"Encoding {len(docs)} documents for dense retrieval...")
    sem_model = SentenceTransformer("all-mpnet-base-v2")
    doc_embeddings = sem_model.encode(docs, convert_to_numpy=True, show_progress_bar=True, batch_size=16)
    # Abort if the embeddings array is empty or one-dimensional.
    if doc_embeddings.ndim < 2 or doc_embeddings.shape[0] == 0:
        logger.error("No document embeddings generated. Aborting dense retrieval.")
        return "\nFailed to generate document embeddings. Please try again."

    def normalize_embeddings(embeddings):
        norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
        return embeddings / (norms + 1e-10)

    doc_embeddings = normalize_embeddings(doc_embeddings)
    query_embedding = sem_model.encode(user_query, convert_to_numpy=True)
    query_embedding = normalize_embeddings(np.expand_dims(query_embedding, axis=0))[0]
    # With L2-normalized vectors, inner-product search is equivalent to cosine similarity.
    dim = doc_embeddings.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(doc_embeddings)
    k = min(100, doc_embeddings.shape[0])
    D, I = index.search(np.expand_dims(query_embedding, axis=0), k)
    for idx, score in zip(I[0], D[0]):
        repos[idx]["semantic_similarity"] = score
    ranked_by_semantic = sorted(repos, key=lambda x: x.get("semantic_similarity", 0), reverse=True)
    logger.info(f"Stage 1 complete: {len(ranked_by_semantic)} candidates ranked by semantic similarity.")

    # Stage 2: Filtering Low-Star Repositories
    filtered_candidates = [repo for repo in ranked_by_semantic if repo["stars"] >= 50]
    if not filtered_candidates:
        filtered_candidates = ranked_by_semantic  # fallback if filtering is too strict
    logger.info(f"Stage 2 complete: {len(filtered_candidates)} candidates remain after filtering low-star repositories.")

    # Stage 3: Combine Scores for Final Ranking (Using Semantic Similarity and Stars Only)
    semantic_scores = [repo.get("semantic_similarity", 0) for repo in filtered_candidates]
    star_scores = [math.log(repo.get("stars", 0) + 1) for repo in filtered_candidates]
    min_sem, max_sem = min(semantic_scores), max(semantic_scores)
    min_star, max_star = min(star_scores), max(star_scores)

    def normalize(val, min_val, max_val):
        if max_val - min_val == 0:
            return 0.5
        return (val - min_val) / (max_val - min_val)

    for repo in filtered_candidates:
        norm_sem = normalize(repo.get("semantic_similarity", 0), min_sem, max_sem)
        norm_star = normalize(math.log(repo.get("stars", 0) + 1), min_star, max_star)
        # Weights: 60% semantic, 40% stars. E.g. norm_sem=0.9, norm_star=0.5 gives 0.6*0.9 + 0.4*0.5 = 0.74.
        repo["final_score"] = 0.6 * norm_sem + 0.4 * norm_star
    final_ranked = sorted(filtered_candidates, key=lambda x: x["final_score"], reverse=True)
    logger.info(f"Stage 3 complete: Final ranking computed for {len(final_ranked)} candidates.")

    # Stage 4: Final Justification using ChatGroq
    justifications = {}
    for repo in final_ranked[:10]:
        justification = justify_candidate(repo, user_query)
        justifications[repo['title']] = justification
        logger.info(f"Justification for {repo['title']}: {justification}")

    # Format the final results into a text table.
    result_text = "\n=== Final Ranked Repositories ===\n"
    for rank, repo in enumerate(final_ranked[:10], 1):
        result_text += f"Final Rank: {rank}\n"
        result_text += f"Title: {repo['title']}\n"
        result_text += f"Link: {repo['link']}\n"
        result_text += f"Stars: {repo['stars']}\n"
        result_text += f"Semantic Similarity: {repo.get('semantic_similarity', 0):.4f}\n"
        result_text += f"Final Score: {repo.get('final_score', 0):.4f}\n"
        result_text += f"Justification: {justifications.get(repo['title'], 'No justification available')}\n"
        result_text += f"Combined Doc Snippet: {repo['combined_doc'][:200]}...\n"
        result_text += '-' * 80 + "\n"
    result_text += "\n=== End of Results ==="
    return result_text
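
# ---------------------------
# Example Usage (illustrative)
# ---------------------------
# Minimal sketch of how run_deepgit_lite might be invoked as a script. It assumes
# GITHUB_API_KEY (checked above) and the Groq API key are configured; the sample
# query below is hypothetical.
if __name__ == "__main__":
    sample_query = "chain of thought prompting for large language models"
    print(run_deepgit_lite(sample_query))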