Spaces:

zamal
/

DeepGit-lite

Running on Zero

DeepGit-lite / src /deepgit_lite.py

zamalali

Refine DeepGit Lite description and improve error handling for GitHub API key and document embeddings

1796763 2 months ago

11.3 kB

	import os
	import base64
	import requests
	import numpy as np
	import datetime
	from sentence_transformers import SentenceTransformer
	import faiss
	import math
	import logging
	from dotenv import load_dotenv
	from pathlib import Path
	from langchain_groq import ChatGroq
	from langchain_core.prompts import ChatPromptTemplate

	# ---------------------------
	# Environment and .env Setup
	# ---------------------------
	# The .env file is looked up two directory levels above this source file.
	dotenv_path = Path(__file__).resolve().parents[1] / ".env"
	load_dotenv(dotenv_path=os.fspath(dotenv_path))

	# Fail fast at import time when the GitHub token is absent from the environment.
	if os.environ.get("GITHUB_API_KEY") is None:
	    raise EnvironmentError(
	        "GITHUB_API_KEY not set in environment. Please set it as an environment variable."
	    )

	# ---------------------------
	# Logging Setup
	# ---------------------------
	# Root logger: INFO and above, formatted as "<timestamp> - <level> - <message>".
	logging.basicConfig(
	    format="%(asctime)s - %(levelname)s - %(message)s",
	    level=logging.INFO,
	)
	# Module-level logger used by every function in this file.
	logger = logging.getLogger(__name__)

	# ---------------------------
	# ChatGroq Integration Setup (for query enhancement and final justification)
	# ---------------------------
	# Shared chat-model client used by enhance_query() and justify_candidate().
	# Replies are capped at 100 tokens; timeout/max_retries bound each call
	# (timeout unit is presumably seconds - confirm against the ChatGroq docs).
	llm_groq = ChatGroq(
	    model="llama-3.1-8b-instant",
	    max_tokens=100,
	    temperature=0.2,
	    max_retries=2,
	    timeout=15,
	)

	def enhance_query(original_query):
	    """Ask the Groq chat model to expand ``original_query`` into a richer query.

	    The query is interpolated into a fixed instruction prompt and sent,
	    together with a system message, through ``llm_groq.invoke``.

	    Returns:
	        The reply's ``content`` attribute when the reply has one,
	        otherwise ``str(reply)``.
	    """
	    # NOTE(review): the instruction always steers the expansion toward
	    # Chain of Thought prompting in Python, whatever the query is about.
	    instruction = f"""You are an expert research assistant. Given the query: "{original_query}",
	please enhance and expand it by adding relevant technical keywords, recent research context,
	and details specifically related to the application of Chain of Thought prompting in large language models within a Python environment.
	Provide the refined query text."""
	    system_role = "You are a helpful research assistant specializing in AI and software research."
	    reply = llm_groq.invoke([("system", system_role), ("human", instruction)])
	    # Prefer the reply's text payload; fall back to its string form.
	    return reply.content if hasattr(reply, "content") else str(reply)

	def justify_candidate(candidate, query):
	    """Ask the Groq chat model why ``candidate`` matches ``query``.

	    Args:
	        candidate: Mapping with ``title`` and ``stars`` keys; an optional
	            ``semantic_similarity`` key is formatted to four decimals
	            (0 when missing).
	        query: The query text quoted in the prompt.

	    Returns:
	        The reply's ``content`` attribute when the reply has one,
	        otherwise ``str(reply)``.
	    """
	    request_text = f"""You are a highly knowledgeable AI research assistant. In one to two lines, explain why the repository titled "{candidate['title']}" is a good match for a query on "{query}". Mention key factors such as documentation quality and community validation if relevant.

	Repository Details:
	- Stars: {candidate['stars']}
	- Semantic Similarity: {candidate.get('semantic_similarity', 0):.4f}

	Provide a concise justification:"""
	    system_role = "You are a highly knowledgeable AI research assistant that can succinctly justify repository matches."
	    reply = llm_groq.invoke([("system", system_role), ("human", request_text)])
	    # Prefer the reply's text payload; fall back to its string form.
	    return reply.content if hasattr(reply, "content") else str(reply)

	# ---------------------------
	# GitHub API Helper Functions
	# ---------------------------
	def fetch_readme_content(repo_full_name, headers):
	    """Download and decode a repository's README through the GitHub API.

	    Args:
	        repo_full_name: Repository in ``owner/name`` form, interpolated
	            into the ``/repos/{repo_full_name}/readme`` URL.
	        headers: HTTP headers sent with the request.

	    Returns:
	        The README text, or ``""`` when the request fails, the status is
	        not 200, or the payload cannot be decoded. Fetching is best-effort:
	        this function logs and swallows network/decoding errors so one bad
	        repository cannot abort a crawl.
	    """
	    readme_url = f"https://api.github.com/repos/{repo_full_name}/readme"
	    try:
	        # A timeout keeps a stalled connection from hanging the whole run.
	        response = requests.get(readme_url, headers=headers, timeout=10)
	        if response.status_code != 200:
	            return ""
	        encoded = response.json().get("content") or ""
	        # errors="replace": READMEs are not guaranteed to be valid UTF-8.
	        return base64.b64decode(encoded).decode("utf-8", errors="replace")
	    except (requests.RequestException, ValueError) as e:
	        # ValueError covers both JSON decoding and malformed base64.
	        logger.error(f"Error fetching README for {repo_full_name}: {e}")
	        return ""

	def fetch_file_content(download_url):
	    """Fetch the text behind ``download_url``.

	    Args:
	        download_url: URL to GET; a falsy value (``None`` or ``""``) is
	            treated as "nothing to fetch".

	    Returns:
	        The response body as text on HTTP 200, otherwise ``""``. Request
	        errors are logged and reported as ``""`` rather than raised.
	    """
	    if not download_url:
	        # Nothing to request; avoid a guaranteed invalid-URL error.
	        return ""
	    try:
	        # A timeout keeps a stalled connection from hanging the whole run.
	        response = requests.get(download_url, timeout=10)
	    except requests.RequestException as e:
	        logger.error(f"Error fetching file: {e}")
	        return ""
	    if response.status_code == 200:
	        return response.text
	    return ""

	def fetch_directory_markdown(repo_full_name, path, headers):
	    """Concatenate every ``.md`` file found directly inside a repo directory.

	    Args:
	        repo_full_name: Repository in ``owner/name`` form.
	        path: Directory path appended to ``/repos/{repo}/contents/``.
	        headers: HTTP headers sent with the listing request.

	    Returns:
	        One ``"\\n\\n# <file name>\\n<content>"`` section per markdown file,
	        joined in listing order, or ``""`` when the listing cannot be
	        fetched or is not a directory listing. Sub-directories are not
	        descended into.
	    """
	    url = f"https://api.github.com/repos/{repo_full_name}/contents/{path}"
	    try:
	        # A timeout keeps a stalled connection from hanging the whole run.
	        response = requests.get(url, headers=headers, timeout=10)
	        if response.status_code != 200:
	            return ""
	        items = response.json()
	    except (requests.RequestException, ValueError) as e:
	        logger.error(f"Error listing {path} in {repo_full_name}: {e}")
	        return ""

	    # The contents endpoint returns a list for directories but a single
	    # object for files; only a list can be iterated as entries.
	    if not isinstance(items, list):
	        return ""

	    sections = []
	    for item in items:
	        name = item.get("name", "")
	        if item.get("type") == "file" and name.lower().endswith(".md"):
	            content = fetch_file_content(item.get("download_url")) or ""
	            sections.append(f"\n\n# {name}\n" + content)
	    return "".join(sections)

	def fetch_repo_documentation(repo_full_name, headers):
	    """Collect a repository's README, root-level markdown and docs folders.

	    Args:
	        repo_full_name: Repository in ``owner/name`` form.
	        headers: HTTP headers sent with every GitHub API request.

	    Returns:
	        A single string made of, in order: a ``# README`` section, one
	        section per other root-level ``.md`` file, and one section per
	        root directory named ``docs`` or ``documentation`` (case-insensitive).
	        Returns ``"No documentation available."`` when nothing non-blank
	        was collected.
	    """
	    sections = []

	    # README first. Failures here are logged and skipped so that the rest
	    # of the documentation can still be gathered.
	    try:
	        readme = fetch_readme_content(repo_full_name, headers)
	    except (requests.RequestException, ValueError) as e:
	        logger.error(f"Error fetching README for {repo_full_name}: {e}")
	        readme = ""
	    if readme:
	        sections.append("# README\n" + readme)

	    # Root listing: additional markdown files and documentation directories.
	    root_url = f"https://api.github.com/repos/{repo_full_name}/contents"
	    items = []
	    try:
	        # A timeout keeps a stalled connection from hanging the whole run.
	        response = requests.get(root_url, headers=headers, timeout=10)
	        if response.status_code == 200:
	            payload = response.json()
	            # Only a list is a directory listing; ignore anything else.
	            if isinstance(payload, list):
	                items = payload
	    except (requests.RequestException, ValueError) as e:
	        logger.error(f"Error listing contents of {repo_full_name}: {e}")

	    for item in items:
	        name = item.get("name", "")
	        lowered = name.lower()
	        kind = item.get("type")
	        if kind == "file" and lowered.endswith(".md"):
	            # The README was already added above.
	            if lowered != "readme.md":
	                content = fetch_file_content(item.get("download_url")) or ""
	                sections.append(f"\n\n# {name}\n" + content)
	        elif kind == "dir" and lowered in ("docs", "documentation"):
	            sections.append(
	                f"\n\n# {name} folder\n"
	                + fetch_directory_markdown(repo_full_name, name, headers)
	            )

	    doc_text = "".join(sections)
	    return doc_text if doc_text.strip() else "No documentation available."

	def fetch_github_repositories(query, max_results=1000, per_page=100):
	    """Search GitHub repositories and attach each one's documentation text.

	    Results are requested sorted by stars, descending. Paging stops at the
	    first failed request, the first empty page, or once ``max_results``
	    repositories have been collected.

	    Args:
	        query: Search string passed as the ``q`` parameter.
	        max_results: Upper bound on the number of repositories returned.
	        per_page: Requested page size; clamped to ``1..100`` (the largest
	            page GitHub serves) and to ``max_results``.

	    Returns:
	        A list of dicts with keys ``title``, ``link``, ``combined_doc``,
	        ``stars``, ``full_name`` and ``open_issues_count``.
	    """
	    url = "https://api.github.com/search/repositories"
	    headers = {
	        "Authorization": f"token {os.getenv('GITHUB_API_KEY')}",
	        "Accept": "application/vnd.github.v3+json",
	    }
	    repositories = []

	    if max_results > 0:
	        per_page = max(1, min(per_page, 100, max_results))
	        # Ceiling division, so max_results < per_page still fetches one page.
	        num_pages = -(-max_results // per_page)
	    else:
	        num_pages = 0

	    for page in range(1, num_pages + 1):
	        params = {
	            "q": query,
	            "sort": "stars",
	            "order": "desc",
	            "per_page": per_page,
	            "page": page,
	        }
	        try:
	            # A timeout keeps a stalled connection from hanging the whole run.
	            response = requests.get(url, headers=headers, params=params, timeout=15)
	        except requests.RequestException as e:
	            logger.error(f"Error fetching search page {page}: {e}")
	            break
	        if response.status_code != 200:
	            # Error bodies are not guaranteed to be JSON objects.
	            try:
	                message = response.json().get("message")
	            except (ValueError, AttributeError):
	                message = response.text
	            logger.error(f"Error {response.status_code}: {message}")
	            break
	        items = response.json().get('items', [])
	        if not items:
	            break

	        # Never collect more than the caller asked for.
	        remaining = max_results - len(repositories)
	        for repo in items[:remaining]:
	            full_name = repo.get('full_name', '')
	            # Documentation is best-effort: one unreachable repository must
	            # not discard everything gathered so far.
	            try:
	                doc_content = fetch_repo_documentation(full_name, headers)
	            except (requests.RequestException, ValueError) as e:
	                logger.error(f"Error fetching documentation for {full_name}: {e}")
	                doc_content = "No documentation available."
	            repositories.append({
	                "title": repo.get('name', 'No title available'),
	                "link": repo['html_url'],
	                "combined_doc": doc_content,
	                "stars": repo.get('stargazers_count', 0),
	                "full_name": full_name,
	                "open_issues_count": repo.get('open_issues_count', 0),
	            })
	        if len(repositories) >= max_results:
	            break

	    logger.info(f"Fetched {len(repositories)} repositories from GitHub.")
	    return repositories

	# ---------------------------
	# Main Lite Workflow Function
	# ---------------------------
	def _build_github_query(enhanced_query, qualifier=" language:python", max_chars=256):
	    """Turn free-form LLM output into a search string and append ``qualifier``.

	    Whitespace runs (including newlines) are collapsed to single spaces and
	    the free-text part is cut at a word boundary to ``max_chars`` characters,
	    the documented GitHub search limit, so the request is not rejected.
	    """
	    text = " ".join(enhanced_query.split())
	    if len(text) > max_chars:
	        text = text[:max_chars].rsplit(" ", 1)[0]
	    return text + qualifier


	def _normalize_rows(embeddings):
	    """Scale each row of a 2-D array to unit L2 norm (epsilon avoids 0-division)."""
	    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
	    return embeddings / (norms + 1e-10)


	def _min_max(val, min_val, max_val):
	    """Min-max scale ``val`` into [0, 1]; 0.5 when the range is degenerate."""
	    if max_val - min_val == 0:
	        return 0.5
	    return (val - min_val) / (max_val - min_val)


	def _format_results(ranked, justifications):
	    """Render ``ranked`` repos and their parallel ``justifications`` as text."""
	    parts = ["\n=== Final Ranked Repositories ===\n"]
	    for rank, (repo, justification) in enumerate(zip(ranked, justifications), 1):
	        parts.append(
	            f"Final Rank: {rank}\n"
	            f"Title: {repo['title']}\n"
	            f"Link: {repo['link']}\n"
	            f"Stars: {repo['stars']}\n"
	            f"Semantic Similarity: {repo.get('semantic_similarity', 0):.4f}\n"
	            f"Final Score: {repo.get('final_score', 0):.4f}\n"
	            f"Justification: {justification}\n"
	            f"Combined Doc Snippet: {repo['combined_doc'][:200]}...\n"
	            + '-' * 80 + "\n"
	        )
	    parts.append("\n=== End of Results ===")
	    return "".join(parts)


	def run_deepgit_lite(user_query):
	    """Run the full DeepGit-lite pipeline for ``user_query``.

	    Stages:
	        0. Expand the query with ``enhance_query`` and build a GitHub search
	           string restricted to ``language:python``.
	        1. Fetch repositories, embed their documentation with
	           ``all-mpnet-base-v2`` and score the top 100 by inner product of
	           unit-normalised vectors against the embedded ``user_query``.
	        2. Keep repositories with at least 50 stars (all of them if none
	           qualifies).
	        3. Final score = 0.6 * min-max(semantic) + 0.4 * min-max(log(stars + 1)).
	        4. Ask ``justify_candidate`` for a short justification of the top 10.

	    Args:
	        user_query: The raw query text.

	    Returns:
	        A formatted multi-line report of the top 10 repositories, or a short
	        message string when no repositories or no embeddings are available.
	    """
	    # Stage 0: Query Enhancement using ChatGroq
	    logger.info("Enhancing query using ChatGroq...")
	    original_query = user_query.strip()
	    enhanced_query = enhance_query(original_query)
	    logger.info(f"Enhanced Query: {enhanced_query}")
	    github_query = _build_github_query(enhanced_query)
	    logger.info(f"Using GitHub query: {github_query}")

	    # Stage 1: Dense Retrieval with FAISS
	    logger.info("Fetching repositories from GitHub...")
	    repos = fetch_github_repositories(github_query)
	    if not repos:
	        logger.error("No repositories found. Please refine your query.")
	        return "\nNo repositories found for your query. Please try a different query."

	    docs = [repo.get("combined_doc", "") for repo in repos]
	    logger.info(f"Encoding {len(docs)} documents for dense retrieval...")
	    sem_model = SentenceTransformer("all-mpnet-base-v2")
	    doc_embeddings = sem_model.encode(docs, convert_to_numpy=True, show_progress_bar=True, batch_size=16)

	    # Check if embeddings array is empty or 1-dimensional
	    if doc_embeddings.ndim < 2 or doc_embeddings.shape[0] == 0:
	        logger.error("No document embeddings generated. Aborting dense retrieval.")
	        return "\nFailed to generate document embeddings. Please try again."

	    # FAISS expects contiguous float32 matrices; enforce that explicitly.
	    doc_embeddings = np.ascontiguousarray(_normalize_rows(doc_embeddings), dtype=np.float32)
	    query_vector = sem_model.encode(user_query, convert_to_numpy=True)
	    query_matrix = np.ascontiguousarray(
	        _normalize_rows(np.expand_dims(query_vector, axis=0)), dtype=np.float32
	    )

	    index = faiss.IndexFlatIP(doc_embeddings.shape[1])
	    index.add(doc_embeddings)
	    k = min(100, doc_embeddings.shape[0])
	    scores, indices = index.search(query_matrix, k)
	    for idx, score in zip(indices[0], scores[0]):
	        # FAISS pads missing neighbours with -1, which would otherwise
	        # silently address the last repository.
	        if idx < 0:
	            continue
	        repos[int(idx)]["semantic_similarity"] = float(score)
	    ranked_by_semantic = sorted(repos, key=lambda x: x.get("semantic_similarity", 0), reverse=True)
	    logger.info(f"Stage 1 complete: {len(ranked_by_semantic)} candidates ranked by semantic similarity.")

	    # Stage 2: Filtering Low-Star Repositories
	    filtered_candidates = [repo for repo in ranked_by_semantic if repo["stars"] >= 50]
	    if not filtered_candidates:
	        filtered_candidates = ranked_by_semantic  # fallback if filtering is too strict
	    logger.info(f"Stage 2 complete: {len(filtered_candidates)} candidates remain after filtering low-star repositories.")

	    # Stage 3: Combine Scores for Final Ranking (Using Semantic Similarity and Stars Only)
	    semantic_scores = [repo.get("semantic_similarity", 0) for repo in filtered_candidates]
	    star_scores = [math.log(repo.get("stars", 0) + 1) for repo in filtered_candidates]
	    min_sem, max_sem = min(semantic_scores), max(semantic_scores)
	    min_star, max_star = min(star_scores), max(star_scores)

	    for repo, sem, star in zip(filtered_candidates, semantic_scores, star_scores):
	        # Weights: 60% semantic, 40% stars.
	        repo["final_score"] = (
	            0.6 * _min_max(sem, min_sem, max_sem)
	            + 0.4 * _min_max(star, min_star, max_star)
	        )

	    final_ranked = sorted(filtered_candidates, key=lambda x: x["final_score"], reverse=True)
	    logger.info(f"Stage 3 complete: Final ranking computed for {len(final_ranked)} candidates.")

	    # Stage 4: Final Justification using ChatGroq
	    # Justifications are kept in a list parallel to the top results rather
	    # than keyed by title: different owners can publish same-named repos.
	    top_ranked = final_ranked[:10]
	    justifications = []
	    for repo in top_ranked:
	        try:
	            justification = justify_candidate(repo, user_query)
	        except Exception:
	            # An LLM hiccup must not throw away the finished ranking.
	            logger.exception(f"Justification failed for {repo['title']}")
	            justification = "No justification available"
	        justifications.append(justification)
	        logger.info(f"Justification for {repo['title']}: {justification}")

	    # Format final results into a text table.
	    return _format_results(top_ranked, justifications)