import os
import base64
import math
import logging
from pathlib import Path

import requests
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv
from langchain_groq import ChatGroq

# ---------------------------
# Environment and .env Setup
# ---------------------------
dotenv_path = Path(__file__).resolve().parent.parent / ".env"
load_dotenv(dotenv_path=str(dotenv_path))

if "GITHUB_API_KEY" not in os.environ:
    raise EnvironmentError("GITHUB_API_KEY is not set. Add it to your environment or to the .env file.")

# ---------------------------
# Logging Setup
# ---------------------------
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# ---------------------------
# ChatGroq Integration Setup (for query enhancement and final justification)
# ---------------------------
llm_groq = ChatGroq(
    model="llama-3.1-8b-instant",
    temperature=0.2,
    max_tokens=100,
    timeout=15,
    max_retries=2,
)


def enhance_query(original_query):
    """Expand the user's query with technical keywords via the Groq LLM."""
    prompt = f"""You are an expert research assistant. Given the query: "{original_query}", please enhance and expand it by adding relevant technical keywords, recent research context, and details specifically related to the application of Chain of Thought prompting in large language models within a Python environment. Provide the refined query text."""
    messages = [
        ("system", "You are a helpful research assistant specializing in AI and software research."),
        ("human", prompt),
    ]
    result = llm_groq.invoke(messages)
    # Extract the text content if available.
    if hasattr(result, "content"):
        return result.content
    return str(result)


def justify_candidate(candidate, query):
    """Ask the LLM for a one-to-two-line justification for a candidate repository."""
    prompt = f"""You are a highly knowledgeable AI research assistant. In one to two lines, explain why the repository titled "{candidate['title']}" is a good match for a query on "{query}". Mention key factors such as documentation quality and community validation if relevant.

Repository Details:
- Stars: {candidate['stars']}
- Semantic Similarity: {candidate.get('semantic_similarity', 0):.4f}

Provide a concise justification:"""
    messages = [
        ("system", "You are a highly knowledgeable AI research assistant that can succinctly justify repository matches."),
        ("human", prompt),
    ]
    result = llm_groq.invoke(messages)
    if hasattr(result, "content"):
        return result.content
    return str(result)
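
# Optional robustness sketch (not part of the original flow): ChatGroq.invoke
# can still raise after exhausting max_retries (network errors, timeouts), so
# callers may prefer a soft fallback over crashing the workflow. The helper
# name and fallback behavior are illustrative assumptions; enhance_query and
# justify_candidate could route their invoke calls through it.
def safe_llm_text(messages, fallback=""):
    """Invoke llm_groq and return plain text, or `fallback` on failure."""
    try:
        result = llm_groq.invoke(messages)
        return result.content if hasattr(result, "content") else str(result)
    except Exception as exc:
        logger.warning(f"LLM call failed; using fallback. Error: {exc}")
        return fallback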

# ---------------------------
# GitHub API Helper Functions
# ---------------------------
def fetch_readme_content(repo_full_name, headers):
    """Return the decoded README for a repository, or an empty string."""
    readme_url = f"https://api.github.com/repos/{repo_full_name}/readme"
    response = requests.get(readme_url, headers=headers)
    if response.status_code == 200:
        readme_data = response.json()
        return base64.b64decode(readme_data.get('content', '')).decode('utf-8')
    return ""


def fetch_file_content(download_url):
    """Fetch a raw file; return an empty string on any failure."""
    try:
        response = requests.get(download_url)
        if response.status_code == 200:
            return response.text
    except Exception as e:
        logger.error(f"Error fetching file: {e}")
    return ""


def fetch_directory_markdown(repo_full_name, path, headers):
    """Concatenate all markdown files directly inside a repository directory."""
    md_content = ""
    url = f"https://api.github.com/repos/{repo_full_name}/contents/{path}"
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        items = response.json()
        for item in items:
            if item["type"] == "file" and item["name"].lower().endswith(".md"):
                content = fetch_file_content(item["download_url"])
                md_content += f"\n\n# {item['name']}\n" + content
    return md_content


def fetch_repo_documentation(repo_full_name, headers):
    """Collect the README plus top-level markdown files and docs folders."""
    doc_text = ""
    # Fetch the README first.
    readme = fetch_readme_content(repo_full_name, headers)
    if readme:
        doc_text += "# README\n" + readme
    # Fetch additional markdown files and documentation directories.
    root_url = f"https://api.github.com/repos/{repo_full_name}/contents"
    response = requests.get(root_url, headers=headers)
    if response.status_code == 200:
        items = response.json()
        for item in items:
            if item["type"] == "file" and item["name"].lower().endswith(".md"):
                if item["name"].lower() != "readme.md":
                    content = fetch_file_content(item["download_url"])
                    doc_text += f"\n\n# {item['name']}\n" + content
            elif item["type"] == "dir" and item["name"].lower() in ["docs", "documentation"]:
                doc_text += f"\n\n# {item['name']} folder\n" + fetch_directory_markdown(repo_full_name, item["name"], headers)
    return doc_text if doc_text.strip() else "No documentation available."
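
# Optional rate-limit sketch (an assumed helper, not wired into the functions
# above): fetching documentation issues several API calls per repository, so
# large result sets can exhaust the authenticated quota. GitHub reports the
# remaining quota in the X-RateLimit-Remaining / X-RateLimit-Reset response
# headers; sleeping until the reset time avoids hard 403 failures.
import time

def rate_limited_get(url, headers, params=None):
    """GET with a single retry once the GitHub rate-limit window resets."""
    response = requests.get(url, headers=headers, params=params)
    if response.status_code == 403 and response.headers.get("X-RateLimit-Remaining") == "0":
        reset_at = int(response.headers.get("X-RateLimit-Reset", "0"))
        wait_seconds = max(0, reset_at - int(time.time())) + 1
        logger.warning(f"Rate limited; sleeping {wait_seconds}s until the window resets.")
        time.sleep(wait_seconds)
        response = requests.get(url, headers=headers, params=params)
    return response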

def fetch_github_repositories(query, max_results=1000, per_page=100):
    """Search GitHub repositories and attach their documentation text."""
    url = "https://api.github.com/search/repositories"
    headers = {
        "Authorization": f"token {os.getenv('GITHUB_API_KEY')}",
        "Accept": "application/vnd.github.v3+json"
    }
    repositories = []
    # The GitHub search API returns at most 1,000 results (10 pages of 100).
    num_pages = max_results // per_page
    for page in range(1, num_pages + 1):
        params = {
            "q": query,
            "sort": "stars",
            "order": "desc",
            "per_page": per_page,
            "page": page
        }
        response = requests.get(url, headers=headers, params=params)
        if response.status_code != 200:
            logger.error(f"Error {response.status_code}: {response.json().get('message')}")
            break
        items = response.json().get('items', [])
        if not items:
            break
        for repo in items:
            repo_link = repo['html_url']
            full_name = repo.get('full_name', '')
            doc_content = fetch_repo_documentation(full_name, headers)
            star_count = repo.get('stargazers_count', 0)
            repositories.append({
                "title": repo.get('name', 'No title available'),
                "link": repo_link,
                "combined_doc": doc_content,
                "stars": star_count,
                "full_name": full_name,
                "open_issues_count": repo.get('open_issues_count', 0)
            })
    logger.info(f"Fetched {len(repositories)} repositories from GitHub.")
    return repositories
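
# Optional caching sketch (an assumed helper, not part of the original
# workflow): fetch_repo_documentation is the most expensive step inside the
# search loop, so a simple JSON cache keyed by full_name avoids refetching
# documentation across runs. The cache path and format are illustrative.
import json

DOC_CACHE_PATH = Path(__file__).resolve().parent / ".doc_cache.json"

def cached_repo_documentation(repo_full_name, headers):
    """Return cached documentation for a repo, fetching and storing on a miss."""
    cache = json.loads(DOC_CACHE_PATH.read_text()) if DOC_CACHE_PATH.exists() else {}
    if repo_full_name not in cache:
        cache[repo_full_name] = fetch_repo_documentation(repo_full_name, headers)
        DOC_CACHE_PATH.write_text(json.dumps(cache))
    return cache[repo_full_name]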

# ---------------------------
# Main Lite Workflow Function
# ---------------------------
def run_deepgit_lite(user_query):
    # Stage 0: Query enhancement using ChatGroq.
    logger.info("Enhancing query using ChatGroq...")
    original_query = user_query.strip()
    enhanced_query = enhance_query(original_query)
    logger.info(f"Enhanced Query: {enhanced_query}")
    github_query = enhanced_query + " language:python"
    logger.info(f"Using GitHub query: {github_query}")

    # Stage 1: Dense retrieval with FAISS.
    logger.info("Fetching repositories from GitHub...")
    repos = fetch_github_repositories(github_query)
    if not repos:
        logger.error("No repositories found. Please refine your query.")
        return "\nNo repositories found for your query. Please try a different query."
    docs = [repo.get("combined_doc", "") for repo in repos]
    logger.info(f"Encoding {len(docs)} documents for dense retrieval...")
    sem_model = SentenceTransformer("all-mpnet-base-v2")
    doc_embeddings = sem_model.encode(docs, convert_to_numpy=True, show_progress_bar=True, batch_size=16)

    # Abort if the embeddings array is empty or one-dimensional.
    if doc_embeddings.ndim < 2 or doc_embeddings.shape[0] == 0:
        logger.error("No document embeddings generated. Aborting dense retrieval.")
        return "\nFailed to generate document embeddings. Please try again."

    def normalize_embeddings(embeddings):
        # L2-normalize so that inner product equals cosine similarity.
        norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
        return embeddings / (norms + 1e-10)

    doc_embeddings = normalize_embeddings(doc_embeddings)
    # Note: the query embedding uses the original user query, not the enhanced one.
    query_embedding = sem_model.encode(user_query, convert_to_numpy=True)
    query_embedding = normalize_embeddings(np.expand_dims(query_embedding, axis=0))[0]

    dim = doc_embeddings.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(doc_embeddings)
    k = min(100, doc_embeddings.shape[0])
    D, I = index.search(np.expand_dims(query_embedding, axis=0), k)
    for idx, score in zip(I[0], D[0]):
        repos[idx]["semantic_similarity"] = float(score)
    ranked_by_semantic = sorted(repos, key=lambda x: x.get("semantic_similarity", 0), reverse=True)
    logger.info(f"Stage 1 complete: {len(ranked_by_semantic)} candidates ranked by semantic similarity.")

    # Stage 2: Filter out low-star repositories.
    filtered_candidates = [repo for repo in ranked_by_semantic if repo["stars"] >= 50]
    if not filtered_candidates:
        filtered_candidates = ranked_by_semantic  # fallback if filtering is too strict
    logger.info(f"Stage 2 complete: {len(filtered_candidates)} candidates remain after filtering low-star repositories.")

    # Stage 3: Combine semantic similarity and stars for the final ranking.
    semantic_scores = [repo.get("semantic_similarity", 0) for repo in filtered_candidates]
    star_scores = [math.log(repo.get("stars", 0) + 1) for repo in filtered_candidates]
    min_sem, max_sem = min(semantic_scores), max(semantic_scores)
    min_star, max_star = min(star_scores), max(star_scores)

    def normalize(val, min_val, max_val):
        if max_val - min_val == 0:
            return 0.5
        return (val - min_val) / (max_val - min_val)

    for repo in filtered_candidates:
        norm_sem = normalize(repo.get("semantic_similarity", 0), min_sem, max_sem)
        norm_star = normalize(math.log(repo.get("stars", 0) + 1), min_star, max_star)
        # Weights: 60% semantic, 40% stars.
        repo["final_score"] = 0.6 * norm_sem + 0.4 * norm_star

    final_ranked = sorted(filtered_candidates, key=lambda x: x["final_score"], reverse=True)
    logger.info(f"Stage 3 complete: Final ranking computed for {len(final_ranked)} candidates.")

    # Stage 4: Final justification of the top candidates using ChatGroq.
    justifications = {}
    for repo in final_ranked[:10]:
        justification = justify_candidate(repo, user_query)
        justifications[repo['title']] = justification
        logger.info(f"Justification for {repo['title']}: {justification}")

    # Format the final results into a text table.
    result_text = "\n=== Final Ranked Repositories ===\n"
    for rank, repo in enumerate(final_ranked[:10], 1):
        result_text += f"Final Rank: {rank}\n"
        result_text += f"Title: {repo['title']}\n"
        result_text += f"Link: {repo['link']}\n"
        result_text += f"Stars: {repo['stars']}\n"
        result_text += f"Semantic Similarity: {repo.get('semantic_similarity', 0):.4f}\n"
        result_text += f"Final Score: {repo.get('final_score', 0):.4f}\n"
        result_text += f"Justification: {justifications.get(repo['title'], 'No justification available')}\n"
        result_text += f"Combined Doc Snippet: {repo['combined_doc'][:200]}...\n"
        result_text += '-' * 80 + "\n"
    result_text += "\n=== End of Results ==="
    return result_text
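
# Example entry point: a minimal usage sketch; the query below is illustrative.
if __name__ == "__main__":
    print(run_deepgit_lite("Chain of Thought prompting for large language models"))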