# DeepGit-lite / src /deepgit_lite.py
# zamalali
# Refactor DeepGit Lite to load environment variables, update API integration, and enhance user feedback
# 777083e
# raw
# history blame
# 14.8 kB
import os
import base64
import requests
import numpy as np
import datetime
from sentence_transformers import SentenceTransformer
import faiss
import math
import logging
from dotenv import load_dotenv
from pathlib import Path
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
import re
import getpass
# ---------------------------
# Environment and .env Setup
# ---------------------------
# An optional .env file is looked up one directory above the folder that
# contains this module; it is loaded only when it actually exists.
dotenv_path = Path(__file__).resolve().parents[1] / ".env"
if dotenv_path.exists():
    load_dotenv(dotenv_path=dotenv_path)

# The GitHub token is mandatory: importing this module fails without it.
if os.environ.get("GITHUB_API_KEY") is None:
    raise EnvironmentError("GITHUB_API_KEY not set in environment. Please set it as an environment variable.")

# Optionally, silence bitsandbytes warnings if desired.
os.environ.update(BITSANDBYTES_NOWARN="1")
# ---------------------------
# Logging Setup
# ---------------------------
# Root logging configuration: INFO and above, each record rendered as
# "<timestamp> - <level> - <message>". The module logger is created afterwards.
_LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)
logger = logging.getLogger(__name__)
# ---------------------------
# ChatGroq Integration Setup (for query conversion and final justification)
# ---------------------------
# Shared Groq chat client. It is piped behind the tag-generation prompt and is
# also invoked directly to write the per-repository justification text.
llm_groq = ChatGroq(
    model="deepseek-r1-distill-llama-70b",
    max_tokens=800,
    temperature=0.2,
    max_retries=2,
    timeout=15,
)
# --- Query Conversion Functions ---
# System instructions for the tag-generation model. The text is sent to the
# model as-is, so its wording and line breaks are part of the behaviour.
_TAG_SYSTEM_MESSAGE = """You are a GitHub search optimization expert.
Your job is to:
1. Read a user's query about tools, research, or tasks.
2. Detect if the query mentions a specific programming language other than Python (for example, JavaScript or JS). If so, record that language as the target language.
3. Think iteratively and generate your internal chain-of-thought enclosed in <think> ... </think> tags.
4. After your internal reasoning, output up to five GitHub-style search tags or library names that maximize repository discovery.
Use as many tags as necessary based on the query's complexity, but never more than five.
5. If you detected a non-Python target language, append an additional tag at the end in the format target-[language] (e.g., target-javascript).
If no specific language is mentioned, do not include any target tag.
Output Format:
tag1:tag2[:tag3[:tag4[:tag5[:target-language]]]]
Rules:
- Use lowercase and hyphenated keywords (e.g., image-augmentation, chain-of-thought).
- Use terms commonly found in GitHub repo names, topics, or descriptions.
- Avoid generic terms like "python", "ai", "tool", "project".
- Do NOT use full phrases or vague words like "no-code", "framework", or "approach".
- Prefer real tools, popular methods, or dataset names when mentioned.
- If your output does not strictly match the required format, correct it after your internal reasoning.
- Choose high-signal keywords to ensure the search yields the most relevant GitHub repositories.
Output must be ONLY the search tags separated by colons. Do not include any extra text, bullet points, or explanations.
"""

# Two-message chat template: the fixed system instructions followed by the
# user's text, which fills the {query} placeholder.
prompt = ChatPromptTemplate.from_messages([
    ("system", _TAG_SYSTEM_MESSAGE),
    ("human", "{query}"),
])

# Template piped into the model; callers invoke it with {"query": ...} and
# read the reply's ``content``.
chain = prompt | llm_groq
# Non-greedy so that each <think>...</think> section is removed on its own;
# DOTALL lets a section span several lines.
_THINK_SECTION = re.compile(r'<think>.*?</think>', flags=re.DOTALL)


def parse_search_tags(response: str) -> str:
    """Strip the model's reasoning from *response* and return what remains.

    Every ``<think> ... </think>`` section is deleted, then leading and
    trailing whitespace is trimmed. Text without such sections is returned
    unchanged apart from the trimming.
    """
    without_reasoning = _THINK_SECTION.sub('', response)
    return without_reasoning.strip()
def valid_tags(tags: str) -> bool:
    """Return True when *tags* is a well-formed colon-separated tag string.

    A well-formed string holds one to six tokens separated by single colons;
    each token is a non-empty run of lowercase ASCII letters, digits and
    hyphens.

    The whole string has to match. ``re.fullmatch`` is used rather than
    ``re.match`` with a ``$`` anchor because ``$`` also matches just before a
    trailing newline, which would accept a tag string that ends in a line
    break.
    """
    pattern = r'[a-z0-9-]+(?::[a-z0-9-]+){0,5}'
    return re.fullmatch(pattern, tags) is not None
def iterative_convert_to_search_tags(query: str, max_iterations: int = 2) -> str:
    """Convert a free-text query into colon-separated search tags via the LLM chain.

    The chain is invoked up to ``max_iterations`` times. The first reply whose
    cleaned text passes ``valid_tags`` is returned. After an invalid reply the
    next attempt sends the original query followed by a reminder of the
    required output format. When no attempt yields valid tags, a fixed
    fallback tag string is returned. Progress is reported with ``print``.
    """
    print(f"\n🧠 [iterative_convert_to_search_tags] Input Query: {query}")
    prompt_text = query
    tags_output = ""
    for attempt in range(1, max_iterations + 1):
        print(f"\n🔄 Iteration {attempt}")
        reply = chain.invoke({"query": prompt_text})
        tags_output = parse_search_tags(reply.content.strip())
        print(f"Output Tags: {tags_output}")
        if valid_tags(tags_output):
            print("✅ Valid tags format detected.")
            return tags_output
        print("⚠️ Invalid tags format. Requesting refinement...")
        # The retry always starts from the original query, not from the
        # previous retry text, so reminders do not pile up.
        prompt_text = f"{query}\nPlease refine your answer so that the output strictly matches the format: tag1:tag2[:tag3[:tag4[:tag5[:target-language]]]]."
    print("Final output (may be invalid):", tags_output)
    # Fallback default tags if output is still invalid
    fallback = "data-augmentation:llm-fine-tuning"
    print(f"Using fallback search tags: {fallback}")
    return fallback
# --- Justification Function ---
def justify_candidate(candidate, query):
    """Ask the Groq model for a short justification of a repository match.

    ``candidate`` is a mapping that must provide ``title`` and ``stars``;
    ``semantic_similarity`` is optional and is shown as 0.0000 when absent.
    ``query`` is inserted into the request text as the topic being matched.

    Returns the reply's ``content`` attribute when the reply has one,
    otherwise the reply converted with ``str()``.
    """
    request_text = f"""You are a highly knowledgeable AI research assistant. In one to two lines, explain why the repository titled "{candidate['title']}" is a good match for a query on "{query}". Mention key factors such as documentation quality and community validation if relevant.
Repository Details:
- Stars: {candidate['stars']}
- Semantic Similarity: {candidate.get('semantic_similarity', 0):.4f}
Provide a concise justification:"""
    conversation = [
        ("system", "You are a highly knowledgeable AI research assistant that can succinctly justify repository matches."),
        ("human", request_text),
    ]
    reply = llm_groq.invoke(conversation)
    if not hasattr(reply, "content"):
        return str(reply)
    return reply.content
# ---------------------------
# GitHub API Helper Functions
# ---------------------------
def fetch_readme_content(repo_full_name, headers, timeout=30):
    """Return the README text of a GitHub repository, or "" when unavailable.

    Args:
        repo_full_name: Path segment placed after ``/repos/`` in the API URL
            (the search code passes each result's ``full_name``).
        headers: HTTP headers sent with the request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded README. An empty string is returned when the request
        fails, the status is not 200, or the payload cannot be decoded, so a
        single unreadable README does not abort the caller's loop.
    """
    readme_url = f"https://api.github.com/repos/{repo_full_name}/readme"
    try:
        response = requests.get(readme_url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        logger.error(f"Error fetching README for {repo_full_name}: {e}")
        return ""
    if response.status_code != 200:
        return ""
    try:
        # ``or ''`` covers a payload whose "content" field is present but null.
        encoded = response.json().get('content', '') or ''
        raw_bytes = base64.b64decode(encoded)
    except (ValueError, TypeError, AttributeError) as e:
        # Invalid JSON, a non-object payload, or malformed base64 data.
        logger.error(f"Error decoding README for {repo_full_name}: {e}")
        return ""
    # Undecodable bytes are replaced rather than raising UnicodeDecodeError.
    return raw_bytes.decode('utf-8', errors='replace')
def fetch_file_content(download_url, timeout=30):
    """Download a file and return its text, or "" on any failure.

    Args:
        download_url: URL to fetch. A missing or empty value yields "" without
            issuing a request.
        timeout: Seconds to wait for the server before giving up, so one
            unresponsive host cannot stall the whole run.

    Returns:
        The response body as text when the status is 200, otherwise "".
        Request errors are logged, not raised.
    """
    if not download_url:
        return ""
    try:
        response = requests.get(download_url, timeout=timeout)
    except requests.RequestException as e:
        logger.error(f"Error fetching file: {e}")
        return ""
    if response.status_code == 200:
        return response.text
    return ""
def fetch_directory_markdown(repo_full_name, path, headers):
    """Concatenate the Markdown files found directly inside a repository folder.

    The folder listing is requested from the GitHub contents API. Each entry
    of type "file" whose name ends in ".md" (case-insensitive) is downloaded
    and appended under a "# <name>" heading. Sub-folders are not descended
    into. Returns "" when the listing request does not answer with status 200
    or when no Markdown file is present.
    """
    listing_url = f"https://api.github.com/repos/{repo_full_name}/contents/{path}"
    listing = requests.get(listing_url, headers=headers)
    if listing.status_code != 200:
        return ""
    sections = []
    for entry in listing.json():
        is_markdown_file = entry["type"] == "file" and entry["name"].lower().endswith(".md")
        if not is_markdown_file:
            continue
        body = fetch_file_content(entry["download_url"])
        sections.append(f"\n\n# {entry['name']}\n" + body)
    return "".join(sections)
def fetch_repo_documentation(repo_full_name, headers):
    """Collect a repository's documentation into a single text blob.

    The blob is made of, in order: the README (under "# README"), then, for
    each entry of the repository root listing, either a Markdown file other
    than readme.md (under "# <name>") or the Markdown content of a folder
    named "docs" or "documentation" (under "# <name> folder"). Name checks are
    case-insensitive. When nothing but whitespace was gathered, the string
    "No documentation available." is returned instead.
    """
    parts = []
    readme_text = fetch_readme_content(repo_full_name, headers)
    if readme_text:
        parts.append("# README\n" + readme_text)

    listing = requests.get(f"https://api.github.com/repos/{repo_full_name}/contents", headers=headers)
    if listing.status_code == 200:
        for entry in listing.json():
            if entry["type"] == "file":
                lowered = entry["name"].lower()
                # The README was already added above, so it is skipped here.
                if lowered.endswith(".md") and lowered != "readme.md":
                    body = fetch_file_content(entry["download_url"])
                    parts.append(f"\n\n# {entry['name']}\n" + body)
            elif entry["type"] == "dir":
                if entry["name"].lower() in ("docs", "documentation"):
                    folder_text = fetch_directory_markdown(repo_full_name, entry["name"], headers)
                    parts.append(f"\n\n# {entry['name']} folder\n" + folder_text)

    doc_text = "".join(parts)
    if not doc_text.strip():
        return "No documentation available."
    return doc_text
def fetch_github_repositories(query, max_results=1000, per_page=100):
    """Search GitHub repositories and attach each result's documentation text.

    Args:
        query: Search string passed as the ``q`` parameter.
        max_results: Upper bound on the number of repositories returned.
        per_page: Page size for the search requests; clamped to 1..100
            because the GitHub API does not serve larger pages.

    Returns:
        A list of dicts with the keys ``title``, ``link``, ``combined_doc``,
        ``stars``, ``full_name`` and ``open_issues_count``, in the order the
        API returned them (requests ask for stars, descending). The list may
        be shorter than ``max_results``, or empty, when the API returns an
        error, no more items, or the request itself fails.
    """
    url = "https://api.github.com/search/repositories"
    headers = {
        "Authorization": f"token {os.getenv('GITHUB_API_KEY')}",
        "Accept": "application/vnd.github.v3+json"
    }
    repositories = []
    per_page = max(1, min(per_page, 100))
    # Ceiling division: floor division would request zero pages whenever
    # max_results < per_page and would drop the final partial page otherwise.
    num_pages = -(-max_results // per_page)
    for page in range(1, num_pages + 1):
        params = {
            "q": query,
            "sort": "stars",
            "order": "desc",
            "per_page": per_page,
            "page": page
        }
        try:
            response = requests.get(url, headers=headers, params=params, timeout=30)
        except requests.RequestException as e:
            logger.error(f"Error querying GitHub search: {e}")
            break
        if response.status_code != 200:
            # Error bodies are not guaranteed to be JSON.
            try:
                message = response.json().get('message')
            except (ValueError, AttributeError):
                message = response.text
            logger.error(f"Error {response.status_code}: {message}")
            break
        items = response.json().get('items', [])
        if not items:
            break
        # The page size stays constant so page offsets remain aligned; the
        # last page is trimmed here instead.
        remaining = max_results - len(repositories)
        for repo in items[:remaining]:
            repo_link = repo['html_url']
            full_name = repo.get('full_name', '')
            doc_content = fetch_repo_documentation(full_name, headers)
            star_count = repo.get('stargazers_count', 0)
            repositories.append({
                "title": repo.get('name', 'No title available'),
                "link": repo_link,
                "combined_doc": doc_content,
                "stars": star_count,
                "full_name": full_name,
                "open_issues_count": repo.get('open_issues_count', 0)
            })
        if len(repositories) >= max_results:
            break
    logger.info(f"Fetched {len(repositories)} repositories from GitHub.")
    return repositories
# ---------------------------
# Main Lite Workflow Function
# ---------------------------
def _build_github_query(search_tags):
    """Turn colon-separated tags into a GitHub search string with a language qualifier."""
    tag_list = [tag.strip() for tag in search_tags.split(":") if tag.strip()]
    language = "python"
    # The tag-generation prompt asks the model to append "target-<language>"
    # as the last tag for non-Python requests. That marker selects the
    # language qualifier and must not be searched as a keyword.
    if tag_list and tag_list[-1].startswith("target-"):
        requested = tag_list.pop()[len("target-"):]
        if requested:
            language = requested
    return " ".join(tag_list + [f"language:{language}"])


def run_deepgit_lite(user_query):
    """Run the DeepGit-lite pipeline and return a plain-text ranking report.

    Stages:
        0. Convert ``user_query`` into search tags and build the GitHub query
           (Python unless the tags end with a ``target-<language>`` marker).
        1. Fetch repositories, embed their documentation, and score them
           against the query with an inner-product FAISS index over
           L2-normalised embeddings.
        2. Keep repositories with at least 50 stars (all of them when none
           qualifies).
        3. Rank by 0.6 * min-max-scaled similarity + 0.4 * min-max-scaled
           log(stars + 1).
        4. Ask the LLM for a justification of each of the top 10.

    Args:
        user_query: Free-text description of what the user is looking for.

    Returns:
        The formatted report for up to 10 repositories, or a short message
        when no repository was found or no embeddings could be produced.
    """
    # Stage 0: Query Conversion using iterative_convert_to_search_tags
    logger.info("Converting query to searchable tags...")
    original_query = user_query.strip()
    search_tags = iterative_convert_to_search_tags(original_query)
    logger.info(f"Search Tags: {search_tags}")
    github_query = _build_github_query(search_tags)
    logger.info(f"Using GitHub query: {github_query}")

    # Stage 1: Dense Retrieval with FAISS - Fetch repositories using the query.
    logger.info("Fetching repositories from GitHub...")
    repos = fetch_github_repositories(github_query)
    if not repos:
        logger.warning("No repositories found with converted query. Falling back to default query.")
        fallback_query = "data augmentation language:python"
        logger.info(f"Using fallback GitHub query: {fallback_query}")
        repos = fetch_github_repositories(fallback_query)
        if not repos:
            logger.error("No repositories found with fallback query either.")
            return "\nNo repositories found for your query. Please try a different query."

    docs = [repo.get("combined_doc", "") for repo in repos]
    logger.info(f"Encoding {len(docs)} documents for dense retrieval...")
    sem_model = SentenceTransformer("all-mpnet-base-v2", device="cpu")
    doc_embeddings = sem_model.encode(docs, convert_to_numpy=True, show_progress_bar=True, batch_size=16)
    if doc_embeddings.ndim < 2 or doc_embeddings.shape[0] == 0:
        logger.error("No document embeddings generated. Aborting dense retrieval.")
        return "\nFailed to generate document embeddings. Please try again."

    def normalize_embeddings(embeddings):
        # Row-wise L2 normalisation; the epsilon avoids division by zero.
        norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
        return embeddings / (norms + 1e-10)

    doc_embeddings = normalize_embeddings(doc_embeddings)
    query_embedding = sem_model.encode(user_query, convert_to_numpy=True)
    query_embedding = normalize_embeddings(np.expand_dims(query_embedding, axis=0))[0]

    dim = doc_embeddings.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(doc_embeddings)
    # Only the best k documents receive a similarity; the rest default to 0.
    k = min(100, doc_embeddings.shape[0])
    D, I = index.search(np.expand_dims(query_embedding, axis=0), k)
    for idx, score in zip(I[0], D[0]):
        repos[idx]["semantic_similarity"] = score
    ranked_by_semantic = sorted(repos, key=lambda x: x.get("semantic_similarity", 0), reverse=True)
    logger.info(f"Stage 1 complete: {len(ranked_by_semantic)} candidates ranked by semantic similarity.")

    # Stage 2: Filtering Low-Star Repositories
    filtered_candidates = [repo for repo in ranked_by_semantic if repo["stars"] >= 50]
    if not filtered_candidates:
        filtered_candidates = ranked_by_semantic
    logger.info(f"Stage 2 complete: {len(filtered_candidates)} candidates remain after filtering low-star repositories.")

    # Stage 3: Combine Scores for Final Ranking (Using Semantic Similarity and Stars Only)
    semantic_scores = [repo.get("semantic_similarity", 0) for repo in filtered_candidates]
    star_scores = [math.log(repo.get("stars", 0) + 1) for repo in filtered_candidates]
    min_sem, max_sem = min(semantic_scores), max(semantic_scores)
    min_star, max_star = min(star_scores), max(star_scores)

    def normalize(val, min_val, max_val):
        # All values equal: use the midpoint instead of dividing by zero.
        if max_val - min_val == 0:
            return 0.5
        return (val - min_val) / (max_val - min_val)

    for repo, sem_score, star_score in zip(filtered_candidates, semantic_scores, star_scores):
        norm_sem = normalize(sem_score, min_sem, max_sem)
        norm_star = normalize(star_score, min_star, max_star)
        repo["final_score"] = 0.6 * norm_sem + 0.4 * norm_star
    final_ranked = sorted(filtered_candidates, key=lambda x: x["final_score"], reverse=True)
    logger.info(f"Stage 3 complete: Final ranking computed for {len(final_ranked)} candidates.")

    # Stage 4: Final Justification using ChatGroq
    # Justifications are kept in a list parallel to the top candidates: repo
    # names are not unique across owners, so a dict keyed by title could show
    # one repository's justification under another.
    top_candidates = final_ranked[:10]
    justifications = []
    for repo in top_candidates:
        justification = justify_candidate(repo, user_query)
        justifications.append(justification)
        logger.info(f"Justification for {repo['title']}: {justification}")

    # Format final results into a text table.
    lines = ["\n=== Final Ranked Repositories ===\n"]
    for rank, (repo, justification) in enumerate(zip(top_candidates, justifications), 1):
        lines.append(f"Final Rank: {rank}\n")
        lines.append(f"Title: {repo['title']}\n")
        lines.append(f"Link: {repo['link']}\n")
        lines.append(f"Stars: {repo['stars']}\n")
        lines.append(f"Semantic Similarity: {repo.get('semantic_similarity', 0):.4f}\n")
        lines.append(f"Final Score: {repo.get('final_score', 0):.4f}\n")
        lines.append(f"Justification: {justification}\n")
        lines.append(f"Combined Doc Snippet: {repo['combined_doc'][:200]}...\n")
        lines.append('-' * 80 + "\n")
    lines.append("\n=== End of Results ===")
    return "".join(lines)
# For debugging: running this file as a script executes the pipeline once on
# an example query and prints the resulting report.
if __name__ == "__main__":
    sample_query = "I am looking for repositories for data augmentation pipelines for fine-tuning LLMs"
    report = run_deepgit_lite(sample_query)
    print(report)