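"""DeepGit Lite workflow.

Converts a user query into GitHub search tags with ChatGroq, fetches matching
repositories and their documentation from the GitHub REST API, ranks them with
dense retrieval (sentence-transformers + FAISS) combined with star counts, and
asks the LLM for a short justification of the top candidates.
"""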
import os
import base64
import requests
import numpy as np
import datetime
from sentence_transformers import SentenceTransformer
import faiss
import math
import logging
from dotenv import load_dotenv
from pathlib import Path
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
import re
import getpass
# ---------------------------
# Environment and .env Setup
# ---------------------------
dotenv_path = Path(__file__).resolve().parents[1] / ".env"
if dotenv_path.exists():
    load_dotenv(dotenv_path=dotenv_path)

if "GITHUB_API_KEY" not in os.environ:
    raise EnvironmentError("GITHUB_API_KEY is not set. Please provide it as an environment variable or in the .env file.")

# Optionally, silence bitsandbytes warnings.
os.environ["BITSANDBYTES_NOWARN"] = "1"
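# Optional sketch (not part of the original flow): langchain_groq's ChatGroq reads
# GROQ_API_KEY from the environment. When running locally you could prompt for a missing
# key with the already-imported getpass, e.g.:
# if "GROQ_API_KEY" not in os.environ:
#     os.environ["GROQ_API_KEY"] = getpass.getpass("Enter your GROQ_API_KEY: ")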
# ---------------------------
# Logging Setup
# ---------------------------
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# ---------------------------
# ChatGroq Integration Setup (for query conversion and final justification)
# ---------------------------
llm_groq = ChatGroq(
    model="deepseek-r1-distill-llama-70b",
    temperature=0.2,
    max_tokens=800,
    timeout=15,
    max_retries=2
)
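# The system prompt below asks the model to wrap its chain-of-thought in <think> ... </think>
# tags; parse_search_tags() strips that span before the tag string is validated.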
# --- Query Conversion Functions ---
prompt = ChatPromptTemplate.from_messages([
    ("system",
     """You are a GitHub search optimization expert.
Your job is to:
1. Read a user's query about tools, research, or tasks.
2. Detect if the query mentions a specific programming language other than Python (for example, JavaScript or JS). If so, record that language as the target language.
3. Think iteratively and generate your internal chain-of-thought enclosed in <think> ... </think> tags.
4. After your internal reasoning, output up to five GitHub-style search tags or library names that maximize repository discovery.
   Use as many tags as necessary based on the query's complexity, but never more than five.
5. If you detected a non-Python target language, append an additional tag at the end in the format target-[language] (e.g., target-javascript).
   If no specific language is mentioned, do not include any target tag.

Output Format:
tag1:tag2[:tag3[:tag4[:tag5[:target-language]]]]

Rules:
- Use lowercase and hyphenated keywords (e.g., image-augmentation, chain-of-thought).
- Use terms commonly found in GitHub repo names, topics, or descriptions.
- Avoid generic terms like "python", "ai", "tool", "project".
- Do NOT use full phrases or vague words like "no-code", "framework", or "approach".
- Prefer real tools, popular methods, or dataset names when mentioned.
- If your output does not strictly match the required format, correct it after your internal reasoning.
- Choose high-signal keywords to ensure the search yields the most relevant GitHub repositories.

Output must be ONLY the search tags separated by colons. Do not include any extra text, bullet points, or explanations.
"""),
    ("human", "{query}")
])

chain = prompt | llm_groq
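# Illustrative usage (hypothetical query and output, not from the original code):
#   chain.invoke({"query": "libraries for augmenting images"}) returns a message whose
#   .content might look like "<think>...</think>\nimage-augmentation:albumentations:imgaug".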
def parse_search_tags(response: str) -> str:
    """
    Removes any internal commentary enclosed in <think> ... </think> tags using regex,
    and returns only the final searchable tags.
    """
    cleaned = re.sub(r'<think>.*?</think>', '', response, flags=re.DOTALL).strip()
    return cleaned


def valid_tags(tags: str) -> bool:
    """
    Validates that the output is one to six colon-separated tokens composed of
    lowercase letters, numbers, and hyphens.
    """
    pattern = r'^[a-z0-9-]+(?::[a-z0-9-]+){0,5}$'
    return re.match(pattern, tags) is not None
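# Illustrative checks against the pattern above (assumed examples, not from the original code):
#   valid_tags("image-augmentation:albumentations")    -> True
#   valid_tags("Image Augmentation, Albumentations")   -> False (uppercase, spaces, comma)
#   valid_tags("a:b:c:d:e:f:g")                        -> False (more than six tokens)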
def iterative_convert_to_search_tags(query: str, max_iterations: int = 2) -> str:
    print(f"\n🧠 [iterative_convert_to_search_tags] Input Query: {query}")
    refined_query = query
    tags_output = ""
    for iteration in range(max_iterations):
        print(f"\n🔄 Iteration {iteration+1}")
        response = chain.invoke({"query": refined_query})
        full_output = response.content.strip()
        tags_output = parse_search_tags(full_output)
        print(f"Output Tags: {tags_output}")
        if valid_tags(tags_output):
            print("✅ Valid tags format detected.")
            return tags_output
        else:
            print("⚠️ Invalid tags format. Requesting refinement...")
            refined_query = f"{query}\nPlease refine your answer so that the output strictly matches the format: tag1:tag2[:tag3[:tag4[:tag5[:target-language]]]]."
    print("Final output (may be invalid):", tags_output)
    # Fall back to default tags if the output is still invalid after all iterations.
    fallback = "data-augmentation:llm-fine-tuning"
    print(f"Using fallback search tags: {fallback}")
    return fallback
# --- Justification Function ---
def justify_candidate(candidate, query):
    prompt = f"""You are a highly knowledgeable AI research assistant. In one to two lines, explain why the repository titled "{candidate['title']}" is a good match for a query on "{query}". Mention key factors such as documentation quality and community validation if relevant.

Repository Details:
- Stars: {candidate['stars']}
- Semantic Similarity: {candidate.get('semantic_similarity', 0):.4f}

Provide a concise justification:"""
    messages = [
        ("system", "You are a highly knowledgeable AI research assistant that can succinctly justify repository matches."),
        ("human", prompt)
    ]
    result = llm_groq.invoke(messages)
    if hasattr(result, "content"):
        return result.content
    return str(result)
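# Note: justify_candidate() only needs a candidate dict with 'title', 'stars', and an
# optional 'semantic_similarity' key; the repository dicts built by
# fetch_github_repositories() below satisfy that shape once Stage 1 attaches the scores.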
# ---------------------------
# GitHub API Helper Functions
# ---------------------------
def fetch_readme_content(repo_full_name, headers):
    readme_url = f"https://api.github.com/repos/{repo_full_name}/readme"
    response = requests.get(readme_url, headers=headers)
    if response.status_code == 200:
        readme_data = response.json()
        return base64.b64decode(readme_data.get('content', '')).decode('utf-8')
    return ""


def fetch_file_content(download_url):
    try:
        response = requests.get(download_url)
        if response.status_code == 200:
            return response.text
    except Exception as e:
        logger.error(f"Error fetching file: {e}")
    return ""


def fetch_directory_markdown(repo_full_name, path, headers):
    md_content = ""
    url = f"https://api.github.com/repos/{repo_full_name}/contents/{path}"
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        items = response.json()
        for item in items:
            if item["type"] == "file" and item["name"].lower().endswith(".md"):
                content = fetch_file_content(item["download_url"])
                md_content += f"\n\n# {item['name']}\n" + content
    return md_content
def fetch_repo_documentation(repo_full_name, headers):
    doc_text = ""
    readme = fetch_readme_content(repo_full_name, headers)
    if readme:
        doc_text += "# README\n" + readme
    root_url = f"https://api.github.com/repos/{repo_full_name}/contents"
    response = requests.get(root_url, headers=headers)
    if response.status_code == 200:
        items = response.json()
        for item in items:
            if item["type"] == "file" and item["name"].lower().endswith(".md"):
                if item["name"].lower() != "readme.md":
                    content = fetch_file_content(item["download_url"])
                    doc_text += f"\n\n# {item['name']}\n" + content
            elif item["type"] == "dir" and item["name"].lower() in ["docs", "documentation"]:
                doc_text += f"\n\n# {item['name']} folder\n" + fetch_directory_markdown(repo_full_name, item["name"], headers)
    return doc_text if doc_text.strip() else "No documentation available."
def fetch_github_repositories(query, max_results=1000, per_page=100):
    url = "https://api.github.com/search/repositories"
    headers = {
        "Authorization": f"token {os.getenv('GITHUB_API_KEY')}",
        "Accept": "application/vnd.github.v3+json"
    }
    repositories = []
    num_pages = max_results // per_page
    for page in range(1, num_pages + 1):
        params = {
            "q": query,
            "sort": "stars",
            "order": "desc",
            "per_page": per_page,
            "page": page
        }
        response = requests.get(url, headers=headers, params=params)
        if response.status_code != 200:
            logger.error(f"Error {response.status_code}: {response.json().get('message')}")
            break
        items = response.json().get('items', [])
        if not items:
            break
        for repo in items:
            repo_link = repo['html_url']
            full_name = repo.get('full_name', '')
            doc_content = fetch_repo_documentation(full_name, headers)
            star_count = repo.get('stargazers_count', 0)
            repositories.append({
                "title": repo.get('name', 'No title available'),
                "link": repo_link,
                "combined_doc": doc_content,
                "stars": star_count,
                "full_name": full_name,
                "open_issues_count": repo.get('open_issues_count', 0)
            })
    logger.info(f"Fetched {len(repositories)} repositories from GitHub.")
    return repositories
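# Note: the GitHub Search API returns at most 1,000 results per query, which matches the
# default max_results above; fetch_repo_documentation() also issues several extra REST
# calls per repository, so large result sets consume the authenticated rate limit quickly.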
# ---------------------------
# Main Lite Workflow Function
# ---------------------------
def run_deepgit_lite(user_query):
    # Stage 0: Query Conversion using iterative_convert_to_search_tags
    logger.info("Converting query to searchable tags...")
    original_query = user_query.strip()
    search_tags = iterative_convert_to_search_tags(original_query)
    logger.info(f"Search Tags: {search_tags}")
    # Convert colon-separated tags into a space-separated query string.
    tag_list = [tag.strip() for tag in search_tags.split(":") if tag.strip()]
    github_query = " ".join(tag_list) + " language:python"
    logger.info(f"Using GitHub query: {github_query}")
    # Stage 1: Dense Retrieval with FAISS - Fetch repositories using the query.
    logger.info("Fetching repositories from GitHub...")
    repos = fetch_github_repositories(github_query)
    if not repos:
        logger.warning("No repositories found with converted query. Falling back to default query.")
        fallback_query = "data augmentation language:python"
        logger.info(f"Using fallback GitHub query: {fallback_query}")
        repos = fetch_github_repositories(fallback_query)
        if not repos:
            logger.error("No repositories found with fallback query either.")
            return "\nNo repositories found for your query. Please try a different query."

    docs = [repo.get("combined_doc", "") for repo in repos]
    logger.info(f"Encoding {len(docs)} documents for dense retrieval...")
    sem_model = SentenceTransformer("all-mpnet-base-v2", device="cpu")
    doc_embeddings = sem_model.encode(docs, convert_to_numpy=True, show_progress_bar=True, batch_size=16)
    if doc_embeddings.ndim < 2 or doc_embeddings.shape[0] == 0:
        logger.error("No document embeddings generated. Aborting dense retrieval.")
        return "\nFailed to generate document embeddings. Please try again."

    def normalize_embeddings(embeddings):
        norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
        return embeddings / (norms + 1e-10)

    doc_embeddings = normalize_embeddings(doc_embeddings)
    query_embedding = sem_model.encode(user_query, convert_to_numpy=True)
    query_embedding = normalize_embeddings(np.expand_dims(query_embedding, axis=0))[0]

    dim = doc_embeddings.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(doc_embeddings)
    k = min(100, doc_embeddings.shape[0])
    D, I = index.search(np.expand_dims(query_embedding, axis=0), k)
    for idx, score in zip(I[0], D[0]):
        repos[idx]["semantic_similarity"] = score
    ranked_by_semantic = sorted(repos, key=lambda x: x.get("semantic_similarity", 0), reverse=True)
    logger.info(f"Stage 1 complete: {len(ranked_by_semantic)} candidates ranked by semantic similarity.")
    # Stage 2: Filtering Low-Star Repositories
    filtered_candidates = [repo for repo in ranked_by_semantic if repo["stars"] >= 50]
    if not filtered_candidates:
        filtered_candidates = ranked_by_semantic
    logger.info(f"Stage 2 complete: {len(filtered_candidates)} candidates remain after filtering low-star repositories.")

    # Stage 3: Combine Scores for Final Ranking (Using Semantic Similarity and Stars Only)
    semantic_scores = [repo.get("semantic_similarity", 0) for repo in filtered_candidates]
    star_scores = [math.log(repo.get("stars", 0) + 1) for repo in filtered_candidates]
    min_sem, max_sem = min(semantic_scores), max(semantic_scores)
    min_star, max_star = min(star_scores), max(star_scores)

    def normalize(val, min_val, max_val):
        if max_val - min_val == 0:
            return 0.5
        return (val - min_val) / (max_val - min_val)

    for repo in filtered_candidates:
        norm_sem = normalize(repo.get("semantic_similarity", 0), min_sem, max_sem)
        norm_star = normalize(math.log(repo.get("stars", 0) + 1), min_star, max_star)
        repo["final_score"] = 0.6 * norm_sem + 0.4 * norm_star

    final_ranked = sorted(filtered_candidates, key=lambda x: x["final_score"], reverse=True)
    logger.info(f"Stage 3 complete: Final ranking computed for {len(final_ranked)} candidates.")
    # Stage 4: Final Justification using ChatGroq
    justifications = {}
    for repo in final_ranked[:10]:
        justification = justify_candidate(repo, user_query)
        justifications[repo['title']] = justification
        logger.info(f"Justification for {repo['title']}: {justification}")

    # Format final results into a text table.
    result_text = "\n=== Final Ranked Repositories ===\n"
    for rank, repo in enumerate(final_ranked[:10], 1):
        result_text += f"Final Rank: {rank}\n"
        result_text += f"Title: {repo['title']}\n"
        result_text += f"Link: {repo['link']}\n"
        result_text += f"Stars: {repo['stars']}\n"
        result_text += f"Semantic Similarity: {repo.get('semantic_similarity', 0):.4f}\n"
        result_text += f"Final Score: {repo.get('final_score', 0):.4f}\n"
        result_text += f"Justification: {justifications.get(repo['title'], 'No justification available')}\n"
        result_text += f"Combined Doc Snippet: {repo['combined_doc'][:200]}...\n"
        result_text += '-' * 80 + "\n"
    result_text += "\n=== End of Results ==="
    return result_text
# For debugging: if run directly, execute with an example query.
if __name__ == "__main__":
    test_query = "I am looking for repositories for data augmentation pipelines for fine-tuning LLMs"
    print(run_deepgit_lite(test_query))