Spaces:

zamal
/

DeepGit-lite

Running on Zero

App Files Files Community

zamalali committed on Mar 31

Commit

9494afe

1 Parent(s): 9e1349d

Add DeepGit Lite application and workflow files

Browse files

Files changed (6) hide show

app.py +239 -0
requirements.txt +9 -0
src/__init__.py +0 -0
src/__pycache__/__init__.cpython-311.pyc +0 -0
src/__pycache__/deepgit_lite.cpython-311.pyc +0 -0
src/deepgit_lite.py +245 -0

app.py ADDED Viewed

	@@ -0,0 +1,239 @@

+import gradio as gr
+import os
+import time
+import threading
+import logging
+from src.deepgit_lite import run_deepgit_lite
+# ---------------------------
+# Global Logging Buffer Setup
+# ---------------------------
+# In-memory list of formatted log lines. It is appended to and cleared only
+# while LOG_BUFFER_LOCK is held.
+LOG_BUFFER = []
+LOG_BUFFER_LOCK = threading.Lock()
+class BufferLogHandler(logging.Handler):
+    """Logging handler that appends every formatted record to LOG_BUFFER."""
+    def emit(self, record):
+        # Format with this handler's formatter, then append under the lock.
+        log_entry = self.format(record)
+        with LOG_BUFFER_LOCK:
+            LOG_BUFFER.append(log_entry)
+root_logger = logging.getLogger()
+# Attach a single BufferLogHandler to the root logger; the isinstance check
+# avoids adding a second one if this module-level code runs again.
+if not any(isinstance(h, BufferLogHandler) for h in root_logger.handlers):
+    handler = BufferLogHandler()
+    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
+    handler.setFormatter(formatter)
+    root_logger.addHandler(handler)
+def filter_logs(logs):
+    filtered = []
+    last_was_fetching = False
+    for log in logs:
+        if "HTTP Request:" in log:
+            if not last_was_fetching:
+                filtered.append("Fetching repositories...")
+                last_was_fetching = True
+        else:
+            filtered.append(log)
+            last_was_fetching = False
+    return filtered
+# ---------------------------
+# Title, Favicon & Description
+# ---------------------------
+# Favicon link and page title markup, rendered through gr.HTML in the UI.
+# NOTE(review): this is a <head> fragment injected as ordinary HTML content;
+# whether the favicon/title actually take effect there is unverified, and the
+# referenced assets/deepgit.ico file is not visible here - confirm.
+favicon_html = """
+<head>
+<link rel="icon" type="image/x-icon" href="file/assets/deepgit.ico">
+<title>DeepGit Lite Research Agent</title>
+</head>
+"""
+# Centered page header: icon, application name and one-line tagline.
+title = """
+<div style="text-align: center; margin-top: 20px;">
+  <h1 style="font-size: 36px; display: inline-flex; align-items: center; gap: 16px;">
+    <img src="https://img.icons8.com/?size=100&id=118557&format=png&color=000000" width="64" />
+    <span>DeepGit Lite</span>
+  </h1>
+  <p style="font-size: 18px; color: #555; margin-top: 10px;">
+    ⚙️ A lightweight GitHub research agent for deep semantic search and ranking.
+  </p>
+</div>
+"""
+# Short description paragraph shown under the header.
+description = """<p align="center">
+DeepGit Lite is a streamlined version of DeepGit designed to perform advanced semantic research on GitHub repositories with faster response times. It uses query enhancement, dense retrieval via FAISS, activity analysis, and a final multi-factor ranking (combining semantic similarity, activity, and popularity) to deliver the best results.
+</p>"""
+# Consent notice shown next to the "I Agree" button before the main UI is revealed.
+consent_text = """
+<div style="padding: 10px; text-align: center;">
+  <p>
+    By using DeepGit Lite, you consent to temporary processing of your query for semantic search and ranking purposes.
+  </p>
+  <p>
+    ⭐ Star us on GitHub if you find this tool useful!<br/>
+    <a href="https://github.com/zamalali/DeepGit" target="_blank">GitHub</a>
+  </p>
+</div>
+"""
+# Attribution footer rendered at the bottom of the page.
+footer = """
+<div style="text-align: center; margin-top: 40px; font-size: 13px; color: #888;">
+    Made with <span style="color: crimson;">❤️</span> by <b>Zamal</b>
+</div>
+"""
+# ---------------------------
+# HTML Table Renderer for DeepGit Lite
+# ---------------------------
+def format_percent(value):
+    try:
+        return f"{float(value) * 100:.1f}%"
+    except:
+        return value
+def parse_result_to_html(raw_result: str) -> str:
+    entries = raw_result.strip().split("Final Rank:")
+    html = """
+    <style>
+        table {
+            width: 100%;
+            border-collapse: collapse;
+            margin: 1em 0;
+            font-size: 14px;
+        }
+        th, td {
+            padding: 12px 15px;
+            border: 1px solid #ddd;
+            text-align: left;
+            vertical-align: top;
+        }
+        th {
+            background-color: #f4f4f4;
+        }
+        tr:hover { background-color: #f9f9f9; }
+    </style>
+    <table>
+        <thead>
+            <tr>
+                <th>Rank</th>
+                <th>Title</th>
+                <th>Link</th>
+                <th>Semantic Similarity</th>
+                <th>Activity Score</th>
+                <th>Final Score</th>
+            </tr>
+        </thead>
+        <tbody>
+    """
+    for entry in entries[1:]:
+        lines = entry.strip().split("\n")
+        data = {}
+        data["Final Rank"] = lines[0].strip()
+        for line in lines[1:]:
+            if ": " in line:
+                key, val = line.split(": ", 1)
+                data[key.strip()] = val.strip()
+        html += f"""
+            <tr>
+                <td>{data.get('Final Rank', '')}</td>
+                <td>{data.get('Title', '')}</td>
+                <td><a href="{data.get('Link', '#')}" target="_blank">GitHub</a></td>
+                <td>{format_percent(data.get('Semantic Similarity', ''))}</td>
+                <td>{float(data.get('Activity Score', 0)):.2f}</td>
+                <td>{format_percent(data.get('Final Score', ''))}</td>
+            </tr>
+        """
+    html += "</tbody></table>"
+    return html
+# ---------------------------
+# Background Workflow Runner for DeepGit Lite
+# ---------------------------
+def run_lite_workflow(topic, result_container):
+    result = run_deepgit_lite(topic)
+    result_container["raw_result"] = result
+def stream_lite_workflow(topic):
+    with LOG_BUFFER_LOCK:
+        LOG_BUFFER.clear()
+    result_container = {}
+    workflow_thread = threading.Thread(target=run_lite_workflow, args=(topic, result_container))
+    workflow_thread.start()
+    last_index = 0
+    while workflow_thread.is_alive() or (last_index < len(LOG_BUFFER)):
+        with LOG_BUFFER_LOCK:
+            new_logs = LOG_BUFFER[last_index:]
+            last_index = len(LOG_BUFFER)
+        if new_logs:
+            filtered_logs = filter_logs(new_logs)
+            status_msg = filtered_logs[-1]
+            detail_msg = "<br/>".join(filtered_logs)
+            yield status_msg, detail_msg
+        time.sleep(0.5)
+    workflow_thread.join()
+    with LOG_BUFFER_LOCK:
+        final_logs = LOG_BUFFER[:]
+    filtered_final = filter_logs(final_logs)
+    raw_result = result_container.get("raw_result", "No results returned.")
+    html_result = parse_result_to_html(raw_result)
+    yield "", html_result
+# ---------------------------
+# App UI Setup for DeepGit Lite
+# ---------------------------
+# Build the Gradio interface: static header blocks, a consent gate, and the
+# main research form that is hidden until the user agrees.
+with gr.Blocks(
+    theme="gstaff/sketch",
+    css="""
+        #main_container { margin: auto; max-width: 900px; }
+        footer, footer * { display: none !important; }
+    """
+) as demo:
+    gr.HTML(favicon_html)
+    gr.HTML(title)
+    gr.HTML(description)
+    # Consent gate, visible on load.
+    with gr.Column(elem_id="user_consent_container") as consent_block:
+        gr.HTML(consent_text)
+        agree_button = gr.Button("I Agree", variant="primary")
+    # Main form, created hidden and revealed by enable_main.
+    with gr.Column(elem_id="main_container", visible=False) as main_block:
+        research_input = gr.Textbox(
+            label="Research Topic",
+            placeholder="Enter your research topic here, e.g., 'Instruction-based fine-tuning for LLaMA 2 using chain-of-thought prompting in Python.'",
+            lines=3
+        )
+        run_button = gr.Button("Run DeepGit Lite", variant="primary")
+        status_display = gr.Markdown("")
+        detail_display = gr.HTML("")
+        # NOTE(review): output_html and state are created but not wired to any
+        # event below - confirm whether they are still needed.
+        output_html = gr.HTML("")
+        state = gr.State([])
+    def enable_main():
+        """Hide the consent block and show the main block."""
+        return gr.update(visible=False), gr.update(visible=True)
+    agree_button.click(fn=enable_main, inputs=[], outputs=[consent_block, main_block], queue=False)
+    def lite_runner(topic):
+        """Re-yield every (status, details) pair from stream_lite_workflow."""
+        for status, details in stream_lite_workflow(topic):
+            yield status, details
+    # The button click and pressing Enter in the textbox both start the same
+    # streaming runner and write to the same two output components.
+    run_button.click(
+        fn=lite_runner,
+        inputs=[research_input],
+        outputs=[status_display, detail_display],
+        api_name="deepgit_lite",
+        show_progress=True
+    )
+    research_input.submit(
+        fn=lite_runner,
+        inputs=[research_input],
+        outputs=[status_display, detail_display],
+        api_name="deepgit_lite_submit",
+        show_progress=True
+    )
+    gr.HTML(footer)
+# Enable the request queue (max_size=10) and start the app at import time.
+demo.queue(max_size=10).launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+requests==2.32.3
+numpy==1.25.2
+python-dotenv==1.0.1
+sentence-transformers==3.4.1
+faiss-cpu==1.9.0.post1
+gradio==5.23.1
+langgraph==0.2.62
+langchain_groq==0.2.4
+langchain_core==0.3.47

src/__init__.py ADDED Viewed

File without changes

src/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (215 Bytes). View file

src/__pycache__/deepgit_lite.cpython-311.pyc ADDED Viewed

Binary file (15.9 kB). View file

src/deepgit_lite.py ADDED Viewed

	@@ -0,0 +1,245 @@

+import os
+import base64
+import requests
+import numpy as np
+import datetime
+from sentence_transformers import SentenceTransformer
+import faiss
+import getpass
+import math
+import logging
+from dotenv import load_dotenv
+from pathlib import Path
+from langchain_groq import ChatGroq
+from langchain_core.prompts import ChatPromptTemplate
+# ---------------------------
+# Environment and .env Setup
+# ---------------------------
+# Load variables from a .env file located one directory above this module's
+# directory.
+dotenv_path = Path(__file__).resolve().parent.parent / ".env"
+load_dotenv(dotenv_path=str(dotenv_path))
+# If GITHUB_API_KEY is still unset, prompt for it interactively. This runs at
+# import time.
+# NOTE(review): a getpass prompt presumably cannot be answered in a hosted,
+# non-interactive deployment - confirm the key is always provided via env.
+if "GITHUB_API_KEY" not in os.environ:
+    os.environ["GITHUB_API_KEY"] = getpass.getpass("Enter your GitHub API key: ")
+# ---------------------------
+# Logging Setup
+# ---------------------------
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+logger = logging.getLogger(__name__)
+# ---------------------------
+# ChatGroq Integration Setup (for query enhancement and final justification)
+# ---------------------------
+# Shared chat model instance used by enhance_query and justify_candidate.
+llm_groq = ChatGroq(
+    model="llama-3.1-8b-instant",
+    temperature=0.2,
+    max_tokens=100,
+    timeout=15,
+    max_retries=2
+)
+def enhance_query(original_query):
+    prompt = f"""You are an expert research assistant. Given the query: "{original_query}",
+please enhance and expand it by adding relevant technical keywords, recent research context,
+and details specifically related to the application of Chain of Thought prompting in large language models within a Python environment.
+Provide the refined query text."""
+    messages = [
+        ("system", "You are a helpful research assistant specializing in AI and software research."),
+        ("human", prompt)
+    ]
+    result = llm_groq.invoke(messages)
+    return result
+def justify_candidate(candidate, query):
+    prompt = f"""You are a highly knowledgeable AI research assistant. In one to two lines, explain why the repository titled "{candidate['title']}" is a good match for a query on "{query}". Mention key factors such as documentation quality and community validation if relevant.
+Repository Details:
+- Stars: {candidate['stars']}
+- Semantic Similarity: {candidate.get('semantic_similarity', 0):.4f}
+Provide a concise justification:"""
+    messages = [
+        ("system", "You are a highly knowledgeable AI research assistant that can succinctly justify repository matches."),
+        ("human", prompt)
+    ]
+    result = llm_groq.invoke(messages)
+    return result
+# ---------------------------
+# GitHub API Helper Functions
+# ---------------------------
+def fetch_readme_content(repo_full_name, headers):
+    readme_url = f"https://api.github.com/repos/{repo_full_name}/readme"
+    response = requests.get(readme_url, headers=headers)
+    if response.status_code == 200:
+        readme_data = response.json()
+        return base64.b64decode(readme_data.get('content', '')).decode('utf-8')
+    return ""
+def fetch_file_content(download_url):
+    try:
+        response = requests.get(download_url)
+        if response.status_code == 200:
+            return response.text
+    except Exception as e:
+        logger.error(f"Error fetching file: {e}")
+    return ""
+def fetch_directory_markdown(repo_full_name, path, headers):
+    md_content = ""
+    url = f"https://api.github.com/repos/{repo_full_name}/contents/{path}"
+    response = requests.get(url, headers=headers)
+    if response.status_code == 200:
+        items = response.json()
+        for item in items:
+            if item["type"] == "file" and item["name"].lower().endswith(".md"):
+                content = fetch_file_content(item["download_url"])
+                md_content += f"\n\n# {item['name']}\n" + content
+    return md_content
+def fetch_repo_documentation(repo_full_name, headers):
+    doc_text = ""
+    # Fetch README first.
+    readme = fetch_readme_content(repo_full_name, headers)
+    if readme:
+        doc_text += "# README\n" + readme
+    # Fetch additional markdown files and documentation directories.
+    root_url = f"https://api.github.com/repos/{repo_full_name}/contents"
+    response = requests.get(root_url, headers=headers)
+    if response.status_code == 200:
+        items = response.json()
+        for item in items:
+            if item["type"] == "file" and item["name"].lower().endswith(".md"):
+                if item["name"].lower() != "readme.md":
+                    content = fetch_file_content(item["download_url"])
+                    doc_text += f"\n\n# {item['name']}\n" + content
+            elif item["type"] == "dir" and item["name"].lower() in ["docs", "documentation"]:
+                doc_text += f"\n\n# {item['name']} folder\n" + fetch_directory_markdown(repo_full_name, item["name"], headers)
+    return doc_text if doc_text.strip() else "No documentation available."
+def fetch_github_repositories(query, max_results=1000, per_page=100):
+    url = "https://api.github.com/search/repositories"
+    headers = {
+        "Authorization": f"token {os.getenv('GITHUB_API_KEY')}",
+        "Accept": "application/vnd.github.v3+json"
+    }
+    repositories = []
+    num_pages = max_results // per_page
+    for page in range(1, num_pages + 1):
+        params = {
+            "q": query,
+            "sort": "stars",
+            "order": "desc",
+            "per_page": per_page,
+            "page": page
+        }
+        response = requests.get(url, headers=headers, params=params)
+        if response.status_code != 200:
+            logger.error(f"Error {response.status_code}: {response.json().get('message')}")
+            break
+        items = response.json().get('items', [])
+        if not items:
+            break
+        for repo in items:
+            repo_link = repo['html_url']
+            full_name = repo.get('full_name', '')
+            doc_content = fetch_repo_documentation(full_name, headers)
+            star_count = repo.get('stargazers_count', 0)
+            repositories.append({
+                "title": repo.get('name', 'No title available'),
+                "link": repo_link,
+                "combined_doc": doc_content,
+                "stars": star_count,
+                "full_name": full_name,
+                "open_issues_count": repo.get('open_issues_count', 0)
+            })
+    logger.info(f"Fetched {len(repositories)} repositories from GitHub.")
+    return repositories
+# ---------------------------
+# Main Lite Workflow Function
+# ---------------------------
+def run_deepgit_lite(user_query):
+    # Stage 0: Query Enhancement using ChatGroq
+    logger.info("Enhancing query using ChatGroq...")
+    original_query = user_query.strip()
+    enhanced_query = enhance_query(original_query)
+    logger.info(f"Enhanced Query: {enhanced_query}")
+    github_query = enhanced_query + " language:python"
+    logger.info(f"Using GitHub query: {github_query}")
+    # Stage 1: Dense Retrieval with FAISS
+    logger.info("Fetching repositories from GitHub...")
+    repos = fetch_github_repositories(github_query)
+    docs = [repo.get("combined_doc", "") for repo in repos]
+    logger.info(f"Encoding {len(docs)} documents for dense retrieval...")
+    sem_model = SentenceTransformer("all-mpnet-base-v2")
+    doc_embeddings = sem_model.encode(docs, convert_to_numpy=True, show_progress_bar=True, batch_size=16)
+    def normalize_embeddings(embeddings):
+        norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
+        return embeddings / (norms + 1e-10)
+    doc_embeddings = normalize_embeddings(doc_embeddings)
+    query_embedding = sem_model.encode(user_query, convert_to_numpy=True)
+    query_embedding = normalize_embeddings(np.expand_dims(query_embedding, axis=0))[0]
+    dim = doc_embeddings.shape[1]
+    index = faiss.IndexFlatIP(dim)
+    index.add(doc_embeddings)
+    k = min(100, doc_embeddings.shape[0])
+    D, I = index.search(np.expand_dims(query_embedding, axis=0), k)
+    for idx, score in zip(I[0], D[0]):
+        repos[idx]["semantic_similarity"] = score
+    ranked_by_semantic = sorted(repos, key=lambda x: x.get("semantic_similarity", 0), reverse=True)
+    logger.info(f"Stage 1 complete: {len(ranked_by_semantic)} candidates ranked by semantic similarity.")
+    # Stage 2: Filtering Low-Star Repositories
+    filtered_candidates = [repo for repo in ranked_by_semantic if repo["stars"] >= 50]
+    if not filtered_candidates:
+        filtered_candidates = ranked_by_semantic  # fallback if filtering is too strict
+    logger.info(f"Stage 2 complete: {len(filtered_candidates)} candidates remain after filtering low-star repositories.")
+    # Stage 3: Combine Scores for Final Ranking (Using Semantic Similarity and Stars Only)
+    semantic_scores = [repo.get("semantic_similarity", 0) for repo in filtered_candidates]
+    star_scores = [math.log(repo.get("stars", 0) + 1) for repo in filtered_candidates]
+    min_sem, max_sem = min(semantic_scores), max(semantic_scores)
+    min_star, max_star = min(star_scores), max(star_scores)
+    def normalize(val, min_val, max_val):
+        if max_val - min_val == 0:
+            return 0.5
+        return (val - min_val) / (max_val - min_val)
+    for repo in filtered_candidates:
+        norm_sem = normalize(repo.get("semantic_similarity", 0), min_sem, max_sem)
+        norm_star = normalize(math.log(repo.get("stars", 0) + 1), min_star, max_star)
+        # Weights: 60% semantic, 40% stars.
+        repo["final_score"] = 0.6 * norm_sem + 0.4 * norm_star
+    final_ranked = sorted(filtered_candidates, key=lambda x: x["final_score"], reverse=True)
+    logger.info(f"Stage 3 complete: Final ranking computed for {len(final_ranked)} candidates.")
+    # Stage 4: Final Justification using ChatGroq
+    justifications = {}
+    for repo in final_ranked[:10]:
+        justification = justify_candidate(repo, user_query)
+        justifications[repo['title']] = justification
+        logger.info(f"Justification for {repo['title']}: {justification}")
+    # Format final results into a text table.
+    result_text = "\n=== Final Ranked Repositories ===\n"
+    for rank, repo in enumerate(final_ranked[:10], 1):
+        result_text += f"Final Rank: {rank}\n"
+        result_text += f"Title: {repo['title']}\n"
+        result_text += f"Link: {repo['link']}\n"
+        result_text += f"Stars: {repo['stars']}\n"
+        result_text += f"Semantic Similarity: {repo.get('semantic_similarity', 0):.4f}\n"
+        result_text += f"Final Score: {repo.get('final_score', 0):.4f}\n"
+        result_text += f"Justification: {justifications.get(repo['title'], 'No justification available')}\n"
+        result_text += f"Combined Doc Snippet: {repo['combined_doc'][:200]}...\n"
+        result_text += '-' * 80 + "\n"
+    result_text += "\n=== End of Results ==="
+    return result_text