zamalali committed on
Commit
0b55d27
·
0 Parent(s):

Push of DeepGit core files

Browse files
Files changed (5) hide show
  1. .gitignore +5 -0
  2. agent.py +125 -0
  3. app.py +309 -0
  4. main.py +380 -0
  5. requirements.txt +8 -0
.gitignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ .venv/
2
+ .env
3
+ __pycache__/
4
+ *.pyc
5
+ .gradio/
agent.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # agent.py
2
+
3
+ import os
4
+ from dotenv import load_dotenv
5
+ from github import Github, Auth
6
+
7
+ # LangChain imports
8
+ from langchain_groq import ChatGroq
9
+ from langchain_core.tools import tool
10
+ from langchain.agents import create_tool_calling_agent, AgentExecutor
11
+ from langchain import hub
12
+
13
# Load environment variables from .env (no-op if the file is absent,
# e.g. on HuggingFace Spaces where secrets arrive as env vars).
load_dotenv()
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
GITHUB_PAT = os.getenv("GITHUB_API_KEY")

# Fail fast at import time if either credential is missing.
if not (GROQ_API_KEY and GITHUB_PAT):
    raise ValueError("Please set GROQ_API_KEY and GITHUB_API_KEY in your .env")

# Initialize a module-level GitHub client shared by the tool below.
_auth = Auth.Token(GITHUB_PAT)
_gh = Github(auth=_auth)
24
+
25
# Define the GitHub tool
@tool
def get_repo_info(repo_name: str) -> str:
    """Fetch and summarize metadata about a GitHub repository.

    Args:
        repo_name: Full repository name in "owner/repo" form.

    Returns:
        A human-readable multi-line summary of the repository, or an
        error message string if the repository cannot be fetched.
    """
    try:
        repo = _gh.get_repo(repo_name)
    except Exception as e:
        return f" Error fetching '{repo_name}': {e}"

    name = repo.full_name
    desc = repo.description or "No description"
    url = repo.html_url
    owner = repo.owner.login
    stars = repo.stargazers_count
    forks = repo.forks_count
    issues = repo.open_issues_count
    watchers = repo.watchers_count
    default_br = repo.default_branch
    language = repo.language or "None"

    # Narrowed from a bare "except:": a bare except also swallows
    # KeyboardInterrupt/SystemExit; Exception is the right scope for
    # "repo has no license / API error" here.
    try:
        license_name = repo.get_license().license.name
    except Exception:
        license_name = "None"

    topics = repo.get_topics()
    try:
        raw_md = repo.get_readme().decoded_content.decode("utf-8")
        snippet = raw_md[:300].replace("\n", " ") + "..."
    except Exception:
        snippet = "No README found"

    contribs = repo.get_contributors()[:5]
    contrib_list = ", ".join(f"{c.login}({c.contributions})" for c in contribs)

    commits = repo.get_commits()[:3]
    commit_list = "; ".join(c.commit.message.split("\n")[0] for c in commits)

    return f"""
Repository: {name}
Description: {desc}
URL: {url}
Owner: {owner}
⭐ Stars: {stars} 🍴 Forks: {forks} 🐛 Open Issues: {issues}
👁️ Watchers: {watchers} Default branch: {default_br}
⚙️ Language: {language} License: {license_name}
🔍 Topics: {topics}

README Snippet: {snippet}

👥 Top Contributors: {contrib_list}
🧾 Latest Commits: {commit_list}
"""
80
+
81
# Instantiate the Groq LLM used by the agent.
llm = ChatGroq(
    model="llama-3.1-8b-instant",
    temperature=0.3,
    max_tokens=1024,
    api_key=GROQ_API_KEY,
)

# Define the tools to pass into the agent.
tools = [get_repo_info]

# Tool-calling agent prompt, built locally (not pulled from the LangChain hub).
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

prompt = ChatPromptTemplate.from_messages([
    ("system",
     "You are GitHub Agent, an expert at analyzing repositories.\n"
     "When a user asks about a repo, call the tool and return a clear, concise summary of the repository based on the tool result.\n"
     "Avoid repeating raw tool output or adding unnecessary disclaimers.\n"
     "Respond in complete sentences, in natural language."
    ),
    MessagesPlaceholder(variable_name="chat_history", optional=True),
    ("human", "{input}"),
    MessagesPlaceholder(variable_name="agent_scratchpad"),
])

# Create the agent using LangChain's legacy AgentExecutor approach.
agent = create_tool_calling_agent(llm, tools, prompt)

# Executor caps tool iterations at 2 to bound latency and API cost.
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True, max_iterations=2)

# Public API of this module (consumed by app.py).
__all__ = ["agent_executor"]

# Quick smoke test: run a single query and report wall-clock time.
if __name__ == "__main__":
    import time
    user_input = "Give me details about the repo zamalali/deepgit"
    start_time = time.time()
    result = agent_executor.invoke({"input": user_input})
    end_time = time.time()
    print("\n Final Answer:\n", result["output"])
    print(f"\n Took {end_time - start_time:.2f} seconds")
app.py ADDED
@@ -0,0 +1,309 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import time
3
+ import threading
4
+ import logging
5
+ from gradio.themes.utils import sizes
6
+ from main import run_repository_ranking # Import the repository ranking function
7
+ import agent # Import the test.py module for chat agent
8
+
9
# ---------------------------
# Global Logging Buffer Setup
# ---------------------------
# Every log record emitted in the process is mirrored into this in-memory
# list so the UI can stream it; guarded by a lock because the ranking
# workflow runs in a background thread.
LOG_BUFFER = []
LOG_BUFFER_LOCK = threading.Lock()

class BufferLogHandler(logging.Handler):
    """Logging handler that appends each formatted record to LOG_BUFFER."""
    def emit(self, record):
        log_entry = self.format(record)
        with LOG_BUFFER_LOCK:
            LOG_BUFFER.append(log_entry)

# Install the buffer handler on the root logger exactly once
# (guards against duplicate handlers on module re-import).
root_logger = logging.getLogger()
if not any(isinstance(h, BufferLogHandler) for h in root_logger.handlers):
    handler = BufferLogHandler()
    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
    handler.setFormatter(formatter)
    root_logger.addHandler(handler)
27
+
28
def filter_logs(logs):
    """Collapse noisy "HTTP Request:" log lines into a single placeholder.

    Each consecutive run of HTTP-request entries becomes one
    "Fetching repositories..." marker; all other lines pass through
    unchanged, in order.
    """
    cleaned = []
    in_fetch_run = False
    for entry in logs:
        is_http = "HTTP Request:" in entry
        if is_http:
            # Only emit the placeholder once per run of HTTP lines.
            if not in_fetch_run:
                cleaned.append("Fetching repositories...")
        else:
            cleaned.append(entry)
        in_fetch_run = is_http
    return cleaned
40
+
41
def parse_result_to_html(raw_result: str, num_results: int) -> tuple:
    """Parse the raw output of run_repository_ranking into an HTML table.

    Only the top `num_results` entries are rendered.
    (Fix: the original annotation `-> (str, list)` evaluates to a runtime
    tuple and is not a valid type annotation; `-> tuple` is used instead.)

    Args:
        raw_result: Formatted report text produced by main.run_repository_ranking.
        num_results: Maximum number of entries to render.

    Returns:
        (html, repo_names): the table markup and the "owner/repo" names
        extracted from each entry's GitHub link.
    """
    # Each repository entry in the raw report begins with "Final Rank:".
    entries = raw_result.strip().split("Final Rank:")
    entries = entries[1:num_results+1]  # drop the header chunk, keep top N
    if not entries:
        return ("<p>No repositories found for your query.</p>", [])
    html = """
    <table border="1" style="width:80%; margin: auto; border-collapse: collapse;">
    <thead>
    <tr>
    <th>Rank</th>
    <th>Title</th>
    <th>Link</th>
    <th>Combined Score</th>
    </tr>
    </thead>
    <tbody>
    """
    repo_names = []
    for entry in entries:
        lines = entry.strip().split("\n")
        data = {}
        # The first line of each entry is the rank number itself.
        data["Final Rank"] = lines[0].strip() if lines else ""
        for line in lines[1:]:
            if ": " in line:
                key, val = line.split(": ", 1)
                data[key.strip()] = val.strip()
        # Extract "owner/repo" from the GitHub link for the chat selector.
        link = data.get('Link', '')
        repo_name = ''
        if 'github.com/' in link:
            repo_name = link.split('github.com/')[-1].strip('/ ')
        if repo_name:
            repo_names.append(repo_name)
        html += f"""
    <tr>
    <td>{data.get('Final Rank', '')}</td>
    <td>{data.get('Title', '')}</td>
    <td><a href="{data.get('Link', '#')}" target="_blank">GitHub</a></td>
    <td>{data.get('Combined Score', '')}</td>
    </tr>
    """
    html += "</tbody></table>"
    return html, repo_names
89
+
90
# ---------------------------
# GPU-enabled Wrapper for Repository Ranking
# ---------------------------
def gpu_run_repo(topic: str, num_results: int):
    """Thin pass-through to the ranking pipeline (kept as a GPU-decoration hook)."""
    return run_repository_ranking(topic, num_results)

def run_lite_workflow(topic, num_results, result_container):
    """Thread target: run the ranking and stash the raw report in the shared dict."""
    result = gpu_run_repo(topic, num_results)
    result_container["raw_result"] = result
99
+
100
def stream_lite_workflow(topic, num_results):
    """Run the ranking in a background thread while streaming log lines to the UI.

    Yields (status, details_html, repo_names) tuples; repo_names is only
    populated on the final yield once the workflow has finished.
    """
    logging.info("[UI] User started a new search for topic: %s", topic)
    with LOG_BUFFER_LOCK:
        LOG_BUFFER.clear()
    result_container = {}
    workflow_thread = threading.Thread(target=run_lite_workflow, args=(topic, num_results, result_container))
    workflow_thread.start()

    last_index = 0
    # Poll the shared log buffer while the worker runs, relaying new lines.
    while workflow_thread.is_alive() or (last_index < len(LOG_BUFFER)):
        with LOG_BUFFER_LOCK:
            new_logs = LOG_BUFFER[last_index:]
            last_index = len(LOG_BUFFER)
        if new_logs:
            filtered_logs = filter_logs(new_logs)
            # Most recent line becomes the status; full batch becomes detail.
            status_msg = filtered_logs[-1]
            detail_msg = "<br/>".join(filtered_logs)
            yield status_msg, detail_msg, []
        time.sleep(0.5)

    workflow_thread.join()
    with LOG_BUFFER_LOCK:
        final_logs = LOG_BUFFER[:]
    raw_result = result_container.get("raw_result", "No results returned.")
    html_result, repo_names = parse_result_to_html(raw_result, num_results)
    yield "", html_result, repo_names
126
+
127
def lite_runner(topic, num_results):
    """Gradio event entry point: emit an initial status, then relay the stream."""
    logging.info("[UI] Running lite_runner for topic: %s", topic)
    yield "Workflow started", "<p>Processing your request. Please wait...</p>", []
    for status, details, repos in stream_lite_workflow(topic, num_results):
        yield status, details, repos
132
+
133
+ # ---------------------------
134
+ # App UI Setup Using Gradio Soft Theme with Centered Layout
135
+ # ---------------------------
136
+ with gr.Blocks(
137
+ theme=gr.themes.Soft(text_size=sizes.text_md),
138
+ title="DeepGit Lite",
139
+ css="""
140
+ /* Center header and footer */
141
+ #header { text-align: center; margin-bottom: 20px; }
142
+ #main-container { max-width: 800px; margin: auto; }
143
+ #footer { text-align: center; margin-top: 20px; }
144
+ """
145
+ ) as demo:
146
+ gr.Markdown(
147
+ """
148
+ <div style="padding-top: 60px;">
149
+ <div style="display: flex; align-items: center; justify-content: center;">
150
+ <img src="https://img.icons8.com/?size=100&id=118557&format=png&color=000000"
151
+ style="width: 60px; height: 60px; margin-right: 12px;">
152
+ <h1 style="margin: 0; font-size: 2.5em; font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;">
153
+ DeepGit Lite
154
+ </h1>
155
+ </div>
156
+ <div style="text-align: center; margin-top: 20px; font-size: 1.1em; font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;">
157
+ <p>
158
+ ✨ DeepGit Lite is the lightweight pro version of <strong>DeepGit</strong>.<br>
159
+ It harnesses advanced deep semantic search to explore GitHub repositories and deliver curated results.<br>
160
+ Under the hood, it leverages a hybrid ranking approach combining dense retrieval, BM25 scoring, and cross-encoder re-ranking for optimal discovery.<br>
161
+ If the agent returns no repositories found, it means no chain was invoked due to GPU unavailability. Please duplicate the space and re-run.
162
+ </p>
163
+ <p>
164
+ 🚀 Check out the full DeepGit version on
165
+ <a href="https://github.com/zamalali/DeepGit" target="_blank">GitHub</a> and ⭐
166
+ <strong>Star DeepGit</strong> on GitHub!
167
+ </p>
168
+ </div>
169
+ </div>
170
+ """,
171
+ elem_id="header"
172
+ )
173
+
174
+ # --- Search UI ---
175
+ with gr.Column(elem_id="main-container", visible=True) as search_ui:
176
+ research_input = gr.Textbox(
177
+ label="Research Query",
178
+ placeholder="Enter your research topic here, e.g., Looking for a low code/no code tool to augment images and annotations?",
179
+ lines=3
180
+ )
181
+ num_results_slider = gr.Slider(
182
+ minimum=5, maximum=25, value=10, step=1,
183
+ label="Number of Results to Display",
184
+ info="Choose how many top repositories to show (sorted by score)"
185
+ )
186
+ run_button = gr.Button("Run DeepGit Lite", variant="primary")
187
+ status_display = gr.Markdown(label="Status")
188
+ detail_display = gr.HTML(label="Results")
189
+ repo_state = gr.State([])
190
+ go_to_chat_btn = gr.Button("Go to Chat", visible=False)
191
+
192
+ # --- Chat UI ---
193
+ with gr.Column(visible=False) as chat_ui:
194
+
195
+ repo_choice = gr.Radio(choices=[], label="Select a repository", interactive=True)
196
+ chat_history = gr.Chatbot(label="Chat with GitHub Agent")
197
+ user_input = gr.Textbox(label="Your question", placeholder="Ask about the selected repo...e.g., tell me a bit more and guide me to set this up and running?")
198
+ send_btn = gr.Button("Send")
199
+ chat_state = gr.State([])
200
+ back_btn = gr.Button("Back to Search")
201
+
202
+ def update_chat_button(status, details, repos):
203
+ logging.info("[UI] Search complete. Showing Go to Chat button: %s", bool(repos))
204
+ return gr.update(visible=bool(repos)), repos
205
+
206
+ def show_chat_ui(repos):
207
+ logging.info("[UI] Switching to Chat UI. Repositories available: %s", repos)
208
+ return gr.update(visible=False), gr.update(visible=True), gr.update(choices=repos, value=None), []
209
+
210
+ def back_to_search():
211
+ logging.info("[UI] Switching back to Search UI.")
212
+ return gr.update(visible=True), gr.update(visible=False), gr.update(value=[]), gr.update(value=None), []
213
+
214
    def chat_with_agent(user_msg, repo, history):
        """Send the user's message (prefixed with the selected repo) to the agent.

        Returns (updated_chatbot_history, updated_state); both are the same
        list because chat_history and chat_state are kept in sync.
        """
        logging.info("[Chat] User sent message: '%s' for repo: '%s'", user_msg, repo)
        if not user_msg or not user_msg.strip():
            # Block blank messages
            return history + [["", "Please enter a message before sending."]], history
        if not repo:
            return history + [[user_msg, "Please select a repository first."]], history
        # Prefix the repo name so the agent knows which repository to inspect.
        full_query = f"[{repo}] {user_msg}"
        try:
            result = agent.agent_executor.invoke({"input": full_query})
            answer = result["output"]
            logging.info("[Chat] Agent response received.")
        except Exception as e:
            answer = f"Error: {e}"
            logging.error("[Chat] Error in agent_executor: %s", e)
        history = history + [[user_msg, answer]]
        return history, history
231
+
232
+ # Disable send button if no repo is selected or message is blank, and show a helpful message
233
+ def can_send(user_msg, repo):
234
+ if not user_msg or not user_msg.strip():
235
+ return gr.update(interactive=False, value="Enter a message to send")
236
+ if not repo:
237
+ return gr.update(interactive=False, value="Select a repository")
238
+ return gr.update(interactive=True, value="Send")
239
+ user_input.change(
240
+ fn=can_send,
241
+ inputs=[user_input, repo_choice],
242
+ outputs=[send_btn],
243
+ show_progress=False
244
+ )
245
+ repo_choice.change(
246
+ fn=can_send,
247
+ inputs=[user_input, repo_choice],
248
+ outputs=[send_btn],
249
+ show_progress=False
250
+ )
251
+
252
+ run_button.click(
253
+ fn=lite_runner,
254
+ inputs=[research_input, num_results_slider],
255
+ outputs=[status_display, detail_display, repo_state],
256
+ api_name="deepgit_lite",
257
+ show_progress=True
258
+ ).then(
259
+ fn=update_chat_button,
260
+ inputs=[status_display, detail_display, repo_state],
261
+ outputs=[go_to_chat_btn, repo_state]
262
+ )
263
+
264
+ research_input.submit(
265
+ fn=lite_runner,
266
+ inputs=[research_input, num_results_slider],
267
+ outputs=[status_display, detail_display, repo_state],
268
+ api_name="deepgit_lite_submit",
269
+ show_progress=True
270
+ ).then(
271
+ fn=update_chat_button,
272
+ inputs=[status_display, detail_display, repo_state],
273
+ outputs=[go_to_chat_btn, repo_state]
274
+ )
275
+
276
+ go_to_chat_btn.click(
277
+ fn=show_chat_ui,
278
+ inputs=[repo_state],
279
+ outputs=[search_ui, chat_ui, repo_choice, chat_state]
280
+ )
281
+
282
+ back_btn.click(
283
+ fn=back_to_search,
284
+ inputs=[],
285
+ outputs=[search_ui, chat_ui, chat_history, repo_choice, chat_state]
286
+ )
287
+
288
+ send_btn.click(
289
+ fn=chat_with_agent,
290
+ inputs=[user_input, repo_choice, chat_state],
291
+ outputs=[chat_history, chat_state],
292
+ queue=False
293
+ )
294
+ user_input.submit(
295
+ fn=chat_with_agent,
296
+ inputs=[user_input, repo_choice, chat_state],
297
+ outputs=[chat_history, chat_state],
298
+ queue=False
299
+ )
300
+
301
+ gr.HTML(
302
+ """
303
+ <div id="footer">
304
+ Made with ❤️ by <b>Zamal</b>
305
+ </div>
306
+ """
307
+ )
308
+
309
+ demo.queue(max_size=10).launch()
main.py ADDED
@@ -0,0 +1,380 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import base64
3
+ import requests
4
+ import numpy as np
5
+ import faiss
6
+ import re
7
+ import logging
8
+ from pathlib import Path
9
+
10
+ # For local development, load environment variables from a .env file.
11
+ # In HuggingFace Spaces, secrets are automatically available as environment variables.
12
+ from dotenv import load_dotenv
13
+ load_dotenv()
14
+
15
+ from sentence_transformers import SentenceTransformer, CrossEncoder
16
+ from langchain_groq import ChatGroq
17
+ from langchain_core.prompts import ChatPromptTemplate
18
+
19
+ # Optionally import BM25 for sparse retrieval.
20
+ try:
21
+ from rank_bm25 import BM25Okapi
22
+ except ImportError:
23
+ BM25Okapi = None
24
+
25
+ # ---------------------------
26
+ # Environment Variables & Setup
27
+ # ---------------------------
28
+ # GitHub API key (required for GitHub API calls)
29
+ GITHUB_API_KEY = os.getenv("GITHUB_API_KEY")
30
+ # GROQ API key (if required by ChatGroq)
31
+ GROQ_API_KEY = os.getenv("GROQ_API_KEY")
32
+ # HuggingFace token (if you need it to load private models from HuggingFace)
33
+ HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
34
+
35
+ CROSS_ENCODER_MODEL = os.getenv("CROSS_ENCODER_MODEL", "cross-encoder/ms-marco-MiniLM-L-6-v2")
36
+
37
+ # Set up a persistent session for GitHub API requests.
38
+ session = requests.Session()
39
+ session.headers.update({
40
+ "Authorization": f"token {GITHUB_API_KEY}",
41
+ "Accept": "application/vnd.github.v3+json"
42
+ })
43
+
44
+ # ---------------------------
45
+ # Langchain Groq Setup for Search Tag Conversion
46
+ # ---------------------------
47
+ llm = ChatGroq(
48
+ model="deepseek-r1-distill-llama-70b",
49
+ temperature=0.3,
50
+ max_tokens=512,
51
+ max_retries=3,
52
+ api_key=GROQ_API_KEY # Pass GROQ_API_KEY if the ChatGroq library supports it.
53
+ )
54
+
55
+ prompt = ChatPromptTemplate.from_messages([
56
+ ("system",
57
+ """You are a GitHub search optimization expert.
58
+
59
+ Your job is to:
60
+ 1. Read a user's query about tools, research, or tasks.
61
+ 2. Detect if the query mentions a specific programming language other than Python (for example, JavaScript or JS). If so, record that language as the target language.
62
+ 3. Think iteratively and generate your internal chain-of-thought enclosed in <think> ... </think> tags.
63
+ 4. After your internal reasoning, output up to five GitHub-style search tags or library names that maximize repository discovery.
64
+ Use as many tags as necessary based on the query's complexity, but never more than five.
65
+ 5. If you detected a non-Python target language, append an additional tag at the end in the format target-[language] (e.g., target-javascript).
66
+ If no specific language is mentioned, do not include any target tag.
67
+
68
+ Output Format:
69
+ tag1:tag2[:tag3[:tag4[:tag5[:target-language]]]]
70
+
71
+ Rules:
72
+ - Use lowercase and hyphenated keywords (e.g., image-augmentation, chain-of-thought).
73
+ - Use terms commonly found in GitHub repo names, topics, or descriptions.
74
+ - Avoid generic terms like "python", "ai", "tool", "project".
75
+ - Do NOT use full phrases or vague words like "no-code", "framework", or "approach".
76
+ - Prefer real tools, popular methods, or dataset names when mentioned.
77
+ - If your output does not strictly match the required format, correct it after your internal reasoning.
78
+ - Choose high-signal keywords to ensure the search yields the most relevant GitHub repositories.
79
+
80
+ Excellent Examples:
81
+
82
+ Input: "No code tool to augment image and annotation"
83
+ Output: image-augmentation:albumentations
84
+
85
+ Input: "Repos around chain of thought prompting mainly for finetuned models"
86
+ Output: chain-of-thought:finetuned-llm
87
+
88
+ Input: "Find repositories implementing data augmentation pipelines in JavaScript"
89
+ Output: data-augmentation:target-javascript
90
+
91
+ Output must be ONLY the search tags separated by colons. Do not include any extra text, bullet points, or explanations.
92
+ """),
93
+ ("human", "{query}")
94
+ ])
95
+ chain = prompt | llm
96
+
97
def valid_tags(tags: str) -> bool:
    """Return True when `tags` is 2-6 colon-separated lowercase/hyphen tokens."""
    token = r'[a-z0-9-]+'
    return re.fullmatch(rf'{token}(?::{token}){{1,5}}', tags) is not None
100
+
101
def parse_search_tags(response: str) -> str:
    """Strip <think>...</think> reasoning blocks and extract the tag string.

    Falls back to the stripped remainder when no colon-separated tag
    sequence can be found.
    """
    without_thoughts = re.sub(r'<think>.*?</think>', '', response, flags=re.DOTALL)
    found = re.search(r'([a-z0-9-]+(?::[a-z0-9-]+){1,5})', without_thoughts)
    return found.group(1).strip() if found else without_thoughts.strip()
109
+
110
def iterative_convert_to_search_tags(query: str, max_iterations: int = 2) -> str:
    """Ask the LLM for GitHub search tags, re-prompting when the format is invalid.

    Returns the last tags string produced; note it may still be invalid
    after max_iterations attempts (callers should tolerate that).
    """
    print(f"\n [iterative_convert_to_search_tags] Input Query: {query}")
    refined_query = query
    tags_output = ""
    for iteration in range(max_iterations):
        print(f"\n Iteration {iteration+1}")
        response = chain.invoke({"query": refined_query})
        full_output = response.content.strip()
        tags_output = parse_search_tags(full_output)
        print(f"Output Tags: {tags_output}")
        if valid_tags(tags_output):
            print("Valid tags format detected.")
            return tags_output
        else:
            print(" Invalid tags format. Requesting refinement...")
            # Re-ask with an explicit format reminder appended to the query.
            refined_query = f"{query}\nPlease refine your answer so that the output strictly matches the format: tag1:tag2[:tag3[:tag4[:tag5[:target-language]]]]."
    print("Final output (may be invalid):", tags_output)
    return tags_output
128
+
129
# ---------------------------
# GitHub API Helper Functions
# ---------------------------
def fetch_readme_content(repo_full_name: str) -> str:
    """Return the repository README decoded as UTF-8, or "" on any failure."""
    readme_url = f"https://api.github.com/repos/{repo_full_name}/readme"
    response = session.get(readme_url)
    if response.status_code != 200:
        return ""
    payload = response.json()
    try:
        # The contents API returns base64-encoded file content.
        return base64.b64decode(payload.get('content', '')).decode('utf-8', errors='replace')
    except Exception:
        return ""
142
+
143
def fetch_markdown_contents(repo_full_name: str) -> str:
    """Concatenate the text of every top-level .md file in the repository.

    Returns "" when the contents listing cannot be fetched; individual file
    download failures are skipped silently.
    """
    url = f"https://api.github.com/repos/{repo_full_name}/contents"
    response = session.get(url)
    contents = ""
    if response.status_code == 200:
        items = response.json()
        for item in items:
            if item.get("type") == "file" and item.get("name", "").lower().endswith(".md"):
                file_url = item.get("download_url")
                if file_url:
                    # Fix: use the shared authenticated session here — the
                    # original called bare requests.get, bypassing the auth
                    # header and hitting the unauthenticated rate limit.
                    file_resp = session.get(file_url)
                    if file_resp.status_code == 200:
                        contents += "\n" + file_resp.text
    return contents
157
+
158
def fetch_all_markdown(repo_full_name: str) -> str:
    """Combine the README with all other top-level markdown files' text."""
    parts = (fetch_readme_content(repo_full_name), fetch_markdown_contents(repo_full_name))
    return "\n".join(parts)
162
+
163
def fetch_github_repositories(query: str, max_results: int = 10) -> list:
    """Search GitHub and return [{"title", "link", "combined_text"}, ...].

    combined_text is the description plus all top-level markdown content,
    used downstream for dense/BM25 retrieval.
    """
    response = session.get(
        "https://api.github.com/search/repositories",
        params={"q": query, "per_page": max_results},
    )
    if response.status_code != 200:
        print(f"Error {response.status_code}: {response.json().get('message')}")
        return []
    results = []
    for item in response.json().get('items', []):
        description = item.get('description') or ""
        markdown = fetch_all_markdown(item.get('full_name'))
        results.append({
            "title": item.get('name', 'No title available'),
            "link": item.get('html_url'),
            "combined_text": (description + "\n" + markdown).strip(),
        })
    return results
185
+
186
# ---------------------------
# Dense Retrieval Model Setup
# ---------------------------
# NOTE(review): both the try and except branches load the model on CPU, so
# the original "falling back to CPU" wording was false; the except clause is
# kept only as a one-shot retry against transient load failures.
try:
    model = SentenceTransformer('all-mpnet-base-v2', device='cpu')
except Exception as e:
    print("Error initializing SentenceTransformer; retrying once on CPU:", e)
    model = SentenceTransformer('all-mpnet-base-v2', device='cpu')
195
+
196
def robust_min_max_norm(scores: np.ndarray) -> np.ndarray:
    """Min-max normalize scores to [0, 1].

    Returns an all-ones array when the scores are (near-)constant, avoiding
    division by a vanishing range.
    """
    lo, hi = scores.min(), scores.max()
    spread = hi - lo
    if spread < 1e-10:
        return np.ones_like(scores)
    return (scores - lo) / spread
202
+
203
# ---------------------------
# Cross-Encoder Re-Ranking Function
# ---------------------------
def cross_encoder_rerank_candidates(candidates: list, query: str, model_name: str, top_n: int = 10) -> list:
    """Score every candidate against the query with a cross-encoder, in place.

    Adds a "cross_encoder_score" key to each candidate dict and returns the
    (mutated) list. Long documents are scored chunk-wise and aggregated as
    0.5*max + 0.5*mean. NOTE: the top_n parameter is currently unused.
    """
    # NOTE(review): both branches load on CPU, so the "falling back to CPU"
    # message is misleading; the except acts only as a one-shot retry.
    try:
        cross_encoder = CrossEncoder(model_name, device='cpu')
    except Exception as e:
        print("Error initializing CrossEncoder on GPU; falling back to CPU:", e)
        cross_encoder = CrossEncoder(model_name, device='cpu')

    CHUNK_SIZE = 2000       # characters per scored chunk
    MAX_DOC_LENGTH = 5000   # hard cap on document length before chunking
    MIN_DOC_LENGTH = 200    # docs shorter than this are scored as one pair

    def split_text(text: str, chunk_size: int = CHUNK_SIZE) -> list:
        # Fixed-width character chunking, no overlap.
        return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

    for candidate in candidates:
        doc = candidate.get("combined_text", "")
        if len(doc) > MAX_DOC_LENGTH:
            doc = doc[:MAX_DOC_LENGTH]
        try:
            if len(doc) < MIN_DOC_LENGTH:
                score = cross_encoder.predict([[query, doc]])
                # predict may return a scalar or a length-1 sequence.
                if hasattr(score, '__len__') and len(score) == 1:
                    candidate["cross_encoder_score"] = float(score[0])
                else:
                    candidate["cross_encoder_score"] = float(score)
            else:
                chunks = split_text(doc)
                pairs = [[query, chunk] for chunk in chunks]
                scores = cross_encoder.predict(pairs)
                scores = np.array(scores)
                max_score = float(np.max(scores)) if scores.size > 0 else 0.0
                avg_score = float(np.mean(scores)) if scores.size > 0 else 0.0
                candidate["cross_encoder_score"] = 0.5 * max_score + 0.5 * avg_score
        except Exception as e:
            # Scoring failures demote the candidate rather than crashing the run.
            logging.debug(f"[cross-encoder] Error scoring candidate {candidate.get('link', 'unknown')}: {e}")
            candidate["cross_encoder_score"] = 0.0

    # Shift all scores up so the minimum is non-negative (keeps the later
    # weighted sum from being dragged down by negative logits).
    all_scores = [candidate["cross_encoder_score"] for candidate in candidates]
    if all_scores:
        min_score = min(all_scores)
        if min_score < 0:
            for candidate in candidates:
                candidate["cross_encoder_score"] += -min_score

    return candidates
251
+
252
# ---------------------------
# Main Ranking Function with Hybrid Retrieval and Combined Scoring
# ---------------------------
def run_repository_ranking(query: str, num_results: int = 10) -> str:
    """Run the full DeepGit ranking pipeline for a natural-language query.

    Pipeline: LLM tag generation -> GitHub search -> dense retrieval (FAISS)
    -> BM25 -> weighted combination -> cross-encoder re-rank -> formatted text.

    Args:
        query: The user's natural-language research query.
        num_results: Number of top repositories to include in the report.

    Returns:
        A formatted multi-entry report string (consumed by app.parse_result_to_html),
        or a "No repositories found" message.
    """
    logging.info("[DeepGit] Step 1: Generate search tags from the query.")
    search_tags = iterative_convert_to_search_tags(query)
    tag_list = [tag.strip() for tag in search_tags.split(":") if tag.strip()]

    # Step 2: Handle target language extraction.
    logging.info("[DeepGit] Step 2: Handle target language extraction.")
    if any(tag.startswith("target-") for tag in tag_list):
        target_tag = next(tag for tag in tag_list if tag.startswith("target-"))
        lang_query = f"language:{target_tag.replace('target-', '')}"
        tag_list = [tag for tag in tag_list if not tag.startswith("target-")]
    else:
        # Default to Python when the query names no other language.
        lang_query = "language:python"

    # Step 3: Build advanced search qualifiers.
    logging.info("[DeepGit] Step 3: Build advanced search qualifiers and fetch repositories.")
    advanced_qualifier = "in:name,description,readme"
    all_repositories = []

    # One search per tag, plus one combined OR search for extra recall.
    for tag in tag_list:
        github_query = f"{tag} {advanced_qualifier} {lang_query}"
        logging.info(f"[DeepGit] GitHub Query: {github_query}")
        repos = fetch_github_repositories(github_query, max_results=15)
        all_repositories.extend(repos)

    combined_query = " OR ".join(tag_list)
    combined_query = f"({combined_query}) {advanced_qualifier} {lang_query}"
    logging.info(f"[DeepGit] Combined GitHub Query: {combined_query}")
    repos = fetch_github_repositories(combined_query, max_results=15)
    all_repositories.extend(repos)

    # De-duplicate by link, merging the text of duplicate hits.
    unique_repositories = {}
    for repo in all_repositories:
        if repo["link"] not in unique_repositories:
            unique_repositories[repo["link"]] = repo
        else:
            existing_text = unique_repositories[repo["link"]]["combined_text"]
            unique_repositories[repo["link"]]["combined_text"] = existing_text + "\n" + repo["combined_text"]
    repositories = list(unique_repositories.values())

    if not repositories:
        return "No repositories found for your query."

    # Step 4: Prepare documents.
    logging.info("[DeepGit] Step 4: Prepare documents for dense retrieval.")
    docs = [repo.get("combined_text", "") for repo in repositories]

    # Step 5: Dense retrieval (cosine similarity via normalized inner product).
    logging.info("[DeepGit] Step 5: Compute dense embeddings and scores.")
    doc_embeddings = model.encode(docs, convert_to_numpy=True, show_progress_bar=True, batch_size=16)
    if doc_embeddings.ndim == 1:
        doc_embeddings = doc_embeddings.reshape(1, -1)
    norms = np.linalg.norm(doc_embeddings, axis=1, keepdims=True)
    norm_doc_embeddings = doc_embeddings / (norms + 1e-10)

    query_embedding = model.encode(query, convert_to_numpy=True)
    if query_embedding.ndim == 1:
        query_embedding = query_embedding.reshape(1, -1)
    norm_query_embedding = query_embedding / (np.linalg.norm(query_embedding) + 1e-10)

    dim = norm_doc_embeddings.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(norm_doc_embeddings)
    k = norm_doc_embeddings.shape[0]  # rank every document
    D, I = index.search(norm_query_embedding, k)
    # BUG FIX: faiss returns D sorted by similarity with I holding the
    # document indices; the original used D positionally, attaching dense
    # scores to the wrong repositories and mixing them with the
    # document-ordered BM25 scores. Scatter D back into document order.
    ranked_scores = D.squeeze(axis=0)
    ranked_order = I.squeeze(axis=0)
    dense_scores = np.empty_like(ranked_scores)
    dense_scores[ranked_order] = ranked_scores
    norm_dense_scores = robust_min_max_norm(dense_scores)

    # Step 6: BM25 scoring (sparse lexical signal; zeros if rank_bm25 absent).
    logging.info("[DeepGit] Step 6: Compute BM25 scores.")
    if BM25Okapi is not None:
        tokenized_docs = [re.findall(r'\w+', doc.lower()) for doc in docs]
        bm25 = BM25Okapi(tokenized_docs)
        query_tokens = re.findall(r'\w+', query.lower())
        bm25_scores = np.array(bm25.get_scores(query_tokens))
        norm_bm25_scores = robust_min_max_norm(bm25_scores)
    else:
        norm_bm25_scores = np.zeros_like(norm_dense_scores)

    # Step 7: Combine scores (dense score weighted higher).
    logging.info("[DeepGit] Step 7: Combine dense and BM25 scores.")
    alpha = 0.8  # weight of the dense signal
    combined_scores = alpha * norm_dense_scores + (1 - alpha) * norm_bm25_scores
    for idx, repo in enumerate(repositories):
        repo["combined_score"] = float(combined_scores[idx])

    # Step 8: Initial ranking by combined score.
    logging.info("[DeepGit] Step 8: Initial ranking by combined score.")
    ranked_repositories = sorted(repositories, key=lambda x: x.get("combined_score", 0), reverse=True)

    # Step 9: Compute cross-encoder scores for the top candidates.
    logging.info("[DeepGit] Step 9: Cross-encoder re-ranking.")
    top_candidates = ranked_repositories[:100] if len(ranked_repositories) > 100 else ranked_repositories
    cross_encoder_rerank_candidates(top_candidates, query, model_name=CROSS_ENCODER_MODEL, top_n=len(top_candidates))

    # Step 10: final_score = w1 * combined_score + w2 * cross_encoder_score.
    logging.info("[DeepGit] Step 10: Final scoring and output formatting.")
    w1 = 0.7
    w2 = 0.3
    for candidate in top_candidates:
        candidate["final_score"] = w1 * candidate.get("combined_score", 0) + w2 * candidate.get("cross_encoder_score", 0)

    final_ranked = sorted(top_candidates, key=lambda x: x.get("final_score", 0), reverse=True)[:num_results]

    # Step 11: Format final output with scores as percentages.
    output = "\n=== Ranked Repositories ===\n"
    for rank, repo in enumerate(final_ranked, 1):
        output += f"Final Rank: {rank}\n"
        output += f"Title: {repo['title']}\n"
        output += f"Link: {repo['link']}\n"
        output += f"Combined Score: {repo.get('combined_score', 0) * 100:.2f}%\n"
        output += f"Cross-Encoder Score: {repo.get('cross_encoder_score', 0) * 100:.2f}%\n"
        output += f"Final Score: {repo.get('final_score', 0) * 100:.2f}%\n"
        snippet = repo['combined_text'][:300].replace('\n', ' ')
        output += f"Snippet: {snippet}...\n"
        output += '-' * 80 + "\n"
    output += "\n=== End of Results ==="
    return output
373
+
374
# ---------------------------
# Main Entry Point for Testing
# ---------------------------
if __name__ == "__main__":
    # Smoke test: run the full pipeline for a sample query and print the report.
    test_query = "Chain of thought prompting for reasoning models"
    result = run_repository_ranking(test_query)
    print(result)
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ requests==2.32.3
2
+ numpy==1.25.2
3
+ python-dotenv==1.0.1
4
+ sentence-transformers==3.4.1
5
+ faiss-cpu==1.9.0.post1
6
+ langgraph==0.2.62
7
+ langchain_groq==0.2.4
8
+ langchain_core==0.3.47