Spaces:
Running
on
Zero
Running
on
Zero
zamalali
committed on
Commit
·
9494afe
1
Parent(s):
9e1349d
Add DeepGit Lite application and workflow files
Browse files- app.py +239 -0
- requirements.txt +9 -0
- src/__init__.py +0 -0
- src/__pycache__/__init__.cpython-311.pyc +0 -0
- src/__pycache__/deepgit_lite.cpython-311.pyc +0 -0
- src/deepgit_lite.py +245 -0
app.py
ADDED
@@ -0,0 +1,239 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import os
|
3 |
+
import time
|
4 |
+
import threading
|
5 |
+
import logging
|
6 |
+
from src.deepgit_lite import run_deepgit_lite
|
7 |
+
|
8 |
+
# ---------------------------
|
9 |
+
# Global Logging Buffer Setup
|
10 |
+
# ---------------------------
|
11 |
+
# Shared in-memory sink for formatted log lines. Every access goes through
# LOG_BUFFER_LOCK because records may be emitted from more than one thread.
LOG_BUFFER = []
LOG_BUFFER_LOCK = threading.Lock()


class BufferLogHandler(logging.Handler):
    """Logging handler that stores each formatted record in LOG_BUFFER."""

    def emit(self, record):
        """Format *record* and append the resulting line to LOG_BUFFER."""
        rendered = self.format(record)
        with LOG_BUFFER_LOCK:
            LOG_BUFFER.append(rendered)


# Attach exactly one BufferLogHandler to the root logger; the scan below keeps
# a repeated execution of this module from stacking duplicate handlers.
root_logger = logging.getLogger()
_buffer_handler_attached = False
for _existing in root_logger.handlers:
    if isinstance(_existing, BufferLogHandler):
        _buffer_handler_attached = True
        break
if not _buffer_handler_attached:
    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
    handler = BufferLogHandler()
    handler.setFormatter(formatter)
    root_logger.addHandler(handler)
|
26 |
+
|
27 |
+
def filter_logs(logs):
    """Collapse runs of HTTP request log lines into a single placeholder.

    Args:
        logs: Iterable of formatted log lines (strings).

    Returns:
        list: The lines in their original order, where every consecutive run
        of lines containing "HTTP Request:" is replaced by one
        "Fetching repositories..." entry.
    """
    collapsed = []
    in_fetch_run = False
    for entry in logs:
        is_http = "HTTP Request:" in entry
        if not is_http:
            collapsed.append(entry)
        elif not in_fetch_run:
            # First HTTP line of a run: emit the placeholder exactly once.
            collapsed.append("Fetching repositories...")
        in_fetch_run = is_http
    return collapsed
|
39 |
+
|
40 |
+
# ---------------------------
|
41 |
+
# Title, Favicon & Description
|
42 |
+
# ---------------------------
|
43 |
+
# <head> fragment (favicon link and page title) rendered through gr.HTML.
# NOTE(review): this markup is emitted via a gr.HTML component, so it presumably
# lands inside the page body rather than the document head — confirm it takes effect.
favicon_html = """
<head>
<link rel="icon" type="image/x-icon" href="file/assets/deepgit.ico">
<title>DeepGit Lite Research Agent</title>
</head>
"""

# Centered page banner: icon, product name and one-line tagline.
title = """
<div style="text-align: center; margin-top: 20px;">
<h1 style="font-size: 36px; display: inline-flex; align-items: center; gap: 16px;">
<img src="https://img.icons8.com/?size=100&id=118557&format=png&color=000000" width="64" />
<span>DeepGit Lite</span>
</h1>
<p style="font-size: 18px; color: #555; margin-top: 10px;">
⚙️ A lightweight GitHub research agent for deep semantic search and ranking.
</p>
</div>
"""

# Short product description shown under the banner.
description = """<p align="center">
DeepGit Lite is a streamlined version of DeepGit designed to perform advanced semantic research on GitHub repositories with faster response times. It uses query enhancement, dense retrieval via FAISS, activity analysis, and a final multi-factor ranking (combining semantic similarity, activity, and popularity) to deliver the best results.
</p>"""

# Consent notice displayed before the main interface is revealed.
consent_text = """
<div style="padding: 10px; text-align: center;">
<p>
By using DeepGit Lite, you consent to temporary processing of your query for semantic search and ranking purposes.
</p>
<p>
⭐ Star us on GitHub if you find this tool useful!<br/>
<a href="https://github.com/zamalali/DeepGit" target="_blank">GitHub</a>
</p>
</div>
"""

# Attribution footer rendered at the bottom of the page.
footer = """
<div style="text-align: center; margin-top: 40px; font-size: 13px; color: #888;">
Made with <span style="color: crimson;">❤️</span> by <b>Zamal</b>
</div>
"""
|
83 |
+
|
84 |
+
# ---------------------------
|
85 |
+
# HTML Table Renderer for DeepGit Lite
|
86 |
+
# ---------------------------
|
87 |
+
def format_percent(value):
    """Render a fractional score as a percentage string with one decimal.

    Args:
        value: A number, or a string holding a number, in fractional form
            (e.g. ``0.873`` or ``"0.873"``).

    Returns:
        The formatted percentage (e.g. ``"87.3%"``). When *value* cannot be
        converted to ``float`` it is returned unchanged so the caller can
        still display whatever it received.
    """
    try:
        return f"{float(value) * 100:.1f}%"
    except (TypeError, ValueError):
        # Only conversion failures are expected here; a bare ``except`` would
        # also swallow KeyboardInterrupt/SystemExit.
        return value
|
92 |
+
|
93 |
+
def parse_result_to_html(raw_result: str) -> str:
    """Convert the plain-text ranking report into an HTML table.

    The report is split on the literal marker ``"Final Rank:"``; the text
    before the first marker is ignored. Inside each entry the first line is
    the rank and every following line containing ``": "`` is read as a
    ``key: value`` pair.

    Args:
        raw_result: Report text as produced by the workflow.

    Returns:
        str: A ``<style>`` block followed by a ``<table>`` with one row per
        entry (Rank, Title, Link, Semantic Similarity, Activity Score,
        Final Score). A report without entries yields a table with no rows.
    """
    # Local import keeps this block self-contained.
    from html import escape

    def cell(value):
        # Values originate from external text (repository metadata, README
        # snippets), so they are escaped before being placed into markup.
        return escape(str(value), quote=True)

    table_open = """
<style>
    table {
        width: 100%;
        border-collapse: collapse;
        margin: 1em 0;
        font-size: 14px;
    }
    th, td {
        padding: 12px 15px;
        border: 1px solid #ddd;
        text-align: left;
        vertical-align: top;
    }
    th {
        background-color: #f4f4f4;
    }
    tr:hover { background-color: #f9f9f9; }
</style>
<table>
    <thead>
        <tr>
            <th>Rank</th>
            <th>Title</th>
            <th>Link</th>
            <th>Semantic Similarity</th>
            <th>Activity Score</th>
            <th>Final Score</th>
        </tr>
    </thead>
    <tbody>
"""
    rows = []
    for entry in raw_result.strip().split("Final Rank:")[1:]:
        lines = entry.strip().split("\n")
        data = {"Final Rank": lines[0].strip()}
        for line in lines[1:]:
            if ": " in line:
                key, val = line.split(": ", 1)
                # First occurrence wins: later lines may belong to a
                # multi-line justification or documentation snippet and must
                # not overwrite the real Title/Link/score fields.
                data.setdefault(key.strip(), val.strip())

        link = data.get("Link", "#")
        if not link.startswith(("http://", "https://")):
            # Refuse anything that is not a plain web URL (e.g. javascript:).
            link = "#"

        raw_activity = data.get("Activity Score", 0)
        try:
            activity = f"{float(raw_activity):.2f}"
        except (TypeError, ValueError):
            # Show the unparsed value instead of failing the whole table.
            activity = cell(raw_activity)

        rows.append(f"""
        <tr>
            <td>{cell(data.get('Final Rank', ''))}</td>
            <td>{cell(data.get('Title', ''))}</td>
            <td><a href="{cell(link)}" target="_blank">GitHub</a></td>
            <td>{cell(format_percent(data.get('Semantic Similarity', '')))}</td>
            <td>{activity}</td>
            <td>{cell(format_percent(data.get('Final Score', '')))}</td>
        </tr>
""")
    return table_open + "".join(rows) + "</tbody></table>"
|
147 |
+
|
148 |
+
# ---------------------------
|
149 |
+
# Background Workflow Runner for DeepGit Lite
|
150 |
+
# ---------------------------
|
151 |
+
def run_lite_workflow(topic, result_container):
    """Run the DeepGit Lite pipeline and store its report in *result_container*.

    Intended as a thread target: the outcome is handed back through the
    mutable *result_container* instead of a return value.

    Args:
        topic: Research topic passed to ``run_deepgit_lite``.
        result_container: Dict that receives the report text under the key
            ``"raw_result"``. On failure the key holds a short error message.
    """
    try:
        result_container["raw_result"] = run_deepgit_lite(topic)
    except Exception as exc:
        # Thread entry point: nothing above this frame can handle the error,
        # so record it in the log and leave a message for the waiting caller
        # instead of letting the failure pass unnoticed.
        logging.getLogger(__name__).exception("DeepGit Lite workflow failed")
        result_container["raw_result"] = f"DeepGit Lite failed: {exc}"
|
154 |
+
|
155 |
+
def stream_lite_workflow(topic):
    """Run the workflow in a background thread and stream its progress.

    Args:
        topic: Research topic forwarded to ``run_lite_workflow``.

    Yields:
        tuple[str, str]: ``(status, details)`` pairs. While the worker runs,
        ``status`` is the most recent filtered log line and ``details`` is the
        batch of new filtered log lines joined with ``<br/>``. The final pair
        is ``("", html_table)`` holding the rendered result.
    """
    # Local import keeps this block self-contained.
    from html import escape

    # NOTE(review): LOG_BUFFER is a module-level list, so overlapping runs
    # would clear and read each other's lines — confirm runs are serialized.
    with LOG_BUFFER_LOCK:
        LOG_BUFFER.clear()
    result_container = {}
    workflow_thread = threading.Thread(target=run_lite_workflow, args=(topic, result_container))
    workflow_thread.start()

    last_index = 0
    while True:
        # Sample liveness BEFORE draining: anything the worker logged before
        # it finished is then guaranteed to be picked up by this pass.
        still_running = workflow_thread.is_alive()
        with LOG_BUFFER_LOCK:
            new_logs = LOG_BUFFER[last_index:]
            last_index = len(LOG_BUFFER)
        if new_logs:
            filtered_logs = filter_logs(new_logs)
            status_msg = filtered_logs[-1]
            # Log lines can echo user/LLM text; escape them because the
            # details string is rendered as raw HTML.
            detail_msg = "<br/>".join(escape(line) for line in filtered_logs)
            yield status_msg, detail_msg
        if not still_running:
            break
        time.sleep(0.5)

    workflow_thread.join()
    raw_result = result_container.get("raw_result", "No results returned.")
    html_result = parse_result_to_html(raw_result)
    yield "", html_result
|
181 |
+
|
182 |
+
# ---------------------------
|
183 |
+
# App UI Setup for DeepGit Lite
|
184 |
+
# ---------------------------
|
185 |
+
# Build the Gradio interface: a consent screen that, once accepted, is swapped
# for the main research form.
with gr.Blocks(
    theme="gstaff/sketch",
    css="""
#main_container { margin: auto; max-width: 900px; }
footer, footer * { display: none !important; }
"""
) as demo:

    gr.HTML(favicon_html)
    gr.HTML(title)
    gr.HTML(description)

    # Consent screen: visible on load.
    with gr.Column(elem_id="user_consent_container") as consent_block:
        gr.HTML(consent_text)
        agree_button = gr.Button("I Agree", variant="primary")

    # Main form: created hidden and revealed by enable_main().
    with gr.Column(elem_id="main_container", visible=False) as main_block:
        research_input = gr.Textbox(
            label="Research Topic",
            placeholder="Enter your research topic here, e.g., 'Instruction-based fine-tuning for LLaMA 2 using chain-of-thought prompting in Python.'",
            lines=3
        )
        run_button = gr.Button("Run DeepGit Lite", variant="primary")
        status_display = gr.Markdown("")
        detail_display = gr.HTML("")
        # NOTE(review): output_html and state are not referenced by any event
        # handler in this file; the final table is delivered via detail_display.
        output_html = gr.HTML("")
        state = gr.State([])

    def enable_main():
        """Hide the consent block and show the main block."""
        return gr.update(visible=False), gr.update(visible=True)

    agree_button.click(fn=enable_main, inputs=[], outputs=[consent_block, main_block], queue=False)

    def lite_runner(topic):
        """Relay (status, details) pairs from stream_lite_workflow to the UI."""
        for status, details in stream_lite_workflow(topic):
            yield status, details

    # The button click and pressing Enter in the textbox trigger the same runner.
    run_button.click(
        fn=lite_runner,
        inputs=[research_input],
        outputs=[status_display, detail_display],
        api_name="deepgit_lite",
        show_progress=True
    )

    research_input.submit(
        fn=lite_runner,
        inputs=[research_input],
        outputs=[status_display, detail_display],
        api_name="deepgit_lite_submit",
        show_progress=True
    )

    gr.HTML(footer)

# Enable request queuing (at most 10 waiting requests) and start the server.
demo.queue(max_size=10).launch()
|
requirements.txt
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
requests==2.32.3
|
2 |
+
numpy==1.25.2
|
3 |
+
python-dotenv==1.0.1
|
4 |
+
sentence-transformers==3.4.1
|
5 |
+
faiss-cpu==1.9.0.post1
|
6 |
+
gradio==5.23.1
|
7 |
+
langgraph==0.2.62
|
8 |
+
langchain_groq==0.2.4
|
9 |
+
langchain_core==0.3.47
|
src/__init__.py
ADDED
File without changes
|
src/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (215 Bytes). View file
|
|
src/__pycache__/deepgit_lite.cpython-311.pyc
ADDED
Binary file (15.9 kB). View file
|
|
src/deepgit_lite.py
ADDED
@@ -0,0 +1,245 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import base64
|
3 |
+
import requests
|
4 |
+
import numpy as np
|
5 |
+
import datetime
|
6 |
+
from sentence_transformers import SentenceTransformer
|
7 |
+
import faiss
|
8 |
+
import getpass
|
9 |
+
import math
|
10 |
+
import logging
|
11 |
+
from dotenv import load_dotenv
|
12 |
+
from pathlib import Path
|
13 |
+
from langchain_groq import ChatGroq
|
14 |
+
from langchain_core.prompts import ChatPromptTemplate
|
15 |
+
|
16 |
+
# ---------------------------
|
17 |
+
# Environment and .env Setup
|
18 |
+
# ---------------------------
|
19 |
+
# Load environment variables from a ".env" file located one directory above
# the directory that contains this module.
dotenv_path = Path(__file__).resolve().parent.parent / ".env"
load_dotenv(dotenv_path=str(dotenv_path))

# Fall back to an interactive prompt when the GitHub token is not configured.
# NOTE(review): this runs at import time and getpass needs a terminal;
# presumably hosted deployments always provide GITHUB_API_KEY — confirm.
if "GITHUB_API_KEY" not in os.environ:
    os.environ["GITHUB_API_KEY"] = getpass.getpass("Enter your GitHub API key: ")

# ---------------------------
# Logging Setup
# ---------------------------
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# ---------------------------
# ChatGroq Integration Setup (for query enhancement and final justification)
# ---------------------------
# Single shared client used by enhance_query() and justify_candidate().
llm_groq = ChatGroq(
    model="llama-3.1-8b-instant",
    temperature=0.2,
    max_tokens=100,
    timeout=15,
    max_retries=2
)
|
41 |
+
|
42 |
+
def enhance_query(original_query):
    """Ask the Groq chat model to expand a research query.

    Args:
        original_query: The topic text entered by the user.

    Returns:
        str: The refined query text. Falls back to *original_query* when the
        model reply contains no text.
    """
    # NOTE(review): the prompt steers every query towards "Chain of Thought
    # prompting ... within a Python environment" regardless of the topic;
    # confirm this is intended.
    prompt = f"""You are an expert research assistant. Given the query: "{original_query}",
please enhance and expand it by adding relevant technical keywords, recent research context,
and details specifically related to the application of Chain of Thought prompting in large language models within a Python environment.
Provide the refined query text."""
    messages = [
        ("system", "You are a helpful research assistant specializing in AI and software research."),
        ("human", prompt)
    ]
    result = llm_groq.invoke(messages)
    # invoke() returns a message object; callers concatenate and log the query
    # as text, so hand back the reply's content rather than the object itself.
    content = getattr(result, "content", result)
    text = content if isinstance(content, str) else str(content)
    return text.strip() or original_query
|
53 |
+
|
54 |
+
def justify_candidate(candidate, query):
    """Ask the Groq chat model why a repository matches the query.

    Args:
        candidate: Repository dict; reads ``"title"``, ``"stars"`` and the
            optional ``"semantic_similarity"`` (defaults to 0).
        query: The user's research query.

    Returns:
        str: The model's justification text with surrounding whitespace
        removed.
    """
    prompt = f"""You are a highly knowledgeable AI research assistant. In one to two lines, explain why the repository titled "{candidate['title']}" is a good match for a query on "{query}". Mention key factors such as documentation quality and community validation if relevant.

Repository Details:
- Stars: {candidate['stars']}
- Semantic Similarity: {candidate.get('semantic_similarity', 0):.4f}

Provide a concise justification:"""
    messages = [
        ("system", "You are a highly knowledgeable AI research assistant that can succinctly justify repository matches."),
        ("human", prompt)
    ]
    result = llm_groq.invoke(messages)
    # Return the reply text, not the message object, so the justification can
    # be embedded in the report without the object's repr leaking into it.
    content = getattr(result, "content", result)
    text = content if isinstance(content, str) else str(content)
    return text.strip()
|
68 |
+
|
69 |
+
# ---------------------------
|
70 |
+
# GitHub API Helper Functions
|
71 |
+
# ---------------------------
|
72 |
+
def fetch_readme_content(repo_full_name, headers):
    """Download and decode a repository's README through the GitHub API.

    Args:
        repo_full_name: Repository identifier in ``owner/name`` form.
        headers: HTTP headers (including authorization) for the request.

    Returns:
        str: The README text, or ``""`` when the request fails, the status is
        not 200, or the payload cannot be decoded.
    """
    readme_url = f"https://api.github.com/repos/{repo_full_name}/readme"
    try:
        # Without a timeout a stalled connection would block the workflow.
        response = requests.get(readme_url, headers=headers, timeout=10)
    except requests.RequestException as e:
        logger.error(f"Error fetching README for {repo_full_name}: {e}")
        return ""
    if response.status_code != 200:
        return ""
    try:
        encoded = response.json().get('content') or ''
        # errors="replace": a README that is not valid UTF-8 should degrade
        # gracefully instead of aborting the whole run.
        return base64.b64decode(encoded).decode('utf-8', errors='replace')
    except (ValueError, TypeError, AttributeError) as e:
        # ValueError covers both malformed JSON and malformed base64.
        logger.error(f"Error decoding README for {repo_full_name}: {e}")
        return ""
|
79 |
+
|
80 |
+
def fetch_file_content(download_url):
    """Download a file and return its text.

    Args:
        download_url: Direct URL of the file; may be empty or ``None``.

    Returns:
        str: The response body on HTTP 200, otherwise ``""``. Request errors
        are logged and also yield ``""``.
    """
    if not download_url:
        return ""
    try:
        # Without a timeout a stalled connection would block the workflow.
        response = requests.get(download_url, timeout=10)
    except requests.RequestException as e:
        logger.error(f"Error fetching file: {e}")
        return ""
    if response.status_code == 200:
        return response.text
    return ""
|
88 |
+
|
89 |
+
def fetch_directory_markdown(repo_full_name, path, headers):
    """Concatenate the Markdown files found directly inside a repository directory.

    Args:
        repo_full_name: Repository identifier in ``owner/name`` form.
        path: Directory path inside the repository.
        headers: HTTP headers (including authorization) for the listing request.

    Returns:
        str: For every ``.md`` file in the directory, a ``# <file name>``
        heading followed by the file's content; ``""`` when the listing is
        unavailable or contains no Markdown files. Subdirectories are not
        traversed.
    """
    url = f"https://api.github.com/repos/{repo_full_name}/contents/{path}"
    try:
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as e:
        logger.error(f"Error listing {path} in {repo_full_name}: {e}")
        return ""
    if response.status_code != 200:
        return ""
    try:
        items = response.json()
    except ValueError as e:
        logger.error(f"Invalid listing for {path} in {repo_full_name}: {e}")
        return ""
    if not isinstance(items, list):
        # Only a list can be iterated as directory entries; anything else
        # (e.g. a single object) is treated as "no files".
        return ""
    sections = []
    for item in items:
        if item.get("type") == "file" and item.get("name", "").lower().endswith(".md"):
            content = fetch_file_content(item.get("download_url"))
            sections.append(f"\n\n# {item['name']}\n" + content)
    return "".join(sections)
|
100 |
+
|
101 |
+
def fetch_repo_documentation(repo_full_name, headers):
    """Collect a repository's documentation into one text blob.

    Gathers, in order: the README, every other top-level ``.md`` file, and the
    Markdown files inside top-level ``docs``/``documentation`` directories.

    Args:
        repo_full_name: Repository identifier in ``owner/name`` form.
        headers: HTTP headers (including authorization) for API requests.

    Returns:
        str: The combined documentation, or ``"No documentation available."``
        when nothing could be collected.
    """
    parts = []
    # Fetch README first.
    readme = fetch_readme_content(repo_full_name, headers)
    if readme:
        parts.append("# README\n" + readme)

    # Fetch additional markdown files and documentation directories.
    root_url = f"https://api.github.com/repos/{repo_full_name}/contents"
    items = []
    try:
        response = requests.get(root_url, headers=headers, timeout=10)
        if response.status_code == 200:
            items = response.json()
    except requests.RequestException as e:
        logger.error(f"Error listing contents of {repo_full_name}: {e}")
    except ValueError as e:
        logger.error(f"Invalid contents listing for {repo_full_name}: {e}")
    if not isinstance(items, list):
        # Only a list can be iterated as directory entries.
        items = []

    for item in items:
        name = item.get("name", "")
        kind = item.get("type")
        if kind == "file" and name.lower().endswith(".md"):
            # The README was already added above.
            if name.lower() != "readme.md":
                content = fetch_file_content(item.get("download_url"))
                parts.append(f"\n\n# {name}\n" + content)
        elif kind == "dir" and name.lower() in ["docs", "documentation"]:
            parts.append(f"\n\n# {name} folder\n" + fetch_directory_markdown(repo_full_name, name, headers))

    doc_text = "".join(parts)
    return doc_text if doc_text.strip() else "No documentation available."
|
120 |
+
|
121 |
+
def fetch_github_repositories(query, max_results=1000, per_page=100):
    """Search GitHub repositories and attach each hit's documentation.

    Results are requested sorted by stars (descending), page by page, until
    *max_results* repositories are collected, a page comes back short or
    empty, or a request fails.

    Args:
        query: GitHub search query string.
        max_results: Upper bound on the number of repositories returned.
        per_page: Page size for the search request (clamped to 1..100).

    Returns:
        list[dict]: One dict per repository with the keys ``"title"``,
        ``"link"``, ``"combined_doc"``, ``"stars"``, ``"full_name"`` and
        ``"open_issues_count"``.
    """
    url = "https://api.github.com/search/repositories"
    headers = {
        "Authorization": f"token {os.getenv('GITHUB_API_KEY')}",
        "Accept": "application/vnd.github.v3+json"
    }
    repositories = []
    if max_results <= 0:
        return repositories

    # presumably GitHub caps page size at 100 — verify against the REST docs.
    per_page = max(1, min(per_page, 100))
    # Ceiling division: floor division would request zero pages whenever
    # max_results < per_page and silently drop a trailing partial page.
    num_pages = math.ceil(max_results / per_page)

    for page in range(1, num_pages + 1):
        params = {
            "q": query,
            "sort": "stars",
            "order": "desc",
            "per_page": per_page,
            "page": page
        }
        try:
            response = requests.get(url, headers=headers, params=params, timeout=15)
        except requests.RequestException as e:
            logger.error(f"Error fetching search page {page}: {e}")
            break
        if response.status_code != 200:
            try:
                message = response.json().get('message')
            except (ValueError, AttributeError):
                # Error bodies are not guaranteed to be a JSON object.
                message = response.text
            logger.error(f"Error {response.status_code}: {message}")
            break
        try:
            items = response.json().get('items', [])
        except (ValueError, AttributeError) as e:
            logger.error(f"Invalid search response on page {page}: {e}")
            break
        if not items:
            break
        for repo in items:
            if len(repositories) >= max_results:
                break
            repo_link = repo['html_url']
            full_name = repo.get('full_name', '')
            # NOTE: this issues several extra API requests per repository.
            doc_content = fetch_repo_documentation(full_name, headers)
            star_count = repo.get('stargazers_count', 0)
            repositories.append({
                "title": repo.get('name', 'No title available'),
                "link": repo_link,
                "combined_doc": doc_content,
                "stars": star_count,
                "full_name": full_name,
                "open_issues_count": repo.get('open_issues_count', 0)
            })
        if len(repositories) >= max_results or len(items) < per_page:
            # Limit reached, or a short page means there is nothing further.
            break
    logger.info(f"Fetched {len(repositories)} repositories from GitHub.")
    return repositories
|
159 |
+
|
160 |
+
# ---------------------------
|
161 |
+
# Main Lite Workflow Function
|
162 |
+
# ---------------------------
|
163 |
+
def run_deepgit_lite(user_query):
    """Run the DeepGit Lite pipeline and return a plain-text ranking report.

    Stages: LLM query enhancement, GitHub search, dense retrieval with FAISS
    (cosine similarity on normalized embeddings), low-star filtering,
    score fusion (60% semantic similarity, 40% log-scaled stars) and LLM
    justification of the top ten repositories.

    Args:
        user_query: Research topic entered by the user.

    Returns:
        str: A report in which every entry starts with ``"Final Rank: <n>"``
        followed by one ``"Key: value"`` line per field. When the search
        returns nothing, the report contains no entries.
    """
    # Stage 0: Query Enhancement using ChatGroq
    logger.info("Enhancing query using ChatGroq...")
    original_query = user_query.strip()
    # Accept either plain text or a message object from enhance_query().
    enhanced_query = _message_text(enhance_query(original_query)).strip() or original_query
    logger.info(f"Enhanced Query: {enhanced_query}")
    # NOTE(review): the enhanced text can be long prose; presumably GitHub
    # limits search query length — confirm and trim if needed.
    github_query = enhanced_query + " language:python"
    logger.info(f"Using GitHub query: {github_query}")

    # Stage 1: Dense Retrieval with FAISS
    logger.info("Fetching repositories from GitHub...")
    repos = fetch_github_repositories(github_query)
    if not repos:
        # Everything below (embedding shape, min/max) requires at least one
        # candidate, so return an empty report instead of crashing.
        logger.info("No repositories found for the query.")
        return "\n=== Final Ranked Repositories ===\nNo repositories found.\n\n=== End of Results ==="

    docs = [repo.get("combined_doc", "") for repo in repos]
    logger.info(f"Encoding {len(docs)} documents for dense retrieval...")
    sem_model = _get_sentence_model("all-mpnet-base-v2")
    doc_embeddings = sem_model.encode(docs, convert_to_numpy=True, show_progress_bar=True, batch_size=16)

    def normalize_embeddings(embeddings):
        # Unit-length rows make the inner product equal to cosine similarity;
        # the epsilon guards against division by zero for all-zero vectors.
        norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
        return embeddings / (norms + 1e-10)

    doc_embeddings = normalize_embeddings(doc_embeddings)
    query_embedding = sem_model.encode(user_query, convert_to_numpy=True)
    query_embedding = normalize_embeddings(np.expand_dims(query_embedding, axis=0))[0]
    dim = doc_embeddings.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(doc_embeddings)
    k = min(100, doc_embeddings.shape[0])
    D, I = index.search(np.expand_dims(query_embedding, axis=0), k)
    for idx, score in zip(I[0], D[0]):
        if idx < 0:
            # Defensive: skip placeholder indices for missing neighbours.
            continue
        repos[idx]["semantic_similarity"] = float(score)
    ranked_by_semantic = sorted(repos, key=lambda x: x.get("semantic_similarity", 0), reverse=True)
    logger.info(f"Stage 1 complete: {len(ranked_by_semantic)} candidates ranked by semantic similarity.")

    # Stage 2: Filtering Low-Star Repositories
    filtered_candidates = [repo for repo in ranked_by_semantic if repo["stars"] >= 50]
    if not filtered_candidates:
        filtered_candidates = ranked_by_semantic  # fallback if filtering is too strict
    logger.info(f"Stage 2 complete: {len(filtered_candidates)} candidates remain after filtering low-star repositories.")

    # Stage 3: Combine Scores for Final Ranking (Using Semantic Similarity and Stars Only)
    semantic_scores = [repo.get("semantic_similarity", 0) for repo in filtered_candidates]
    star_scores = [math.log(repo.get("stars", 0) + 1) for repo in filtered_candidates]

    min_sem, max_sem = min(semantic_scores), max(semantic_scores)
    min_star, max_star = min(star_scores), max(star_scores)

    def normalize(val, min_val, max_val):
        # Min-max scaling; a degenerate range maps everything to the midpoint.
        if max_val - min_val == 0:
            return 0.5
        return (val - min_val) / (max_val - min_val)

    for repo, sem_score, star_score in zip(filtered_candidates, semantic_scores, star_scores):
        norm_sem = normalize(sem_score, min_sem, max_sem)
        norm_star = normalize(star_score, min_star, max_star)
        # Weights: 60% semantic, 40% stars.
        repo["final_score"] = 0.6 * norm_sem + 0.4 * norm_star

    final_ranked = sorted(filtered_candidates, key=lambda x: x["final_score"], reverse=True)
    logger.info(f"Stage 3 complete: Final ranking computed for {len(final_ranked)} candidates.")

    # Stage 4: Final Justification using ChatGroq
    top_repos = final_ranked[:10]
    justifications = {}
    for repo in top_repos:
        justification = _single_line(_message_text(justify_candidate(repo, user_query)))
        # Keyed by link: repository names are not unique across owners.
        justifications[repo['link']] = justification
        logger.info(f"Justification for {repo['title']}: {justification}")

    # Format final results into a text table. Free-text fields are collapsed
    # to one line so the "Key: value" line structure stays unambiguous.
    parts = ["\n=== Final Ranked Repositories ===\n"]
    for rank, repo in enumerate(top_repos, 1):
        parts.append(f"Final Rank: {rank}\n")
        parts.append(f"Title: {repo['title']}\n")
        parts.append(f"Link: {repo['link']}\n")
        parts.append(f"Stars: {repo['stars']}\n")
        parts.append(f"Semantic Similarity: {repo.get('semantic_similarity', 0):.4f}\n")
        parts.append(f"Final Score: {repo.get('final_score', 0):.4f}\n")
        parts.append(f"Justification: {justifications.get(repo['link'], 'No justification available')}\n")
        parts.append(f"Combined Doc Snippet: {_single_line(repo['combined_doc'][:200])}...\n")
        parts.append('-' * 80 + "\n")
    parts.append("\n=== End of Results ===")

    return "".join(parts)


# Loaded embedding models, keyed by model name (filled lazily).
_SENTENCE_MODEL_CACHE = {}


def _get_sentence_model(model_name):
    """Return a cached SentenceTransformer, loading it on first use."""
    model = _SENTENCE_MODEL_CACHE.get(model_name)
    if model is None:
        model = SentenceTransformer(model_name)
        _SENTENCE_MODEL_CACHE[model_name] = model
    return model


def _message_text(message):
    """Return the text of an LLM reply given either a str or a message object."""
    content = getattr(message, "content", message)
    return content if isinstance(content, str) else str(content)


def _single_line(text):
    """Collapse all runs of whitespace (including newlines) into single spaces."""
    return " ".join(str(text).split())
|