zamalali committed on
Commit
9494afe
·
1 Parent(s): 9e1349d

Add DeepGit Lite application and workflow files

Browse files
app.py ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import time
4
+ import threading
5
+ import logging
6
+ from src.deepgit_lite import run_deepgit_lite
7
+
8
+ # ---------------------------
9
+ # Global Logging Buffer Setup
10
+ # ---------------------------
11
# Formatted log lines captured from the root logger; read by the streaming UI loop.
LOG_BUFFER = []
# Guards every read and write of LOG_BUFFER.
LOG_BUFFER_LOCK = threading.Lock()


class BufferLogHandler(logging.Handler):
    """Logging handler that appends each formatted record to ``LOG_BUFFER``."""

    def emit(self, record):
        """Format ``record`` and append the resulting text to ``LOG_BUFFER``.

        A failure while formatting is handed to ``handleError`` (the standard
        ``logging.Handler`` convention) instead of propagating into the code
        that issued the log call.
        """
        try:
            log_entry = self.format(record)
        except Exception:
            self.handleError(record)
            return
        with LOG_BUFFER_LOCK:
            LOG_BUFFER.append(log_entry)


root_logger = logging.getLogger()
# Skip registration when a BufferLogHandler is already attached to the root logger.
if not any(isinstance(h, BufferLogHandler) for h in root_logger.handlers):
    handler = BufferLogHandler()
    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
    handler.setFormatter(formatter)
    root_logger.addHandler(handler)
26
+
27
def filter_logs(logs, marker="HTTP Request:", placeholder="Fetching repositories..."):
    """Collapse runs of noisy log lines into a single placeholder line.

    Args:
        logs: Iterable of log line strings, in order.
        marker: Substring that identifies a noisy line.
        placeholder: Text emitted once for each consecutive run of noisy lines.

    Returns:
        A new list in which every consecutive run of lines containing
        ``marker`` is replaced by one ``placeholder`` entry; all other lines
        are kept unchanged and in order.
    """
    filtered = []
    in_noisy_run = False
    for log in logs:
        if marker in log:
            # Only the first line of a run produces output.
            if not in_noisy_run:
                filtered.append(placeholder)
            in_noisy_run = True
        else:
            filtered.append(log)
            in_noisy_run = False
    return filtered
39
+
40
+ # ---------------------------
41
+ # Title, Favicon & Description
42
+ # ---------------------------
43
# Static HTML fragments rendered through gr.HTML in the UI block below.

# Favicon link and page title markup.
favicon_html = """
<head>
<link rel="icon" type="image/x-icon" href="file/assets/deepgit.ico">
<title>DeepGit Lite Research Agent</title>
</head>
"""

# Page banner: logo, product name and tagline. The logo carries alt text so
# screen readers and broken-image fallbacks have something to show.
title = """
<div style="text-align: center; margin-top: 20px;">
<h1 style="font-size: 36px; display: inline-flex; align-items: center; gap: 16px;">
<img src="https://img.icons8.com/?size=100&id=118557&format=png&color=000000" width="64" alt="DeepGit Lite logo" />
<span>DeepGit Lite</span>
</h1>
<p style="font-size: 18px; color: #555; margin-top: 10px;">
⚙️ A lightweight GitHub research agent for deep semantic search and ranking.
</p>
</div>
"""

# Short product description shown under the banner.
description = """<p align="center">
DeepGit Lite is a streamlined version of DeepGit designed to perform advanced semantic research on GitHub repositories with faster response times. It uses query enhancement, dense retrieval via FAISS, activity analysis, and a final multi-factor ranking (combining semantic similarity, activity, and popularity) to deliver the best results.
</p>"""

# Consent notice shown before the main UI is revealed. The external link opens
# in a new tab, so it sets rel="noopener noreferrer" to keep the opened page
# from reaching back into this one through window.opener.
consent_text = """
<div style="padding: 10px; text-align: center;">
<p>
By using DeepGit Lite, you consent to temporary processing of your query for semantic search and ranking purposes.
</p>
<p>
⭐ Star us on GitHub if you find this tool useful!<br/>
<a href="https://github.com/zamalali/DeepGit" target="_blank" rel="noopener noreferrer">GitHub</a>
</p>
</div>
"""

# Attribution line rendered at the bottom of the page.
footer = """
<div style="text-align: center; margin-top: 40px; font-size: 13px; color: #888;">
Made with <span style="color: crimson;">❤️</span> by <b>Zamal</b>
</div>
"""
83
+
84
+ # ---------------------------
85
+ # HTML Table Renderer for DeepGit Lite
86
+ # ---------------------------
87
def format_percent(value):
    """Render a fractional score as a percentage with one decimal place.

    Args:
        value: A number, or a string holding a number, e.g. ``"0.873"``.

    Returns:
        A string such as ``"87.3%"``. When ``value`` cannot be converted to a
        float it is returned unchanged, so callers can still display it.
    """
    try:
        return f"{float(value) * 100:.1f}%"
    except (TypeError, ValueError, OverflowError):
        # Narrow on purpose: only conversion failures fall back to the raw value.
        return value
92
+
93
def parse_result_to_html(raw_result: str) -> str:
    """Convert the plain-text ranking report into an HTML table.

    The report is split on the literal ``"Final Rank:"``; each following chunk
    is read line by line as ``key: value`` pairs. Only ``Title``, ``Link``,
    ``Semantic Similarity``, ``Activity Score`` and ``Final Score`` are shown.

    Args:
        raw_result: Report text in the format produced by ``run_deepgit_lite``.

    Returns:
        An HTML string containing a ``<style>`` block and one ``<table>``. The
        table body is empty when the text contains no ``"Final Rank:"`` entry.
    """
    # Local import: ``html`` is not imported at file level.
    from html import escape

    table_open = """
<style>
table {
width: 100%;
border-collapse: collapse;
margin: 1em 0;
font-size: 14px;
}
th, td {
padding: 12px 15px;
border: 1px solid #ddd;
text-align: left;
vertical-align: top;
}
th {
background-color: #f4f4f4;
}
tr:hover { background-color: #f9f9f9; }
</style>
<table>
<thead>
<tr>
<th>Rank</th>
<th>Title</th>
<th>Link</th>
<th>Semantic Similarity</th>
<th>Activity Score</th>
<th>Final Score</th>
</tr>
</thead>
<tbody>
"""
    rows = []
    entries = raw_result.strip().split("Final Rank:")
    for entry in entries[1:]:
        lines = entry.strip().split("\n")
        data = {"Final Rank": lines[0].strip()}
        for line in lines[1:]:
            if ": " in line:
                key, val = line.split(": ", 1)
                # Keep the first value per key: the structured fields come
                # first, and free text further down (justification, README
                # snippet) may contain look-alike "Title: ..." lines.
                data.setdefault(key.strip(), val.strip())

        link = data.get("Link", "#")
        # Only plain web links are allowed into the href attribute.
        if not link.startswith(("http://", "https://")):
            link = "#"

        # NOTE: run_deepgit_lite does not emit an "Activity Score" line, so
        # this column shows the 0.00 default for its reports.
        activity_raw = data.get("Activity Score", 0)
        try:
            activity = f"{float(activity_raw):.2f}"
        except ValueError:
            # Show the raw text rather than aborting the whole table.
            activity = escape(str(activity_raw))

        rows.append(f"""
<tr>
<td>{escape(data.get('Final Rank', ''))}</td>
<td>{escape(data.get('Title', ''))}</td>
<td><a href="{escape(link, quote=True)}" target="_blank" rel="noopener noreferrer">GitHub</a></td>
<td>{escape(str(format_percent(data.get('Semantic Similarity', ''))))}</td>
<td>{activity}</td>
<td>{escape(str(format_percent(data.get('Final Score', ''))))}</td>
</tr>
""")
    return table_open + "".join(rows) + "</tbody></table>"
147
+
148
+ # ---------------------------
149
+ # Background Workflow Runner for DeepGit Lite
150
+ # ---------------------------
151
def run_lite_workflow(topic, result_container):
    """Run the DeepGit Lite pipeline and store its report for the caller.

    Intended as a ``threading.Thread`` target, so the outcome is written into
    ``result_container`` instead of being returned.

    Args:
        topic: The research topic passed to ``run_deepgit_lite``.
        result_container: Mutable dict; ``"raw_result"`` is set to the report
            text, or to a short failure message if the pipeline raised.
    """
    try:
        result_container["raw_result"] = run_deepgit_lite(topic)
    except Exception as exc:
        # Thread boundary: nothing above this frame can handle the error, so
        # log it with its traceback and leave a message for the reader of
        # result_container.
        logging.getLogger(__name__).exception("DeepGit Lite workflow failed")
        result_container["raw_result"] = f"DeepGit Lite failed: {exc}"
154
+
155
def stream_lite_workflow(topic):
    """Run the workflow in a background thread and stream its log output.

    Yields ``(status, detail_html)`` tuples: one per batch of new log lines
    while the worker runs, then a final ``("", html)`` with the rendered
    result.

    Args:
        topic: The research topic to pass to the workflow.

    Yields:
        Tuples of (latest filtered log line, HTML for the detail pane).
    """
    # Local import: ``html`` is not imported at file level.
    from html import escape

    # NOTE: LOG_BUFFER is module-global, so clearing it here also affects any
    # other run that is streaming from it at the same time.
    with LOG_BUFFER_LOCK:
        LOG_BUFFER.clear()
    result_container = {}
    workflow_thread = threading.Thread(target=run_lite_workflow, args=(topic, result_container))
    workflow_thread.start()

    last_index = 0
    while True:
        # Sample liveness before draining: if the worker had already finished
        # here, everything it logged is in the buffer and is picked up below.
        worker_running = workflow_thread.is_alive()
        with LOG_BUFFER_LOCK:
            new_logs = LOG_BUFFER[last_index:]
            last_index = len(LOG_BUFFER)
        if new_logs:
            filtered_logs = filter_logs(new_logs)
            status_msg = filtered_logs[-1]
            # Log text can contain arbitrary characters; escape it before it
            # is rendered as HTML.
            detail_msg = "<br/>".join(escape(line) for line in filtered_logs)
            yield status_msg, detail_msg
        if not worker_running:
            break
        time.sleep(0.5)

    workflow_thread.join()
    raw_result = result_container.get("raw_result", "No results returned.")
    if isinstance(raw_result, str) and "Final Rank:" in raw_result:
        html_result = parse_result_to_html(raw_result)
    else:
        # Nothing to tabulate: show the message instead of an empty table.
        html_result = f"<p>{escape(str(raw_result))}</p>"
    yield "", html_result
181
+
182
+ # ---------------------------
183
+ # App UI Setup for DeepGit Lite
184
+ # ---------------------------
185
with gr.Blocks(
    theme="gstaff/sketch",
    css="""
#main_container { margin: auto; max-width: 900px; }
footer, footer * { display: none !important; }
"""
) as demo:

    # Static header content.
    gr.HTML(favicon_html)
    gr.HTML(title)
    gr.HTML(description)

    # Consent gate: the only block visible on first load.
    with gr.Column(elem_id="user_consent_container") as consent_block:
        gr.HTML(consent_text)
        agree_button = gr.Button("I Agree", variant="primary")

    # Main research UI, hidden until the user agrees.
    with gr.Column(elem_id="main_container", visible=False) as main_block:
        research_input = gr.Textbox(
            label="Research Topic",
            placeholder="Enter your research topic here, e.g., 'Instruction-based fine-tuning for LLaMA 2 using chain-of-thought prompting in Python.'",
            lines=3
        )
        run_button = gr.Button("Run DeepGit Lite", variant="primary")
        status_display = gr.Markdown("")
        detail_display = gr.HTML("")
        output_html = gr.HTML("")
        state = gr.State([])

    def enable_main():
        """Hide the consent block and reveal the main block."""
        return gr.update(visible=False), gr.update(visible=True)

    agree_button.click(fn=enable_main, inputs=[], outputs=[consent_block, main_block], queue=False)

    def lite_runner(topic):
        """Relay every (status, details) pair produced by stream_lite_workflow."""
        yield from stream_lite_workflow(topic)

    # The button click and the textbox submit share one wiring; only the
    # API endpoint name differs.
    stream_event = dict(
        fn=lite_runner,
        inputs=[research_input],
        outputs=[status_display, detail_display],
        show_progress=True,
    )
    run_button.click(api_name="deepgit_lite", **stream_event)
    research_input.submit(api_name="deepgit_lite_submit", **stream_event)

    gr.HTML(footer)

demo.queue(max_size=10).launch()
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ requests==2.32.3
2
+ numpy==1.25.2
3
+ python-dotenv==1.0.1
4
+ sentence-transformers==3.4.1
5
+ faiss-cpu==1.9.0.post1
6
+ gradio==5.23.1
7
+ langgraph==0.2.62
8
+ langchain_groq==0.2.4
9
+ langchain_core==0.3.47
src/__init__.py ADDED
File without changes
src/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (215 Bytes). View file
 
src/__pycache__/deepgit_lite.cpython-311.pyc ADDED
Binary file (15.9 kB). View file
 
src/deepgit_lite.py ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import base64
3
+ import requests
4
+ import numpy as np
5
+ import datetime
6
+ from sentence_transformers import SentenceTransformer
7
+ import faiss
8
+ import getpass
9
+ import math
10
+ import logging
11
+ from dotenv import load_dotenv
12
+ from pathlib import Path
13
+ from langchain_groq import ChatGroq
14
+ from langchain_core.prompts import ChatPromptTemplate
15
+
16
+ # ---------------------------
17
+ # Environment and .env Setup
18
+ # ---------------------------
19
# The .env file is looked up two directory levels above this module's file.
dotenv_path = Path(__file__).resolve().parents[1] / ".env"
load_dotenv(dotenv_path=str(dotenv_path))

# Ask for the token interactively only when the environment does not have it.
if "GITHUB_API_KEY" not in os.environ:
    os.environ["GITHUB_API_KEY"] = getpass.getpass("Enter your GitHub API key: ")

# ---------------------------
# Logging Setup
# ---------------------------
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)

# ---------------------------
# ChatGroq Integration Setup (for query enhancement and final justification)
# ---------------------------
# One shared client, used by both enhance_query and justify_candidate.
llm_groq = ChatGroq(
    model="llama-3.1-8b-instant",
    temperature=0.2,
    max_tokens=100,
    timeout=15,
    max_retries=2,
)
41
+
42
def enhance_query(original_query):
    """Ask the LLM to expand a research query and return the refined text.

    Args:
        original_query: The user's query text.

    Returns:
        The model's reply as a plain string. The caller appends a search
        qualifier to it, so the text is returned rather than the message
        object that ``invoke`` produces.
    """
    # NOTE(review): the prompt always steers towards Chain of Thought
    # prompting in Python regardless of the query topic - confirm this is
    # intended for general use.
    prompt = f"""You are an expert research assistant. Given the query: "{original_query}",
please enhance and expand it by adding relevant technical keywords, recent research context,
and details specifically related to the application of Chain of Thought prompting in large language models within a Python environment.
Provide the refined query text."""
    messages = [
        ("system", "You are a helpful research assistant specializing in AI and software research."),
        ("human", prompt)
    ]
    result = llm_groq.invoke(messages)
    # Fall back to the object itself if it has no ``content`` attribute.
    content = getattr(result, "content", result)
    return content if isinstance(content, str) else str(content)
53
+
54
def justify_candidate(candidate, query):
    """Ask the LLM for a short justification of why a repository fits a query.

    Args:
        candidate: Repository dict; ``"title"`` and ``"stars"`` are required,
            ``"semantic_similarity"`` is optional and defaults to 0.
        query: The query the repository is being matched against.

    Returns:
        The model's reply as a plain string, so it can be embedded in the
        text report without the message object's repr leaking into it.
    """
    prompt = f"""You are a highly knowledgeable AI research assistant. In one to two lines, explain why the repository titled "{candidate['title']}" is a good match for a query on "{query}". Mention key factors such as documentation quality and community validation if relevant.

Repository Details:
- Stars: {candidate['stars']}
- Semantic Similarity: {candidate.get('semantic_similarity', 0):.4f}

Provide a concise justification:"""
    messages = [
        ("system", "You are a highly knowledgeable AI research assistant that can succinctly justify repository matches."),
        ("human", prompt)
    ]
    result = llm_groq.invoke(messages)
    # Fall back to the object itself if it has no ``content`` attribute.
    content = getattr(result, "content", result)
    return content if isinstance(content, str) else str(content)
68
+
69
+ # ---------------------------
70
+ # GitHub API Helper Functions
71
+ # ---------------------------
72
def fetch_readme_content(repo_full_name, headers, timeout=10):
    """Return the decoded README text of a repository.

    Args:
        repo_full_name: Repository in ``owner/name`` form.
        headers: HTTP headers sent with the GitHub API request.
        timeout: Seconds to wait for the request before giving up.

    Returns:
        The README as text, or ``""`` when the request fails, the status is
        not 200, or the payload cannot be decoded.
    """
    readme_url = f"https://api.github.com/repos/{repo_full_name}/readme"
    try:
        response = requests.get(readme_url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        logger.error(f"Error fetching README for {repo_full_name}: {e}")
        return ""
    if response.status_code != 200:
        return ""
    try:
        readme_data = response.json()
        raw = base64.b64decode(readme_data.get('content', ''))
    except (ValueError, TypeError, AttributeError) as e:
        # Malformed JSON, unexpected payload shape, or invalid base64.
        logger.error(f"Error decoding README for {repo_full_name}: {e}")
        return ""
    # A README that is not valid UTF-8 should not abort the whole fetch.
    return raw.decode('utf-8', errors='replace')
79
+
80
def fetch_file_content(download_url, timeout=10):
    """Download a file and return its text.

    Args:
        download_url: Direct URL of the file.
        timeout: Seconds to wait for the request before giving up.

    Returns:
        The response body as text, or ``""`` when the request fails or the
        status is not 200.
    """
    try:
        response = requests.get(download_url, timeout=timeout)
    except requests.RequestException as e:
        logger.error(f"Error fetching file: {e}")
        return ""
    if response.status_code == 200:
        return response.text
    return ""
88
+
89
def fetch_directory_markdown(repo_full_name, path, headers, timeout=10):
    """Concatenate the Markdown files found directly inside a repository folder.

    Args:
        repo_full_name: Repository in ``owner/name`` form.
        path: Folder path inside the repository.
        headers: HTTP headers sent with the GitHub API request.
        timeout: Seconds to wait for the listing request before giving up.

    Returns:
        One string with a ``# <file name>`` heading followed by the content of
        each ``.md`` file, or ``""`` when the listing cannot be fetched.
    """
    url = f"https://api.github.com/repos/{repo_full_name}/contents/{path}"
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        logger.error(f"Error listing {path} in {repo_full_name}: {e}")
        return ""
    if response.status_code != 200:
        return ""
    try:
        items = response.json()
    except ValueError:
        return ""
    # Only a list payload is iterated as directory entries.
    if not isinstance(items, list):
        return ""

    sections = []
    for item in items:
        if item.get("type") != "file":
            continue
        name = item.get("name", "")
        download_url = item.get("download_url")
        if name.lower().endswith(".md") and download_url:
            sections.append(f"\n\n# {name}\n" + fetch_file_content(download_url))
    return "".join(sections)
100
+
101
def fetch_repo_documentation(repo_full_name, headers, timeout=10):
    """Collect a repository's README, root Markdown files and docs folders.

    Args:
        repo_full_name: Repository in ``owner/name`` form.
        headers: HTTP headers sent with the GitHub API requests.
        timeout: Seconds to wait for the root listing request before giving up.

    Returns:
        The combined documentation text, or ``"No documentation available."``
        when nothing could be collected.
    """
    parts = []
    # Fetch README first.
    readme = fetch_readme_content(repo_full_name, headers)
    if readme:
        parts.append("# README\n" + readme)

    # Fetch additional markdown files and documentation directories.
    root_url = f"https://api.github.com/repos/{repo_full_name}/contents"
    items = []
    try:
        response = requests.get(root_url, headers=headers, timeout=timeout)
        if response.status_code == 200:
            payload = response.json()
            if isinstance(payload, list):
                items = payload
    except (requests.RequestException, ValueError) as e:
        # Keep whatever was gathered so far; the README alone is still useful.
        logger.error(f"Error listing root of {repo_full_name}: {e}")

    for item in items:
        name = item.get("name", "")
        lowered = name.lower()
        kind = item.get("type")
        if kind == "file" and lowered.endswith(".md"):
            # The README was already added above.
            if lowered != "readme.md" and item.get("download_url"):
                content = fetch_file_content(item["download_url"])
                parts.append(f"\n\n# {name}\n" + content)
        elif kind == "dir" and lowered in ["docs", "documentation"]:
            parts.append(
                f"\n\n# {name} folder\n"
                + fetch_directory_markdown(repo_full_name, name, headers)
            )

    doc_text = "".join(parts)
    return doc_text if doc_text.strip() else "No documentation available."
120
+
121
def fetch_github_repositories(query, max_results=1000, per_page=100, timeout=15):
    """Search GitHub repositories and attach each one's documentation text.

    Results are requested sorted by stars, descending, one page at a time.

    Args:
        query: GitHub repository search query string.
        max_results: Upper bound on the number of repositories returned.
        per_page: Page size requested from the search API.
        timeout: Seconds to wait for each search request before giving up.

    Returns:
        A list of dicts with the keys ``title``, ``link``, ``combined_doc``,
        ``stars``, ``full_name`` and ``open_issues_count``. The list is empty
        when the first request fails or returns no items.
    """
    if max_results <= 0 or per_page <= 0:
        return []

    url = "https://api.github.com/search/repositories"
    headers = {
        "Authorization": f"token {os.getenv('GITHUB_API_KEY')}",
        "Accept": "application/vnd.github.v3+json"
    }
    repositories = []
    # Round up so a final partial page is not dropped.
    num_pages = math.ceil(max_results / per_page)
    for page in range(1, num_pages + 1):
        params = {
            "q": query,
            "sort": "stars",
            "order": "desc",
            "per_page": per_page,
            "page": page
        }
        try:
            response = requests.get(url, headers=headers, params=params, timeout=timeout)
        except requests.RequestException as e:
            logger.error(f"Error querying GitHub search: {e}")
            break
        try:
            payload = response.json()
        except ValueError:
            # Non-JSON body, for example an HTML error page.
            payload = {}
        if response.status_code != 200:
            message = payload.get('message') if isinstance(payload, dict) else None
            logger.error(f"Error {response.status_code}: {message}")
            break
        items = payload.get('items', []) if isinstance(payload, dict) else []
        if not items:
            break
        for repo in items:
            if len(repositories) >= max_results:
                break
            full_name = repo.get('full_name', '')
            repositories.append({
                "title": repo.get('name', 'No title available'),
                "link": repo['html_url'],
                "combined_doc": fetch_repo_documentation(full_name, headers),
                "stars": repo.get('stargazers_count', 0),
                "full_name": full_name,
                "open_issues_count": repo.get('open_issues_count', 0)
            })
        if len(repositories) >= max_results:
            break
    logger.info(f"Fetched {len(repositories)} repositories from GitHub.")
    return repositories
159
+
160
+ # ---------------------------
161
+ # Main Lite Workflow Function
162
+ # ---------------------------
163
def run_deepgit_lite(user_query):
    """Run the DeepGit Lite pipeline and return a plain-text ranking report.

    Stages:
        0. Expand the query with the LLM and append ``language:python``.
        1. Fetch repositories, embed their documentation, and score each one
           by inner product of normalised embeddings against the user query.
        2. Keep repositories with at least 50 stars (all of them if none
           qualify).
        3. Min-max normalise semantic similarity and log-stars, then combine
           them as 60% semantic + 40% stars.
        4. Ask the LLM for a short justification of each of the top 10.

    Args:
        user_query: The research topic entered by the user.

    Returns:
        A text report with one ``Final Rank: N`` section per repository (at
        most 10), one field per line. When no repositories were fetched the
        report contains a "no repositories" line instead of sections.
    """
    # Stage 0: Query Enhancement using ChatGroq
    logger.info("Enhancing query using ChatGroq...")
    original_query = user_query.strip()
    enhanced_query = enhance_query(original_query)
    # The enhancer may hand back a chat message object; use its text content.
    enhanced_query = str(getattr(enhanced_query, "content", enhanced_query)).strip()
    logger.info(f"Enhanced Query: {enhanced_query}")
    github_query = enhanced_query + " language:python"
    logger.info(f"Using GitHub query: {github_query}")

    # Stage 1: Dense Retrieval with FAISS
    logger.info("Fetching repositories from GitHub...")
    repos = fetch_github_repositories(github_query)
    if not repos:
        # Nothing to embed or rank; the stages below assume a non-empty list.
        logger.warning("No repositories returned from GitHub; nothing to rank.")
        return (
            "\n=== Final Ranked Repositories ===\n"
            "No repositories found for this query.\n"
            "\n=== End of Results ==="
        )

    docs = [repo.get("combined_doc", "") for repo in repos]
    logger.info(f"Encoding {len(docs)} documents for dense retrieval...")
    # NOTE: the embedding model is constructed on every call.
    sem_model = SentenceTransformer("all-mpnet-base-v2")
    doc_embeddings = sem_model.encode(docs, convert_to_numpy=True, show_progress_bar=True, batch_size=16)

    def normalize_embeddings(embeddings):
        """Scale each row to unit length (epsilon avoids division by zero)."""
        norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
        return embeddings / (norms + 1e-10)

    # The FAISS index is fed contiguous float32 arrays.
    doc_embeddings = np.ascontiguousarray(normalize_embeddings(doc_embeddings), dtype=np.float32)
    query_embedding = sem_model.encode(user_query, convert_to_numpy=True)
    query_embedding = normalize_embeddings(np.expand_dims(query_embedding, axis=0))
    query_embedding = np.ascontiguousarray(query_embedding, dtype=np.float32)

    dim = doc_embeddings.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(doc_embeddings)
    k = min(100, doc_embeddings.shape[0])
    D, I = index.search(query_embedding, k)
    for idx, score in zip(I[0], D[0]):
        if idx < 0:
            continue
        # Store a plain Python float rather than a NumPy scalar.
        repos[idx]["semantic_similarity"] = float(score)
    ranked_by_semantic = sorted(repos, key=lambda x: x.get("semantic_similarity", 0), reverse=True)
    logger.info(f"Stage 1 complete: {len(ranked_by_semantic)} candidates ranked by semantic similarity.")

    # Stage 2: Filtering Low-Star Repositories
    filtered_candidates = [repo for repo in ranked_by_semantic if repo["stars"] >= 50]
    if not filtered_candidates:
        filtered_candidates = ranked_by_semantic  # fallback if filtering is too strict
    logger.info(f"Stage 2 complete: {len(filtered_candidates)} candidates remain after filtering low-star repositories.")

    # Stage 3: Combine Scores for Final Ranking (Using Semantic Similarity and Stars Only)
    semantic_scores = [repo.get("semantic_similarity", 0) for repo in filtered_candidates]
    star_scores = [math.log(repo.get("stars", 0) + 1) for repo in filtered_candidates]

    min_sem, max_sem = min(semantic_scores), max(semantic_scores)
    min_star, max_star = min(star_scores), max(star_scores)

    def normalize(val, min_val, max_val):
        """Min-max scale ``val``; a degenerate range maps to the midpoint."""
        if max_val - min_val == 0:
            return 0.5
        return (val - min_val) / (max_val - min_val)

    for repo, sem, star in zip(filtered_candidates, semantic_scores, star_scores):
        norm_sem = normalize(sem, min_sem, max_sem)
        norm_star = normalize(star, min_star, max_star)
        # Weights: 60% semantic, 40% stars.
        repo["final_score"] = 0.6 * norm_sem + 0.4 * norm_star

    final_ranked = sorted(filtered_candidates, key=lambda x: x["final_score"], reverse=True)
    logger.info(f"Stage 3 complete: Final ranking computed for {len(final_ranked)} candidates.")

    # Stage 4: Final Justification using ChatGroq
    top_ranked = final_ranked[:10]
    justifications = {}
    for repo in top_ranked:
        justification = justify_candidate(repo, user_query)
        # Use the text content, and keep it on one line because the report
        # format carries one field per line.
        justification = getattr(justification, "content", justification)
        justification = " ".join(str(justification).split())
        # Key by link: different owners can publish repositories with the
        # same title.
        justifications[repo['link']] = justification
        logger.info(f"Justification for {repo['title']}: {justification}")

    # Format final results into a text table.
    report = ["\n=== Final Ranked Repositories ===\n"]
    for rank, repo in enumerate(top_ranked, 1):
        # Collapse whitespace so README line breaks cannot masquerade as
        # additional "key: value" fields in the report.
        snippet = " ".join(repo['combined_doc'][:200].split())
        report.append(f"Final Rank: {rank}\n")
        report.append(f"Title: {repo['title']}\n")
        report.append(f"Link: {repo['link']}\n")
        report.append(f"Stars: {repo['stars']}\n")
        report.append(f"Semantic Similarity: {repo.get('semantic_similarity', 0):.4f}\n")
        report.append(f"Final Score: {repo.get('final_score', 0):.4f}\n")
        report.append(f"Justification: {justifications.get(repo['link'], 'No justification available')}\n")
        report.append(f"Combined Doc Snippet: {snippet}...\n")
        report.append('-' * 80 + "\n")
    report.append("\n=== End of Results ===")

    return "".join(report)