Spaces:
Sleeping
Sleeping
First commit
Browse files- .gitignore +5 -0
- app.py +268 -0
- chat_with_project.py +110 -0
- get_prompts.py +62 -0
- milvus.py +105 -0
- requirements.txt +8 -0
- requirements_dev.txt +143 -0
- utils/extract.py +69 -0
.gitignore
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
/workspace
|
| 3 |
+
/__pycache__
|
| 4 |
+
.env
|
| 5 |
+
/extraction
|
app.py
ADDED
|
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
import shutil
import subprocess
import sys
import zipfile

import gradio as gr
from dotenv import load_dotenv, set_key

from chat_with_project import query_project
from get_prompts import get_prompt_for_mode
from milvus import initialize_milvus, DEFAULT_MILVUS_HOST, DEFAULT_MILVUS_PORT, DEFAULT_COLLECTION_NAME, DEFAULT_DIMENSION, DEFAULT_MAX_RETRIES, DEFAULT_RETRY_DELAY
|
| 10 |
+
|
| 11 |
+
# --- Configuration and Setup ---
|
| 12 |
+
|
| 13 |
+
# Define paths for workspace and extraction directories
|
| 14 |
+
WORKSPACE_DIR = "workspace"
|
| 15 |
+
EXTRACTION_DIR = "extraction"
|
| 16 |
+
|
| 17 |
+
def clear_directories():
    """Reset the workspace and extraction directories to a pristine, empty state."""
    for target in (WORKSPACE_DIR, EXTRACTION_DIR):
        if os.path.exists(target):
            shutil.rmtree(target)
        os.makedirs(target, exist_ok=True)
|
| 23 |
+
|
| 24 |
+
# Clear directories at startup
|
| 25 |
+
clear_directories()
|
| 26 |
+
|
| 27 |
+
# --- API Key Management ---
|
| 28 |
+
|
| 29 |
+
def ensure_env_file_exists():
    """Guarantee a .env file is present in the project root, creating an empty one if needed."""
    if os.path.exists(".env"):
        return
    with open(".env", "w") as env_file:
        env_file.write("")  # zero-byte placeholder so dotenv calls never fail
|
| 34 |
+
|
| 35 |
+
def load_api_key():
    """Loads the API key from the .env file or the environment.

    Returns:
        str | None: The OPENAI_API_KEY value, or None when it is not set.
    """
    # Make sure a .env file exists so load_dotenv has something to read.
    ensure_env_file_exists()
    load_dotenv()
    return os.environ.get("OPENAI_API_KEY")
|
| 40 |
+
|
| 41 |
+
def update_api_key(api_key):
    """Persist the OpenAI API key to .env and refresh the process environment.

    Args:
        api_key (str): The key to store; must be non-empty.

    Returns:
        str: Human-readable status message for the Settings tab.
    """
    if api_key:
        set_key(".env", "OPENAI_API_KEY", api_key)
        # override=True is required: plain load_dotenv() does not replace a
        # value already present in os.environ, so without it the stale key
        # would keep being used by the current process.
        load_dotenv(override=True)
        return "API key updated successfully."
    else:
        return "API key cannot be empty."
|
| 49 |
+
|
| 50 |
+
def is_api_key_set():
    """Return True when a non-empty OpenAI API key is configured."""
    key = load_api_key()
    return bool(key)
|
| 53 |
+
|
| 54 |
+
# --- Core Functionalities ---
|
| 55 |
+
|
| 56 |
+
def process_zip(zip_file_path):
    """Extract an uploaded ZIP into the workspace and run the analysis step.

    Args:
        zip_file_path (str): Path to the uploaded ZIP archive.

    Returns:
        str: Status message describing success or the error encountered.
    """
    try:
        # Start from a clean slate so stale files never leak between uploads.
        clear_directories()

        # NOTE(security): extractall() is vulnerable to "zip slip" path
        # traversal on hostile archives; consider validating member names
        # before extraction if untrusted uploads are expected.
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            zip_ref.extractall(WORKSPACE_DIR)

        # Use the current interpreter rather than whatever "python" resolves
        # to on PATH, so the analysis runs in the same environment/venv.
        subprocess.run([sys.executable, "./utils/extract.py", WORKSPACE_DIR], check=True)

        return "Processing complete! Results saved in the 'extraction' directory."

    except Exception as e:
        # Best-effort UI: surface the failure as a status string.
        return f"An error occurred: {e}"
|
| 73 |
+
|
| 74 |
+
def init_milvus(milvus_host, milvus_port, collection_name, dimension, max_retries, retry_delay):
    """Initialize or load the Milvus vector database from the UI inputs.

    All numeric settings arrive as strings from Gradio textboxes and are
    coerced here; any failure is reported back as a status string.
    """
    try:
        port_num = int(milvus_port)
        dim = int(dimension)
        retries = int(max_retries)
        delay = int(retry_delay)

        initialize_milvus(milvus_host, port_num, collection_name, dim, retries, delay)
        return "Milvus database initialized or loaded successfully."

    except Exception as exc:
        return f"Error initializing Milvus: {exc}"
|
| 88 |
+
|
| 89 |
+
# --- Chatbot Verification ---
|
| 90 |
+
|
| 91 |
+
def is_project_loaded():
    """Check whether a project has been processed.

    A project counts as loaded when the extraction directory contains at
    least one .pkl file produced by utils/extract.py.

    Returns:
        bool: True when extraction output exists.
    """
    # Use the module-level constant (the original hard-coded "extraction"),
    # and guard against the directory having been removed, which would make
    # os.listdir raise FileNotFoundError.
    if not os.path.isdir(EXTRACTION_DIR):
        return False
    return any(name.endswith('.pkl') for name in os.listdir(EXTRACTION_DIR))
|
| 96 |
+
# --- Gradio UI Components ---
|
| 97 |
+
|
| 98 |
+
# Chat Interface
|
| 99 |
+
def chat_ui(query, history, mode):
    """Handle one chat turn for the Analyzer, Debugger, and Developer modes.

    Args:
        query (str): The user's question.
        history (list | None): Prior (user, assistant) pairs; None on first turn.
        mode (str): One of "analyzer", "debugger", "developer".

    Returns:
        tuple: (history, history) — the same list feeds both the Chatbot
        component and the state output.
        NOTE(review): the two error branches return (str, []) instead, which
        does not match the Chatbot output type — confirm Gradio tolerates this.
    """
    api_key = load_api_key()
    if not api_key:
        return "Error: OpenAI API key not set. Please set the API key in the Settings tab.", []

    if not is_project_loaded():
        return "Error: No project loaded. Please upload and process a ZIP file first.", []

    # Initialize history if None
    if history is None:
        history = []

    print(f"Chat Mode: {mode}")
    system_prompt = get_prompt_for_mode(mode)
    print(f"System Prompt: {system_prompt}")

    # Pass the query and system prompt to the LLM
    response = query_project(query, system_prompt)
    print(f"Response from query_project: {response}")

    # Fall back to a generic message rather than showing an empty bubble.
    if response is None or not response.strip():
        response = "An error occurred during processing. Please check the logs."

    if mode == "developer":
        # Developer replies embed whole files between BEGIN/END markers;
        # split them out per path for display.
        extracted_files = extract_files_from_response(response)

        # Format the output for developer mode
        developer_response = ""
        for filepath, content in extracted_files.items():
            developer_response += f"**{filepath}:**\n`python\n{content}\n`\n\n"

        history.append((query, developer_response))
        # Return history and an empty string for the text output (as it's handled by the chatbot)
        return history, history

    else:
        # Format the output for non-developer modes
        formatted_response = response.replace('\n', ' \n')  # Use two spaces for markdown line breaks
        history.append((query, formatted_response))
        # Return history and an empty string for the text output (as it's handled by the chatbot)
        return history, history
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
def extract_files_from_response(response):
    """
    Parses the LLM response to extract file paths and their corresponding code content.

    The developer prompt instructs the model to emit blocks of the form:
        --- BEGIN FILE: <filepath> ---
        <full code of the file>
        --- END FILE: <filepath> ---

    Args:
        response (str): The raw response string from the LLM.

    Returns:
        dict: A dictionary where keys are file paths and values are the code
        content of each file.
    """
    files = {}
    current_file = None
    current_content = []

    for line in response.splitlines():
        if line.startswith("--- BEGIN FILE:"):
            if current_file is not None:
                # Save previous file content (missing END marker for it).
                files[current_file] = "\n".join(current_content)

            # Start a new file. Strip the trailing "---" of the marker too:
            # the original kept it, producing paths like "app.py ---".
            current_file = line[len("--- BEGIN FILE:"):].strip().removesuffix("---").strip()
            current_content = []
        elif line.startswith("--- END FILE:"):
            if current_file is not None:
                files[current_file] = "\n".join(current_content)
            current_file = None
            current_content = []
        elif current_file is not None:
            current_content.append(line)

    # Flush a trailing file whose END marker was truncated by the LLM;
    # the original silently dropped it.
    if current_file is not None:
        files[current_file] = "\n".join(current_content)

    return files
|
| 177 |
+
|
| 178 |
+
# ZIP Processing Interface
|
| 179 |
+
# ZIP Processing Interface
zip_iface = gr.Interface(
    fn=process_zip,
    inputs=gr.File(label="Upload ZIP File"),
    outputs="text",
    title="Zip File Analyzer",
    description="Upload a zip file to analyze and store its contents.",
)

# Milvus Initialization Interface
# All numeric settings are passed as strings; init_milvus converts them.
milvus_iface = gr.Interface(
    fn=init_milvus,
    inputs=[
        gr.Textbox(label="Milvus Host", placeholder=DEFAULT_MILVUS_HOST, value=DEFAULT_MILVUS_HOST),
        gr.Textbox(label="Milvus Port", placeholder=DEFAULT_MILVUS_PORT, value=DEFAULT_MILVUS_PORT),
        gr.Textbox(label="Collection Name", placeholder=DEFAULT_COLLECTION_NAME, value=DEFAULT_COLLECTION_NAME),
        gr.Textbox(label="Dimension", placeholder=str(DEFAULT_DIMENSION), value=str(DEFAULT_DIMENSION)),
        gr.Textbox(label="Max Retries", placeholder=str(DEFAULT_MAX_RETRIES), value=str(DEFAULT_MAX_RETRIES)),
        gr.Textbox(label="Retry Delay (seconds)", placeholder=str(DEFAULT_RETRY_DELAY), value=str(DEFAULT_RETRY_DELAY))
    ],
    outputs="text",
    title="Milvus Database Initialization",
    description="Initialize or load the Milvus vector database.",
)

# Gradio Chatbot UI Interface
# chat_ui receives (query, state, mode) and returns (chatbot history, state).
chat_iface = gr.Interface(
    fn=chat_ui,
    inputs=[
        gr.Textbox(label="Ask a question", placeholder="Type your question here"),
        gr.State(),  # Maintains chat history
        gr.Radio(["analyzer", "debugger", "developer"], label="Chat Mode", value="analyzer")
    ],
    outputs=[
        gr.Chatbot(label="Chat with Project"),
        "state"  # This is to store the state,
    ],
    title="Chat with your Project",
    description="Ask questions about the data extracted from the zip file.",
    # Example usage - Corrected to only include instruction and mode
    examples=[
        ["What is this project about?", "analyzer"],
        ["Are there any potential bugs?", "debugger"],
        ["How does the data flow through the application?", "analyzer"],
        ["Explain the main components of the architecture.", "analyzer"],
        ["What are the dependencies of this project?", "analyzer"],
        ["Are there any potential memory leaks?", "debugger"],
        ["Identify any areas where the code could be optimized.","debugger"],
        ["Implement basic logging for the main application and save logs to a file.", "developer"],
        ["Use try/except blocks in main functions to handle exceptions", "developer"]

    ],
)

# Settings Interface
settings_iface = gr.Interface(
    fn=update_api_key,
    inputs=gr.Textbox(label="OpenAI API Key", type="password"),
    outputs="text",
    title="Settings",
    description="Set your OpenAI API key.",
)
| 240 |
+
|
| 241 |
+
# Status Interface
|
| 242 |
+
def get_api_key_status():
    """Produce the status line shown on the live API Key Status tab."""
    suffix = "Set" if is_api_key_set() else "Not set"
    return f"API key status: {suffix}"
|
| 247 |
+
|
| 248 |
+
# Live status tab: live=True makes Gradio re-evaluate the function.
status_iface = gr.Interface(
    fn=get_api_key_status,
    inputs=None,
    outputs="text",
    live=True,
    title="API Key Status"
)

# Add credits to the UI
# NOTE(review): this Markdown component is created but never added to the
# TabbedInterface below — confirm whether it should appear in the layout.
credits = gr.Markdown("## Credits\n\nCreated by [Ruslan Magana Vsevolodovna](https://ruslanmv.com/)")

# --- Main Application Launch ---

# Combine the interfaces using Tabs
demo = gr.TabbedInterface(
    [zip_iface, milvus_iface, chat_iface, settings_iface, status_iface],
    ["Process ZIP", "Init Milvus", "Chat with Project", "Settings", "Status"],
)

# Launch the app with credits
demo.queue().launch()
|
chat_with_project.py
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pymilvus import connections, Collection, utility
|
| 2 |
+
from sentence_transformers import SentenceTransformer
|
| 3 |
+
from langchain_openai import ChatOpenAI # Updated import
|
| 4 |
+
from langchain_core.prompts import ChatPromptTemplate
|
| 5 |
+
from langchain_core.output_parsers import StrOutputParser
|
| 6 |
+
from langchain_core.messages import HumanMessage, SystemMessage, AIMessage
|
| 7 |
+
import os
|
| 8 |
+
|
| 9 |
+
# Milvus connection details
|
| 10 |
+
MILVUS_HOST = 'localhost'
|
| 11 |
+
MILVUS_PORT = '19530'
|
| 12 |
+
COLLECTION_NAME = 'document_collection'
|
| 13 |
+
|
| 14 |
+
def load_api_key():
    """Loads the API key from the .env file or the environment.

    Returns:
        str | None: The OPENAI_API_KEY value, or None when unset.
    """
    # Function-scope import: dotenv is only needed when the key is loaded.
    from dotenv import load_dotenv
    load_dotenv()
    return os.environ.get("OPENAI_API_KEY")
|
| 19 |
+
|
| 20 |
+
# Embedding model
|
| 21 |
+
model = SentenceTransformer('all-MiniLM-L6-v2')
|
| 22 |
+
|
| 23 |
+
def retrieve_relevant_documents(query, top_k=5):
    """
    Retrieves the most relevant documents from Milvus based on the query.

    Args:
        query (str): Natural-language query to embed and search with.
        top_k (int): Maximum number of document paths to return.

    Returns:
        list[str]: Paths of the best-matching documents (empty when the
        collection does not exist).
    """
    print(f"Connecting to Milvus at {MILVUS_HOST}:{MILVUS_PORT}...")
    connections.connect(host=MILVUS_HOST, port=MILVUS_PORT)
    relevant_docs = []
    try:
        if utility.has_collection(COLLECTION_NAME):
            collection = Collection(COLLECTION_NAME)
            collection.load()

            # encode() on a one-element list yields a list of one vector,
            # which is the shape collection.search expects in `data`.
            query_vector = model.encode([query]).tolist()
            print(f"Encoded Query Vector: {query_vector}")

            search_params = {
                "metric_type": "L2",
                "params": {"nprobe": 16}
            }
            search_results = collection.search(
                data=query_vector,
                anns_field="content_vector",
                param=search_params,
                limit=top_k,
                expr=None,
                output_fields=["path"]
            )

            for hit in search_results[0]:
                relevant_docs.append(hit.entity.get("path"))

            print(f"Relevant Docs: {relevant_docs}")
        else:
            print(f"Collection {COLLECTION_NAME} does not exist.")
    finally:
        # Always release the connection. The original only disconnected on
        # the success path, leaking the connection when the collection was
        # missing or the search raised.
        connections.disconnect(alias='default')

    return relevant_docs
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def generate_response_with_gpt(query, relevant_docs, system_prompt):
    """
    Generates a response using OpenAI's GPT model, based on the query, relevant documents, and system prompt.

    Args:
        query (str): The user's question.
        relevant_docs (list[str]): Filesystem paths of context documents;
            each readable file's full text is sent to the model.
        system_prompt (str): System message steering the model's behavior.

    Returns:
        str: The model's reply, or a fallback error string if the API call fails.

    Raises:
        ValueError: If no OpenAI API key is configured.
    """
    api_key = load_api_key()
    if not api_key:
        raise ValueError("OpenAI API key not set. Please set it in the .env file or environment variables.")

    print(f"Using OpenAI API Key: {api_key[:5]}...")  # Partial key for debugging
    chat = ChatOpenAI(temperature=0.7, openai_api_key=api_key, model_name="gpt-3.5-turbo")

    messages = [SystemMessage(content=system_prompt)]
    if relevant_docs:
        # Concatenate the raw text of every readable context document into
        # a single human message preceding the query.
        doc_content = ""
        for doc_path in relevant_docs:
            if os.path.isfile(doc_path):
                try:
                    with open(doc_path, "r", encoding="utf-8") as f:
                        doc_content += f.read() + "\n"
                except Exception as e:
                    # Best-effort: skip unreadable files rather than abort the chat.
                    print(f"Error reading document {doc_path}: {e}")
        if doc_content:
            messages.append(HumanMessage(content=f"Relevant documents:\n{doc_content}"))

    messages.append(HumanMessage(content=query))
    print(f"Messages sent to OpenAI API: {messages}")

    try:
        response = chat.invoke(messages)
        print(f"OpenAI API Response: {response.content}")
        print("Type OpenAI API Response",type(response.content))
        return response.content
    except Exception as e:
        # Surface a user-readable fallback instead of propagating API failures.
        print(f"Error during OpenAI API call: {e}")
        return "Error generating response. Please try again later."
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def query_project(query, system_prompt):
    """Answer *query* via RAG: retrieve context docs from Milvus, then ask the LLM."""
    docs = retrieve_relevant_documents(query)
    print(" Starting the query:")
    print(query)
    answer = generate_response_with_gpt(query, docs, system_prompt)
    print(f"Query Response: {answer}")
    print("Type response", type(answer))
    return answer
|
get_prompts.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain.prompts import PromptTemplate
|
| 2 |
+
|
| 3 |
+
ANALYZER_PROMPT_TEMPLATE = """
|
| 4 |
+
You are a code analyzer AI. Your task is to analyze the project's structure,
|
| 5 |
+
purpose, and functionality. Explain how different components interact,
|
| 6 |
+
discuss the overall architecture, and provide insights into the project's design.
|
| 7 |
+
Consider the context provided and try to be comprehensive in your analysis.
|
| 8 |
+
|
| 9 |
+
Relevant context: {context}
|
| 10 |
+
|
| 11 |
+
Explain in detail, based on the context provided.
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
DEBUGGER_PROMPT_TEMPLATE = """
|
| 15 |
+
You are a code debugger AI. Your task is to identify potential bugs,
|
| 16 |
+
errors, and areas for improvement in the project's code. Analyze the given code
|
| 17 |
+
for logic errors, performance bottlenecks, and suggest fixes or improvements.
|
| 18 |
+
If the user asks how to fix an issue, provide the corrected code snippet.
|
| 19 |
+
|
| 20 |
+
Relevant context: {context}
|
| 21 |
+
|
| 22 |
+
Focus on identifying issues and providing solutions or improvements based on the context provided.
|
| 23 |
+
"""
|
| 24 |
+
|
| 25 |
+
DEVELOPER_PROMPT_TEMPLATE = """
|
| 26 |
+
You are a software developer AI. Your task is to modify or extend existing code based on user requests.
|
| 27 |
+
When a user asks to add a feature or modify existing functionality, you should:
|
| 28 |
+
|
| 29 |
+
1. Identify the files that need to be modified or created.
|
| 30 |
+
2. Output the full, updated code for each file that needs changes.
|
| 31 |
+
3. Clearly indicate the filename before each code block using this format:
|
| 32 |
+
```
|
| 33 |
+
--- BEGIN FILE: <filepath> ---
|
| 34 |
+
<full code of the file>
|
| 35 |
+
--- END FILE: <filepath> ---
|
| 36 |
+
```
|
| 37 |
+
4. If a new file needs to be created, use the same format and specify the new file's path and name.
|
| 38 |
+
5. **Do not omit any part of the code**. Output the entire content of each modified or new file.
|
| 39 |
+
6. Ensure that the generated code is functional, well-structured, and integrates seamlessly with the existing project.
|
| 40 |
+
7. Explain any additional setup or configuration steps if necessary.
|
| 41 |
+
|
| 42 |
+
Remember to consider the existing project's structure and coding style when making modifications.
|
| 43 |
+
|
| 44 |
+
Relevant context: {context}
|
| 45 |
+
|
| 46 |
+
User request: {question}
|
| 47 |
+
|
| 48 |
+
Modify or extend the code as requested, providing the full code for each relevant file.
|
| 49 |
+
"""
|
| 50 |
+
|
| 51 |
+
def get_prompt_for_mode(mode):
    """
    Returns the appropriate prompt template based on the selected mode.

    Raises:
        ValueError: If *mode* is not "analyzer", "debugger", or "developer".
    """
    if mode == "analyzer":
        return ANALYZER_PROMPT_TEMPLATE
    if mode == "debugger":
        return DEBUGGER_PROMPT_TEMPLATE
    if mode == "developer":
        return DEVELOPER_PROMPT_TEMPLATE
    raise ValueError(f"Invalid mode: {mode}")
|
milvus.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# milvus.py
|
| 2 |
+
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility
|
| 3 |
+
import pandas as pd
|
| 4 |
+
import os
|
| 5 |
+
import sys
|
| 6 |
+
from sentence_transformers import SentenceTransformer
|
| 7 |
+
import time
|
| 8 |
+
|
| 9 |
+
# Default Milvus connection details
|
| 10 |
+
DEFAULT_MILVUS_HOST = 'localhost'
|
| 11 |
+
DEFAULT_MILVUS_PORT = '19530'
|
| 12 |
+
DEFAULT_COLLECTION_NAME = 'document_collection'
|
| 13 |
+
DEFAULT_DIMENSION = 384 # Adjust based on your embedding model
|
| 14 |
+
DEFAULT_MAX_RETRIES = 3
|
| 15 |
+
DEFAULT_RETRY_DELAY = 5 # seconds
|
| 16 |
+
|
| 17 |
+
# Embedding model
|
| 18 |
+
model = SentenceTransformer('all-MiniLM-L6-v2')
|
| 19 |
+
|
| 20 |
+
def create_milvus_collection(host, port, collection_name, dimension):
    """
    Creates a new Milvus collection if it doesn't exist.

    NOTE(review): host/port are accepted but unused — the caller is expected
    to have already opened the default pymilvus connection. Confirm whether
    they should be dropped or used here.

    Args:
        host: Milvus host (currently unused).
        port: Milvus port (currently unused).
        collection_name (str): Name of the collection to create or reuse.
        dimension (int): Embedding vector dimension for content_vector.
    """
    if not utility.has_collection(collection_name):
        # Schema: auto-generated primary key, source path, embedding vector.
        fields = [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
            FieldSchema(name="path", dtype=DataType.VARCHAR, max_length=500),
            FieldSchema(name="content_vector", dtype=DataType.FLOAT_VECTOR, dim=dimension)
        ]
        schema = CollectionSchema(fields, "Document Vector Store")
        collection = Collection(collection_name, schema, consistency_level="Strong")

        # IVF_FLAT index with L2 metric — must match the search params used
        # elsewhere (chat_with_project.retrieve_relevant_documents uses L2).
        index_params = {
            "metric_type": "L2",
            "index_type": "IVF_FLAT",
            "params": {"nlist": 1024}
        }
        collection.create_index(field_name="content_vector", index_params=index_params)
        print(f"Collection {collection_name} created and index built.")
    else:
        print(f"Collection {collection_name} already exists.")
|
| 42 |
+
|
| 43 |
+
def load_data_to_milvus(host, port, collection_name):
    """
    Loads data from the DataFrame into Milvus, using sentence embeddings.

    NOTE(review): host/port are unused — an open default pymilvus connection
    is assumed. Only the first .pkl found in 'extraction' is loaded; the
    DataFrame is presumably produced by utils/extract.py with 'path' and
    'content' columns — confirm against that script.
    """
    extraction_dir = "extraction"
    pkl_files = [f for f in os.listdir(extraction_dir) if f.endswith('.pkl')]
    if not pkl_files:
        print("No .pkl files found in the 'extraction' directory.")
        return

    df_path = os.path.join(extraction_dir, pkl_files[0])
    df = pd.read_pickle(df_path)

    # Generate sentence embeddings
    df['content_vector'] = df['content'].apply(lambda x: model.encode(x).tolist())

    # Column order must line up with the schema (auto-id 'id' is omitted).
    data_to_insert = [
        df['path'].tolist(),
        df['content_vector'].tolist()
    ]

    collection = Collection(collection_name)
    collection.insert(data_to_insert)
    # Flush so the inserted vectors are persisted and searchable.
    collection.flush()

    print(f"Data from {df_path} loaded into Milvus collection {collection_name}.")
|
| 69 |
+
|
| 70 |
+
def connect_to_milvus(host, port, max_retries, retry_delay):
    """Connect to Milvus with retries.

    Args:
        host: Milvus host name or address.
        port: Milvus port.
        max_retries (int): Maximum number of connection attempts.
        retry_delay (int): Seconds to wait between attempts.

    Returns:
        bool: True on success, False when all attempts fail. (The original
        fell off the end and returned None when max_retries <= 0; this
        version always returns a boolean.)
    """
    for attempt in range(1, max_retries + 1):
        try:
            connections.connect(host=host, port=port)
            print(f"Successfully connected to Milvus at {host}:{port}")
            return True
        except Exception as e:
            print(f"Error connecting to Milvus: {e}")
            if attempt < max_retries:
                print(f"Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
    print("Max retries reached. Could not connect to Milvus.")
    return False
|
| 87 |
+
|
| 88 |
+
def initialize_milvus(host, port, collection_name, dimension, max_retries, retry_delay):
    """Connect to Milvus, ensure the collection exists, load the data, and disconnect."""
    # Guard clause: skip all work if the connection could not be established.
    if not connect_to_milvus(host, port, max_retries, retry_delay):
        return
    create_milvus_collection(host, port, collection_name, dimension)
    load_data_to_milvus(host, port, collection_name)
    connections.disconnect(alias='default')
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
if __name__ == "__main__":
    # Standalone entry point: initialize Milvus from the command line.
    # Use default values or environment variables if available
    milvus_host = os.environ.get('MILVUS_HOST', DEFAULT_MILVUS_HOST)
    milvus_port = os.environ.get('MILVUS_PORT', DEFAULT_MILVUS_PORT)
    collection_name = os.environ.get('COLLECTION_NAME', DEFAULT_COLLECTION_NAME)
    dimension = int(os.environ.get('DIMENSION', DEFAULT_DIMENSION))
    max_retries = int(os.environ.get('MAX_RETRIES', DEFAULT_MAX_RETRIES))
    retry_delay = int(os.environ.get('RETRY_DELAY', DEFAULT_RETRY_DELAY))

    initialize_milvus(milvus_host, milvus_port, collection_name, dimension, max_retries, retry_delay)
|
requirements.txt
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio==5.11.0
|
| 2 |
+
pymilvus==2.5.3
|
| 3 |
+
sentence-transformers==3.3.1
|
| 4 |
+
openai==1.59.5
|
| 5 |
+
langchain==0.3.14
|
| 6 |
+
python-dotenv
|
| 7 |
+
langchain-community==0.3.14
|
| 8 |
+
langchain-openai==0.2.14
|
requirements_dev.txt
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
aiofiles==23.2.1
|
| 2 |
+
aiohappyeyeballs==2.4.4
|
| 3 |
+
aiohttp==3.11.11
|
| 4 |
+
aiosignal==1.3.2
|
| 5 |
+
altair==5.5.0
|
| 6 |
+
annotated-types==0.7.0
|
| 7 |
+
anyio==4.8.0
|
| 8 |
+
asttokens==2.4.1
|
| 9 |
+
attrs==24.3.0
|
| 10 |
+
blinker==1.9.0
|
| 11 |
+
cachetools==5.5.0
|
| 12 |
+
certifi==2024.12.14
|
| 13 |
+
charset-normalizer==3.4.1
|
| 14 |
+
click==8.1.8
|
| 15 |
+
colorama==0.4.6
|
| 16 |
+
comm==0.2.2
|
| 17 |
+
dataclasses-json==0.6.7
|
| 18 |
+
debugpy==1.8.1
|
| 19 |
+
decorator==5.1.1
|
| 20 |
+
distro==1.9.0
|
| 21 |
+
executing==2.0.1
|
| 22 |
+
fastapi==0.115.6
|
| 23 |
+
ffmpy==0.5.0
|
| 24 |
+
filelock==3.16.1
|
| 25 |
+
fpdf==1.7.2
|
| 26 |
+
frozenlist==1.5.0
|
| 27 |
+
fsspec==2024.12.0
|
| 28 |
+
gitdb==4.0.12
|
| 29 |
+
GitPython==3.1.44
|
| 30 |
+
gradio==5.11.0
|
| 31 |
+
gradio_client==1.5.3
|
| 32 |
+
greenlet==3.1.1
|
| 33 |
+
grpcio==1.67.1
|
| 34 |
+
h11==0.14.0
|
| 35 |
+
httpcore==1.0.7
|
| 36 |
+
httpx==0.28.1
|
| 37 |
+
httpx-sse==0.4.0
|
| 38 |
+
huggingface-hub==0.27.1
|
| 39 |
+
idna==3.10
|
| 40 |
+
ipykernel==6.29.4
|
| 41 |
+
ipython==8.25.0
|
| 42 |
+
jedi==0.19.1
|
| 43 |
+
Jinja2==3.1.5
|
| 44 |
+
jiter==0.8.2
|
| 45 |
+
joblib==1.4.2
|
| 46 |
+
jsonpatch==1.33
|
| 47 |
+
jsonpointer==3.0.0
|
| 48 |
+
jsonschema==4.23.0
|
| 49 |
+
jsonschema-specifications==2024.10.1
|
| 50 |
+
jupyter_client==8.6.2
|
| 51 |
+
jupyter_core==5.7.2
|
| 52 |
+
langchain==0.3.14
|
| 53 |
+
langchain-community==0.3.14
|
| 54 |
+
langchain-core==0.3.29
|
| 55 |
+
langchain-openai==0.2.14
|
| 56 |
+
langchain-text-splitters==0.3.5
|
| 57 |
+
langsmith==0.2.10
|
| 58 |
+
markdown-it-py==3.0.0
|
| 59 |
+
MarkupSafe==2.1.5
|
| 60 |
+
marshmallow==3.24.2
|
| 61 |
+
matplotlib-inline==0.1.7
|
| 62 |
+
mdurl==0.1.2
|
| 63 |
+
mpmath==1.3.0
|
| 64 |
+
multidict==6.1.0
|
| 65 |
+
mypy-extensions==1.0.0
|
| 66 |
+
narwhals==1.21.1
|
| 67 |
+
nest-asyncio==1.6.0
|
| 68 |
+
networkx==3.4.2
|
| 69 |
+
numpy==2.2.1
|
| 70 |
+
openai==1.59.5
|
| 71 |
+
orjson==3.10.14
|
| 72 |
+
packaging==24.1
|
| 73 |
+
pandas==2.2.3
|
| 74 |
+
parso==0.8.4
|
| 75 |
+
pillow==11.1.0
|
| 76 |
+
platformdirs==4.2.2
|
| 77 |
+
prompt_toolkit==3.0.47
|
| 78 |
+
propcache==0.2.1
|
| 79 |
+
protobuf==5.29.3
|
| 80 |
+
psutil==6.0.0
|
| 81 |
+
pure-eval==0.2.2
|
| 82 |
+
pyarrow==18.1.0
|
| 83 |
+
pydantic==2.10.4
|
| 84 |
+
pydantic-settings==2.7.1
|
| 85 |
+
pydantic_core==2.27.2
|
| 86 |
+
pydeck==0.9.1
|
| 87 |
+
pydub==0.25.1
|
| 88 |
+
Pygments==2.18.0
|
| 89 |
+
pymilvus==2.5.3
|
| 90 |
+
python-dateutil==2.9.0.post0
|
| 91 |
+
python-dotenv==1.0.1
|
| 92 |
+
python-multipart==0.0.20
|
| 93 |
+
pytz==2024.2
|
| 94 |
+
pywin32==306
|
| 95 |
+
PyYAML==6.0.2
|
| 96 |
+
pyzmq==26.0.3
|
| 97 |
+
referencing==0.35.1
|
| 98 |
+
regex==2024.11.6
|
| 99 |
+
requests==2.32.3
|
| 100 |
+
requests-toolbelt==1.0.0
|
| 101 |
+
rich==13.9.4
|
| 102 |
+
rpds-py==0.22.3
|
| 103 |
+
ruff==0.8.6
|
| 104 |
+
safehttpx==0.1.6
|
| 105 |
+
safetensors==0.5.2
|
| 106 |
+
scikit-learn==1.6.0
|
| 107 |
+
scipy==1.15.0
|
| 108 |
+
semantic-version==2.10.0
|
| 109 |
+
sentence-transformers==3.3.1
|
| 110 |
+
setuptools==75.1.0
|
| 111 |
+
shellingham==1.5.4
|
| 112 |
+
six==1.16.0
|
| 113 |
+
smmap==5.0.2
|
| 114 |
+
sniffio==1.3.1
|
| 115 |
+
SQLAlchemy==2.0.36
|
| 116 |
+
stack-data==0.6.3
|
| 117 |
+
starlette==0.41.3
|
| 118 |
+
streamlit==1.41.1
|
| 119 |
+
streamlit-pdf-viewer==0.0.20
|
| 120 |
+
sympy==1.13.1
|
| 121 |
+
tenacity==9.0.0
|
| 122 |
+
threadpoolctl==3.5.0
|
| 123 |
+
tiktoken==0.8.0
|
| 124 |
+
tokenizers==0.21.0
|
| 125 |
+
toml==0.10.2
|
| 126 |
+
tomlkit==0.13.2
|
| 127 |
+
torch==2.5.1
|
| 128 |
+
tornado==6.4.1
|
| 129 |
+
tqdm==4.67.1
|
| 130 |
+
traitlets==5.14.3
|
| 131 |
+
transformers==4.47.1
|
| 132 |
+
typer==0.15.1
|
| 133 |
+
typing-inspect==0.9.0
|
| 134 |
+
typing_extensions==4.12.2
|
| 135 |
+
tzdata==2024.2
|
| 136 |
+
ujson==5.10.0
|
| 137 |
+
urllib3==2.3.0
|
| 138 |
+
uvicorn==0.34.0
|
| 139 |
+
watchdog==6.0.0
|
| 140 |
+
wcwidth==0.2.13
|
| 141 |
+
websockets==14.1
|
| 142 |
+
wheel==0.44.0
|
| 143 |
+
yarl==1.18.3
|
utils/extract.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
import pandas as pd
|
| 4 |
+
|
| 5 |
+
def display_and_store_directory_content(base_path):
    """
    Walk *base_path*, print every directory and file (with file content),
    and persist the collected paths/contents as a pickled DataFrame.

    Args:
        base_path (str): The root directory path to scan.

    Returns:
        None. Side effects: prints each path (and file content) to stdout
        and writes ``extraction/<basename>.pkl`` relative to the current
        working directory, where ``<basename>`` is the last component of
        *base_path*.
    """
    data = []  # Rows for the DataFrame: {"path": ..., "content": ...}

    for root, dirs, files in os.walk(base_path):
        # Directories are recorded with empty content.
        for d in dirs:
            dir_path = os.path.join(root, d)
            data.append({"path": dir_path, "content": ""})
            print(f"Directory: {dir_path}")

        # Files are recorded with their text content. Unreadable files
        # (binary data, permission errors, bad encoding) store the error
        # message as content instead of aborting the whole scan — this
        # best-effort behavior is deliberate.
        for f in files:
            file_path = os.path.join(root, f)
            try:
                with open(file_path, 'r', encoding='utf-8') as file:
                    content = file.read()
            except Exception as e:
                content = f"Error reading file: {e}"

            data.append({"path": file_path, "content": content})
            print(f"\nFile: {file_path}")
            print("-" * 40)
            print(content)
            print("-" * 40)

    # Create a DataFrame from the collected rows.
    df = pd.DataFrame(data)

    # exist_ok avoids the check-then-create race of the original
    # `if not os.path.exists(...): os.makedirs(...)` pattern.
    extraction_dir = "extraction"
    os.makedirs(extraction_dir, exist_ok=True)

    # Use the last component of the base path as the file name;
    # normpath strips a trailing separator so basename is never empty.
    base_name = os.path.basename(os.path.normpath(base_path))
    output_file = os.path.join(extraction_dir, f"{base_name}.pkl")

    # Save the DataFrame to a pickle file.
    df.to_pickle(output_file)
    print(f"\nDataFrame saved to {output_file}")
|
| 55 |
+
|
| 56 |
+
if __name__ == "__main__":
    # Require exactly one positional argument: the directory to scan.
    if len(sys.argv) < 2:
        # Use the actual invoked script path instead of the stale
        # hard-coded name "utils\extract_all_content.py" (the file is
        # utils/extract.py), which also assumed Windows separators.
        print(f"Usage: python {sys.argv[0]} <directory>")
        sys.exit(1)

    # Get the directory path from the command-line arguments.
    directory_path = sys.argv[1]

    # Execute the function only if the path exists.
    if os.path.exists(directory_path):
        display_and_store_directory_content(directory_path)
    else:
        print(f"Error: The path '{directory_path}' does not exist.")
        # Exit non-zero so calling scripts can detect the failure
        # (the original printed the error but still exited 0).
        sys.exit(1)
|