Spaces:
Sleeping
Sleeping
First commit
Browse files- .gitignore +5 -0
- app.py +268 -0
- chat_with_project.py +110 -0
- get_prompts.py +62 -0
- milvus.py +105 -0
- requirements.txt +8 -0
- requirements_dev.txt +143 -0
- utils/extract.py +69 -0
.gitignore
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
/workspace
|
| 3 |
+
/__pycache__
|
| 4 |
+
.env
|
| 5 |
+
/extraction
|
app.py
ADDED
|
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
import shutil
import subprocess
import sys
import zipfile

import gradio as gr
from dotenv import load_dotenv, set_key

from chat_with_project import query_project
from get_prompts import get_prompt_for_mode
from milvus import initialize_milvus, DEFAULT_MILVUS_HOST, DEFAULT_MILVUS_PORT, DEFAULT_COLLECTION_NAME, DEFAULT_DIMENSION, DEFAULT_MAX_RETRIES, DEFAULT_RETRY_DELAY
|
| 10 |
+
|
| 11 |
+
# --- Configuration and Setup ---
|
| 12 |
+
|
| 13 |
+
# Define paths for workspace and extraction directories
|
| 14 |
+
WORKSPACE_DIR = "workspace"
|
| 15 |
+
EXTRACTION_DIR = "extraction"
|
| 16 |
+
|
| 17 |
+
def clear_directories():
    """Reset the workspace and extraction directories to a pristine, empty state."""
    for target in (WORKSPACE_DIR, EXTRACTION_DIR):
        if os.path.exists(target):
            shutil.rmtree(target)
        os.makedirs(target, exist_ok=True)
|
| 23 |
+
|
| 24 |
+
# Clear directories at startup
|
| 25 |
+
clear_directories()
|
| 26 |
+
|
| 27 |
+
# --- API Key Management ---
|
| 28 |
+
|
| 29 |
+
def ensure_env_file_exists():
    """Guarantee a .env file is present in the project root, creating an empty one if needed."""
    if os.path.exists(".env"):
        return
    with open(".env", "w") as env_file:
        env_file.write("")  # zero-byte placeholder so dotenv calls never fail
|
| 34 |
+
|
| 35 |
+
def load_api_key():
    """Loads the API key from the .env file or the environment.

    Returns:
        str | None: The OPENAI_API_KEY value, or None when it is not set.
    """
    # Make sure a .env file exists so load_dotenv has something to read.
    ensure_env_file_exists()
    load_dotenv()
    return os.environ.get("OPENAI_API_KEY")
|
| 40 |
+
|
| 41 |
+
def update_api_key(api_key):
    """Persist the OpenAI API key to .env and refresh the process environment.

    Args:
        api_key (str): The key to store; must be non-empty.

    Returns:
        str: Human-readable status message for the Settings tab.
    """
    if api_key:
        set_key(".env", "OPENAI_API_KEY", api_key)
        # override=True is required: plain load_dotenv() does not replace a
        # value already present in os.environ, so without it the stale key
        # would keep being used by the current process.
        load_dotenv(override=True)
        return "API key updated successfully."
    else:
        return "API key cannot be empty."
|
| 49 |
+
|
| 50 |
+
def is_api_key_set():
    """Return True when a non-empty OpenAI API key is configured."""
    key = load_api_key()
    return bool(key)
|
| 53 |
+
|
| 54 |
+
# --- Core Functionalities ---
|
| 55 |
+
|
| 56 |
+
def process_zip(zip_file_path):
    """Extract an uploaded ZIP into the workspace and run the analysis step.

    Args:
        zip_file_path (str): Path to the uploaded ZIP archive.

    Returns:
        str: Status message describing success or the error encountered.
    """
    try:
        # Start from a clean slate so stale files never leak between uploads.
        clear_directories()

        # NOTE(security): extractall() is vulnerable to "zip slip" path
        # traversal on hostile archives; consider validating member names
        # before extraction if untrusted uploads are expected.
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            zip_ref.extractall(WORKSPACE_DIR)

        # Use the current interpreter rather than whatever "python" resolves
        # to on PATH, so the analysis runs in the same environment/venv.
        subprocess.run([sys.executable, "./utils/extract.py", WORKSPACE_DIR], check=True)

        return "Processing complete! Results saved in the 'extraction' directory."

    except Exception as e:
        # Best-effort UI: surface the failure as a status string.
        return f"An error occurred: {e}"
|
| 73 |
+
|
| 74 |
+
def init_milvus(milvus_host, milvus_port, collection_name, dimension, max_retries, retry_delay):
    """Initialize or load the Milvus vector database from the UI inputs.

    All numeric settings arrive as strings from Gradio textboxes and are
    coerced here; any failure is reported back as a status string.
    """
    try:
        port_num = int(milvus_port)
        dim = int(dimension)
        retries = int(max_retries)
        delay = int(retry_delay)

        initialize_milvus(milvus_host, port_num, collection_name, dim, retries, delay)
        return "Milvus database initialized or loaded successfully."

    except Exception as exc:
        return f"Error initializing Milvus: {exc}"
|
| 88 |
+
|
| 89 |
+
# --- Chatbot Verification ---
|
| 90 |
+
|
| 91 |
+
def is_project_loaded():
    """Check whether a project has been processed.

    A project counts as loaded when the extraction directory contains at
    least one .pkl file produced by utils/extract.py.

    Returns:
        bool: True when extraction output exists.
    """
    # Use the module-level constant (the original hard-coded "extraction"),
    # and guard against the directory having been removed, which would make
    # os.listdir raise FileNotFoundError.
    if not os.path.isdir(EXTRACTION_DIR):
        return False
    return any(name.endswith('.pkl') for name in os.listdir(EXTRACTION_DIR))
|
| 96 |
+
# --- Gradio UI Components ---
|
| 97 |
+
|
| 98 |
+
# Chat Interface
|
| 99 |
+
def chat_ui(query, history, mode):
    """Handle one chat turn for the Analyzer, Debugger, and Developer modes.

    Args:
        query (str): The user's question.
        history (list | None): Prior (user, assistant) pairs; None on first turn.
        mode (str): One of "analyzer", "debugger", "developer".

    Returns:
        tuple: (history, history) — the same list feeds both the Chatbot
        component and the state output.
        NOTE(review): the two error branches return (str, []) instead, which
        does not match the Chatbot output type — confirm Gradio tolerates this.
    """
    api_key = load_api_key()
    if not api_key:
        return "Error: OpenAI API key not set. Please set the API key in the Settings tab.", []

    if not is_project_loaded():
        return "Error: No project loaded. Please upload and process a ZIP file first.", []

    # Initialize history if None
    if history is None:
        history = []

    print(f"Chat Mode: {mode}")
    system_prompt = get_prompt_for_mode(mode)
    print(f"System Prompt: {system_prompt}")

    # Pass the query and system prompt to the LLM
    response = query_project(query, system_prompt)
    print(f"Response from query_project: {response}")

    # Fall back to a generic message rather than showing an empty bubble.
    if response is None or not response.strip():
        response = "An error occurred during processing. Please check the logs."

    if mode == "developer":
        # Developer replies embed whole files between BEGIN/END markers;
        # split them out per path for display.
        extracted_files = extract_files_from_response(response)

        # Format the output for developer mode
        developer_response = ""
        for filepath, content in extracted_files.items():
            developer_response += f"**{filepath}:**\n`python\n{content}\n`\n\n"

        history.append((query, developer_response))
        # Return history and an empty string for the text output (as it's handled by the chatbot)
        return history, history

    else:
        # Format the output for non-developer modes
        formatted_response = response.replace('\n', ' \n')  # Use two spaces for markdown line breaks
        history.append((query, formatted_response))
        # Return history and an empty string for the text output (as it's handled by the chatbot)
        return history, history
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
def extract_files_from_response(response):
    """
    Parses the LLM response to extract file paths and their corresponding code content.

    The developer prompt instructs the model to emit blocks of the form:
        --- BEGIN FILE: <filepath> ---
        <full code of the file>
        --- END FILE: <filepath> ---

    Args:
        response (str): The raw response string from the LLM.

    Returns:
        dict: A dictionary where keys are file paths and values are the code
        content of each file.
    """
    files = {}
    current_file = None
    current_content = []

    for line in response.splitlines():
        if line.startswith("--- BEGIN FILE:"):
            if current_file is not None:
                # Save previous file content (missing END marker for it).
                files[current_file] = "\n".join(current_content)

            # Start a new file. Strip the trailing "---" of the marker too:
            # the original kept it, producing paths like "app.py ---".
            current_file = line[len("--- BEGIN FILE:"):].strip().removesuffix("---").strip()
            current_content = []
        elif line.startswith("--- END FILE:"):
            if current_file is not None:
                files[current_file] = "\n".join(current_content)
            current_file = None
            current_content = []
        elif current_file is not None:
            current_content.append(line)

    # Flush a trailing file whose END marker was truncated by the LLM;
    # the original silently dropped it.
    if current_file is not None:
        files[current_file] = "\n".join(current_content)

    return files
|
| 177 |
+
|
| 178 |
+
# ZIP Processing Interface
|
| 179 |
+
# ZIP Processing Interface
zip_iface = gr.Interface(
    fn=process_zip,
    inputs=gr.File(label="Upload ZIP File"),
    outputs="text",
    title="Zip File Analyzer",
    description="Upload a zip file to analyze and store its contents.",
)

# Milvus Initialization Interface
# All numeric settings are passed as strings; init_milvus converts them.
milvus_iface = gr.Interface(
    fn=init_milvus,
    inputs=[
        gr.Textbox(label="Milvus Host", placeholder=DEFAULT_MILVUS_HOST, value=DEFAULT_MILVUS_HOST),
        gr.Textbox(label="Milvus Port", placeholder=DEFAULT_MILVUS_PORT, value=DEFAULT_MILVUS_PORT),
        gr.Textbox(label="Collection Name", placeholder=DEFAULT_COLLECTION_NAME, value=DEFAULT_COLLECTION_NAME),
        gr.Textbox(label="Dimension", placeholder=str(DEFAULT_DIMENSION), value=str(DEFAULT_DIMENSION)),
        gr.Textbox(label="Max Retries", placeholder=str(DEFAULT_MAX_RETRIES), value=str(DEFAULT_MAX_RETRIES)),
        gr.Textbox(label="Retry Delay (seconds)", placeholder=str(DEFAULT_RETRY_DELAY), value=str(DEFAULT_RETRY_DELAY))
    ],
    outputs="text",
    title="Milvus Database Initialization",
    description="Initialize or load the Milvus vector database.",
)

# Gradio Chatbot UI Interface
# chat_ui receives (query, state, mode) and returns (chatbot history, state).
chat_iface = gr.Interface(
    fn=chat_ui,
    inputs=[
        gr.Textbox(label="Ask a question", placeholder="Type your question here"),
        gr.State(),  # Maintains chat history
        gr.Radio(["analyzer", "debugger", "developer"], label="Chat Mode", value="analyzer")
    ],
    outputs=[
        gr.Chatbot(label="Chat with Project"),
        "state"  # This is to store the state,
    ],
    title="Chat with your Project",
    description="Ask questions about the data extracted from the zip file.",
    # Example usage - Corrected to only include instruction and mode
    examples=[
        ["What is this project about?", "analyzer"],
        ["Are there any potential bugs?", "debugger"],
        ["How does the data flow through the application?", "analyzer"],
        ["Explain the main components of the architecture.", "analyzer"],
        ["What are the dependencies of this project?", "analyzer"],
        ["Are there any potential memory leaks?", "debugger"],
        ["Identify any areas where the code could be optimized.","debugger"],
        ["Implement basic logging for the main application and save logs to a file.", "developer"],
        ["Use try/except blocks in main functions to handle exceptions", "developer"]

    ],
)

# Settings Interface
settings_iface = gr.Interface(
    fn=update_api_key,
    inputs=gr.Textbox(label="OpenAI API Key", type="password"),
    outputs="text",
    title="Settings",
    description="Set your OpenAI API key.",
)
| 240 |
+
|
| 241 |
+
# Status Interface
|
| 242 |
+
def get_api_key_status():
    """Produce the status line shown on the live API Key Status tab."""
    suffix = "Set" if is_api_key_set() else "Not set"
    return f"API key status: {suffix}"
|
| 247 |
+
|
| 248 |
+
# Live status tab: live=True makes Gradio re-evaluate the function.
status_iface = gr.Interface(
    fn=get_api_key_status,
    inputs=None,
    outputs="text",
    live=True,
    title="API Key Status"
)

# Add credits to the UI
# NOTE(review): this Markdown component is created but never added to the
# TabbedInterface below — confirm whether it should appear in the layout.
credits = gr.Markdown("## Credits\n\nCreated by [Ruslan Magana Vsevolodovna](https://ruslanmv.com/)")

# --- Main Application Launch ---

# Combine the interfaces using Tabs
demo = gr.TabbedInterface(
    [zip_iface, milvus_iface, chat_iface, settings_iface, status_iface],
    ["Process ZIP", "Init Milvus", "Chat with Project", "Settings", "Status"],
)

# Launch the app with credits
demo.queue().launch()
|
chat_with_project.py
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pymilvus import connections, Collection, utility
|
| 2 |
+
from sentence_transformers import SentenceTransformer
|
| 3 |
+
from langchain_openai import ChatOpenAI # Updated import
|
| 4 |
+
from langchain_core.prompts import ChatPromptTemplate
|
| 5 |
+
from langchain_core.output_parsers import StrOutputParser
|
| 6 |
+
from langchain_core.messages import HumanMessage, SystemMessage, AIMessage
|
| 7 |
+
import os
|
| 8 |
+
|
| 9 |
+
# Milvus connection details
|
| 10 |
+
MILVUS_HOST = 'localhost'
|
| 11 |
+
MILVUS_PORT = '19530'
|
| 12 |
+
COLLECTION_NAME = 'document_collection'
|
| 13 |
+
|
| 14 |
+
def load_api_key():
    """Loads the API key from the .env file or the environment.

    Returns:
        str | None: The OPENAI_API_KEY value, or None when unset.
    """
    # Function-scope import: dotenv is only needed when the key is loaded.
    from dotenv import load_dotenv
    load_dotenv()
    return os.environ.get("OPENAI_API_KEY")
|
| 19 |
+
|
| 20 |
+
# Embedding model
|
| 21 |
+
model = SentenceTransformer('all-MiniLM-L6-v2')
|
| 22 |
+
|
| 23 |
+
def retrieve_relevant_documents(query, top_k=5):
    """
    Retrieves the most relevant documents from Milvus based on the query.

    Args:
        query (str): Natural-language query to embed and search with.
        top_k (int): Maximum number of document paths to return.

    Returns:
        list[str]: Paths of the best-matching documents (empty when the
        collection does not exist).
    """
    print(f"Connecting to Milvus at {MILVUS_HOST}:{MILVUS_PORT}...")
    connections.connect(host=MILVUS_HOST, port=MILVUS_PORT)
    relevant_docs = []
    try:
        if utility.has_collection(COLLECTION_NAME):
            collection = Collection(COLLECTION_NAME)
            collection.load()

            # encode() on a one-element list yields a list of one vector,
            # which is the shape collection.search expects in `data`.
            query_vector = model.encode([query]).tolist()
            print(f"Encoded Query Vector: {query_vector}")

            search_params = {
                "metric_type": "L2",
                "params": {"nprobe": 16}
            }
            search_results = collection.search(
                data=query_vector,
                anns_field="content_vector",
                param=search_params,
                limit=top_k,
                expr=None,
                output_fields=["path"]
            )

            for hit in search_results[0]:
                relevant_docs.append(hit.entity.get("path"))

            print(f"Relevant Docs: {relevant_docs}")
        else:
            print(f"Collection {COLLECTION_NAME} does not exist.")
    finally:
        # Always release the connection. The original only disconnected on
        # the success path, leaking the connection when the collection was
        # missing or the search raised.
        connections.disconnect(alias='default')

    return relevant_docs
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def generate_response_with_gpt(query, relevant_docs, system_prompt):
    """
    Generates a response using OpenAI's GPT model, based on the query, relevant documents, and system prompt.

    Args:
        query (str): The user's question.
        relevant_docs (list[str]): Filesystem paths of context documents;
            each readable file's full text is sent to the model.
        system_prompt (str): System message steering the model's behavior.

    Returns:
        str: The model's reply, or a fallback error string if the API call fails.

    Raises:
        ValueError: If no OpenAI API key is configured.
    """
    api_key = load_api_key()
    if not api_key:
        raise ValueError("OpenAI API key not set. Please set it in the .env file or environment variables.")

    print(f"Using OpenAI API Key: {api_key[:5]}...")  # Partial key for debugging
    chat = ChatOpenAI(temperature=0.7, openai_api_key=api_key, model_name="gpt-3.5-turbo")

    messages = [SystemMessage(content=system_prompt)]
    if relevant_docs:
        # Concatenate the raw text of every readable context document into
        # a single human message preceding the query.
        doc_content = ""
        for doc_path in relevant_docs:
            if os.path.isfile(doc_path):
                try:
                    with open(doc_path, "r", encoding="utf-8") as f:
                        doc_content += f.read() + "\n"
                except Exception as e:
                    # Best-effort: skip unreadable files rather than abort the chat.
                    print(f"Error reading document {doc_path}: {e}")
        if doc_content:
            messages.append(HumanMessage(content=f"Relevant documents:\n{doc_content}"))

    messages.append(HumanMessage(content=query))
    print(f"Messages sent to OpenAI API: {messages}")

    try:
        response = chat.invoke(messages)
        print(f"OpenAI API Response: {response.content}")
        print("Type OpenAI API Response",type(response.content))
        return response.content
    except Exception as e:
        # Surface a user-readable fallback instead of propagating API failures.
        print(f"Error during OpenAI API call: {e}")
        return "Error generating response. Please try again later."
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def query_project(query, system_prompt):
    """Answer *query* via RAG: retrieve context docs from Milvus, then ask the LLM."""
    docs = retrieve_relevant_documents(query)
    print(" Starting the query:")
    print(query)
    answer = generate_response_with_gpt(query, docs, system_prompt)
    print(f"Query Response: {answer}")
    print("Type response", type(answer))
    return answer
|
get_prompts.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain.prompts import PromptTemplate
|
| 2 |
+
|
| 3 |
+
ANALYZER_PROMPT_TEMPLATE = """
|
| 4 |
+
You are a code analyzer AI. Your task is to analyze the project's structure,
|
| 5 |
+
purpose, and functionality. Explain how different components interact,
|
| 6 |
+
discuss the overall architecture, and provide insights into the project's design.
|
| 7 |
+
Consider the context provided and try to be comprehensive in your analysis.
|
| 8 |
+
|
| 9 |
+
Relevant context: {context}
|
| 10 |
+
|
| 11 |
+
Explain in detail, based on the context provided.
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
DEBUGGER_PROMPT_TEMPLATE = """
|
| 15 |
+
You are a code debugger AI. Your task is to identify potential bugs,
|
| 16 |
+
errors, and areas for improvement in the project's code. Analyze the given code
|
| 17 |
+
for logic errors, performance bottlenecks, and suggest fixes or improvements.
|
| 18 |
+
If the user asks how to fix an issue, provide the corrected code snippet.
|
| 19 |
+
|
| 20 |
+
Relevant context: {context}
|
| 21 |
+
|
| 22 |
+
Focus on identifying issues and providing solutions or improvements based on the context provided.
|
| 23 |
+
"""
|
| 24 |
+
|
| 25 |
+
DEVELOPER_PROMPT_TEMPLATE = """
|
| 26 |
+
You are a software developer AI. Your task is to modify or extend existing code based on user requests.
|
| 27 |
+
When a user asks to add a feature or modify existing functionality, you should:
|
| 28 |
+
|
| 29 |
+
1. Identify the files that need to be modified or created.
|
| 30 |
+
2. Output the full, updated code for each file that needs changes.
|
| 31 |
+
3. Clearly indicate the filename before each code block using this format:
|
| 32 |
+
```
|
| 33 |
+
--- BEGIN FILE: <filepath> ---
|
| 34 |
+
<full code of the file>
|
| 35 |
+
--- END FILE: <filepath> ---
|
| 36 |
+
```
|
| 37 |
+
4. If a new file needs to be created, use the same format and specify the new file's path and name.
|
| 38 |
+
5. **Do not omit any part of the code**. Output the entire content of each modified or new file.
|
| 39 |
+
6. Ensure that the generated code is functional, well-structured, and integrates seamlessly with the existing project.
|
| 40 |
+
7. Explain any additional setup or configuration steps if necessary.
|
| 41 |
+
|
| 42 |
+
Remember to consider the existing project's structure and coding style when making modifications.
|
| 43 |
+
|
| 44 |
+
Relevant context: {context}
|
| 45 |
+
|
| 46 |
+
User request: {question}
|
| 47 |
+
|
| 48 |
+
Modify or extend the code as requested, providing the full code for each relevant file.
|
| 49 |
+
"""
|
| 50 |
+
|
| 51 |
+
def get_prompt_for_mode(mode):
    """
    Returns the appropriate prompt template based on the selected mode.

    Raises:
        ValueError: If *mode* is not "analyzer", "debugger", or "developer".
    """
    if mode == "analyzer":
        return ANALYZER_PROMPT_TEMPLATE
    if mode == "debugger":
        return DEBUGGER_PROMPT_TEMPLATE
    if mode == "developer":
        return DEVELOPER_PROMPT_TEMPLATE
    raise ValueError(f"Invalid mode: {mode}")
|
milvus.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# milvus.py
|
| 2 |
+
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility
|
| 3 |
+
import pandas as pd
|
| 4 |
+
import os
|
| 5 |
+
import sys
|
| 6 |
+
from sentence_transformers import SentenceTransformer
|
| 7 |
+
import time
|
| 8 |
+
|
| 9 |
+
# Default Milvus connection details
|
| 10 |
+
DEFAULT_MILVUS_HOST = 'localhost'
|
| 11 |
+
DEFAULT_MILVUS_PORT = '19530'
|
| 12 |
+
DEFAULT_COLLECTION_NAME = 'document_collection'
|
| 13 |
+
DEFAULT_DIMENSION = 384 # Adjust based on your embedding model
|
| 14 |
+
DEFAULT_MAX_RETRIES = 3
|
| 15 |
+
DEFAULT_RETRY_DELAY = 5 # seconds
|
| 16 |
+
|
| 17 |
+
# Embedding model
|
| 18 |
+
model = SentenceTransformer('all-MiniLM-L6-v2')
|
| 19 |
+
|
| 20 |
+
def create_milvus_collection(host, port, collection_name, dimension):
    """
    Creates a new Milvus collection if it doesn't exist.

    NOTE(review): host/port are accepted but unused — the caller is expected
    to have already opened the default pymilvus connection. Confirm whether
    they should be dropped or used here.

    Args:
        host: Milvus host (currently unused).
        port: Milvus port (currently unused).
        collection_name (str): Name of the collection to create or reuse.
        dimension (int): Embedding vector dimension for content_vector.
    """
    if not utility.has_collection(collection_name):
        # Schema: auto-generated primary key, source path, embedding vector.
        fields = [
            FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
            FieldSchema(name="path", dtype=DataType.VARCHAR, max_length=500),
            FieldSchema(name="content_vector", dtype=DataType.FLOAT_VECTOR, dim=dimension)
        ]
        schema = CollectionSchema(fields, "Document Vector Store")
        collection = Collection(collection_name, schema, consistency_level="Strong")

        # IVF_FLAT index with L2 metric — must match the search params used
        # elsewhere (chat_with_project.retrieve_relevant_documents uses L2).
        index_params = {
            "metric_type": "L2",
            "index_type": "IVF_FLAT",
            "params": {"nlist": 1024}
        }
        collection.create_index(field_name="content_vector", index_params=index_params)
        print(f"Collection {collection_name} created and index built.")
    else:
        print(f"Collection {collection_name} already exists.")
|
| 42 |
+
|
| 43 |
+
def load_data_to_milvus(host, port, collection_name):
    """
    Loads data from the DataFrame into Milvus, using sentence embeddings.

    NOTE(review): host/port are unused — an open default pymilvus connection
    is assumed. Only the first .pkl found in 'extraction' is loaded; the
    DataFrame is presumably produced by utils/extract.py with 'path' and
    'content' columns — confirm against that script.
    """
    extraction_dir = "extraction"
    pkl_files = [f for f in os.listdir(extraction_dir) if f.endswith('.pkl')]
    if not pkl_files:
        print("No .pkl files found in the 'extraction' directory.")
        return

    df_path = os.path.join(extraction_dir, pkl_files[0])
    df = pd.read_pickle(df_path)

    # Generate sentence embeddings
    df['content_vector'] = df['content'].apply(lambda x: model.encode(x).tolist())

    # Column order must line up with the schema (auto-id 'id' is omitted).
    data_to_insert = [
        df['path'].tolist(),
        df['content_vector'].tolist()
    ]

    collection = Collection(collection_name)
    collection.insert(data_to_insert)
    # Flush so the inserted vectors are persisted and searchable.
    collection.flush()

    print(f"Data from {df_path} loaded into Milvus collection {collection_name}.")
|
| 69 |
+
|
| 70 |
+
def connect_to_milvus(host, port, max_retries, retry_delay):
    """Connect to Milvus with retries.

    Args:
        host: Milvus host name or address.
        port: Milvus port.
        max_retries (int): Maximum number of connection attempts.
        retry_delay (int): Seconds to wait between attempts.

    Returns:
        bool: True on success, False when all attempts fail. (The original
        fell off the end and returned None when max_retries <= 0; this
        version always returns a boolean.)
    """
    for attempt in range(1, max_retries + 1):
        try:
            connections.connect(host=host, port=port)
            print(f"Successfully connected to Milvus at {host}:{port}")
            return True
        except Exception as e:
            print(f"Error connecting to Milvus: {e}")
            if attempt < max_retries:
                print(f"Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
    print("Max retries reached. Could not connect to Milvus.")
    return False
|
| 87 |
+
|
| 88 |
+
def initialize_milvus(host, port, collection_name, dimension, max_retries, retry_delay):
    """Connect to Milvus, ensure the collection exists, load the data, and disconnect."""
    # Guard clause: skip all work if the connection could not be established.
    if not connect_to_milvus(host, port, max_retries, retry_delay):
        return
    create_milvus_collection(host, port, collection_name, dimension)
    load_data_to_milvus(host, port, collection_name)
    connections.disconnect(alias='default')
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
if __name__ == "__main__":
    # Standalone entry point: initialize Milvus from the command line.
    # Use default values or environment variables if available
    milvus_host = os.environ.get('MILVUS_HOST', DEFAULT_MILVUS_HOST)
    milvus_port = os.environ.get('MILVUS_PORT', DEFAULT_MILVUS_PORT)
    collection_name = os.environ.get('COLLECTION_NAME', DEFAULT_COLLECTION_NAME)
    dimension = int(os.environ.get('DIMENSION', DEFAULT_DIMENSION))
    max_retries = int(os.environ.get('MAX_RETRIES', DEFAULT_MAX_RETRIES))
    retry_delay = int(os.environ.get('RETRY_DELAY', DEFAULT_RETRY_DELAY))

    initialize_milvus(milvus_host, milvus_port, collection_name, dimension, max_retries, retry_delay)
|
requirements.txt
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio==5.11.0
|
| 2 |
+
pymilvus==2.5.3
|
| 3 |
+
sentence-transformers==3.3.1
|
| 4 |
+
openai==1.59.5
|
| 5 |
+
langchain==0.3.14
|
| 6 |
+
python-dotenv
|
| 7 |
+
langchain-community==0.3.14
|
| 8 |
+
langchain-openai==0.2.14
|
requirements_dev.txt
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
aiofiles==23.2.1
|
| 2 |
+
aiohappyeyeballs==2.4.4
|
| 3 |
+
aiohttp==3.11.11
|
| 4 |
+
aiosignal==1.3.2
|
| 5 |
+
altair==5.5.0
|
| 6 |
+
annotated-types==0.7.0
|
| 7 |
+
anyio==4.8.0
|
| 8 |
+
asttokens==2.4.1
|
| 9 |
+
attrs==24.3.0
|
| 10 |
+
blinker==1.9.0
|
| 11 |
+
cachetools==5.5.0
|
| 12 |
+
certifi==2024.12.14
|
| 13 |
+
charset-normalizer==3.4.1
|
| 14 |
+
click==8.1.8
|
| 15 |
+
colorama==0.4.6
|
| 16 |
+
comm==0.2.2
|
| 17 |
+
dataclasses-json==0.6.7
|
| 18 |
+
debugpy==1.8.1
|
| 19 |
+
decorator==5.1.1
|
| 20 |
+
distro==1.9.0
|
| 21 |
+
executing==2.0.1
|
| 22 |
+
fastapi==0.115.6
|
| 23 |
+
ffmpy==0.5.0
|
| 24 |
+
filelock==3.16.1
|
| 25 |
+
fpdf==1.7.2
|
| 26 |
+
frozenlist==1.5.0
|
| 27 |
+
fsspec==2024.12.0
|
| 28 |
+
gitdb==4.0.12
|
| 29 |
+
GitPython==3.1.44
|
| 30 |
+
gradio==5.11.0
|
| 31 |
+
gradio_client==1.5.3
|
| 32 |
+
greenlet==3.1.1
|
| 33 |
+
grpcio==1.67.1
|
| 34 |
+
h11==0.14.0
|
| 35 |
+
httpcore==1.0.7
|
| 36 |
+
httpx==0.28.1
|
| 37 |
+
httpx-sse==0.4.0
|
| 38 |
+
huggingface-hub==0.27.1
|
| 39 |
+
idna==3.10
|
| 40 |
+
ipykernel==6.29.4
|
| 41 |
+
ipython==8.25.0
|
| 42 |
+
jedi==0.19.1
|
| 43 |
+
Jinja2==3.1.5
|
| 44 |
+
jiter==0.8.2
|
| 45 |
+
joblib==1.4.2
|
| 46 |
+
jsonpatch==1.33
|
| 47 |
+
jsonpointer==3.0.0
|
| 48 |
+
jsonschema==4.23.0
|
| 49 |
+
jsonschema-specifications==2024.10.1
|
| 50 |
+
jupyter_client==8.6.2
|
| 51 |
+
jupyter_core==5.7.2
|
| 52 |
+
langchain==0.3.14
|
| 53 |
+
langchain-community==0.3.14
|
| 54 |
+
langchain-core==0.3.29
|
| 55 |
+
langchain-openai==0.2.14
|
| 56 |
+
langchain-text-splitters==0.3.5
|
| 57 |
+
langsmith==0.2.10
|
| 58 |
+
markdown-it-py==3.0.0
|
| 59 |
+
MarkupSafe==2.1.5
|
| 60 |
+
marshmallow==3.24.2
|
| 61 |
+
matplotlib-inline==0.1.7
|
| 62 |
+
mdurl==0.1.2
|
| 63 |
+
mpmath==1.3.0
|
| 64 |
+
multidict==6.1.0
|
| 65 |
+
mypy-extensions==1.0.0
|
| 66 |
+
narwhals==1.21.1
|
| 67 |
+
nest-asyncio==1.6.0
|
| 68 |
+
networkx==3.4.2
|
| 69 |
+
numpy==2.2.1
|
| 70 |
+
openai==1.59.5
|
| 71 |
+
orjson==3.10.14
|
| 72 |
+
packaging==24.1
|
| 73 |
+
pandas==2.2.3
|
| 74 |
+
parso==0.8.4
|
| 75 |
+
pillow==11.1.0
|
| 76 |
+
platformdirs==4.2.2
|
| 77 |
+
prompt_toolkit==3.0.47
|
| 78 |
+
propcache==0.2.1
|
| 79 |
+
protobuf==5.29.3
|
| 80 |
+
psutil==6.0.0
|
| 81 |
+
pure-eval==0.2.2
|
| 82 |
+
pyarrow==18.1.0
|
| 83 |
+
pydantic==2.10.4
|
| 84 |
+
pydantic-settings==2.7.1
|
| 85 |
+
pydantic_core==2.27.2
|
| 86 |
+
pydeck==0.9.1
|
| 87 |
+
pydub==0.25.1
|
| 88 |
+
Pygments==2.18.0
|
| 89 |
+
pymilvus==2.5.3
|
| 90 |
+
python-dateutil==2.9.0.post0
|
| 91 |
+
python-dotenv==1.0.1
|
| 92 |
+
python-multipart==0.0.20
|
| 93 |
+
pytz==2024.2
|
| 94 |
+
pywin32==306
|
| 95 |
+
PyYAML==6.0.2
|
| 96 |
+
pyzmq==26.0.3
|
| 97 |
+
referencing==0.35.1
|
| 98 |
+
regex==2024.11.6
|
| 99 |
+
requests==2.32.3
|
| 100 |
+
requests-toolbelt==1.0.0
|
| 101 |
+
rich==13.9.4
|
| 102 |
+
rpds-py==0.22.3
|
| 103 |
+
ruff==0.8.6
|
| 104 |
+
safehttpx==0.1.6
|
| 105 |
+
safetensors==0.5.2
|
| 106 |
+
scikit-learn==1.6.0
|
| 107 |
+
scipy==1.15.0
|
| 108 |
+
semantic-version==2.10.0
|
| 109 |
+
sentence-transformers==3.3.1
|
| 110 |
+
setuptools==75.1.0
|
| 111 |
+
shellingham==1.5.4
|
| 112 |
+
six==1.16.0
|
| 113 |
+
smmap==5.0.2
|
| 114 |
+
sniffio==1.3.1
|
| 115 |
+
SQLAlchemy==2.0.36
|
| 116 |
+
stack-data==0.6.3
|
| 117 |
+
starlette==0.41.3
|
| 118 |
+
streamlit==1.41.1
|
| 119 |
+
streamlit-pdf-viewer==0.0.20
|
| 120 |
+
sympy==1.13.1
|
| 121 |
+
tenacity==9.0.0
|
| 122 |
+
threadpoolctl==3.5.0
|
| 123 |
+
tiktoken==0.8.0
|
| 124 |
+
tokenizers==0.21.0
|
| 125 |
+
toml==0.10.2
|
| 126 |
+
tomlkit==0.13.2
|
| 127 |
+
torch==2.5.1
|
| 128 |
+
tornado==6.4.1
|
| 129 |
+
tqdm==4.67.1
|
| 130 |
+
traitlets==5.14.3
|
| 131 |
+
transformers==4.47.1
|
| 132 |
+
typer==0.15.1
|
| 133 |
+
typing-inspect==0.9.0
|
| 134 |
+
typing_extensions==4.12.2
|
| 135 |
+
tzdata==2024.2
|
| 136 |
+
ujson==5.10.0
|
| 137 |
+
urllib3==2.3.0
|
| 138 |
+
uvicorn==0.34.0
|
| 139 |
+
watchdog==6.0.0
|
| 140 |
+
wcwidth==0.2.13
|
| 141 |
+
websockets==14.1
|
| 142 |
+
wheel==0.44.0
|
| 143 |
+
yarl==1.18.3
|
utils/extract.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
import pandas as pd
|
| 4 |
+
|
| 5 |
+
def display_and_store_directory_content(base_path):
    """
    Walk *base_path*, print every directory and file (with file content),
    and persist the collected paths/contents as a pickled DataFrame.

    Args:
        base_path (str): The root directory path to scan.

    Returns:
        None. Side effects: prints each path (and file content) to stdout
        and writes ``extraction/<basename>.pkl`` relative to the current
        working directory, where ``<basename>`` is the last component of
        *base_path*.
    """
    data = []  # Rows for the DataFrame: {"path": ..., "content": ...}

    for root, dirs, files in os.walk(base_path):
        # Directories are recorded with empty content.
        for d in dirs:
            dir_path = os.path.join(root, d)
            data.append({"path": dir_path, "content": ""})
            print(f"Directory: {dir_path}")

        # Files are recorded with their text content. Unreadable files
        # (binary data, permission errors, bad encoding) store the error
        # message as content instead of aborting the whole scan — this
        # best-effort behavior is deliberate.
        for f in files:
            file_path = os.path.join(root, f)
            try:
                with open(file_path, 'r', encoding='utf-8') as file:
                    content = file.read()
            except Exception as e:
                content = f"Error reading file: {e}"

            data.append({"path": file_path, "content": content})
            print(f"\nFile: {file_path}")
            print("-" * 40)
            print(content)
            print("-" * 40)

    # Create a DataFrame from the collected rows.
    df = pd.DataFrame(data)

    # exist_ok avoids the check-then-create race of the original
    # `if not os.path.exists(...): os.makedirs(...)` pattern.
    extraction_dir = "extraction"
    os.makedirs(extraction_dir, exist_ok=True)

    # Use the last component of the base path as the file name;
    # normpath strips a trailing separator so basename is never empty.
    base_name = os.path.basename(os.path.normpath(base_path))
    output_file = os.path.join(extraction_dir, f"{base_name}.pkl")

    # Save the DataFrame to a pickle file.
    df.to_pickle(output_file)
    print(f"\nDataFrame saved to {output_file}")
|
| 55 |
+
|
| 56 |
+
if __name__ == "__main__":
    # Require exactly one positional argument: the directory to scan.
    if len(sys.argv) < 2:
        # Use the actual invoked script path instead of the stale
        # hard-coded name "utils\extract_all_content.py" (the file is
        # utils/extract.py), which also assumed Windows separators.
        print(f"Usage: python {sys.argv[0]} <directory>")
        sys.exit(1)

    # Get the directory path from the command-line arguments.
    directory_path = sys.argv[1]

    # Execute the function only if the path exists.
    if os.path.exists(directory_path):
        display_and_store_directory_content(directory_path)
    else:
        print(f"Error: The path '{directory_path}' does not exist.")
        # Exit non-zero so calling scripts can detect the failure
        # (the original printed the error but still exited 0).
        sys.exit(1)
|