#!/usr/bin/env python3
"""
Test script to verify vector database creation functionality.

Imports the local RAG modules (rag_tool, vector_store, document_processor)
and exits early with a clear message if any of them are missing.
"""
import sys
import os
from pathlib import Path

# Add this script's directory to sys.path so the sibling modules can be
# imported when the script is run from another working directory.
sys.path.append(str(Path(__file__).parent))

try:
    from rag_tool import RAGTool
    from vector_store import VectorStore
    from document_processor import DocumentProcessor
    print("✅ Successfully imported all RAG modules")
except ImportError as e:
    # Fail fast: none of the tests below can run without these modules.
    print(f"❌ Failed to import RAG modules: {e}")
    sys.exit(1)
def test_document_processing():
    """Test document processing functionality.

    Chunks ``test_document.txt`` with a small chunk size and prints a
    preview of the first chunk.

    Returns:
        bool: True if the file was processed without error, False otherwise.
    """
    print("\n=== Testing Document Processing ===")
    processor = DocumentProcessor(chunk_size=200, chunk_overlap=50)

    # Test with our test document; skip gracefully if the fixture is absent.
    test_file = "test_document.txt"
    if not os.path.exists(test_file):
        print(f"❌ Test file {test_file} not found")
        return False

    try:
        chunks = processor.process_file(test_file)
        print(f"✅ Processed {test_file} into {len(chunks)} chunks")

        # Show first chunk so a human can eyeball the chunking quality.
        if chunks:
            first_chunk = chunks[0]
            print(f"First chunk preview: {first_chunk.text[:100]}...")
            print(f"Chunk metadata: {first_chunk.metadata}")
        return True
    except Exception as e:
        # Broad catch is deliberate: this is a diagnostic script and any
        # failure should be reported as a failed test, not a crash.
        print(f"❌ Failed to process document: {e}")
        return False
def test_vector_store():
    """Test vector store functionality.

    Builds an index from three hard-coded chunks, runs a similarity
    search, and round-trips the store through serialization.

    Returns:
        bool: True if every step completed without raising, False otherwise.
    """
    print("\n=== Testing Vector Store ===")
    try:
        # Initialize vector store
        vector_store = VectorStore()
        print("✅ Initialized vector store")

        # Create test data: minimal chunk dicts matching the processor's shape.
        test_chunks = [
            {
                'text': 'Vector databases are used for semantic search',
                'chunk_id': 'test1',
                'metadata': {'file_name': 'test.txt', 'chunk_index': 0},
            },
            {
                'text': 'Machine learning models convert text to embeddings',
                'chunk_id': 'test2',
                'metadata': {'file_name': 'test.txt', 'chunk_index': 1},
            },
            {
                'text': 'FAISS provides efficient similarity search capabilities',
                'chunk_id': 'test3',
                'metadata': {'file_name': 'test.txt', 'chunk_index': 2},
            },
        ]

        # Build index
        print("Building vector index...")
        vector_store.build_index(test_chunks, show_progress=True)
        print("✅ Built vector index")

        # Test search: top_k=2 so at least one chunk must be ranked out.
        query = "How do vector databases work?"
        results = vector_store.search(query, top_k=2)
        print(f"Search results for '{query}':")
        for i, result in enumerate(results):
            print(f"  {i+1}. Score: {result.score:.3f} - {result.text[:50]}...")

        # Test serialization (used later for deployment packaging).
        serialized = vector_store.serialize()
        print(f"✅ Serialized data size: {len(serialized['index_base64'])} characters")
        return True
    except Exception as e:
        # Diagnostic script: report any failure with a traceback, don't crash.
        print(f"❌ Failed vector store test: {e}")
        import traceback
        traceback.print_exc()
        return False
def test_rag_tool():
    """Test complete RAG tool functionality.

    Processes ``test_document.txt`` end-to-end, retrieves context for a
    sample query, and serializes the tool's data for deployment.

    Returns:
        bool: True if processing succeeded (serialization problems are
        reported but do not fail the test), False otherwise.
    """
    print("\n=== Testing RAG Tool ===")
    try:
        # Initialize RAG tool
        rag_tool = RAGTool()
        print("✅ Initialized RAG tool")

        # Process test document
        test_files = ["test_document.txt"]
        result = rag_tool.process_uploaded_files(test_files)

        if result['success']:
            print(f"✅ {result['message']}")

            # Show summary
            summary = result['summary']
            print(f"Files processed: {summary['total_files']}")
            print(f"Total chunks: {summary['total_chunks']}")

            # Test context retrieval; truncate long context for readability.
            query = "What are the benefits of vector databases?"
            context = rag_tool.get_relevant_context(query, max_chunks=2)
            if context:
                print(f"\nContext for '{query}':")
                print(context[:300] + "..." if len(context) > 300 else context)
                print("✅ Successfully retrieved context")
            else:
                print("⚠️ No context retrieved")

            # Test serialization for deployment
            serialized_data = rag_tool.get_serialized_data()
            if serialized_data:
                print("✅ Successfully serialized RAG data for deployment")
                print(f"Serialized keys: {list(serialized_data.keys())}")
            else:
                print("❌ Failed to serialize RAG data")
            return True
        else:
            print(f"❌ {result['message']}")
            return False
    except Exception as e:
        # Diagnostic script: report any failure with a traceback, don't crash.
        print(f"❌ Failed RAG tool test: {e}")
        import traceback
        traceback.print_exc()
        return False
def main():
    """Run all tests and print a pass/fail summary.

    Checks heavyweight third-party dependencies first so missing packages
    produce one clear message instead of three failing tests.
    """
    print("=== Vector Database Testing ===")
    print("Testing vector database creation and functionality...")

    # Check dependencies
    print("\n=== Checking Dependencies ===")
    try:
        import sentence_transformers
        import faiss
        import fitz  # PyMuPDF
        print("✅ All required dependencies available")
    except ImportError as e:
        print(f"❌ Missing dependency: {e}")
        return

    # Run tests in order; each returns a bool rather than raising.
    tests = [
        ("Document Processing", test_document_processing),
        ("Vector Store", test_vector_store),
        ("RAG Tool", test_rag_tool),
    ]

    results = []
    for test_name, test_func in tests:
        print(f"\n{'='*20}")
        success = test_func()
        results.append((test_name, success))

    # Summary
    print(f"\n{'='*40}")
    print("TEST SUMMARY:")
    for test_name, success in results:
        status = "✅ PASS" if success else "❌ FAIL"
        print(f"  {test_name}: {status}")

    all_passed = all(success for _, success in results)
    if all_passed:
        print("\n🎉 All tests passed! Vector database functionality is working.")
    else:
        print("\n⚠️ Some tests failed. Check the output above for details.")
# Standard script entry guard: run the suite only when executed directly.
if __name__ == "__main__":
    main()