#!/usr/bin/env python3
"""
Test script to verify vector database creation functionality.

Runs three smoke tests against the project's RAG modules (document
processing, the vector store, and the end-to-end RAG tool), prints a
PASS/FAIL summary, and exits with a non-zero status when a dependency is
missing or any test fails.
"""

import os
import sys
import traceback
from pathlib import Path

# Add current directory to path to import modules
sys.path.append(str(Path(__file__).parent))

try:
    from rag_tool import RAGTool
    from vector_store import VectorStore
    from document_processor import DocumentProcessor
    print("✅ Successfully imported all RAG modules")
except ImportError as e:
    print(f"❌ Failed to import RAG modules: {e}")
    sys.exit(1)

# Fixture document used by the tests; resolved against the current working
# directory, so the script has to be launched from the folder containing it.
TEST_FILE = "test_document.txt"


def test_document_processing() -> bool:
    """Test document processing functionality.

    Feeds ``TEST_FILE`` through ``DocumentProcessor.process_file`` and prints
    a preview of the first chunk, if any.

    Returns:
        True when processing completed without raising; False when the
        fixture file is missing or processing raised.
    """
    print("\n=== Testing Document Processing ===")

    processor = DocumentProcessor(chunk_size=200, chunk_overlap=50)

    # Test with our test document
    test_file = TEST_FILE
    if not os.path.exists(test_file):
        print(f"❌ Test file {test_file} not found")
        return False

    try:
        chunks = processor.process_file(test_file)
        print(f"✅ Processed {test_file} into {len(chunks)} chunks")

        # Show first chunk
        if chunks:
            first_chunk = chunks[0]
            print(f"First chunk preview: {first_chunk.text[:100]}...")
            print(f"Chunk metadata: {first_chunk.metadata}")

        return True
    except Exception as e:
        # Broad on purpose: this is the reporting boundary of a smoke test.
        print(f"❌ Failed to process document: {e}")
        return False


def test_vector_store() -> bool:
    """Test vector store functionality.

    Builds an index from three hand-written chunks, runs one search, and
    serializes the store.

    Returns:
        True when every step completed without raising; False otherwise
        (the traceback is printed).
    """
    print("\n=== Testing Vector Store ===")

    try:
        # Initialize vector store
        vector_store = VectorStore()
        print("✅ Initialized vector store")

        # Create test data
        test_chunks = [
            {
                'text': 'Vector databases are used for semantic search',
                'chunk_id': 'test1',
                'metadata': {'file_name': 'test.txt', 'chunk_index': 0},
            },
            {
                'text': 'Machine learning models convert text to embeddings',
                'chunk_id': 'test2',
                'metadata': {'file_name': 'test.txt', 'chunk_index': 1},
            },
            {
                'text': 'FAISS provides efficient similarity search capabilities',
                'chunk_id': 'test3',
                'metadata': {'file_name': 'test.txt', 'chunk_index': 2},
            },
        ]

        # Build index
        print("Building vector index...")
        vector_store.build_index(test_chunks, show_progress=True)
        print("✅ Built vector index")

        # Test search
        query = "How do vector databases work?"
        results = vector_store.search(query, top_k=2)

        print(f"Search results for '{query}':")
        for i, result in enumerate(results):
            print(f"  {i+1}. Score: {result.score:.3f} - {result.text[:50]}...")

        # Test serialization
        serialized = vector_store.serialize()
        print(f"✅ Serialized data size: {len(serialized['index_base64'])} characters")

        return True
    except Exception as e:
        # Broad on purpose: this is the reporting boundary of a smoke test.
        print(f"❌ Failed vector store test: {e}")
        traceback.print_exc()
        return False


def test_rag_tool() -> bool:
    """Test complete RAG tool functionality.

    Processes ``TEST_FILE`` through ``RAGTool``, retrieves context for one
    query, and serializes the tool's data.

    Returns:
        True when the files were processed and the serialized data is
        non-empty; False when processing reported failure, serialization
        produced nothing, or any step raised. An empty retrieved context
        only produces a warning and does not fail the test.
    """
    print("\n=== Testing RAG Tool ===")

    try:
        # Initialize RAG tool
        rag_tool = RAGTool()
        print("✅ Initialized RAG tool")

        # Process test document
        test_files = [TEST_FILE]
        result = rag_tool.process_uploaded_files(test_files)

        if not result['success']:
            print(f"❌ {result['message']}")
            return False

        print(f"✅ {result['message']}")

        # Show summary
        summary = result['summary']
        print(f"Files processed: {summary['total_files']}")
        print(f"Total chunks: {summary['total_chunks']}")

        # Test context retrieval
        query = "What are the benefits of vector databases?"
        context = rag_tool.get_relevant_context(query, max_chunks=2)

        if context:
            print(f"\nContext for '{query}':")
            print(context[:300] + "..." if len(context) > 300 else context)
            print("✅ Successfully retrieved context")
        else:
            print("⚠️ No context retrieved")

        # Test serialization for deployment
        serialized_data = rag_tool.get_serialized_data()
        if not serialized_data:
            # A logged failure must not be reported as PASS in the summary.
            print("❌ Failed to serialize RAG data")
            return False

        print("✅ Successfully serialized RAG data for deployment")
        print(f"Serialized keys: {list(serialized_data.keys())}")
        return True
    except Exception as e:
        # Broad on purpose: this is the reporting boundary of a smoke test.
        print(f"❌ Failed RAG tool test: {e}")
        traceback.print_exc()
        return False


def main() -> int:
    """Run all tests.

    Returns:
        Process exit status: 0 when every test passed, 1 when a required
        dependency is missing or at least one test failed.
    """
    print("=== Vector Database Testing ===")
    print("Testing vector database creation and functionality...")

    # Check dependencies
    print("\n=== Checking Dependencies ===")
    try:
        # Imported only to prove they load; the names are not used here.
        import sentence_transformers  # noqa: F401
        import faiss  # noqa: F401
        import fitz  # noqa: F401  # PyMuPDF
        print("✅ All required dependencies available")
    except ImportError as e:
        print(f"❌ Missing dependency: {e}")
        return 1

    # Run tests
    tests = [
        ("Document Processing", test_document_processing),
        ("Vector Store", test_vector_store),
        ("RAG Tool", test_rag_tool),
    ]

    results = []
    for test_name, test_func in tests:
        print(f"\n{'='*20}")
        success = test_func()
        results.append((test_name, success))

    # Summary
    print(f"\n{'='*40}")
    print("TEST SUMMARY:")
    for test_name, success in results:
        status = "✅ PASS" if success else "❌ FAIL"
        print(f"  {test_name}: {status}")

    all_passed = all(success for _, success in results)
    if all_passed:
        print("\n🎉 All tests passed! Vector database functionality is working.")
    else:
        print("\n⚠️ Some tests failed. Check the output above for details.")

    return 0 if all_passed else 1


if __name__ == "__main__":
    # Propagate the outcome so shells and CI can detect failures.
    sys.exit(main())