chatui-helper / test_vector_db.py
milwright's picture
Comprehensive improvements and modernization
ba11a75
raw
history blame
6.35 kB
#!/usr/bin/env python3
"""
Test script to verify vector database creation functionality
"""
import sys
import os
from pathlib import Path
# Add current directory to path to import modules
sys.path.append(str(Path(__file__).parent))
# Pull in the project-local RAG components under test.  A failed import
# is fatal for the whole script, so report it and exit immediately with
# a non-zero status.
try:
    from rag_tool import RAGTool
    from vector_store import VectorStore
    from document_processor import DocumentProcessor
    print("βœ… Successfully imported all RAG modules")
except ImportError as e:
    print(f"❌ Failed to import RAG modules: {e}")
    sys.exit(1)
def test_document_processing():
    """Exercise DocumentProcessor end to end on the bundled sample file.

    Returns True when the file is found and chunked without error,
    False otherwise.  Diagnostic output goes to stdout.
    """
    print("\n=== Testing Document Processing ===")
    doc_processor = DocumentProcessor(chunk_size=200, chunk_overlap=50)

    # The sample document must sit next to this script.
    test_file = "test_document.txt"
    if not os.path.exists(test_file):
        print(f"❌ Test file {test_file} not found")
        return False

    try:
        chunks = doc_processor.process_file(test_file)
        print(f"βœ… Processed {test_file} into {len(chunks)} chunks")
        # Preview the leading chunk so a human can eyeball the split.
        if chunks:
            lead = chunks[0]
            print(f"First chunk preview: {lead.text[:100]}...")
            print(f"Chunk metadata: {lead.metadata}")
        return True
    except Exception as e:
        print(f"❌ Failed to process document: {e}")
        return False
def test_vector_store():
    """Smoke-test VectorStore: build an index, search it, serialize it.

    Returns True when every stage completes without raising,
    False (with a printed traceback) otherwise.
    """
    print("\n=== Testing Vector Store ===")
    try:
        store = VectorStore()
        print("βœ… Initialized vector store")

        # Three tiny chunks are enough to verify indexing and search.
        sample_texts = [
            'Vector databases are used for semantic search',
            'Machine learning models convert text to embeddings',
            'FAISS provides efficient similarity search capabilities',
        ]
        sample_chunks = [
            {
                'text': text,
                'chunk_id': f'test{idx + 1}',
                'metadata': {'file_name': 'test.txt', 'chunk_index': idx},
            }
            for idx, text in enumerate(sample_texts)
        ]

        print("Building vector index...")
        store.build_index(sample_chunks, show_progress=True)
        print("βœ… Built vector index")

        # A semantic query should surface the relevant chunks.
        query = "How do vector databases work?"
        hits = store.search(query, top_k=2)
        print(f"Search results for '{query}':")
        for rank, hit in enumerate(hits, start=1):
            print(f" {rank}. Score: {hit.score:.3f} - {hit.text[:50]}...")

        # Serialization is what the deployment path relies on.
        payload = store.serialize()
        print(f"βœ… Serialized data size: {len(payload['index_base64'])} characters")
        return True
    except Exception as e:
        print(f"❌ Failed vector store test: {e}")
        import traceback
        traceback.print_exc()
        return False
def test_rag_tool():
    """Test the complete RAG tool pipeline.

    Covers file ingestion, relevant-context retrieval, and
    serialization of the index for deployment.

    Returns:
        bool: True only if every required stage succeeds; False on any
        failure.  Retrieval returning no context is treated as a
        warning, not a failure, since relevance depends on the corpus.
    """
    print("\n=== Testing RAG Tool ===")
    try:
        rag_tool = RAGTool()
        print("βœ… Initialized RAG tool")

        # Ingest the same sample document the other tests use.
        test_files = ["test_document.txt"]
        result = rag_tool.process_uploaded_files(test_files)
        if not result['success']:
            print(f"❌ {result['message']}")
            return False

        print(f"βœ… {result['message']}")
        summary = result['summary']
        print(f"Files processed: {summary['total_files']}")
        print(f"Total chunks: {summary['total_chunks']}")

        # Context retrieval: warn (don't fail) when nothing comes back.
        query = "What are the benefits of vector databases?"
        context = rag_tool.get_relevant_context(query, max_chunks=2)
        if context:
            print(f"\nContext for '{query}':")
            print(context[:300] + "..." if len(context) > 300 else context)
            print("βœ… Successfully retrieved context")
        else:
            print("⚠️ No context retrieved")

        # Serialization for deployment is required to pass.
        serialized_data = rag_tool.get_serialized_data()
        if not serialized_data:
            # BUG FIX: this branch previously printed a failure marker
            # but still fell through to `return True`, so a serialization
            # failure was reported as a passing test.
            print("❌ Failed to serialize RAG data")
            return False

        print("βœ… Successfully serialized RAG data for deployment")
        print(f"Serialized keys: {list(serialized_data.keys())}")
        return True
    except Exception as e:
        print(f"❌ Failed RAG tool test: {e}")
        import traceback
        traceback.print_exc()
        return False
def main():
    """Run all vector-database tests and print a summary.

    Returns:
        bool: True if the dependency check and every test passed,
        False otherwise.  The entry-point guard maps this onto the
        process exit code so CI systems can detect failures.
    """
    print("=== Vector Database Testing ===")
    print("Testing vector database creation and functionality...")

    # Fail fast if the heavy third-party dependencies are unavailable.
    print("\n=== Checking Dependencies ===")
    try:
        import sentence_transformers
        import faiss
        import fitz  # PyMuPDF
        print("βœ… All required dependencies available")
    except ImportError as e:
        print(f"❌ Missing dependency: {e}")
        return False

    # Run each test in order, collecting (name, passed) pairs.
    tests = [
        ("Document Processing", test_document_processing),
        ("Vector Store", test_vector_store),
        ("RAG Tool", test_rag_tool),
    ]
    results = []
    for test_name, test_func in tests:
        print(f"\n{'='*20}")
        results.append((test_name, test_func()))

    # Summary
    print(f"\n{'='*40}")
    print("TEST SUMMARY:")
    for test_name, success in results:
        status = "βœ… PASS" if success else "❌ FAIL"
        print(f" {test_name}: {status}")

    all_passed = all(success for _, success in results)
    if all_passed:
        print("\nπŸŽ‰ All tests passed! Vector database functionality is working.")
    else:
        print("\n⚠️ Some tests failed. Check the output above for details.")
    return all_passed


if __name__ == "__main__":
    # BUG FIX: the script previously exited 0 even when tests failed,
    # hiding regressions from CI.  Derive the exit code from the result.
    sys.exit(0 if main() else 1)