Spaces:

Omartificial-Intelligence-Space
/

context-caching-gemini-pdf-qa

Running

File size: 6,474 Bytes
#!/usr/bin/env python3
"""
Utility script to check if documents meet token requirements for Gemini API caching
"""

import os
import io
import httpx
from google import genai
from dotenv import load_dotenv

# Load environment variables before the functions below read them; the
# script expects GOOGLE_API_KEY, which main() says to set in the .env file.
load_dotenv()

def _print_cost_benefit(token_count, question_counts=(5, 10, 20, 50),
                        tokens_per_question=50):
    """Print a table comparing token usage with and without caching.

    The model is deliberately simple: without a cache the whole document is
    re-sent for every question; with a cache the document is counted once
    and each question adds ``tokens_per_question`` tokens.
    """
    print("\n💰 Cost-Benefit Analysis:")
    print("Questions | Without Cache | With Cache | Savings")
    print("-" * 50)

    for q in question_counts:
        without_cache = token_count * q
        with_cache = token_count + (tokens_per_question * q)
        # Guard the division so a zero-token document cannot crash the report.
        if without_cache:
            savings = ((without_cache - with_cache) / without_cache) * 100
        else:
            savings = 0.0
        print(f"{q:9d} | {without_cache:12,} | {with_cache:10,} | {savings:6.1f}%")


def _report_token_count(token_count, min_tokens):
    """Print whether ``token_count`` reaches ``min_tokens`` plus follow-up advice."""
    print(f"📊 Token count: {token_count:,}")
    print(f"📏 Minimum required: {min_tokens:,}")

    if token_count >= min_tokens:
        print("✅ Document meets caching requirements!")
        print("💡 This document is suitable for caching.")
        _print_cost_benefit(token_count)
    else:
        print("❌ Document does not meet caching requirements")
        print(f"📝 Need {min_tokens - token_count:,} more tokens")
        print("💡 Consider:")
        print("   • Uploading a longer document")
        print("   • Combining multiple documents")
        print("   • Using regular analysis (without caching)")


def check_document_tokens(file_path=None, url=None, *,
                          model="gemini-2.0-flash-001", min_tokens=4096):
    """Check if a document meets the minimum token requirements for caching.

    The PDF is read from ``file_path`` (preferred when both are given) or
    downloaded from ``url``, uploaded to the Gemini File API, and a temporary
    cache is created so the service reports the document's token count.
    Results are printed; the temporary cache and the uploaded file are
    removed again before returning.

    Args:
        file_path: Path to a local PDF file.
        url: URL of a PDF to download.
        model: Model name used to create the temporary cache.
        min_tokens: Token threshold the document is compared against.

    Returns:
        None. All results and errors are reported on stdout; exceptions from
        file access, the download, or the API are caught and printed.
    """
    print("🔍 Document Token Checker")
    print("=" * 50)

    # Validate the arguments before any file, network or API work happens.
    if not file_path and not url:
        print("❌ Error: Please provide either file_path or url")
        return

    client = None
    document = None
    try:
        if file_path:
            print(f"📄 Checking local file: {file_path}")
            with open(file_path, 'rb') as f:
                file_io = io.BytesIO(f.read())
        else:
            print(f"📄 Checking URL: {url}")
            # httpx does not follow redirects by default, and
            # raise_for_status() treats a redirect response as an error.
            response = httpx.get(url, follow_redirects=True)
            response.raise_for_status()
            file_io = io.BytesIO(response.content)

        # Created inside the try so a client construction failure is
        # reported like every other error instead of escaping as a traceback.
        client = genai.Client(api_key=os.getenv('GOOGLE_API_KEY'))

        print("📤 Uploading to Gemini File API...")
        document = client.files.upload(
            file=file_io,
            config=dict(mime_type='application/pdf')
        )
        print("✅ File uploaded successfully!")

        # Creating a cache is how the token count is obtained.
        print("💾 Attempting to create cache to check token count...")
        try:
            cache = client.caches.create(
                model=model,
                config=genai.types.CreateCachedContentConfig(
                    system_instruction="Test system instruction for token counting.",
                    contents=[document],
                )
            )
        except Exception as e:
            if "Cached content is too small" in str(e):
                print("❌ Document is too small for caching")
                print(f"💡 This document has fewer than {min_tokens:,} tokens")
                print("📝 Recommendations:")
                print("   • Upload a longer document")
                print("   • Combine multiple small documents")
                print("   • Use regular analysis without caching")
            else:
                print(f"❌ Error creating cache: {e}")
            return

        try:
            usage = cache.usage_metadata
            # The cache's usage metadata reports its size as
            # total_token_count; the older attribute name is kept as a
            # fallback. ``or 0`` also covers attributes present but None.
            token_count = (
                getattr(usage, 'total_token_count', None)
                or getattr(usage, 'cached_token_count', None)
                or 0
            )
            _report_token_count(token_count, min_tokens)
        finally:
            # Always remove the test cache, even if reporting failed.
            print("\n🗑️ Cleaning up test cache...")
            try:
                client.caches.delete(name=cache.name)
                print("✅ Test cache deleted!")
            except Exception as e:
                print(f"⚠️ Could not delete test cache: {e}")

    except Exception as e:
        print(f"❌ Error: {e}")
    finally:
        # The uploaded file only existed for this check; do not leave it
        # behind in the File API.
        if document is not None:
            try:
                client.files.delete(name=document.name)
            except Exception as e:
                print(f"⚠️ Could not delete uploaded file: {e}")

def estimate_tokens_from_file_size(file_path):
    """Print a rough token estimate for a file, derived from its size on disk.

    The estimate assumes about four bytes per token, which is only a loose
    guide for PDFs because they also hold formatting, images, etc. Any error
    while inspecting the file is caught and printed instead of raised.
    """
    bytes_per_token = 4   # 1 token ≈ 4 characters, 1 character ≈ 1 byte
    threshold = 4096      # token count the estimate is compared against

    try:
        size_in_bytes = os.stat(file_path).st_size
        approx_tokens = size_in_bytes // bytes_per_token

        print(f"📏 File size: {size_in_bytes:,} bytes")
        print(f"📊 Estimated tokens: {approx_tokens:,}")

        verdict = (
            "✅ Likely meets caching requirements"
            if approx_tokens >= threshold
            else "❌ Likely too small for caching"
        )
        print(verdict)

    except Exception as err:
        print(f"❌ Error estimating tokens: {err}")

def _prompt_for_pdf_path():
    """Ask for a PDF path; return it if it names an existing file, else None."""
    file_path = input("Enter the path to your PDF file: ").strip()
    # isfile rather than exists: a directory is not a usable document.
    if os.path.isfile(file_path):
        return file_path
    print("❌ File not found!")
    return None


def main():
    """Run the interactive menu until the user chooses to exit.

    Requires GOOGLE_API_KEY in the environment; if it is missing an error is
    printed and the function returns immediately. End of input (Ctrl-D or
    exhausted piped stdin) and Ctrl-C are treated as a request to exit
    instead of ending the program with a traceback.
    """
    print("🎯 Gemini API Document Token Checker")
    print("=" * 60)

    # Nothing below can work without an API key, so fail fast.
    if not os.getenv('GOOGLE_API_KEY'):
        print("❌ Error: GOOGLE_API_KEY not found in environment variables")
        print("Please set your API key in the .env file")
        return

    while True:
        print("\n📋 Options:")
        print("1. Check local PDF file")
        print("2. Check PDF from URL")
        print("3. Estimate tokens from file size")
        print("4. Exit")

        try:
            choice = input("\nEnter your choice (1-4): ").strip()

            if choice == '1':
                file_path = _prompt_for_pdf_path()
                if file_path is not None:
                    check_document_tokens(file_path=file_path)

            elif choice == '2':
                url = input("Enter the URL to your PDF: ").strip()
                check_document_tokens(url=url)

            elif choice == '3':
                file_path = _prompt_for_pdf_path()
                if file_path is not None:
                    estimate_tokens_from_file_size(file_path)

            elif choice == '4':
                print("👋 Goodbye!")
                break

            else:
                print("❌ Invalid choice. Please enter 1-4.")

        except (EOFError, KeyboardInterrupt):
            # stdin was closed or the user pressed Ctrl-C: leave cleanly.
            print("\n👋 Goodbye!")
            break

# Start the interactive menu only when this file is run as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()