#!/usr/bin/env python3
"""
Utility script to check if documents meet token requirements for Gemini API caching
"""

import os
import io
import httpx
from google import genai
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Minimum number of tokens this script requires before it reports a document
# as suitable for caching.
MIN_CACHE_TOKENS = 4096

# Assumed number of tokens per question in the cost-benefit table.
TOKENS_PER_QUESTION = 50


def _cached_token_count(cache):
    """Return the token count reported for a created cache, or 0 if unavailable."""
    usage = getattr(cache, 'usage_metadata', None)
    # NOTE(review): the google-genai cache usage metadata is expected to expose
    # `total_token_count`; `cached_token_count` (the name this script used
    # before) is kept as a fallback. Confirm against the SDK reference.
    for attr in ('total_token_count', 'cached_token_count'):
        value = getattr(usage, attr, None)
        if value is not None:
            return value
    return 0


def _print_cost_benefit(token_count):
    """Print a table comparing token usage with and without caching."""
    print("\nšŸ’° Cost-Benefit Analysis:")
    print("Questions | Without Cache | With Cache | Savings")
    print("-" * 50)

    for q in (5, 10, 20, 50):
        # Without a cache the whole document is re-sent for every question;
        # with a cache it is counted once plus the per-question tokens.
        without_cache = token_count * q
        with_cache = token_count + (TOKENS_PER_QUESTION * q)
        savings = ((without_cache - with_cache) / without_cache) * 100
        print(f"{q:9d} | {without_cache:12,} | {with_cache:10,} | {savings:6.1f}%")


def _print_token_report(token_count):
    """Print whether ``token_count`` satisfies the caching minimum."""
    print(f"šŸ“Š Token count: {token_count:,}")
    print(f"šŸ“ Minimum required: {MIN_CACHE_TOKENS:,}")

    if token_count >= MIN_CACHE_TOKENS:
        print("āœ… Document meets caching requirements!")
        print("šŸ’” This document is suitable for caching.")
        _print_cost_benefit(token_count)
    else:
        print("āŒ Document does not meet caching requirements")
        print(f"šŸ“ Need {MIN_CACHE_TOKENS - token_count:,} more tokens")
        print("šŸ’” Consider:")
        print(" • Uploading a longer document")
        print(" • Combining multiple documents")
        print(" • Using regular analysis (without caching)")


def check_document_tokens(file_path=None, url=None):
    """Check if a document meets the minimum token requirements for caching.

    The document is read from ``file_path`` (preferred when both are given) or
    downloaded from ``url``, uploaded to the Gemini File API as a PDF, and a
    temporary cache is created so the reported token count can be inspected.
    The temporary cache is always deleted again once it has been created.

    All results and errors are printed; nothing is returned and no exception
    derived from ``Exception`` propagates to the caller.

    Args:
        file_path: Path to a local PDF file, or None.
        url: URL of a PDF to download, or None.
    """
    # Initialize client
    client = genai.Client(api_key=os.getenv('GOOGLE_API_KEY'))

    print("šŸ” Document Token Checker")
    print("=" * 50)

    try:
        if file_path:
            print(f"šŸ“„ Checking local file: {file_path}")
            with open(file_path, 'rb') as f:
                file_io = io.BytesIO(f.read())
        elif url:
            print(f"šŸ“„ Checking URL: {url}")
            # httpx does not follow redirects by default, and
            # raise_for_status() rejects 3xx responses.
            response = httpx.get(url, follow_redirects=True)
            response.raise_for_status()
            file_io = io.BytesIO(response.content)
        else:
            print("āŒ Error: Please provide either file_path or url")
            return

        print("šŸ“¤ Uploading to Gemini File API...")

        # Upload to Gemini File API
        document = client.files.upload(
            file=file_io,
            config=dict(mime_type='application/pdf')
        )

        print("āœ… File uploaded successfully!")

        # Try to create a cache to check token count
        print("šŸ’¾ Attempting to create cache to check token count...")

        # Only the creation call is guarded here, so that failures while
        # reporting or cleaning up are not mislabelled as creation errors.
        try:
            cache = client.caches.create(
                model="gemini-2.0-flash-001",
                config=genai.types.CreateCachedContentConfig(
                    system_instruction="Test system instruction for token counting.",
                    contents=[document],
                )
            )
        except Exception as e:
            if "Cached content is too small" in str(e):
                print("āŒ Document is too small for caching")
                print(f"šŸ’” This document has fewer than {MIN_CACHE_TOKENS:,} tokens")
                print("šŸ“ Recommendations:")
                print(" • Upload a longer document")
                print(" • Combine multiple small documents")
                print(" • Use regular analysis without caching")
            else:
                print(f"āŒ Error creating cache: {e}")
            return

        try:
            _print_token_report(_cached_token_count(cache))
        finally:
            # Clean up even if reporting failed, so the test cache never leaks.
            print(f"\nšŸ—‘ļø Cleaning up test cache...")
            # The SDK takes `name` as a keyword argument.
            client.caches.delete(name=cache.name)
            print("āœ… Test cache deleted!")

    except Exception as e:
        print(f"āŒ Error: {e}")


def estimate_tokens_from_file_size(file_path):
    """Rough estimation of tokens based on file size.

    Prints the file size, the estimate, and whether the estimate reaches the
    caching minimum.

    Args:
        file_path: Path of the file to measure.

    Returns:
        The estimated token count (file size in bytes // 4), or None when the
        file size could not be read.
    """
    try:
        file_size = os.path.getsize(file_path)
    except (OSError, TypeError, ValueError) as e:
        print(f"āŒ Error estimating tokens: {e}")
        return None

    # Rough estimation: 1 token is roughly 4 characters, 1 character is
    # roughly 1 byte for text.
    # For PDFs, this is very rough as they contain formatting, images, etc.
    estimated_tokens = file_size // 4

    print(f"šŸ“ File size: {file_size:,} bytes")
    print(f"šŸ“Š Estimated tokens: {estimated_tokens:,}")

    if estimated_tokens >= MIN_CACHE_TOKENS:
        print("āœ… Likely meets caching requirements")
    else:
        print("āŒ Likely too small for caching")

    return estimated_tokens


def main():
    """Main function with interactive menu.

    Requires GOOGLE_API_KEY in the environment; otherwise prints an error and
    returns. Loops over the menu until the user chooses Exit, or until input
    ends (EOF / Ctrl-C).
    """
    print("šŸŽÆ Gemini API Document Token Checker")
    print("=" * 60)

    # Check if API key is set
    if not os.getenv('GOOGLE_API_KEY'):
        print("āŒ Error: GOOGLE_API_KEY not found in environment variables")
        print("Please set your API key in the .env file")
        return

    while True:
        print("\nšŸ“‹ Options:")
        print("1. Check local PDF file")
        print("2. Check PDF from URL")
        print("3. Estimate tokens from file size")
        print("4. Exit")

        try:
            choice = input("\nEnter your choice (1-4): ").strip()

            if choice == '1':
                file_path = input("Enter the path to your PDF file: ").strip()
                # isfile (not exists) so a directory is rejected up front.
                if os.path.isfile(file_path):
                    check_document_tokens(file_path=file_path)
                else:
                    print("āŒ File not found!")

            elif choice == '2':
                url = input("Enter the URL to your PDF: ").strip()
                check_document_tokens(url=url)

            elif choice == '3':
                file_path = input("Enter the path to your PDF file: ").strip()
                if os.path.isfile(file_path):
                    estimate_tokens_from_file_size(file_path)
                else:
                    print("āŒ File not found!")

            elif choice == '4':
                print("šŸ‘‹ Goodbye!")
                break

            else:
                print("āŒ Invalid choice. Please enter 1-4.")

        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C: leave the menu instead of crashing.
            print("\nšŸ‘‹ Goodbye!")
            break


if __name__ == "__main__":
    main()