Spaces:

Omartificial-Intelligence-Space
/

context-caching-gemini-pdf-qa

Running

App Files Files Community

context-caching-gemini-pdf-qa / check_tokens.py

Omartificial-Intelligence-Space

Create check_tokens.py

1512726 verified 2 days ago

raw

history blame

6.47 kB

	#!/usr/bin/env python3
	"""
	Utility script to check if documents meet token requirements for Gemini API caching
	"""

	import os
	import io
	import httpx
	from google import genai
	from dotenv import load_dotenv

	# Load environment variables
	load_dotenv()

	def check_document_tokens(file_path=None, url=None):
	"""Check if a document meets the minimum token requirements for caching"""

	# Initialize client
	client = genai.Client(api_key=os.getenv('GOOGLE_API_KEY'))

	print("🔍 Document Token Checker")
	print("=" * 50)

	try:
	if file_path:
	print(f"📄 Checking local file: {file_path}")
	with open(file_path, 'rb') as f:
	file_content = f.read()
	file_io = io.BytesIO(file_content)
	document_name = file_path
	elif url:
	print(f"📄 Checking URL: {url}")
	response = httpx.get(url)
	response.raise_for_status()
	file_io = io.BytesIO(response.content)
	document_name = url
	else:
	print("❌ Error: Please provide either file_path or url")
	return

	print("📤 Uploading to Gemini File API...")

	# Upload to Gemini File API
	document = client.files.upload(
	file=file_io,
	config=dict(mime_type='application/pdf')
	)

	print("✅ File uploaded successfully!")

	# Try to create a cache to check token count
	print("💾 Attempting to create cache to check token count...")

	try:
	cache = client.caches.create(
	model="gemini-2.0-flash-001",
	config=genai.types.CreateCachedContentConfig(
	system_instruction="Test system instruction for token counting.",
	contents=[document],
	)
	)

	token_count = getattr(cache.usage_metadata, 'cached_token_count', 0)

	print(f"📊 Token count: {token_count:,}")
	print(f"📏 Minimum required: 4,096")

	if token_count >= 4096:
	print("✅ Document meets caching requirements!")
	print("💡 This document is suitable for caching.")

	# Calculate cost benefits
	questions = [5, 10, 20, 50]
	print("\n💰 Cost-Benefit Analysis:")
	print("Questions \| Without Cache \| With Cache \| Savings")
	print("-" * 50)

	for q in questions:
	without_cache = token_count * q
	with_cache = token_count + (50 * q) # Assuming 50 tokens per question
	savings = ((without_cache - with_cache) / without_cache) * 100
	print(f"{q:9d} \| {without_cache:12,} \| {with_cache:10,} \| {savings:6.1f}%")

	else:
	print("❌ Document does not meet caching requirements")
	print(f"📝 Need {4096 - token_count:,} more tokens")
	print("💡 Consider:")
	print(" • Uploading a longer document")
	print(" • Combining multiple documents")
	print(" • Using regular analysis (without caching)")

	# Clean up
	print(f"\n🗑️ Cleaning up test cache...")
	client.caches.delete(cache.name)
	print("✅ Test cache deleted!")

	except Exception as e:
	if "Cached content is too small" in str(e):
	print("❌ Document is too small for caching")
	print("💡 This document has fewer than 4,096 tokens")
	print("📝 Recommendations:")
	print(" • Upload a longer document")
	print(" • Combine multiple small documents")
	print(" • Use regular analysis without caching")
	else:
	print(f"❌ Error creating cache: {e}")

	except Exception as e:
	print(f"❌ Error: {e}")

	def estimate_tokens_from_file_size(file_path):
	    """Print a crude token estimate derived from the file's size on disk.

	    Uses the heuristic of one token per four bytes (about four characters
	    of plain text). For PDFs this is only a ballpark figure, since the file
	    also holds formatting, images and other non-text data. Any error while
	    reading the size is printed rather than raised.
	    """
	    try:
	        size_in_bytes = os.path.getsize(file_path)
	        # 1 token ≈ 4 characters, 1 character ≈ 1 byte for text.
	        approx_tokens = size_in_bytes // 4

	        verdict = (
	            "✅ Likely meets caching requirements"
	            if approx_tokens >= 4096
	            else "❌ Likely too small for caching"
	        )

	        print(f"📏 File size: {size_in_bytes:,} bytes")
	        print(f"📊 Estimated tokens: {approx_tokens:,}")
	        print(verdict)

	    except Exception as err:
	        print(f"❌ Error estimating tokens: {err}")

	def _prompt_for_existing_pdf():
	    """Ask for a PDF path; return it if it names an existing file, else None."""
	    file_path = input("Enter the path to your PDF file: ").strip()
	    # isfile (not exists) so that a directory is rejected up front.
	    if os.path.isfile(file_path):
	        return file_path
	    print("❌ File not found!")
	    return None


	def main():
	    """Run the interactive menu for checking documents.

	    Requires ``GOOGLE_API_KEY`` to be set in the environment; otherwise an
	    error is printed and the function returns immediately. The menu loops
	    until the user picks "4", or until input ends (Ctrl-D / Ctrl-C), which
	    is treated as a normal exit instead of a traceback.
	    """
	    print("🎯 Gemini API Document Token Checker")
	    print("=" * 60)

	    # Check if API key is set
	    if not os.getenv('GOOGLE_API_KEY'):
	        print("❌ Error: GOOGLE_API_KEY not found in environment variables")
	        print("Please set your API key in the .env file")
	        return

	    try:
	        while True:
	            print("\n📋 Options:")
	            print("1. Check local PDF file")
	            print("2. Check PDF from URL")
	            print("3. Estimate tokens from file size")
	            print("4. Exit")

	            choice = input("\nEnter your choice (1-4): ").strip()

	            if choice == '1':
	                file_path = _prompt_for_existing_pdf()
	                if file_path is not None:
	                    check_document_tokens(file_path=file_path)

	            elif choice == '2':
	                url = input("Enter the URL to your PDF: ").strip()
	                check_document_tokens(url=url)

	            elif choice == '3':
	                file_path = _prompt_for_existing_pdf()
	                if file_path is not None:
	                    estimate_tokens_from_file_size(file_path)

	            elif choice == '4':
	                print("👋 Goodbye!")
	                break

	            else:
	                print("❌ Invalid choice. Please enter 1-4.")
	    except (EOFError, KeyboardInterrupt):
	        # Closed stdin or Ctrl-C at a prompt: leave the menu quietly.
	        print("\n👋 Goodbye!")

	# Start the interactive menu only when this file is run as a script,
	# so importing the module does not prompt for input.
	if __name__ == "__main__":
	    main()