File size: 6,474 Bytes
1512726 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 |
#!/usr/bin/env python3
"""
Utility script to check if documents meet token requirements for Gemini API caching
"""
import os
import io
import httpx
from google import genai
from dotenv import load_dotenv
# Load environment variables
load_dotenv()
def check_document_tokens(
    file_path=None,
    url=None,
    *,
    model="gemini-2.0-flash-001",
    min_tokens=4096,
    mime_type="application/pdf",
):
    """Check if a document meets the minimum token requirements for caching.

    The document is read from ``file_path`` or downloaded from ``url``,
    uploaded through the Gemini File API, and a throw-away cache is created
    from it so the token count can be read back. The outcome is printed;
    nothing is returned. Both the test cache and the uploaded file are
    deleted again before the function returns.

    Args:
        file_path: Path to a local document. Takes precedence over ``url``.
        url: URL to download the document from when ``file_path`` is falsy.
        model: Model name used for the test cache.
        min_tokens: Token count the document is compared against.
        mime_type: MIME type declared for the upload.

    Errors while reading, downloading, uploading or caching are caught and
    reported on stdout rather than raised.
    """
    # Initialize client
    client = genai.Client(api_key=os.getenv('GOOGLE_API_KEY'))

    print("🔍 Document Token Checker")
    print("=" * 50)

    try:
        if file_path:
            print(f"📄 Checking local file: {file_path}")
            with open(file_path, 'rb') as f:
                file_io = io.BytesIO(f.read())
        elif url:
            print(f"🌐 Checking URL: {url}")
            # Download links are frequently redirected, and httpx does not
            # follow redirects unless asked to.
            response = httpx.get(url, follow_redirects=True, timeout=30.0)
            response.raise_for_status()
            file_io = io.BytesIO(response.content)
        else:
            print("❌ Error: Please provide either file_path or url")
            return

        print("📤 Uploading to Gemini File API...")
        document = client.files.upload(
            file=file_io,
            config=dict(mime_type=mime_type),
        )
        print("✅ File uploaded successfully!")

        try:
            _report_cache_eligibility(client, document, model, min_tokens)
        finally:
            # The upload exists only for this check; do not leave it behind.
            try:
                client.files.delete(name=document.name)
            except Exception as e:
                print(f"❌ Error deleting uploaded file: {e}")
    except Exception as e:
        print(f"❌ Error: {e}")


def _report_cache_eligibility(client, document, model, min_tokens):
    """Create a test cache from ``document``, print the verdict, delete the cache."""
    # Try to create a cache to check token count
    print("💾 Attempting to create cache to check token count...")
    try:
        cache = client.caches.create(
            model=model,
            config=genai.types.CreateCachedContentConfig(
                system_instruction="Test system instruction for token counting.",
                contents=[document],
            ),
        )
    except Exception as e:
        if "Cached content is too small" in str(e):
            print("❌ Document is too small for caching")
            print(f"💡 This document has fewer than {min_tokens:,} tokens")
            print("📋 Recommendations:")
            print(" • Upload a longer document")
            print(" • Combine multiple small documents")
            print(" • Use regular analysis without caching")
        else:
            print(f"❌ Error creating cache: {e}")
        return

    try:
        token_count = _cached_token_count(cache)
        print(f"📊 Token count: {token_count:,}")
        print(f"📏 Minimum required: {min_tokens:,}")

        if token_count >= min_tokens:
            print("✅ Document meets caching requirements!")
            print("💡 This document is suitable for caching.")
            _print_cost_benefit(token_count)
        else:
            print("❌ Document does not meet caching requirements")
            print(f"📈 Need {min_tokens - token_count:,} more tokens")
            print("💡 Consider:")
            print(" • Uploading a longer document")
            print(" • Combining multiple documents")
            print(" • Using regular analysis (without caching)")
    finally:
        # Clean up even if reporting failed, so the test cache never lingers.
        print("\n🗑️ Cleaning up test cache...")
        try:
            # ``name`` must be passed by keyword.
            client.caches.delete(name=cache.name)
            print("✅ Test cache deleted!")
        except Exception as e:
            print(f"❌ Error deleting test cache: {e}")


def _cached_token_count(cache):
    """Return the token count reported for ``cache``, or 0 when unavailable."""
    usage = getattr(cache, "usage_metadata", None)
    # ``total_token_count`` is the field expected on cache usage metadata;
    # the name this script originally read is kept as a fallback.
    for attr in ("total_token_count", "cached_token_count"):
        value = getattr(usage, attr, None)
        if value:
            return value
    return 0


def _print_cost_benefit(token_count, tokens_per_question=50):
    """Print a table comparing token usage with and without caching."""
    print("\n💰 Cost-Benefit Analysis:")
    print("Questions | Without Cache | With Cache | Savings")
    print("-" * 50)
    for q in (5, 10, 20, 50):
        # Without a cache the whole document is sent with every question;
        # with one it is counted once plus an assumed cost per question.
        without_cache = token_count * q
        with_cache = token_count + (tokens_per_question * q)
        savings = ((without_cache - with_cache) / without_cache) * 100
        print(f"{q:9d} | {without_cache:12,} | {with_cache:10,} | {savings:6.1f}%")
def estimate_tokens_from_file_size(file_path):
    """Roughly estimate a file's token count from its size on disk.

    Prints the file size, the estimate, and whether the estimate reaches the
    4,096-token caching minimum.

    Args:
        file_path: Path of the file to measure.

    Returns:
        The estimated token count, or ``None`` if the size could not be read
        (the error is printed instead of raised).
    """
    min_tokens = 4096

    try:
        file_size = os.path.getsize(file_path)
    except (OSError, TypeError, ValueError) as e:
        print(f"❌ Error estimating tokens: {e}")
        return None

    # Rough estimation: 1 token ≈ 4 characters, 1 character ≈ 1 byte for text
    # For PDFs, this is very rough as they contain formatting, images, etc.
    estimated_tokens = file_size // 4

    print(f"📁 File size: {file_size:,} bytes")
    print(f"📊 Estimated tokens: {estimated_tokens:,}")

    if estimated_tokens >= min_tokens:
        print("✅ Likely meets caching requirements")
    else:
        print("❌ Likely too small for caching")

    return estimated_tokens
def main():
    """Run the interactive menu of the token checker.

    Returns immediately with an error message when ``GOOGLE_API_KEY`` is not
    set. Otherwise it loops over the menu until the user picks "4", or until
    input ends (EOF) or is interrupted (Ctrl-C).
    """
    print("🎯 Gemini API Document Token Checker")
    print("=" * 60)

    # Check if API key is set
    if not os.getenv('GOOGLE_API_KEY'):
        print("❌ Error: GOOGLE_API_KEY not found in environment variables")
        print("Please set your API key in the .env file")
        return

    def ask_for_pdf_path():
        """Prompt for a PDF path; return it, or None if it is not a file."""
        # Paths pasted or dragged into a terminal often arrive quoted.
        path = input("Enter the path to your PDF file: ").strip().strip('"\'')
        # isfile rather than exists: a directory is not a usable document.
        if os.path.isfile(path):
            return path
        print("❌ File not found!")
        return None

    try:
        while True:
            print("\n📋 Options:")
            print("1. Check local PDF file")
            print("2. Check PDF from URL")
            print("3. Estimate tokens from file size")
            print("4. Exit")

            choice = input("\nEnter your choice (1-4): ").strip()

            if choice == '1':
                file_path = ask_for_pdf_path()
                if file_path is not None:
                    check_document_tokens(file_path=file_path)
            elif choice == '2':
                url = input("Enter the URL to your PDF: ").strip()
                check_document_tokens(url=url)
            elif choice == '3':
                file_path = ask_for_pdf_path()
                if file_path is not None:
                    estimate_tokens_from_file_size(file_path)
            elif choice == '4':
                print("👋 Goodbye!")
                break
            else:
                print("❌ Invalid choice. Please enter 1-4.")
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or Ctrl-C at a prompt: leave quietly, no traceback.
        print("\n👋 Goodbye!")
# Start the interactive menu only when executed as a script, not on import.
if __name__ == "__main__":
    main()