|
|
|
""" |
|
Utility script to check if documents meet token requirements for Gemini API caching |
|
""" |
|
|
|
import os |
|
import io |
|
import httpx |
|
from google import genai |
|
from dotenv import load_dotenv |
|
|
|
|
|
# Load environment settings (the script later reads GOOGLE_API_KEY) from a
# local .env file before anything queries os.getenv().
load_dotenv()
|
|
|
def check_document_tokens(file_path=None, url=None, *,
                          model="gemini-2.0-flash-001", min_tokens=4096):
    """Report whether a PDF is large enough for Gemini context caching.

    The document is read from ``file_path`` (or downloaded from ``url``),
    uploaded through the Gemini File API, and a throwaway cache is created
    so the token count reported by the service can be compared with
    ``min_tokens``. Results and errors are printed rather than raised, and
    the temporary cache and uploaded file are removed afterwards.

    Args:
        file_path: Path to a local PDF. Takes precedence over ``url``.
        url: URL of a PDF; used only when ``file_path`` is falsy.
        model: Model name the test cache is created for.
        min_tokens: Token count the document must reach to be reported as
            suitable for caching.

    Returns:
        None.
    """
    print("π Document Token Checker")
    print("=" * 50)

    client = None
    document = None
    try:
        if file_path:
            print(f"π Checking local file: {file_path}")
            with open(file_path, 'rb') as f:
                file_io = io.BytesIO(f.read())
        elif url:
            print(f"π Checking URL: {url}")
            # httpx does not follow redirects unless asked, and a 3xx answer
            # would make raise_for_status() fail for an otherwise valid link.
            response = httpx.get(url, follow_redirects=True)
            response.raise_for_status()
            file_io = io.BytesIO(response.content)
        else:
            print("β Error: Please provide either file_path or url")
            return

        # The client is created only after the input is validated, and inside
        # the try, so a missing/invalid API key is reported like other errors.
        client = genai.Client(api_key=os.getenv('GOOGLE_API_KEY'))

        print("π€ Uploading to Gemini File API...")
        document = client.files.upload(
            file=file_io,
            config=dict(mime_type='application/pdf'),
        )
        print("✅ File uploaded successfully!")

        _report_cache_token_count(client, document, model, min_tokens)

    except Exception as e:
        print(f"β Error: {e}")

    finally:
        # The upload exists only for this check; do not leave it behind.
        if document is not None:
            try:
                client.files.delete(name=document.name)
            except Exception as e:
                print(f"Warning: could not delete uploaded file: {e}")


def _report_cache_token_count(client, document, model, min_tokens):
    """Create a throwaway cache for *document* and print its token count."""
    print("πΎ Attempting to create cache to check token count...")

    try:
        cache = client.caches.create(
            model=model,
            config=genai.types.CreateCachedContentConfig(
                system_instruction="Test system instruction for token counting.",
                contents=[document],
            ),
        )
    except Exception as e:
        if "Cached content is too small" in str(e):
            print("β Document is too small for caching")
            print(f"π‘ This document has fewer than {min_tokens:,} tokens")
            print("π Recommendations:")
            print(" β’ Upload a longer document")
            print(" β’ Combine multiple small documents")
            print(" β’ Use regular analysis without caching")
        else:
            print(f"β Error creating cache: {e}")
        return

    try:
        usage = cache.usage_metadata
        # The SDK exposes the cache size as `total_token_count`; the original
        # attribute name is kept as a fallback. `or` also maps None to 0 so
        # the formatted prints below cannot fail on a missing value.
        token_count = (
            getattr(usage, 'total_token_count', None)
            or getattr(usage, 'cached_token_count', None)
            or 0
        )

        print(f"π Token count: {token_count:,}")
        print(f"π Minimum required: {min_tokens:,}")

        if token_count >= min_tokens:
            print("✅ Document meets caching requirements!")
            print("π‘ This document is suitable for caching.")
            _print_cost_benefit(token_count)
        else:
            print("β Document does not meet caching requirements")
            print(f"π Need {min_tokens - token_count:,} more tokens")
            print("π‘ Consider:")
            print(" β’ Uploading a longer document")
            print(" β’ Combining multiple documents")
            print(" β’ Using regular analysis (without caching)")
    finally:
        # Always remove the test cache, even if reporting failed. The name
        # must be passed by keyword; a positional argument raises TypeError.
        print("\nποΈ Cleaning up test cache...")
        client.caches.delete(name=cache.name)
        print("✅ Test cache deleted!")


def _print_cost_benefit(token_count, question_counts=(5, 10, 20, 50)):
    """Print a table comparing token usage with and without caching."""
    if token_count <= 0:
        # Nothing to compare, and the savings ratio would divide by zero.
        return

    print("\nπ° Cost-Benefit Analysis:")
    print("Questions | Without Cache | With Cache | Savings")
    print("-" * 50)

    for q in question_counts:
        # Without a cache the whole document is sent with every question.
        without_cache = token_count * q
        # With a cache the document is counted once plus 50 tokens per
        # question (presumably a nominal prompt size; rough estimate only).
        with_cache = token_count + (50 * q)
        savings = ((without_cache - with_cache) / without_cache) * 100
        print(f"{q:9d} | {without_cache:12,} | {with_cache:10,} | {savings:6.1f}%")
|
|
|
def estimate_tokens_from_file_size(file_path, *, bytes_per_token=4, min_tokens=4096):
    """Print and return a rough token estimate derived from the file size.

    The estimate is simply ``file size // bytes_per_token``; it involves no
    parsing of the document and no API call, so treat it as a heuristic.

    Args:
        file_path: Path of the file to measure.
        bytes_per_token: Assumed average number of bytes per token.
        min_tokens: Token count at which the file is reported as likely
            meeting the caching requirement.

    Returns:
        The estimated token count, or None if the file size could not be
        read (the error is printed).

    Raises:
        ValueError: If ``bytes_per_token`` is not positive.
    """
    if bytes_per_token <= 0:
        raise ValueError("bytes_per_token must be positive")

    try:
        file_size = os.path.getsize(file_path)
    except (OSError, TypeError, ValueError) as e:
        # Missing/unreadable file or an invalid path value.
        print(f"β Error estimating tokens: {e}")
        return None

    estimated_tokens = file_size // bytes_per_token

    print(f"π File size: {file_size:,} bytes")
    print(f"π Estimated tokens: {estimated_tokens:,}")

    if estimated_tokens >= min_tokens:
        print("✅ Likely meets caching requirements")
    else:
        print("β Likely too small for caching")

    return estimated_tokens
|
|
|
def main():
    """Run the interactive menu for checking documents.

    Returns immediately with an error message when GOOGLE_API_KEY is not
    set. Otherwise loops over the menu until the user picks "Exit" or the
    input stream ends (EOF / Ctrl-C), in which case it exits cleanly.
    """
    print("π― Gemini API Document Token Checker")
    print("=" * 60)

    if not os.getenv('GOOGLE_API_KEY'):
        print("β Error: GOOGLE_API_KEY not found in environment variables")
        print("Please set your API key in the .env file")
        return

    while True:
        print("\nπ Options:")
        print("1. Check local PDF file")
        print("2. Check PDF from URL")
        print("3. Estimate tokens from file size")
        print("4. Exit")

        try:
            choice = input("\nEnter your choice (1-4): ").strip()

            if choice == '1':
                file_path = _prompt_for_pdf_path()
                if file_path is not None:
                    check_document_tokens(file_path=file_path)

            elif choice == '2':
                url = input("Enter the URL to your PDF: ").strip()
                check_document_tokens(url=url)

            elif choice == '3':
                file_path = _prompt_for_pdf_path()
                if file_path is not None:
                    estimate_tokens_from_file_size(file_path)

            elif choice == '4':
                print("π Goodbye!")
                break

            else:
                print("β Invalid choice. Please enter 1-4.")

        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C: leave the menu instead of crashing
            # with a traceback.
            print("\nπ Goodbye!")
            break


def _prompt_for_pdf_path():
    """Ask for a PDF path; return it if it is an existing file, else None."""
    file_path = os.path.expanduser(
        input("Enter the path to your PDF file: ").strip()
    )
    # isfile (not exists) so that a directory is rejected here rather than
    # producing a confusing error or a bogus size estimate later.
    if os.path.isfile(file_path):
        return file_path
    print("β File not found!")
    return None
|
|
|
# Start the interactive menu only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()