File size: 6,474 Bytes
1512726
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
#!/usr/bin/env python3
"""
Utility script to check if documents meet token requirements for Gemini API caching
"""

import os
import io
import httpx
from google import genai
from dotenv import load_dotenv

# Load environment variables via python-dotenv at import time, so that the
# os.getenv('GOOGLE_API_KEY') lookups further down can see a key kept in a
# .env file.
load_dotenv()

def check_document_tokens(file_path=None, url=None, *,
                          model="gemini-2.0-flash-001", min_tokens=4096):
    """Check if a document meets the minimum token requirements for caching.

    The document is read from ``file_path`` (preferred when both are given) or
    downloaded from ``url``, uploaded to the Gemini File API as a PDF, and a
    throw-away cache is created from it so the token count reported by the API
    can be compared against ``min_tokens``. All results are printed; nothing
    is returned and errors are reported on stdout rather than raised.

    Args:
        file_path: Path to a local PDF file, or None.
        url: URL of a PDF to download, or None.
        model: Model name used to create the test cache.
        min_tokens: Minimum number of tokens required for caching.
    """

    # Initialize client
    client = genai.Client(api_key=os.getenv('GOOGLE_API_KEY'))

    print("πŸ” Document Token Checker")
    print("=" * 50)

    document = None
    try:
        if file_path:
            print(f"πŸ“„ Checking local file: {file_path}")
            with open(file_path, 'rb') as f:
                file_io = io.BytesIO(f.read())
        elif url:
            print(f"πŸ“„ Checking URL: {url}")
            # Follow redirects: httpx does not by default, and a 3xx response
            # would otherwise fail at raise_for_status().
            response = httpx.get(url, follow_redirects=True)
            response.raise_for_status()
            file_io = io.BytesIO(response.content)
        else:
            print("❌ Error: Please provide either file_path or url")
            return

        print("πŸ“€ Uploading to Gemini File API...")

        # Upload to Gemini File API
        document = client.files.upload(
            file=file_io,
            config=dict(mime_type='application/pdf')
        )

        print("βœ… File uploaded successfully!")

        # Try to create a cache to check token count
        print("πŸ’Ύ Attempting to create cache to check token count...")

        cache = None
        try:
            cache = client.caches.create(
                model=model,
                config=genai.types.CreateCachedContentConfig(
                    system_instruction="Test system instruction for token counting.",
                    contents=[document],
                )
            )

            # The cache's usage metadata reports its size as total_token_count;
            # the legacy attribute name is kept as a fallback. A missing or
            # None value is treated as 0.
            usage = cache.usage_metadata
            token_count = (
                getattr(usage, 'total_token_count', None)
                or getattr(usage, 'cached_token_count', None)
                or 0
            )

            print(f"πŸ“Š Token count: {token_count:,}")
            print(f"πŸ“ Minimum required: {min_tokens:,}")

            if token_count >= min_tokens:
                print("βœ… Document meets caching requirements!")
                print("πŸ’‘ This document is suitable for caching.")

                # Calculate cost benefits
                print("\nπŸ’° Cost-Benefit Analysis:")
                print("Questions | Without Cache | With Cache | Savings")
                print("-" * 50)

                for q in (5, 10, 20, 50):
                    without_cache = token_count * q
                    with_cache = token_count + (50 * q)  # Assuming 50 tokens per question
                    # Guard the division in case min_tokens is set to 0.
                    savings = (
                        (without_cache - with_cache) / without_cache * 100
                        if without_cache else 0.0
                    )
                    print(f"{q:9d} | {without_cache:12,} | {with_cache:10,} | {savings:6.1f}%")

            else:
                print("❌ Document does not meet caching requirements")
                print(f"πŸ“ˆ Need {min_tokens - token_count:,} more tokens")
                print("πŸ’‘ Consider:")
                print("   β€’ Uploading a longer document")
                print("   β€’ Combining multiple documents")
                print("   β€’ Using regular analysis (without caching)")

        except Exception as e:
            if "Cached content is too small" in str(e):
                print("❌ Document is too small for caching")
                print(f"πŸ’‘ This document has fewer than {min_tokens:,} tokens")
                print("πŸ“ Recommendations:")
                print("   β€’ Upload a longer document")
                print("   β€’ Combine multiple small documents")
                print("   β€’ Use regular analysis without caching")
            else:
                print(f"❌ Error creating cache: {e}")
        finally:
            # Clean up the test cache even if reporting above failed, so it
            # does not linger on the account.
            if cache is not None:
                print(f"\nπŸ—‘οΈ Cleaning up test cache...")
                try:
                    # `name` is keyword-only in the google-genai SDK.
                    client.caches.delete(name=cache.name)
                    print("βœ… Test cache deleted!")
                except Exception as e:
                    print(f"❌ Error deleting test cache: {e}")

    except Exception as e:
        print(f"❌ Error: {e}")
    finally:
        # Best-effort removal of the uploaded file; it was only needed for
        # the token check.
        if document is not None:
            try:
                client.files.delete(name=document.name)
            except Exception as e:
                print(f"❌ Error deleting uploaded file: {e}")

def estimate_tokens_from_file_size(file_path):
    """Print a crude token estimate for a file, derived from its size on disk.

    The estimate is the file size in bytes divided by four (integer division)
    and is compared against the 4,096-token caching minimum. Results are
    printed; nothing is returned. Any failure is reported on stdout instead
    of being raised.
    """
    try:
        size_bytes = os.path.getsize(file_path)

        # Heuristic: about 4 characters per token and 1 byte per character.
        # Very rough for PDFs, which also hold formatting, images, etc.
        token_guess = size_bytes // 4

        verdict = (
            "βœ… Likely meets caching requirements"
            if token_guess >= 4096
            else "❌ Likely too small for caching"
        )

        report = (
            f"πŸ“ File size: {size_bytes:,} bytes",
            f"πŸ“Š Estimated tokens: {token_guess:,}",
            verdict,
        )
        for line in report:
            print(line)

    except Exception as err:
        print(f"❌ Error estimating tokens: {err}")

def main():
    """Run the interactive menu of the document token checker.

    Returns immediately with an error message when GOOGLE_API_KEY is not set
    in the environment. Otherwise loops over the menu until the user picks
    "Exit", stdin is closed (EOF), or the user presses Ctrl-C while being
    prompted.
    """

    print("🎯 Gemini API Document Token Checker")
    print("=" * 60)

    # Check if API key is set
    if not os.getenv('GOOGLE_API_KEY'):
        print("❌ Error: GOOGLE_API_KEY not found in environment variables")
        print("Please set your API key in the .env file")
        return

    while True:
        print("\nπŸ“‹ Options:")
        print("1. Check local PDF file")
        print("2. Check PDF from URL")
        print("3. Estimate tokens from file size")
        print("4. Exit")

        file_path = url = None
        try:
            choice = input("\nEnter your choice (1-4): ").strip()
            if choice in ('1', '3'):
                file_path = input("Enter the path to your PDF file: ").strip()
            elif choice == '2':
                url = input("Enter the URL to your PDF: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin (piped input, Ctrl-D) or Ctrl-C at a prompt: leave
            # the menu cleanly instead of crashing with a traceback.
            print("\nπŸ‘‹ Goodbye!")
            break

        if choice in ('1', '3'):
            # isfile, not exists: a directory path is not a usable document.
            if not os.path.isfile(file_path):
                print("❌ File not found!")
            elif choice == '1':
                check_document_tokens(file_path=file_path)
            else:
                estimate_tokens_from_file_size(file_path)

        elif choice == '2':
            check_document_tokens(url=url)

        elif choice == '4':
            print("πŸ‘‹ Goodbye!")
            break

        else:
            print("❌ Invalid choice. Please enter 1-4.")

# Start the interactive menu only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()