File size: 13,224 Bytes
d66ab65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
<!DOCTYPE html>
<html lang="en">
<head>
    <!-- Charset comes first so the encoding is known before any text is parsed. -->
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Tokenizer Pro</title>
    <!-- Inline SVG favicon: white glyph on a blue disc. The fill colour's hash sign is percent-encoded as %23. -->
    <link rel="icon" href="data:image/svg+xml,<svg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 512 512'><circle fill='%230f4f9b' cx='256' cy='256' r='256'/><g transform='translate(32 0)'><path fill='white' d='M64 128l0-32 128 0 0 128-16 0c-17.7 0-32 14.3-32 32s14.3 32 32 32l96 0c17.7 0 32-14.3 32-32s-14.3-32-32-32l-16 0 0-128 128 0 0 32c0 17.7 14.3 32 32 32s32-14.3 32-32l0-48c0-26.5-21.5-48-48-48L224 32 48 32C21.5 32 0 53.5 0 80l0 48c0 17.7 14.3 32 32 32s32-14.3 32-32zM9.4 361.4c-12.5 12.5-12.5 32.8 0 45.3l64 64c9.2 9.2 22.9 11.9 34.9 6.9s19.8-16.6 19.8-29.6l0-32 192 0 0 32c0 12.9 7.8 24.6 19.8 29.6s25.7 2.2 34.9-6.9l64-64c12.5-12.5 12.5-32.8 0-45.3l-64-64c-9.2-9.2-22.9-11.9-34.9-6.9s-19.8 16.6-19.8 29.6l0 32-192 0 0-32c0-12.9-7.8-24.6-19.8-29.6s-25.7-2.2-34.9 6.9l-64 64z'/></g></svg>">
    <!-- jQuery is loaded synchronously in the head, ahead of the page script at the end of the body (presumably a dependency of js/main.js; confirm before adding defer/async). -->
    <script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
    <link rel="stylesheet" href="{{ url_for('static', filename='css/style.css') }}">
</head>
<body>
    <!-- Hidden File Drop Zone that appears when dragging files -->
    <div id="fileDropZone" class="file-drop-zone">
        <div class="drop-indicator">
            <!-- Decorative glyph; the paragraph below carries the message. -->
            <div class="file-icon" aria-hidden="true">📄</div>
            <p>Drop your file here</p>
        </div>
    </div>

    <!-- Loading overlay. The text is a polite status region so a change of message can be announced. -->
    <div id="loadingOverlay" class="loading-overlay">
        <div class="loading-content">
            <div class="loading-spinner large"></div>
            <div class="loading-text" id="loadingText" role="status">Analyzing text...</div>
        </div>
    </div>

    <!-- File upload icon in bottom left corner -->
    <!-- NOTE(review): this is a plain div, not a button; keyboard access depends on the page script. Confirm before adding a role. -->
    <div id="fileUploadIcon" class="file-upload-icon">
        <span>📎</span>
    </div>
    <!-- Rendered empty; presumably filled by the page script after a file is chosen (confirm in js/main.js). -->
    <p class="file-info" id="fileInfo" aria-live="polite"></p>

    <div class="container">
        <!-- Page header: title on one side, tokenizer model picker on the other. -->
        <div class="header">
            <div class="title-section">
                <h1 class="title">Tokenizer Pro</h1>
                <p class="subtitle">Advanced tokenization analysis and visualization</p>
            </div>
            <div class="model-selector">
                <div class="model-selector-header">
                    <!-- Two-way switch between the predefined list and a free-form model path. -->
                    <div class="model-type-toggle">
                        <div class="toggle-option predefined-toggle active" data-type="predefined">Predefined</div>
                        <div class="toggle-option custom-toggle" data-type="custom">Custom</div>
                    </div>
                </div>
                <!-- Predefined models: one option per entry of the server-supplied models mapping. -->
                <div id="predefinedModelSelector">
                    <div style="position: relative;">
                        <div class="tokenizer-info-icon" id="modelInfoIcon" title="View tokenizer information">ℹ</div>
                        <!-- Tooltip precedes the select in DOM order; it starts with a spinner placeholder. -->
                        <div class="tokenizer-info-tooltip" id="modelInfoTooltip">
                            <div id="tokenizerInfoContent">
                                <div class="tokenizer-info-loading">
                                    <div class="tokenizer-info-spinner"></div>
                                </div>
                            </div>
                        </div>
                        <!-- No visible label exists in this layout, so the control is named via aria-label. -->
                        <select id="modelSelect" name="model" aria-label="Predefined tokenizer model">
                            {% for model_id, info in models.items() %}
                            <option value="{{ model_id }}" {% if selected_model == model_id %}selected{% endif %}>
                                {{ info.alias }}
                            </option>
                            {% endfor %}
                        </select>
                    </div>
                </div>
                <!-- Custom model path entry; hidden by default via the inline display rule. -->
                <div id="customModelSelector" style="display: none;" class="custom-model-wrapper">
                    <div style="position: relative;">
                        <div class="tokenizer-info-icon" id="customModelInfoIcon" title="View tokenizer information">ℹ</div>
                        <div class="tokenizer-info-tooltip" id="customModelInfoTooltip">
                            <div id="customTokenizerInfoContent">
                                <div class="tokenizer-info-loading">
                                    <div class="tokenizer-info-spinner"></div>
                                </div>
                            </div>
                        </div>
                        <!-- The placeholder is not an accessible name, hence the aria-label. -->
                        <input type="text" id="customModelInput" class="custom-model-input"
                               placeholder="Enter HuggingFace model path"
                               aria-label="Custom HuggingFace model path"
                               value="{{ custom_model if custom_model and custom_model|length > 0 else '' }}">
                    </div>
                    <span class="custom-model-help">?</span>
                    <div class="tooltip">
                        Enter a valid HuggingFace model ID (e.g., "mistralai/Mistral-7B-Instruct-v0.3")
                        The model must have a tokenizer available and must be not restricted. (with some exceptions)
                        Also some models have restrictions. You can use mirrored versions, like unsloth to omit that.
                        Like ("unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit") instead of original path.
                    </div>
                    <div class="model-badge" id="modelSuccessBadge">Loaded</div>
                </div>
            </div>
        </div>

        <!-- Error banner. Guarded so a missing or None error renders as empty text rather than the word "None". -->
        <div class="error-message" id="errorMessage" role="alert">{{ error if error else '' }}</div>

        <!-- Main input: free text plus an optional file, posted back to the current URL as multipart form data. -->
        <div class="input-section">
            <div class="keyboard-shortcut-hint">Ctrl+Enter</div>
            <form id="analyzeForm" method="POST" enctype="multipart/form-data">
                <!-- The placeholder is only a hint, so the textarea is named via aria-label. -->
                <textarea name="text" id="textInput" aria-label="Text to analyze" placeholder="Enter text to analyze or upload a file in bottom left corner...">{{ text if text else '' }}</textarea>
                <!-- Hidden fields carry the model choice alongside the text. -->
                <input type="hidden" name="model" id="modelInput" value="{{ selected_model }}">
                <input type="hidden" name="custom_model" id="customModelInputHidden" value="{{ custom_model if custom_model else '' }}">
                <input type="hidden" name="model_type" id="modelTypeInput" value="{{ model_type if model_type else 'predefined' }}">
                <!-- The file input is hidden; presumably triggered from the upload icon by the page script. -->
                <input type="file" name="file" id="fileInput" style="display: none;">
                <div class="button-container">
                    <button type="submit" id="analyzeButton">Analyze Text</button>
                </div>
            </form>
        </div>

        <div id="results" class="results" {% if not token_data %}style="display: none;"{% endif %}>
            <!-- Token visualization card: coloured token spans plus search, paging notices and frequency chart. -->
            <div class="card">
                <div class="card-header">
                    <h2 class="card-title">Token Visualization</h2>
                    <!-- Icon-only button: aria-label gives the accessible name and the SVG is decorative. -->
                    <button type="button" class="search-toggle-btn" id="searchToggleBtn" title="Toggle token search" aria-label="Toggle token search" style="display: none;">
                        <svg width="16" height="16" viewBox="0 0 24 24" fill="currentColor" aria-hidden="true">
                            <path d="M15.5 14h-.79l-.28-.27C15.41 12.59 16 11.11 16 9.5 16 5.91 13.09 3 9.5 3S3 5.91 3 9.5 5.91 16 9.5 16c1.61 0 3.09-.59 4.23-1.57l.27.28v.79l5 4.99L20.49 19l-4.99-5zm-6 0C7.01 14 5 11.99 5 9.5S7.01 5 9.5 5 14 7.01 14 9.5 11.99 14 9.5 14z"/>
                        </svg>
                    </button>
                </div>
                <!-- Search bar; hidden by default via the inline display rule. -->
                <div class="token-search-container" id="tokenSearchContainer" style="display: none;">
                    <div class="token-search-row">
                        <input type="text" class="token-search-input" id="tokenSearchInput" placeholder="Search tokens..." aria-label="Search tokens">
                        <div class="token-search-controls">
                            <!-- Arrow glyphs were mis-encoded (UTF-8 bytes read as cp1252); restored to the intended triangles. -->
                            <button type="button" class="token-search-btn" id="prevMatch" aria-label="Previous match">◀</button>
                            <span class="token-search-count" id="searchCount">0/0</span>
                            <button type="button" class="token-search-btn" id="nextMatch" aria-label="Next match">▶</button>
                            <button type="button" class="token-search-btn" id="clearSearch">Clear</button>
                        </div>
                    </div>
                </div>
                <div class="preview-notice" id="previewNotice">
                    Note: Showing preview of first 8096 characters. Stats are calculated on the full file.
                </div>
                <!-- Server-side render of the tokens: one span per token, with a line break after tokens flagged as newline. -->
                <div class="token-container" id="tokenContainer">
                    {% if token_data %}
                    {% for token in token_data.tokens %}
                    <span class="token"
                          style="background-color: {{ token.colors.background }}; color: {{ token.colors.text }};"
                          title="Original token: {{ token.original }} | Token ID: {{ token.token_id }}">
                        {{ token.display }}
                    </span>
                    {% if token.newline %}<br>{% endif %}
                    {% endfor %}
                    {% endif %}
                </div>
                <button type="button" class="expand-button" id="expandButton">Show More</button>
                <div class="display-limit-notice" id="displayLimitNotice">
                    Note: Only showing first 50,000 tokens. Total token count: <span id="totalTokenCount">0</span>
                </div>
                
                <!-- Frequency chart; hidden by default, with an empty chart container. -->
                <div class="frequency-chart-container" id="frequencyChartContainer" style="display: none;">
                    <div class="frequency-chart-title">
                        <span>Top Token Frequencies</span>
                        <button type="button" class="chart-toggle-btn" id="toggleFrequencyChart">Show Chart</button>
                    </div>
                    <div class="frequency-chart" id="frequencyChart"></div>
                </div>
            </div>

            <!-- Summary statistics. Every value falls back to 0 when no token data was rendered. -->
            <div class="stats-grid">
                {% set basic_stats = token_data.stats.basic_stats if token_data else none %}
                {% set length_stats = token_data.stats.length_stats if token_data else none %}

                <!-- Total and unique token counts -->
                <div class="stat-card">
                    <div class="stat-title">Total Tokens</div>
                    <div class="stat-value" id="totalTokens">{{ basic_stats.total_tokens if token_data else 0 }}</div>
                    <div class="stat-description">
                        <span id="uniqueTokens">{{ basic_stats.unique_tokens if token_data else 0 }} unique</span>
                        (<span id="uniquePercentage">{{ basic_stats.unique_percentage if token_data else 0 }}</span>%)
                    </div>
                </div>

                <!-- Special token count -->
                <div class="stat-card">
                    <div class="stat-title">Token Types</div>
                    <div class="stat-value" id="specialTokens">{{ basic_stats.special_tokens if token_data else 0 }}</div>
                    <div class="stat-description">special tokens</div>
                </div>

                <!-- Whitespace: the headline value and the "spaces" figure both show space_tokens -->
                <div class="stat-card">
                    <div class="stat-title">Whitespace</div>
                    <div class="stat-value" id="spaceTokens">{{ basic_stats.space_tokens if token_data else 0 }}</div>
                    <div class="stat-description">
                        spaces: <span id="spaceCount">{{ basic_stats.space_tokens if token_data else 0 }}</span>,
                        newlines: <span id="newlineCount">{{ basic_stats.newline_tokens if token_data else 0 }}</span>
                    </div>
                </div>

                <!-- Token length: average, median and standard deviation -->
                <div class="stat-card">
                    <div class="stat-title">Token Length</div>
                    <div class="stat-value" id="avgLength">{{ length_stats.avg_length if token_data else 0 }}</div>
                    <div class="stat-description">
                        median: <span id="medianLength">{{ length_stats.median_length if token_data else 0 }}</span>,
                        ±<span id="stdDev">{{ length_stats.std_dev if token_data else 0 }}</span> std
                    </div>
                </div>

                <!-- Compression ratio, labelled as characters per token -->
                <div class="stat-card">
                    <div class="stat-title">Compression</div>
                    <div class="stat-value" id="compressionRatio">{{ basic_stats.compression_ratio if token_data else 0 }}</div>
                    <div class="stat-description">characters per token</div>
                </div>
            </div>
        </div>
    </div>
    <!-- Attribution link. It opens in a new tab, so rel is set to keep the opened page from reaching window.opener and to omit the referrer. -->
    <a href="https://huggingface.co/spaces/bartar/tokenizers" target="_blank" rel="noopener noreferrer" class="watermark">
        @bartar/tokenizers
    </a>

    <script>
        // Pass server data to client-side JavaScript.
        // The two string fields are HTML-escaped first (the same text the page emitted
        // before) and then JSON-encoded, so a newline, quote or backslash in a value
        // can no longer break or corrupt the string literal.
        window.tokenizerData = {
            model_type: {{ (model_type if model_type else 'predefined')|e|tojson }},
            error: {{ (error if error else '')|e|tojson }},
            // Tokenizer metadata as JSON, or null when no analysis result was rendered.
            tokenizer_info: {{ token_data.tokenizer_info|tojson if token_data and token_data.tokenizer_info else 'null' }}
        };
    </script>
    <script src="{{ url_for('static', filename='js/main.js') }}"></script>
</body>
</html>