Spaces:
Running
Running
File size: 13,224 Bytes
d66ab65 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 |
<!DOCTYPE html>
<html lang="en">
<head>
  <!-- The encoding declaration comes first so it is seen before any other head content. -->
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Tokenizer Pro</title>
  <!-- Inline SVG favicon: a white glyph on a blue circle. The "#" of the fill colour is percent-encoded
       as %23 because a raw "#" would start the URL fragment. Every "c" curve segment in the path data
       needs six numbers; a truncated segment makes the renderer stop drawing at that point. -->
  <link rel="icon" href="data:image/svg+xml,<svg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 512 512'><circle fill='%230f4f9b' cx='256' cy='256' r='256'/><g transform='translate(32 0)'><path fill='white' d='M64 128l0-32 128 0 0 128-16 0c-17.7 0-32 14.3-32 32s14.3 32 32 32l96 0c17.7 0 32-14.3 32-32s-14.3-32-32-32l-16 0 0-128 128 0 0 32c0 17.7 14.3 32 32 32s32-14.3 32-32l0-48c0-26.5-21.5-48-48-48L224 32 48 32C21.5 32 0 53.5 0 80l0 48c0 17.7 14.3 32 32 32s32-14.3 32-32zM9.4 361.4c-12.5 12.5-12.5 32.8 0 45.3l64 64c9.2 9.2 22.9 11.9 34.9 6.9s19.8-16.6 19.8-29.6l0-32 192 0 0 32c0 12.9 7.8 24.6 19.8 29.6s25.7 2.2 34.9-6.9l64-64c12.5-12.5 12.5-32.8 0-45.3l-64-64c-9.2-9.2-22.9-11.9-34.9-6.9s-19.8 16.6-19.8 29.6l0 32-192 0 0-32c0-12.9-7.8-24.6-19.8-29.6s-25.7-2.2-34.9 6.9l-64 64z'/></g></svg>">
  <!-- Third-party assets. jQuery is loaded without defer/async, so it executes before the rest of the
       document is parsed. -->
  <script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
  <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
  <!-- Application stylesheet; its URL is produced by url_for('static', ...) when the template renders. -->
  <link rel="stylesheet" href="{{ url_for('static', filename='css/style.css') }}">
</head>
<body>
<!-- Drop target for dragged files: an icon plus a one-line prompt. Presumably only revealed while a
     file is being dragged over the page (visibility is not controlled in this markup). -->
<div class="file-drop-zone" id="fileDropZone">
  <div class="drop-indicator">
    <div class="file-icon">📄</div>
    <p>Drop your file here</p>
  </div>
</div>
<!-- Busy indicator: a large spinner with a status line (#loadingText, initially "Analyzing text...") -->
<div class="loading-overlay" id="loadingOverlay">
  <div class="loading-content">
    <div class="loading-spinner large"></div>
    <div id="loadingText" class="loading-text">Analyzing text...</div>
  </div>
</div>
<!-- Paperclip control for picking a file to upload (intended for the bottom-left corner of the page),
     followed by an initially empty paragraph, presumably filled in with details of the chosen file. -->
<div class="file-upload-icon" id="fileUploadIcon">
  <span>📎</span>
</div>
<p id="fileInfo" class="file-info"></p>
<div class="container">
<div class="header">
<!-- Page heading and tagline -->
<div class="title-section">
  <h1 class="title">Tokenizer Pro</h1>
  <p class="subtitle">Advanced tokenization analysis and visualization</p>
</div>
<div class="model-selector">
<!-- Switch between the predefined model list and a custom model path. Each option carries its mode in
     data-type; "Predefined" is the one marked active in the initial markup. -->
<div class="model-selector-header">
  <div class="model-type-toggle">
    <div class="toggle-option predefined-toggle active" data-type="predefined">Predefined</div>
    <div class="toggle-option custom-toggle" data-type="custom">Custom</div>
  </div>
</div>
<!-- Predefined-model picker. The wrapper's position: relative makes it the containing block for any
     absolutely positioned children (presumably the info icon and its tooltip). The tooltip starts with a
     spinner placeholder inside #tokenizerInfoContent. The select comes after the icon and tooltip in
     source order, and carries an aria-label because no label element is associated with it. One option
     is rendered per entry of "models"; the one whose id equals selected_model is preselected. -->
<div id="predefinedModelSelector">
  <div style="position: relative;">
    <div class="tokenizer-info-icon" id="modelInfoIcon" title="View tokenizer information">ℹ</div>
    <div class="tokenizer-info-tooltip" id="modelInfoTooltip">
      <div id="tokenizerInfoContent">
        <div class="tokenizer-info-loading">
          <div class="tokenizer-info-spinner"></div>
        </div>
      </div>
    </div>
    <select id="modelSelect" name="model" aria-label="Tokenizer model">
      {% for model_id, info in models.items() %}
      <option value="{{ model_id }}" {% if selected_model == model_id %}selected{% endif %}>
        {{ info.alias }}
      </option>
      {% endfor %}
    </select>
  </div>
</div>
<!-- Custom-model picker; hidden initially via inline display: none. It mirrors the predefined picker:
     an info icon, a tooltip that starts with a spinner placeholder, then the control itself. The text
     input carries an aria-label because its placeholder is the only visible hint, and its value falls
     back to an empty string when custom_model is missing or empty (same guard as the hidden
     custom_model form field). After it come a "?" help marker, its help text and a "Loaded" badge. -->
<div id="customModelSelector" style="display: none;" class="custom-model-wrapper">
  <div style="position: relative;">
    <div class="tokenizer-info-icon" id="customModelInfoIcon" title="View tokenizer information">ℹ</div>
    <div class="tokenizer-info-tooltip" id="customModelInfoTooltip">
      <div id="customTokenizerInfoContent">
        <div class="tokenizer-info-loading">
          <div class="tokenizer-info-spinner"></div>
        </div>
      </div>
    </div>
    <input type="text" id="customModelInput" class="custom-model-input"
           placeholder="Enter HuggingFace model path"
           aria-label="HuggingFace model path"
           value="{{ custom_model if custom_model else '' }}">
  </div>
  <span class="custom-model-help">?</span>
  <div class="tooltip">
    Enter a valid HuggingFace model ID (e.g., "mistralai/Mistral-7B-Instruct-v0.3")
    The model must have a tokenizer available and must be not restricted. (with some exceptions)
    Also some models have restrictions. You can use mirrored versions, like unsloth to omit that.
    Like ("unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit") instead of original path.
  </div>
  <div class="model-badge" id="modelSuccessBadge">Loaded</div>
</div>
</div>
</div>
<!-- Error banner, pre-filled with the template's "error" value. role="alert" makes it a live region, so
     a message written into it after page load is announced by assistive technology. -->
<div class="error-message" id="errorMessage" role="alert">{{ error }}</div>
<!-- Text entry form. It posts as multipart/form-data because it includes a file input (kept out of
     sight with inline display: none). The hidden fields submit the model choice: predefined model id,
     custom model path, and which of the two modes is active (defaulting to "predefined").
     The textarea must stay on a single line: whitespace between its tags would become part of its value.
     Its aria-label provides an accessible name, since the placeholder is the only visible hint. -->
<div class="input-section">
  <div class="keyboard-shortcut-hint">Ctrl+Enter</div>
  <form id="analyzeForm" method="POST" enctype="multipart/form-data">
    <textarea name="text" id="textInput" aria-label="Text to analyze" placeholder="Enter text to analyze or upload a file in bottom left corner...">{{ text }}</textarea>
    <input type="hidden" name="model" id="modelInput" value="{{ selected_model }}">
    <input type="hidden" name="custom_model" id="customModelInputHidden" value="{{ custom_model if custom_model else '' }}">
    <input type="hidden" name="model_type" id="modelTypeInput" value="{{ model_type if model_type else 'predefined' }}">
    <input type="file" name="file" id="fileInput" style="display: none;">
    <div class="button-container">
      <button type="submit" id="analyzeButton">Analyze Text</button>
    </div>
  </form>
</div>
<div id="results" class="results" {% if not token_data %}style="display: none;"{% endif %}>
<div class="card">
<!-- Card heading plus the search toggle. The button shows only a magnifier icon, so aria-label supplies
     its accessible name (a title attribute alone is not reliably exposed) and the decorative SVG is
     hidden from assistive technology. The button starts hidden via inline display: none. -->
<div class="card-header">
  <h2 class="card-title">Token Visualization</h2>
  <button type="button" class="search-toggle-btn" id="searchToggleBtn" title="Toggle token search" aria-label="Toggle token search" style="display: none;">
    <svg width="16" height="16" viewBox="0 0 24 24" fill="currentColor" aria-hidden="true">
      <path d="M15.5 14h-.79l-.28-.27C15.41 12.59 16 11.11 16 9.5 16 5.91 13.09 3 9.5 3S3 5.91 3 9.5 5.91 16 9.5 16c1.61 0 3.09-.59 4.23-1.57l.27.28v.79l5 4.99L20.49 19l-4.99-5zm-6 0C7.01 14 5 11.99 5 9.5S7.01 5 9.5 5 14 7.01 14 9.5 11.99 14 9.5 14z"/>
    </svg>
  </button>
</div>
<!-- Token search bar; hidden initially via inline display: none. It holds a query field,
     previous/next match buttons around a "current/total" counter (initially 0/0), and a clear button.
     The arrow glyphs are literal UTF-8 characters, matching the charset the document declares. The
     arrow buttons and the query field carry aria-labels because a glyph or placeholder alone does not
     name them. Every button declares type="button" so it can never act as a submit button. -->
<div class="token-search-container" id="tokenSearchContainer" style="display: none;">
  <div class="token-search-row">
    <input type="text" class="token-search-input" id="tokenSearchInput" placeholder="Search tokens..." aria-label="Search tokens">
    <div class="token-search-controls">
      <button type="button" class="token-search-btn" id="prevMatch" aria-label="Previous match">◀</button>
      <span class="token-search-count" id="searchCount">0/0</span>
      <button type="button" class="token-search-btn" id="nextMatch" aria-label="Next match">▶</button>
      <button type="button" class="token-search-btn" id="clearSearch">Clear</button>
    </div>
  </div>
</div>
<!-- Notice shown for file previews: the text is truncated, the statistics are not -->
<div id="previewNotice" class="preview-notice">
  Note: Showing preview of first 8096 characters. Stats are calculated on the full file.
</div>
<!-- Server-rendered token list. When token_data is set, one span.token is emitted per entry of
     token_data.tokens: it is coloured with the entry's background/text colours, shows the entry's
     display text, and exposes the original token and its token id in the title attribute. A line
     break follows every entry whose "newline" flag is truthy. Without token_data the container is
     rendered empty. NOTE(review): the spans are inline, so whitespace between them is significant
     for layout; take care when reformatting this loop. -->
<div class="token-container" id="tokenContainer">
{% if token_data %}
{% for token in token_data.tokens %}
<span class="token"
style="background-color: {{ token.colors.background }}; color: {{ token.colors.text }};"
title="Original token: {{ token.original }} | Token ID: {{ token.token_id }}">
{{ token.display }}
</span>
{% if token.newline %}<br>{% endif %}
{% endfor %}
{% endif %}
</div>
<!-- Expander for the token view, followed by the truncation notice. #totalTokenCount starts at 0 and
     is presumably updated by script once the real total is known. -->
<button id="expandButton" class="expand-button">Show More</button>
<div id="displayLimitNotice" class="display-limit-notice">
  Note: Only showing first 50,000 tokens. Total token count: <span id="totalTokenCount">0</span>
</div>
<!-- Token frequency panel; hidden initially via inline display: none. It has a title row with a
     show/hide button and an empty #frequencyChart element that serves as the chart's mount point. -->
<div id="frequencyChartContainer" class="frequency-chart-container" style="display: none;">
  <div class="frequency-chart-title">
    <span>Top Token Frequencies</span>
    <button id="toggleFrequencyChart" class="chart-toggle-btn">Show Chart</button>
  </div>
  <div id="frequencyChart" class="frequency-chart"></div>
</div>
</div>
<!-- Summary statistics, one card each: total tokens (with unique count and percentage), special
     tokens, whitespace (spaces and newlines), token length (average, median, standard deviation) and
     compression (characters per token). Every value is read from token_data.stats and falls back to 0
     when token_data is not set. -->
<div class="stats-grid">
  <div class="stat-card">
    <div class="stat-title">Total Tokens</div>
    <div class="stat-value" id="totalTokens">{{ token_data.stats.basic_stats.total_tokens if token_data else 0 }}</div>
    <div class="stat-description">
      <span id="uniqueTokens">{{ token_data.stats.basic_stats.unique_tokens if token_data else 0 }} unique</span>
      (<span id="uniquePercentage">{{ token_data.stats.basic_stats.unique_percentage if token_data else 0 }}</span>%)
    </div>
  </div>
  <div class="stat-card">
    <div class="stat-title">Token Types</div>
    <div class="stat-value" id="specialTokens">{{ token_data.stats.basic_stats.special_tokens if token_data else 0 }}</div>
    <div class="stat-description">special tokens</div>
  </div>
  <div class="stat-card">
    <div class="stat-title">Whitespace</div>
    <div class="stat-value" id="spaceTokens">{{ token_data.stats.basic_stats.space_tokens if token_data else 0 }}</div>
    <div class="stat-description">
      spaces: <span id="spaceCount">{{ token_data.stats.basic_stats.space_tokens if token_data else 0 }}</span>,
      newlines: <span id="newlineCount">{{ token_data.stats.basic_stats.newline_tokens if token_data else 0 }}</span>
    </div>
  </div>
  <div class="stat-card">
    <div class="stat-title">Token Length</div>
    <div class="stat-value" id="avgLength">{{ token_data.stats.length_stats.avg_length if token_data else 0 }}</div>
    <div class="stat-description">
      median: <span id="medianLength">{{ token_data.stats.length_stats.median_length if token_data else 0 }}</span>,
      ±<span id="stdDev">{{ token_data.stats.length_stats.std_dev if token_data else 0 }}</span> std
    </div>
  </div>
  <div class="stat-card">
    <div class="stat-title">Compression</div>
    <div class="stat-value" id="compressionRatio">{{ token_data.stats.basic_stats.compression_ratio if token_data else 0 }}</div>
    <div class="stat-description">characters per token</div>
  </div>
</div>
</div>
</div>
<!-- Attribution link that opens in a new tab. rel="noopener noreferrer" stops the opened page from
     reaching this one through window.opener and withholds the Referer header. -->
<a href="https://huggingface.co/spaces/bartar/tokenizers" target="_blank" rel="noopener noreferrer" class="watermark">
  @bartar/tokenizers
</a>
<!-- Page script, loaded at the end of the body so the markup above is already parsed when it runs.
     Its URL is produced by url_for('static', ...) when the template renders. -->
<script src="{{ url_for('static', filename='js/main.js') }}"></script>
</body>
</html>