<!doctype html>
<html> | |
<head>
  <!-- Encoding is declared first so it is seen before any other head content. -->
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Tokenizer Pro</title>
  <!-- Inline SVG favicon: blue disc with a white glyph. '#' in the fill colour is
       percent-encoded as %23 so it is not read as a URL fragment. -->
  <link rel="icon" href="data:image/svg+xml,<svg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 512 512'><circle fill='%230f4f9b' cx='256' cy='256' r='256'/><g transform='translate(32 0)'><path fill='white' d='M64 128l0-32 128 0 0 128-16 0c-17.7 0-32 14.3-32 32s14.3 32 32 32l96 0c17.7 0 32-14.3 32-32s-14.3-32-32-32l-16 0 0-128 128 0 0 32c0 17.7 14.3 32 32 32s32-14.3 32-32l0-48c0-26.5-21.5-48-48-48L224 32 48 32C21.5 32 0 53.5 0 80l0 48c0 17.7 14.3 32 32 32s32-14.3 32-32zM9.4 361.4c-12.5 12.5-12.5 32.8 0 45.3l64 64c9.2 9.2 22.9 11.9 34.9 6.9s19.8-16.6 19.8-29.6l0-32 192 0 0 32c0 12.9 7.8 24.6 19.8 29.6s25.7 2.2 34.9-6.9l64-64c12.5-12.5 12.5-32.8 0-45.3l-64-64c-9.2-9.2-22.9-11.9-34.9-6.9s-19.8 16.6-19.8 29.6l0 32-192 0 0-32c0-12.9-7.8-24.6-19.8-29.6s-25.7-2.2-34.9 6.9l-64 64z'/></g></svg>">
  <!-- NOTE(review): jQuery is left as a blocking script because js/main.js is loaded
       as a classic script at the end of the body and presumably depends on it;
       confirm before adding defer or async here. -->
  <script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
  <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
  <link rel="stylesheet" href="{{ url_for('static', filename='css/style.css') }}">
</head>
<body> | |
<!-- Drop target for dragged files. Presumably revealed by script while a file is
     dragged over the page; the toggling code is not part of this template. -->
<div id="fileDropZone" class="file-drop-zone">
  <div class="drop-indicator">
    <!-- Decorative glyph; the paragraph below carries the actual message. -->
    <div class="file-icon" aria-hidden="true">📄</div>
    <p>Drop your file here</p>
  </div>
</div>
<!-- Loading overlay. The text element is a status region so that changes to the
     message can be announced by assistive technology. -->
<div id="loadingOverlay" class="loading-overlay">
  <div class="loading-content">
    <!-- Purely visual spinner; hidden from the accessibility tree. -->
    <div class="loading-spinner large" aria-hidden="true"></div>
    <div class="loading-text" id="loadingText" role="status">Analyzing text...</div>
  </div>
</div>
<!-- File upload icon in bottom left corner.
     NOTE(review): this is a plain div, so it is not reachable by keyboard. Turning
     it into a button element would fix that, but needs a check against the
     .file-upload-icon styles and the click handler, neither of which is visible here. -->
<div id="fileUploadIcon" class="file-upload-icon" title="Upload a file">
  <span>📎</span>
</div>
<!-- Starts empty; presumably filled by script with details of the chosen file. -->
<p class="file-info" id="fileInfo" role="status"></p>
<div class="container"> | |
<!-- Page header: title plus the tokenizer picker (predefined list or custom path). -->
<div class="header">
  <div class="title-section">
    <h1 class="title">Tokenizer Pro</h1>
    <p class="subtitle">Advanced tokenization analysis and visualization</p>
  </div>
  <div class="model-selector">
    <div class="model-selector-header">
      <!-- Mode switch; data-type names the mode each option stands for. -->
      <div class="model-type-toggle">
        <div class="toggle-option predefined-toggle active" data-type="predefined">Predefined</div>
        <div class="toggle-option custom-toggle" data-type="custom">Custom</div>
      </div>
    </div>
    <div id="predefinedModelSelector">
      <div style="position: relative;">
        <div class="tokenizer-info-icon" id="modelInfoIcon" title="View tokenizer information">ℹ</div>
        <!-- The tooltip follows its icon directly and precedes the select; keep this order. -->
        <div class="tokenizer-info-tooltip" id="modelInfoTooltip">
          <div id="tokenizerInfoContent">
            <div class="tokenizer-info-loading">
              <div class="tokenizer-info-spinner"></div>
            </div>
          </div>
        </div>
        <!-- One option per entry of the models mapping; the entry matching
             selected_model is pre-selected. -->
        <select id="modelSelect" name="model" aria-label="Tokenizer model">
          {% for model_id, info in models.items() %}
          <option value="{{ model_id }}" {% if selected_model == model_id %}selected{% endif %}>
            {{ info.alias }}
          </option>
          {% endfor %}
        </select>
      </div>
    </div>
    <!-- Custom model entry; rendered hidden. -->
    <div id="customModelSelector" style="display: none;" class="custom-model-wrapper">
      <div style="position: relative;">
        <div class="tokenizer-info-icon" id="customModelInfoIcon" title="View tokenizer information">ℹ</div>
        <div class="tokenizer-info-tooltip" id="customModelInfoTooltip">
          <div id="customTokenizerInfoContent">
            <div class="tokenizer-info-loading">
              <div class="tokenizer-info-spinner"></div>
            </div>
          </div>
        </div>
        <!-- The placeholder is only a hint, so aria-label supplies the accessible name.
             The value falls back to an empty string when custom_model is unset. -->
        <input type="text" id="customModelInput" class="custom-model-input"
               placeholder="Enter HuggingFace model path"
               aria-label="HuggingFace model path"
               value="{{ custom_model if custom_model else '' }}">
      </div>
      <!-- The help marker and its tooltip stay adjacent siblings. -->
      <span class="custom-model-help">?</span>
      <div class="tooltip">
        Enter a valid HuggingFace model ID (e.g., "mistralai/Mistral-7B-Instruct-v0.3")
        The model must have a tokenizer available and must be not restricted. (with some exceptions)
        Also some models have restrictions. You can use mirrored versions, like unsloth to omit that.
        Like ("unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit") instead of original path.
      </div>
      <div class="model-badge" id="modelSuccessBadge">Loaded</div>
    </div>
  </div>
</div>
<!-- Error banner. The guard keeps a None value from being printed as the literal
     text "None"; role="alert" lets newly shown errors be announced. -->
<div class="error-message" id="errorMessage" role="alert">{{ error if error else '' }}</div>
<!-- Text entry form. Posts as multipart/form-data so the optional file input can be
     sent with the text. -->
<div class="input-section">
  <div class="keyboard-shortcut-hint">Ctrl+Enter</div>
  <form id="analyzeForm" method="POST" enctype="multipart/form-data">
    <!-- No whitespace is placed inside the textarea: it would become part of its value.
         The guard avoids printing "None" when text is unset. -->
    <textarea name="text" id="textInput" aria-label="Text to analyze" placeholder="Enter text to analyze or upload a file in bottom left corner...">{{ text if text else '' }}</textarea>
    <!-- Hidden fields carry the model choice, since the visible model controls sit
         outside this form. -->
    <input type="hidden" name="model" id="modelInput" value="{{ selected_model }}">
    <input type="hidden" name="custom_model" id="customModelInputHidden" value="{{ custom_model if custom_model else '' }}">
    <input type="hidden" name="model_type" id="modelTypeInput" value="{{ model_type if model_type else 'predefined' }}">
    <input type="file" name="file" id="fileInput" style="display: none;">
    <div class="button-container">
      <button type="submit" id="analyzeButton">Analyze Text</button>
    </div>
  </form>
</div>
<div id="results" class="results" {% if not token_data %}style="display: none;"{% endif %}> | |
<!-- Token visualization card: search controls, rendered tokens, and the frequency chart. -->
<div class="card">
  <div class="card-header">
    <h2 class="card-title">Token Visualization</h2>
    <!-- Icon-only button: aria-label gives it a name, the SVG itself is decorative. -->
    <button type="button" class="search-toggle-btn" id="searchToggleBtn" title="Toggle token search" aria-label="Toggle token search" style="display: none;">
      <svg width="16" height="16" viewBox="0 0 24 24" fill="currentColor" aria-hidden="true">
        <path d="M15.5 14h-.79l-.28-.27C15.41 12.59 16 11.11 16 9.5 16 5.91 13.09 3 9.5 3S3 5.91 3 9.5 5.91 16 9.5 16c1.61 0 3.09-.59 4.23-1.57l.27.28v.79l5 4.99L20.49 19l-4.99-5zm-6 0C7.01 14 5 11.99 5 9.5S7.01 5 9.5 5 14 7.01 14 9.5 11.99 14 9.5 14z"/>
      </svg>
    </button>
  </div>
  <div class="token-search-container" id="tokenSearchContainer" style="display: none;">
    <div class="token-search-row">
      <input type="text" class="token-search-input" id="tokenSearchInput" placeholder="Search tokens..." aria-label="Search tokens">
      <div class="token-search-controls">
        <!-- Explicit type="button" so these can never act as submit buttons. -->
        <button type="button" class="token-search-btn" id="prevMatch" aria-label="Previous match">◀</button>
        <span class="token-search-count" id="searchCount">0/0</span>
        <button type="button" class="token-search-btn" id="nextMatch" aria-label="Next match">▶</button>
        <button type="button" class="token-search-btn" id="clearSearch">Clear</button>
      </div>
    </div>
  </div>
  <div class="preview-notice" id="previewNotice">
    Note: Showing preview of first 8096 characters. Stats are calculated on the full file.
  </div>
  <!-- Server-side render of the tokens: one coloured span per token, followed by a
       line break wherever token.newline is set. -->
  <div class="token-container" id="tokenContainer">
    {% if token_data %}
    {% for token in token_data.tokens %}
    <span class="token"
          style="background-color: {{ token.colors.background }}; color: {{ token.colors.text }};"
          title="Original token: {{ token.original }} | Token ID: {{ token.token_id }}">
      {{ token.display }}
    </span>
    {% if token.newline %}<br>{% endif %}
    {% endfor %}
    {% endif %}
  </div>
  <button type="button" class="expand-button" id="expandButton">Show More</button>
  <div class="display-limit-notice" id="displayLimitNotice">
    Note: Only showing first 50,000 tokens. Total token count: <span id="totalTokenCount">0</span>
  </div>
  <div class="frequency-chart-container" id="frequencyChartContainer" style="display: none;">
    <div class="frequency-chart-title">
      <span>Top Token Frequencies</span>
      <button type="button" class="chart-toggle-btn" id="toggleFrequencyChart">Show Chart</button>
    </div>
    <div class="frequency-chart" id="frequencyChart"></div>
  </div>
</div>
<!-- Summary statistics. Each value is read from token_data.stats and falls back to 0
     when no analysis result was passed to the template. -->
<div class="stats-grid">
  <div class="stat-card">
    <div class="stat-title">Total Tokens</div>
    <div class="stat-value" id="totalTokens">{{ token_data.stats.basic_stats.total_tokens if token_data else 0 }}</div>
    <div class="stat-description">
      <span id="uniqueTokens">{{ token_data.stats.basic_stats.unique_tokens if token_data else 0 }} unique</span>
      (<span id="uniquePercentage">{{ token_data.stats.basic_stats.unique_percentage if token_data else 0 }}</span>%)
    </div>
  </div>
  <div class="stat-card">
    <div class="stat-title">Token Types</div>
    <div class="stat-value" id="specialTokens">{{ token_data.stats.basic_stats.special_tokens if token_data else 0 }}</div>
    <div class="stat-description">special tokens</div>
  </div>
  <div class="stat-card">
    <div class="stat-title">Whitespace</div>
    <!-- NOTE(review): the headline value and the "spaces" figure below both read
         space_tokens; confirm that the duplication is intended. -->
    <div class="stat-value" id="spaceTokens">{{ token_data.stats.basic_stats.space_tokens if token_data else 0 }}</div>
    <div class="stat-description">
      spaces: <span id="spaceCount">{{ token_data.stats.basic_stats.space_tokens if token_data else 0 }}</span>,
      newlines: <span id="newlineCount">{{ token_data.stats.basic_stats.newline_tokens if token_data else 0 }}</span>
    </div>
  </div>
  <div class="stat-card">
    <div class="stat-title">Token Length</div>
    <div class="stat-value" id="avgLength">{{ token_data.stats.length_stats.avg_length if token_data else 0 }}</div>
    <div class="stat-description">
      median: <span id="medianLength">{{ token_data.stats.length_stats.median_length if token_data else 0 }}</span>,
      ±<span id="stdDev">{{ token_data.stats.length_stats.std_dev if token_data else 0 }}</span> std
    </div>
  </div>
  <div class="stat-card">
    <div class="stat-title">Compression</div>
    <div class="stat-value" id="compressionRatio">{{ token_data.stats.basic_stats.compression_ratio if token_data else 0 }}</div>
    <div class="stat-description">characters per token</div>
  </div>
</div>
</div> | |
</div> | |
<!-- Attribution link. It opens in a new tab, so rel="noopener noreferrer" keeps the
     opened page from getting a handle on this window. -->
<a href="https://huggingface.co/spaces/bartar/tokenizers" target="_blank" rel="noopener noreferrer" class="watermark">
  @bartar/tokenizers
</a>
<!-- Page script, loaded last so the markup above is already parsed when it runs. -->
<script src="{{ url_for('static', filename='js/main.js') }}"></script>
</body> | |
</html> |