<!--
Spaces:
Running
Running
File size: 13,224 Bytes
d66ab65
-->
<!DOCTYPE html>
<!-- lang declares the language of the UI copy (English) for screen readers and translation tools -->
<html lang="en">
<head>
  <!-- Encoding declared first so the parser knows it before reading any text content -->
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Tokenizer Pro</title>
  <!-- Inline SVG favicon as a data URI ('#' in the fill colour is escaped as %23).
       The first curve after "0 32" needs six coordinates (32 32 32), matching the mirrored segment later in the path. -->
  <link rel="icon" href="data:image/svg+xml,<svg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 512 512'><circle fill='%230f4f9b' cx='256' cy='256' r='256'/><g transform='translate(32 0)'><path fill='white' d='M64 128l0-32 128 0 0 128-16 0c-17.7 0-32 14.3-32 32s14.3 32 32 32l96 0c17.7 0 32-14.3 32-32s-14.3-32-32-32l-16 0 0-128 128 0 0 32c0 17.7 14.3 32 32 32s32-14.3 32-32l0-48c0-26.5-21.5-48-48-48L224 32 48 32C21.5 32 0 53.5 0 80l0 48c0 17.7 14.3 32 32 32s32-14.3 32-32zM9.4 361.4c-12.5 12.5-12.5 32.8 0 45.3l64 64c9.2 9.2 22.9 11.9 34.9 6.9s19.8-16.6 19.8-29.6l0-32 192 0 0 32c0 12.9 7.8 24.6 19.8 29.6s25.7 2.2 34.9-6.9l64-64c12.5-12.5 12.5-32.8 0-45.3l-64-64c-9.2-9.2-22.9-11.9-34.9-6.9s-19.8 16.6-19.8 29.6l0 32-192 0 0-32c0-12.9-7.8-24.6-19.8-29.6s-25.7-2.2-34.9 6.9l-64 64z'/></g></svg>">
  <!-- Loaded synchronously; NOTE(review): presumably required by js/main.js at the end of the body, so confirm before adding defer/async -->
  <script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
  <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
  <link rel="stylesheet" href="{{ url_for('static', filename='css/style.css') }}">
</head>
<body>
<!-- Drop target overlay: hidden by default, shown while a file is being dragged -->
<div class="file-drop-zone" id="fileDropZone">
  <div class="drop-indicator">
    <div class="file-icon">📄</div>
    <p>Drop your file here</p>
  </div>
</div>
<!-- Busy overlay: a large spinner plus a status line (default text below) -->
<div class="loading-overlay" id="loadingOverlay">
  <div class="loading-content">
    <div class="loading-spinner large"></div>
    <div class="loading-text" id="loadingText">Analyzing text...</div>
  </div>
</div>
<!-- Paperclip upload affordance for the bottom-left corner, followed by an (initially empty) file info line -->
<div class="file-upload-icon" id="fileUploadIcon">
  <span>📎</span>
</div>
<p class="file-info" id="fileInfo"></p>
<div class="container">
<div class="header">
<!-- Page heading and tagline -->
<div class="title-section">
  <h1 class="title">Tokenizer Pro</h1>
  <p class="subtitle">Advanced tokenization analysis and visualization</p>
</div>
<div class="model-selector">
<!-- Mode switch between the predefined model list and a custom model path; data-type names each mode.
     "Predefined" carries the active class in the initial markup. -->
<div class="model-selector-header">
  <div class="model-type-toggle">
    <div class="toggle-option predefined-toggle active" data-type="predefined">Predefined</div>
    <div class="toggle-option custom-toggle" data-type="custom">Custom</div>
  </div>
</div>
<!-- Predefined mode: dropdown of server-provided models, with an info icon and its tooltip -->
<div id="predefinedModelSelector">
  <div style="position: relative;">
    <div class="tokenizer-info-icon" id="modelInfoIcon" title="View tokenizer information">ℹ</div>
    <!-- Tooltip body; the initial markup holds only a spinner placeholder -->
    <div class="tokenizer-info-tooltip" id="modelInfoTooltip">
      <div id="tokenizerInfoContent">
        <div class="tokenizer-info-loading">
          <div class="tokenizer-info-spinner"></div>
        </div>
      </div>
    </div>
    <!-- The select follows the icon and tooltip in source order.
         aria-label gives it an accessible name because there is no visible label element. -->
    <select id="modelSelect" name="model" aria-label="Tokenizer model">
      {% for model_id, info in models.items() %}
      <option value="{{ model_id }}" {% if selected_model == model_id %}selected{% endif %}>
        {{ info.alias }}
      </option>
      {% endfor %}
    </select>
  </div>
</div>
<!-- Custom mode (hidden in the initial markup): free-text model path, info tooltip, help text and a "Loaded" badge -->
<div id="customModelSelector" style="display: none;" class="custom-model-wrapper">
  <div style="position: relative;">
    <div class="tokenizer-info-icon" id="customModelInfoIcon" title="View tokenizer information">ℹ</div>
    <!-- Tooltip body; the initial markup holds only a spinner placeholder -->
    <div class="tokenizer-info-tooltip" id="customModelInfoTooltip">
      <div id="customTokenizerInfoContent">
        <div class="tokenizer-info-loading">
          <div class="tokenizer-info-spinner"></div>
        </div>
      </div>
    </div>
    <!-- aria-label names the field for assistive tech; the placeholder alone disappears once the user types.
         Falls back to an empty value when no custom model was supplied. -->
    <input type="text" id="customModelInput" class="custom-model-input"
           aria-label="HuggingFace model path"
           placeholder="Enter HuggingFace model path"
           value="{{ custom_model if custom_model else '' }}">
  </div>
  <span class="custom-model-help">?</span>
  <div class="tooltip">
    Enter a valid HuggingFace model ID (e.g., "mistralai/Mistral-7B-Instruct-v0.3")
    The model must have a tokenizer available and must be not restricted. (with some exceptions)
    Also some models have restrictions. You can use mirrored versions, like unsloth to omit that.
    Like ("unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit") instead of original path.
  </div>
  <div class="model-badge" id="modelSuccessBadge">Loaded</div>
</div>
</div>
</div>
<!-- Error banner. The fallback keeps an unset or None error from being rendered as literal text;
     role="alert" lets assistive tech announce a message placed here. -->
<div class="error-message" id="errorMessage" role="alert">{{ error if error else '' }}</div>
<!-- Main input form: POSTs the text and/or an uploaded file together with the model selection fields -->
<div class="input-section">
  <div class="keyboard-shortcut-hint">Ctrl+Enter</div>
  <form id="analyzeForm" method="POST" enctype="multipart/form-data">
    <!-- Keep the textarea tags tight around the value: whitespace inside a textarea becomes part of its content.
         aria-label names the field (the placeholder is not a label); the fallback avoids rendering a None value as text. -->
    <textarea name="text" id="textInput" aria-label="Text to analyze" placeholder="Enter text to analyze or upload a file in bottom left corner...">{{ text if text else '' }}</textarea>
    <!-- Hidden fields submitted with the form; NOTE(review): presumably kept in sync with the visible selectors by js/main.js, so confirm -->
    <input type="hidden" name="model" id="modelInput" value="{{ selected_model }}">
    <input type="hidden" name="custom_model" id="customModelInputHidden" value="{{ custom_model if custom_model else '' }}">
    <input type="hidden" name="model_type" id="modelTypeInput" value="{{ model_type if model_type else 'predefined' }}">
    <!-- File input is not displayed -->
    <input type="file" name="file" id="fileInput" style="display: none;">
    <div class="button-container">
      <button type="submit" id="analyzeButton">Analyze Text</button>
    </div>
  </form>
</div>
<div id="results" class="results" {% if not token_data %}style="display: none;"{% endif %}>
<div class="card">
<!-- Card heading with the search toggle (hidden in the initial markup) -->
<div class="card-header">
  <h2 class="card-title">Token Visualization</h2>
  <!-- Icon-only control: aria-label supplies the accessible name, and the decorative SVG is hidden from assistive tech -->
  <button type="button" class="search-toggle-btn" id="searchToggleBtn" title="Toggle token search" aria-label="Toggle token search" style="display: none;">
    <svg width="16" height="16" viewBox="0 0 24 24" fill="currentColor" aria-hidden="true">
      <path d="M15.5 14h-.79l-.28-.27C15.41 12.59 16 11.11 16 9.5 16 5.91 13.09 3 9.5 3S3 5.91 3 9.5 5.91 16 9.5 16c1.61 0 3.09-.59 4.23-1.57l.27.28v.79l5 4.99L20.49 19l-4.99-5zm-6 0C7.01 14 5 11.99 5 9.5S7.01 5 9.5 5 14 7.01 14 9.5 11.99 14 9.5 14z"/>
    </svg>
  </button>
</div>
<!-- Token search bar (starts hidden): query field, previous/next match navigation, match counter and a clear button -->
<div class="token-search-container" id="tokenSearchContainer" style="display: none;">
  <div class="token-search-row">
    <input type="text" class="token-search-input" id="tokenSearchInput" placeholder="Search tokens..." aria-label="Search tokens">
    <div class="token-search-controls">
      <!-- Arrow glyphs restored from mis-decoded UTF-8; aria-labels name the glyph-only buttons -->
      <button type="button" class="token-search-btn" id="prevMatch" aria-label="Previous match">◀</button>
      <span class="token-search-count" id="searchCount">0/0</span>
      <button type="button" class="token-search-btn" id="nextMatch" aria-label="Next match">▶</button>
      <button type="button" class="token-search-btn" id="clearSearch">Clear</button>
    </div>
  </div>
</div>
<!-- Notice explaining that the display is a truncated preview while the stats cover the whole file -->
<div class="preview-notice" id="previewNotice">
  Note: Showing preview of first 8096 characters. Stats are calculated on the full file.
</div>
<!-- Server-rendered token list: when token_data is present, one coloured span is emitted per entry in
     token_data.tokens. Each span's title shows the original token text and its token ID, and a <br>
     follows any token whose newline flag is set.
     NOTE(review): whitespace inside each span is left exactly as written because it may affect
     rendering, depending on CSS that is not visible from this template. -->
<div class="token-container" id="tokenContainer">
{% if token_data %}
{% for token in token_data.tokens %}
<span class="token"
style="background-color: {{ token.colors.background }}; color: {{ token.colors.text }};"
title="Original token: {{ token.original }} | Token ID: {{ token.token_id }}">
{{ token.display }}
</span>
{% if token.newline %}<br>{% endif %}
{% endfor %}
{% endif %}
</div>
<!-- "Show More" control, followed by the notice for output capped at 50,000 tokens (the count placeholder starts at 0) -->
<button class="expand-button" id="expandButton">Show More</button>
<div class="display-limit-notice" id="displayLimitNotice">
  Note: Only showing first 50,000 tokens. Total token count: <span id="totalTokenCount">0</span>
</div>
<!-- Token frequency panel (starts hidden): title row with a show/hide button and an empty chart mount point -->
<div class="frequency-chart-container" id="frequencyChartContainer" style="display: none;">
  <div class="frequency-chart-title">
    <span>Top Token Frequencies</span>
    <button class="chart-toggle-btn" id="toggleFrequencyChart">Show Chart</button>
  </div>
  <div class="frequency-chart" id="frequencyChart"></div>
</div>
</div>
<!-- Summary statistics. Every value is read from token_data.stats when token_data is present and renders as 0 otherwise. -->
<div class="stats-grid">

  <!-- Token totals and uniqueness -->
  <div class="stat-card">
    <div class="stat-title">Total Tokens</div>
    <div class="stat-value" id="totalTokens">{{ token_data.stats.basic_stats.total_tokens if token_data else 0 }}</div>
    <div class="stat-description">
      <span id="uniqueTokens">{{ token_data.stats.basic_stats.unique_tokens if token_data else 0 }} unique</span>
      (<span id="uniquePercentage">{{ token_data.stats.basic_stats.unique_percentage if token_data else 0 }}</span>%)
    </div>
  </div>

  <!-- Special-token count -->
  <div class="stat-card">
    <div class="stat-title">Token Types</div>
    <div class="stat-value" id="specialTokens">{{ token_data.stats.basic_stats.special_tokens if token_data else 0 }}</div>
    <div class="stat-description">special tokens</div>
  </div>

  <!-- Whitespace: the headline value and the "spaces" figure both read space_tokens -->
  <div class="stat-card">
    <div class="stat-title">Whitespace</div>
    <div class="stat-value" id="spaceTokens">{{ token_data.stats.basic_stats.space_tokens if token_data else 0 }}</div>
    <div class="stat-description">
      spaces: <span id="spaceCount">{{ token_data.stats.basic_stats.space_tokens if token_data else 0 }}</span>,
      newlines: <span id="newlineCount">{{ token_data.stats.basic_stats.newline_tokens if token_data else 0 }}</span>
    </div>
  </div>

  <!-- Token length: average as the headline, median and standard deviation below -->
  <div class="stat-card">
    <div class="stat-title">Token Length</div>
    <div class="stat-value" id="avgLength">{{ token_data.stats.length_stats.avg_length if token_data else 0 }}</div>
    <div class="stat-description">
      median: <span id="medianLength">{{ token_data.stats.length_stats.median_length if token_data else 0 }}</span>,
      ±<span id="stdDev">{{ token_data.stats.length_stats.std_dev if token_data else 0 }}</span> std
    </div>
  </div>

  <!-- Compression ratio, labelled as characters per token -->
  <div class="stat-card">
    <div class="stat-title">Compression</div>
    <div class="stat-value" id="compressionRatio">{{ token_data.stats.basic_stats.compression_ratio if token_data else 0 }}</div>
    <div class="stat-description">characters per token</div>
  </div>

</div>
</div>
</div>
<!-- Attribution link; it opens in a new tab, so rel="noopener noreferrer" keeps that tab from reaching window.opener -->
<a href="https://huggingface.co/spaces/bartar/tokenizers" target="_blank" rel="noopener noreferrer" class="watermark">
  @bartar/tokenizers
</a>
<!-- Page script, loaded at the end of the body so the markup above is already parsed when it runs.
     The previously empty inline script element was dead markup and has been dropped. -->
<script src="{{ url_for('static', filename='js/main.js') }}"></script>
</body>
</html>