<!-- Page-capture residue (not part of the document): "Spaces: Running" -->
<!doctype html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Smart Web Crawler</title>
  <script src="https://cdn.tailwindcss.com"></script>
  <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css">
  <style>
    /* Page banner background. */
    .gradient-bg {
      background: linear-gradient(135deg, #6b73ff 0%, #000dff 100%);
    }

    /* Gentle bobbing motion for the spider icon in the banner. */
    .crawl-animation {
      animation: crawlPulse 2s infinite;
    }
    @keyframes crawlPulse {
      0% { transform: translateY(0); }
      50% { transform: translateY(-5px); }
      100% { transform: translateY(0); }
    }

    /* Width of the main progress bar is set from script; animate the change. */
    .progress-bar {
      transition: width 0.3s ease;
    }

    /* Result cards lift on hover; the transition keeps the lift from snapping. */
    .result-card {
      transition: transform 0.2s ease, box-shadow 0.2s ease;
    }
    .result-card:hover {
      transform: translateY(-5px);
      box-shadow: 0 10px 25px rgba(0, 0, 255, 0.1);
    }

    .code-block {
      font-family: 'Courier New', monospace;
      background-color: #2d3748;
      color: #f7fafc;
    }

    /* Currently selected results tab. */
    .tab-active {
      border-bottom: 3px solid #3b82f6;
      color: #3b82f6;
      font-weight: 600;
    }

    .fade-in {
      animation: fadeIn 0.5s ease-in;
    }
    @keyframes fadeIn {
      from { opacity: 0; }
      to { opacity: 1; }
    }

    /* Progress steps: each step draws a connector line in its left gutter. */
    .progress-step {
      position: relative;
      padding-left: 2rem;
    }
    .progress-step:before {
      content: '';
      position: absolute;
      left: 0.5rem;
      top: 0;
      bottom: 0;
      width: 2px;
      background-color: #e5e7eb;
    }
    .progress-step:first-child:before {
      top: 1rem;
    }
    .progress-step:last-child:before {
      bottom: calc(100% - 1rem);
    }

    /* Step icon colours; the state class is assigned from script. */
    .progress-step.completed .step-icon {
      background-color: #10b981;
      color: white;
    }
    .progress-step.active .step-icon {
      background-color: #3b82f6;
      color: white;
    }
    .progress-step.pending .step-icon {
      background-color: #e5e7eb;
      color: #6b7280;
    }
    .progress-step.error .step-icon {
      background-color: #ef4444;
      color: white;
    }

    /* Log line colours by severity. */
    .log-entry.error {
      color: #ef4444;
    }
    .log-entry.warning {
      color: #f59e0b;
    }
    .log-entry.success {
      color: #10b981;
    }
    .log-entry.info {
      color: #3b82f6;
    }

    /* Thin per-phase progress bars. */
    .progress-multi {
      height: 6px;
      border-radius: 3px;
    }

    /* Respect the user's reduced-motion preference for the looping animation. */
    @media (prefers-reduced-motion: reduce) {
      .crawl-animation,
      .fade-in {
        animation: none;
      }
    }
  </style>
</head>
<body class="bg-gray-50 min-h-screen">
  <!-- Page banner: product title, tagline and feature badges. -->
  <header class="gradient-bg text-white py-8 px-4 shadow-lg">
    <div class="container mx-auto">
      <div class="flex items-center justify-between">
        <div>
          <h1 class="text-3xl font-bold flex items-center">
            <i class="fas fa-spider mr-3 crawl-animation" aria-hidden="true"></i> Smart Web Crawler
          </h1>
          <p class="mt-2 opacity-90">Extract and organize web content into structured knowledge</p>
        </div>
        <!-- Feature badges are hidden on small screens. -->
        <div class="hidden md:block">
          <div class="flex space-x-2">
            <span class="px-3 py-1 bg-blue-400 rounded-full text-xs font-semibold">AI-Powered</span>
            <span class="px-3 py-1 bg-purple-400 rounded-full text-xs font-semibold">Multi-Format</span>
            <span class="px-3 py-1 bg-green-400 rounded-full text-xs font-semibold">Smart Filtering</span>
          </div>
        </div>
      </div>
    </div>
  </header>
<div class="container mx-auto px-4 py-8">
  <!-- Crawler configuration card. Element ids are read by the page script. -->
  <div class="bg-white rounded-xl shadow-lg overflow-hidden mb-8">
    <div class="p-6">
      <h2 class="text-xl font-semibold text-gray-800 mb-4">Crawler Configuration</h2>
      <div class="grid grid-cols-1 md:grid-cols-2 gap-6">
        <div>
          <label for="baseUrl" class="block text-sm font-medium text-gray-700 mb-1">Start URL</label>
          <div class="flex">
            <input type="url" id="baseUrl" placeholder="https://example.com"
                   aria-describedby="urlError"
                   class="flex-1 px-4 py-2 border border-gray-300 rounded-l-lg focus:ring-blue-500 focus:border-blue-500">
            <!-- Icon-only button: the aria-label supplies its accessible name. -->
            <button type="button" id="validateUrlBtn" aria-label="Validate URL"
                    class="px-4 py-2 bg-blue-600 text-white rounded-r-lg hover:bg-blue-700">
              <i class="fas fa-check" aria-hidden="true"></i>
            </button>
          </div>
          <p id="urlError" class="text-red-500 text-xs mt-1 hidden">Please enter a valid URL starting with http:// or https://</p>
        </div>
        <fieldset>
          <legend class="block text-sm font-medium text-gray-700 mb-1">Output Format</legend>
          <div class="flex space-x-4">
            <label class="inline-flex items-center">
              <input type="radio" name="outputFormat" value="json" checked class="h-4 w-4 text-blue-600 focus:ring-blue-500">
              <span class="ml-2">JSON</span>
            </label>
            <label class="inline-flex items-center">
              <input type="radio" name="outputFormat" value="md" class="h-4 w-4 text-blue-600 focus:ring-blue-500">
              <span class="ml-2">Markdown</span>
            </label>
          </div>
        </fieldset>
        <div>
          <label for="maxDepth" class="block text-sm font-medium text-gray-700 mb-1">Max Depth</label>
          <input type="number" id="maxDepth" min="1" max="10" value="3"
                 class="w-full px-4 py-2 border border-gray-300 rounded-lg focus:ring-blue-500 focus:border-blue-500">
        </div>
        <div>
          <label for="maxConcurrent" class="block text-sm font-medium text-gray-700 mb-1">Max Concurrent Requests</label>
          <input type="number" id="maxConcurrent" min="1" max="50" value="20"
                 class="w-full px-4 py-2 border border-gray-300 rounded-lg focus:ring-blue-500 focus:border-blue-500">
        </div>
      </div>
      <div class="mt-6">
        <h3 id="extractHeading" class="text-md font-medium text-gray-700 mb-3">Content to Extract</h3>
        <div class="flex flex-wrap gap-4" role="group" aria-labelledby="extractHeading">
          <label class="inline-flex items-center">
            <input type="checkbox" id="extractText" checked class="h-4 w-4 text-blue-600 rounded focus:ring-blue-500">
            <span class="ml-2">Text Content</span>
          </label>
          <label class="inline-flex items-center">
            <input type="checkbox" id="extractCode" checked class="h-4 w-4 text-blue-600 rounded focus:ring-blue-500">
            <span class="ml-2">Code Blocks</span>
          </label>
          <label class="inline-flex items-center">
            <input type="checkbox" id="extractTables" checked class="h-4 w-4 text-blue-600 rounded focus:ring-blue-500">
            <span class="ml-2">Tables</span>
          </label>
          <label class="inline-flex items-center">
            <input type="checkbox" id="extractLists" checked class="h-4 w-4 text-blue-600 rounded focus:ring-blue-500">
            <span class="ml-2">Lists</span>
          </label>
        </div>
      </div>
      <div class="mt-6">
        <label class="inline-flex items-center">
          <input type="checkbox" id="useLLMFilter" class="h-4 w-4 text-blue-600 rounded focus:ring-blue-500">
          <span class="ml-2 font-medium">Enable AI Content Filtering</span>
        </label>
        <!-- Shown by the page script while the checkbox above is ticked. -->
        <div id="llmSettings" class="mt-3 pl-6 hidden">
          <label for="minLLMScore" class="block text-sm font-medium text-gray-700 mb-1">Minimum Quality Score (0-100)</label>
          <input type="number" id="minLLMScore" min="0" max="100" value="50"
                 class="w-24 px-4 py-2 border border-gray-300 rounded-lg focus:ring-blue-500 focus:border-blue-500">
        </div>
      </div>
      <div class="mt-8 flex justify-center">
        <button type="button" id="startCrawlBtn"
                class="px-8 py-3 bg-blue-600 text-white rounded-lg font-semibold hover:bg-blue-700 transition-all flex items-center">
          <i class="fas fa-play mr-2" aria-hidden="true"></i> Start Crawling
        </button>
      </div>
    </div>
  </div>
<div id="progressSection" class="hidden"> | |
<div class="bg-white rounded-xl shadow-lg overflow-hidden mb-8"> | |
<div class="p-6"> | |
<div class="flex justify-between items-center mb-4"> | |
<h2 class="text-xl font-semibold text-gray-800">Crawling Progress</h2> | |
<button id="stopCrawlBtn" | |
class="px-4 py-2 bg-red-500 text-white rounded-lg text-sm hover:bg-red-600"> | |
<i class="fas fa-stop mr-1"></i> Stop | |
</button> | |
</div> | |
<!-- Progress Steps --> | |
<div class="mb-6"> | |
<div class="flex space-x-4 mb-4"> | |
<div class="progress-step pending" id="step1"> | |
<div class="flex items-center"> | |
<div class="step-icon w-6 h-6 rounded-full flex items-center justify-center mr-2"> | |
<i class="fas fa-link text-xs"></i> | |
</div> | |
<span class="text-sm">URL Validation</span> | |
</div> | |
</div> | |
<div class="progress-step pending" id="step2"> | |
<div class="flex items-center"> | |
<div class="step-icon w-6 h-6 rounded-full flex items-center justify-center mr-2"> | |
<i class="fas fa-sitemap text-xs"></i> | |
</div> | |
<span class="text-sm">Site Mapping</span> | |
</div> | |
</div> | |
<div class="progress-step pending" id="step3"> | |
<div class="flex items-center"> | |
<div class="step-icon w-6 h-6 rounded-full flex items-center justify-center mr-2"> | |
<i class="fas fa-file-alt text-xs"></i> | |
</div> | |
<span class="text-sm">Content Extraction</span> | |
</div> | |
</div> | |
<div class="progress-step pending" id="step4"> | |
<div class="flex items-center"> | |
<div class="step-icon w-6 h-6 rounded-full flex items-center justify-center mr-2"> | |
<i class="fas fa-robot text-xs"></i> | |
</div> | |
<span class="text-sm">AI Analysis</span> | |
</div> | |
</div> | |
</div> | |
<!-- Multi-level progress bars --> | |
<div class="space-y-2 mb-2"> | |
<div> | |
<div class="flex justify-between text-xs text-gray-600 mb-1"> | |
<span>URL Discovery</span> | |
<span id="urlDiscoveryPercent">0%</span> | |
</div> | |
<div class="w-full bg-gray-200 rounded-full h-1.5"> | |
<div id="urlDiscoveryBar" class="progress-multi bg-blue-400 h-1.5 rounded-full" style="width: 0%"></div> | |
</div> | |
</div> | |
<div> | |
<div class="flex justify-between text-xs text-gray-600 mb-1"> | |
<span>Content Extraction</span> | |
<span id="contentExtractionPercent">0%</span> | |
</div> | |
<div class="w-full bg-gray-200 rounded-full h-1.5"> | |
<div id="contentExtractionBar" class="progress-multi bg-green-400 h-1.5 rounded-full" style="width: 0%"></div> | |
</div> | |
</div> | |
<div> | |
<div class="flex justify-between text-xs text-gray-600 mb-1"> | |
<span>AI Processing</span> | |
<span id="aiProcessingPercent">0%</span> | |
</div> | |
<div class="w-full bg-gray-200 rounded-full h-1.5"> | |
<div id="aiProcessingBar" class="progress-multi bg-purple-400 h-1.5 rounded-full" style="width: 0%"></div> | |
</div> | |
</div> | |
</div> | |
<!-- Main progress bar --> | |
<div class="mb-4"> | |
<div class="flex justify-between text-sm text-gray-600 mb-1"> | |
<span>Overall Progress: <span id="overallPercent">0%</span></span> | |
<span>Time Elapsed: <span id="timeElapsed">00:00</span></span> | |
</div> | |
<div class="w-full bg-gray-200 rounded-full h-2.5"> | |
<div id="progressBar" class="progress-bar bg-blue-600 h-2.5 rounded-full" style="width: 0%"></div> | |
</div> | |
</div> | |
</div> | |
<div class="bg-gray-50 p-4 rounded-lg"> | |
<div class="flex items-center mb-2"> | |
<div class="w-8 h-8 rounded-full bg-blue-100 flex items-center justify-center mr-3"> | |
<i class="fas fa-spider text-blue-600"></i> | |
</div> | |
<div class="flex-1"> | |
<p class="text-sm font-medium">Currently Crawling:</p> | |
<p id="currentUrl" class="text-sm text-gray-600 truncate">Waiting to start...</p> | |
</div> | |
</div> | |
<div class="flex items-center"> | |
<div class="w-8 h-8 rounded-full bg-purple-100 flex items-center justify-center mr-3"> | |
<i class="fas fa-robot text-purple-600"></i> | |
</div> | |
<div class="flex-1"> | |
<p class="text-sm font-medium">AI Analysis:</p> | |
<p id="aiAnalysis" class="text-sm text-gray-600">Ready to evaluate content quality</p> | |
</div> | |
</div> | |
</div> | |
</div> | |
</div> | |
<div class="grid grid-cols-1 lg:grid-cols-3 gap-6"> | |
<div class="lg:col-span-2"> | |
<div class="bg-white rounded-xl shadow-lg overflow-hidden"> | |
<div class="p-6"> | |
<h2 class="text-xl font-semibold text-gray-800 mb-4">Crawling Log</h2> | |
<div id="crawlLog" class="h-96 overflow-y-auto bg-gray-50 p-4 rounded-lg font-mono text-sm space-y-2"> | |
<div class="text-gray-500">System ready. Waiting for crawl to start...</div> | |
</div> | |
</div> | |
</div> | |
</div> | |
<div> | |
<div class="bg-white rounded-xl shadow-lg overflow-hidden"> | |
<div class="p-6"> | |
<h2 class="text-xl font-semibold text-gray-800 mb-4">Statistics</h2> | |
<div class="space-y-4"> | |
<div class="flex items-center justify-between p-3 bg-blue-50 rounded-lg"> | |
<div> | |
<p class="text-xs text-gray-500">Total URLs</p> | |
<p id="totalUrls" class="text-lg font-semibold">0</p> | |
</div> | |
<div class="p-2 bg-blue-100 rounded-full"> | |
<i class="fas fa-link text-blue-600"></i> | |
</div> | |
</div> | |
<div class="flex items-center justify-between p-3 bg-green-50 rounded-lg"> | |
<div> | |
<p class="text-xs text-gray-500">Valid Content</p> | |
<p id="validContent" class="text-lg font-semibold">0</p> | |
</div> | |
<div class="p-2 bg-green-100 rounded-full"> | |
<i class="fas fa-check-circle text-green-600"></i> | |
</div> | |
</div> | |
<div class="flex items-center justify-between p-3 bg-purple-50 rounded-lg"> | |
<div> | |
<p class="text-xs text-gray-500">AI Approved</p> | |
<p id="aiApproved" class="text-lg font-semibold">0</p> | |
</div> | |
<div class="p-2 bg-purple-100 rounded-full"> | |
<i class="fas fa-star text-purple-600"></i> | |
</div> | |
</div> | |
<div class="flex items-center justify-between p-3 bg-yellow-50 rounded-lg"> | |
<div> | |
<p class="text-xs text-gray-500">Avg. Score</p> | |
<p id="avgScore" class="text-lg font-semibold">0</p> | |
</div> | |
<div class="p-2 bg-yellow-100 rounded-full"> | |
<i class="fas fa-chart-line text-yellow-600"></i> | |
</div> | |
</div> | |
<div class="flex items-center justify-between p-3 bg-red-50 rounded-lg"> | |
<div> | |
<p class="text-xs text-gray-500">Errors</p> | |
<p id="errorCount" class="text-lg font-semibold">0</p> | |
</div> | |
<div class="p-2 bg-red-100 rounded-full"> | |
<i class="fas fa-exclamation-triangle text-red-600"></i> | |
</div> | |
</div> | |
</div> | |
</div> | |
</div> | |
</div> | |
</div> | |
</div> | |
<div id="resultsSection" class="hidden mt-8"> | |
<div class="bg-white rounded-xl shadow-lg overflow-hidden"> | |
<div class="p-6"> | |
<div class="flex justify-between items-center mb-6"> | |
<h2 class="text-xl font-semibold text-gray-800">Crawl Results</h2> | |
<div class="flex space-x-2"> | |
<button id="downloadResultsBtn" class="px-4 py-2 bg-green-600 text-white rounded-lg text-sm hover:bg-green-700"> | |
<i class="fas fa-download mr-1"></i> Download | |
</button> | |
<button id="clearResultsBtn" class="px-4 py-2 bg-gray-200 text-gray-700 rounded-lg text-sm hover:bg-gray-300"> | |
<i class="fas fa-trash mr-1"></i> Clear | |
</button> | |
</div> | |
</div> | |
<div class="border-b border-gray-200"> | |
<div class="flex space-x-4"> | |
<button id="tabSummary" class="tab-active px-4 py-2 text-sm font-medium">Summary</button> | |
<button id="tabContent" class="px-4 py-2 text-sm font-medium text-gray-500 hover:text-gray-700">Content</button> | |
<button id="tabJson" class="px-4 py-2 text-sm font-medium text-gray-500 hover:text-gray-700">JSON View</button> | |
</div> | |
</div> | |
<div id="summaryTab" class="py-4"> | |
<div class="grid grid-cols-1 md:grid-cols-3 gap-6 mb-6"> | |
<div class="bg-blue-50 p-4 rounded-lg"> | |
<h3 class="font-medium text-blue-800 mb-2">Crawl Overview</h3> | |
<ul class="space-y-2 text-sm"> | |
<li class="flex justify-between"> | |
<span class="text-gray-600">Start URL:</span> | |
<span id="summaryStartUrl" class="font-medium">-</span> | |
</li> | |
<li class="flex justify-between"> | |
<span class="text-gray-600">Total Pages:</span> | |
<span id="summaryTotalPages" class="font-medium">0</span> | |
</li> | |
<li class="flex justify-between"> | |
<span class="text-gray-600">Duration:</span> | |
<span id="summaryDuration" class="font-medium">0s</span> | |
</li> | |
<li class="flex justify-between"> | |
<span class="text-gray-600">Errors:</span> | |
<span id="summaryErrors" class="font-medium">0</span> | |
</li> | |
</ul> | |
</div> | |
<div class="bg-purple-50 p-4 rounded-lg"> | |
<h3 class="font-medium text-purple-800 mb-2">Content Analysis</h3> | |
<ul class="space-y-2 text-sm"> | |
<li class="flex justify-between"> | |
<span class="text-gray-600">Text Paragraphs:</span> | |
<span id="summaryText" class="font-medium">0</span> | |
</li> | |
<li class="flex justify-between"> | |
<span class="text-gray-600">Code Blocks:</span> | |
<span id="summaryCode" class="font-medium">0</span> | |
</li> | |
<li class="flex justify-between"> | |
<span class="text-gray-600">Tables:</span> | |
<span id="summaryTables" class="font-medium">0</span> | |
</li> | |
</ul> | |
</div> | |
<div class="bg-green-50 p-4 rounded-lg"> | |
<h3 class="font-medium text-green-800 mb-2">Quality Metrics</h3> | |
<ul class="space-y-2 text-sm"> | |
<li class="flex justify-between"> | |
<span class="text-gray-600">Avg. Quality Score:</span> | |
<span id="summaryAvgScore" class="font-medium">0</span> | |
</li> | |
<li class="flex justify-between"> | |
<span class="text-gray-600">Highest Score:</span> | |
<span id="summaryHighScore" class="font-medium">0</span> | |
</li> | |
<li class="flex justify-between"> | |
<span class="text-gray-600">Lowest Score:</span> | |
<span id="summaryLowScore" class="font-medium">0</span> | |
</li> | |
</ul> | |
</div> | |
</div> | |
<div class="mb-6"> | |
<h3 class="font-medium text-gray-800 mb-3">Top Keywords</h3> | |
<div id="keywordCloud" class="flex flex-wrap gap-2"> | |
<span class="px-3 py-1 bg-gray-100 rounded-full text-sm">No keywords extracted yet</span> | |
</div> | |
</div> | |
<div> | |
<h3 class="font-medium text-gray-800 mb-3">Best Content</h3> | |
<div id="topContent" class="space-y-4"> | |
<div class="p-4 bg-gray-50 rounded-lg text-sm text-gray-600"> | |
No content has been evaluated yet. Run a crawl to see results. | |
</div> | |
</div> | |
</div> | |
</div> | |
<div id="contentTab" class="py-4 hidden"> | |
<div class="mb-4"> | |
<div class="relative"> | |
<input type="text" id="contentSearch" placeholder="Search content..." | |
class="w-full pl-10 pr-4 py-2 border border-gray-300 rounded-lg focus:ring-blue-500 focus:border-blue-500"> | |
<div class="absolute inset-y-0 left-0 pl-3 flex items-center pointer-events-none"> | |
<i class="fas fa-search text-gray-400"></i> | |
</div> | |
</div> | |
</div> | |
<div id="contentResults" class="space-y-6"> | |
<!-- Content cards will be added here dynamically --> | |
</div> | |
<div id="contentPagination" class="flex justify-center mt-6 hidden"> | |
<nav class="inline-flex rounded-md shadow"> | |
<button class="px-3 py-1 rounded-l-md border border-gray-300 bg-white text-sm font-medium text-gray-700 hover:bg-gray-50"> | |
Previous | |
</button> | |
<button class="px-3 py-1 border-t border-b border-gray-300 bg-white text-sm font-medium text-blue-600 hover:bg-gray-50"> | |
1 | |
</button> | |
<button class="px-3 py-1 border border-gray-300 bg-white text-sm font-medium text-gray-700 hover:bg-gray-50 rounded-r-md"> | |
Next | |
</button> | |
</nav> | |
</div> | |
</div> | |
<div id="jsonTab" class="py-4 hidden"> | |
<div class="bg-gray-800 rounded-lg p-4"> | |
<div class="flex justify-between items-center mb-3"> | |
<span class="text-gray-300 font-mono text-sm">output.json</span> | |
<button id="copyJsonBtn" class="px-3 py-1 bg-gray-700 text-gray-300 rounded text-sm hover:bg-gray-600"> | |
<i class="fas fa-copy mr-1"></i> Copy | |
</button> | |
</div> | |
<pre id="jsonViewer" class="text-gray-300 font-mono text-sm overflow-x-auto p-4 bg-gray-900 rounded">{ | |
"message": "Run a crawl to see the JSON output here" | |
}</pre> | |
</div> | |
</div> | |
</div> | |
</div> | |
</div> | |
</div> | |
<!-- Site footer. -->
<footer class="bg-gray-100 py-6 mt-12">
  <div class="container mx-auto px-4 text-center text-gray-600 text-sm">
    <p>Smart Web Crawler - Extract and organize web content into structured knowledge</p>
    <p class="mt-2">© 2023 AI Web Tools. All rights reserved.</p>
  </div>
</footer>
<script> | |
// Global variables | |
let crawlData = { | |
startUrl: '', | |
startTime: null, | |
endTime: null, | |
pagesCrawled: 0, | |
contentSaved: 0, | |
totalUrls: 0, | |
validContent: 0, | |
aiApproved: 0, | |
totalScore: 0, | |
errorCount: 0, | |
results: [], | |
keywords: [], | |
stats: { | |
text: 0, | |
code: 0, | |
tables: 0, | |
lists: 0 | |
}, | |
logEntries: [] | |
}; | |
let isCrawling = false; | |
let crawlInterval; | |
let timerInterval; | |
let elapsedSeconds = 0; | |
// DOM elements
const progressSection = document.getElementById('progressSection');
const resultsSection = document.getElementById('resultsSection');
const startCrawlBtn = document.getElementById('startCrawlBtn');
const stopCrawlBtn = document.getElementById('stopCrawlBtn');
const crawlLog = document.getElementById('crawlLog');
const currentUrl = document.getElementById('currentUrl');
const aiAnalysis = document.getElementById('aiAnalysis');
// The crawl-step code writes textContent on these two unconditionally. If the
// markup has no element with the id, fall back to a detached <span> so those
// writes are harmless instead of throwing on null.
const pagesCrawled = document.getElementById('pagesCrawled') || document.createElement('span');
const contentSaved = document.getElementById('contentSaved') || document.createElement('span');
const progressBar = document.getElementById('progressBar');
const totalUrls = document.getElementById('totalUrls');
const validContent = document.getElementById('validContent');
const aiApproved = document.getElementById('aiApproved');
const avgScore = document.getElementById('avgScore');
const errorCount = document.getElementById('errorCount');
const timeElapsed = document.getElementById('timeElapsed');
const overallPercent = document.getElementById('overallPercent');

// Progress bars
const urlDiscoveryBar = document.getElementById('urlDiscoveryBar');
const contentExtractionBar = document.getElementById('contentExtractionBar');
const aiProcessingBar = document.getElementById('aiProcessingBar');
const urlDiscoveryPercent = document.getElementById('urlDiscoveryPercent');
const contentExtractionPercent = document.getElementById('contentExtractionPercent');
const aiProcessingPercent = document.getElementById('aiProcessingPercent');

// Progress steps
const step1 = document.getElementById('step1');
const step2 = document.getElementById('step2');
const step3 = document.getElementById('step3');
const step4 = document.getElementById('step4');

// Configuration elements
const baseUrlInput = document.getElementById('baseUrl');
const maxDepthInput = document.getElementById('maxDepth');
const maxConcurrentInput = document.getElementById('maxConcurrent');
const extractTextCheckbox = document.getElementById('extractText');
const extractCodeCheckbox = document.getElementById('extractCode');
const extractTablesCheckbox = document.getElementById('extractTables');
const extractListsCheckbox = document.getElementById('extractLists');
const useLLMFilterCheckbox = document.getElementById('useLLMFilter');
const minLLMScoreInput = document.getElementById('minLLMScore');
const llmSettingsDiv = document.getElementById('llmSettings');
const validateUrlBtn = document.getElementById('validateUrlBtn');

// Results elements
const summaryStartUrl = document.getElementById('summaryStartUrl');
const summaryTotalPages = document.getElementById('summaryTotalPages');
const summaryDuration = document.getElementById('summaryDuration');
const summaryErrors = document.getElementById('summaryErrors');
const summaryText = document.getElementById('summaryText');
const summaryCode = document.getElementById('summaryCode');
const summaryTables = document.getElementById('summaryTables');
const summaryAvgScore = document.getElementById('summaryAvgScore');
const summaryHighScore = document.getElementById('summaryHighScore');
const summaryLowScore = document.getElementById('summaryLowScore');
const keywordCloud = document.getElementById('keywordCloud');
const topContent = document.getElementById('topContent');
const contentResults = document.getElementById('contentResults');
const jsonViewer = document.getElementById('jsonViewer');
const downloadResultsBtn = document.getElementById('downloadResultsBtn');
const clearResultsBtn = document.getElementById('clearResultsBtn');
const copyJsonBtn = document.getElementById('copyJsonBtn');
const tabSummary = document.getElementById('tabSummary');
const tabContent = document.getElementById('tabContent');
const tabJson = document.getElementById('tabJson');
// Initialize UI
document.addEventListener('DOMContentLoaded', function () {
  // Show/hide LLM settings based on checkbox. The inline display value takes
  // precedence over the `hidden` utility class on the element.
  const syncLlmSettings = function () {
    llmSettingsDiv.style.display = useLLMFilterCheckbox.checked ? 'block' : 'none';
  };
  useLLMFilterCheckbox.addEventListener('change', syncLlmSettings);
  // Sync once up front in case the checkbox is already ticked when the page
  // initialises (for example, form state restored by the browser).
  syncLlmSettings();

  // Set default values
  baseUrlInput.value = 'https://example.com';

  // Add event listeners
  validateUrlBtn.addEventListener('click', validateUrl);
  startCrawlBtn.addEventListener('click', startCrawling);
  stopCrawlBtn.addEventListener('click', stopCrawling);
  downloadResultsBtn.addEventListener('click', downloadResults);
  clearResultsBtn.addEventListener('click', clearResults);
  copyJsonBtn.addEventListener('click', copyJson);
  tabSummary.addEventListener('click', () => switchTab('summary'));
  tabContent.addEventListener('click', () => switchTab('content'));
  tabJson.addEventListener('click', () => switchTab('json'));
});
// Validate URL input
/**
 * Checks the Start URL field and toggles the inline error state.
 *
 * The value must begin with http:// or https:// and must also parse as a URL
 * with a host, so a bare "http://" is rejected.
 *
 * @returns {boolean} true when the trimmed field value is an acceptable URL.
 */
function validateUrl() {
  const url = baseUrlInput.value.trim();
  const urlError = document.getElementById('urlError');

  let isValid = /^https?:\/\//i.test(url);
  if (isValid) {
    try {
      // The URL constructor throws on input it cannot parse.
      isValid = Boolean(new URL(url).hostname);
    } catch (err) {
      isValid = false;
    }
  }

  urlError.classList.toggle('hidden', isValid);
  baseUrlInput.classList.toggle('border-red-500', !isValid);
  // Expose the state to assistive technology, not only through colour.
  baseUrlInput.setAttribute('aria-invalid', String(!isValid));
  return isValid;
}
// Start crawling simulation
/**
 * Validates the start URL, resets all crawl state and progress UI, then starts
 * the one-second simulation loop and the elapsed-time clock.
 * Does nothing when the URL field fails validation.
 */
function startCrawling() {
  if (!validateUrl()) return;

  // Never leave timers from an earlier run alive.
  clearInterval(crawlInterval);
  clearInterval(timerInterval);

  // Reset data
  crawlData = {
    startUrl: baseUrlInput.value.trim(),
    startTime: new Date(),
    endTime: null,
    pagesCrawled: 0,
    contentSaved: 0,
    totalUrls: 0,
    validContent: 0,
    aiApproved: 0,
    totalScore: 0,
    errorCount: 0,
    results: [],
    keywords: [],
    stats: {
      text: 0,
      code: 0,
      tables: 0,
      lists: 0
    },
    logEntries: []
  };

  // Reset the statistic counters so numbers from a previous run do not linger
  // until the first update. Elements missing from the markup are skipped.
  [pagesCrawled, contentSaved, totalUrls, validContent, aiApproved, avgScore, errorCount]
    .forEach(function (counter) {
      if (counter) counter.textContent = '0';
    });

  // Reset progress bars
  progressBar.style.width = '0%';
  urlDiscoveryBar.style.width = '0%';
  contentExtractionBar.style.width = '0%';
  aiProcessingBar.style.width = '0%';
  urlDiscoveryPercent.textContent = '0%';
  contentExtractionPercent.textContent = '0%';
  aiProcessingPercent.textContent = '0%';
  overallPercent.textContent = '0%';

  // Reset progress steps
  step1.className = 'progress-step active';
  step2.className = 'progress-step pending';
  step3.className = 'progress-step pending';
  step4.className = 'progress-step pending';

  // Reset timer. The display is written directly: updateTimer() increments
  // before rendering, so calling it here would start the clock at 00:01.
  elapsedSeconds = 0;
  timeElapsed.textContent = '00:00';
  timerInterval = setInterval(updateTimer, 1000);

  // Show progress section
  progressSection.classList.remove('hidden');
  resultsSection.classList.add('hidden');

  // Update UI
  startCrawlBtn.disabled = true;
  isCrawling = true;

  // Clear log
  crawlLog.innerHTML = '';

  // Simulate crawling
  crawlInterval = setInterval(simulateCrawlStep, 1000);

  // Add initial log
  addLogEntry('Starting crawl from: ' + crawlData.startUrl, 'info');
  addLogEntry('Configuration: Max Depth=' + maxDepthInput.value +
    ', Max Concurrent=' + maxConcurrentInput.value, 'info');

  // Update current URL
  currentUrl.textContent = crawlData.startUrl;

  // Simulate URL validation
  setTimeout(() => {
    // The crawl may have been stopped during the delay; do not touch the
    // step indicators of a finished run.
    if (!isCrawling) return;
    step1.className = 'progress-step completed';
    step2.className = 'progress-step active';
    addLogEntry('URL validated successfully', 'success');
    updateProgressBar('urlDiscovery', 10);
  }, 500);
}
// Update timer display
/**
 * Advances the elapsed-time counter by one second and renders it as MM:SS.
 * Note that the counter is incremented before it is displayed.
 */
function updateTimer() {
  elapsedSeconds++;
  const minutes = Math.floor(elapsedSeconds / 60);
  const seconds = elapsedSeconds % 60;
  timeElapsed.textContent = `${minutes.toString().padStart(2, '0')}:${seconds.toString().padStart(2, '0')}`;
}
// Update progress bars
/**
 * Sets one progress bar and its percentage label.
 *
 * @param {string} type    'urlDiscovery', 'contentExtraction', 'aiProcessing'
 *                         or 'overall'; any other value is ignored.
 * @param {number} percent Desired fill. It is clamped to 0-100 and rounded,
 *                         because callers pass results of floating-point
 *                         arithmetic (3 / 10 * 100 is 30.000000000000004).
 *
 * After any phase bar changes, the overall bar is recomputed as the rounded
 * mean of the three phase percentages.
 */
function updateProgressBar(type, percent) {
  let bar, percentElement;
  switch (type) {
    case 'urlDiscovery':
      bar = urlDiscoveryBar;
      percentElement = urlDiscoveryPercent;
      break;
    case 'contentExtraction':
      bar = contentExtractionBar;
      percentElement = contentExtractionPercent;
      break;
    case 'aiProcessing':
      bar = aiProcessingBar;
      percentElement = aiProcessingPercent;
      break;
    case 'overall':
      bar = progressBar;
      percentElement = overallPercent;
      break;
    default:
      return;
  }

  const value = Math.round(Math.min(100, Math.max(0, Number(percent) || 0)));
  bar.style.width = value + '%';
  percentElement.textContent = value + '%';

  // Calculate overall progress as average of the three bars
  if (type !== 'overall') {
    // The labels are the source of truth; treat unreadable text as 0.
    const readPercent = (el) => parseInt(el.textContent, 10) || 0;
    const overall = Math.round(
      (readPercent(urlDiscoveryPercent) +
        readPercent(contentExtractionPercent) +
        readPercent(aiProcessingPercent)) / 3
    );
    updateProgressBar('overall', overall);
  }
}
// Stop crawling
/**
 * Ends the running crawl: stops both timers, records the end time, updates
 * the step indicators and schedules result processing.
 *
 * @param {Event} [event] Present when invoked as the Stop button's click
 *                        handler; absent when the simulation ends on its own.
 *                        Only used to choose the log message.
 */
function stopCrawling(event) {
  // The Stop button stays visible after a crawl ends; ignore repeat calls so
  // the end time is not overwritten and results are not processed twice.
  if (!isCrawling) return;

  clearInterval(crawlInterval);
  clearInterval(timerInterval);
  isCrawling = false;
  crawlData.endTime = new Date();

  // Update UI
  startCrawlBtn.disabled = false;
  if (event) {
    addLogEntry('Crawl stopped by user', 'warning');
  } else {
    addLogEntry('Crawl finished', 'info');
  }
  aiAnalysis.textContent = 'Crawl stopped - analyzing results';

  // Mark all steps as completed or error. Steps already flagged as failed
  // keep their error state; the final step reflects the overall outcome.
  [step1, step2, step3].forEach(function (step) {
    if (!step.classList.contains('error')) {
      step.className = 'progress-step completed';
    }
  });
  if (crawlData.errorCount > 0) {
    step4.className = 'progress-step error';
    addLogEntry('Crawl completed with errors', 'error');
  } else {
    step4.className = 'progress-step completed';
    addLogEntry('Crawl completed successfully', 'success');
  }

  // Process results after a short delay
  setTimeout(processResults, 500);
}
// Simulate a crawl step | |
function simulateCrawlStep() { | |
if (!isCrawling) return; | |
// Randomly decide if we're done | |
if (Math.random() < 0.1 && crawlData.pagesCrawled > 5) { | |
stopCrawling(); | |
return; | |
} | |
// Randomly generate errors (10% chance) | |
if (Math.random() < 0.1) { | |
const errorTypes = [ | |
'Connection timeout', | |
'SSL certificate error', | |
'404 Not Found', | |
'403 Forbidden', | |
'500 Server Error' | |
]; | |
const errorType = errorTypes[Math.floor(Math.random() * errorTypes.length)]; | |
const fakeUrl = generateFakeUrl(crawlData.startUrl); | |
crawlData.errorCount++; | |
errorCount.textContent = crawlData.errorCount; | |
addLogEntry(`Error crawling ${fakeUrl}: ${errorType}`, 'error'); | |
// Randomly fail a step if we have multiple errors | |
if (crawlData.errorCount > 2 && Math.random() < 0.3) { | |
const steps = [step2, step3, step4]; | |
const failedStep = steps[Math.floor(Math.random() * steps.length)]; | |
failedStep.className = 'progress-step error'; | |
addLogEntry(`Step failed: ${failedStep.querySelector('span').textContent}`, 'error'); | |
} | |
return; | |
} | |
// Simulate finding new URLs | |
const newUrls = Math.floor(Math.random() * 3) + 1; | |
crawlData.totalUrls += newUrls; | |
totalUrls.textContent = crawlData.totalUrls; | |
// Update URL discovery progress | |
if (crawlData.pagesCrawled < 5) { | |
const progress = Math.min(100, 10 + (crawlData.pagesCrawled / 5) * 90); | |
updateProgressBar('urlDiscovery', progress); | |
} | |
// Simulate crawling a page | |
crawlData.pagesCrawled++; | |
pagesCrawled.textContent = crawlData.pagesCrawled; | |
// Simulate URL being crawled | |
const fakeUrl = generateFakeUrl(crawlData.startUrl); | |
currentUrl.textContent = fakeUrl; | |
// Simulate AI analysis | |
const aiMessages = [ | |
"Analyzing content structure...", | |
"Evaluating content quality...", | |
"Checking for relevant information...", | |
"Identifying key concepts...", | |
"Filtering low-quality content..." | |
]; | |
aiAnalysis.textContent = aiMessages[Math.floor(Math.random() * aiMessages.length)]; | |
// Randomly decide if we found valid content | |
if (Math.random() > 0.3) { | |
crawlData.validContent++; | |
validContent.textContent = crawlData.validContent; | |
// Simulate content being saved | |
if (Math.random() > 0.5) { | |
crawlData.contentSaved++; | |
contentSaved.textContent = crawlData.contentSaved; | |
// Update content extraction progress | |
const contentProgress = Math.min(100, (crawlData.contentSaved / 10) * 100); | |
updateProgressBar('contentExtraction', contentProgress); | |
// Activate content extraction step if not already | |
if (step3.className.includes('pending')) { | |
step2.className = 'progress-step completed'; | |
step3.className = 'progress-step active'; | |
addLogEntry('Site mapping complete, starting content extraction', 'success'); | |
} | |
// Generate fake content | |
const contentTypes = ['text', 'code', 'table', 'list']; | |
const type = contentTypes[Math.floor(Math.random() * contentTypes.length)]; | |
// Update stats | |
if (type === 'text') crawlData.stats.text++; | |
if (type === 'code') crawlData.stats.code++; | |
if (type === 'table') crawlData.stats.tables++; | |
if (type === 'list') crawlData.stats.lists++; | |
// Generate fake score if LLM filter is enabled | |
let score = 0; | |
if (useLLMFilterCheckbox.checked) { | |
score = Math.floor(Math.random() * 41) + 60; // 60-100 | |
crawlData.totalScore += score; | |
if (score >= parseInt(minLLMScoreInput.value)) { | |
crawlData.aiApproved++; | |
aiApproved.textContent = crawlData.aiApproved; | |
// Update AI processing progress | |
const aiProgress = Math.min(100, (crawlData.aiApproved / 5) * 100); | |
updateProgressBar('aiProcessing', aiProgress); | |
// Activate AI processing step if not already | |
if (step4.className.includes('pending')) { | |
step3.className = 'progress-step completed'; | |
step4.className = 'progress-step active'; | |
addLogEntry('Content extraction complete, starting AI analysis', 'success'); | |
} | |
} | |
} | |
// Calculate average score | |
if (crawlData.aiApproved > 0) { | |
const avg = Math.round(crawlData.totalScore / crawlData.aiApproved); | |
avgScore.textContent = avg; | |
} | |
// Add log entry | |
addLogEntry(`Saved ${type} content from ${fakeUrl}` + | |
(useLLMFilterCheckbox.checked ? ` (AI Score: ${score})` : ''), 'success'); | |
// Add to results | |
const result = { | |
url: fakeUrl, | |
type: type, | |
content: generateFakeContent(type), | |
score: score, | |
keywords: generateFakeKeywords() | |
}; | |
crawlData.results.push(result); | |
// Add keywords to cloud | |
result.keywords.forEach(keyword => { | |
if (!crawlData.keywords.includes(keyword)) { | |
crawlData.keywords.push(keyword); | |
} | |
}); | |
} | |
} | |
// Add random log entries | |
if (Math.random() > 0.7) { | |
const logMessages = [ | |
{msg: `Found ${newUrls} new URLs to crawl`, type: 'info'}, | |
{msg: "Processing page content...", type: 'info'}, | |
{msg: "Extracting text paragraphs...", type: 'info'}, | |
{msg: "Identifying code blocks...", type: 'info'}, | |
{msg: "Parsing table structures...", type: 'info'}, | |
{msg: "Waiting for server response...", type: 'warning'}, | |
{msg: "Rate limit approaching, slowing down requests", type: 'warning'} | |
]; | |
const message = logMessages[Math.floor(Math.random() * logMessages.length)]; | |
addLogEntry(message.msg, message.type); | |
} | |
} | |
/**
 * Append a timestamped entry to the on-page crawl log and record it in
 * `crawlData.logEntries`.
 *
 * @param {string} message - Text to display. It is inserted as plain text, never parsed as HTML.
 * @param {string} [type='info'] - Added as a CSS class on the entry; callers in this file
 *     pass 'info', 'success', 'warning' or 'error'.
 */
function addLogEntry(message, type = 'info') {
    const timeStr = new Date().toLocaleTimeString();

    const stamp = document.createElement('span');
    stamp.className = 'text-gray-500';
    stamp.textContent = `[${timeStr}]`;

    const entry = document.createElement('div');
    entry.className = `log-entry ${type} fade-in`;
    entry.appendChild(stamp);
    // Messages embed crawled URLs, so they go in as a text node: building the
    // entry through innerHTML would let markup inside a URL be parsed and executed.
    entry.appendChild(document.createTextNode(` ${message}`));

    crawlLog.appendChild(entry);
    // Keep the newest entry in view.
    crawlLog.scrollTop = crawlLog.scrollHeight;

    // Mirror the entry into the crawl data.
    crawlData.logEntries.push({
        time: timeStr,
        message: message,
        type: type
    });
}
/**
 * Reveal the results section and fill the summary panel from `crawlData`,
 * then refresh the keyword cloud, top content, content list and JSON viewer.
 *
 * Score statistics are shown only while the LLM filter checkbox is checked and
 * at least one item was approved; otherwise the three score fields read 'N/A'.
 */
function processResults() {
    resultsSection.classList.remove('hidden');

    // Crawl overview
    summaryStartUrl.textContent = crawlData.startUrl;
    summaryTotalPages.textContent = crawlData.pagesCrawled;
    summaryErrors.textContent = crawlData.errorCount;
    const duration = Math.round((crawlData.endTime - crawlData.startTime) / 1000);
    summaryDuration.textContent = duration + 's';

    // Content counts by type
    summaryText.textContent = crawlData.stats.text;
    summaryCode.textContent = crawlData.stats.code;
    summaryTables.textContent = crawlData.stats.tables;

    // A score of 0 means the result was saved without being scored, so it is
    // left out of the statistics.
    const scores = crawlData.results
        .map(result => result.score)
        .filter(score => score > 0);

    if (useLLMFilterCheckbox.checked && crawlData.aiApproved > 0 && scores.length > 0) {
        // Average over the same scored results that feed high/low. Dividing the
        // running totalScore by aiApproved mixed the sum of *all* scores with
        // the count of *approved* ones and could report an average above 100.
        const total = scores.reduce((sum, score) => sum + score, 0);
        summaryAvgScore.textContent = Math.round(total / scores.length);
        summaryHighScore.textContent = Math.max(...scores);
        summaryLowScore.textContent = Math.min(...scores);
    } else {
        summaryAvgScore.textContent = 'N/A';
        summaryHighScore.textContent = 'N/A';
        summaryLowScore.textContent = 'N/A';
    }

    updateKeywordCloud();
    updateTopContent();
    updateContentResults();
    updateJsonViewer();
}
/**
 * Rebuild the keyword cloud from `crawlData.keywords`.
 *
 * Shows a placeholder chip when there are no keywords; otherwise shows up to
 * 12 keywords chosen at random, each with a random size and colour class.
 */
function updateKeywordCloud() {
    const sizes = ['text-xs', 'text-sm', 'text-base', 'text-lg'];
    const colors = [
        'bg-blue-100 text-blue-800',
        'bg-green-100 text-green-800',
        'bg-purple-100 text-purple-800',
        'bg-yellow-100 text-yellow-800',
        'bg-red-100 text-red-800',
        'bg-indigo-100 text-indigo-800'
    ];
    const pickRandom = list => list[Math.floor(Math.random() * list.length)];

    keywordCloud.innerHTML = '';
    if (crawlData.keywords.length === 0) {
        keywordCloud.innerHTML = '<span class="px-3 py-1 bg-gray-100 rounded-full text-sm">No keywords extracted</span>';
        return;
    }

    // Fisher–Yates shuffle on a copy. A random sort comparator is inconsistent
    // between calls, which gives a biased, engine-dependent ordering.
    const pool = [...crawlData.keywords];
    for (let i = pool.length - 1; i > 0; i--) {
        const j = Math.floor(Math.random() * (i + 1));
        [pool[i], pool[j]] = [pool[j], pool[i]];
    }

    pool.slice(0, 12).forEach(keyword => {
        const chip = document.createElement('span');
        chip.className = `px-3 py-1 rounded-full ${pickRandom(sizes)} ${pickRandom(colors)} font-medium`;
        chip.textContent = keyword;
        keywordCloud.appendChild(chip);
    });
}
/**
 * Render the (up to) three highest-scoring results as preview cards inside
 * `topContent`, or a placeholder when there are no results.
 *
 * The score badge is included only while the LLM filter checkbox is checked.
 */
function updateTopContent() {
    // Everything interpolated into the card template passes through this.
    // Unescaped, the generated HTML code sample ("<!DOCTYPE html>...") is
    // consumed by the parser instead of being displayed, and markup inside a
    // crawled URL would be injected into the page.
    const escapeHtml = value => String(value).replace(/[&<>"']/g, ch => ({
        '&': '&amp;',
        '<': '&lt;',
        '>': '&gt;',
        '"': '&quot;',
        "'": '&#39;'
    }[ch]));

    topContent.innerHTML = '';
    if (crawlData.results.length === 0) {
        topContent.innerHTML = `
            <div class="p-4 bg-gray-50 rounded-lg text-sm text-gray-600">
                No content has been evaluated yet. Run a crawl to see results.
            </div>
        `;
        return;
    }

    // Highest score first; sort a copy so crawlData.results keeps crawl order.
    const top = [...crawlData.results]
        .sort((a, b) => b.score - a.score)
        .slice(0, 3);

    top.forEach(result => {
        let contentPreview = '';
        if (result.type === 'text') {
            contentPreview = result.content.substring(0, 150) + '...';
        } else if (result.type === 'code') {
            // Only the first line of a code sample is previewed.
            contentPreview = result.content.split('\n')[0] + '...';
        } else if (result.type === 'table') {
            contentPreview = 'Table with ' + result.content.rows.length + ' rows';
        } else if (result.type === 'list') {
            contentPreview = 'List with ' + result.content.items.length + ' items';
        }

        const title = result.type.charAt(0).toUpperCase() + result.type.slice(1);
        const scoreBadge = useLLMFilterCheckbox.checked
            ? `<span class="px-2 py-1 bg-green-100 text-green-800 rounded-full text-xs">Score: ${escapeHtml(result.score)}</span>`
            : '';
        const keywordChips = result.keywords
            .slice(0, 2)
            .map(k => `<span class="px-2 py-0.5 bg-gray-100 rounded-full text-xs">${escapeHtml(k)}</span>`)
            .join('');

        const card = document.createElement('div');
        card.className = 'result-card bg-white border border-gray-200 rounded-lg p-4 hover:shadow-md transition-all';
        card.innerHTML = `
            <div class="flex justify-between items-start mb-2">
                <h4 class="font-medium text-blue-600">${escapeHtml(title)}</h4>
                ${scoreBadge}
            </div>
            <p class="text-sm text-gray-600 mb-3">${escapeHtml(contentPreview)}</p>
            <div class="flex justify-between items-center">
                <a href="${escapeHtml(result.url)}" target="_blank" rel="noopener noreferrer" class="text-xs text-blue-500 hover:underline">View source</a>
                <div class="flex space-x-1">
                    ${keywordChips}
                </div>
            </div>
        `;
        topContent.appendChild(card);
    });
}
/**
 * Render every entry of `crawlData.results` as a full card inside
 * `contentResults`, or a placeholder when there are no results.
 *
 * Text is shown as a paragraph, code in a <pre><code> block, tables as an HTML
 * table and lists as a bulleted list. The score badge is included only while
 * the LLM filter checkbox is checked.
 */
function updateContentResults() {
    // Everything interpolated into the templates below passes through this.
    // Unescaped, generated HTML code samples are rendered as live markup
    // instead of being shown, and markup inside a crawled URL would be
    // injected into the page.
    const escapeHtml = value => String(value).replace(/[&<>"']/g, ch => ({
        '&': '&amp;',
        '<': '&lt;',
        '>': '&gt;',
        '"': '&quot;',
        "'": '&#39;'
    }[ch]));

    // Build the type-specific body markup for one result.
    const renderBody = result => {
        if (result.type === 'text') {
            return `<p class="text-gray-700">${escapeHtml(result.content)}</p>`;
        }
        if (result.type === 'code') {
            // <pre><code> stays on one line so no stray whitespace is shown before the code.
            return `
                <div class="code-block rounded-lg p-3 overflow-x-auto">
                    <pre><code>${escapeHtml(result.content)}</code></pre>
                </div>
            `;
        }
        if (result.type === 'table') {
            const headerCells = result.content.headers
                .map(h => `<th scope="col" class="px-4 py-2 text-left border">${escapeHtml(h)}</th>`)
                .join('');
            const bodyRows = result.content.rows
                .map(row => {
                    const cells = row
                        .map(cell => `<td class="px-4 py-2 border">${escapeHtml(cell)}</td>`)
                        .join('');
                    return `<tr>${cells}</tr>`;
                })
                .join('');
            return `
                <div class="overflow-x-auto">
                    <table class="min-w-full border">
                        <thead>
                            <tr class="bg-gray-100">${headerCells}</tr>
                        </thead>
                        <tbody>${bodyRows}</tbody>
                    </table>
                </div>
            `;
        }
        if (result.type === 'list') {
            const items = result.content.items
                .map(item => `<li>${escapeHtml(item)}</li>`)
                .join('');
            return `<ul class="list-disc pl-5 space-y-1">${items}</ul>`;
        }
        return '';
    };

    contentResults.innerHTML = '';
    if (crawlData.results.length === 0) {
        contentResults.innerHTML = `
            <div class="p-8 text-center text-gray-500">
                <i class="fas fa-inbox text-4xl mb-2"></i>
                <p>No content has been extracted yet.</p>
            </div>
        `;
        return;
    }

    crawlData.results.forEach(result => {
        const title = result.type.charAt(0).toUpperCase() + result.type.slice(1);
        const safeUrl = escapeHtml(result.url);
        const scoreBadge = useLLMFilterCheckbox.checked
            ? `<span class="px-2 py-1 bg-green-100 text-green-800 rounded-full text-xs">Score: ${escapeHtml(result.score)}</span>`
            : '';
        const keywordChips = result.keywords
            .map(k => `<span class="px-2 py-0.5 bg-gray-100 rounded-full text-xs">${escapeHtml(k)}</span>`)
            .join('');

        const card = document.createElement('div');
        card.className = 'result-card bg-white border border-gray-200 rounded-lg p-4 hover:shadow-md transition-all fade-in';
        card.innerHTML = `
            <div class="flex justify-between items-start mb-3">
                <div>
                    <h4 class="font-medium text-blue-600">${escapeHtml(title)}</h4>
                    <a href="${safeUrl}" target="_blank" rel="noopener noreferrer" class="text-xs text-gray-500 hover:underline">${safeUrl}</a>
                </div>
                ${scoreBadge}
            </div>
            ${renderBody(result)}
            <div class="mt-3 pt-2 border-t border-gray-100">
                <div class="flex flex-wrap gap-1">
                    ${keywordChips}
                </div>
            </div>
        `;
        contentResults.appendChild(card);
    });
}
/**
 * Serialize the crawl metadata and results as pretty-printed JSON
 * (2-space indent) and show it in `jsonViewer` as plain text.
 */
function updateJsonViewer() {
    const elapsedSeconds = Math.round((crawlData.endTime - crawlData.startTime) / 1000);

    const metadata = {
        source: crawlData.startUrl,
        pages: crawlData.pagesCrawled,
        duration: elapsedSeconds + 's',
        created: new Date().toISOString(),
        errors: crawlData.errorCount
    };

    const content = crawlData.results.map(entry => {
        // Tables are reduced to their headers and rows; other types pass through unchanged.
        let payload = entry.content;
        if (entry.type === 'table') {
            payload = { headers: entry.content.headers, rows: entry.content.rows };
        }
        return {
            url: entry.url,
            type: entry.type,
            content: payload,
            score: entry.score,
            keywords: entry.keywords
        };
    });

    jsonViewer.textContent = JSON.stringify({ metadata: metadata, content: content }, null, 2);
}
/**
 * Show one results tab and mark its button as active.
 *
 * @param {string} tab - Tab key; the panel id is `<tab>Tab` and the button id
 *     is `tab<Tab>` (first letter upper-cased), e.g. 'summary', 'content', 'json'.
 */
function switchTab(tab) {
    const panelIds = ['summaryTab', 'contentTab', 'jsonTab'];
    const buttonIds = ['tabSummary', 'tabContent', 'tabJson'];
    const byId = id => document.getElementById(id);

    // Reset: hide every panel and return every button to the inactive look.
    panelIds.forEach(id => byId(id).classList.add('hidden'));
    buttonIds.forEach(id => byId(id).classList.remove('tab-active'));
    buttonIds.forEach(id => byId(id).classList.add('text-gray-500'));

    // Activate the requested tab.
    byId(tab + 'Tab').classList.remove('hidden');
    const activeButtonId = 'tab' + tab.charAt(0).toUpperCase() + tab.slice(1);
    byId(activeButtonId).classList.add('tab-active');
    byId(activeButtonId).classList.remove('text-gray-500');
}
/**
 * Download the crawl results in the format selected by the checked
 * `outputFormat` radio: JSON of `crawlData.results` when the value is 'json',
 * otherwise the Markdown report. The selected value is also used as the file
 * extension.
 */
function downloadResults() {
    const chosenFormat = document.querySelector('input[name="outputFormat"]:checked').value;

    let payload;
    if (chosenFormat === 'json') {
        payload = new Blob([JSON.stringify(crawlData.results, null, 2)], { type: 'application/json' });
    } else {
        payload = new Blob([generateMarkdownOutput()], { type: 'text/markdown' });
    }

    const objectUrl = URL.createObjectURL(payload);
    const datePart = new Date().toISOString().slice(0, 10);

    // Trigger the download through a temporary link that is attached to the
    // document only for the duration of the click.
    const link = document.createElement('a');
    link.href = objectUrl;
    link.download = `crawl_results_${datePart}.${chosenFormat}`;
    document.body.appendChild(link);
    link.click();
    document.body.removeChild(link);

    URL.revokeObjectURL(objectUrl);
}
/**
 * Build a Markdown report of the crawl: a header with overall counts followed
 * by one section per result, each ending in a horizontal rule.
 *
 * @returns {string} The complete Markdown document.
 */
function generateMarkdownOutput() {
    const chunks = [
        `# Web Crawl Results\n\n`,
        `- **Source**: ${crawlData.startUrl}\n`,
        `- **Pages Crawled**: ${crawlData.pagesCrawled}\n`,
        `- **Content Saved**: ${crawlData.contentSaved}\n`,
        `- **Errors**: ${crawlData.errorCount}\n`,
        `- **Date**: ${new Date().toISOString()}\n\n`
    ];

    for (const entry of crawlData.results) {
        chunks.push(`## ${entry.url}\n\n`);
        chunks.push(`- **Type**: ${entry.type}\n`);
        // The score line appears only while the LLM filter is enabled.
        if (useLLMFilterCheckbox.checked) {
            chunks.push(`- **AI Score**: ${entry.score}\n`);
        }
        chunks.push(`- **Keywords**: ${entry.keywords.join(', ')}\n\n`);

        switch (entry.type) {
            case 'text':
                chunks.push(`${entry.content}\n\n`);
                break;
            case 'code':
                chunks.push(`\`\`\`\n${entry.content}\n\`\`\`\n\n`);
                break;
            case 'table':
                // Header row, separator row, then one line per data row.
                chunks.push(`| ${entry.content.headers.join(' | ')} |\n`);
                chunks.push(`| ${entry.content.headers.map(() => '---').join(' | ')} |\n`);
                for (const cells of entry.content.rows) {
                    chunks.push(`| ${cells.join(' | ')} |\n`);
                }
                chunks.push('\n');
                break;
            case 'list':
                for (const bullet of entry.content.items) {
                    chunks.push(`- ${bullet}\n`);
                }
                chunks.push('\n');
                break;
        }

        chunks.push('---\n\n');
    }

    return chunks.join('');
}
/**
 * Copy the JSON viewer's text to the clipboard and flash feedback on the
 * first button inside `#jsonTab`; the label reverts to "Copy" after 2 seconds.
 *
 * A failed or unavailable clipboard is reported on the button and logged
 * instead of being left as an unhandled promise rejection.
 */
function copyJson() {
    const copyBtn = document.querySelector('#jsonTab button');

    // Show a temporary label on the button, then restore the default one.
    const flash = markup => {
        if (!copyBtn) {
            return;
        }
        copyBtn.innerHTML = markup;
        setTimeout(() => {
            copyBtn.innerHTML = '<i class="fas fa-copy mr-1"></i> Copy';
        }, 2000);
    };

    const reportFailure = reason => {
        console.error('Copy to clipboard failed:', reason);
        flash('<i class="fas fa-xmark mr-1"></i> Copy failed');
    };

    // navigator.clipboard is undefined outside secure contexts (e.g. plain http).
    if (!navigator.clipboard || typeof navigator.clipboard.writeText !== 'function') {
        reportFailure(new Error('Clipboard API is not available'));
        return;
    }

    navigator.clipboard.writeText(jsonViewer.textContent).then(
        () => flash('<i class="fas fa-check mr-1"></i> Copied!'),
        reportFailure
    );
}
/**
 * After user confirmation, reset `crawlData` to its empty state, re-render the
 * result views from it and reset the summary panel fields.
 */
function clearResults() {
    if (!confirm('Are you sure you want to clear all results?')) {
        return;
    }

    crawlData = {
        startUrl: '',
        startTime: null,
        endTime: null,
        pagesCrawled: 0,
        contentSaved: 0,
        totalUrls: 0,
        validContent: 0,
        aiApproved: 0,
        totalScore: 0,
        errorCount: 0,
        results: [],
        keywords: [],
        stats: { text: 0, code: 0, tables: 0, lists: 0 },
        logEntries: []
    };

    // Re-render every view that is derived from crawlData.
    updateKeywordCloud();
    updateTopContent();
    updateContentResults();
    updateJsonViewer();

    // Summary panel back to its defaults.
    summaryStartUrl.textContent = '-';
    summaryDuration.textContent = '0s';
    const zeroedFields = [
        summaryTotalPages,
        summaryErrors,
        summaryText,
        summaryCode,
        summaryTables,
        summaryAvgScore,
        summaryHighScore,
        summaryLowScore
    ];
    for (const field of zeroedFields) {
        field.textContent = '0';
    }
}
// Helper functions | |
/**
 * Build a random, plausible-looking page URL under `baseUrl` for the simulation.
 *
 * The result is `<base>/<path><ext><query>`, where path and extension are drawn
 * from fixed lists and a `?id=<0-999>` query is appended about 30% of the time.
 *
 * @param {string} baseUrl - Site root; trailing slashes are ignored.
 * @returns {string} The generated URL.
 */
function generateFakeUrl(baseUrl) {
    const paths = [
        'about', 'contact', 'products', 'services', 'blog',
        'article', 'docs', 'tutorial', 'guide', 'faq'
    ];
    const extensions = ['', '.html', '.php', '/'];
    const pickRandom = list => list[Math.floor(Math.random() * list.length)];

    const path = pickRandom(paths);
    const ext = pickRandom(extensions);
    const query = Math.random() > 0.7 ? '?id=' + Math.floor(Math.random() * 1000) : '';

    // Drop trailing slashes so "https://site.com/" does not become "https://site.com//about".
    const root = String(baseUrl).replace(/\/+$/, '');
    return root + '/' + path + ext + query;
}
function generateFakeContent(type) { | |
if (type === 'text') { | |
const paragraphs = [ | |
"The quick brown fox jumps over the lazy dog. This sentence contains all the letters in the English alphabet.", | |
"Web crawling is an essential technique for gathering information from websites. It involves systematically browsing the web to index and collect data.", | |
"Artificial intelligence is transforming many industries by automating complex tasks and providing insights from large datasets.", | |
"The future of technology lies in the convergence of AI, blockchain, and IoT, creating smarter and more connected systems.", | |
"Responsive web design ensures that websites adapt to different screen sizes and devices, providing optimal viewing experiences." | |
]; | |
return paragraphs[Math.floor(Math.random() * paragraphs.length)]; | |
} else if (type === 'code') { | |
const languages = ['javascript', 'python', 'html', 'css', 'java']; | |
const language = languages[Math.floor(Math.random() * languages.length)]; | |
if (language === 'javascript') { | |
return `function greet(name) {\n return "Hello, " + name + "!";\n}\n\nconst message = greet("World");\nconsole.log(message);`; | |
} else if (language === 'python') { | |
return `def factorial(n):\n if n == 0:\n return 1\n else:\n return n * factorial(n-1)\n\nprint(factorial(5))`; | |
} else if (language === 'html') { | |
return `<!DOCTYPE html>\n<html>\n<head>\n <title>Example</title>\n</head>\n<body>\n <h1>Hello World</h1>\n<p style="border-radius: 8px; text-align: center; font-size: 12px; color: #fff; margin-top: 16px;position: fixed; left: 8px; bottom: 8px; z-index: 10; background: rgba(0, 0, 0, 0.8); padding: 4px 8px;">Made with <img src="https://enzostvs-deepsite.hf.space/logo.svg" alt="DeepSite Logo" style="width: 16px; height: 16px; vertical-align: middle;display:inline-block;margin-right:3px;filter:brightness(0) invert(1);"><a href="https://enzostvs-deepsite.hf.space" style="color: #fff;text-decoration: underline;" target="_blank" >DeepSite</a> - 🧬 <a href="https://enzostvs-deepsite.hf.space?remix=gewei20/smart-web-crawler" style="color: #fff;text-decoration: underline;" target="_blank" >Remix</a></p></body>\n</html> |