Nymbo commited on
Commit
fef0301
·
verified ·
1 Parent(s): b7ce93f

Create index.html

Browse files
Files changed (1) hide show
  1. index.html +704 -0
index.html ADDED
@@ -0,0 +1,704 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <!--
3
+ File: micro-apps/text-tools/url-formatter-cleaner-with-titles-v2.html
4
+ Purpose: Extract only regular (non-image) HTTP/HTTPS links from large pasted text,
5
+ and (optionally) fetch page titles to emit Markdown links WITHOUT bullets
6
+ and WITHOUT any leading "Title:" prefixes.
7
+
8
+ Notes:
9
+ - Single-file Micro-App (no build step, no local assets).
10
+ - In-line comments explain each section in plain language.
11
+ - Title fetching is best-effort: many sites block cross-origin reads (CORS).
12
+ • We first try a normal fetch.
13
+ • If that fails and the "Use proxy" option is ON, we fall back to https://r.jina.ai/<URL>
14
+ which returns a readability-friendly text version that often contains the page title.
15
+ • We sanitize titles to strip "Title:" and leading headings/bullets.
16
+ • If everything fails, we fall back to a title derived from the URL's last path segment, and then to the URL's hostname.
17
+ -->
18
+ <html lang="en">
19
+ <head>
20
+ <meta charset="UTF-8" />
21
+ <meta name="viewport" content="width=device-width, initial-scale=1" />
22
+ <title>URL Cleaner + Title Fetcher — Markdown Links (No Bullets)</title>
23
+ <style>
24
+ /* =============== Theme tokens (easy color tweaking) =============== */
25
+ :root{
26
+ --bg:#0f1115; /* page background (dark) */
27
+ --panel:#161a23; /* cards/panels */
28
+ --panel-2:#0d1118; /* headers / subpanels */
29
+ --text:#e5e7eb; /* main text */
30
+ --muted:#9aa3b2; /* secondary text */
31
+ --accent:#5b9cff; /* primary accent */
32
+ --accent-2:#2f6fe6; /* accent hover */
33
+ --border:#222838; /* subtle borders */
34
+ --good:#22c55e; /* success green */
35
+ --warn:#f59e0b; /* warning amber */
36
+ --bad:#ef4444; /* danger red */
37
+ --radius-lg:16px; /* big rounded corners */
38
+ --radius:12px; /* normal rounded corners */
39
+ --pad:14px; /* base padding */
40
+ --shadow:0 18px 50px rgba(0,0,0,.35);
41
+ --mono: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", monospace;
42
+ --sans: Inter, ui-sans-serif, system-ui, Segoe UI, Roboto, Helvetica, Arial;
43
+ }
44
+ /* Optional light theme (toggle button) */
45
+ [data-theme="light"]{
46
+ --bg:#f6f7fb;
47
+ --panel:#ffffff;
48
+ --panel-2:#f2f5fb;
49
+ --text:#0f172a;
50
+ --muted:#5b6474;
51
+ --accent:#2563eb;
52
+ --accent-2:#1e40af;
53
+ --border:#e6e8ee;
54
+ --shadow:0 18px 50px rgba(0,0,0,.08);
55
+ }
56
+
57
+ /* =============== Page reset and layout shell =============== */
58
+ *{box-sizing:border-box}
59
+ html,body{height:100%}
60
+ body{
61
+ margin:0; background:var(--bg); color:var(--text);
62
+ font-family:var(--sans); -webkit-font-smoothing:antialiased; -moz-osx-font-smoothing:grayscale;
63
+ }
64
+
65
+ header{
66
+ position:sticky; top:0; z-index:10;
67
+ display:flex; align-items:center; gap:10px;
68
+ padding:var(--pad);
69
+ background:linear-gradient(180deg, rgba(255,255,255,.04), rgba(255,255,255,0)) , var(--bg);
70
+ border-bottom:1px solid var(--border);
71
+ -webkit-backdrop-filter: blur(6px);
72
+ backdrop-filter: blur(6px);
73
+ }
74
+ .brand{
75
+ display:flex; align-items:center; gap:10px;
76
+ background:var(--panel); border:1px solid var(--border);
77
+ padding:8px 12px; border-radius:10px; box-shadow:var(--shadow); font-weight:700;
78
+ }
79
+ .brand svg{color:var(--accent)}
80
+ .btn{
81
+ appearance:none; border:1px solid var(--border);
82
+ background:var(--panel); color:var(--text);
83
+ border-radius:10px; padding:10px 12px; cursor:pointer;
84
+ transition: transform .12s ease, border-color .12s ease, background .12s ease;
85
+ }
86
+ .btn:hover{transform:translateY(-1px); border-color:var(--accent)}
87
+ .btn.primary{background:var(--accent); border-color:var(--accent); color:#fff}
88
+ .btn.primary:hover{background:var(--accent-2); border-color:var(--accent-2)}
89
+ .btn[disabled]{opacity:.6; cursor:not-allowed; transform:none}
90
+ .pill{
91
+ display:inline-flex; align-items:center; gap:6px; font-size:12px; color:var(--muted);
92
+ border:1px solid var(--border); background:var(--panel); padding:6px 10px; border-radius:999px;
93
+ }
94
+ .kbd{
95
+ font-family:var(--mono); font-size:12px; padding:2px 6px;
96
+ border:1px solid var(--border); border-bottom-width:3px; border-radius:6px; background:var(--panel-2);
97
+ }
98
+
99
+ main{
100
+ padding:var(--pad);
101
+ display:grid; gap:var(--pad);
102
+ grid-template-columns: 1fr 1fr; /* two columns: input | output */
103
+ }
104
+ @media (max-width: 1024px){ main{grid-template-columns:1fr} }
105
+
106
+ .card{
107
+ background:var(--panel); border:1px solid var(--border);
108
+ border-radius:var(--radius-lg); box-shadow:var(--shadow); overflow:hidden;
109
+ display:grid; grid-template-rows:auto 1fr auto;
110
+ }
111
+ .card-header{
112
+ background:var(--panel-2); border-bottom:1px solid var(--border);
113
+ padding:var(--pad); display:flex; align-items:center; justify-content:space-between; gap:10px;
114
+ }
115
+ .card-body{ padding:var(--pad); overflow:auto }
116
+ .card-footer{ padding:var(--pad); border-top:1px solid var(--border); background:var(--panel-2); display:flex; gap:8px; flex-wrap:wrap }
117
+
118
+ h1,h2,h3{margin:0}
119
+ .muted{color:var(--muted)}
120
+ .row{display:flex; gap:8px; align-items:center}
121
+ .spacer{flex:1}
122
+ .hidden{display:none}
123
+
124
+ textarea{
125
+ width:100%; min-height:320px; resize:vertical;
126
+ border-radius:12px; border:1px solid var(--border);
127
+ background:var(--panel-2); color:var(--text); padding:12px;
128
+ font-family:var(--mono); font-size:14px; line-height:1.45;
129
+ }
130
+ .stats{ display:flex; gap:12px; flex-wrap:wrap; font-size:13px; color:var(--muted) }
131
+ .stats b{color:var(--text)}
132
+ .options{ display:flex; gap:14px; flex-wrap:wrap; align-items:center; font-size:14px; color:var(--muted) }
133
+ label.switch{
134
+ display:inline-flex; gap:8px; align-items:center; cursor:pointer;
135
+ background:var(--panel-2); padding:6px 10px; border:1px solid var(--border); border-radius:10px;
136
+ }
137
+ input[type="checkbox"]{ accent-color: var(--accent) }
138
+
139
+ /* Small toast notification (copy / status messages) */
140
+ .toast{
141
+ position:fixed; bottom:16px; right:16px; z-index:50;
142
+ background:var(--panel); border:1px solid var(--border); border-left:6px solid var(--good);
143
+ padding:10px 12px; border-radius:10px; box-shadow:var(--shadow);
144
+ opacity:0; transform:translateY(8px); transition:opacity .18s ease, transform .18s ease;
145
+ max-width: 60ch; white-space: pre-wrap;
146
+ }
147
+ .toast.show{opacity:1; transform:translateY(0)}
148
+ .spinner{ inline-size:14px; block-size:14px; border:2px solid var(--border); border-top-color:var(--accent); border-radius:50%; animation:spin 1s linear infinite }
149
+ @keyframes spin{ to{ transform:rotate(360deg) } }
150
+ </style>
151
+ </head>
152
+ <body>
153
+ <!-- ========= Top bar with title and quick actions ========= -->
154
+ <header>
155
+ <div class="brand" role="img" aria-label="URL Cleaner">
156
+ <!-- tiny logo dot -->
157
+ <svg width="16" height="16" viewBox="0 0 24 24" fill="none" aria-hidden="true">
158
+ <circle cx="12" cy="12" r="8" fill="currentColor"></circle>
159
+ </svg>
160
+ URL Cleaner
161
+ </div>
162
+ <span class="pill">Paste → Clean URLs → Markdown with titles (no bullets)</span>
163
+ <div class="spacer"></div>
164
+ <button id="themeToggle" class="btn" title="Toggle theme (dark/light)">Toggle Theme</button>
165
+ </header>
166
+
167
+ <!-- ========= Main two-panel layout: Input | Output ========= -->
168
+ <main id="app" data-theme="dark">
169
+ <!-- ========== Left: Input panel ========== -->
170
+ <section class="card">
171
+ <div class="card-header">
172
+ <h3>Input Text</h3>
173
+ <div class="row">
174
+ <button id="btnPaste" class="btn" title="Paste from clipboard">Paste</button>
175
+ <span class="pill"><span class="kbd">Ctrl</span>/<span class="kbd">Cmd</span> + <span class="kbd">Enter</span> clean · <span class="kbd">Shift</span> + <span class="kbd">Enter</span> titles</span>
176
+ </div>
177
+ </div>
178
+ <div class="card-body">
179
+ <!-- Where you paste long/messy text with URLs -->
180
+ <textarea id="inputArea" placeholder="Paste any text here (descriptions, HTML/Markdown, etc.)"></textarea>
181
+ </div>
182
+ <div class="card-footer">
183
+ <!-- Simple options that tweak cleaning behavior -->
184
+ <div class="options">
185
+ <label class="switch" title="Remove UTM/fbclid/tracking params from kept links">
186
+ <input type="checkbox" id="optStripTracking" checked />
187
+ <span>Strip tracking params</span>
188
+ </label>
189
+ <label class="switch" title="Remove exact-duplicate links (case-insensitive)">
190
+ <input type="checkbox" id="optDedupe" checked />
191
+ <span>De-duplicate</span>
192
+ </label>
193
+ <label class="switch" title="Only keep http/https (ignore other schemes)">
194
+ <input type="checkbox" id="optHttpOnly" checked />
195
+ <span>HTTP(S) only</span>
196
+ </label>
197
+ <label class="switch" title="Fallback to a CORS-friendly proxy when fetching titles (best compatibility)">
198
+ <input type="checkbox" id="optProxy" checked />
199
+ <span>Use proxy for titles</span>
200
+ </label>
201
+ </div>
202
+ <div class="spacer"></div>
203
+ <button id="btnClean" class="btn primary" title="Extract non-image links (Ctrl/Cmd+Enter)">Clean Links</button>
204
+ </div>
205
+ </section>
206
+
207
+ <!-- ========== Right: Output panel ========== -->
208
+ <section class="card">
209
+ <div class="card-header">
210
+ <h3>Output</h3>
211
+ <div class="row">
212
+ <span id="counts" class="pill" aria-live="polite">0 kept · 0 removed</span>
213
+ <span id="loading" class="pill hidden"><span class="spinner" aria-hidden="true"></span> <span id="loadingText">Fetching…</span></span>
214
+ <button id="btnTitles" class="btn" title="Fetch titles and output Markdown (no bullets)">Titles → Markdown</button>
215
+ <button id="btnAddSpacing" class="btn" title="Insert a blank line between each link in the output">Add spacing</button>
216
+ <button id="btnRemoveSpacing" class="btn" title="Remove extra blank lines between links">Remove spacing</button>
217
+ <button id="btnSortDomain" class="btn" title="Sort by domain (group similar domains)">Sort by domain</button>
218
+ <button id="btnCopy" class="btn" title="Copy all to clipboard">Copy All</button>
219
+ <button id="btnDownload" class="btn" title="Download as .txt">Download</button>
220
+ </div>
221
+ </div>
222
+ <div class="card-body">
223
+ <!-- Output (either plain URLs or Markdown links, one per line); the textarea stays editable -->
224
+ <textarea id="outputArea" placeholder="Your cleaned list (or Markdown) will appear here"></textarea>
225
+ </div>
226
+ <div class="card-footer">
227
+ <div class="stats" id="stats">
228
+ <div><b>Found:</b> <span id="statFound">0</span> links</div>
229
+ <div><b>Removed images:</b> <span id="statImg">0</span></div>
230
+ <div><b>Removed dupes:</b> <span id="statDupes">0</span></div>
231
+ <div><b>Kept:</b> <span id="statKept">0</span></div>
232
+ <div><b>Titles resolved:</b> <span id="statTitles">0</span></div>
233
+ </div>
234
+ </div>
235
+ </section>
236
+ </main>
237
+
238
+ <!-- Small toast element for feedback (copy / errors / summaries) -->
239
+ <div id="toast" class="toast" role="status" aria-live="polite"></div>
240
+
241
+ <script>
242
+ // ===========================
243
+ // Helper shortcuts (tiny DOM utilities)
244
+ // ===========================
245
// qs: first element under `el` (the whole document by default) matching a CSS selector.
const qs = (s, el = document) => {
  return el.querySelector(s);
};

// qsa: every matching element, copied into a real Array so array methods are available.
const qsa = (s, el = document) => {
  const matches = el.querySelectorAll(s);
  return [...matches];
};

// on: thin wrapper around addEventListener (returns whatever addEventListener returns).
const on = (el, ev, fn, opts) => {
  return el.addEventListener(ev, fn, opts);
};
248
+
249
+ // ===========================
250
+ // URL detection and cleaning utilities
251
+ // ===========================
252
+
253
+ // Find raw http/https URLs in text (keeps anything until whitespace or obvious break)
254
+ // Layman's terms: look for "http://..." or "https://..." and grab characters until a space or quote.
255
// Matches raw http/https URLs: everything after the scheme up to whitespace, a quote/backtick,
// an angle bracket, or a closing ")", "]" or "}" (so Markdown/HTML wrappers are not swallowed).
// "+" is deliberately NOT a terminator: it is a legal URL character (commonly an encoded space
// in query strings), and excluding it truncated links such as ".../search?q=a+b".
const URL_REGEX = /https?:\/\/[^\s<>"'`)\]}]+/gi;
256
+
257
+ // Direct image file extensions
258
// Path suffixes that mark a URL as pointing straight at an image file (case-insensitive).
const IMAGE_EXT = /\.(?:png|jpe?g|gif|bmp|webp|tiff?|svg|avif|heic|heif)$/i;

// Query-string keys whose value may name an image format.
const IMAGE_HINT_PARAMS = [
  'format',
  'fm',
  'image',
  'img',
  'ext',
];

// Query parameters removed from kept links when tracking-stripping is enabled.
const TRACKING_PARAMS = [
  // UTM family
  'utm_source',
  'utm_medium',
  'utm_campaign',
  'utm_term',
  'utm_content',
  'utm_name',
  'utm_id',
  'utm_creative_format',
  'utm_creative_tactic',
  // click / campaign identifiers
  'gclid',
  'fbclid',
  'mc_cid',
  'mc_eid',
  'igshid',
  'msclkid',
  'vero_conv',
  'vero_id',
];
269
+
270
+ // Trim trailing punctuation (like ")" or ",") and angle brackets around URLs
271
/**
 * Remove wrapping angle brackets and trailing sentence punctuation from a URL candidate.
 *
 * Trailing characters are removed ONE AT A TIME. The previous version removed the whole
 * trailing run in a single replace, which bypassed the balanced-parenthesis guard:
 * "…/Foo_(bar)." lost its closing ")" together with the ".".
 *
 * @param {string} url - raw URL text as matched in the input.
 * @returns {string} the URL without leading "<", trailing ">", or trailing punctuation;
 *   a closing ")" is kept while the URL's parentheses are balanced.
 */
function trimTrailingPunctuation(url){
  let out = url.replace(/^<+|>+$/g, '');
  const TRAIL_CHAR = /[),.;:'"\]\}>]$/;
  while (TRAIL_CHAR.test(out)) {
    if (out.endsWith(')')) {
      // Keep a ")" that closes a "(" inside the URL (e.g. Wikipedia-style paths).
      const open = (out.match(/\(/g) || []).length;
      const close = (out.match(/\)/g) || []).length;
      if (close <= open) break;
    }
    out = out.slice(0, -1);
  }
  return out;
}
284
+
285
+ // Decide if a URL points directly to an image file
286
/**
 * Report whether a link should be treated as a direct image.
 *
 * @param {string} href - candidate link.
 * @returns {boolean} true for data:/blob: links, for http(s) URLs whose path ends in an
 *   image extension, or whose hint query parameters contain an image format name;
 *   false for unparseable or non-http(s) URLs and everything else.
 */
function isDirectImageURL(href){
  if (/^(data|blob):/i.test(href)) return true;

  let parsed;
  try {
    parsed = new URL(href);
  } catch {
    return false;
  }

  if (!/^https?:$/i.test(parsed.protocol)) return false;
  if (IMAGE_EXT.test(parsed.pathname)) return true;

  const formatWord = /(?:png|jpe?g|gif|bmp|webp|tiff?|svg|avif|heic|heif)/;
  return IMAGE_HINT_PARAMS.some((key) => {
    const value = (parsed.searchParams.get(key) || '').toLowerCase();
    return formatWord.test(value);
  });
}
297
+
298
+ // Optionally remove common tracking parameters from a URL
299
/**
 * Delete every parameter listed in TRACKING_PARAMS from a URL's query string.
 *
 * @param {string} href - URL to clean.
 * @returns {string} the re-serialized URL (a dangling "?" or "#" at the very end is dropped),
 *   or `href` unchanged when it cannot be parsed as a URL.
 */
function stripTrackingParams(href){
  let parsed;
  try {
    parsed = new URL(href);
  } catch {
    return href;
  }

  TRACKING_PARAMS.forEach((name) => {
    parsed.searchParams.delete(name);
  });

  const serialized = parsed.toString();
  return serialized.replace(/\?$/, '').replace(/#$/, '');
}
304
+
305
+ // ===========================
306
+ // Title parsing + normalization helpers
307
+ // ===========================
308
+
309
+ // Decode HTML entities like &amp; to &
310
/**
 * Decode HTML entities by assigning the text to a detached <textarea>'s innerHTML and
 * reading back its value.
 *
 * @param {string} str - text that may contain entities such as "&amp;".
 * @returns {string} the textarea's resulting value.
 */
function decodeEntities(str){
  const scratch = document.createElement('textarea');
  scratch.innerHTML = str;
  return scratch.value;
}
311
+
312
+ // Normalize a title: trim, collapse whitespace, remove common prefixes like "Title: " or leading hashes/bullets
313
/**
 * Clean a raw title for use as link text.
 *
 * Steps, in order: decode entities, turn line breaks into spaces, collapse whitespace and
 * trim, drop a leading "Title:" label (case-insensitive), drop leading "#", ">", "-" and
 * whitespace characters, then cap the result at 160 characters.
 *
 * @param {*} s - raw title; any falsy value yields ''.
 * @returns {string} the normalized title.
 */
function normalizeTitle(s){
  if (!s) return '';

  const flattened = decodeEntities(String(s))
    .replace(/[\r\n]+/g, ' ')
    .replace(/\s+/g, ' ')
    .trim();
  const withoutLabel = flattened.replace(/^(?:title\s*:\s*)/i, '');
  const withoutMarkers = withoutLabel.replace(/^[#>\-\s]+/, '');

  return withoutMarkers.slice(0, 160);
}
321
+
322
+ // Escape [] and backslashes in Markdown link text to avoid breaking syntax
323
/**
 * Backslash-escape "\", "[" and "]" so the text can sit inside Markdown link brackets.
 *
 * @param {*} s - text to escape (coerced with String()).
 * @returns {string} the escaped text.
 */
function markdownEscapeText(s){
  const escapeChar = (ch) => '\\' + ch;
  return String(s).replace(/[\\\[\]]/g, escapeChar);
}
326
+
327
+ // Basic regex escape
328
/**
 * Escape regular-expression metacharacters so the text matches literally inside a RegExp.
 *
 * @param {*} s - text to escape (coerced with String()).
 * @returns {string} the text with each of . * + ? ^ $ { } ( ) | [ ] \ preceded by a backslash.
 */
function rxEscape(s){
  const text = String(s);
  return text.replace(/[.*+?^${}()|[\]\\]/g, (ch) => `\\${ch}`);
}
329
+
330
+ // Extract a site core name from hostname (drop subdomains/TLDs where possible)
331
/**
 * Guess the "core" site name of a hostname.
 *
 * A leading "www.", "m.", "amp." or "news." is removed first. With at most two labels left,
 * the first label is returned (or the stripped host when that label is empty); otherwise the
 * longest label wins, with the earliest label winning ties.
 *
 * @param {string} [hostname] - hostname such as "blog.example.com".
 * @returns {string} the chosen label ('' for an empty/missing hostname).
 */
function hostCore(hostname){
  const bare = (hostname || '').replace(/^www\.|^m\.|^amp\.|^news\./i, '');
  const labels = bare.split('.');
  if (labels.length <= 2) return labels[0] || bare;

  let longest = labels[0];
  for (const label of labels) {
    if (label.length > longest.length) longest = label;
  }
  return longest;
}
338
+
339
+ // Strip trailing/leading site/brand suffixes like " - The Verge" or "| NYTimes"
340
/**
 * Remove site/brand decorations such as " - The Verge" (suffix) or "NYTimes | " (prefix).
 *
 * Brand candidates are the supplied site name, hostCore(hostname) and the hostname without
 * "www."; each is normalized, loses one trailing listed TLD, and is regex-escaped. A candidate
 * is only removed when joined to the title by one of the separators - – — | : • ».
 * Stripping repeats until the title stops changing.
 *
 * @param {string} title - title to clean.
 * @param {string} hostname - hostname of the page.
 * @param {string} siteName - declared site name, may be empty.
 * @returns {string} the cleaned title; '' for an empty title; the title unchanged when no
 *   brand candidate is available.
 */
function stripSiteSuffix(title, hostname, siteName){
  const original = String(title || '');
  if (!original) return '';

  const TLD_TAIL = /\.(com|net|org|io|dev|app|news|co|uk|de|jp|fr|es|it|nl|ru|br|au)$/i;
  const rawNames = [siteName, hostCore(hostname), hostname && hostname.replace(/^www\./i,'')];

  const names = [];
  for (const raw of rawNames) {
    if (!raw) continue;
    const cleaned = normalizeTitle(String(raw)).replace(TLD_TAIL, '');
    if (cleaned) names.push(rxEscape(cleaned));
  }
  if (!names.length) return original;

  const alternation = names.join('|');
  const SEP = '\\s*(?:[-–—|:•»]\\s*)';
  const suffixRe = new RegExp(`${SEP}(?:${alternation})\\s*$`, 'i');
  const prefixRe = new RegExp(`^(?:${alternation})${SEP}`, 'i');

  let current = original;
  for (;;) {
    const stripped = current.replace(suffixRe, '').replace(prefixRe, '');
    if (stripped === current) break;
    current = stripped;
  }
  return current.trim();
}
359
+
360
+ // Derive a human-ish title from the URL slug as a last resort
361
/**
 * Build a rough title from the last path segment of a URL.
 *
 * The segment loses a trailing .htm/.html/.php/.asp/.aspx extension, is percent-decoded, and
 * has runs of "-", "_" and "+" turned into spaces. When the result is all digits or shorter
 * than three characters, hostCore(hostname) is returned instead.
 *
 * @param {string} href - URL to derive a title from.
 * @returns {string} the derived title, or '' when the URL cannot be parsed or decoded.
 */
function slugTitleFromURL(href){
  try {
    const parsed = new URL(href);
    const segments = parsed.pathname.split('/').filter(Boolean);
    // Empty segments were filtered out above, so the last entry (if any) is never blank.
    const slug = segments.length ? segments[segments.length - 1] : '';

    const withoutExt = slug.replace(/\.(html?|php|aspx?)$/i, '');
    const words = decodeURIComponent(withoutExt).replace(/[\-_+]+/g, ' ');

    const tooWeak = /^\d+$/.test(words) || words.length < 3;
    return tooWeak ? hostCore(parsed.hostname) : normalizeTitle(words);
  } catch {
    return '';
  }
}
373
+
374
+ // Pull a reasonable title from HTML markup using DOMParser with multiple sources
375
/**
 * Extract a page title from HTML markup.
 *
 * Sources are tried in this order and the first non-empty one wins: og:title meta,
 * twitter:title / title meta, the first matching heading element, then document.title.
 * The winner is passed through stripSiteSuffix (using og:site_name when present) and
 * normalized again.
 *
 * @param {string} html - markup to inspect.
 * @param {string} hostname - hostname of the page, used for brand stripping.
 * @returns {string} the title, or '' when the markup is empty or cannot be parsed.
 */
function parseTitleFromHTML(html, hostname){
  if (!html) return '';

  let doc = null;
  try {
    doc = new DOMParser().parseFromString(html, 'text/html');
  } catch {
    doc = null;
  }
  if (!doc) return '';

  // Normalized `content` attribute (or text content) of the first node matching a selector.
  const firstOf = (selector) => {
    const node = doc.querySelector(selector);
    if (!node) return '';
    return normalizeTitle(node.getAttribute('content') || node.textContent || '');
  };

  const siteName = firstOf('meta[property="og:site_name"], meta[name="og:site_name"]');
  const ogTitle = firstOf('meta[property="og:title"], meta[name="og:title"]');
  const metaTitle = firstOf('meta[name="twitter:title"], meta[property="twitter:title"], meta[name="title"], meta[property="title"]');
  const heading = firstOf('h1, h1[itemprop="headline"], [data-test-id="post-title"]');
  const docTitle = normalizeTitle(doc.title || '');

  const best = [ogTitle, metaTitle, heading, docTitle].find(Boolean) || '';
  return normalizeTitle(stripSiteSuffix(best, hostname, siteName));
}
395
+
396
+ // Pull a decent title from a plaintext article (proxy output). Skip obvious boilerplate.
397
/**
 * Pick a title from plain-text proxy output.
 *
 * The first trimmed line that is 3–180 characters long, does not look like boilerplate
 * (cookie/consent/sign-in/404…), and is not one of the reader's own metadata lines is used.
 * Skipping "URL Source:", "Markdown Content:" and "Published Time:" matters when the real
 * "Title:" line is rejected as boilerplate: previously the next line ("URL Source: https://…")
 * became the link title.
 *
 * @param {string} text - plain text returned by the proxy.
 * @returns {string} the normalized title, or '' when no usable line exists.
 */
function parseTitleFromPlain(text){
  if (!text) return '';

  const BOILERPLATE = /cookie|consent|subscribe|sign\s*in|log\s*in|adblock|advert|privacy|terms|404|not\s*found|redirecting/i;
  // NOTE(review): metadata labels assumed from the r.jina.ai response layout — confirm against the reader docs.
  const READER_META = /^(?:url\s*source|markdown\s*content|published\s*time)\s*:/i;

  const firstUsable = String(text)
    .split(/\r?\n/)
    .map((line) => line.trim())
    .find((line) =>
      line.length >= 3 &&
      line.length <= 180 &&
      !BOILERPLATE.test(line) &&
      !READER_META.test(line)
    ) || '';

  return normalizeTitle(firstUsable);
}
406
+
407
+ // ===========================
408
+ // Core cleaning logic
409
+ // ===========================
410
+
411
/**
 * Extract, filter and tidy the links found in a block of text.
 *
 * Pipeline: match URL_REGEX → trim trailing punctuation → (optional) keep only http/https →
 * drop direct image links → (optional) strip tracking params → (optional) drop duplicates,
 * compared on the lower-cased URL.
 *
 * @param {string} raw - pasted text.
 * @param {{httpOnly?: boolean, stripTracking?: boolean, dedupe?: boolean}} opts - toggles.
 * @returns {{kept: string[], found: number, removedImages: number, removedDupes: number}}
 *   `found` is the number of raw matches before any filtering.
 */
function cleanLinksFromText(raw, opts){
  if (!raw) return { kept: [], found: 0, removedImages: 0, removedDupes: 0 };

  const candidates = raw.match(URL_REGEX) || [];

  let links = candidates.map((m) => trimTrailingPunctuation(m));
  if (opts.httpOnly) {
    links = links.filter((h) => /^https?:\/\//i.test(h));
  }

  const nonImages = links.filter((href) => !isDirectImageURL(href));
  const removedImages = links.length - nonImages.length;

  let cleaned = nonImages.map((h) => (opts.stripTracking ? stripTrackingParams(h) : h));

  let removedDupes = 0;
  if (opts.dedupe) {
    const seen = new Set();
    cleaned = cleaned.filter((href) => {
      const key = href.toLowerCase();
      if (seen.has(key)) {
        removedDupes += 1;
        return false;
      }
      seen.add(key);
      return true;
    });
  }

  return { kept: cleaned, found: candidates.length, removedImages, removedDupes };
}
432
+
433
+ // ===========================
434
+ // Title fetching with concurrency + proxy fallback
435
+ // ===========================
436
+
437
+ // Promise with timeout so one slow site doesn't block everything
438
/**
 * Race a promise against a timer.
 *
 * The timer is cleared as soon as the race settles, so a finished request no longer leaves a
 * pending timeout (and its closure) behind for up to `ms` milliseconds.
 * The underlying operation is NOT cancelled on timeout; only the returned promise rejects.
 *
 * @param {Promise<*>} promise - work to wait for.
 * @param {number} ms - time budget in milliseconds.
 * @returns {Promise<*>} settles like `promise`, or rejects with Error('timeout') after `ms`.
 */
function withTimeout(promise, ms){
  let timer;
  const deadline = new Promise((_, reject) => {
    timer = setTimeout(() => reject(new Error('timeout')), ms);
  });
  return Promise.race([promise, deadline]).finally(() => clearTimeout(timer));
}
441
+
442
+ // Try to fetch a page's title. Strategy: direct fetch → proxy → fallback to hostname
443
+ // Simple in-session title cache
444
// In-memory title cache (URL → title), mirrored to sessionStorage under 'titleCacheV1'.
const titleCache = new Map();

// Load any previously stored entries; every failure (missing storage, bad JSON) is ignored.
(function hydrateCache(){
  try {
    const raw = sessionStorage.getItem('titleCacheV1');
    if (!raw) return;
    const stored = JSON.parse(raw);
    Object.entries(stored).forEach(([k, v]) => {
      titleCache.set(k, v);
    });
  } catch {}
})();

// Look up a cached title; undefined when the URL has no entry.
function cacheGet(key){
  return titleCache.get(key);
}

// Store a title and rewrite the whole cache to sessionStorage; storage errors are ignored.
function cacheSet(key, value){
  try {
    titleCache.set(key, value);
    const snapshot = Object.fromEntries(titleCache);
    sessionStorage.setItem('titleCacheV1', JSON.stringify(snapshot));
  } catch {}
}
452
+
453
/**
 * Resolve a title for a URL: cache → direct fetch → optional r.jina.ai proxy → fallback.
 *
 * Only titles actually read from a response are cached. The previous version also cached
 * the slug/hostname placeholder, so a single transient failure (or a run with the proxy
 * switched off) pinned the placeholder for the rest of the session and later attempts
 * never reached the network again. For the same reason a proxy response that yields no
 * title now falls through and is reported as source 'fallback' instead of 'proxy'.
 *
 * @param {string} url - absolute URL; an unparseable URL makes the returned promise reject.
 * @param {{useProxy?: boolean, timeoutMs?: number}} [options] - proxy toggle and per-request
 *   time budget in milliseconds.
 * @returns {Promise<{url: string, title: string, source: 'cache'|'direct'|'proxy'|'fallback'}>}
 */
async function fetchTitleForURL(url, { useProxy=true, timeoutMs=10000 }={}){
  const u = new URL(url);
  const fallbackHost = u.hostname.replace(/^www\./, '');

  const cached = cacheGet(url);
  if (cached) return { url, title: cached, source: 'cache' };

  // 1) Direct fetch: works only when the site allows cross-origin reads.
  try {
    const res = await withTimeout(fetch(url, { mode:'cors', redirect:'follow', credentials:'omit' }), timeoutMs);
    if (res && res.ok) {
      const ct = res.headers.get('content-type') || '';
      if (/text\/html|application\/(xhtml\+xml|html)/i.test(ct) || !ct) {
        const html = await res.text();
        const t = parseTitleFromHTML(html, u.hostname);
        if (t) { cacheSet(url, t); return { url, title: t, source: 'direct' }; }
      }
    }
  } catch (_) { /* best-effort: blocked or failed request, try the proxy next */ }

  // 2) Proxy: try plain-text heuristics first, then an HTML parse in case markup survived.
  if (useProxy) {
    try {
      const res2 = await withTimeout(fetch('https://r.jina.ai/' + url, { redirect:'follow' }), timeoutMs);
      if (res2 && res2.ok) {
        const text = await res2.text();
        const viaProxy = parseTitleFromPlain(text) || parseTitleFromHTML(text, u.hostname);
        if (viaProxy) { cacheSet(url, viaProxy); return { url, title: viaProxy, source: 'proxy' }; }
      }
    } catch (_) { /* best-effort: fall through to the placeholder */ }
  }

  // 3) Placeholder derived from the URL itself — intentionally not cached (see above).
  return { url, title: slugTitleFromURL(url) || fallbackHost, source: 'fallback' };
}
490
+
491
+ // Map with limited concurrency
492
/**
 * Run `mapper` over `items` with at most `limit` calls in flight, preserving input order.
 *
 * Fixes over the previous hand-rolled scheduler:
 *  - an empty `items` array resolves to [] (it used to leave the promise pending forever);
 *  - a `limit` below 1 (or NaN) is treated as 1 instead of never starting any work;
 *  - a mapper that throws synchronously is recorded as a rejection instead of escaping.
 *
 * @param {Array<*>} items - inputs to process.
 * @param {number} limit - maximum number of concurrent mapper calls.
 * @param {function(*, number): *} mapper - called with (item, index); may return a promise.
 * @returns {Promise<Array<{status: 'fulfilled', value: *}|{status: 'rejected', reason: *}>>}
 *   one settled-style record per item, at the item's index. Never rejects.
 */
async function mapWithConcurrency(items, limit, mapper){
  const results = new Array(items.length);
  const workerCount = Math.min(Math.max(1, Math.floor(limit) || 1), items.length);
  let next = 0;

  // Each worker keeps claiming the next unprocessed index until none are left.
  async function worker(){
    while (next < items.length) {
      const idx = next++;
      try {
        results[idx] = { status:'fulfilled', value: await mapper(items[idx], idx) };
      } catch (e) {
        results[idx] = { status:'rejected', reason:e };
      }
    }
  }

  await Promise.all(Array.from({ length: workerCount }, () => worker()));
  return results;
}
507
+
508
+ // ===========================
509
+ // UI wiring
510
+ // ===========================
511
// --- Layout root and the two text areas ---
const app = qs('#app');
const inputArea = qs('#inputArea');
const outputArea = qs('#outputArea');

// --- Action buttons ---
const btnClean = qs('#btnClean');
const btnTitles = qs('#btnTitles');
const btnCopy = qs('#btnCopy');
const btnDownload = qs('#btnDownload');
const btnPaste = qs('#btnPaste');
const btnAddSpacing = qs('#btnAddSpacing');
const btnRemoveSpacing = qs('#btnRemoveSpacing');
const btnSortDomain = qs('#btnSortDomain');
const themeToggle = qs('#themeToggle');

// --- Option checkboxes ---
const optStrip = qs('#optStripTracking');
const optDedupe = qs('#optDedupe');
const optHttp = qs('#optHttpOnly');
const optProxy = qs('#optProxy');

// --- Counters, toast and loading indicator ---
const counts = qs('#counts');
const statFound = qs('#statFound');
const statImg = qs('#statImg');
const statDupes = qs('#statDupes');
const statKept = qs('#statKept');
const statTitles = qs('#statTitles');
const toastEl = qs('#toast');
const loadingEl = qs('#loading');
const loadingTxt = qs('#loadingText');

// Links kept by the most recent clean; starts empty.
let currentKept = [];
540
+
541
/**
 * Show a short-lived toast message; it hides again after 1600 ms.
 *
 * Any hide timer left over from an earlier toast is cancelled first. Previously, a toast
 * shown shortly after another one was hidden early by the first toast's timer.
 *
 * @param {string} [msg='Copied!'] - text to display.
 */
function showToast(msg='Copied!'){
  toastEl.textContent = msg;
  toastEl.classList.add('show');
  clearTimeout(showToast.hideTimer);
  showToast.hideTimer = setTimeout(() => toastEl.classList.remove('show'), 1600);
}
544
+
545
// Theme toggle: flip between "dark" and "light".
// The colour variables are consumed by <body> and <header>, which sit OUTSIDE <main id="app">,
// so setting data-theme only on #app left the page background and top bar dark. The attribute
// is therefore mirrored onto the root element as well; #app keeps it as the state to read back.
on(themeToggle, 'click', () => {
  const next = app.getAttribute('data-theme') === 'dark' ? 'light' : 'dark';
  app.setAttribute('data-theme', next);
  document.documentElement.setAttribute('data-theme', next);
});
549
+
550
// Paste: replace the input with the clipboard text (an empty clipboard keeps the current
// input) and focus the input; any clipboard failure is reported with a toast.
on(btnPaste, 'click', async () => {
  try {
    const clip = await navigator.clipboard.readText();
    inputArea.value = clip || inputArea.value;
    inputArea.focus();
  } catch {
    showToast('Clipboard read blocked by browser');
  }
});

// Clean button.
on(btnClean, 'click', () => runClean());

// Keyboard shortcuts: Ctrl/Cmd+Enter cleans; Ctrl/Cmd+Shift+Enter fetches titles → Markdown.
on(document, 'keydown', (e) => {
  const isShortcut = (e.ctrlKey || e.metaKey) && e.key === 'Enter';
  if (!isShortcut) return;
  e.preventDefault();
  if (e.shiftKey) {
    runTitlesToMarkdown();
  } else {
    runClean();
  }
});

// Copy: write the output to the clipboard; on failure select the text so it can be copied manually.
on(btnCopy, 'click', async () => {
  const payload = outputArea.value || '';
  try {
    await navigator.clipboard.writeText(payload);
    showToast('Output copied!');
  } catch {
    outputArea.focus();
    outputArea.select();
    showToast('Press Ctrl+C to copy');
  }
});

// Download: offer the output as a text file via a temporary object URL and a synthetic click.
on(btnDownload, 'click', () => {
  const payload = new Blob([outputArea.value || ''], {type:'text/plain'});
  const objectUrl = URL.createObjectURL(payload);
  const link = document.createElement('a');
  link.href = objectUrl;
  link.download = 'links-or-markdown.txt';
  link.click();
  URL.revokeObjectURL(objectUrl);
});

// Titles → Markdown button.
on(btnTitles, 'click', () => runTitlesToMarkdown());
578
+
579
+ // Insert exactly one empty line between each non-empty line in the output
580
// Add spacing: keep the non-blank output lines and separate them with exactly one empty line.
on(btnAddSpacing, 'click', () => {
  const nonBlank = (outputArea.value || '')
    .split(/\r?\n/)
    .filter((line) => line.trim() !== '');
  if (nonBlank.length === 0) {
    showToast('No output to space');
    return;
  }
  outputArea.value = nonBlank.join('\n\n');
  showToast('Added blank lines between links');
});

// Remove spacing: keep the non-blank output lines, one per line with no gaps.
on(btnRemoveSpacing, 'click', () => {
  const nonBlank = (outputArea.value || '')
    .split(/\r?\n/)
    .filter((line) => line.trim() !== '');
  if (nonBlank.length === 0) {
    showToast('No output to trim');
    return;
  }
  outputArea.value = nonBlank.join('\n');
  showToast('Removed extra blank lines');
});
598
+
599
+ // Sort output lines by domain name (hostname), grouping similar domains
600
// Sort by domain: order the non-blank output lines by hostname ("www." ignored, lower-cased),
// then by the full line text; lines without a recognizable URL go last, sorted by text.
on(btnSortDomain, 'click', () => {
  const nonBlank = (outputArea.value || '')
    .split(/\r?\n/)
    .filter((line) => line.trim() !== '');
  if (nonBlank.length === 0) {
    showToast('No output to sort');
    return;
  }

  const URL_IN_LINE = /https?:\/\/[^\s<>"'`)+\]}]+/i;

  // Prefer the target of a Markdown link; otherwise take the first bare URL on the line.
  const urlOf = (line) => {
    const md = line.match(/\[[^\]]*\]\((https?:[^\s)]+)\)/i);
    if (md) return md[1];
    const bare = line.match(URL_IN_LINE);
    return bare ? bare[0] : '';
  };

  // Hostname used as the sort key; '' when the text is not a parseable URL.
  const hostKeyOf = (u) => {
    try {
      const { hostname } = new URL(u);
      return (hostname || '').replace(/^www\./i,'').toLowerCase();
    } catch {
      return '';
    }
  };

  const byHostThenLine = (a, b) => {
    const aHasHost = Boolean(a.hostKey);
    const bHasHost = Boolean(b.hostKey);
    if (aHasHost !== bHasHost) return aHasHost ? -1 : 1;
    if (aHasHost) {
      const byHost = a.hostKey.localeCompare(b.hostKey);
      if (byHost !== 0) return byHost;
    }
    return a.line.localeCompare(b.line);
  };

  const keyed = nonBlank.map((line) => ({ line, hostKey: hostKeyOf(urlOf(line)) }));
  keyed.sort(byHostThenLine);

  outputArea.value = keyed.map((entry) => entry.line).join('\n');
  showToast('Sorted by domain');
});
637
+
638
/**
 * Switch the UI in or out of its "busy" state.
 *
 * Shows/hides the loading pill, sets its label (default 'Fetching…'), and disables or
 * enables every output/clean action button.
 *
 * @param {*} isOn - truthy to show the indicator and disable the buttons.
 * @param {string} [text] - label for the indicator.
 */
function setLoading(isOn, text){
  const busy = !!isOn;
  loadingEl.classList.toggle('hidden', !isOn);
  loadingTxt.textContent = text || 'Fetching…';

  const actionButtons = [
    btnTitles, btnClean, btnCopy, btnDownload,
    btnAddSpacing, btnSortDomain, btnRemoveSpacing,
  ];
  for (const button of actionButtons) {
    button.disabled = busy;
  }
}
644
+
645
+ // Clean → update UI state
646
/**
 * Clean the input text with the currently ticked options and refresh the output, the stats
 * row and the summary pill. The kept links are also copied into `currentKept`, and the
 * "Titles resolved" counter is reset to 0.
 */
function runClean(){
  const opts = {
    stripTracking: !!optStrip.checked,
    dedupe: !!optDedupe.checked,
    httpOnly: !!optHttp.checked,
  };
  const { kept, found, removedImages, removedDupes } = cleanLinksFromText(inputArea.value || '', opts);

  currentKept = kept.slice();
  outputArea.value = kept.join('\n');

  statFound.textContent = String(found);
  statImg.textContent = String(removedImages);
  statDupes.textContent = String(removedDupes);
  statKept.textContent = String(kept.length);
  statTitles.textContent = '0';

  counts.textContent = `${kept.length} kept · ${removedImages + removedDupes} removed`;
}
658
+
659
+ // Fetch titles for the current list, then print Markdown lines: [Title](URL) (no bullets)
660
// True while a title-fetching run is in progress (guards against overlapping runs).
let titlesRunInFlight = false;

/**
 * Fetch a title for every kept link and write "[Title](URL)" lines (no bullets) to the output.
 *
 * Changes from the previous version:
 *  - Re-entrancy guard: the keyboard shortcut is not disabled while loading, so a second
 *    run could start on top of the first; overlapping calls now return immediately.
 *  - The loading state is always cleared (try/finally). An unparseable URL used to throw
 *    while building the fallback title and left every button disabled.
 *  - "Titles resolved" counts only titles that did not come from the 'fallback' source;
 *    it previously counted every result and so always equalled the number of links.
 *  - The progress label is updated even when an individual lookup rejects.
 *
 * @returns {Promise<void>}
 */
async function runTitlesToMarkdown(){
  if (titlesRunInFlight) return;
  if (!currentKept.length) runClean();
  const urls = currentKept.slice();
  if (!urls.length){ showToast('Nothing to process — paste text and Clean first'); return; }

  titlesRunInFlight = true;
  setLoading(true, `Fetching titles (0/${urls.length})`);
  statTitles.textContent = '0';

  // Last-resort link text: hostname without "www.", or the raw text when it is not a valid URL.
  const hostOf = (href) => {
    try { return new URL(href).hostname.replace(/^www\./, ''); } catch { return href; }
  };

  try {
    const CONCURRENCY = 6; // a small pool keeps things fast without overloading
    let resolved = 0, titleHits = 0;

    const results = await mapWithConcurrency(urls, CONCURRENCY, async (u) => {
      try {
        const r = await fetchTitleForURL(u, { useProxy: !!optProxy.checked, timeoutMs: 10000 });
        if (r && r.title && r.source !== 'fallback') titleHits++;
        return r;
      } finally {
        resolved++;
        setLoading(true, `Fetching titles (${resolved}/${urls.length})`);
      }
    });

    // Build Markdown lines without bullets
    const lines = results.map((r, i) => {
      let url = urls[i];
      let title = '';
      if (r && r.status === 'fulfilled' && r.value) {
        url = r.value.url || url;
        title = r.value.title || '';
      }
      const safeTitle = normalizeTitle(title) || slugTitleFromURL(url) || hostOf(url);
      return `[${markdownEscapeText(safeTitle)}](${url})`;
    });

    outputArea.value = lines.join('\n');
    statTitles.textContent = String(titleHits);
    counts.textContent = `${urls.length} links → Markdown (no bullets, titles resolved: ${titleHits})`;

    const proxyNote = optProxy.checked ? '\n(Proxy ON for better compatibility)' : '';
    showToast(`Markdown ready for ${urls.length} link(s). Titles resolved: ${titleHits}${proxyNote}`);
  } finally {
    titlesRunInFlight = false;
    setLoading(false);
  }
}
699
+
700
+ // Auto-run if input already has content (e.g., after refresh)
701
// On load, clean immediately when the input already holds non-blank text
// (e.g. content restored by the browser after a refresh).
{
  const restoredText = qs('#inputArea').value || '';
  if (restoredText.trim()) {
    runClean();
  }
}
702
+ </script>
703
+ </body>
704
+ </html>