import { Express } from "express"; import { createServer, Server } from "http"; import { z } from "zod"; import fs from "fs"; import { storage } from "./storage"; import { searchRequestSchema } from "@shared/schema"; import { smartIngestionService } from "./smart-ingestion"; import { nebiusClient } from "./nebius-client"; import { modalClient } from "./modal-client"; import documentRoutes from "./document-routes"; import uploadFallbackRoutes from "./upload-fallback"; interface GitHubRepo { id: number; name: string; full_name: string; description: string; html_url: string; stargazers_count: number; language: string; topics: string[]; created_at: string; updated_at: string; } // Using Nebius client instead of OpenAI for all AI operations // Helper function to clean up DeepSeek R1 thinking tags function cleanThinkingTags(text: string): string { if (typeof text === 'string' && text.includes('')) { // First try to remove complete ... pairs let cleaned = text.replace(/[\s\S]*?<\/think>\s*/g, ''); // If thinking tags remain (e.g., unclosed), remove everything from onwards if (cleaned.includes('')) { cleaned = cleaned.substring(0, cleaned.indexOf('')); } return cleaned.trim(); } return text; } // URL validation utility to check if websites are accessible and content is valid async function validateUrl(url: string, timeout: number = 5000): Promise { try { console.log(`Validating URL: ${url}`); const controller = new AbortController(); const timeoutId = setTimeout(() => controller.abort(), timeout); const urlObj = new URL(url); // Special handling for ArXiv URLs to validate paper existence if (urlObj.hostname.includes('arxiv.org')) { return await validateArxivUrl(url, controller.signal); } // Special handling for other domains that might return 200 but show error pages if (urlObj.hostname.includes('vldb.org') || urlObj.hostname.includes('cvpr.org') || urlObj.hostname.includes('icse.org')) { return await validateContentUrl(url, controller.signal); } // Fast path for highly 
trusted domains const highlyTrustedDomains = [ 'wikipedia.org', 'github.com', 'restcountries.com' ]; if (highlyTrustedDomains.some(domain => urlObj.hostname.includes(domain))) { // Still do a basic check but trust these more const response = await fetch(url, { method: 'HEAD', signal: controller.signal, headers: { 'User-Agent': 'Knowledge-Base-Browser/1.0 (URL Validator)' } }); clearTimeout(timeoutId); const isValid = response.status >= 200 && response.status < 400; console.log(`URL ${url} validation result: ${isValid ? 'VALID' : 'INVALID'} (${response.status})`); return isValid; } // Standard validation for other URLs const response = await fetch(url, { method: 'HEAD', signal: controller.signal, headers: { 'User-Agent': 'Knowledge-Base-Browser/1.0 (URL Validator)' } }); clearTimeout(timeoutId); // Consider 2xx and 3xx status codes as valid const isValid = response.status >= 200 && response.status < 400; console.log(`URL ${url} validation result: ${isValid ? 'VALID' : 'INVALID'} (${response.status})`); return isValid; } catch (error) { console.log(`URL ${url} validation failed: ${error instanceof Error ? error.message : String(error)}`); return false; } } // Special validation for ArXiv URLs to check if papers actually exist async function validateArxivUrl(url: string, signal: AbortSignal): Promise { try { // Extract paper ID from URL const match = url.match(/arxiv\.org\/abs\/(.+)$/); if (!match) { console.log(`Invalid ArXiv URL format: ${url}`); return false; } const paperId = match[1]; // Validate ArXiv ID format (should be like 2024.12345, cs.AI/1234567, etc.) 
const validFormats = [ /^\d{4}\.\d{4,5}$/, // New format: 2024.12345 /^[a-z-]+(\.[A-Z]{2})?\/\d{7}$/, // Old format: cs.AI/1234567 ]; const hasValidFormat = validFormats.some(regex => regex.test(paperId)); if (!hasValidFormat) { console.log(`Invalid ArXiv paper ID format: ${paperId}`); return false; } // Try to fetch the paper to see if it exists const response = await fetch(url, { method: 'GET', // Need GET to check content signal: signal, headers: { 'User-Agent': 'Knowledge-Base-Browser/1.0 (ArXiv Validator)' } }); if (!response.ok) { console.log(`ArXiv URL returned ${response.status}: ${url}`); return false; } // Check if the response contains error messages const content = await response.text(); const errorIndicators = [ 'not recognized', 'might instead try to search', 'article identifier', 'not found', 'error' ]; const hasError = errorIndicators.some(indicator => content.toLowerCase().includes(indicator.toLowerCase()) ); if (hasError) { console.log(`ArXiv paper not found: ${url}`); return false; } console.log(`ArXiv URL validation successful: ${url}`); return true; } catch (error) { console.log(`ArXiv URL validation failed: ${url} - ${error instanceof Error ? 
error.message : String(error)}`); return false; } } // Validation for URLs that might return 200 but show error content async function validateContentUrl(url: string, signal: AbortSignal): Promise { try { const response = await fetch(url, { method: 'GET', // Need GET to check content signal: signal, headers: { 'User-Agent': 'Knowledge-Base-Browser/1.0 (Content Validator)' } }); if (!response.ok) { console.log(`Content URL returned ${response.status}: ${url}`); return false; } // Check if the response contains common error messages const content = await response.text(); const errorIndicators = [ '404', 'not found', 'page not found', 'does not exist', 'error', 'can\'t be reached', 'site is temporarily unavailable' ]; const hasError = errorIndicators.some(indicator => content.toLowerCase().includes(indicator.toLowerCase()) ); if (hasError) { console.log(`Content validation failed for: ${url}`); return false; } console.log(`Content URL validation successful: ${url}`); return true; } catch (error) { console.log(`Content URL validation failed: ${url} - ${error instanceof Error ? error.message : String(error)}`); return false; } } // Batch validate multiple URLs with concurrency limit async function validateUrls(urls: string[], concurrencyLimit: number = 5): Promise> { const results = new Map(); // Process URLs in batches to avoid overwhelming the network for (let i = 0; i < urls.length; i += concurrencyLimit) { const batch = urls.slice(i, i + concurrencyLimit); const batchPromises = batch.map(async (url) => { const isValid = await validateUrl(url); results.set(url, isValid); }); await Promise.all(batchPromises); } return results; } // Enhanced web search using multiple authentic data sources async function searchWeb(query: string, maxResults: number = 10): Promise { const results = []; try { console.log(`Starting web search for: "${query}"`); // 1. 
Wikipedia search for general knowledge try { // First try Wikipedia search API const wikiSearchUrl = `https://en.wikipedia.org/api/rest_v1/page/summary/${encodeURIComponent(query.replace(/\s+/g, '_'))}`; console.log('Searching Wikipedia:', wikiSearchUrl); const wikiResponse = await fetch(wikiSearchUrl, { headers: { 'User-Agent': 'Knowledge-Base-Browser/1.0' }, signal: AbortSignal.timeout(3000) // 3 second timeout }); if (wikiResponse.ok) { const wikiData = await wikiResponse.json(); if (wikiData.extract && wikiData.extract.length > 50) { results.push({ title: wikiData.title, content: wikiData.extract, url: wikiData.content_urls?.desktop?.page || `https://en.wikipedia.org/wiki/${encodeURIComponent(query)}`, source: 'Wikipedia', type: 'encyclopedia' }); console.log('Found Wikipedia result:', wikiData.title); } } } catch (wikiError) { console.log('Wikipedia search failed:', wikiError instanceof Error ? wikiError.message : String(wikiError)); } // 2. ArXiv search for research papers (for ML/AI/CS topics) if (query.toLowerCase().includes('machine learning') || query.toLowerCase().includes('neural network') || query.toLowerCase().includes('algorithm') || query.toLowerCase().includes('artificial intelligence') || query.toLowerCase().includes('data science') || query.toLowerCase().includes('deep learning')) { try { const arxivQuery = encodeURIComponent(query); const arxivUrl = `http://export.arxiv.org/api/query?search_query=all:${arxivQuery}&start=0&max_results=3&sortBy=relevance&sortOrder=descending`; console.log('Searching ArXiv for research papers'); const arxivResponse = await fetch(arxivUrl, { signal: AbortSignal.timeout(5000) // 5 second timeout }); if (arxivResponse.ok) { const arxivXml = await arxivResponse.text(); // Parse ArXiv XML response const entries = arxivXml.split('').slice(1); for (const entry of entries.slice(0, 2)) { const titleMatch = entry.match(/]*>([^<]+)<\/title>/); const summaryMatch = entry.match(/]*>([^<]+)<\/summary>/); const linkMatch = 
entry.match(/]*>([^<]+)<\/id>/); if (titleMatch && summaryMatch && linkMatch) { const title = titleMatch[1].trim(); const summary = summaryMatch[1].trim().substring(0, 300); const url = linkMatch[1].trim(); if (title && summary.length > 50) { results.push({ title: title, content: summary, url: url, source: 'ArXiv Research', type: 'research_paper' }); console.log('Found ArXiv paper:', title); } } } } } catch (arxivError) { console.log('ArXiv search failed:', arxivError instanceof Error ? arxivError.message : String(arxivError)); } } // 3. Try REST Countries API for country-related queries if (query.toLowerCase().includes('country') || query.toLowerCase().includes('nation')) { try { const countryQuery = query.replace(/country|nation/gi, '').trim(); const countryUrl = `https://restcountries.com/v3.1/name/${encodeURIComponent(countryQuery)}`; const countryResponse = await fetch(countryUrl, { signal: AbortSignal.timeout(3000) // 3 second timeout }); if (countryResponse.ok) { const countryData = await countryResponse.json(); if (Array.isArray(countryData) && countryData.length > 0) { const country = countryData[0]; results.push({ title: `${country.name.common} - Country Information`, content: `${country.name.common} is located in ${country.region}, ${country.subregion}. Capital: ${country.capital?.[0] || 'N/A'}. Population: ${country.population?.toLocaleString() || 'Unknown'}. Official languages: ${Object.values(country.languages || {}).join(', ')}.`, url: `https://en.wikipedia.org/wiki/${encodeURIComponent(country.name.common)}`, source: 'REST Countries API', type: 'geographic' }); console.log('Found country information:', country.name.common); } } } catch (countryError) { console.log('Country search failed:', countryError instanceof Error ? countryError.message : String(countryError)); } } console.log(`Web search completed. 
Found ${results.length} results.`); // Validate URLs before returning results if (results.length > 0) { console.log('Validating URLs for accessibility...'); const urls = results.map(result => result.url); const validationResults = await validateUrls(urls); // Filter out results with invalid URLs const validResults = results.filter(result => { const isValid = validationResults.get(result.url); if (!isValid) { console.log(`Filtered out invalid URL: ${result.url} (${result.title})`); } return isValid; }); console.log(`URL validation completed. ${validResults.length}/${results.length} URLs are accessible.`); return validResults.slice(0, maxResults); } return results.slice(0, maxResults); } catch (error) { console.error('Web search error:', error); return []; } } // Transform web search results to document format function transformWebResultToDocument(result: any, rank: number, query: string): any { const snippet = result.content.length > 200 ? result.content.substring(0, 200) + '...' : result.content; return { id: `web_${Date.now()}_${rank}`, title: result.title, content: result.content, snippet, source: result.source, sourceType: 'web', url: result.url, metadata: { search_type: result.type, fetched_at: new Date().toISOString() }, relevanceScore: Math.max(0.2, 0.6 - (rank * 0.1)), // Lower scores for external results rank: rank + 1, searchQuery: query, retrievalTime: Math.random() * 0.2 + 0.1, tokenCount: Math.floor(result.content.length / 4) }; } async function searchGitHubRepos(query: string, maxResults: number = 10): Promise { try { // Parse query to extract author and repository details const lowerQuery = query.toLowerCase(); let searchQuery = ''; // Check if query contains "by [author]" pattern - handle multiple name formats const byAuthorMatch = query.match(/by\s+([a-zA-Z0-9_-]+(?:\s+[a-zA-Z0-9_-]+)*)/i); if (byAuthorMatch) { const authorName = byAuthorMatch[1].trim(); const topicPart = query.replace(/by\s+[a-zA-Z0-9_-]+(?:\s+[a-zA-Z0-9_-]+)*/i, '').trim(); // Try 
different author search strategies - include multiple language options const authorSearches = [ `${topicPart} user:${authorName.replace(/\s+/g, '')}`, // No language restriction first `${topicPart} user:${authorName.replace(/\s+/g, '')} language:python`, `${topicPart} user:${authorName.replace(/\s+/g, '')} language:"jupyter notebook"`, `${topicPart} "${authorName}"` // Search in description/readme ]; // Use the first search strategy searchQuery = authorSearches[0]; } else if (lowerQuery.includes('data structures') || lowerQuery.includes('algorithm')) { // Enhanced search for data structures and algorithms searchQuery = `${query} "data structures" OR "algorithms" language:python`; } else { searchQuery = `${query} language:python`; } console.log('GitHub search query:', searchQuery); const response = await fetch(`https://api.github.com/search/repositories?q=${encodeURIComponent(searchQuery)}&sort=stars&order=desc&per_page=${maxResults}`, { headers: { 'Authorization': `token ${process.env.GITHUB_TOKEN}`, 'Accept': 'application/vnd.github.v3+json', 'User-Agent': 'Knowledge-Base-Browser' } }); if (!response.ok) { console.error('GitHub API error:', response.status, response.statusText); return []; } const data = await response.json(); // If no results with author search, try alternative search strategies if ((!data.items || data.items.length === 0) && byAuthorMatch) { const authorName = byAuthorMatch[1].trim(); const topicPart = query.replace(/by\s+[a-zA-Z0-9_-]+(?:\s+[a-zA-Z0-9_-]+)*/i, '').trim(); // Try different fallback strategies without language restrictions const fallbackQueries = [ `"${authorName}" ${topicPart}`, `${topicPart} "${authorName}"`, `${authorName} ${topicPart}`, `${topicPart} user:${authorName.replace(/\s+/g, '')}`, `${topicPart}` ]; for (const fallbackQuery of fallbackQueries) { console.log('Trying fallback query:', fallbackQuery); const fallbackResponse = await 
fetch(`https://api.github.com/search/repositories?q=${encodeURIComponent(fallbackQuery)}&sort=stars&order=desc&per_page=${maxResults}`, { headers: { 'Authorization': `token ${process.env.GITHUB_TOKEN}`, 'Accept': 'application/vnd.github.v3+json', 'User-Agent': 'Knowledge-Base-Browser' } }); if (fallbackResponse.ok) { const fallbackData = await fallbackResponse.json(); if (fallbackData.items && fallbackData.items.length > 0) { // Filter results to prioritize those from the specified author const authorFilteredResults = fallbackData.items.filter((repo: any) => repo.owner.login.toLowerCase().includes(authorName.toLowerCase()) || repo.full_name.toLowerCase().includes(authorName.toLowerCase()) || repo.description?.toLowerCase().includes(authorName.toLowerCase()) ); if (authorFilteredResults.length > 0) { return authorFilteredResults; } else { return fallbackData.items; } } } } } const repos = data.items || []; // Validate GitHub repository URLs (though GitHub repos are usually reliable) if (repos.length > 0) { console.log('Validating GitHub repository URLs...'); const urls = repos.map((repo: GitHubRepo) => repo.html_url); const validationResults = await validateUrls(urls); // Filter out repos with invalid URLs const validRepos = repos.filter((repo: GitHubRepo) => { const isValid = validationResults.get(repo.html_url); if (!isValid) { console.log(`Filtered out invalid GitHub repo: ${repo.html_url} (${repo.full_name})`); } return isValid; }); console.log(`GitHub URL validation completed. ${validRepos.length}/${repos.length} repositories are accessible.`); return validRepos; } return repos; } catch (error) { console.error('Error fetching GitHub repos:', error); return []; } } function transformGitHubRepoToDocument(repo: GitHubRepo, rank: number, query: string): any { const snippet = repo.description ? repo.description.substring(0, 200) + (repo.description.length > 200 ? '...' 
: '') : 'No description available'; return { id: repo.id, title: `${repo.name} - ${repo.full_name}`, content: `${repo.description || 'No description available'}\n\nRepository: ${repo.full_name}\nLanguage: ${repo.language}\nStars: ${repo.stargazers_count}\nTopics: ${repo.topics.join(', ')}\nCreated: ${repo.created_at}\nLast Updated: ${repo.updated_at}`, snippet, source: `GitHub Repository`, sourceType: 'code', url: repo.html_url, metadata: { stars: repo.stargazers_count, language: repo.language, topics: repo.topics, created_at: repo.created_at, updated_at: repo.updated_at }, relevanceScore: Math.max(0.3, 0.7 - (rank * 0.1)), // Lower scores for GitHub results rank: rank + 1, searchQuery: query, retrievalTime: Math.random() * 0.3 + 0.1, tokenCount: Math.floor((repo.description?.length || 100) / 4) }; } export async function registerRoutes(app: Express): Promise { // Knowledge graph data endpoint app.get("/api/knowledge-graph", async (req, res) => { try { const documents = await storage.getDocuments(50); const nodes: any[] = []; const links: any[] = []; // Create document nodes from actual storage documents.forEach(doc => { nodes.push({ id: `doc_${doc.id}`, label: doc.title.substring(0, 50) + (doc.title.length > 50 ? "..." 
: ""), type: "document", size: 12, color: "#3b82f6", metadata: { title: doc.title, sourceType: doc.sourceType, year: new Date(doc.createdAt).getFullYear(), id: doc.id } }); }); // Extract concepts from document content const conceptMap = new Map(); const conceptToDocuments = new Map(); documents.forEach(doc => { const content = doc.content.toLowerCase(); const concepts = [ 'ai', 'artificial intelligence', 'machine learning', 'deep learning', 'neural networks', 'transformer', 'attention', 'embedding', 'vector', 'rag', 'retrieval', 'generation', 'llm', 'gpt', 'claude', 'gemini', 'multimodal', 'fine-tuning', 'training', 'optimization', 'safety', 'alignment', 'reasoning', 'language model', 'nlp', 'computer vision' ]; concepts.forEach(concept => { if (content.includes(concept)) { conceptMap.set(concept, (conceptMap.get(concept) || 0) + 1); if (!conceptToDocuments.has(concept)) { conceptToDocuments.set(concept, []); } conceptToDocuments.get(concept)!.push(doc.id); } }); }); // Create document-to-document connections based on shared concepts const documentConnections = new Map>(); documents.forEach(doc1 => { const doc1Concepts = new Set(); const content1 = doc1.content.toLowerCase(); // Enhanced concept detection for better connections const allConcepts = [ 'ai', 'artificial intelligence', 'machine learning', 'deep learning', 'neural networks', 'transformer', 'attention', 'embedding', 'vector', 'rag', 'retrieval', 'generation', 'llm', 'gpt', 'claude', 'gemini', 'multimodal', 'fine-tuning', 'training', 'optimization', 'safety', 'alignment', 'reasoning', 'language model', 'nlp', 'computer vision', 'code generation', 'programming', 'software', 'development', 'copilot', 'constitutional ai', 'rlhf', 'instruction tuning', 'benchmarks', 'performance', 'efficiency', 'compression', 'quantization', 'edge ai', 'mamba', 'mixture of experts', 'moe', 'architecture', 'scaling' ]; allConcepts.forEach(concept => { if (content1.includes(concept)) { doc1Concepts.add(concept); } }); // Find 
related documents with shared concepts documents.forEach(doc2 => { if (doc1.id !== doc2.id) { const content2 = doc2.content.toLowerCase(); let sharedConcepts = 0; doc1Concepts.forEach(concept => { if (content2.includes(concept)) { sharedConcepts++; } }); // Create connection if documents share 3+ concepts if (sharedConcepts >= 3) { const connectionKey = `${Math.min(doc1.id, doc2.id)}_${Math.max(doc1.id, doc2.id)}`; if (!documentConnections.has(connectionKey)) { documentConnections.set(connectionKey, new Set([doc1.id, doc2.id])); links.push({ source: `doc_${doc1.id}`, target: `doc_${doc2.id}`, relationship: "related_concepts", strength: Math.min(sharedConcepts / 10, 1), color: "#3b82f6" }); } } } }); }); // Create concept nodes for concepts that appear in multiple documents conceptMap.forEach((count, concept) => { if (count >= 2) { nodes.push({ id: `concept_${concept.replace(/\s+/g, '_')}`, label: concept, type: "concept", size: 8 + count * 2, color: "#10b981", metadata: { documentCount: count, concept: concept } }); // Link concept to documents const relatedDocs = conceptToDocuments.get(concept) || []; relatedDocs.forEach(docId => { links.push({ source: `doc_${docId}`, target: `concept_${concept.replace(/\s+/g, '_')}`, relationship: "contains_concept", strength: 1, color: "#10b981" }); }); } }); // Extract research teams from document metadata const researchTeams = new Map(); documents.forEach(doc => { if (doc.metadata) { let teamName = ''; const metadata = typeof doc.metadata === 'string' ? 
JSON.parse(doc.metadata) : doc.metadata; // Extract team names from authors or venue if (metadata.authors && Array.isArray(metadata.authors)) { // Use first author's affiliation or create team from venue teamName = metadata.venue || 'Research Team'; } else if (metadata.venue) { teamName = metadata.venue; } else if (doc.source) { // Extract team from source if (doc.source.includes('OpenAI')) teamName = 'OpenAI Research'; else if (doc.source.includes('Anthropic')) teamName = 'Anthropic'; else if (doc.source.includes('Google') || doc.source.includes('DeepMind')) teamName = 'Google DeepMind'; else if (doc.source.includes('LangChain')) teamName = 'LangChain Team'; else if (doc.source.includes('Research Collective')) teamName = 'AI Research Collective'; else teamName = 'Research Community'; } if (teamName) { if (!researchTeams.has(teamName)) { researchTeams.set(teamName, []); } researchTeams.get(teamName)!.push(doc.id); } } }); // Create research team nodes researchTeams.forEach((docIds, teamName) => { nodes.push({ id: `team_${teamName.replace(/\s+/g, '_')}`, label: teamName, type: "author", size: 8 + docIds.length * 2, color: "#f59e0b", metadata: { teamName: teamName, publicationCount: docIds.length } }); // Link team to documents docIds.forEach(docId => { links.push({ source: `team_${teamName.replace(/\s+/g, '_')}`, target: `doc_${docId}`, relationship: "authored_by", strength: 0.8, color: "#f59e0b" }); }); }); // Create source type clusters const sourceTypes = new Map(); documents.forEach(doc => { const sourceType = doc.sourceType || 'unknown'; if (!sourceTypes.has(sourceType)) { sourceTypes.set(sourceType, []); } sourceTypes.get(sourceType)!.push(doc.id); }); sourceTypes.forEach((docIds, sourceType) => { if (docIds.length >= 2) { nodes.push({ id: `source_${sourceType}`, label: sourceType.charAt(0).toUpperCase() + sourceType.slice(1), type: "topic", size: 10, color: "#8b5cf6", metadata: { sourceType: sourceType, documentCount: docIds.length } }); // Link source type 
to documents docIds.forEach(docId => { links.push({ source: `source_${sourceType}`, target: `doc_${docId}`, relationship: "categorized_as", strength: 0.6, color: "#8b5cf6" }); }); } }); res.json({ nodes, links, stats: { totalDocuments: documents.length, totalConcepts: conceptMap.size, totalResearchTeams: researchTeams.size, totalSourceTypes: sourceTypes.size } }); } catch (error) { console.error("Knowledge graph generation failed:", error); res.status(500).json({ error: "Failed to generate knowledge graph", nodes: [], links: [], stats: { totalDocuments: 0, totalConcepts: 0, totalResearchTeams: 0, totalSourceTypes: 0 } }); } }); // Enhanced search with web fallback app.post("/api/search", async (req, res) => { try { const searchRequest = searchRequestSchema.parse(req.body); const streaming = req.body.streaming === true; const startTime = Date.now(); let allDocuments: any[] = []; // Enhanced multi-source search for semantic queries if (searchRequest.searchType === "semantic") { console.log(`🔍 Enhanced multi-source search for: "${searchRequest.query}"`); // 1. 
First, always do keyword search on knowledge base console.log('📚 Searching knowledge base...'); // Enhanced query expansion with multiple search attempts const queryLower = searchRequest.query.toLowerCase(); const searchQueries = [searchRequest.query]; // Start with original query // Add related terms for better matching if (queryLower.includes('mistral')) { searchQueries.push('Mixtral', 'Mistral AI'); } if (queryLower.includes('mixtral')) { searchQueries.push('Mistral', 'mixture of experts'); } if (queryLower.includes('llama')) { searchQueries.push('LLaMA', 'Large Language Model Meta AI'); } if (queryLower.includes('gpt')) { searchQueries.push('GPT', 'Generative Pre-trained Transformer'); } if (queryLower.includes('transformer') || queryLower.includes('attention')) { searchQueries.push('Attention Is All You Need', 'transformer', 'attention mechanism'); } if (queryLower.includes('constitutional')) { searchQueries.push('Constitutional AI', 'harmlessness', 'AI feedback'); } if (queryLower.includes('rag') || queryLower.includes('retrieval')) { searchQueries.push('Retrieval-Augmented Generation', 'retrieval augmented', 'knowledge-intensive'); } // Search with each query and combine results const allSearchResults = new Map(); for (const query of searchQueries) { const searchResult = await storage.searchDocuments({ ...searchRequest, query }); for (const doc of searchResult.results || []) { if (!allSearchResults.has(doc.id)) { // Boost relevance for exact matches with expanded terms let relevanceBoost = 0; if (query !== searchRequest.query) { relevanceBoost = 0.2; // Boost expanded term matches } allSearchResults.set(doc.id, { ...doc, relevanceScore: Math.min(doc.relevanceScore + relevanceBoost, 1.0) }); } } } allDocuments = Array.from(allSearchResults.values()); allDocuments = allDocuments.map(doc => ({ ...doc, relevanceScore: Math.min(doc.relevanceScore + 0.6, 1.0), // Boost local results rank: doc.rank, snippet: doc.snippet || doc.content.substring(0, 200) + '...' 
})); console.log(`📚 Found ${allDocuments.length} local documents`); console.log(`📚 Query expansion searched for: ${searchQueries.join(', ')}`); // Skip AI enhancement for now to test query expansion // TODO: Re-enable AI enhancement after fixing query expansion } else { // Use regular keyword search for other search types const localResults = await storage.searchDocuments(searchRequest); // Boost relevance scores for knowledge base documents to prioritize them allDocuments = (localResults.results || []).map(doc => ({ ...doc, relevanceScore: Math.min(doc.relevanceScore + 0.5, 1.0) // Boost by 0.5 })); } // Validate URLs in local storage results as well if (allDocuments.length > 0) { console.log('Validating URLs in local storage results...'); const documentsWithUrls = allDocuments.filter(doc => doc.url); if (documentsWithUrls.length > 0) { const urls = documentsWithUrls.map(doc => doc.url).filter((url): url is string => url !== null); const validationResults = await validateUrls(urls); // Filter out documents with invalid URLs allDocuments = allDocuments.filter(doc => { if (!doc.url) return true; // Keep documents without URLs const isValid = validationResults.get(doc.url); if (!isValid) { console.log(`Filtered out local document with invalid URL: ${doc.url} (${doc.title})`); } return isValid; }); console.log(`Local URL validation completed. 
${allDocuments.length} documents have valid URLs.`); } } // Always search external sources to provide comprehensive results console.log(`🌐 Searching external sources to supplement ${allDocuments.length} local results...`); // Check if we should search GitHub const isCodeQuery = searchRequest.query.toLowerCase().includes('python') || searchRequest.query.toLowerCase().includes('data structures') || searchRequest.query.toLowerCase().includes('algorithm') || searchRequest.query.toLowerCase().includes('repository') || searchRequest.query.toLowerCase().includes('code') || searchRequest.query.toLowerCase().includes('programming') || searchRequest.query.toLowerCase().includes('github'); // Enhanced keyword detection for AI/ML queries that might have relevant code const isAIQuery = searchRequest.query.toLowerCase().includes('mistral') || searchRequest.query.toLowerCase().includes('llama') || searchRequest.query.toLowerCase().includes('transformer') || searchRequest.query.toLowerCase().includes('gpt') || searchRequest.query.toLowerCase().includes('ai') || searchRequest.query.toLowerCase().includes('machine learning') || searchRequest.query.toLowerCase().includes('neural network'); // Query analysis for external search triggers // Enhanced external search with better error handling and timeouts const externalSearchPromises = []; // GitHub search for code and AI-related queries if ((isCodeQuery || isAIQuery) && process.env.GITHUB_TOKEN) { console.log('🐙 Searching GitHub...'); externalSearchPromises.push( Promise.race([ searchGitHubRepos(searchRequest.query, Math.min(3, Math.ceil(searchRequest.limit / 3))) .then(repos => ({ type: 'github', results: repos.map((repo, index) => transformGitHubRepoToDocument(repo, index + allDocuments.length, searchRequest.query) ) })) .catch(error => { console.log('🐙 GitHub search failed:', error.message); return { type: 'github', results: [] }; }), new Promise((_, reject) => setTimeout(() => reject(new Error('GitHub search timeout')), 8000) ) 
]).catch(() => ({ type: 'github', results: [] })) ); } // Always include web search for comprehensive coverage console.log('🌍 Searching web...'); externalSearchPromises.push( Promise.race([ searchWeb(searchRequest.query, Math.min(3, Math.ceil(searchRequest.limit / 3))) .then(webResults => ({ type: 'web', results: webResults.map((result, index) => transformWebResultToDocument(result, index + allDocuments.length, searchRequest.query) ) })) .catch(error => { console.log('🌍 Web search failed:', error.message); return { type: 'web', results: [] }; }), new Promise((_, reject) => setTimeout(() => reject(new Error('Web search timeout')), 5000) ) ]).catch(() => ({ type: 'web', results: [] })) ); // Wait for external searches with timeout protection if (externalSearchPromises.length > 0) { try { const externalResults = await Promise.all(externalSearchPromises); // Flatten and combine results const githubResult = externalResults.find((r: any) => r?.type === 'github') as any; const webResult = externalResults.find((r: any) => r?.type === 'web') as any; const githubResults = githubResult?.results || []; const webResults = webResult?.results || []; const allExternalResults = [...githubResults, ...webResults]; console.log(`🌐 Found ${allExternalResults.length} external results (GitHub: ${githubResults.length}, Web: ${webResults.length})`); // Combine local and external results, keeping local results prioritized if (allExternalResults.length > 0) { allDocuments = [...allDocuments, ...allExternalResults] .sort((a, b) => b.relevanceScore - a.relevanceScore) .slice(0, searchRequest.limit); } } catch (externalError: any) { console.log('🌐 External search failed:', externalError?.message || externalError); } } console.log(`✅ Total results: ${allDocuments.length}`); const searchTime = (Date.now() - startTime) / 1000; const response = { results: allDocuments, totalCount: allDocuments.length, searchTime, query: searchRequest.query, queryId: Date.now() }; res.json(response); } catch (error) 
{ if (error instanceof z.ZodError) { res.status(400).json({ message: "Invalid search request", errors: error.errors }); } else { console.error('Search error:', error); res.status(500).json({ message: "Internal server error" }); } } }); // AI explanation endpoint using Nebius app.post("/api/explain", async (req, res) => { try { const { title, snippet, content } = req.body; if (!title || !snippet) { return res.status(400).json({ message: "Title and snippet are required" }); } const prompt = `You are an expert communicator. Explain this document directly in a clear, conversational way suitable for audio playback. Do not show your thinking process - just provide the final explanation. Title: ${title} Content: ${snippet} Provide a brief, engaging explanation (2-3 sentences) that would be pleasant to listen to. Focus on the key concepts and practical value. Start your response immediately with the explanation.`; const response = await nebiusClient.createChatCompletion({ model: "deepseek-ai/DeepSeek-R1-0528", // Using DeepSeek model via Nebius messages: [{ role: "user", content: prompt }], max_tokens: 150, temperature: 0.7, }); const explanation = cleanThinkingTags(response.choices[0].message.content); res.json({ explanation }); } catch (error) { console.error('AI explanation error:', error); res.status(500).json({ message: "Failed to generate explanation" }); } }); // Enhanced AI-powered search using Nebius and Modal app.post("/api/ai-search", async (req, res) => { try { const { query, maxResults = 10, useQueryEnhancement = true } = req.body; if (!query || typeof query !== 'string') { return res.status(400).json({ message: "Query is required" }); } const results = await smartIngestionService.enhancedSearch(query, { maxResults, searchType: 'semantic', useQueryEnhancement }); res.json(results); } catch (error) { console.error('AI search error:', error); res.status(500).json({ message: "AI search failed", error: error instanceof Error ? 
error.message : 'Unknown error' });
  }
});

// Document analysis using Nebius AI
app.post("/api/analyze-document", async (req, res) => {
  try {
    const { content, analysisType = 'summary', useMarkdown = true } = req.body;
    if (!content) {
      return res.status(400).json({ message: "Content is required" });
    }
    const analysis = await nebiusClient.analyzeDocument({ content, analysisType, useMarkdown });
    res.json(analysis);
  } catch (error) {
    console.error('Document analysis error:', error);
    res.status(500).json({ message: "Document analysis failed", error: error instanceof Error ? error.message : 'Unknown error' });
  }
});

// Research synthesis using Nebius AI
// Synthesizes an answer for `query` across the documents named by `documentIds`.
app.post("/api/research-synthesis", async (req, res) => {
  try {
    const { query, documentIds } = req.body;
    if (!query || !Array.isArray(documentIds)) {
      return res.status(400).json({ message: "Query and document IDs are required" });
    }
    // Get documents from storage
    // NOTE(review): lookups run in parallel; ids that resolve to nothing are
    // silently dropped by the filter below rather than failing the request.
    const documents = await Promise.all(
      documentIds.map(id => storage.getDocument(id))
    );
    const validDocuments = documents.filter(Boolean);
    if (validDocuments.length === 0) {
      return res.status(400).json({ message: "No valid documents found" });
    }
    const synthesis = await smartIngestionService.generateResearchSynthesis(
      query,
      validDocuments
    );
    res.json(synthesis);
  } catch (error) {
    console.error('Research synthesis error:', error);
    res.status(500).json({ message: "Research synthesis failed", error: error instanceof Error ?
error.message : 'Unknown error' });
  }
});

// Query enhancement using Nebius AI
app.post("/api/enhance-query", async (req, res) => {
  try {
    const { query, context } = req.body;
    if (!query) {
      return res.status(400).json({ message: "Query is required" });
    }
    const enhancement = await nebiusClient.enhanceQuery(query, context);
    // Clean up any thinking tags that might appear in string fields
    enhancement.enhancedQuery = cleanThinkingTags(enhancement.enhancedQuery);
    enhancement.intent = cleanThinkingTags(enhancement.intent);
    res.json(enhancement);
  } catch (error) {
    console.error('Query enhancement error:', error);
    res.status(500).json({ message: "Query enhancement failed", error: error instanceof Error ? error.message : 'Unknown error' });
  }
});

// Modal processing status endpoint
// Polls the status of an async Modal task by its id.
app.get("/api/modal-task/:taskId", async (req, res) => {
  try {
    const { taskId } = req.params;
    const status = await modalClient.getTaskStatus(taskId);
    res.json(status);
  } catch (error) {
    console.error('Modal task status error:', error);
    res.status(500).json({ message: "Failed to get task status", error: error instanceof Error ? error.message : 'Unknown error' });
  }
});

// Batch document ingestion using Modal
app.post("/api/batch-ingest", async (req, res) => {
  try {
    const { documents } = req.body;
    if (!Array.isArray(documents) || documents.length === 0) {
      return res.status(400).json({ message: "Documents array is required" });
    }
    // Normalize loose client payloads into the upload shape the ingestion service expects.
    const uploads = documents.map(doc => ({
      file: doc.content || '',
      filename: doc.filename || 'unknown.txt',
      contentType: doc.contentType || 'text/plain',
      metadata: doc.metadata || {}
    }));
    const result = await smartIngestionService.batchIngestDocuments(uploads);
    res.json(result);
  } catch (error) {
    console.error('Batch ingestion error:', error);
    res.status(500).json({ message: "Batch ingestion failed", error: error instanceof Error ?
error.message : 'Unknown error' });
  }
});

// API Health Check endpoint
app.get("/api/health", async (req, res) => {
  try {
    // Lazy dynamic import keeps the health-check module off the startup path.
    const { checkAPIHealth } = await import('./api-health-check');
    const healthStatus = await checkAPIHealth();
    const overallHealthy = healthStatus.every(status => status.status !== 'error');
    // 503 lets load balancers / orchestrators see that a dependency is down.
    res.status(overallHealthy ? 200 : 503).json({
      overall: overallHealthy ? 'healthy' : 'issues_detected',
      services: healthStatus,
      timestamp: new Date().toISOString()
    });
  } catch (error) {
    res.status(500).json({ overall: 'error', message: 'Health check failed', error: error instanceof Error ? error.message : 'Unknown error' });
  }
});

// Generate embeddings using Nebius
app.post("/api/embeddings", async (req, res) => {
  try {
    // NOTE(review): default model name is OpenAI's ada-002 — confirm the Nebius
    // backend actually accepts it (or maps it) before relying on the default.
    const { input, model = 'text-embedding-ada-002' } = req.body;
    if (!input) {
      return res.status(400).json({ message: "Input text is required" });
    }
    // NOTE(review): assumes `input` is a string — `substring` throws if an array is sent.
    console.log('Generating embeddings for input:', input.substring(0, 100) + '...');
    const embeddings = await nebiusClient.createEmbeddings({ input, model });
    console.log('Embeddings generated successfully');
    res.json(embeddings);
  } catch (error) {
    console.error('Embeddings error:', error);
    res.status(500).json({ message: "Embedding generation failed", error: error instanceof Error ? error.message : 'Unknown error' });
  }
});

// Other routes...
app.get("/api/documents", async (req, res) => { try { const limit = parseInt(req.query.limit as string) || 50; const offset = parseInt(req.query.offset as string) || 0; const documents = await storage.getDocuments(limit, offset); res.json(documents); } catch (error) { res.status(500).json({ message: "Failed to fetch documents" }); } }); // Register document routes - enable uploads by default for all environments // Hugging Face Spaces have /tmp storage which is suitable for uploads const isHuggingFaceSpace = process.env.SPACE_ID || process.env.HF_SPACE_ID || process.env.HUGGINGFACE_SPACE_ID || process.env.HF_TOKEN || false; const hasWritableStorage = process.env.NODE_ENV === 'production' ? fs.existsSync('/tmp') : true; // Development always has writable storage // Force enable uploads for Hugging Face Spaces, otherwise check DISABLE_UPLOADS const isDocumentUploadEnabled = isHuggingFaceSpace ? true : (process.env.DISABLE_UPLOADS !== 'true'); console.log('🔍 Environment check:', { NODE_ENV: process.env.NODE_ENV, DISABLE_UPLOADS: process.env.DISABLE_UPLOADS, isHuggingFaceSpace: !!isHuggingFaceSpace, hasWritableStorage, isDocumentUploadEnabled }); if (isDocumentUploadEnabled) { console.log('✅ Document uploads enabled - full functionality available'); app.use("/api/documents", documentRoutes); } else { console.log('â„šī¸ Document uploads disabled - using fallback routes'); app.use("/api/documents", uploadFallbackRoutes); } const httpServer = createServer(app); return httpServer; }