// Provenance (captured from the repository's web file view):
//   author: AleksanderObuchowski
//   last commit: "add blacklists and change name" (7bbab39)
//   file size: 18 kB
const express = require('express');
const fs = require('fs');
const path = require('path');
const axios = require('axios');
const NodeCache = require('node-cache');
// Router that carries every endpoint registered in this module; it is exported at the bottom of the file.
const router = express.Router();
// In-memory cache for PubMed counts. stdTTL is node-cache's default time-to-live
// in seconds, so entries stored without an explicit TTL expire after one hour.
const cache = new NodeCache({ stdTTL: 3600 });
// JSON file read by loadAlgorithms(); the routes read its top-level `algorithms` map.
const ALGORITHMS_PATH = path.join(__dirname, '../../data/algorithms.json');
// JSON file used by loadTimelineCache()/saveTimelineCache() to persist per-algorithm, per-year counts.
const TIMELINE_CACHE_PATH = path.join(__dirname, '../../data/timeline-cache.json');
// Base URL of the NCBI E-utilities API; `/esearch.fcgi` is appended for each search request.
const PUBMED_BASE_URL = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils';
/**
 * Reads the file at ALGORITHMS_PATH synchronously and parses it as JSON.
 *
 * @returns {*} The parsed JSON value stored in the file.
 * @throws {Error} If the file cannot be read, or a SyntaxError if it is not valid JSON.
 */
function loadAlgorithms() {
  const rawJson = fs.readFileSync(ALGORITHMS_PATH, 'utf8');
  const parsed = JSON.parse(rawJson);
  return parsed;
}
/**
 * Loads the persisted timeline cache from TIMELINE_CACHE_PATH.
 *
 * Best-effort: a missing file yields an empty object, and a read or parse
 * failure is reported with console.warn and also yields an empty object.
 *
 * @returns {*} The parsed JSON content of the cache file, or `{}` as fallback.
 */
function loadTimelineCache() {
  try {
    if (!fs.existsSync(TIMELINE_CACHE_PATH)) {
      return {};
    }
    const rawJson = fs.readFileSync(TIMELINE_CACHE_PATH, 'utf8');
    return JSON.parse(rawJson);
  } catch (error) {
    console.warn('Error loading timeline cache:', error.message);
    return {};
  }
}
/**
 * Writes the timeline cache to TIMELINE_CACHE_PATH as JSON indented by two spaces.
 *
 * Best-effort: serialization or write failures are reported with
 * console.error and are not rethrown.
 *
 * @param {*} cache - Value to serialize. Note that inside this function the
 *   parameter shadows the module-level `cache` binding.
 * @returns {void}
 */
function saveTimelineCache(cache) {
  try {
    const serialized = JSON.stringify(cache, null, 2);
    fs.writeFileSync(TIMELINE_CACHE_PATH, serialized);
  } catch (error) {
    console.error('Error saving timeline cache:', error.message);
  }
}
/**
 * Builds the timeline-cache key for one algorithm/year pair.
 *
 * @param {*} algorithmKey - Algorithm identifier; interpolated as a string.
 * @param {*} year - Year; interpolated as a string.
 * @returns {string} The two values joined by a hyphen, e.g. "cnn-2020".
 */
function getCacheKey(algorithmKey, year) {
  const cacheKey = `${algorithmKey}-${year}`;
  return cacheKey;
}
/**
 * Tells whether `year` is strictly equal to the current calendar year
 * according to the local clock.
 *
 * @param {*} year - Value to compare; no coercion is applied, so a numeric
 *   string never matches.
 * @returns {boolean} True only when `year` is the number of the current year.
 */
function isCurrentYear(year) {
  const thisYear = new Date().getFullYear();
  return thisYear === year;
}
/**
 * Counts PubMed records that mention `problem` together with any synonym of
 * one algorithm, excluding blacklisted terms and review-type publications.
 *
 * The function never rejects because of a failed request: any error raised
 * while building or running the search is logged and reported as a count of 0.
 *
 * @param {string} problem - Phrase describing the problem; quoted verbatim in the query.
 * @param {string} algorithmKey - Identifier of the algorithm, echoed in the result.
 * @param {object} algorithmData - Algorithm entry; `name`, `category`,
 *   `description`, `synonyms` (array) and the optional `blacklist` (array) are read.
 * @returns {Promise<{algorithm: string, name: *, category: *, description: *, count: number, sampleIds: Array}>}
 *   The algorithm's metadata plus the match count and up to three record ids.
 */
async function searchAlgorithmUsage(problem, algorithmKey, algorithmData) {
  // Verbose tracing stays limited to the two algorithms that were traced before.
  const traced = algorithmKey === 'cnn' || algorithmKey === 'gan';
  const traceLabel = String(algorithmKey).toUpperCase();

  const summary = {
    algorithm: algorithmKey,
    name: algorithmData.name,
    category: algorithmData.category,
    description: algorithmData.description,
  };

  try {
    const synonymQueries = algorithmData.synonyms
      .map((synonym) => `("${problem}" AND "${synonym}")`)
      .join(' OR ');

    // Each blacklisted term becomes its own ` NOT "term"` clause.
    const blacklistExclusions = (algorithmData.blacklist ?? [])
      .map((term) => ` NOT "${term}"`)
      .join('');

    // Exclude review papers, meta-analyses and systematic reviews.
    const filteredQuery = `(${synonymQueries})${blacklistExclusions} NOT Review[Publication Type] NOT Meta-Analysis[Publication Type] NOT Systematic Review[Publication Type]`;
    const searchUrl = `${PUBMED_BASE_URL}/esearch.fcgi?db=pubmed&term=${encodeURIComponent(filteredQuery)}&retmode=json`;

    if (traced) {
      console.log(`${traceLabel} Search for "${problem}":`);
      console.log(`Query: ${filteredQuery}`);
      console.log(`URL: ${searchUrl}`);
    }

    // Same timeout as the other PubMed fetchers in this file, so a stalled
    // connection cannot hang the request indefinitely.
    const response = await axios.get(searchUrl, { timeout: 15000 });
    const { count: rawCount, idlist } = response.data.esearchresult;
    const count = Number.parseInt(rawCount, 10) || 0;
    const sampleIds = idlist?.slice(0, 3) ?? [];

    if (traced) {
      console.log(`${traceLabel} Results: ${count} papers found`);
      console.log('Sample IDs:', sampleIds);
    }

    return { ...summary, count, sampleIds };
  } catch (error) {
    // Log for every algorithm: a silent 0 is indistinguishable from "no papers".
    console.error(`Error searching PubMed for ${algorithmData.name}:`, error.message);
    return { ...summary, count: 0, sampleIds: [] };
  }
}
/**
 * POST /problem
 *
 * Body: `{ problem: string }`. Runs searchAlgorithmUsage for every entry of
 * the algorithm catalogue, one after another, and responds with the results
 * sorted by descending count.
 *
 * Responses:
 *   400 - `problem` is missing, not a string, or only whitespace.
 *   200 - `{ problem, totalAlgorithms, results, allResults }`, where `results`
 *         holds only algorithms with a positive count.
 *   500 - an unexpected error occurred; its message is returned.
 */
router.post('/problem', async (req, res) => {
  try {
    const { problem } = req.body;
    if (!problem) {
      return res.status(400).json({ error: 'Problem parameter is required' });
    }
    // A JSON body can carry any type; only a non-blank string makes a
    // meaningful quoted search phrase.
    if (typeof problem !== 'string' || problem.trim() === '') {
      return res.status(400).json({ error: 'Problem parameter must be a non-empty string' });
    }

    const algorithms = loadAlgorithms();
    const results = [];
    // Requests are issued sequentially rather than in parallel.
    for (const [key, algo] of Object.entries(algorithms.algorithms)) {
      results.push(await searchAlgorithmUsage(problem, key, algo));
    }
    results.sort((a, b) => b.count - a.count);

    res.json({
      problem,
      totalAlgorithms: results.length,
      results: results.filter((result) => result.count > 0),
      allResults: results,
    });
  } catch (error) {
    console.error('Problem search error:', error);
    res.status(500).json({ error: error.message });
  }
});
/**
 * Returns a promise that resolves (with no value) after `ms` milliseconds.
 *
 * @param {number} ms - Delay passed to setTimeout.
 * @returns {Promise<void>}
 */
function delay(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
/**
 * Returns how many PubMed records match any synonym of an algorithm,
 * excluding blacklisted terms and review-type publications.
 *
 * A value found in the in-memory cache under `dashboard-<key>` is returned
 * without a request. A freshly fetched count is stored under that key. A
 * failed request is logged and reported as 0 without being cached.
 *
 * @param {string} key - Algorithm identifier used to build the cache key.
 * @param {object} algo - Algorithm entry; `name`, `synonyms` (array) and the
 *   optional `blacklist` (array) are read.
 * @returns {Promise<*>} The cached value, the fetched count, or 0 on failure.
 */
async function fetchAlgorithmCount(key, algo) {
  const cacheKey = `dashboard-${key}`;
  const hit = cache.get(cacheKey);
  if (hit !== undefined) {
    console.log(`${algo.name}: ${hit} results (cached)`);
    return hit;
  }

  const synonymClause = algo.synonyms.map((synonym) => `"${synonym}"`).join(' OR ');

  // Optional exclusions: one `NOT "term"` per blacklisted term.
  const hasBlacklist = algo.blacklist && algo.blacklist.length > 0;
  const blacklistExclusions = hasBlacklist
    ? ` ${algo.blacklist.map((term) => `NOT "${term}"`).join(' ')}`
    : '';

  // Review papers, meta-analyses and systematic reviews are filtered out.
  const filteredQuery = `(${synonymClause})${blacklistExclusions} NOT Review[Publication Type] NOT Meta-Analysis[Publication Type] NOT Systematic Review[Publication Type]`;

  try {
    const searchUrl = `${PUBMED_BASE_URL}/esearch.fcgi?db=pubmed&term=${encodeURIComponent(filteredQuery)}&retmode=json`;
    const { data } = await axios.get(searchUrl, { timeout: 15000 });
    const count = Number.parseInt(data.esearchresult.count) || 0;
    console.log(`${algo.name}: ${count} results for query: ${filteredQuery}`);
    cache.set(cacheKey, count);
    return count;
  } catch (error) {
    console.error(`Error fetching data for ${algo.name}:`, error.message);
    return 0;
  }
}
/**
 * GET /dashboard-stats
 *
 * Responds with the overall PubMed count of every catalogue algorithm,
 * grouped by `algo.category` and sorted by descending count within each
 * group. The three known categories are always present in the response;
 * any other category found in the catalogue gets its own group instead of
 * failing the whole request.
 *
 * Responses:
 *   200 - `{ classical_ml: [...], deep_learning: [...], llms: [...], ... }`
 *         with entries shaped `{ algorithm, name, count }`.
 *   500 - an unexpected error occurred; its message is returned.
 */
router.get('/dashboard-stats', async (req, res) => {
  try {
    const algorithms = loadAlgorithms();
    const stats = {
      classical_ml: [],
      deep_learning: [],
      llms: [],
    };

    // Algorithms are processed sequentially to avoid rate limiting.
    for (const [key, algo] of Object.entries(algorithms.algorithms)) {
      // fetchAlgorithmCount answers from the in-memory cache under this key
      // without issuing a request, so no pause is needed in that case.
      const servedFromCache = cache.get(`dashboard-${key}`) !== undefined;

      const count = await fetchAlgorithmCount(key, algo);

      if (!Object.prototype.hasOwnProperty.call(stats, algo.category)) {
        stats[algo.category] = [];
      }
      stats[algo.category].push({
        algorithm: key,
        name: algo.name,
        count,
      });

      if (!servedFromCache) {
        // Pause between real requests to respect rate limits.
        await delay(200);
      }
    }

    for (const group of Object.values(stats)) {
      group.sort((a, b) => b.count - a.count);
    }

    res.json(stats);
  } catch (error) {
    console.error('Dashboard stats error:', error);
    res.status(500).json({ error: error.message });
  }
});
/**
 * GET /pubmed-link?problem=...&algorithm=...
 *
 * Builds the PubMed website URL for the same filtered query used when
 * counting records for a problem/algorithm pair.
 *
 * Responses:
 *   400 - a parameter is missing, or was supplied as something other than a
 *         single string (for example a repeated query parameter).
 *   404 - `algorithm` is not a key of the algorithm catalogue.
 *   200 - `{ url }`.
 *   500 - the catalogue could not be loaded or another unexpected error
 *         occurred; its message is returned.
 */
router.get('/pubmed-link', (req, res) => {
  const { problem, algorithm } = req.query;
  if (!problem || !algorithm) {
    return res.status(400).json({ error: 'Both problem and algorithm parameters are required' });
  }
  if (typeof problem !== 'string' || typeof algorithm !== 'string') {
    return res.status(400).json({ error: 'problem and algorithm must each be a single string value' });
  }

  try {
    const catalogue = loadAlgorithms().algorithms;
    // Own-property check so inherited keys such as "constructor" are treated
    // as unknown algorithms instead of crashing on `.synonyms`.
    const isKnown = Object.prototype.hasOwnProperty.call(catalogue, algorithm);
    const algoData = isKnown ? catalogue[algorithm] : undefined;
    if (!algoData) {
      return res.status(404).json({ error: 'Algorithm not found' });
    }

    const synonymQueries = algoData.synonyms
      .map((synonym) => `("${problem}" AND "${synonym}")`)
      .join(' OR ');

    // Optional exclusions: one `NOT "term"` per blacklisted term.
    let blacklistExclusions = '';
    if (algoData.blacklist && algoData.blacklist.length > 0) {
      const blacklistTerms = algoData.blacklist.map((term) => `NOT "${term}"`).join(' ');
      blacklistExclusions = ` ${blacklistTerms}`;
    }

    // The link applies the same review-type filters as the count queries.
    const filteredQuery = `(${synonymQueries})${blacklistExclusions} NOT Review[Publication Type] NOT Meta-Analysis[Publication Type] NOT Systematic Review[Publication Type]`;
    const pubmedUrl = `https://pubmed.ncbi.nlm.nih.gov/?term=${encodeURIComponent(filteredQuery)}`;

    res.json({ url: pubmedUrl });
  } catch (error) {
    console.error('PubMed link error:', error);
    res.status(500).json({ error: error.message });
  }
});
/**
 * Returns how many PubMed records published in `year` match any synonym of
 * an algorithm, excluding blacklisted terms and review-type publications.
 *
 * Cache policy:
 *   - years other than the current one are read from and written to
 *     `diskCache` (the caller decides when to persist it);
 *   - the current year is read from and written to the in-memory cache.
 *
 * An HTTP 429 response is retried up to three times with waits of 1 s, 2 s
 * and 4 s. Any other failure, or a 429 after the last retry, is logged and
 * reported as 0 without being cached.
 *
 * @param {string} key - Algorithm identifier used in the cache keys.
 * @param {object} algo - Algorithm entry; `name`, `synonyms` (array) and the
 *   optional `blacklist` (array) are read.
 * @param {*} year - Publication year to filter on.
 * @param {object} diskCache - Mutable map of `<key>-<year>` to count; updated in place.
 * @param {number} [retryCount=0] - Number of retries already performed.
 * @returns {Promise<*>} The cached value, the fetched count, or 0 on failure.
 */
async function fetchAlgorithmCountByYear(key, algo, year, diskCache, retryCount = 0) {
  const diskKey = getCacheKey(key, year);

  // Past years never change, so a disk-cache entry is final.
  const diskEntry = diskCache[diskKey];
  if (!isCurrentYear(year) && diskEntry !== undefined) {
    console.log(`Using disk cache for ${algo.name} (${year}): ${diskEntry}`);
    return diskEntry;
  }

  // The current year is only ever cached in memory, where entries expire.
  const memoryKey = `timeline-${key}-${year}`;
  const memoryEntry = cache.get(memoryKey);
  if (isCurrentYear(year) && memoryEntry !== undefined) {
    return memoryEntry;
  }

  const synonymClause = algo.synonyms.map((synonym) => `"${synonym}"`).join(' OR ');
  const yearFilter = `"${year}"[Date - Publication]`;

  // Optional exclusions: one `NOT "term"` per blacklisted term.
  const hasBlacklist = algo.blacklist && algo.blacklist.length > 0;
  const blacklistExclusions = hasBlacklist
    ? ` ${algo.blacklist.map((term) => `NOT "${term}"`).join(' ')}`
    : '';

  const filteredQuery = `(${synonymClause}) AND ${yearFilter}${blacklistExclusions} NOT Review[Publication Type] NOT Meta-Analysis[Publication Type] NOT Systematic Review[Publication Type]`;

  try {
    const searchUrl = `${PUBMED_BASE_URL}/esearch.fcgi?db=pubmed&term=${encodeURIComponent(filteredQuery)}&retmode=json`;
    const { data } = await axios.get(searchUrl, { timeout: 15000 });
    const count = Number.parseInt(data.esearchresult.count) || 0;

    if (isCurrentYear(year)) {
      // Expiring entry: the current year's count can still grow.
      cache.set(memoryKey, count);
    } else {
      // Permanent entry, persisted later by the caller.
      diskCache[diskKey] = count;
    }

    console.log(`Fetched ${algo.name} (${year}): ${count} papers`);
    return count;
  } catch (error) {
    const rateLimited = error.response?.status === 429;
    if (rateLimited && retryCount < 3) {
      const backoffTime = 1000 * 2 ** retryCount; // 1s, 2s, 4s
      console.log(`Rate limited for ${algo.name} (${year}), retrying in ${backoffTime}ms (attempt ${retryCount + 1})`);
      await delay(backoffTime);
      return fetchAlgorithmCountByYear(key, algo, year, diskCache, retryCount + 1);
    }

    console.error(`Error fetching timeline data for ${algo.name} (${year}):`, error.message);
    return 0;
  }
}
/**
 * GET /timeline-stream?startYear=YYYY&endYear=YYYY
 *
 * Streams per-year PubMed counts for every catalogue algorithm as
 * Server-Sent Events. Each event is a JSON object with a `type` field:
 * `init`, `algorithm_start`, `year_complete`, `algorithm_complete`,
 * `cache_saved`, `complete` (carries the full result) or `error`.
 *
 * The range defaults to 2015-2024 and must satisfy
 * 2010 <= startYear <= endYear <= 2024; otherwise a plain JSON 400 response
 * is sent before the stream starts.
 * NOTE(review): the upper bound is hard-coded to 2024, which makes the
 * current-year handling unreachable in any later calendar year — confirm
 * whether the bound should follow the current year.
 */
router.get('/timeline-stream', async (req, res) => {
  const { startYear = 2015, endYear = 2024 } = req.query;
  const start = Number.parseInt(startYear, 10);
  const end = Number.parseInt(endYear, 10);
  // NaN compares false against everything, so it must be rejected explicitly.
  const rangeIsValid = !Number.isNaN(start) && !Number.isNaN(end)
    && start <= end && start >= 2010 && end <= 2024;
  if (!rangeIsValid) {
    return res.status(400).json({ error: 'Invalid year range' });
  }

  // Set up Server-Sent Events
  res.writeHead(200, {
    'Content-Type': 'text/event-stream',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Headers': 'Cache-Control'
  });

  // Stop fetching once the client is gone instead of working for nobody.
  let clientConnected = true;
  res.on('close', () => {
    clientConnected = false;
  });

  const sendProgress = (data) => {
    if (clientConnected) {
      res.write(`data: ${JSON.stringify(data)}\n\n`);
    }
  };

  try {
    const algorithms = loadAlgorithms();
    const diskCache = loadTimelineCache();
    const algorithmEntries = Object.entries(algorithms.algorithms);
    let cacheUpdated = false;

    const years = [];
    for (let year = start; year <= end; year++) {
      years.push(year);
    }

    // timelineData[i] always describes years[i].
    const timelineData = years.map((year) => ({ year }));
    const algorithmsData = [];

    // True when fetchAlgorithmCountByYear would answer from the disk cache.
    const isOnDisk = (key, year) =>
      !isCurrentYear(year) && diskCache[getCacheKey(key, year)] !== undefined;

    const totalOperations = algorithmEntries.length * years.length;
    let completedOperations = 0;
    const progressPercent = () => Math.round((completedOperations / totalOperations) * 100);

    // Count cached vs fetched upfront
    let cachedResults = 0;
    let fetchedResults = 0;
    for (const [key] of algorithmEntries) {
      for (const year of years) {
        if (isOnDisk(key, year)) {
          cachedResults++;
        } else {
          fetchedResults++;
        }
      }
    }

    sendProgress({
      type: 'init',
      totalOperations,
      cachedResults,
      fetchedResults,
      message: 'Starting timeline data collection...'
    });

    for (const [key, algo] of algorithmEntries) {
      if (!clientConnected) {
        break;
      }

      const algorithmTimeline = {
        algorithm: key,
        name: algo.name,
        category: algo.category,
        data: []
      };

      sendProgress({
        type: 'algorithm_start',
        algorithm: algo.name,
        progress: progressPercent(),
        completed: completedOperations,
        total: totalOperations
      });

      for (const [yearIndex, year] of years.entries()) {
        if (!clientConnected) {
          break;
        }

        // Must be sampled BEFORE the fetch: a successful fetch writes the
        // entry, after which a cache hit and a fresh result look the same.
        const servedFromDisk = isOnDisk(key, year);

        const count = await fetchAlgorithmCountByYear(key, algo, year, diskCache);
        algorithmTimeline.data.push({ year, count });
        timelineData[yearIndex][key] = count;

        completedOperations++;
        sendProgress({
          type: 'year_complete',
          algorithm: algo.name,
          year,
          count,
          progress: progressPercent(),
          completed: completedOperations,
          total: totalOperations
        });

        // The disk cache only needs saving when this call added an entry.
        if (!servedFromDisk && isOnDisk(key, year)) {
          cacheUpdated = true;
        }

        // Pause after anything that was not a disk-cache hit (a current-year
        // value may still have come from the in-memory cache).
        if (!servedFromDisk) {
          await delay(500);
        }
      }

      algorithmsData.push(algorithmTimeline);
      sendProgress({
        type: 'algorithm_complete',
        algorithm: algo.name,
        progress: progressPercent(),
        completed: completedOperations,
        total: totalOperations
      });
    }

    // Persist whatever was fetched, even if the client left early.
    if (cacheUpdated) {
      saveTimelineCache(diskCache);
      sendProgress({
        type: 'cache_saved',
        message: 'Timeline cache updated and saved to disk'
      });
    }

    // Send final results
    sendProgress({
      type: 'complete',
      timelineData,
      algorithms: algorithmsData,
      yearRange: { start, end },
      cacheStats: {
        cached: cachedResults,
        fetched: fetchedResults
      }
    });
    res.end();
  } catch (error) {
    console.error('Timeline stream error:', error);
    sendProgress({
      type: 'error',
      error: error.message
    });
    res.end();
  }
});
/**
 * GET /timeline?startYear=YYYY&endYear=YYYY
 *
 * Responds with per-year PubMed counts for every catalogue algorithm in a
 * single JSON document.
 *
 * The range defaults to 2015-2024 and must satisfy
 * 2010 <= startYear <= endYear <= 2024.
 * NOTE(review): the upper bound is hard-coded to 2024, which makes the
 * current-year handling unreachable in any later calendar year — confirm
 * whether the bound should follow the current year.
 *
 * Responses:
 *   400 - the year range is not numeric or violates the bounds above.
 *   200 - `{ timelineData, algorithms, yearRange, cacheStats }`, where
 *         `timelineData` has one `{ year, <algorithmKey>: count, ... }` row
 *         per year and `algorithms` has one `{ algorithm, name, category,
 *         data: [{ year, count }] }` entry per algorithm.
 *   500 - an unexpected error occurred; its message is returned.
 */
router.get('/timeline', async (req, res) => {
  try {
    const { startYear = 2015, endYear = 2024 } = req.query;
    const start = Number.parseInt(startYear, 10);
    const end = Number.parseInt(endYear, 10);
    // NaN compares false against everything, so it must be rejected explicitly.
    const rangeIsValid = !Number.isNaN(start) && !Number.isNaN(end)
      && start <= end && start >= 2010 && end <= 2024;
    if (!rangeIsValid) {
      return res.status(400).json({ error: 'Invalid year range' });
    }

    const algorithms = loadAlgorithms();
    const diskCache = loadTimelineCache();
    const algorithmEntries = Object.entries(algorithms.algorithms);
    let cacheUpdated = false;

    const years = [];
    for (let year = start; year <= end; year++) {
      years.push(year);
    }

    // timelineData[i] always describes years[i].
    const timelineData = years.map((year) => ({ year }));
    const algorithmsData = [];

    // True when fetchAlgorithmCountByYear would answer from the disk cache.
    const isOnDisk = (key, year) =>
      !isCurrentYear(year) && diskCache[getCacheKey(key, year)] !== undefined;

    // Count how many API calls we'll need to make
    let totalApiCalls = 0;
    let cachedResults = 0;
    for (const [key] of algorithmEntries) {
      for (const year of years) {
        if (isOnDisk(key, year)) {
          cachedResults++;
        } else {
          totalApiCalls++;
        }
      }
    }
    console.log(`Timeline request: ${cachedResults} cached results, ${totalApiCalls} API calls needed`);

    for (const [key, algo] of algorithmEntries) {
      const algorithmTimeline = {
        algorithm: key,
        name: algo.name,
        category: algo.category,
        data: []
      };

      for (const [yearIndex, year] of years.entries()) {
        // Must be sampled BEFORE the fetch: a successful fetch writes the
        // entry, after which a cache hit and a fresh result look the same.
        const servedFromDisk = isOnDisk(key, year);

        const count = await fetchAlgorithmCountByYear(key, algo, year, diskCache);
        algorithmTimeline.data.push({ year, count });
        timelineData[yearIndex][key] = count;

        // The disk cache only needs saving when this call added an entry.
        if (!servedFromDisk && isOnDisk(key, year)) {
          cacheUpdated = true;
        }

        // Pause after anything that was not a disk-cache hit (a current-year
        // value may still have come from the in-memory cache).
        if (!servedFromDisk) {
          await delay(500);
        }
      }

      algorithmsData.push(algorithmTimeline);
      console.log(`Completed timeline data for ${algo.name}`);
    }

    // Save updated cache to disk if needed
    if (cacheUpdated) {
      saveTimelineCache(diskCache);
      console.log('Timeline cache updated and saved to disk');
    }

    res.json({
      timelineData,
      algorithms: algorithmsData,
      yearRange: { start, end },
      cacheStats: {
        cached: cachedResults,
        fetched: totalApiCalls
      }
    });
  } catch (error) {
    console.error('Timeline API error:', error);
    res.status(500).json({ error: error.message });
  }
});
// Export the router with all routes registered above.
module.exports = router;