import { boolean, index, integer, jsonb, pgEnum, pgTable, serial, text, timestamp, vector, bigserial, unique, } from 'drizzle-orm/pg-core'; import { sql } from 'drizzle-orm'; import type { DataSourceConfigWrapperType } from './validators/dataSourceConfig'; import type { AnalysisPayloadWrapper } from './validators/analysisPayload'; /** * Note: We use $ to denote the table objects * This frees up the uses of sources, articles, reports, etc as variables in the codebase **/ export const ingestedItemStatusEnum = pgEnum('ingested_item_status', [ 'NEW', 'PENDING_PROCESSING', 'PROCESSED', 'FAILED_RENDER', 'FAILED_FETCH', 'FAILED_PROCESSING', 'FAILED_EMBEDDING', 'FAILED_R2_UPLOAD', 'SKIPPED_PDF', 'SKIPPED_TOO_OLD', ]); export const sourceTypeEnum = pgEnum('source_type', ['RSS']); export const $publishers = pgTable('publishers', { id: serial('id').primaryKey(), name: text('name').notNull(), base_url: text('base_url'), created_at: timestamp('created_at', { mode: 'date' }).defaultNow().notNull(), }); export const $data_sources = pgTable('data_sources', { id: serial('id').primaryKey(), name: text('name').notNull(), source_type: sourceTypeEnum().notNull(), config: jsonb('config').$type().notNull(), // Stores source-specific config like {"url": "...", "config_schema_version": "1.0", "paywall": false, "category": "..."} config_version_hash: text('config_version_hash'), // Hash of config to detect changes publisher_id: integer('publisher_id').references(() => $publishers.id), scrape_frequency_minutes: integer('scrape_frequency_minutes').notNull().default(240), // Default: 4 hours lastChecked: timestamp('last_checked', { mode: 'date' }), do_initialized_at: timestamp('do_initialized_at', { mode: 'date' }), created_at: timestamp('created_at', { mode: 'date' }).defaultNow().notNull(), updated_at: timestamp('updated_at', { mode: 'date' }).defaultNow().notNull(), }); export const $ingested_items = pgTable( 'ingested_items', { id: bigserial('id', { mode: 'number' }).primaryKey(), item_id_from_source: text('item_id_from_source').notNull(), // RSS guid, Tweet ID, etc. raw_data_r2_key: text('raw_data_r2_key').notNull(), // R2 key for original payload display_title: text('display_title'), // nullable, might be derived later url_to_original: text('url_to_original').notNull().unique(), published_at: timestamp('published_at', { mode: 'date' }), status: ingestedItemStatusEnum().default('NEW'), content_body_r2_key: text('content_body_r2_key'), // R2 key for processed text content_body_text: text('content_body_text'), // inline snippet or full text if small word_count: integer('word_count'), embedding_text: text('embedding_text'), // text used to generate embedding analysis_payload: jsonb('analysis_payload').$type(), // structured LLM analysis source_specific_metadata: jsonb('source_specific_metadata'), // small, queryable metadata usedBrowser: boolean('used_browser'), embedding: vector('embedding', { dimensions: 384 }), fail_reason: text('fail_reason'), data_source_id: integer('data_source_id') .references(() => $data_sources.id) .notNull(), processed_at: timestamp('processed_at', { mode: 'date' }), ingested_at: timestamp('ingested_at', { mode: 'date' }).default(sql`CURRENT_TIMESTAMP`), }, table => [ index('embeddingIndex').using('hnsw', table.embedding.op('vector_cosine_ops')), unique('uniqueSourceItem').on(table.data_source_id, table.item_id_from_source), ] ); export const $reports = pgTable('reports', { id: serial('id').primaryKey(), title: text('title').notNull(), content: text('content').notNull(), totalArticles: integer('total_articles').notNull(), totalSources: integer('total_sources').notNull(), usedArticles: integer('used_articles').notNull(), usedSources: integer('used_sources').notNull(), tldr: text('tldr'), clustering_params: jsonb('clustering_params'), model_author: text('model_author'), createdAt: timestamp('created_at', { mode: 'date' }) .default(sql`CURRENT_TIMESTAMP`) .notNull(), }); export const $newsletter = pgTable('newsletter', { id: serial('id').primaryKey(), email: text('email').notNull().unique(), createdAt: timestamp('created_at', { mode: 'date' }).default(sql`CURRENT_TIMESTAMP`), });