yunlonggong's picture
Initial project upload
1b44660
import {
boolean,
index,
integer,
jsonb,
pgEnum,
pgTable,
serial,
text,
timestamp,
vector,
bigserial,
unique,
} from 'drizzle-orm/pg-core';
import { sql } from 'drizzle-orm';
import type { DataSourceConfigWrapperType } from './validators/dataSourceConfig';
import type { AnalysisPayloadWrapper } from './validators/analysisPayload';
/**
* Note: We use $ to denote the table objects
* This frees up the uses of sources, articles, reports, etc as variables in the codebase
**/
export const ingestedItemStatusEnum = pgEnum('ingested_item_status', [
'NEW',
'PENDING_PROCESSING',
'PROCESSED',
'FAILED_RENDER',
'FAILED_FETCH',
'FAILED_PROCESSING',
'FAILED_EMBEDDING',
'FAILED_R2_UPLOAD',
'SKIPPED_PDF',
'SKIPPED_TOO_OLD',
]);
export const sourceTypeEnum = pgEnum('source_type', ['RSS']);
export const $publishers = pgTable('publishers', {
id: serial('id').primaryKey(),
name: text('name').notNull(),
base_url: text('base_url'),
created_at: timestamp('created_at', { mode: 'date' }).defaultNow().notNull(),
});
export const $data_sources = pgTable('data_sources', {
id: serial('id').primaryKey(),
name: text('name').notNull(),
source_type: sourceTypeEnum().notNull(),
config: jsonb('config').$type<DataSourceConfigWrapperType>().notNull(), // Stores source-specific config like {"url": "...", "config_schema_version": "1.0", "paywall": false, "category": "..."}
config_version_hash: text('config_version_hash'), // Hash of config to detect changes
publisher_id: integer('publisher_id').references(() => $publishers.id),
scrape_frequency_minutes: integer('scrape_frequency_minutes').notNull().default(240), // Default: 4 hours
lastChecked: timestamp('last_checked', { mode: 'date' }),
do_initialized_at: timestamp('do_initialized_at', { mode: 'date' }),
created_at: timestamp('created_at', { mode: 'date' }).defaultNow().notNull(),
updated_at: timestamp('updated_at', { mode: 'date' }).defaultNow().notNull(),
});
export const $ingested_items = pgTable(
'ingested_items',
{
id: bigserial('id', { mode: 'number' }).primaryKey(),
item_id_from_source: text('item_id_from_source').notNull(), // RSS guid, Tweet ID, etc.
raw_data_r2_key: text('raw_data_r2_key').notNull(), // R2 key for original payload
display_title: text('display_title'), // nullable, might be derived later
url_to_original: text('url_to_original').notNull().unique(),
published_at: timestamp('published_at', { mode: 'date' }),
status: ingestedItemStatusEnum().default('NEW'),
content_body_r2_key: text('content_body_r2_key'), // R2 key for processed text
content_body_text: text('content_body_text'), // inline snippet or full text if small
word_count: integer('word_count'),
embedding_text: text('embedding_text'), // text used to generate embedding
analysis_payload: jsonb('analysis_payload').$type<typeof AnalysisPayloadWrapper>(), // structured LLM analysis
source_specific_metadata: jsonb('source_specific_metadata'), // small, queryable metadata
usedBrowser: boolean('used_browser'),
embedding: vector('embedding', { dimensions: 384 }),
fail_reason: text('fail_reason'),
data_source_id: integer('data_source_id')
.references(() => $data_sources.id)
.notNull(),
processed_at: timestamp('processed_at', { mode: 'date' }),
ingested_at: timestamp('ingested_at', { mode: 'date' }).default(sql`CURRENT_TIMESTAMP`),
},
table => [
index('embeddingIndex').using('hnsw', table.embedding.op('vector_cosine_ops')),
unique('uniqueSourceItem').on(table.data_source_id, table.item_id_from_source),
]
);
export const $reports = pgTable('reports', {
id: serial('id').primaryKey(),
title: text('title').notNull(),
content: text('content').notNull(),
totalArticles: integer('total_articles').notNull(),
totalSources: integer('total_sources').notNull(),
usedArticles: integer('used_articles').notNull(),
usedSources: integer('used_sources').notNull(),
tldr: text('tldr'),
clustering_params: jsonb('clustering_params'),
model_author: text('model_author'),
createdAt: timestamp('created_at', { mode: 'date' })
.default(sql`CURRENT_TIMESTAMP`)
.notNull(),
});
export const $newsletter = pgTable('newsletter', {
id: serial('id').primaryKey(),
email: text('email').notNull().unique(),
createdAt: timestamp('created_at', { mode: 'date' }).default(sql`CURRENT_TIMESTAMP`),
});