Spaces:
Running
Running
File size: 4,455 Bytes
1b44660 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 |
import {
boolean,
index,
integer,
jsonb,
pgEnum,
pgTable,
serial,
text,
timestamp,
vector,
bigserial,
unique,
} from 'drizzle-orm/pg-core';
import { sql } from 'drizzle-orm';
import type { DataSourceConfigWrapperType } from './validators/dataSourceConfig';
import type { AnalysisPayloadWrapper } from './validators/analysisPayload';
/**
 * Note: table objects exported from this module are prefixed with `$`.
 * This keeps plain names such as `sources`, `articles`, and `reports` free
 * for use as ordinary variables elsewhere in the codebase.
 */
/**
 * Postgres enum `ingested_item_status`.
 *
 * Used as the type of the `status` column on `$ingested_items`, where the
 * column default is `'NEW'`. The order of the values below is part of the
 * enum definition and must not be rearranged casually.
 */
export const ingestedItemStatusEnum = pgEnum('ingested_item_status', [
  // States without a FAILED_/SKIPPED_ prefix.
  'NEW',
  'PENDING_PROCESSING',
  'PROCESSED',

  // FAILED_* states; the suffix names the step that failed.
  'FAILED_RENDER',
  'FAILED_FETCH',
  'FAILED_PROCESSING',
  'FAILED_EMBEDDING',
  'FAILED_R2_UPLOAD',

  // SKIPPED_* states; the suffix names why the item was skipped.
  'SKIPPED_PDF',
  'SKIPPED_TOO_OLD',
]);
/**
 * Postgres enum `source_type`: the kind of a data source.
 * `'RSS'` is currently the only declared value.
 */
export const sourceTypeEnum = pgEnum('source_type', [
  'RSS',
]);
/**
 * `publishers` table.
 *
 * Referenced by `$data_sources.publisher_id`.
 */
export const $publishers = pgTable('publishers', {
  // Auto-incrementing primary key.
  id: serial('id').primaryKey(),

  // Required publisher name.
  name: text('name').notNull(),

  // Optional (nullable) base URL.
  base_url: text('base_url'),

  // Row creation time; defaults to now() and is never null.
  created_at: timestamp('created_at', { mode: 'date' }).defaultNow().notNull(),
});
/**
 * `data_sources` table: one row per configured source.
 *
 * Note that the `lastChecked` key is camelCase while its SQL column is
 * `last_checked`; the remaining keys match their column names.
 */
export const $data_sources = pgTable('data_sources', {
  // Auto-incrementing primary key.
  id: serial('id').primaryKey(),

  // Required source name.
  name: text('name').notNull(),

  // Kind of source; no explicit column name is passed to the enum builder here.
  source_type: sourceTypeEnum().notNull(),

  // Stores source-specific config like
  // {"url": "...", "config_schema_version": "1.0", "paywall": false, "category": "..."}
  config: jsonb('config').$type<DataSourceConfigWrapperType>().notNull(),

  // Hash of config to detect changes (nullable).
  config_version_hash: text('config_version_hash'),

  // Optional foreign key to publishers.id.
  publisher_id: integer('publisher_id').references(() => $publishers.id),

  // Default: 4 hours
  scrape_frequency_minutes: integer('scrape_frequency_minutes').notNull().default(240),

  // Nullable timestamp stored in the `last_checked` column.
  lastChecked: timestamp('last_checked', { mode: 'date' }),

  // Nullable timestamp.
  // NOTE(review): "do" presumably stands for Durable Object — confirm.
  do_initialized_at: timestamp('do_initialized_at', { mode: 'date' }),

  // Both default to now() at insert time and are never null.
  // NOTE(review): nothing in this schema refreshes updated_at on update;
  // presumably callers set it explicitly — confirm.
  created_at: timestamp('created_at', { mode: 'date' }).defaultNow().notNull(),
  updated_at: timestamp('updated_at', { mode: 'date' }).defaultNow().notNull(),
});
/**
 * `ingested_items` table: one row per item pulled from a data source.
 *
 * Constraints and indexes declared here:
 *  - `url_to_original` is unique on its own.
 *  - (`data_source_id`, `item_id_from_source`) is unique (`uniqueSourceItem`).
 *  - `embedding` has an HNSW index using `vector_cosine_ops` (`embeddingIndex`).
 *
 * Note that the `usedBrowser` key is camelCase while its SQL column is
 * `used_browser`.
 */
export const $ingested_items = pgTable(
  'ingested_items',
  {
    // Auto-incrementing 64-bit primary key, surfaced as a JS number.
    id: bigserial('id', { mode: 'number' }).primaryKey(),

    // RSS guid, Tweet ID, etc.
    item_id_from_source: text('item_id_from_source').notNull(),

    // R2 key for original payload
    raw_data_r2_key: text('raw_data_r2_key').notNull(),

    // nullable, might be derived later
    display_title: text('display_title'),

    // Required and globally unique.
    url_to_original: text('url_to_original').notNull().unique(),

    // Nullable publication time.
    published_at: timestamp('published_at', { mode: 'date' }),

    // Defaults to 'NEW'. The column is not marked notNull, so it can hold NULL.
    status: ingestedItemStatusEnum().default('NEW'),

    // R2 key for processed text
    content_body_r2_key: text('content_body_r2_key'),

    // inline snippet or full text if small
    content_body_text: text('content_body_text'),

    // Nullable word count.
    word_count: integer('word_count'),

    // text used to generate embedding
    embedding_text: text('embedding_text'),

    // structured LLM analysis
    // NOTE(review): `typeof AnalysisPayloadWrapper` is the type of the imported
    // binding itself. If that binding is a validator/schema object, this column
    // is typed as the schema rather than the validated payload — confirm
    // against ./validators/analysisPayload.
    analysis_payload: jsonb('analysis_payload').$type<typeof AnalysisPayloadWrapper>(),

    // small, queryable metadata
    source_specific_metadata: jsonb('source_specific_metadata'),

    // Nullable boolean stored in the `used_browser` column.
    usedBrowser: boolean('used_browser'),

    // Nullable 384-dimension vector.
    embedding: vector('embedding', { dimensions: 384 }),

    // Nullable free-text failure reason.
    fail_reason: text('fail_reason'),

    // Required foreign key to data_sources.id.
    data_source_id: integer('data_source_id')
      .references(() => $data_sources.id)
      .notNull(),

    // Nullable processing time.
    processed_at: timestamp('processed_at', { mode: 'date' }),

    // Defaults to CURRENT_TIMESTAMP; not marked notNull.
    ingested_at: timestamp('ingested_at', { mode: 'date' }).default(sql`CURRENT_TIMESTAMP`),
  },
  (t) => [
    // HNSW index over the embedding column using the cosine operator class.
    index('embeddingIndex').using('hnsw', t.embedding.op('vector_cosine_ops')),

    // An item id may appear only once per data source.
    unique('uniqueSourceItem').on(t.data_source_id, t.item_id_from_source),
  ]
);
/**
 * `reports` table.
 *
 * Several keys are camelCase while their SQL columns are snake_case
 * (e.g. `totalArticles` -> `total_articles`, `createdAt` -> `created_at`).
 */
export const $reports = pgTable('reports', {
  // Auto-incrementing primary key.
  id: serial('id').primaryKey(),

  // Required title and body text.
  title: text('title').notNull(),
  content: text('content').notNull(),

  // Required integer counters.
  totalArticles: integer('total_articles').notNull(),
  totalSources: integer('total_sources').notNull(),
  usedArticles: integer('used_articles').notNull(),
  usedSources: integer('used_sources').notNull(),

  // Optional (nullable) fields; clustering_params is untyped JSON.
  tldr: text('tldr'),
  clustering_params: jsonb('clustering_params'),
  model_author: text('model_author'),

  // Creation time; defaults to CURRENT_TIMESTAMP and is never null.
  createdAt: timestamp('created_at', { mode: 'date' })
    .default(sql`CURRENT_TIMESTAMP`)
    .notNull(),
});
/**
 * `newsletter` table: one row per email address.
 */
export const $newsletter = pgTable('newsletter', {
  // Auto-incrementing primary key.
  id: serial('id').primaryKey(),

  // Required; the unique constraint prevents duplicate addresses.
  email: text('email').notNull().unique(),

  // Defaults to CURRENT_TIMESTAMP; not marked notNull, so it can hold NULL.
  createdAt: timestamp('created_at', { mode: 'date' }).default(sql`CURRENT_TIMESTAMP`),
});
|