File size: 4,455 Bytes
1b44660
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import {
  boolean,
  index,
  integer,
  jsonb,
  pgEnum,
  pgTable,
  serial,
  text,
  timestamp,
  vector,
  bigserial,
  unique,
} from 'drizzle-orm/pg-core';
import { sql } from 'drizzle-orm';
import type { DataSourceConfigWrapperType } from './validators/dataSourceConfig';
import type { AnalysisPayloadWrapper } from './validators/analysisPayload';

/**
 * Note: table objects are prefixed with `$` (e.g. `$reports`).
 * This keeps plain names such as `sources`, `articles`, and `reports` free for use as variables elsewhere in the codebase.
 */

/**
 * Labels of the `ingested_item_status` Postgres enum, in declaration order.
 * Declared `as const` so each label keeps its string-literal type.
 */
const INGESTED_ITEM_STATUS_VALUES = [
  'NEW',
  'PENDING_PROCESSING',
  'PROCESSED',
  'FAILED_RENDER',
  'FAILED_FETCH',
  'FAILED_PROCESSING',
  'FAILED_EMBEDDING',
  'FAILED_R2_UPLOAD',
  'SKIPPED_PDF',
  'SKIPPED_TOO_OLD',
] as const;

/**
 * Postgres enum `ingested_item_status`.
 * Backs the `status` column of `$ingested_items`, whose default is `'NEW'`.
 */
export const ingestedItemStatusEnum = pgEnum('ingested_item_status', INGESTED_ITEM_STATUS_VALUES);

/** Labels of the `source_type` Postgres enum; `'RSS'` is currently the only one. */
const SOURCE_TYPE_VALUES = ['RSS'] as const;

/** Postgres enum `source_type`, used by the `source_type` column of `$data_sources`. */
export const sourceTypeEnum = pgEnum('source_type', SOURCE_TYPE_VALUES);

/**
 * `publishers` table.
 * Referenced by `$data_sources.publisher_id`.
 */
export const $publishers = pgTable('publishers', {
  // Serial integer primary key.
  id: serial('id').primaryKey(),
  // Required publisher name.
  name: text('name').notNull(),
  // Nullable base URL.
  base_url: text('base_url'),
  // Creation time, read as a JS Date; required and defaulted to now() by the database.
  created_at: timestamp('created_at', { mode: 'date' }).notNull().defaultNow(),
});

/**
 * `data_sources` table: one row per configured source that items are ingested from.
 *
 * Each row optionally points at a publisher (`publisher_id`) and is referenced
 * by `$ingested_items.data_source_id`.
 */
export const $data_sources = pgTable('data_sources', {
  id: serial('id').primaryKey(),
  name: text('name').notNull(),
  // Column name is spelled out (like every other column in this table) so it
  // does not depend on the property key being used as the DB column name.
  source_type: sourceTypeEnum('source_type').notNull(),
  config: jsonb('config').$type<DataSourceConfigWrapperType>().notNull(), // Stores source-specific config like {"url": "...", "config_schema_version": "1.0", "paywall": false, "category": "..."}
  config_version_hash: text('config_version_hash'), // Hash of config to detect changes
  // Nullable foreign key to `publishers.id`.
  publisher_id: integer('publisher_id').references(() => $publishers.id),
  scrape_frequency_minutes: integer('scrape_frequency_minutes').notNull().default(240), // Default: 4 hours
  // Nullable; the property is camelCase while the DB column is `last_checked`.
  lastChecked: timestamp('last_checked', { mode: 'date' }),
  // NOTE(review): presumably the time the source's "DO" was initialised — confirm what "DO" refers to.
  do_initialized_at: timestamp('do_initialized_at', { mode: 'date' }),
  created_at: timestamp('created_at', { mode: 'date' }).defaultNow().notNull(),
  // NOTE(review): only an insert-time default is declared here (no `$onUpdate`),
  // so writers — or a trigger defined outside this file — must bump this on update.
  updated_at: timestamp('updated_at', { mode: 'date' }).defaultNow().notNull(),
});

/**
 * `ingested_items` table: one row per item pulled from a data source.
 *
 * Constraints and indexes declared below:
 * - `url_to_original` is unique on its own;
 * - (`data_source_id`, `item_id_from_source`) is unique as a pair (`uniqueSourceItem`);
 * - `embedding` (384-dimension vector) has an HNSW index using `vector_cosine_ops`.
 */
export const $ingested_items = pgTable(
  'ingested_items',
  {
    id: bigserial('id', { mode: 'number' }).primaryKey(),

    item_id_from_source: text('item_id_from_source').notNull(), // RSS guid, Tweet ID, etc.
    raw_data_r2_key: text('raw_data_r2_key').notNull(), // R2 key for original payload

    display_title: text('display_title'), // nullable, might be derived later
    url_to_original: text('url_to_original').notNull().unique(),
    published_at: timestamp('published_at', { mode: 'date' }),

    // Column name is spelled out (like every other column in this table) so it
    // does not depend on the property key being used as the DB column name.
    // NOTE(review): defaults to 'NEW' but is not declared NOT NULL, so the
    // inferred select type includes null — confirm this is intended.
    status: ingestedItemStatusEnum('status').default('NEW'),

    content_body_r2_key: text('content_body_r2_key'), // R2 key for processed text
    content_body_text: text('content_body_text'), // inline snippet or full text if small
    word_count: integer('word_count'),

    embedding_text: text('embedding_text'), // text used to generate embedding
    // NOTE(review): typed as `typeof AnalysisPayloadWrapper`. If that export is a
    // runtime schema object rather than the payload type, this should be the
    // schema's inferred output type instead — verify in ./validators/analysisPayload.
    analysis_payload: jsonb('analysis_payload').$type<typeof AnalysisPayloadWrapper>(), // structured LLM analysis
    source_specific_metadata: jsonb('source_specific_metadata'), // small, queryable metadata

    // Nullable; the property is camelCase while the DB column is `used_browser`.
    usedBrowser: boolean('used_browser'),
    embedding: vector('embedding', { dimensions: 384 }),
    fail_reason: text('fail_reason'),

    // Required foreign key to `data_sources.id`.
    data_source_id: integer('data_source_id')
      .references(() => $data_sources.id)
      .notNull(),

    processed_at: timestamp('processed_at', { mode: 'date' }),
    // Defaults to CURRENT_TIMESTAMP at insert time; nullable as declared.
    ingested_at: timestamp('ingested_at', { mode: 'date' }).default(sql`CURRENT_TIMESTAMP`),
  },
  table => [
    // HNSW index over the embedding column with the cosine operator class.
    index('embeddingIndex').using('hnsw', table.embedding.op('vector_cosine_ops')),
    // A given source may contain each source-side item id at most once.
    unique('uniqueSourceItem').on(table.data_source_id, table.item_id_from_source),
  ]
);

/**
 * `reports` table.
 * Holds a titled body of text plus article/source counters and optional metadata.
 */
export const $reports = pgTable('reports', {
  id: serial('id').primaryKey(),

  // Required title and body text.
  title: text('title').notNull(),
  content: text('content').notNull(),

  // Required counters, stored as `total_*` / `used_*` columns.
  // NOTE(review): presumably "total" = considered and "used" = actually included — confirm with the writer.
  totalArticles: integer('total_articles').notNull(),
  totalSources: integer('total_sources').notNull(),
  usedArticles: integer('used_articles').notNull(),
  usedSources: integer('used_sources').notNull(),

  // Nullable text column.
  tldr: text('tldr'),

  // Nullable, untyped JSON.
  clustering_params: jsonb('clustering_params'),

  // Nullable text column.
  model_author: text('model_author'),

  // Required; the database fills in CURRENT_TIMESTAMP when no value is supplied.
  createdAt: timestamp('created_at', { mode: 'date' })
    .notNull()
    .default(sql`CURRENT_TIMESTAMP`),
});

/**
 * `newsletter` table: one row per email address.
 */
export const $newsletter = pgTable('newsletter', {
  id: serial('id').primaryKey(),
  // Required, and unique across the table.
  email: text('email').unique().notNull(),
  // Defaults to CURRENT_TIMESTAMP at insert time; nullable as declared.
  createdAt: timestamp('created_at', { mode: 'date' }).default(sql`CURRENT_TIMESTAMP`),
});