File size: 3,359 Bytes
b17a5c8
9264459
 
 
 
2c00ea8
9264459
 
2c00ea8
 
 
 
e5f4e9a
2c00ea8
 
e5f4e9a
2c00ea8
 
 
 
 
c504a2c
e5f4e9a
 
2a808d7
9264459
 
2c00ea8
2a808d7
2c00ea8
 
 
e5f4e9a
 
b17a5c8
 
 
2c00ea8
b17a5c8
 
2c00ea8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e5f4e9a
 
2c00ea8
e5f4e9a
2c00ea8
 
 
 
 
 
 
3148b2c
 
 
 
2c00ea8
3148b2c
2c00ea8
 
 
 
 
 
 
 
 
 
 
 
e5f4e9a
2c00ea8
 
 
 
 
 
 
 
 
 
 
e5f4e9a
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import { defaultEmbeddingModel, embeddingModels } from "$lib/server/embeddingModels";

import type { Conversation } from "$lib/types/Conversation";
import type { MessageUpdate } from "$lib/types/MessageUpdate";
import type { Message } from "$lib/types/Message";
import type { WebSearch, WebSearchScrapedSource } from "$lib/types/WebSearch";
import type { Assistant } from "$lib/types/Assistant";

import { search } from "./search/search";
import { scrape } from "./scrape/scrape";
import { findContextSources } from "./embed/embed";
import { removeParents } from "./markdown/tree";

/** Upper bound on how many search result pages are handed to the scraper. */
const MAX_N_PAGES_TO_SCRAPE = 8;

/** Upper bound on how many non-empty scraped pages are kept for the embedding step. */
const MAX_N_PAGES_TO_EMBED = 5;

/** Signature of the progress reporter that is passed to the search and scrape steps. */
export type AppendUpdate = (message: string, args?: string[], type?: "error" | "update") => void;

/**
 * Builds an {@link AppendUpdate} bound to `updatePad`.
 *
 * Every call is wrapped into a "webSearch" update and forwarded to `updatePad`;
 * `messageType` falls back to "update" when no `type` is supplied.
 */
function makeAppendUpdate(updatePad: (upd: MessageUpdate) => void): AppendUpdate {
	return function appendUpdate(message, args, type) {
		const messageType = type ?? "update";
		// Key order and the (possibly undefined) `args` key are kept as-is for consumers.
		return updatePad({ type: "webSearch", messageType, message, args });
	};
}

/**
 * Runs the web-search pipeline for a conversation: searches the web, scrapes the result
 * pages, then picks the context sources relevant to the prompt.
 *
 * Progress and failures are reported through `updatePad` as "webSearch" updates. Any error
 * raised inside the pipeline is caught, logged, reported as an "error" update and converted
 * into an empty result instead of rejecting the returned promise.
 *
 * @param conv - Conversation whose `embeddingModel` id selects the embedding model;
 *   `defaultEmbeddingModel` is used when no model with that id is found.
 * @param messages - Messages forwarded to the search step; the content of the last one is
 *   used as the prompt.
 * @param updatePad - Callback receiving every update emitted while searching.
 * @param ragSettings - Optional assistant RAG settings forwarded to the search step.
 * @returns The web search record. On failure `searchQuery` is an empty string and both
 *   `results` and `contextSources` are empty arrays.
 */
export async function runWebSearch(
	conv: Conversation,
	messages: Message[],
	updatePad: (upd: MessageUpdate) => void,
	ragSettings?: Assistant["rag"]
): Promise<WebSearch> {
	// Read the last message defensively: with an empty `messages` array a direct `.content`
	// access would throw before the try block and bypass the error reporting below.
	const lastMessage: Message | undefined = messages[messages.length - 1];
	const prompt = lastMessage ? lastMessage.content : "";
	const createdAt = new Date();
	const updatedAt = new Date();
	const appendUpdate = makeAppendUpdate(updatePad);

	try {
		if (!lastMessage) {
			throw Error("No message available to run the web search on");
		}

		const embeddingModel =
			embeddingModels.find((m) => m.id === conv.embeddingModel) ?? defaultEmbeddingModel;
		if (!embeddingModel) {
			throw Error(`Embedding model ${conv.embeddingModel} not available anymore`);
		}

		// Search the web
		const { searchQuery, pages } = await search(messages, ragSettings, appendUpdate);
		if (pages.length === 0) throw Error("No results found for this search query");

		// Scrape pages
		appendUpdate("Browsing search results");

		const allScrapedPages = await Promise.all(
			pages
				.slice(0, MAX_N_PAGES_TO_SCRAPE)
				.map(scrape(appendUpdate, embeddingModel.chunkCharLength))
		);
		// Drop failed scrapes and pages without any markdown content, then cap what gets embedded.
		const scrapedPages = allScrapedPages
			.filter((p): p is WebSearchScrapedSource => Boolean(p))
			.filter((p) => p.page.markdownTree.children.length > 0)
			.slice(0, MAX_N_PAGES_TO_EMBED);

		if (!scrapedPages.length) {
			throw Error(`No text found in the first ${MAX_N_PAGES_TO_SCRAPE} results`);
		}

		// Chunk the text of each of the elements and find the most similar chunks to the prompt
		appendUpdate("Extracting relevant information");
		const rawContextSources = await findContextSources(scrapedPages, prompt, embeddingModel);
		// Pass each markdown tree through `removeParents` before it is emitted and returned.
		const contextSources = rawContextSources.map((source) => ({
			...source,
			page: { ...source.page, markdownTree: removeParents(source.page.markdownTree) },
		}));
		updatePad({
			type: "webSearch",
			messageType: "sources",
			message: "sources",
			sources: contextSources,
		});

		return {
			prompt,
			searchQuery,
			results: scrapedPages.map(({ page, ...source }) => ({
				...source,
				page: { ...page, markdownTree: removeParents(page.markdownTree) },
			})),
			contextSources,
			createdAt,
			updatedAt,
		};
	} catch (searchError) {
		// Failures are reported to the client and turned into an empty result rather than rethrown.
		const message = searchError instanceof Error ? searchError.message : String(searchError);
		console.error(message);
		appendUpdate("An error occurred", [JSON.stringify(message)], "error");
		return {
			prompt,
			searchQuery: "",
			results: [],
			contextSources: [],
			createdAt,
			updatedAt,
		};
	}
}