File size: 6,084 Bytes
e5f4e9a
 
 
 
b17a5c8
376916f
b17a5c8
9264459
 
 
 
 
 
 
 
 
 
 
e5f4e9a
 
 
 
9264459
 
 
 
c504a2c
e5f4e9a
 
2a808d7
9264459
 
e5f4e9a
2a808d7
e5f4e9a
347b211
e5f4e9a
 
 
 
 
 
 
 
347b211
e5f4e9a
 
 
9264459
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36be130
9264459
 
36be130
9264459
 
 
36be130
c4489ae
9264459
 
 
36be130
 
9264459
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eacf6f2
e5f4e9a
9264459
e5f4e9a
 
b17a5c8
 
 
 
 
 
 
 
e5f4e9a
 
 
 
 
376916f
 
 
 
 
 
6fb5de7
376916f
 
e5f4e9a
 
b17a5c8
e5f4e9a
 
 
 
 
 
 
 
 
 
 
 
 
 
b17a5c8
e5f4e9a
 
 
 
 
a4dc062
 
 
 
 
 
e5f4e9a
 
3148b2c
 
 
 
 
 
e5f4e9a
 
b85bb4c
e5f4e9a
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
import { searchWeb } from "$lib/server/websearch/searchWeb";
import { generateQuery } from "$lib/server/websearch/generateQuery";
import { parseWeb } from "$lib/server/websearch/parseWeb";
import { chunk } from "$lib/utils/chunk";
import { findSimilarSentences } from "$lib/server/sentenceSimilarity";
import { getWebSearchProvider } from "./searchWeb";
import { defaultEmbeddingModel, embeddingModels } from "$lib/server/embeddingModels";
import { WEBSEARCH_ALLOWLIST, WEBSEARCH_BLOCKLIST, ENABLE_LOCAL_FETCH } from "$env/static/private";

import type { Conversation } from "$lib/types/Conversation";
import type { MessageUpdate } from "$lib/types/MessageUpdate";
import type { Message } from "$lib/types/Message";
import type { WebSearch, WebSearchSource } from "$lib/types/WebSearch";
import type { Assistant } from "$lib/types/Assistant";

import { z } from "zod";
import JSON5 from "json5";
import { isURLLocal } from "../isURLLocal";

// Hard caps on how much web content is processed per search.
const MAX_N_PAGES_SCRAPE = 10; // at most this many result links are kept/scraped
const MAX_N_PAGES_EMBED = 5; // chunks from at most this many pages are embedded

// Allow/block lists arrive via env as JSON5 arrays of domain strings.
const listSchema = z.array(z.string()).default([]);

// Guard against an unset/empty env var: JSON5.parse("") throws, which would
// crash the server at module load before zod's .default([]) could apply.
const allowList = listSchema.parse(JSON5.parse(WEBSEARCH_ALLOWLIST || "[]"));
const blockList = listSchema.parse(JSON5.parse(WEBSEARCH_BLOCKLIST || "[]"));

/**
 * Runs a web search for the latest user message and gathers the most relevant
 * text passages to use as RAG context.
 *
 * Flow:
 *  1. If the assistant pins `allowedLinks`, use those pages directly
 *     (dropping local/private URLs unless ENABLE_LOCAL_FETCH === "true").
 *  2. Otherwise generate a search query, apply domain allow/block filters and
 *     query the configured search provider.
 *  3. Scrape and chunk the result pages, embed the chunks, and keep the top-K
 *     paragraphs most similar to the prompt as `contextSources`.
 *
 * @param conv conversation; its `embeddingModel` id selects the embedder
 * @param messages chat history — the last message's content is the prompt
 * @param updatePad callback streaming progress/error/source updates to the client
 * @param ragSettings optional per-assistant RAG restrictions (links/domains)
 * @returns the populated WebSearch object; on any failure an "error" update is
 *          emitted and the partially filled object is returned (never throws)
 */
export async function runWebSearch(
	conv: Conversation,
	messages: Message[],
	updatePad: (upd: MessageUpdate) => void,
	ragSettings?: Assistant["rag"]
) {
	const prompt = messages[messages.length - 1].content;
	const webSearch: WebSearch = {
		prompt,
		searchQuery: "",
		results: [],
		contextSources: [],
		createdAt: new Date(),
		updatedAt: new Date(),
	};

	// Single funnel for progress/error messages sent to the client.
	function appendUpdate(message: string, args?: string[], type?: "error" | "update") {
		updatePad({ type: "webSearch", messageType: type ?? "update", message, args });
	}

	try {
		// if the assistant specified direct links, skip the websearch
		if (ragSettings && ragSettings?.allowedLinks.length > 0) {
			appendUpdate("Using links specified in Assistant");

			let linksToUse = [...ragSettings.allowedLinks];

			if (ENABLE_LOCAL_FETCH !== "true") {
				// SSRF guard: drop links resolving to local/private addresses.
				// Unparseable URLs are treated as local and removed as well.
				const localLinks = await Promise.all(
					linksToUse.map(async (link) => {
						try {
							const url = new URL(link);
							return await isURLLocal(url);
						} catch (e) {
							return true;
						}
					})
				);

				linksToUse = linksToUse.filter((_, index) => !localLinks[index]);
			}

			webSearch.results = linksToUse.map((link) => {
				return { link, hostname: new URL(link).hostname, title: "", text: "" };
			});
		} else {
			webSearch.searchQuery = await generateQuery(messages);
			const searchProvider = getWebSearchProvider();
			appendUpdate(`Searching ${searchProvider}`, [webSearch.searchQuery]);

			// Build "site:" include filters from the assistant's allowed domains
			// plus the global allow list, OR-joined in a single pass. This fixes
			// the previous bug where the assistant filter string was glued onto
			// the global one with no separator (e.g. "site:a.comsite:b.com").
			if (ragSettings && ragSettings?.allowedDomains.length > 0) {
				appendUpdate("Filtering on specified domains");
			}
			const siteFilter = [...(ragSettings?.allowedDomains ?? []), ...allowList]
				.map((item) => "site:" + item)
				.join(" OR ");
			const blockFilter = blockList.map((item) => "-site:" + item).join(" ");

			// Prepend only the non-empty filter parts so no stray spaces remain.
			webSearch.searchQuery = [siteFilter, blockFilter, webSearch.searchQuery]
				.filter(Boolean)
				.join(" ");

			// Normalize provider results; entries with unparseable links become
			// null here and are filtered out below.
			const results = await searchWeb(webSearch.searchQuery);
			webSearch.results =
				(results.organic_results &&
					results.organic_results.map((el: { title?: string; link: string; text?: string }) => {
						try {
							const { title, link, text } = el;
							const { hostname } = new URL(link);
							return { title, link, hostname, text };
						} catch (e) {
							// Ignore Errors
							return null;
						}
					})) ??
				[];
		}

		webSearch.results = webSearch.results.filter((value) => value !== null);
		webSearch.results = webSearch.results
			.filter(({ link }) => !blockList.some((el) => link.includes(el))) // filter out blocklist links
			.slice(0, MAX_N_PAGES_SCRAPE); // limit to the first MAX_N_PAGES_SCRAPE links only

		// fetch the embedding model tied to this conversation, or the default
		const embeddingModel =
			embeddingModels.find((m) => m.id === conv.embeddingModel) ?? defaultEmbeddingModel;

		if (!embeddingModel) {
			throw new Error(`Embedding model ${conv.embeddingModel} not available anymore`);
		}

		let paragraphChunks: { source: WebSearchSource; text: string }[] = [];
		if (webSearch.results.length > 0) {
			appendUpdate("Browsing results");
			const promises = webSearch.results.map(async (result) => {
				const { link } = result;
				let text = result.text ?? "";
				if (!text) {
					try {
						text = await parseWeb(link);
						appendUpdate("Browsing webpage", [link]);
					} catch (e) {
						appendUpdate("Failed to parse webpage", [(e as Error).message, link], "error");
						// ignore errors — the page simply contributes no chunks
					}
				}
				const MAX_N_CHUNKS = 100;
				const texts = chunk(text, embeddingModel.chunkCharLength).slice(0, MAX_N_CHUNKS);
				return texts.map((t) => ({ source: result, text: t }));
			});
			// NOTE: all pages above are scraped, but only chunks from the first
			// MAX_N_PAGES_EMBED pages are kept for embedding.
			const nestedParagraphChunks = (await Promise.all(promises)).slice(0, MAX_N_PAGES_EMBED);
			paragraphChunks = nestedParagraphChunks.flat();
			if (!paragraphChunks.length) {
				throw new Error("No text found on the first 5 results");
			}
		} else {
			throw new Error("No results found for this search query");
		}

		appendUpdate("Extracting relevant information");
		const topKClosestParagraphs = 8;
		const texts = paragraphChunks.map(({ text }) => text);
		const indices = await findSimilarSentences(embeddingModel, prompt, texts, {
			topK: topKClosestParagraphs,
		});

		// Group the selected chunks by their source page, preserving rank order.
		for (const idx of indices) {
			const { source } = paragraphChunks[idx];
			const contextWithId = { idx, text: texts[idx] };
			const usedSource = webSearch.contextSources.find((cSource) => cSource.link === source.link);
			if (usedSource) {
				usedSource.context.push(contextWithId);
			} else {
				webSearch.contextSources.push({ ...source, context: [contextWithId] });
			}
		}
		updatePad({
			type: "webSearch",
			messageType: "sources",
			message: "sources",
			sources: webSearch.contextSources,
		});
	} catch (searchError) {
		// Errors are reported to the client rather than rethrown; the caller
		// receives whatever was accumulated so far.
		if (searchError instanceof Error) {
			appendUpdate("An error occurred", [JSON.stringify(searchError.message)], "error");
		}
	}

	return webSearch;
}