Spaces:
Running
Running
File size: 6,084 Bytes
e5f4e9a b17a5c8 376916f b17a5c8 9264459 e5f4e9a 9264459 c504a2c e5f4e9a 2a808d7 9264459 e5f4e9a 2a808d7 e5f4e9a 347b211 e5f4e9a 347b211 e5f4e9a 9264459 36be130 9264459 36be130 9264459 36be130 c4489ae 9264459 36be130 9264459 eacf6f2 e5f4e9a 9264459 e5f4e9a b17a5c8 e5f4e9a 376916f 6fb5de7 376916f e5f4e9a b17a5c8 e5f4e9a b17a5c8 e5f4e9a a4dc062 e5f4e9a 3148b2c e5f4e9a b85bb4c e5f4e9a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 |
import { searchWeb } from "$lib/server/websearch/searchWeb";
import { generateQuery } from "$lib/server/websearch/generateQuery";
import { parseWeb } from "$lib/server/websearch/parseWeb";
import { chunk } from "$lib/utils/chunk";
import { findSimilarSentences } from "$lib/server/sentenceSimilarity";
import { getWebSearchProvider } from "./searchWeb";
import { defaultEmbeddingModel, embeddingModels } from "$lib/server/embeddingModels";
import { WEBSEARCH_ALLOWLIST, WEBSEARCH_BLOCKLIST, ENABLE_LOCAL_FETCH } from "$env/static/private";
import type { Conversation } from "$lib/types/Conversation";
import type { MessageUpdate } from "$lib/types/MessageUpdate";
import type { Message } from "$lib/types/Message";
import type { WebSearch, WebSearchSource } from "$lib/types/WebSearch";
import type { Assistant } from "$lib/types/Assistant";
import { z } from "zod";
import JSON5 from "json5";
import { isURLLocal } from "../isURLLocal";
// Caps applied by runWebSearch: how many result links are kept for scraping,
// and how many scraped pages contribute chunks to the embedding step.
const MAX_N_PAGES_SCRAPE = 10 as const;
const MAX_N_PAGES_EMBED = 5 as const;

// Both global lists are JSON5-encoded arrays of strings read from the private env.
const listSchema = z.array(z.string()).default([]);

/** Decodes one JSON5-encoded env value and validates it against `listSchema`. */
const parseDomainList = (raw: string) => listSchema.parse(JSON5.parse(raw));

const allowList = parseDomainList(WEBSEARCH_ALLOWLIST);
const blockList = parseDomainList(WEBSEARCH_BLOCKLIST);
/**
 * Runs the web-search retrieval step for the latest message of a conversation.
 *
 * If the assistant pins explicit links (`ragSettings.allowedLinks`) those are used
 * directly; otherwise a search query is generated from the messages and sent to the
 * search provider. The resulting pages are then scraped, split into chunks, and the
 * chunks most similar to the prompt are grouped per source into `contextSources`.
 *
 * Errors raised inside the pipeline are not re-thrown: they are reported through
 * `updatePad` as an "error" update and the (partially filled) result is returned.
 *
 * @param conv - Conversation; only `embeddingModel` is read to pick the embedding model.
 * @param messages - Message history. The content of the last entry is used as the prompt,
 *   so the array must be non-empty (that read happens before the error handling).
 * @param updatePad - Callback receiving progress, error and final "sources" updates.
 * @param ragSettings - Optional assistant settings (`allowedLinks`, `allowedDomains`).
 * @returns The `WebSearch` record describing the query, the results and the context sources.
 */
export async function runWebSearch(
  conv: Conversation,
  messages: Message[],
  updatePad: (upd: MessageUpdate) => void,
  ragSettings?: Assistant["rag"]
) {
  const prompt = messages[messages.length - 1].content;
  const webSearch: WebSearch = {
    prompt,
    searchQuery: "",
    results: [],
    contextSources: [],
    createdAt: new Date(),
    updatedAt: new Date(),
  };

  function appendUpdate(message: string, args?: string[], type?: "error" | "update") {
    updatePad({ type: "webSearch", messageType: type ?? "update", message, args });
  }

  try {
    // if the assistant specified direct links, skip the websearch
    if (ragSettings && ragSettings.allowedLinks.length > 0) {
      appendUpdate("Using links specified in Assistant");

      // Each link is parsed exactly once. Malformed links are dropped in every mode
      // (previously they aborted the whole search when local fetch was enabled), and
      // local links are dropped unless ENABLE_LOCAL_FETCH is "true". A failing
      // locality check is treated as "local", i.e. the link is dropped.
      const candidates = await Promise.all(
        ragSettings.allowedLinks.map(async (link) => {
          try {
            const url = new URL(link);
            if (ENABLE_LOCAL_FETCH !== "true" && (await isURLLocal(url))) {
              return null;
            }
            return { link, hostname: url.hostname, title: "", text: "" };
          } catch (e) {
            return null;
          }
        })
      );
      webSearch.results = candidates.filter(
        (candidate): candidate is NonNullable<typeof candidate> => candidate !== null
      );
    } else {
      webSearch.searchQuery = await generateQuery(messages);
      const searchProvider = getWebSearchProvider();
      appendUpdate(`Searching ${searchProvider}`, [webSearch.searchQuery]);

      const allowedDomains = ragSettings?.allowedDomains ?? [];
      if (allowedDomains.length > 0) {
        appendUpdate("Filtering on specified domains");
      }

      // Prefix the query with the assistant and global site filters, if any.
      const filters = buildSiteFilters(allowedDomains);
      if (filters) {
        webSearch.searchQuery = filters + " " + webSearch.searchQuery;
      }

      const results = await searchWeb(webSearch.searchQuery);
      webSearch.results =
        results.organic_results?.map((el: { title?: string; link: string; text?: string }) => {
          try {
            const { title, link, text } = el;
            const { hostname } = new URL(link);
            return { title, link, hostname, text };
          } catch (e) {
            // Results whose link is not a valid URL are skipped.
            return null;
          }
        }) ?? [];
    }

    webSearch.results = webSearch.results.filter((value) => value !== null);
    webSearch.results = webSearch.results
      .filter(({ link }) => !blockList.some((el) => link.includes(el))) // filter out blocklist links
      .slice(0, MAX_N_PAGES_SCRAPE); // limit to first 10 links only

    // fetch the model
    const embeddingModel =
      embeddingModels.find((m) => m.id === conv.embeddingModel) ?? defaultEmbeddingModel;
    if (!embeddingModel) {
      throw new Error(`Embedding model ${conv.embeddingModel} not available anymore`);
    }

    if (webSearch.results.length === 0) {
      throw new Error("No results found for this search query");
    }

    appendUpdate("Browsing results");
    const MAX_N_CHUNKS = 100;
    const promises = webSearch.results.map(async (result) => {
      const { link } = result;
      // Reuse text already supplied with the result; otherwise scrape the page.
      let text = result.text ?? "";
      if (!text) {
        try {
          text = await parseWeb(link);
          appendUpdate("Browsing webpage", [link]);
        } catch (e) {
          // A page that fails to parse is reported and contributes no chunks.
          appendUpdate("Failed to parse webpage", [describeError(e), link], "error");
        }
      }
      const texts = chunk(text, embeddingModel.chunkCharLength).slice(0, MAX_N_CHUNKS);
      return texts.map((t) => ({ source: result, text: t }));
    });

    // All pages are scraped, but only the first MAX_N_PAGES_EMBED contribute chunks.
    const nestedParagraphChunks = (await Promise.all(promises)).slice(0, MAX_N_PAGES_EMBED);
    const paragraphChunks: { source: WebSearchSource; text: string }[] =
      nestedParagraphChunks.flat();
    if (!paragraphChunks.length) {
      throw new Error("No text found on the first 5 results");
    }

    appendUpdate("Extracting relevant information");
    const topKClosestParagraphs = 8;
    const texts = paragraphChunks.map(({ text }) => text);
    const indices = await findSimilarSentences(embeddingModel, prompt, texts, {
      topK: topKClosestParagraphs,
    });

    // Group the selected chunks by the page they came from.
    for (const idx of indices) {
      const { source } = paragraphChunks[idx];
      const contextWithId = { idx, text: texts[idx] };
      const usedSource = webSearch.contextSources.find((cSource) => cSource.link === source.link);
      if (usedSource) {
        usedSource.context.push(contextWithId);
      } else {
        webSearch.contextSources.push({ ...source, context: [contextWithId] });
      }
    }

    updatePad({
      type: "webSearch",
      messageType: "sources",
      message: "sources",
      sources: webSearch.contextSources,
    });
  } catch (searchError) {
    // Report every failure, including non-Error throwables, instead of dropping them.
    appendUpdate("An error occurred", [JSON.stringify(describeError(searchError))], "error");
  }

  return webSearch;
}

/**
 * Builds the `site:` / `-site:` prefix for a search query from the assistant's allowed
 * domains and the global allow/block lists. Non-empty groups are separated by a single
 * space so adjacent groups can never fuse into one token; returns "" when there is
 * nothing to filter on.
 */
function buildSiteFilters(allowedDomains: readonly string[]): string {
  const groups = [
    allowedDomains.map((item) => "site:" + item).join(" OR "),
    allowList.map((item) => "site:" + item).join(" OR "),
    blockList.map((item) => "-site:" + item).join(" "),
  ];
  return groups.filter((group) => group.length > 0).join(" ");
}

/** Extracts a printable message from any thrown value. */
function describeError(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}
|