chat-ui-energy

Paused

App Files Files Community

chat-ui-energy / src /lib /server /websearch /scrape /parser.ts

neven4

fix(scrape parser): Cannot read properties of 'undefined' during page parsing (#1274)

21054cc unverified over 1 year ago

raw

history blame

16.9 kB

	import type { SerializedHTMLElement } from "./types";

	interface DBSCANOptions<T> {
	dataset: T[];
	epsilon?: number;
	epsilonCompare?: (distance: number, epsilon: number) => boolean;
	minimumPoints?: number;
	distanceFunction: (a: T, b: T) => number;
	}

	export function spatialParser() {
	/**
	* Implementation for dbscan, inlined and migrated to typescript from https://github.com/cdxOo/dbscan (MIT License)
	*/
	const DBSCAN = <T>({
	dataset,
	epsilon = 1,
	epsilonCompare = (dist, e) => dist < e,
	minimumPoints = 2,
	distanceFunction,
	}: DBSCANOptions<T>) => {
	const visitedIndices: Record<number, boolean> = {};
	const isVisited = (i: number) => visitedIndices[i];
	const markVisited = (i: number) => {
	visitedIndices[i] = true;
	};

	const clusteredIndices: Record<number, boolean> = {};
	const isClustered = (i: number) => clusteredIndices[i];
	const markClustered = (i: number) => {
	clusteredIndices[i] = true;
	};

	const uniqueMerge = <U>(targetArray: U[], sourceArray: U[]) => {
	for (let i = 0; i < sourceArray.length; i += 1) {
	const item = sourceArray[i];
	if (targetArray.indexOf(item) < 0) {
	targetArray.push(item);
	}
	}
	};

	const findNeighbors = (index: number) => {
	const neighbors = [];
	for (let other = 0; other < dataset.length; other += 1) {
	const distance = distanceFunction(dataset[index], dataset[other]);
	if (epsilonCompare(distance, epsilon)) {
	neighbors.push(other);
	}
	}
	return neighbors;
	};

	const noise: number[] = [];
	const addNoise = (i: number) => noise.push(i);

	const clusters: number[][] = [];
	const createCluster = () => clusters.push([]) - 1;
	const addIndexToCluster = (c: number, i: number) => {
	clusters[c].push(i);
	markClustered(i);
	};

	const expandCluster = (c: number, neighbors: number[]) => {
	for (let i = 0; i < neighbors.length; i += 1) {
	const neighborIndex = neighbors[i];
	if (!isVisited(neighborIndex)) {
	markVisited(neighborIndex);

	const secondaryNeighbors = findNeighbors(neighborIndex);
	if (secondaryNeighbors.length >= minimumPoints) {
	uniqueMerge(neighbors, secondaryNeighbors);
	}
	}

	if (!isClustered(neighborIndex)) {
	addIndexToCluster(c, neighborIndex);
	}
	}
	};

	dataset.forEach((_, index) => {
	if (!isVisited(index)) {
	markVisited(index);

	const neighbors = findNeighbors(index);
	if (neighbors.length < minimumPoints) {
	addNoise(index);
	} else {
	const clusterIndex = createCluster();
	addIndexToCluster(clusterIndex, index);
	expandCluster(clusterIndex, neighbors);
	}
	}
	});

	return { clusters, noise };
	};

	// -----------
	// Scraping implementation

	const IgnoredTagsList = [
	"footer",
	"nav",
	"aside",
	"script",
	"style",
	"noscript",
	"form",
	"button",
	];
	const InlineTags = [
	"a",
	"abbrv",
	"span",
	"address",
	"time",
	"acronym",
	"strong",
	"b",
	"br",
	"sub",
	"sup",
	"tt",
	"var",
	"em",
	"i",
	];

	type ReadableNode = HTMLElement;
	type NodeWithRect = {
	node: ReadableNode;
	rect: DOMRect;
	};

	const isOnlyChild = (node: Node) => {
	if (!node.parentElement) return true;
	if (node.parentElement.nodeName === "body") return false;
	if (node.parentElement.childNodes.length === 1) return true;
	return false;
	};

	const hasValidInlineParent = (node: Node) => {
	return node.parentElement && !node.parentElement.matches("div, section, article, main, body ");
	};

	const hasValidParent = (node: Node) => {
	return node.parentElement && !node.parentElement.isSameNode(document.body);
	};

	const possibleCodeParents = Array.from(document.querySelectorAll("pre, p"));
	const possibleTableParents = Array.from(document.querySelectorAll("table"));
	const possibleListParents = Array.from(document.querySelectorAll("ul, ol"));
	/**
	* We want to find the highest parent of text node in the cluster.
	* For example in this case: <p><span>Text here</span></p>
	* the P tag is highest parent.
	*/
	const findHighestDirectParentOfReadableNode = (node: Node): HTMLElement => {
	// go up the tree until the parent is no longer an only child
	let parent = node.parentElement;
	// if the parent is an inline tag, then go up one more level
	while (
	parent &&
	hasValidInlineParent(parent) &&
	InlineTags.includes(parent?.tagName.toLowerCase())
	) {
	parent = parent.parentElement;
	}

	while (parent && isOnlyChild(parent)) {
	if (!hasValidParent(parent)) break;
	parent = parent.parentElement;
	}

	if (!parent) {
	throw new Error(
	"disconnected node found, this should not really be possible when traversing through the dom"
	);
	}

	// if the parent is a span, code or div tag check if there is a pre tag or p tag above it
	if (["span", "code", "div"].includes(parent.nodeName.toLowerCase())) {
	const hasParent = possibleCodeParents.find((tag) => tag.contains(parent)) as HTMLElement;
	if (hasParent) {
	parent = hasParent;
	}
	}

	// if the parent is a li tag check if there is a ul or ol tag above it
	if (parent.nodeName.toLowerCase() === "li") {
	const hasParent = possibleListParents.find((tag) => tag.contains(parent)) as HTMLElement;
	if (hasParent) {
	parent = hasParent;
	}
	}

	// if the parent is a td, th, tr tag check if there is a table tag above it
	if (["td", "th", "tr"].includes(parent.nodeName.toLowerCase())) {
	const hasParent = possibleTableParents.find((tag) => tag.contains(parent)) as HTMLElement;
	if (hasParent) {
	parent = hasParent;
	}
	}

	return parent;
	};
	const barredNodes = Array.from(document.querySelectorAll(IgnoredTagsList.join(",")));

	const doesNodePassHeuristics = (node: Node) => {
	if ((node.textContent ?? "").trim().length < 10) {
	return false;
	}

	const parentNode = findHighestDirectParentOfReadableNode(node);

	if (parentNode && parentNode instanceof Element) {
	if (
	!parentNode.checkVisibility({
	checkOpacity: true,
	checkVisibilityCSS: true,
	})
	)
	return false;

	const rect = parentNode.getBoundingClientRect();
	// elements that are readable usually don't have really small height or width
	if (rect.width < 4 \|\| rect.height < 4) {
	return false;
	}
	}

	if (parentNode && parentNode instanceof Element) {
	if (barredNodes.some((barredNode) => barredNode.contains(parentNode))) {
	return false;
	}
	}

	return true;
	};

	const getAllReadableNodes = (): NodeWithRect[] => {
	if (!document.body) throw new Error("Page failed to load");
	const treeWalker = document.createTreeWalker(document.body, NodeFilter.SHOW_TEXT, {
	acceptNode(node) {
	if (doesNodePassHeuristics(node)) {
	return NodeFilter.FILTER_ACCEPT;
	} else {
	return NodeFilter.FILTER_SKIP;
	}
	},
	});

	const readableNodes = [];

	while (treeWalker.nextNode()) {
	readableNodes.push(treeWalker.currentNode as ReadableNode);
	}

	/*
	* <table><p>hello</p><p>world</p></table>
	* table is already included in the parent of the first p tag
	*/

	const parentsForReadableNodes = readableNodes.map(findHighestDirectParentOfReadableNode);
	const listWithOnlyParents: HTMLElement[] = [];
	// find unique nodes in the parent list, a unique node is a node that is not a child of any other node in the list
	for (let i = 0; i < parentsForReadableNodes.length; i++) {
	const node = parentsForReadableNodes[i];
	const hasParentInList = parentsForReadableNodes.find((otherNode, idx) => {
	if (i === idx) return false;
	return otherNode.contains(node);
	});
	listWithOnlyParents.push(hasParentInList ? hasParentInList : node);
	}

	const uniqueParents = Array.from(new Set(listWithOnlyParents));

	return uniqueParents.map((node) => {
	return {
	node,
	rect: node.getBoundingClientRect(),
	};
	});
	};

	const distanceFunction = (a: NodeWithRect, b: NodeWithRect) => {
	// we make two assumptions here which are fine to make for rects returned from getBoundingClientRect
	// 1. rects are upright and not rotated
	// 2. If two rects intersect, we assume distance to be 0
	let dx = 0;
	let dy = 0;
	const rect1 = a.rect;
	const rect2 = b.rect;
	// Calculate the horizontal distance
	if (rect1.x + rect1.width < rect2.x) {
	dx = rect2.x - (rect1.x + rect1.width);
	} else if (rect2.x + rect2.width < rect1.x) {
	dx = rect1.x - (rect2.x + rect2.width);
	}

	// Calculate the vertical distance
	if (rect1.y + rect1.height < rect2.y) {
	dy = rect2.y - (rect1.y + rect1.height);
	} else if (rect2.y + rect2.height < rect1.y) {
	dy = rect1.y - (rect2.y + rect2.height);
	}

	const distance = Math.sqrt(dx * dx + dy * dy);
	// Return the Euclidean distance
	return distance;
	};
	/**
	* Clusters nodes using dbscan
	*/
	const clusterReadableNodes = (nodes: NodeWithRect[]) => {
	const { clusters } = DBSCAN({
	dataset: nodes,
	epsilon: 28,
	minimumPoints: 1,
	distanceFunction,
	});

	return clusters;
	};

	const totalTextLength = (cluster: number[]) => {
	return cluster
	.map((t) => readableNodes[t].node.innerText?.replaceAll(/ {2}\|\r\n\|\n\|\r/gm, ""))
	.join("").length;
	};

	const approximatelyEqual = (a: number, b: number, epsilon = 1) => {
	return Math.abs(a - b) < epsilon;
	};

	const getClusterBounds = (cluster: number[]) => {
	const leftMostPoint = Math.min(...cluster.map((c) => readableNodes[c].rect.x));
	const topMostPoint = Math.min(...cluster.map((c) => readableNodes[c].rect.y));
	const rightMostPoint = Math.max(
	...cluster.map((c) => readableNodes[c].rect.x + readableNodes[c].rect.width)
	);
	const bottomMostPoint = Math.max(
	...cluster.map((c) => readableNodes[c].rect.y + readableNodes[c].rect.height)
	);
	return {
	// left most element
	x: leftMostPoint,
	y: topMostPoint,
	width: rightMostPoint - leftMostPoint,
	height: bottomMostPoint - topMostPoint,
	};
	};

	const round = (num: number, decimalPlaces = 2) => {
	const factor = Math.pow(10, decimalPlaces);
	return Math.round(num * factor) / factor;
	};

	/** minimum distance to center of the screen */
	const clusterCentrality = (cluster: number[]) => {
	const bounds = getClusterBounds(cluster);
	const centerOfScreen = window.innerWidth / 2;
	// the cluster contains the center of the screen
	if (bounds.x < centerOfScreen && bounds.x + bounds.width > centerOfScreen) {
	return 0;
	}

	// the cluster is to the left of the screen
	if (bounds.x + bounds.width < centerOfScreen) {
	return centerOfScreen - (bounds.x + bounds.width);
	}

	// the cluster is to the right of the screen
	return bounds.x - centerOfScreen;
	};
	/** measure of text share that belong to the cluster */
	const percentageTextShare = (cluster: number[], totalLength: number) => {
	// apply an exponentially increasing penalty for centrality per 100 pixels distance from center

	return round((totalTextLength(cluster) / totalLength) * 100);
	};

	const shouldMergeClusters = (clusterA: number[], clusterB: number[]) => {
	const clusterABounds = getClusterBounds(clusterA);
	const clusterBBounds = getClusterBounds(clusterB);

	// A cluster is horizontally aligned if the x and width are roughly equal
	const isHorizontallyAligned =
	approximatelyEqual(clusterABounds.x, clusterBBounds.x, 40) &&
	approximatelyEqual(clusterABounds.width, clusterBBounds.width, 40);

	if (!isHorizontallyAligned) return false;

	// check the y gap between the clusters
	const higherCluster = clusterABounds.y < clusterBBounds.y ? clusterABounds : clusterBBounds;
	const lowerCluster = clusterABounds.y < clusterBBounds.y ? clusterBBounds : clusterABounds;
	const yGap = lowerCluster.y - (higherCluster.y + higherCluster.height);

	if (approximatelyEqual(yGap, 0, 100)) return true;
	};

	const findCriticalClusters = (clusters: number[][]) => {
	// merge the clusters that have similar widths and x position

	let i = 0;
	while (i < clusters.length) {
	const cluster = clusters[i];
	for (let j = i + 1; j < clusters.length; j++) {
	const otherCluster = clusters[j];
	if (shouldMergeClusters(cluster, otherCluster)) {
	cluster.push(...otherCluster);
	clusters.splice(j, 1);
	j -= 1;
	}
	}

	i++;
	}

	const totalText = totalTextLength(clusters.flat());

	// sort in descending order of text share
	const clusterWithMetrics = clusters.map((cluster) => {
	const centrality = clusterCentrality(cluster);
	return {
	cluster,
	centrality,
	percentageTextShare: percentageTextShare(cluster, totalText),
	};
	});

	// if there is a dominant cluster with more than 60% text share, return that
	const dominantCluster = clusterWithMetrics[0]?.percentageTextShare > 60;
	if (dominantCluster) return [clusterWithMetrics[0].cluster];

	// clusters are sorted by text share after applying a penalty for centrality
	const sortedClusters = clusterWithMetrics.sort((a, b) => {
	const penaltyForA = Math.pow(0.9, a.centrality / 100);
	const penaltyForB = Math.pow(0.9, b.centrality / 100);
	const adjustedTextShareA = a.percentageTextShare * penaltyForA;
	const adjustedTextShareB = b.percentageTextShare * penaltyForB;

	return adjustedTextShareB - adjustedTextShareA;
	});

	// find all clusters that are similar to the largest cluster in terms of text share
	// and see if they are enough to cover at least 60% of the text share
	const largeTextShareClusters = sortedClusters.filter((c) =>
	approximatelyEqual(c.percentageTextShare, sortedClusters[0]?.percentageTextShare, 10)
	);

	const totalTextShareOfLargeClusters = largeTextShareClusters.reduce(
	(acc, cluster) => acc + cluster.percentageTextShare,
	0
	);

	if (totalTextShareOfLargeClusters > 60) {
	return largeTextShareClusters.map((c) => c.cluster);
	}

	// choose clusters till the text share is greater than 60%
	let totalTextShare = 0;
	const criticalClusters = [];
	for (const cluster of sortedClusters) {
	/** Ignore clusters with less than 2%*/
	if (cluster.percentageTextShare < 2) continue;
	if (totalTextShare > 60) break;
	criticalClusters.push(cluster.cluster);
	totalTextShare += cluster.percentageTextShare;
	}

	// if the total text share is less than 60% then return an empty array
	// as this website should not be particularly useful for the web search anyways
	// this should almost never happen on structured website with a lot of text
	if (totalTextShare < 60) {
	return [];
	}

	return criticalClusters;
	};

	const allowListedAttributes = ["href", "src", "alt", "title", "class", "id"];
	function serializeHTMLElement(node: Element): SerializedHTMLElement {
	return {
	tagName: node.tagName.toLowerCase(),
	attributes: allowListedAttributes.reduce((acc, attr) => {
	const value = node.getAttribute(attr);
	if (value) {
	acc[attr] = value;
	}
	return acc;
	}, {} as Record<string, string>),
	content: Array.from(node.childNodes).map(serializeNode).filter(Boolean),
	};
	}

	function serializeNode(node: Node): SerializedHTMLElement \| string {
	if (node.nodeType === 1) return serializeHTMLElement(node as Element);
	else if (node.nodeType === 3) return node.textContent ?? "";
	else return "";
	}

	function getPageMetadata(): {
	title: string;
	siteName?: string;
	author?: string;
	description?: string;
	createdAt?: string;
	updatedAt?: string;
	} {
	const title = document.title ?? "";
	const siteName =
	document.querySelector("meta[property='og:site_name']")?.getAttribute("content") ?? undefined;
	const author =
	document.querySelector("meta[name='author']")?.getAttribute("content") ?? undefined;
	const description =
	document.querySelector("meta[name='description']")?.getAttribute("content") ??
	document.querySelector("meta[property='og:description']")?.getAttribute("content") ??
	undefined;
	const createdAt =
	document.querySelector("meta[property='article:published_time']")?.getAttribute("content") ??
	document.querySelector("meta[name='date']")?.getAttribute("content") ??
	undefined;
	const updatedAt =
	document.querySelector("meta[property='article:modified_time']")?.getAttribute("content") ??
	undefined;

	return { title, siteName, author, description, createdAt, updatedAt };
	}

	const readableNodes = getAllReadableNodes();
	const clusters = clusterReadableNodes(readableNodes);

	const criticalClusters = findCriticalClusters(clusters);

	// filter readable nodes using the above information as well as heuristics
	const filteredNodes = readableNodes.filter((_, idx) => {
	return criticalClusters.some((cluster) => {
	return cluster.includes(idx);
	});
	});

	const elements = filteredNodes
	.filter(
	(node, idx, nodes) => !nodes.slice(idx + 1).some((otherNode) => node.node === otherNode.node)
	)
	.map<SerializedHTMLElement>(({ node }) => serializeHTMLElement(node));
	const metadata = getPageMetadata();
	return { ...metadata, elements };
	}