import logging
from enum import Enum
import tiktoken
import re
from typing import Any, Optional, Union, Collection, AbstractSet, Literal, List
from langchain.text_splitter import TextSplitter
import random
import string
from itertools import chain
import json
from lpm_kernel.configs.logging import get_train_process_logger
logger = get_train_process_logger()
class IntentType(Enum):
Emotion = "Emotion"
Knowledge = "Knowledge"
def select_language_desc(
preferred_language,
default_desc="Identify the language of the provided Hint. Your response must be in the same language.",
):
custom_desc = "You must respond in {}."
if isinstance(preferred_language, str) and "/" in preferred_language:
        native, es = preferred_language.split("/", 1)
logging.info(f"Native: {native}, ES: {es}")
return custom_desc.format(es)
else:
        logging.warning(
            "preferred_language is not in the correct format. It should be 'native/es'. Falling back to the default description."
        )
return default_desc
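# Illustrative usage (a sketch; assumes the 'native/es' convention where the
# second field names the language the model should reply in):
#   >>> select_language_desc("Chinese/English")
#   'You must respond in English.'
#   >>> select_language_desc("English")  # no '/': falls back to default_desc
#   'Identify the language of the provided Hint. Your response must be in the same language.'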
def cal_upperbound(
model_limit: int = 4096,
generage_limit: int = 512,
tolerance: int = 500,
raw: str = "",
model_name: str = "gpt-3.5-turbo",
) -> int:
"""
:param model_limit: Maximum token count for the underlying model call
:param tolerance: Error tolerance buffer
:param raw: system prompt and raw content
:return:
"""
if model_name is not None:
if model_name in tiktoken.model.MODEL_TO_ENCODING:
enc = tiktoken.encoding_for_model(model_name)
logging.info(f"Successfully initialized tokenizer for model: {model_name}")
else:
enc = tiktoken.get_encoding("cl100k_base")
logging.warning(f"Model '{model_name}' doesn't have a corresponding tokenizer, falling back to default: cl100k_base")
    else:
        enc = tiktoken.get_encoding("cl100k_base")
        logging.info("No model specified, using default tokenizer: cl100k_base")
    raw_token = len(enc.encode(raw))
    upper_bound = model_limit - raw_token - tolerance - generage_limit
    if upper_bound < 0:
        logging.warning(f"raw content is too long: {raw_token} tokens")
        return 0
return upper_bound
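# Worked example (a sketch; assumes "hello" encodes to a single token under the
# gpt-3.5-turbo tokenizer): 4096 - 1 (raw) - 500 (tolerance) - 512 (generation) = 3083
#   >>> cal_upperbound(model_limit=4096, raw="hello")
#   3083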
def equidistant_filter(chunks, separator, filtered_chunks_n=6):
    # Sample chunks at equal intervals from the front, then append the last two chunks (joined by the separator) as one trailing element
gap = (len(chunks) - 2) / (filtered_chunks_n - 2)
indexes = [
int(gap * i)
for i in range(int(len(chunks) / gap) + 1)
if (gap * i < len(chunks) - 2)
]
filtered_chunks = [chunks[i] for i in indexes]
filtered_chunks.append(separator.join(chunks[-2:]))
return filtered_chunks
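# Illustrative usage (a sketch): with 10 chunks and filtered_chunks_n=6,
# gap = (10 - 2) / 4 = 2.0, so chunks 0, 2, 4, 6 are sampled and the last two
# chunks are joined by the separator and appended as one trailing element.
#   >>> equidistant_filter([str(i) for i in range(10)], "-")
#   ['0', '2', '4', '6', '8-9']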
def tab_or_space_replacement(match):
# If there is a tab character in the matched string, replace it with a single tab, otherwise replace it with a single space
return "\t" if "\t" in match.group() else " "
def text_filter(text: str) -> str:
pattern_tab_space = "[ \t]{3,}"
pattern_wordwrap = "[\n\f\r\v]{3,}"
# Replace when encountering three or more spaces or tabs
replaced_text = re.sub(pattern_tab_space, tab_or_space_replacement, text)
# When there are multiple consecutive \n (newline), \f (form feed), \r (carriage return), \v (vertical tab), replace them with 2 original newlines
replaced_text = re.sub(pattern_wordwrap, "\n\n", replaced_text)
return replaced_text
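# Illustrative usage (a sketch): runs of 3+ spaces/tabs collapse to one space
# (or one tab), and runs of 3+ newline-like characters collapse to two newlines.
#   >>> text_filter("a    b\n\n\n\nc")
#   'a b\n\nc'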
ALLOW_SPECIAL_TOKEN = {"<|endofprompt|>", "<|endoftext|>"}
def find_sublist_indices(main_list, sublist):
indices = []
length = len(sublist)
for i in range(len(main_list) - length + 1):
if main_list[i : i + length] == sublist:
indices.append((i, i + length))
return indices
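# Illustrative usage (a sketch): returns end-exclusive (start, end) pairs for
# every occurrence of the sublist.
#   >>> find_sublist_indices([1, 2, 3, 1, 2], [1, 2])
#   [(0, 2), (3, 5)]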
class TokenTextSplitter(TextSplitter):
"""Implementation of splitting text that looks at tokens."""
def __init__(
self,
encoding_name: str = "cl100k_base",
model_name: Optional[str] = None,
allowed_special: Union[Literal["all"], AbstractSet[str]] = ALLOW_SPECIAL_TOKEN,
disallowed_special: Union[Literal["all"], Collection[str]] = "all",
**kwargs: Any,
):
"""Create a new TextSplitter."""
super().__init__(**kwargs)
        try:
            import tiktoken
        except ImportError:
            raise ValueError(
                "Could not import tiktoken python package. "
                "This is needed for TokenTextSplitter. "
                "Please install it with `pip install tiktoken`."
            )
        # create a tiktoken encoder instance
if model_name is not None:
if model_name in tiktoken.model.MODEL_TO_ENCODING:
enc = tiktoken.encoding_for_model(model_name)
logging.info(f"Successfully initialized tokenizer for model: {model_name}")
else:
enc = tiktoken.get_encoding(encoding_name)
logging.warning(f"Model '{model_name}' doesn't have a corresponding tokenizer, falling back to default: {encoding_name}")
else:
enc = tiktoken.get_encoding(encoding_name)
logging.info(f"No model specified, using default tokenizer: {encoding_name}")
self._tokenizer = enc
self._allowed_special = allowed_special
self._disallowed_special = disallowed_special
def split_text(self, text: str) -> List[str]:
"""Split incoming text and return chunks."""
# Filter content with a large number of whitespace characters in the input text to increase the proportion of effective content within chunks
text = text_filter(text)
splits = []
input_ids = self._tokenizer.encode(
text,
allowed_special=self._allowed_special,
disallowed_special=self._disallowed_special,
)
start_idx = 0
while start_idx < len(input_ids):
cur_idx = min(start_idx + self._chunk_size, len(input_ids))
chunk_ids = input_ids[start_idx:cur_idx]
s = self._tokenizer.decode(chunk_ids).strip()
if s:
s = self._cut_meaningless_head_tail(s)
if s:
splits.append(s)
start_idx += self._chunk_size - self._chunk_overlap
logging.debug("finished split_text(): %s splits", len(splits))
return splits
def _cut_meaningless_head_tail(self, text: str) -> str:
# Only split when there are multiple newlines, as parsing of PDF/Word often contains false newlines
sentences = re.split(r"\. |! |\? |。|!|?|\n+ *\n+", text)
if len(sentences) < 2:
return text
head = sentences[0]
body = ". ".join(sentences[1:-1])
tail = sentences[-1]
        head_len = len(
            self._tokenizer.encode(
                head,
                allowed_special=self._allowed_special,
                disallowed_special=self._disallowed_special,
            )
        )
body_len = len(
self._tokenizer.encode(
body,
allowed_special=self._allowed_special,
disallowed_special=self._disallowed_special,
)
)
tail_len = len(
self._tokenizer.encode(
tail,
allowed_special=self._allowed_special,
disallowed_special=self._disallowed_special,
)
)
        parts = []
        # Use length to roughly estimate whether the head/tail carries meaningful content,
        # and discard it if not; rough thresholds: about 20 tokens or 30 characters
if head_len >= 20 or len(head) >= 30:
parts.append(head)
if body_len > 0:
parts.append(body)
if tail_len >= 20 or len(tail) >= 30:
parts.append(tail)
res = "\n".join(parts)
logger.info(
"_cut_meaningless_tail() removes redundant sentence tails from chunks, before cut: %s characters, after cut: %s characters",
len(text),
len(res),
)
return res
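# Illustrative usage (a sketch; chunk_size and chunk_overlap are inherited from
# langchain's TextSplitter, and `long_document_text` is a placeholder):
#   splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=64)
#   chunks = splitter.split_text(long_document_text)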
def chunk_filter(
chunks, filter, filtered_chunks_n=6, separator="\n", spacer="\n……\n……\n……\n"
):
if len(chunks) <= filtered_chunks_n:
return separator.join(chunks)
return spacer.join(filter(chunks, separator, filtered_chunks_n))
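# Illustrative usage (a sketch): few chunks are joined as-is; many chunks are
# sampled by the filter and joined with the spacer to mark elided content.
#   >>> chunk_filter(["a", "b"], equidistant_filter)
#   'a\nb'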
def get_safe_content_turncate(content, model_name="gpt-3.5-turbo", max_tokens=3300):
    if model_name is not None:
        if model_name in tiktoken.model.MODEL_TO_ENCODING:
            enc = tiktoken.encoding_for_model(model_name)
            logging.info(f"Successfully initialized tokenizer for model: {model_name}")
        else:
            enc = tiktoken.get_encoding("cl100k_base")
            logging.warning(f"Model '{model_name}' doesn't have a corresponding tokenizer, falling back to default: cl100k_base")
    else:
        enc = tiktoken.get_encoding("cl100k_base")
        logging.info("No model specified, using default tokenizer: cl100k_base")
    # Encode once and reuse the token list
    tokens = enc.encode(content)
    logging.warning(
        "get_safe_content_turncate(): current model maximum input length is %s, current input length is %s",
        max_tokens,
        len(tokens),
    )
    if len(tokens) > max_tokens:
        content = enc.decode(tokens[:max_tokens])
    return content
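# Illustrative usage (a sketch; `very_long_prompt` is a placeholder and the
# exact cut point depends on the tokenizer):
#   safe = get_safe_content_turncate(very_long_prompt, max_tokens=3300)
#   # `safe` decodes to at most 3300 tokens of the original content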
class DataType(Enum):
DOCUMENT = "DOCUMENT"
WEBSITE = "WEBSITE"
IMAGE = "IMAGE"
TABLE = "TABLE"
AUDIO = "AUDIO"
TEXT = "TEXT"
@staticmethod
def extra_values_map():
return {
"SHORT_AUDIO": "AUDIO",
}
@classmethod
def _missing_(cls, value):
# Try to find the corresponding primary key value from the extra value mapping
extra_map = cls.extra_values_map()
if value in extra_map:
value = extra_map[value]
return cls.__members__.get(value)
# If not found, return DOCUMENT by default
logging.error("DataType._missing_(): Could not find corresponding DataType enum value: %s", value)
return cls.DOCUMENT
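# Illustrative usage (a sketch): unknown values are resolved through _missing_.
#   >>> DataType("SHORT_AUDIO") is DataType.AUDIO  # mapped via extra_values_map
#   True
#   >>> DataType("UNKNOWN") is DataType.DOCUMENT   # unmapped values fall back
#   True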
def get_urls(text):
    url_arr = []
    if not text:
        return url_arr
    pattern = re.compile(
        r"(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;\u4e00-\u9fa5]+[-A-Za-z0-9+&@#/%=~_|]"
    )
    for match in pattern.finditer(text):
        url_arr.append(match.group())
    sorted_url_arr = sorted(set(url_arr), key=len, reverse=True)
    return sorted_url_arr
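# Illustrative usage (a sketch): results are deduplicated and sorted longest
# first, so encode_urls replaces longer URLs before any URL that is their prefix.
#   >>> get_urls("see https://example.com/a and http://example.com")
#   ['https://example.com/a', 'http://example.com']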
def get_random_string(s_length: int) -> str:
    # Generate a random string of ASCII letters and digits
    letters = string.ascii_letters + string.digits
    return "".join(random.choice(letters) for _ in range(s_length))
def get_random_strings(n: int, s_length: int) -> List[str]:
unique_strings = set()
while len(unique_strings) < n:
unique_strings.add(get_random_string(s_length))
return list(unique_strings)
def encode_urls(text, random_string_len: int = 16):
urls = get_urls(text)
random_strings = get_random_strings(len(urls), random_string_len)
url2string_dict = dict(zip(urls, random_strings))
string2url_dict = dict(zip(random_strings, urls))
for url, random_string in url2string_dict.items():
text = text.replace(url, random_string)
return text, string2url_dict
def decode_urls(text, string2url_dict):
for random_string, url in string2url_dict.items():
text = text.replace(random_string, url)
return text
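# Illustrative round trip (a sketch; the placeholder strings are random, so only
# the round-trip property is stable):
#   encoded, mapping = encode_urls("read https://example.com/doc first")
#   assert decode_urls(encoded, mapping) == "read https://example.com/doc first"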
class TokenParagraphSplitter(TextSplitter):
"""For business data characteristics, perform some additional processing. This includes:
1. Complete fragments as independent chunks help improve information focus in each chunk. Complete fragments are mainly determined by period+newline.
2. When complete fragments are too long, split them into sentences and combine sentences into chunks that meet window size limits
3. If a sentence is too long, split it directly by token granularity
"""
line_break_characters = ["\n", "\f", "\r", "\v"]
whitespace_characters = [" ", "\t"]
sentence_terminators = [
".",
"!",
"?",
"。",
"!",
"?",
"……",
"...",
] + line_break_characters
paired_punctuation = [
("(", ")"),
("[", "]"),
("{", "}"),
("<", ">"),
("“", "”"),
("‘", "’"),
("《", "》"),
("【", "】"),
]
intra_sentence_delimiters = [",", ",", ";", ";"] + whitespace_characters
def __init__(
self,
encoding_name: str = "cl100k_base",
allowed_special: Union[Literal["all"], AbstractSet[str]] = ALLOW_SPECIAL_TOKEN,
disallowed_special: Union[Literal["all"], Collection[str]] = "all",
**kwargs: Any,
):
"""Create a new TextSplitter."""
super().__init__(**kwargs)
        try:
            import tiktoken
        except ImportError:
            raise ValueError(
                "Could not import tiktoken python package. "
                "This is needed for TokenParagraphSplitter. "
                "Please install it with `pip install tiktoken`."
            )
        # create a tiktoken encoder instance
self._tokenizer = tiktoken.get_encoding(encoding_name)
self._allowed_special = allowed_special
self._disallowed_special = disallowed_special
def split_text(self, text: str) -> List[str]:
chunks = []
# Clean up abnormal whitespace characters in the text, such as replacing 3 or more consecutive \n with \n\n
text = text_filter(text)
# Replace URLs in the text to avoid symbols like ./?/ in URLs interfering with sentence splitting
text, string2url_dict = encode_urls(text)
url_strings = list(string2url_dict.keys())
# Split by paragraphs according to rules
paragraphs = self._split_to_paragraphs(
text, min_paragraph_length=self._chunk_size // 2
)
for i, paragraph in enumerate(paragraphs):
splits = self._split_to_chunks(paragraph, url_strings)
logging.debug(
"paragraph %s/%s %s characters: %s",
i + 1,
len(paragraphs),
len(paragraph),
paragraph,
)
logging.debug(
"paragraph %s/%s split into %s chunks: %s",
i + 1,
len(paragraphs),
len(splits),
splits,
)
chunks.extend(splits)
chunks = [decode_urls(chunk, string2url_dict) for chunk in chunks]
return chunks
def _split_to_chunks(self, text: str, url_strings: List[str] = []) -> List[str]:
sentences = self._split_to_sentences(text, url_strings)
chunks = self._merge_sentences_into_chunks(
sentences, min_chunk_size=self._chunk_size // 2
)
return chunks
def _split_to_paragraphs(
self, text: str, min_paragraph_length: int = 0
) -> List[str]:
"""Currently split the original document into paragraphs directly based on the \n[any space]\n rule."""
line_break_characters = "".join(self.line_break_characters)
whitespace_characters = "".join(self.whitespace_characters)
paragraphs = re.split(
f"([{line_break_characters}]+[{whitespace_characters}]*[{line_break_characters}])+",
text,
)
if len(paragraphs) % 2 == 1:
paragraphs = [""] + paragraphs
paragraphs = [
(paragraphs[i], paragraphs[i + 1])
for i in range(0, len(paragraphs), 2)
if (paragraphs[i] + paragraphs[i + 1]).strip()
]
if not paragraphs:
return []
new_paragraphs = []
cur_paragraph, cur_paragraph_len = "", 0
# merge short or broken paragraphs
for sep, paragraph in paragraphs:
if cur_paragraph_len >= min_paragraph_length and any(
cur_paragraph.endswith(sym) for sym in self.sentence_terminators
):
new_paragraphs.append(cur_paragraph.strip())
cur_paragraph, cur_paragraph_len = "", 0
cur_paragraph_len += len(self._tokenizer.encode(sep + paragraph))
cur_paragraph += sep + paragraph
if cur_paragraph:
new_paragraphs.append(cur_paragraph.strip())
return new_paragraphs
def _split_to_sentences(self, text: str, url_strings: List[str] = []) -> List[str]:
# Use capture groups to preserve sentence separators
pattern = (
f"({'|'.join(re.escape(symbol) for symbol in self.sentence_terminators)})+"
)
        parts = re.split(pattern, text)
        # Merge parts in pairs (stride 2) so each terminator is re-attached to the end of its sentence
        if len(parts) % 2 == 1:
            parts.append("")
        sentences = ["".join(parts[i : i + 2]) for i in range(0, len(parts), 2)]
sentences = [s for s in sentences if s.strip()]
if not sentences:
return []
# Fix fragmented sentences, mainly for special cases such as numeric indices, floating-point numbers, etc., which may be separated
sentences = self.recombine_broken_sentences(sentences)
# Split sentences that are too long; in the short term, split directly by character length; future optimizations could consider splitting by punctuation within sentences
sentences_list = [
self._force_split_to_chunks(s, url_strings) for s in sentences
]
sentences = list(chain.from_iterable(sentences_list))
return sentences
def recombine_broken_sentences(self, sentences: List[str]) -> List[str]:
"""Fix fragmented sentences, mainly for special cases such as numeric indices, floating-point numbers, etc., which may be separated。"""
if len(sentences) < 2:
return sentences
open_symbols_dict = {
open_sym: close_sym for open_sym, close_sym in self.paired_punctuation
}
close_symbols_dict = {
close_sym: open_sym for open_sym, close_sym in self.paired_punctuation
}
new_sentences = []
cur_sentences = ""
unmatched_symbol = []
for sent in sentences:
# If the current sentence is not empty, doesn't meet predefined merge conditions, and has no pending matching punctuation ([, (, {, etc.), then consider the sentence complete
if cur_sentences.strip() and not (
self.check_merge(cur_sentences, sent) or unmatched_symbol
):
new_sentences.append(cur_sentences)
cur_sentences = ""
for c in sent:
if c in open_symbols_dict:
unmatched_symbol.append(c)
elif c in close_symbols_dict:
if (
unmatched_symbol
and unmatched_symbol[-1] == close_symbols_dict[c]
):
unmatched_symbol.pop()
# By default, the current sentence ends when a newline-like character appears
if c in self.line_break_characters:
unmatched_symbol = []
if cur_sentences.strip():
new_sentences.append(cur_sentences)
cur_sentences = ""
cur_sentences += c
if cur_sentences:
new_sentences.append(cur_sentences)
return new_sentences
def check_merge(self, pre_sen, cur_sen):
if len(pre_sen) > 1 and len(cur_sen) > 0:
# If it's a decimal point in the middle of a floating-point number
if pre_sen[-1] == "." and pre_sen[-2].isdigit() and cur_sen[0].isdigit():
return True
# If it's a numeric index at the beginning of a sentence, such as 1. *****\n2. *****
if (
pre_sen[-1] == "."
and pre_sen[-2].isdigit()
and cur_sen[0] not in self.line_break_characters
):
return True
# In markdown format, ! followed by [ may be an image link
if (
pre_sen[-1] == "!"
and pre_sen[-2] in self.line_break_characters
and cur_sen[0] == "["
):
return True
return False
def _merge_sentences_into_chunks(
self, sentences: List[str], min_chunk_size: int = 200
) -> List[str]:
"""Assemble into chunks according to chunk_size and overlap. Note that external guarantees ensure that the length of a single sentence does not exceed chunk_size"""
if not sentences:
return []
n_tokens = [
len(
self._tokenizer.encode(
sentence,
allowed_special=self._allowed_special,
disallowed_special=self._disallowed_special,
)
)
for sentence in sentences
]
chunks = []
start_idx = 0
end_idx = start_idx + 1
cur_token_num = n_tokens[start_idx]
while start_idx < len(n_tokens):
            # The tail has reached the end: emit the final chunk
if end_idx >= len(n_tokens):
chunk = "".join(sentences[start_idx:end_idx])
logging.debug(
"sentences[%s:%s] merged into chunk, current num_tokens: %s(%s)",
start_idx,
end_idx,
sum(n_tokens[start_idx:end_idx]),
cur_token_num,
)
chunks.append(chunk)
break
else:
                # Adding the next sentence stays within chunk_size: keep extending the chunk
if cur_token_num + n_tokens[end_idx] <= self._chunk_size:
cur_token_num += n_tokens[end_idx]
end_idx += 1
                # Adding the next sentence would exceed chunk_size: assemble the current chunk and move on to the next one
else:
chunk = "".join(sentences[start_idx:end_idx])
logging.debug(
"sentences[%s:%s] merged into chunk, current num_tokens: %s(%s)",
start_idx,
end_idx,
sum(n_tokens[start_idx:end_idx]),
cur_token_num,
)
chunks.append(chunk)
                    # Next chunk: end_idx moves at least one position forward; start_idx may move back to create overlap
end_idx = end_idx + 1
# Find a new starting point for start_idx that doesn't exceed the overlap
new_start_idx = end_idx - 1
overlap = 0
new_cur_token_num = n_tokens[new_start_idx]
while new_start_idx > start_idx + 1:
if (
overlap + n_tokens[new_start_idx - 1] >= self._chunk_overlap
or new_cur_token_num >= self._chunk_size
):
break
new_start_idx -= 1
overlap += n_tokens[new_start_idx]
new_cur_token_num += n_tokens[new_start_idx]
start_idx = new_start_idx
cur_token_num = new_cur_token_num
if len(chunks) > 1 and len(chunks[-1]) < min_chunk_size:
logging.warning(
"The last chunk length %s is less than %s, merge with the previous chunk",
len(chunks[-1]),
min_chunk_size,
)
last_chunk = chunks.pop()
chunks[-1] += last_chunk
chunks = [chunk for chunk in chunks if chunk.strip()]
return chunks
    def _force_split_to_chunks(
        self, text: str, url_strings: List[str] = []
    ) -> List[str]:
        """Force-split an over-long sentence at punctuation within the sentence, trying to keep URLs and other data that must stay intact in one piece."""
        # TODO: Refine the forced splitting logic further in the future
splits = []
input_ids = self._tokenizer.encode(
text,
allowed_special=self._allowed_special,
disallowed_special=self._disallowed_special,
)
if len(input_ids) < self._chunk_size:
return [text]
if text[-1] not in self.sentence_terminators + self.intra_sentence_delimiters:
text += self.sentence_terminators[0]
cur_sentence, cur_sentence_len = "", 0
sub_sentence = ""
for c in text:
sub_sentence += c
if c in self.intra_sentence_delimiters + self.sentence_terminators:
sub_sentence_len = len(self._tokenizer.encode(sub_sentence))
if (
cur_sentence_len + sub_sentence_len
> self._chunk_size - self._chunk_overlap
):
if cur_sentence:
splits.append(cur_sentence)
cur_sentence, cur_sentence_len = sub_sentence, sub_sentence_len
                    else:
                        # sub_sentence itself is too long; fall back to forced token-level splitting
                        _splits = self.safe_split(sub_sentence, url_strings)
                        splits.extend(_splits[:-1])
                        cur_sentence = _splits[-1]
                        cur_sentence_len = len(self._tokenizer.encode(cur_sentence))
else:
cur_sentence += sub_sentence
cur_sentence_len += sub_sentence_len
sub_sentence = ""
if cur_sentence:
splits.append(cur_sentence)
return splits
def safe_split(self, sub_sentence: str, url_strings: List[str] = []) -> List[str]:
sub_sentence_tokens = self._tokenizer.encode(sub_sentence)
# Find the position intervals of all strings in url_strings
url_string_intervals = []
for url_string in url_strings:
encoded_url_string = self._tokenizer.encode(url_string)
# Use find_sublist_indices to find all position intervals
url_string_intervals.extend(
find_sublist_indices(sub_sentence_tokens, encoded_url_string)
)
_splits = []
i = 0
while i < len(sub_sentence_tokens):
if i + self._chunk_size >= len(sub_sentence_tokens):
slice_end = len(sub_sentence_tokens)
else:
slice_end = i + self._chunk_size - self._chunk_overlap
# Determine if the split interval overlaps with any important string intervals
for s_begin, s_end in url_string_intervals:
if i < s_end <= slice_end or i < s_begin < slice_end:
slice_end = max(slice_end, s_end)
# Split and record the current chunk
_splits.append(self._tokenizer.decode(sub_sentence_tokens[i:slice_end]))
# Move to the starting point of the next chunk
i = slice_end
return _splits
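# Illustrative usage (a sketch; `long_document_text` is a placeholder and the
# constructor arguments are inherited from langchain's TextSplitter):
#   splitter = TokenParagraphSplitter(chunk_size=512, chunk_overlap=64)
#   chunks = splitter.split_text(long_document_text)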
def get_summarize_title_keywords(responses):
# Clean LLM generated content to obtain summarized text titles, abstracts, and keywords
pattern = re.compile(r"\{.*(\}|\]|\,)", re.DOTALL)
gen_texts = [each.choices[0].message.content for each in responses]
logging.info("gen_texts: %s", gen_texts)
results = []
for res in gen_texts:
try:
# Match against the pattern
matches = list(pattern.finditer(res))
if not matches:
results.append(("", "", []))
else:
answer = matches[0].group(0)
content = answer.strip().strip(",")
content += "]" * (content.count("[") - content.count("]"))
content += "}" * (content.count("{") - content.count("}"))
                    d = json.loads(content)
                    results.append(
                        (d.get("title", ""), d.get("summary", ""), d.get("keywords", []))
                    )
        except json.JSONDecodeError:
            logging.warning("JSON parsing failed, appending an empty result")
            results.append(("", "", []))
return results
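# Illustrative usage (a sketch; the stand-in objects below only mimic the shape
# of an OpenAI chat-completion response, i.e. response.choices[0].message.content):
#   from types import SimpleNamespace
#   msg = SimpleNamespace(content='{"title": "T", "summary": "S", "keywords": ["k"]}')
#   resp = SimpleNamespace(choices=[SimpleNamespace(message=msg)])
#   get_summarize_title_keywords([resp])  # -> [('T', 'S', ['k'])]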