from bs4 import BeautifulSoup
from urllib import request
from bot.web_scrapping.searchable_index import SearchableIndex
from bot.utils.show_log import logger
from bot.utils.constanst import set_api_key
import pandas as pd
import requests
import os

# Read the API key from the environment instead of hardcoding a secret in source
# (the env var name OPENAI_API_KEY is an assumption).
set_api_key(api_key=os.environ.get('OPENAI_API_KEY'))


def save_content_to_file(url=None, text=None, output_folder=None, file_format=None):
    """Append scraped content to a single combined file in the requested format."""
    file_path = os.path.join(output_folder, f"combined_content.{file_format}")
    if file_format == 'txt':
        with open(file_path, "a", encoding="utf-8") as file:
            for t in text:
                file.write(f'{t.text}\n')
        logger.info(f"Content appended to {file_path}")
    elif file_format == 'pdf':
        # PDFs are binary, so download the file directly instead of writing parsed text.
        request.urlretrieve(url, file_path)
        logger.info(f"Content appended to {file_path}")
    elif file_format == 'csv':
        df = pd.DataFrame({'Content': [t.text for t in text]})
        df.to_csv(file_path, mode='a', index=False, header=False)
        logger.info(f"Content appended to {file_path}")
    elif file_format == 'xml':
        # Wrap each fragment in an element so the output is well-formed XML
        # (the <item> tag name is arbitrary).
        xml_content = ''.join(f'<item>{t.text}</item>' for t in text)
        with open(file_path, "a", encoding="utf-8") as file:
            file.write(xml_content)
        logger.info(f"Content appended to {file_path}")
    else:
        logger.warning("Invalid file format. Supported formats: txt, pdf, csv, xml")
    return file_path


def content_crawler_and_index(url, file_format='txt', output_folder='learning_documents'):
    if url != 'NO_URL':
        # Send an HTTP GET request to the URL
        response = requests.get(url)
        # Check if the request was successful
        if response.status_code == 200:
            # Create the output folder if it doesn't exist
            if not os.path.exists(output_folder):
                os.makedirs(output_folder)
            # Parse the HTML content using BeautifulSoup
            soup = BeautifulSoup(response.text, "html.parser")
            text = soup.find_all(['h2', 'p', 'i', 'ul'])
            if text:
                # Save parsed content in the specified file format
                file_path = save_content_to_file(text=text, output_folder=output_folder, file_format=file_format)
            else:
                # No parsable HTML tags (e.g. the URL points at a PDF): download the raw file
                file_path = save_content_to_file(url=url, output_folder=output_folder, file_format=file_format)
            # Create or update the index, then remove the temporary file
            index = SearchableIndex.embed_index(url, file_path)
            if os.path.isfile(file_path):
                os.remove(file_path)
            return index
        else:
            logger.warning("Failed to retrieve content from the URL.")
    else:
        # No URL given: build the index from documents already in the output folder
        index = SearchableIndex.embed_index(url=url, path=output_folder)
        return index


if __name__ == '__main__':
    pass
    # Example usage:
    # First URL
    # idx = content_crawler_and_index("https://www.presight.io/terms-of-use.html", file_format='txt')
    #
    # Second URL (appends content to existing files)
    # idx = content_crawler_and_index(url='https://arxiv.org/pdf/2309.11235v1.pdf', file_format='pdf')
    #
    # Example chatbot response (assumes: from langchain.chat_models import ChatOpenAI)
    # prompt = 'explain the paper'
    # llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0)
    # response = SearchableIndex.query(prompt, llm, idx)
    # print(response)
    # logger.info(response)
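    # A minimal sketch of the re-index path: passing the 'NO_URL' sentinel skips
    # the HTTP fetch and embeds whatever documents already sit in the output
    # folder. This assumes SearchableIndex.embed_index accepts the url/path
    # keywords used above; the folder name below is illustrative.
    # idx = content_crawler_and_index(url='NO_URL', output_folder='learning_documents')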