File size: 3,572 Bytes
d97a6fa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
from bs4 import BeautifulSoup
from urllib import request
from bot.web_scrapping.searchable_index import SearchableIndex
from bot.utils.show_log import logger
from bot.utils.constanst import set_api_key
import pandas as pd
import requests
import os

# SECURITY: the API key must never be committed to source control. It is read
# from the OPENAI_API_KEY environment variable instead; any key that was
# previously hard-coded here should be treated as leaked and revoked.
# NOTE(review): set_api_key is defined in bot.utils.constanst — presumably it
# configures the OpenAI client; confirm it accepts the key the same way.
_api_key = os.environ.get('OPENAI_API_KEY')
if _api_key:
    set_api_key(api_key=_api_key)
else:
    logger.warning("OPENAI_API_KEY is not set; API key was not configured.")


def save_content_to_file(url=None, text=None, output_folder=None, file_format=None):
    """Append scraped content to ``combined_content.<file_format>`` in *output_folder*.

    Args:
        url: Source URL. Only used for ``file_format == 'pdf'``, where the
            resource is downloaded directly to the target path.
        text: Iterable of elements exposing a ``.text`` attribute (the caller
            passes the result of ``BeautifulSoup.find_all``). ``None`` is
            treated as "no elements".
        output_folder: Directory the file is written into; must already exist.
        file_format: One of ``'txt'``, ``'pdf'``, ``'csv'``, ``'xml'``.

    Returns:
        The path of the target file. The path is returned even when
        *file_format* is unsupported (a warning is logged and nothing is
        written in that case).
    """
    # Local import keeps this block self-contained; escape() is stdlib.
    from xml.sax.saxutils import escape

    file_path = os.path.join(output_folder, f"combined_content.{file_format}")
    # The caller may omit ``text`` when it only has a URL; iterating None
    # would raise TypeError in the txt/csv/xml branches.
    elements = text if text is not None else []

    if file_format == 'txt':
        with open(file_path, "a", encoding="utf-8") as file:
            file.writelines(f'{t.text}\n' for t in elements)
    elif file_format == 'pdf':
        request.urlretrieve(url, file_path)
    elif file_format == 'csv':
        df = pd.DataFrame({'Content': [t.text for t in elements]})
        df.to_csv(file_path, mode='a', index=False, header=False)
    elif file_format == 'xml':
        # Escape &, < and > so scraped text cannot produce malformed XML.
        xml_content = ''.join(f'<item>{escape(t.text)}</item>' for t in elements)
        with open(file_path, "a", encoding="utf-8") as file:
            file.write(xml_content)
    else:
        logger.warning("Invalid file format. Supported formats: txt, pdf, csv, xml")
        return file_path

    logger.info(f"Content appended to {file_path}")
    return file_path


def content_crawler_and_index(url, file_format='txt', output_folder='learning_documents'):
    """Fetch *url*, save its content to a temporary file and build an index from it.

    Args:
        url: Page or document URL to fetch. The sentinel string ``'NO_URL'``
            skips fetching and indexes *output_folder* directly.
        file_format: Format passed to ``save_content_to_file``
            (``'txt'``, ``'pdf'``, ``'csv'`` or ``'xml'``).
        output_folder: Directory used for the intermediate file; created if
            missing.

    Returns:
        Whatever ``SearchableIndex.embed_index`` returns, or ``None`` when the
        HTTP request does not answer with status 200.
    """
    if url == 'NO_URL':
        return SearchableIndex.embed_index(url=url, path=output_folder)

    # A timeout prevents the call from blocking forever on an unresponsive host.
    responses = requests.get(url, timeout=30)
    if responses.status_code != 200:
        logger.warning("Failed to retrieve content from the URL.")
        return None

    os.makedirs(output_folder, exist_ok=True)

    soup = BeautifulSoup(responses.text, "html.parser")
    text = soup.find_all(['h2', 'p', 'i', 'ul'])

    # Always pass both url and text: the 'pdf' format needs the url even when
    # elements were parsed, and the text formats need an iterable (possibly
    # empty) even when nothing was parsed.
    file_path = save_content_to_file(
        url=url, text=text, output_folder=output_folder, file_format=file_format
    )
    try:
        return SearchableIndex.embed_index(url, file_path)
    finally:
        # The file is opened in append mode on the next run, so it must be
        # removed even if indexing fails, or stale content would be re-indexed.
        if os.path.isfile(file_path):
            os.remove(file_path)


if __name__ == '__main__':
    # Running this module directly is a no-op; the lines below are a
    # commented-out usage example only.
    pass
    # Example usage:
    # First URL: scrape an HTML page and index its text
    # idx = content_crawler_and_index("https://www.presight.io/terms-of-use.html", file_format='txt')
    #
    # Second URL: download a PDF and index it (the intermediate
    # combined_content file is removed after each indexing call)
    # idx = content_crawler_and_index(url='https://arxiv.org/pdf/2309.11235v1.pdf', file_format='pdf')
    # # Example: query the resulting index through a chat model
    # # NOTE(review): ChatOpenAI is not imported in the visible part of this
    # # file — add the import before enabling this example.
    # prompt = 'explain the paper'
    # llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0)
    # response = SearchableIndex.query(prompt, llm, idx)
    # print(response)
    # logger.info(response)