"""Gradio demo that recursively scrapes a URL, extracts its HTML tables with
pandas, converts the remaining markup to clean text, and splits that text into
overlapping chunks, showing each stage of the pipeline in its own tab."""

import re
from io import StringIO

import gradio as gr
import pandas as pd
from langchain_community.document_loaders import RecursiveUrlLoader
from langchain_community.document_transformers import Html2TextTransformer
from langchain_text_splitters import RecursiveCharacterTextSplitter


TAB_LINES = 22  # visible height (in lines) of each output textbox


def scrape_text(url):
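    """Recursively crawl ``url`` and return the loaded documents, or None on failure."""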
    try:
        # Crawl the page and its links without leaving the site, failing fast
        # on non-2xx responses.
        loader = RecursiveUrlLoader(url=url, max_depth=None,
                                    prevent_outside=True, check_response_status=True)
        documents = loader.load()
    except Exception as e:
        print(f"Error loading URL: {e}")
        return None
    return documents

        
def clean_text(document):
    """Convert a document's HTML to plain text and collapse all whitespace runs."""
    html2text = Html2TextTransformer()
    docs_transformed = html2text.transform_documents([document])
    # \s+ also matches newlines, so a single pattern collapses everything to single spaces.
    cleaned_string = re.sub(r'\s+', ' ', docs_transformed[0].page_content)
    docs_transformed[0].page_content = cleaned_string
    return docs_transformed


def remove_tables(docs):
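    """Strip <table> elements from the raw HTML so they are not duplicated in the cleaned text."""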
    table_pattern = re.compile(r'<table.*?>.*?</table>', re.DOTALL)
    docs.page_content = table_pattern.sub('', docs.page_content)
    return docs


def format_chunks_with_spaces(chunks):
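    """Number each chunk and join them into one string separated by horizontal rules."""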
    separator = "\n\n---\n\n"  
    formatted_chunks = ""
    for i, chunk in enumerate(chunks):
        formatted_chunks += f"Chunk {i+1}: \n\n"
        formatted_chunks += chunk.page_content
        formatted_chunks += separator 
    return formatted_chunks


def get_tables(raw_html):
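    """Parse every HTML table in the document into a pandas DataFrame; return None if parsing fails."""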
    try:
        tables = pd.read_html(StringIO(str(raw_html.page_content)))
    except Exception as e:
        print(f"Error reading table: {e}")
        return None
    return tables


def concat_dfs(df_list):
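    """Concatenate all extracted tables into a single DataFrame."""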
    concatenated_df = pd.concat(df_list, ignore_index=True)
    return concatenated_df


def get_docs(url):
    """Run the scrape/extract/clean/chunk pipeline and return one value per UI tab."""
    raw_html = scrape_text(url)
    if raw_html is None:
        return None, None, None, None, None

    # Snapshot the original page before remove_tables() mutates the document
    # in place, so the "RAW HTML" tab shows the page exactly as scraped.
    raw_content = raw_html[0].page_content

    tables_list = get_tables(raw_html[0])
    concat_tables = concat_dfs(tables_list) if tables_list is not None else None

    tables_rmv_html = remove_tables(raw_html[0])
    clean_docs = clean_text(tables_rmv_html)

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=200)
    documents_splits = text_splitter.split_documents(clean_docs)
    formatted_chunks = format_chunks_with_spaces(documents_splits)

    return raw_content, clean_docs[0].page_content, concat_tables, raw_html[0].metadata, formatted_chunks


with gr.Blocks(theme=gr.themes.Soft(primary_hue="purple", secondary_hue="indigo")) as demo:
    
    gr.Image("logo.png", label=None, show_label=False, container=False, height=100)
    
    gr.Markdown("""
    <div style='text-align: center;'>
    <strong style='font-size: 36px;'>Domain Document Indexing</strong>
    </div>
    """)

    with gr.Row():
        with gr.Column(scale=1):
            url_input = gr.Textbox(lines=5, label="URL", placeholder="Enter your URL here...")
            scrape_url_button = gr.Button(value="Scrape & Create Embeddings", variant="primary")

        with gr.Column(elem_id="col_container", scale=2):
            with gr.Tabs():
                with gr.Tab("RAW HTML"):
                    raw_page_content = gr.Textbox(lines=TAB_LINES, label="Page Content HTML", value="", interactive=False,
                                                  autoscroll=False)
                with gr.Tab("Clean Content"):
                    page_content = gr.Textbox(lines=TAB_LINES, label="Clean Page Content", value="", interactive=False,
                                              autoscroll=False)
                with gr.Tab("Tables"):
                    tables = gr.Textbox(lines=TAB_LINES, label="Tables", value="", interactive=False,
                                        autoscroll=False)
                with gr.Tab("Chunks"):
                    parsed_chunks = gr.Textbox(lines=TAB_LINES, label="Parsed Chunks", value="", interactive=False,
                                               autoscroll=False)
                with gr.Tab("Metadata"):
                    metadata = gr.Textbox(lines=TAB_LINES, label="Metadata", value="", interactive=False,
                                          autoscroll=False)
        
        scrape_url_button.click(get_docs, inputs=url_input, outputs=[raw_page_content, page_content, tables,
                                                                     metadata, parsed_chunks])

        
if __name__ == "__main__":
    demo.launch()