import re
from io import StringIO

import gradio as gr
import pandas as pd
from langchain_community.document_loaders import RecursiveUrlLoader
from langchain_community.document_transformers import Html2TextTransformer
from langchain_text_splitters import RecursiveCharacterTextSplitter

TAB_LINES = 22


def scrape_text(url):
    """Recursively scrape a URL and return the loaded documents, or None on failure."""
    try:
        loader = RecursiveUrlLoader(url=url, max_depth=None, prevent_outside=True,
                                    check_response_status=True)
        documents = loader.load()
    except Exception as e:
        print(f"Error loading URL: {e}")
        return None
    return documents


def clean_text(document):
    """Convert a document's HTML to plain text and collapse whitespace runs."""
    html2text = Html2TextTransformer()
    docs_transformed = html2text.transform_documents([document])
    # \s already matches newlines, so a single \s+ collapses every whitespace run.
    cleaned_string = re.sub(r'\s+', ' ', docs_transformed[0].page_content)
    docs_transformed[0].page_content = cleaned_string
    return docs_transformed


def remove_tables(docs):
    """Strip <table>...</table> blocks from the document's raw HTML in place."""
    table_pattern = re.compile(r'<table.*?</table>', re.DOTALL)
    docs.page_content = table_pattern.sub('', docs.page_content)
    return docs


def format_chunks_with_spaces(chunks):
    """Render the chunk list as numbered sections separated by horizontal rules."""
    separator = "\n\n---\n\n"
    formatted_chunks = ""
    for i, chunk in enumerate(chunks):
        formatted_chunks += f"Chunk {i + 1}:\n\n{chunk.page_content}{separator}"
    return formatted_chunks


def get_tables(raw_html):
    """Parse any HTML tables into pandas DataFrames, or return None if parsing fails."""
    try:
        tables = pd.read_html(StringIO(str(raw_html.page_content)))
    except Exception as e:
        print(f"Error reading table: {e}")
        return None
    return tables


def concat_dfs(df_list):
    """Concatenate a list of DataFrames into one."""
    return pd.concat(df_list, ignore_index=True)


def get_docs(url):
    """Scrape a URL and return its raw HTML, clean text, tables, metadata, and chunks."""
    raw_html = scrape_text(url)
    if raw_html is None:
        return None, None, None, None, None

    # Keep a copy of the raw page before remove_tables() mutates it in place,
    # and extract the tables while they are still present in the HTML.
    raw_page_html = raw_html[0].page_content
    tables_list = get_tables(raw_html[0])
    concat_tables = concat_dfs(tables_list) if tables_list is not None else None

    tables_rmv_html = remove_tables(raw_html[0])
    clean_docs = clean_text(tables_rmv_html)

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=200)
    documents_splits = text_splitter.split_documents(clean_docs)
    formatted_chunks = format_chunks_with_spaces(documents_splits)

    return (raw_page_html, clean_docs[0].page_content, concat_tables,
            raw_html[0].metadata, formatted_chunks)


with gr.Blocks(theme=gr.themes.Soft(primary_hue="purple", secondary_hue="indigo")) as demo:
    gr.Image("logo.png", label=None, show_label=False, container=False, height=100)
    gr.Markdown("""
# Domain Document Indexing
""") with gr.Row(): with gr.Column(scale=1): url_input = gr.Textbox(lines=5, label="URL", placeholder="Enter your URL here...") scarpe_url_button = gr.Button(value="Scrape & Create Embeddings", variant="primary") with gr.Column(elem_id = "col_container", scale=2): with gr.Tabs(): with gr.Tab("RAW HTML"): raw_page_content = gr.Textbox(lines=TAB_LINES, label="Page Content HTML", value="", interactive=False, autoscroll=False) with gr.Tab("Clean Content"): page_content = gr.Textbox(lines=TAB_LINES, label="Clean Page Content", value="", interactive=False, autoscroll=False) with gr.Tab("Tables"): tables = gr.Textbox(lines=TAB_LINES, label="Tables", value="", interactive=False, autoscroll=False) with gr.Tab("Chunks"): parsed_chunks = gr.Textbox(lines=TAB_LINES, label="Parsed Chunks", value="", interactive=False, autoscroll=False) with gr.Tab("Metadata"): metadata = gr.Textbox(lines=TAB_LINES, label="Metadata", value="", interactive=False, autoscroll=False) scarpe_url_button.click(get_docs, inputs=url_input, outputs=[raw_page_content, page_content, tables, metadata, parsed_chunks]) if __name__ == "__main__": demo.launch()