# Spaces:
# Sleeping
# Sleeping
import re
from io import StringIO

import gradio as gr
import pandas as pd
from langchain_community.document_loaders import RecursiveUrlLoader
from langchain_community.document_transformers import Html2TextTransformer
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Number of visible text lines used for each read-only output textbox in the tabs.
TAB_LINES = 22
def scrape_text(url):
    """Load ``url`` with ``RecursiveUrlLoader`` and return the loaded documents.

    Args:
        url: Address handed to the loader as its starting URL.

    Returns:
        Whatever ``loader.load()`` returns, or ``None`` if building the loader
        or loading raised any exception (the error is printed, not re-raised).
    """
    try:
        # Construction stays inside the try so that loader set-up errors are
        # reported the same way as load-time errors.
        pages = RecursiveUrlLoader(
            url=url,
            max_depth=None,
            prevent_outside=True,
            check_response_status=True,
        ).load()
    except Exception as e:
        print(f"Error loading URL: {e}")
        pages = None
    return pages
def clean_text(documents):
    """Convert one HTML document to text and collapse its whitespace.

    Args:
        documents: A single document object (despite the plural name); it is
            wrapped in a one-element list before being transformed.

    Returns:
        The sequence returned by ``Html2TextTransformer.transform_documents``,
        with the first entry's ``page_content`` reduced to single-space
        separated text.
    """
    html2text = Html2TextTransformer()
    docs_transformed = html2text.transform_documents([documents])
    # ``\s`` already covers newlines.  The previous pattern
    # r'\n\n+|\n+|\s+' tried the newline alternatives first, so a run such as
    # "\n " was replaced as two separate matches and left a double space.
    docs_transformed[0].page_content = re.sub(
        r'\s+', ' ', docs_transformed[0].page_content
    )
    return docs_transformed
# Matches one <table ...> ... </table> element.  Tag names are matched
# case-insensitively, and the opening tag must be exactly ``table`` (followed
# by whitespace/attributes or ``>``) so that other tag names starting with
# "table" are left alone.
_TABLE_PATTERN = re.compile(
    r'<table(?:\s[^>]*)?>.*?</table\s*>',
    re.DOTALL | re.IGNORECASE,
)


def remove_tables(docs):
    """Strip ``<table>`` elements from ``docs.page_content`` in place.

    Args:
        docs: Object with a string ``page_content`` attribute holding HTML.

    Returns:
        The same ``docs`` object, after its ``page_content`` has been
        rewritten without table markup.

    Note:
        The match is non-greedy, so for nested tables it ends at the first
        closing tag and the outer table's remainder is left in the text.
    """
    docs.page_content = _TABLE_PATTERN.sub('', docs.page_content)
    return docs
def format_chunks_with_spaces(chunks):
    """Render chunks as one string of numbered, separator-delimited sections.

    Args:
        chunks: Iterable of objects exposing a string ``page_content``.

    Returns:
        For each chunk, a ``Chunk N:`` header (1-based), a blank line, the
        chunk text and a ``---`` separator, all concatenated.  An empty
        iterable yields an empty string.
    """
    separator = "\n\n---\n\n"
    # Build all sections first and join once instead of growing a string.
    return "".join(
        f"Chunk {i+1}: \n\n" + chunk.page_content + separator
        for i, chunk in enumerate(chunks)
    )
def get_tables(raw_html):
    """Parse the HTML tables found in ``raw_html.page_content``.

    Args:
        raw_html: Object whose ``page_content`` holds HTML markup.

    Returns:
        The list of DataFrames produced by ``pandas.read_html``, or ``None``
        when anything in the attempt raises (the error is printed).
    """
    try:
        # Wrap the markup in a file-like buffer before passing it to pandas.
        html_buffer = StringIO(str(raw_html.page_content))
        parsed = pd.read_html(html_buffer)
    except Exception as e:
        print(f"Error reading table: {e}")
        parsed = None
    return parsed
def concat_dfs(df_list):
    """Stack the DataFrames in ``df_list`` into one frame with a fresh index.

    Args:
        df_list: Sequence of DataFrames to concatenate row-wise.

    Returns:
        A single DataFrame whose index is renumbered from 0.
    """
    return pd.concat(objs=df_list, ignore_index=True)
def get_docs(url):
    """Scrape ``url`` and build the five values shown in the output tabs.

    Only the first document returned by ``scrape_text`` is processed.

    Args:
        url: Address to scrape.

    Returns:
        A 5-tuple ``(raw_html, clean_text, tables, metadata, chunks)``:
        the first document's original HTML, its cleaned text with tables
        removed, the concatenated tables (or ``None`` if none were parsed),
        its metadata, and the formatted text chunks.  All five are ``None``
        when nothing could be loaded.
    """
    raw_html = scrape_text(url)
    # ``None`` signals a load error; an empty result would otherwise raise
    # IndexError on ``raw_html[0]`` below.
    if not raw_html:
        return None, None, None, None, None

    first_doc = raw_html[0]
    # Snapshot the markup now: remove_tables() rewrites page_content on this
    # same object, so reading it afterwards would no longer be the raw HTML.
    raw_page_content = first_doc.page_content

    tables_list = get_tables(first_doc)
    concat_tables = concat_dfs(tables_list) if tables_list else None

    tables_rmv_html = remove_tables(first_doc)
    clean_docs = clean_text(tables_rmv_html)

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=200)
    documents_splits = text_splitter.split_documents(clean_docs)
    formatted_chunks = format_chunks_with_spaces(documents_splits)

    return (
        raw_page_content,
        clean_docs[0].page_content,
        concat_tables,
        first_doc.metadata,
        formatted_chunks,
    )
def _output_tab(tab_title, box_label):
    """Add a tab titled ``tab_title`` holding one read-only textbox; return the textbox."""
    with gr.Tab(tab_title):
        return gr.Textbox(lines=TAB_LINES, label=box_label, value="", interactive=False,
                          autoscroll=False)


# NOTE(review): the captured source had lost its indentation; the nesting
# below (input column beside a tabbed output column) is reconstructed and
# should be confirmed against the running app.
with gr.Blocks(theme=gr.themes.Soft(primary_hue="purple", secondary_hue="indigo")) as demo:
    gr.Image("logo.png", label=None, show_label=False, container=False, height=100)
    gr.Markdown("""
<div style='text-align: center;'>
<strong style='font-size: 36px;'>Domain Document Indexing</strong>
</div>
""")
    with gr.Row():
        # Left column: URL entry and the trigger button.
        with gr.Column(scale=1):
            url_input = gr.Textbox(lines=5, label="URL", placeholder="Enter your URL here...")
            # NOTE(review): the label mentions embeddings, but the handler
            # wired below only scrapes, cleans and chunks the page.
            scarpe_url_button = gr.Button(value="Scrape & Create Embeddings", variant="primary")
        # Right column: one read-only tab per value returned by get_docs.
        with gr.Column(elem_id="col_container", scale=2):
            with gr.Tabs():
                raw_page_content = _output_tab("RAW HTML", "Page Content HTML")
                page_content = _output_tab("Clean Content", "Clean Page Content")
                tables = _output_tab("Tables", "Tables")
                parsed_chunks = _output_tab("Chunks", "Parsed Chunks")
                metadata = _output_tab("Metadata", "Metadata")
    # Output order must match get_docs' return tuple:
    # (raw html, clean text, tables, metadata, chunks).
    scarpe_url_button.click(
        get_docs,
        inputs=url_input,
        outputs=[raw_page_content, page_content, tables, metadata, parsed_chunks],
    )
# Launch the Gradio app only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    demo.launch()