Mustehson
Scrape&Clean Data
a831d50
raw
history blame
4.52 kB
import re
import gradio as gr
from io import StringIO
import pandas as pd
from langchain_community.document_loaders import RecursiveUrlLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_transformers import Html2TextTransformer
# Value passed as `lines=` to every read-only output textbox in the result tabs.
TAB_LINES = 22
def scrape_text(url):
    """Load documents starting from *url* using ``RecursiveUrlLoader``.

    The loader is built with ``max_depth=None``, ``prevent_outside=True`` and
    ``check_response_status=True``.

    Args:
        url: Starting address handed to the loader.

    Returns:
        Whatever ``loader.load()`` returns, or ``None`` if building the loader
        or loading raised. The error is printed, not re-raised.
    """
    loader_options = {
        "url": url,
        "max_depth": None,
        "prevent_outside": True,
        "check_response_status": True,
    }
    try:
        pages = RecursiveUrlLoader(**loader_options).load()
    except Exception as e:
        # Best-effort: callers treat None as "scrape failed".
        print(f"Error loading URL: {e}")
        pages = None
    return pages
def clean_text(documents):
    """Convert one HTML document to text and collapse all whitespace runs.

    Args:
        documents: A single document object (despite the plural name). It is
            wrapped in a one-element list for
            ``Html2TextTransformer.transform_documents``.

    Returns:
        The sequence returned by the transformer. The ``page_content`` of its
        first element has every run of whitespace replaced by one space.
    """
    html2text = Html2TextTransformer()
    docs_transformed = html2text.transform_documents([documents])
    first_doc = docs_transformed[0]
    # A single `\s+` is used rather than separate newline alternatives so that
    # mixed runs such as "\n \n" collapse to exactly one space, not several.
    first_doc.page_content = re.sub(r'\s+', ' ', first_doc.page_content)
    return docs_transformed
# Matches one <table ...>...</table> element across lines. HTML tag names are
# case-insensitive, so <TABLE> must match too; the opening tag requires either
# ">" or whitespace right after "table" so look-alike tags are not touched.
_TABLE_PATTERN = re.compile(
    r'<table(?:\s[^>]*)?>.*?</table\s*>',
    re.DOTALL | re.IGNORECASE,
)


def remove_tables(docs):
    """Strip ``<table>`` elements from ``docs.page_content`` in place.

    Args:
        docs: A single document-like object with a string ``page_content``
            attribute. The attribute is overwritten on this same object.

    Returns:
        The same ``docs`` object, for call chaining.

    Note:
        The match is non-greedy, so for nested tables removal stops at the
        first closing tag and the remainder of the outer table is left behind.
    """
    docs.page_content = _TABLE_PATTERN.sub('', docs.page_content)
    return docs
def format_chunks_with_spaces(chunks):
    """Render chunks as one string with numbered headings and rule separators.

    Args:
        chunks: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        For each chunk, ``"Chunk <n>: \\n\\n"`` (numbered from 1), then the
        chunk's content, then ``"\\n\\n---\\n\\n"``. Returns ``""`` when
        *chunks* is empty.
    """
    separator = "\n\n---\n\n"
    # Join once instead of growing a string with += on every iteration.
    return "".join(
        f"Chunk {number}: \n\n{chunk.page_content}{separator}"
        for number, chunk in enumerate(chunks, start=1)
    )
def get_tables(raw_html):
    """Parse the HTML tables in ``raw_html.page_content`` with ``pd.read_html``.

    Args:
        raw_html: Object whose ``page_content`` holds HTML; it is passed
            through ``str()`` and wrapped in a ``StringIO`` before parsing.

    Returns:
        The list returned by ``pd.read_html``, or ``None`` if anything in the
        parsing step raised. The error is printed, not re-raised.
    """
    try:
        html_buffer = StringIO(str(raw_html.page_content))
        parsed_tables = pd.read_html(html_buffer)
    except Exception as e:
        print(f"Error reading table: {e}")
        parsed_tables = None
    return parsed_tables
def concat_dfs(df_list):
    """Stack the DataFrames in *df_list* into one frame with a fresh index.

    Args:
        df_list: DataFrames to concatenate, passed straight to ``pd.concat``.

    Returns:
        The result of ``pd.concat(df_list, ignore_index=True)``.
    """
    return pd.concat(df_list, ignore_index=True)
def get_docs(url):
    """Scrape *url* and build the five values wired to the UI output tabs.

    Only the first document returned by ``scrape_text`` is processed.

    Args:
        url: Address handed to ``scrape_text``.

    Returns:
        A 5-tuple, in this order:
          1. the first document's original ``page_content`` (tables included),
          2. the cleaned text produced by ``clean_text``,
          3. the concatenated tables from ``concat_dfs``, or ``None`` when no
             tables could be parsed,
          4. the first document's ``metadata``,
          5. the chunk listing from ``format_chunks_with_spaces``.
        All five entries are ``None`` when scraping failed or returned no
        documents.
    """
    documents = scrape_text(url)
    # scrape_text returns None on error; an empty result must be rejected too,
    # otherwise documents[0] below would raise IndexError.
    if not documents:
        return None, None, None, None, None

    first_doc = documents[0]
    # Snapshot the markup now: remove_tables() overwrites page_content on this
    # same object, which would otherwise strip tables from the "raw" output.
    raw_page_content = first_doc.page_content

    tables_list = get_tables(first_doc)
    # pd.concat rejects an empty list, so "no tables" and "parse failed" are
    # handled the same way.
    concat_tables = concat_dfs(tables_list) if tables_list else None

    tables_rmv_html = remove_tables(first_doc)
    clean_docs = clean_text(tables_rmv_html)

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=200)
    documents_splits = text_splitter.split_documents(clean_docs)
    formatted_chunks = format_chunks_with_spaces(documents_splits)

    return (raw_page_content, clean_docs[0].page_content, concat_tables,
            first_doc.metadata, formatted_chunks)
def _output_box(label):
    """Create one read-only, non-autoscrolling result textbox titled *label*."""
    return gr.Textbox(lines=TAB_LINES, label=label, value="", interactive=False,
                      autoscroll=False)


# Page layout: logo and title on top, then a row with the URL form on the left
# and the tabbed results on the right.
with gr.Blocks(theme=gr.themes.Soft(primary_hue="purple", secondary_hue="indigo")) as demo:
    gr.Image("logo.png", label=None, show_label=False, container=False, height=100)
    gr.Markdown("""
<div style='text-align: center;'>
<strong style='font-size: 36px;'>Domain Document Indexing</strong>
</div>
""")

    with gr.Row():
        # Left column: URL entry and the button that starts the scrape.
        with gr.Column(scale=1):
            url_input = gr.Textbox(lines=5, label="URL", placeholder="Enter your URL here...")
            scarpe_url_button = gr.Button(value="Scrape & Create Embeddings", variant="primary")

        # Right column: one tab per value returned by get_docs.
        with gr.Column(elem_id="col_container", scale=2):
            with gr.Tabs():
                with gr.Tab("RAW HTML"):
                    raw_page_content = _output_box("Page Content HTML")
                with gr.Tab("Clean Content"):
                    page_content = _output_box("Clean Page Content")
                with gr.Tab("Tables"):
                    tables = _output_box("Tables")
                with gr.Tab("Chunks"):
                    parsed_chunks = _output_box("Parsed Chunks")
                with gr.Tab("Metadata"):
                    metadata = _output_box("Metadata")

    # The outputs list follows get_docs' return order:
    # raw HTML, clean text, tables, metadata, chunks.
    scarpe_url_button.click(
        get_docs,
        inputs=url_input,
        outputs=[raw_page_content, page_content, tables, metadata, parsed_chunks],
    )


if __name__ == "__main__":
    demo.launch()