Mustehson committed
Commit cbb92c4 · 1 Parent(s): 59cce11

Added Depth Support for Scraping

Files changed (2):
  1. __pycache__/app.cpython-311.pyc +0 -0
  2. app.py +58 -26

__pycache__/app.cpython-311.pyc ADDED
Binary file (9.85 kB)
 
app.py CHANGED
@@ -9,11 +9,21 @@ from langchain_community.document_transformers import Html2TextTransformer
 
 TAB_LINES = 22
 
+def html_only_metadata_extractor(raw_html, url, response):
+    content_type = response.headers.get("Content-Type", "")
+    if "text/html" in content_type:
+        return {"source": url, "content_type": content_type}
+    return {}
 
-def scrape_text(url):
+def scrape_text(url, max_depth):
     try:
-        loader = RecursiveUrlLoader(url=url, max_depth=None,
-                                    prevent_outside=True, check_response_status=True)
+        loader = RecursiveUrlLoader(
+            url=url,
+            max_depth=max_depth,
+            check_response_status=True,
+            metadata_extractor=html_only_metadata_extractor,
+            prevent_outside=True,
+        )
         documents = loader.load()
     except Exception as e:
         print(f"Error loading URL: {e}")
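For context, a minimal standalone sketch of the new loader call (the start URL below is hypothetical; the parameters mirror the hunk above). max_depth=1 fetches only the start page, while larger values follow links recursively:

from langchain_community.document_loaders import RecursiveUrlLoader

def html_only_metadata_extractor(raw_html, url, response):
    # Record metadata only for HTML responses, as in this commit.
    content_type = response.headers.get("Content-Type", "")
    if "text/html" in content_type:
        return {"source": url, "content_type": content_type}
    return {}

loader = RecursiveUrlLoader(
    url="https://example.com/docs/",  # hypothetical start page
    max_depth=2,                      # follow links one level below the start page
    check_response_status=True,       # skip pages that return an error status
    metadata_extractor=html_only_metadata_extractor,
    prevent_outside=True,             # stay under the start URL, no external links
)
docs = loader.load()                  # one Document per fetched page
print(len(docs), docs[0].metadata)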
@@ -21,17 +31,18 @@ def scrape_text(url):
     return documents
 
 
-def clean_text(documents):
+def clean_text(docs):
     html2text = Html2TextTransformer()
-    docs_transformed = html2text.transform_documents([documents])
-    cleaned_string = re.sub(r'\n\n+|\n+|\s+', ' ', docs_transformed[0].page_content)
-    docs_transformed[0].page_content = cleaned_string
+    docs_transformed = html2text.transform_documents(docs)
+    for doc in docs_transformed:
+        doc.page_content = re.sub(r'\n\n+|\n+|\s+', ' ', doc.page_content)
     return docs_transformed
 
 
 def remove_tables(docs):
-    table_pattern = re.compile(r'<table.*?>.*?</table>', re.DOTALL)
-    docs.page_content = table_pattern.sub('', docs.page_content)
+    for doc in docs:
+        table_pattern = re.compile(r'<table.*?>.*?</table>', re.DOTALL)
+        doc.page_content = table_pattern.sub('', doc.page_content)
     return docs
 
 
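clean_text() and remove_tables() now loop over the full list of Documents instead of assuming a single one. The two regular expressions themselves are unchanged; to illustrate what they do:

import re

# Whitespace collapsing from clean_text(): any run of newlines or spaces
# becomes a single space.
text = "Title\n\n\nFirst   line.\nSecond line."
print(re.sub(r'\n\n+|\n+|\s+', ' ', text))
# -> 'Title First line. Second line.'

# Table stripping from remove_tables(): re.DOTALL lets '.' span newlines,
# so whole <table>...</table> blocks are removed.
html = "<p>Intro</p><table><tr><td>1</td></tr></table><p>Outro</p>"
print(re.compile(r'<table.*?>.*?</table>', re.DOTALL).sub('', html))
# -> '<p>Intro</p><p>Outro</p>'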
@@ -44,14 +55,34 @@ def format_chunks_with_spaces(chunks):
         formatted_chunks += separator
     return formatted_chunks
 
-
-def get_tables(raw_html):
-    try:
-        tables = pd.read_html(StringIO(str(raw_html.page_content)))
-    except Exception as e:
-        print(f"Error reading table: {e}")
-        return None
-    return tables
+def format_metdata(docs):
+    formatted_metadata = ""
+    for i, doc in enumerate(docs):
+        formatted_metadata += f"Metadata {i+1}: \n\n"
+        formatted_metadata += str(doc.metadata)
+        formatted_metadata += "\n\n---\n\n"
+    return formatted_metadata
+
+def format_page_content(docs):
+    formatted_docs = ""
+    for i, doc in enumerate(docs):
+        formatted_docs += f"Page Content {i+1}: \n\n"
+        formatted_docs += str(doc.page_content)
+        formatted_docs += "\n\n---\n\n"
+    return formatted_docs
+
+
+def get_tables(raw_docs):
+    tables_list = []
+    for raw_doc in raw_docs:
+        try:
+            tables = pd.read_html(StringIO(str(raw_doc.page_content)))
+            tables_list.extend(tables)
+        except Exception as e:
+            print(f"Error reading table: {e}")
+            continue
+
+    return tables_list
 
 
 def concat_dfs(df_list):
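get_tables() now aggregates tables across every scraped page instead of just the first document. pandas.read_html() returns one DataFrame per <table> element it finds and raises when a page has none, which is why each document gets its own try/except. A small illustration, assuming pandas plus an HTML parser such as lxml is installed:

from io import StringIO
import pandas as pd

html = """
<table>
  <tr><th>city</th><th>population</th></tr>
  <tr><td>Oslo</td><td>709000</td></tr>
</table>
"""
tables = pd.read_html(StringIO(html))  # list with one DataFrame
# Prints a one-row DataFrame with columns 'city' and 'population'.
print(tables[0])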
@@ -59,26 +90,25 @@ def concat_dfs(df_list):
     return concatenated_df
 
 
-def get_docs(url):
-    raw_html = scrape_text(url)
+def get_docs(url, max_depth):
+    raw_html = scrape_text(url, max_depth)
     if raw_html is None:
         return None, None, None, None, None
 
-    tables_list = get_tables(raw_html[0])
-
-    if tables_list is not None:
+    tables_list = get_tables(raw_html)
+    if tables_list:
         concat_tables = concat_dfs(tables_list)
     else:
         concat_tables = None
 
-    tables_rmv_html = remove_tables(raw_html[0])
+    tables_rmv_html = remove_tables(raw_html)
     clean_docs = clean_text(tables_rmv_html)
 
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=200)
     documents_splits = text_splitter.split_documents(clean_docs)
     formatted_chunks = format_chunks_with_spaces(documents_splits)
 
-    return raw_html[0].page_content, clean_docs[0].page_content, concat_tables, raw_html[0].metadata, formatted_chunks
+    return format_page_content(raw_html), format_page_content(clean_docs), concat_tables, format_metdata(raw_html), formatted_chunks
 
 
 with gr.Blocks(theme=gr.themes.Soft(primary_hue="purple", secondary_hue="indigo")) as demo:
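get_docs() now threads max_depth through to scrape_text() and formats every document rather than indexing the first. The chunking step is unchanged; for reference, a sketch of the splitter with the same settings (the filler document is made up, and the import path assumes the standalone langchain-text-splitters package):

from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=200)
docs = [Document(page_content="word " * 1000)]  # ~5000 characters of filler
chunks = splitter.split_documents(docs)
# Chunks are at most 1024 characters, with roughly 200 characters of overlap.
print(len(chunks), max(len(c.page_content) for c in chunks))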
@@ -95,7 +125,9 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="purple", secondary_hue="indigo"
     with gr.Row():
         with gr.Column(scale=1):
             url_input = gr.Textbox(lines=5, label="URL", placeholder="Enter your URL here...")
-            scarpe_url_button = gr.Button(value="Scrape & Create Embeddings", variant="primary")
+            with gr.Row():
+                max_depth = gr.Slider(1, 50, value=1, step=1, label="Max Depth", interactive=True)
+                scarpe_url_button = gr.Button(value="Scrape & Create Embeddings", variant="primary")
 
         with gr.Column(elem_id = "col_container", scale=2):
             with gr.Tabs():
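The new depth slider sits next to the scrape button, and its value is passed to the callback alongside the URL, which is why the .click() wiring in the final hunk now takes inputs=[url_input, max_depth]. A stripped-down sketch of the same pattern (the handler body is a placeholder standing in for get_docs):

import gradio as gr

def crawl(url, max_depth):
    # Stand-in for get_docs(url, max_depth).
    return f"Would crawl {url} to depth {int(max_depth)}"

with gr.Blocks() as demo:
    url_input = gr.Textbox(lines=5, label="URL")
    max_depth = gr.Slider(1, 50, value=1, step=1, label="Max Depth", interactive=True)
    result = gr.Textbox(label="Result")
    gr.Button("Scrape").click(crawl, inputs=[url_input, max_depth], outputs=result)

demo.launch()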
@@ -115,7 +147,7 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="purple", secondary_hue="indigo"
                     metadata = gr.Textbox(lines=TAB_LINES, label="Metadata", value="", interactive=False,
                                           autoscroll=False)
 
-    scarpe_url_button.click(get_docs, inputs=url_input, outputs=[raw_page_content, page_content, tables,
+    scarpe_url_button.click(get_docs, inputs=[url_input, max_depth], outputs=[raw_page_content, page_content, tables,
                                                                  metadata, parsed_chunks])
 
 