Commit cbb92c4 · Parent(s): 59cce11
Mustehson committed

Added Depth Support for Scraping
Files changed:
- __pycache__/app.cpython-311.pyc  +0 -0
- app.py  +58 -26

__pycache__/app.cpython-311.pyc ADDED
Binary file (9.85 kB)

app.py CHANGED
@@ -9,11 +9,21 @@ from langchain_community.document_transformers import Html2TextTransformer
 
 TAB_LINES = 22
 
+def html_only_metadata_extractor(raw_html, url, response):
+    content_type = response.headers.get("Content-Type", "")
+    if "text/html" in content_type:
+        return {"source": url, "content_type": content_type}
+    return {}
 
-def scrape_text(url):
+def scrape_text(url, max_depth):
     try:
-        loader = RecursiveUrlLoader(
+        loader = RecursiveUrlLoader(
+            url=url,
+            max_depth=max_depth,
+            check_response_status=True,
+            metadata_extractor=html_only_metadata_extractor,
+            prevent_outside=True,
+        )
         documents = loader.load()
     except Exception as e:
         print(f"Error loading URL: {e}")
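Note: the core of this commit is threading max_depth through to RecursiveUrlLoader, which follows links recursively from the start page up to that depth. A minimal standalone sketch of the new loader call (the URL is a placeholder; assumes langchain-community is installed):

    # Depth-limited crawl, mirroring the parameters used in this commit.
    from langchain_community.document_loaders import RecursiveUrlLoader

    def html_only_metadata_extractor(raw_html, url, response):
        # Keep metadata only for HTML responses, as in the committed code.
        content_type = response.headers.get("Content-Type", "")
        if "text/html" in content_type:
            return {"source": url, "content_type": content_type}
        return {}

    loader = RecursiveUrlLoader(
        url="https://example.com/docs",  # placeholder start page
        max_depth=2,                     # follow links up to 2 hops away
        check_response_status=True,      # skip pages answering with 4xx/5xx
        metadata_extractor=html_only_metadata_extractor,
        prevent_outside=True,            # never crawl outside the root URL
    )
    docs = loader.load()
    print(f"Loaded {len(docs)} page(s)")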
@@ -21,17 +31,18 @@ def scrape_text(url):
     return documents
 
 
-def clean_text(
+def clean_text(docs):
     html2text = Html2TextTransformer()
-    docs_transformed = html2text.transform_documents(
+    docs_transformed = html2text.transform_documents(docs)
+    for doc in docs_transformed:
+        doc.page_content = re.sub(r'\n\n+|\n+|\s+', ' ', doc.page_content)
     return docs_transformed
 
 
 def remove_tables(docs):
+    for doc in docs:
+        table_pattern = re.compile(r'<table.*?>.*?</table>', re.DOTALL)
+        doc.page_content = table_pattern.sub('', doc.page_content)
     return docs
 
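Note: remove_tables strips <table> markup from the raw HTML before Html2TextTransformer runs, and clean_text then collapses whitespace. In r'\n\n+|\n+|\s+' the last alternative \s+ already matches newlines, so the pattern behaves the same as r'\s+' alone. A quick standalone check of the two regex steps (made-up input; the transformer step is skipped for brevity):

    import re

    # Made-up page content with a table and noisy whitespace.
    html = "<p>intro</p>\n\n<table><tr><td>1</td></tr></table>\n\nend   here"

    # Same table-stripping pattern as remove_tables().
    table_pattern = re.compile(r'<table.*?>.*?</table>', re.DOTALL)
    no_tables = table_pattern.sub('', html)

    # Same whitespace collapse as clean_text().
    cleaned = re.sub(r'\n\n+|\n+|\s+', ' ', no_tables)
    print(cleaned)  # -> "<p>intro</p> end here"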
@@ -44,14 +55,34 @@ def format_chunks_with_spaces(chunks):
         formatted_chunks += separator
     return formatted_chunks
 
+def format_metdata(docs):
+    formatted_metadata = ""
+    for i, doc in enumerate(docs):
+        formatted_metadata += f"Metadata {i+1}: \n\n"
+        formatted_metadata += str(doc.metadata)
+        formatted_metadata += "\n\n---\n\n"
+    return formatted_metadata
+
+def format_page_content(docs):
+    formatted_docs = ""
+    for i, doc in enumerate(docs):
+        formatted_docs += f"Page Content {i+1}: \n\n"
+        formatted_docs += str(doc.page_content)
+        formatted_docs += "\n\n---\n\n"
+    return formatted_docs
+
+
+def get_tables(raw_docs):
+    tables_list = []
+    for raw_doc in raw_docs:
+        try:
+            tables = pd.read_html(StringIO(str(raw_doc.page_content)))
+            tables_list.extend(tables)
+        except Exception as e:
+            print(f"Error reading table: {e}")
+            continue
+
+    return tables_list
 
 
 def concat_dfs(df_list):
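Note: get_tables relies on pandas.read_html, which returns one DataFrame per <table> element it finds; wrapping the markup in StringIO sidesteps the FutureWarning newer pandas versions emit for literal HTML strings. A small self-contained example (made-up table; read_html needs lxml, or bs4 plus html5lib, installed):

    from io import StringIO
    import pandas as pd

    # Made-up HTML fragment containing a single table.
    html = """
    <table>
      <tr><th>city</th><th>pop</th></tr>
      <tr><td>Paris</td><td>2.1</td></tr>
      <tr><td>Lyon</td><td>0.5</td></tr>
    </table>
    """

    tables = pd.read_html(StringIO(html))  # one DataFrame per <table>
    print(tables[0])  # columns: city, pop; rows: Paris/2.1, Lyon/0.5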
@@ -59,26 +90,25 @@ def concat_dfs(df_list):
     return concatenated_df
 
 
-def get_docs(url):
-    raw_html = scrape_text(url)
+def get_docs(url, max_depth):
+    raw_html = scrape_text(url, max_depth)
     if raw_html is None:
         return None, None, None, None, None
 
-    tables_list = get_tables(raw_html
-    if tables_list is not None:
+    tables_list = get_tables(raw_html)
+    if tables_list:
         concat_tables = concat_dfs(tables_list)
     else:
         concat_tables = None
 
-    tables_rmv_html = remove_tables(raw_html
+    tables_rmv_html = remove_tables(raw_html)
     clean_docs = clean_text(tables_rmv_html)
 
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=200)
     documents_splits = text_splitter.split_documents(clean_docs)
     formatted_chunks = format_chunks_with_spaces(documents_splits)
 
-    return raw_html
+    return format_page_content(raw_html), format_page_content(clean_docs), concat_tables, format_metdata(raw_html), formatted_chunks
 
 
 with gr.Blocks(theme=gr.themes.Soft(primary_hue="purple", secondary_hue="indigo")) as demo:
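Note: concat_dfs itself is untouched by this commit and its body sits outside the hunk; judging only from the name and the return concatenated_df context line, a plausible implementation, shown here purely as an assumption rather than the committed code, would be:

    import pandas as pd

    def concat_dfs(df_list):
        # Assumed body: stack all scraped tables into one DataFrame.
        concatenated_df = pd.concat(df_list, ignore_index=True)
        return concatenated_df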
@@ -95,7 +125,9 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="purple", secondary_hue="indigo"
     with gr.Row():
         with gr.Column(scale=1):
             url_input = gr.Textbox(lines=5, label="URL", placeholder="Enter your URL here...")
+            with gr.Row():
+                max_depth = gr.Slider(1, 50, value=1, step=1, label="Max Depth", interactive=True)
+            scarpe_url_button = gr.Button(value="Scrape & Create Embeddings", variant="primary")
 
         with gr.Column(elem_id = "col_container", scale=2):
             with gr.Tabs():
@@ -115,7 +147,7 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="purple", secondary_hue="indigo"
                 metadata = gr.Textbox(lines=TAB_LINES, label="Metadata", value="", interactive=False,
                                       autoscroll=False)
 
-    scarpe_url_button.click(get_docs, inputs=url_input, outputs=[raw_page_content, page_content, tables,
+    scarpe_url_button.click(get_docs, inputs=[url_input, max_depth], outputs=[raw_page_content, page_content, tables,
                             metadata, parsed_chunks])
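Note: with inputs passed as a list, Gradio maps the two components positionally onto get_docs(url, max_depth), and the five values it returns fill the five output components in order. A stripped-down sketch of the same wiring (component names and the handler body are illustrative, not the committed code):

    import gradio as gr

    def scrape(url, max_depth):
        # Stub standing in for get_docs(); one return value per output component.
        return f"scraped {url} to depth {max_depth}"

    with gr.Blocks() as demo:
        url_input = gr.Textbox(label="URL")
        max_depth = gr.Slider(1, 50, value=1, step=1, label="Max Depth")
        button = gr.Button("Scrape")
        result = gr.Textbox(label="Result")
        # Inputs are matched to scrape()'s parameters positionally.
        button.click(scrape, inputs=[url_input, max_depth], outputs=[result])

    demo.launch()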