Mustehson committed
Commit cbb92c4 · 1 Parent(s): 59cce11

Added Depth Support for Scraping

Files changed (2):
  1. __pycache__/app.cpython-311.pyc +0 -0
  2. app.py +58 -26

__pycache__/app.cpython-311.pyc ADDED
Binary file (9.85 kB)
 
app.py CHANGED
@@ -9,11 +9,21 @@ from langchain_community.document_transformers import Html2TextTransformer
 
 TAB_LINES = 22
 
+def html_only_metadata_extractor(raw_html, url, response):
+    content_type = response.headers.get("Content-Type", "")
+    if "text/html" in content_type:
+        return {"source": url, "content_type": content_type}
+    return {}
 
-def scrape_text(url):
+def scrape_text(url, max_depth):
     try:
-        loader = RecursiveUrlLoader(url=url, max_depth=None,
-                                    prevent_outside=True, check_response_status=True)
+        loader = RecursiveUrlLoader(
+            url=url,
+            max_depth=max_depth,
+            check_response_status=True,
+            metadata_extractor=html_only_metadata_extractor,
+            prevent_outside=True,
+        )
         documents = loader.load()
     except Exception as e:
         print(f"Error loading URL: {e}")
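For context, a minimal standalone sketch of the new loader call (the start URL below is hypothetical; the parameters mirror the hunk above). max_depth=1 fetches only the start page, while larger values follow links recursively:

from langchain_community.document_loaders import RecursiveUrlLoader

def html_only_metadata_extractor(raw_html, url, response):
    # Record metadata only for HTML responses, as in this commit.
    content_type = response.headers.get("Content-Type", "")
    if "text/html" in content_type:
        return {"source": url, "content_type": content_type}
    return {}

loader = RecursiveUrlLoader(
    url="https://example.com/docs/",  # hypothetical start page
    max_depth=2,                      # follow links one level below the start page
    check_response_status=True,       # skip pages that return an error status
    metadata_extractor=html_only_metadata_extractor,
    prevent_outside=True,             # stay under the start URL, no external links
)
docs = loader.load()                  # one Document per fetched page
print(len(docs), docs[0].metadata)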
@@ -21,17 +31,18 @@ def scrape_text(url):
     return documents
 
 
-def clean_text(documents):
+def clean_text(docs):
     html2text = Html2TextTransformer()
-    docs_transformed = html2text.transform_documents([documents])
-    cleaned_string = re.sub(r'\n\n+|\n+|\s+', ' ', docs_transformed[0].page_content)
-    docs_transformed[0].page_content = cleaned_string
+    docs_transformed = html2text.transform_documents(docs)
+    for doc in docs_transformed:
+        doc.page_content = re.sub(r'\n\n+|\n+|\s+', ' ', doc.page_content)
     return docs_transformed
 
 
 def remove_tables(docs):
-    table_pattern = re.compile(r'<table.*?>.*?</table>', re.DOTALL)
-    docs.page_content = table_pattern.sub('', docs.page_content)
+    for doc in docs:
+        table_pattern = re.compile(r'<table.*?>.*?</table>', re.DOTALL)
+        doc.page_content = table_pattern.sub('', doc.page_content)
     return docs
 
 
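clean_text() and remove_tables() now loop over the full list of Documents instead of assuming a single one. The two regular expressions themselves are unchanged; to illustrate what they do:

import re

# Whitespace collapsing from clean_text(): any run of newlines or spaces
# becomes a single space.
text = "Title\n\n\nFirst   line.\nSecond line."
print(re.sub(r'\n\n+|\n+|\s+', ' ', text))
# -> 'Title First line. Second line.'

# Table stripping from remove_tables(): re.DOTALL lets '.' span newlines,
# so whole <table>...</table> blocks are removed.
html = "<p>Intro</p><table><tr><td>1</td></tr></table><p>Outro</p>"
print(re.compile(r'<table.*?>.*?</table>', re.DOTALL).sub('', html))
# -> '<p>Intro</p><p>Outro</p>'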
@@ -44,14 +55,34 @@ def format_chunks_with_spaces(chunks):
         formatted_chunks += separator
     return formatted_chunks
 
-
-def get_tables(raw_html):
-    try:
-        tables = pd.read_html(StringIO(str(raw_html.page_content)))
-    except Exception as e:
-        print(f"Error reading table: {e}")
-        return None
-    return tables
+def format_metdata(docs):
+    formatted_metadata = ""
+    for i, doc in enumerate(docs):
+        formatted_metadata += f"Metadata {i+1}: \n\n"
+        formatted_metadata += str(doc.metadata)
+        formatted_metadata += "\n\n---\n\n"
+    return formatted_metadata
+
+def format_page_content(docs):
+    formatted_docs = ""
+    for i, doc in enumerate(docs):
+        formatted_docs += f"Page Content {i+1}: \n\n"
+        formatted_docs += str(doc.page_content)
+        formatted_docs += "\n\n---\n\n"
+    return formatted_docs
+
+
+def get_tables(raw_docs):
+    tables_list = []
+    for raw_doc in raw_docs:
+        try:
+            tables = pd.read_html(StringIO(str(raw_doc.page_content)))
+            tables_list.extend(tables)
+        except Exception as e:
+            print(f"Error reading table: {e}")
+            continue
+
+    return tables_list
 
 
 def concat_dfs(df_list):
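get_tables() now aggregates tables across every scraped page instead of just the first document. pandas.read_html() returns one DataFrame per <table> element it finds and raises when a page has none, which is why each document gets its own try/except. A small illustration, assuming pandas plus an HTML parser such as lxml is installed:

from io import StringIO
import pandas as pd

html = """
<table>
  <tr><th>city</th><th>population</th></tr>
  <tr><td>Oslo</td><td>709000</td></tr>
</table>
"""
tables = pd.read_html(StringIO(html))  # list with one DataFrame
# Prints a one-row DataFrame with columns 'city' and 'population'.
print(tables[0])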
@@ -59,26 +90,25 @@ def concat_dfs(df_list):
     return concatenated_df
 
 
-def get_docs(url):
-    raw_html = scrape_text(url)
+def get_docs(url, max_depth):
+    raw_html = scrape_text(url, max_depth)
     if raw_html is None:
         return None, None, None, None, None
 
-    tables_list = get_tables(raw_html[0])
-
-    if tables_list is not None:
+    tables_list = get_tables(raw_html)
+    if tables_list:
         concat_tables = concat_dfs(tables_list)
     else:
         concat_tables = None
 
-    tables_rmv_html = remove_tables(raw_html[0])
+    tables_rmv_html = remove_tables(raw_html)
     clean_docs = clean_text(tables_rmv_html)
 
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=200)
     documents_splits = text_splitter.split_documents(clean_docs)
     formatted_chunks = format_chunks_with_spaces(documents_splits)
 
-    return raw_html[0].page_content, clean_docs[0].page_content, concat_tables, raw_html[0].metadata, formatted_chunks
+    return format_page_content(raw_html), format_page_content(clean_docs), concat_tables, format_metdata(raw_html), formatted_chunks
 
 
 with gr.Blocks(theme=gr.themes.Soft(primary_hue="purple", secondary_hue="indigo")) as demo:
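get_docs() now threads max_depth through to scrape_text() and formats every document rather than indexing the first. The chunking step is unchanged; for reference, a sketch of the splitter with the same settings (the filler document is made up, and the import path assumes the standalone langchain-text-splitters package):

from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=200)
docs = [Document(page_content="word " * 1000)]  # ~5000 characters of filler
chunks = splitter.split_documents(docs)
# Chunks are at most 1024 characters, with roughly 200 characters of overlap.
print(len(chunks), max(len(c.page_content) for c in chunks))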
@@ -95,7 +125,9 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="purple", secondary_hue="indigo"
     with gr.Row():
         with gr.Column(scale=1):
             url_input = gr.Textbox(lines=5, label="URL", placeholder="Enter your URL here...")
-            scarpe_url_button = gr.Button(value="Scrape & Create Embeddings", variant="primary")
+            with gr.Row():
+                max_depth = gr.Slider(1, 50, value=1, step=1, label="Max Depth", interactive=True)
+                scarpe_url_button = gr.Button(value="Scrape & Create Embeddings", variant="primary")
 
         with gr.Column(elem_id = "col_container", scale=2):
             with gr.Tabs():
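The new depth slider sits next to the scrape button, and its value is passed to the callback alongside the URL, which is why the .click() wiring in the final hunk now takes inputs=[url_input, max_depth]. A stripped-down sketch of the same pattern (the handler body is a placeholder standing in for get_docs):

import gradio as gr

def crawl(url, max_depth):
    # Stand-in for get_docs(url, max_depth).
    return f"Would crawl {url} to depth {int(max_depth)}"

with gr.Blocks() as demo:
    url_input = gr.Textbox(lines=5, label="URL")
    max_depth = gr.Slider(1, 50, value=1, step=1, label="Max Depth", interactive=True)
    result = gr.Textbox(label="Result")
    gr.Button("Scrape").click(crawl, inputs=[url_input, max_depth], outputs=result)

demo.launch()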
@@ -115,7 +147,7 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="purple", secondary_hue="indigo"
                     metadata = gr.Textbox(lines=TAB_LINES, label="Metadata", value="", interactive=False,
                                           autoscroll=False)
 
-    scarpe_url_button.click(get_docs, inputs=url_input, outputs=[raw_page_content, page_content, tables,
+    scarpe_url_button.click(get_docs, inputs=[url_input, max_depth], outputs=[raw_page_content, page_content, tables,
                                                                  metadata, parsed_chunks])
 
 