Commit f2a2588
Parent: b05b1be
completed mcp v1
Files changed:
- .gitignore +2 -0
- .gradio/certificate.pem +31 -0
- app.py +82 -0
- web2json/__pycache__/ai_extractor.cpython-311.pyc +0 -0
- web2json/__pycache__/pipeline.cpython-311.pyc +0 -0
- web2json/__pycache__/postprocessor.cpython-311.pyc +0 -0
- web2json/__pycache__/preprocessor.cpython-311.pyc +0 -0
- web2json/ai_extractor.py +126 -0
- web2json/pipeline.py +43 -0
- web2json/postprocessor.py +27 -0
- web2json/preprocessor.py +138 -0
.gitignore
ADDED
@@ -0,0 +1,2 @@
+.env
+test.ipynb
.gradio/certificate.pem
ADDED
@@ -0,0 +1,31 @@
+-----BEGIN CERTIFICATE-----
+MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+-----END CERTIFICATE-----
app.py
ADDED
@@ -0,0 +1,82 @@
+import json
+import pandas as pd
+import gradio as gr
+from typing import Dict, Any
+from web2json.preprocessor import BasicPreprocessor
+from web2json.ai_extractor import AIExtractor, GeminiLLMClient
+from web2json.postprocessor import PostProcessor
+from web2json.pipeline import Pipeline
+from pydantic import BaseModel, Field
+import os
+import dotenv
+
+dotenv.load_dotenv()
+
+# Define schemas
+class Article(BaseModel):
+    title: str = Field(..., description="The title of the article.")
+    author: str = Field(..., description="The author of the article.")
+    content: str = Field(..., description="The main content of the article.")
+
+class Product(BaseModel):
+    name: str = Field(..., description="The name of the product.")
+    description: str = Field(..., description="A detailed description of the product.")
+    price: float = Field(..., description="The price of the product.")
+
+class JobPosting(BaseModel):
+    title: str = Field(..., description="The title of the job position.")
+    company: str = Field(..., description="The name of the company offering the job.")
+    location: str = Field(..., description="The location of the job.")
+    description: str = Field(..., description="A detailed description of the job responsibilities.")
+
+SCHEMA_OPTIONS = {
+    "Article": Article,
+    "Product": Product,
+    "Job Posting": JobPosting,
+}
+
+# Core processing function
+
+def webpage_to_json(content: str, is_url: bool, schema_name: str) -> Dict[str, Any]:
+    if schema_name not in SCHEMA_OPTIONS:
+        return {"error": f"Invalid schema name: {schema_name}. Choose from: {', '.join(SCHEMA_OPTIONS.keys())}"}
+
+    schema = SCHEMA_OPTIONS[schema_name]
+    prompt_template = "extract the following information: {content} based on schema: {schema}"
+
+    # Initialize pipeline components
+    preprocessor = BasicPreprocessor(config={'keep_tags': False})
+    try:
+        llm = GeminiLLMClient(config={'api_key': os.getenv('GEMINI_API_KEY')})
+    except Exception as e:
+        return {"error": f"Failed to initialize LLM client: {str(e)}"}
+
+    ai_extractor = AIExtractor(llm_client=llm, prompt_template=prompt_template)
+    postprocessor = PostProcessor()
+    pipeline = Pipeline(preprocessor, ai_extractor, postprocessor)
+
+    try:
+        result = pipeline.run(content, is_url, schema)
+        # print("-"*80)
+        # print(f"Processed result: {result}")
+        return result
+    except Exception as e:
+        return {"error": f"Processing error: {str(e)}"}
+
+# Build Gradio Interface
+demo = gr.Interface(
+    fn=webpage_to_json,
+    inputs=[
+        gr.Textbox(label="Content (URL or Raw Text)", lines=10,
+                   placeholder="Enter URL or paste raw HTML/text here."),
+        gr.Checkbox(label="Content is URL?", value=False),
+        gr.Dropdown(choices=list(SCHEMA_OPTIONS.keys()),
+                    label="Select Schema", value="Article")
+    ],
+    outputs=gr.JSON(label="Output JSON"),
+    title="Webpage to JSON Converter",
+    description="Convert web pages or raw text into structured JSON using customizable schemas."
+)
+
+if __name__ == "__main__":
+    demo.launch(mcp_server=True)
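
Because `demo.launch(mcp_server=True)` serves `webpage_to_json` both through the web UI and as an MCP tool, the app can also be driven programmatically. Below is a minimal sketch using `gradio_client`, assuming the app is already running locally; the URL is hypothetical and the positional arguments mirror the three `gr.Interface` inputs above.

# Minimal sketch: call the running app with gradio_client.
# Assumes `python app.py` is serving at this (hypothetical) local address.
from gradio_client import Client

client = Client("http://127.0.0.1:7860")
result = client.predict(
    "https://example.com/some-article",  # content (URL or raw text)
    True,                                # "Content is URL?" checkbox
    "Article",                           # schema dropdown
)
print(result)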
web2json/__pycache__/ai_extractor.cpython-311.pyc
ADDED
Binary file (6.46 kB)

web2json/__pycache__/pipeline.cpython-311.pyc
ADDED
Binary file (2.49 kB)

web2json/__pycache__/postprocessor.cpython-311.pyc
ADDED
Binary file (1.65 kB)

web2json/__pycache__/preprocessor.cpython-311.pyc
ADDED
Binary file (5.68 kB)
web2json/ai_extractor.py
ADDED
@@ -0,0 +1,126 @@
+import os
+from abc import ABC, abstractmethod
+from google import genai
+from google.genai import types
+from pydantic import BaseModel
+
+class LLMClient(ABC):
+    """
+    Abstract base class for calling LLM APIs.
+    """
+    def __init__(self, config: dict = None):
+        """
+        Initializes the LLMClient with a configuration dictionary.
+
+        Args:
+            config (dict): Configuration settings for the LLM client.
+        """
+        self.config = config or {}
+
+    @abstractmethod
+    def call_api(self, prompt: str) -> str:
+        """
+        Call the underlying LLM API with the given prompt.
+
+        Args:
+            prompt (str): The prompt or input text for the LLM.
+
+        Returns:
+            str: The response from the LLM.
+        """
+        pass
+
+
+class GeminiLLMClient(LLMClient):
+    """
+    Concrete implementation of LLMClient for the Gemini API.
+    """
+
+    def __init__(self, config: dict):
+        """
+        Initializes the GeminiLLMClient with an API key, model name, and optional generation settings.
+
+        Args:
+            config (dict): Configuration containing:
+                - 'api_key': (optional) API key for Gemini (falls back to GEMINI_API_KEY env var)
+                - 'model_name': (optional) the model to use (default 'gemini-2.0-flash')
+                - 'generation_config': (optional) dict of GenerateContentConfig parameters
+        """
+        api_key = config.get("api_key") or os.environ.get("GEMINI_API_KEY")
+        if not api_key:
+            raise ValueError(
+                "API key for Gemini must be provided in config['api_key'] or GEMINI_API_KEY env var."
+            )
+        self.client = genai.Client(api_key=api_key)
+        self.model_name = config.get("model_name", "gemini-2.0-flash")
+        # allow custom generation settings, fall back to sensible defaults
+        gen_conf = config.get("generation_config", {})
+        self.generate_config = types.GenerateContentConfig(
+            response_mime_type=gen_conf.get("response_mime_type", "text/plain"),
+            temperature=gen_conf.get("temperature"),
+            max_output_tokens=gen_conf.get("max_output_tokens"),
+            top_p=gen_conf.get("top_p"),
+            top_k=gen_conf.get("top_k"),
+            # add any other fields you want to expose
+        )
+
+    def call_api(self, prompt: str) -> str:
+        """
+        Call the Gemini API with the given prompt (non-streaming).
+
+        Args:
+            prompt (str): The input text for the API.
+
+        Returns:
+            str: The generated text from the Gemini API.
+        """
+        contents = [
+            types.Content(
+                role="user",
+                parts=[types.Part.from_text(text=prompt)],
+            )
+        ]
+
+        # Non-streaming call returns a full response object
+        response = self.client.models.generate_content(
+            model=self.model_name,
+            contents=contents,
+            config=self.generate_config,
+        )
+
+        # Return the combined output text
+        return response.text
+
+
+class AIExtractor:
+    def __init__(self, llm_client: LLMClient, prompt_template: str):
+        """
+        Initializes the AIExtractor with a specific LLM client and configuration.
+
+        Args:
+            llm_client (LLMClient): An instance of a class that implements the LLMClient interface.
+            prompt_template (str): The template to use for generating prompts for the LLM.
+                Should contain placeholders for dynamic content,
+                e.g., "Extract the following information: {content} based on schema: {schema}"
+        """
+        self.llm_client = llm_client
+        self.prompt_template = prompt_template
+
+    def extract(self, content: str, schema: BaseModel) -> str:
+        """
+        Extracts structured information from the given content based on the provided schema.
+
+        Args:
+            content (str): The raw content to extract information from.
+            schema (BaseModel): A Pydantic model defining the structure of the expected output.
+
+        Returns:
+            str: The structured JSON object as a string.
+        """
+        prompt = self.prompt_template.format(content=content, schema=schema.model_json_schema())
+        # print(f"Generated prompt: {prompt}")
+        response = self.llm_client.call_api(prompt)
+        return response
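
The `LLMClient` base class is the seam for swapping model providers: a new backend only has to implement `call_api`. Below is a hypothetical sketch of a second backend on the OpenAI chat API; it is not part of this commit, and the default model name is an assumption.

# Hypothetical alternative backend (not in this commit); mirrors
# GeminiLLMClient's config/env-var fallback for the API key.
import os
from openai import OpenAI
from web2json.ai_extractor import LLMClient

class OpenAILLMClient(LLMClient):
    def __init__(self, config: dict = None):
        super().__init__(config)
        api_key = self.config.get("api_key") or os.environ.get("OPENAI_API_KEY")
        self.client = OpenAI(api_key=api_key)
        self.model_name = self.config.get("model_name", "gpt-4o-mini")  # assumed default

    def call_api(self, prompt: str) -> str:
        # Single-turn, non-streaming chat completion
        response = self.client.chat.completions.create(
            model=self.model_name,
            messages=[{"role": "user", "content": prompt}],
        )
        return response.choices[0].message.content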
web2json/pipeline.py
ADDED
@@ -0,0 +1,43 @@
+from web2json.ai_extractor import *
+from web2json.postprocessor import *
+from web2json.preprocessor import *
+from pydantic import BaseModel
+
+class Pipeline:
+
+    def __init__(self,
+                 preprocessor: Preprocessor,
+                 ai_extractor: AIExtractor,
+                 postprocessor: PostProcessor):
+        self.preprocessor = preprocessor
+        self.ai_extractor = ai_extractor
+        self.postprocessor = postprocessor
+
+    def run(self, content: str, is_url: bool, schema: BaseModel) -> dict:
+        """
+        Run the entire pipeline: preprocess, extract, and postprocess.
+
+        Args:
+            content (str): The raw content to process.
+            is_url (bool): Whether the content is a URL or raw text.
+            schema (BaseModel): The schema defining the structure of the expected output.
+
+        Returns:
+            dict: The final structured data after processing.
+        """
+        # Step 1: Preprocess the content
+        preprocessed_content = self.preprocessor.preprocess(content, is_url)
+        # print(f"Preprocessed content: {preprocessed_content[:100]}...")
+        # print('+'*80)
+        # Step 2: Extract structured information using AI
+        extracted_data = self.ai_extractor.extract(preprocessed_content, schema)
+        # print(f"Extracted data: {extracted_data[:100]}...")
+        # print('+'*80)
+        # Step 3: Post-process the extracted data
+        final_output = self.postprocessor.process(extracted_data)
+        # print(f"Final output: {final_output}")
+        # print('+'*80)
+
+        return final_output
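
`Pipeline` is a plain three-stage composition, so it can be driven outside the Gradio UI as well. A minimal sketch using only classes from this commit; the `Recipe` schema and URL are illustrative, and `GEMINI_API_KEY` must be set in the environment.

# Minimal sketch: run the pipeline directly, without the Gradio app.
import os
from pydantic import BaseModel, Field
from web2json.preprocessor import BasicPreprocessor
from web2json.ai_extractor import AIExtractor, GeminiLLMClient
from web2json.postprocessor import PostProcessor
from web2json.pipeline import Pipeline

# Illustrative schema; any Pydantic model works.
class Recipe(BaseModel):
    name: str = Field(..., description="The recipe name.")
    servings: int = Field(..., description="Number of servings.")

pipeline = Pipeline(
    preprocessor=BasicPreprocessor(config={'keep_tags': False}),
    ai_extractor=AIExtractor(
        llm_client=GeminiLLMClient(config={'api_key': os.getenv('GEMINI_API_KEY')}),
        prompt_template="extract the following information: {content} based on schema: {schema}",
    ),
    postprocessor=PostProcessor(),
)

result = pipeline.run("https://example.com/recipe", is_url=True, schema=Recipe)
print(result)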
web2json/postprocessor.py
ADDED
@@ -0,0 +1,27 @@
+from json_repair import repair_json
+import json
+
+class PostProcessor:
+
+    def process(self, response: str) -> dict:
+        json_response = {}
+        try:
+            # Extract the JSON from the generated text. Handle variations in output format.
+            json_string = response
+            if "```json" in response:
+                json_string = response.split("```json")[1].split("```")[0]
+            elif "{" in response and "}" in response:
+                # try to grab the json
+                start_index = response.find("{")
+                end_index = response.rfind("}") + 1
+                json_string = response[start_index:end_index]
+
+            json_response = json.loads(repair_json(json_string))  # Added for robustness
+        except Exception as e:
+            print(f"Error parsing JSON: {e}")
+            print(f"Generated text: {response}")
+            json_response = {}
+
+        return json_response
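
The fallback chain above (fenced ```json block first, then the outermost braces, then repair via json_repair) means prose-wrapped or slightly malformed model output still parses. A quick illustration; the expected outputs in the comments assume json_repair fixes the trailing comma.

# Quick illustration of the fallback parsing paths.
from web2json.postprocessor import PostProcessor

pp = PostProcessor()

fenced = 'Sure!\n```json\n{"title": "Hello"}\n```'
bare = 'Here it is: {"title": "Hello", "author": "Ada",}'  # trailing comma

print(pp.process(fenced))  # {'title': 'Hello'}
print(pp.process(bare))    # {'title': 'Hello', 'author': 'Ada'}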
web2json/preprocessor.py
ADDED
@@ -0,0 +1,138 @@
+import re
+import requests
+from bs4 import BeautifulSoup, Comment
+from abc import ABC, abstractmethod
+from typing import Any, Dict, Optional
+
+
+class Preprocessor(ABC):
+    """
+    Abstract base class for preprocessors.
+    Defines the interface for transforming raw inputs into cleaned text.
+    """
+
+    def __init__(self, config: Optional[Dict[str, Any]] = None) -> None:
+        """
+        Initialize the preprocessor with optional configuration.
+
+        Args:
+            config: A dictionary of configuration settings.
+                - keep_tags (bool): If True, keeps HTML tags in the output; otherwise, cleans them.
+        """
+        self.config = config if config is not None else {'keep_tags': False}
+
+    def _fetch_content(self, url: str) -> str:
+        """
+        Fetches the raw HTML content from a URL.
+
+        Args:
+            url: The URL to fetch content from.
+
+        Returns:
+            The response body as text.
+
+        Raises:
+            ValueError: If the URL cannot be fetched.
+        """
+        try:
+            # Set browser-like headers, which can help avoid being
+            # blocked by some websites.
+            headers = {
+                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+                'Accept-Language': 'en-US,en;q=0.9',
+                'Accept-Encoding': 'gzip, deflate, br',
+                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
+                'Connection': 'keep-alive',
+            }
+
+            # Make the HTTP GET request with a timeout.
+            response = requests.get(url, headers=headers, timeout=15)
+            response.raise_for_status()  # Treat HTTP error statuses as failures.
+
+            return response.text
+
+        except requests.exceptions.RequestException as e:
+            # Catch any network-related errors (DNS, connection, timeout, etc.)
+            # and re-raise them as a more user-friendly ValueError.
+            raise ValueError(f"Failed to fetch content from URL: {url}. Error: {e}")
+
+    @abstractmethod
+    def preprocess(self, content: str, is_url: bool) -> str:
+        """
+        Take raw content (HTML, text, etc.) and apply preprocessing steps.
+
+        Args:
+            content: The raw data to preprocess.
+            is_url: Whether `content` is a URL to fetch or raw text.
+
+        Returns:
+            The cleaned text, ready for downstream tasks.
+        """
+        pass
+
+class BasicPreprocessor(Preprocessor):
+    """
+    Basic preprocessor with common functionality.
+    Can be extended for specific preprocessing tasks.
+    """
+    # TODO: Might need to think of how to improve this later
+    def _clean_html(self, html_content: str) -> str:
+        """
+        Cleans up the given HTML content by:
+        - Removing <script> and <style> tags and their content.
+        - Removing HTML comments.
+        - Extracting and returning the visible text with normalized whitespace if keep_tags is False.
+
+        Args:
+            html_content (str): The HTML content to clean.
+
+        Returns:
+            str: The cleaned, visible text from the HTML (or the pruned HTML if keep_tags is True).
+        """
+        # Parse the HTML content
+        soup = BeautifulSoup(html_content, "html.parser")
+
+        # Remove script and style elements
+        for tag in soup(["script", "style"]):
+            tag.decompose()
+
+        # Remove HTML comments
+        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
+            comment.extract()
+
+        # If keep_tags is True, return the pruned HTML with tags intact
+        if self.config.get('keep_tags', False):
+            return str(soup)
+
+        # Extract text and normalize whitespace
+        text = soup.get_text(separator=" ", strip=True)
+        clean_text = re.sub(r'\s+', ' ', text)
+
+        return clean_text
+
+    def preprocess(self, content: str, is_url: bool) -> str:
+        """
+        Take raw content (HTML, text, etc.) and apply preprocessing steps.
+
+        Args:
+            content: The raw data to preprocess.
+            is_url: Whether `content` is a URL to fetch or raw text.
+
+        Returns:
+            The cleaned text content, ready for downstream tasks.
+        """
+        html_content = content
+        if is_url:
+            # Fetch content from the URL
+            html_content = self._fetch_content(content)
+
+        # Clean the HTML content
+        cleaned_content = self._clean_html(html_content)
+
+        return cleaned_content.strip()  # Return the cleaned text, stripped of leading/trailing whitespace
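
The `keep_tags` flag is the only switch in `BasicPreprocessor`: it decides between stripped text and pruned HTML. A small demonstration on inline HTML; no network access is needed, since `is_url=False` bypasses `_fetch_content`.

# Small demonstration of the keep_tags switch on inline HTML.
from web2json.preprocessor import BasicPreprocessor

html = (
    "<html><head><style>p { color: red; }</style></head>"
    "<body><!-- comment --><h1>Title</h1><script>alert(1)</script>"
    "<p>Some    text.</p></body></html>"
)

text_only = BasicPreprocessor(config={'keep_tags': False})
with_tags = BasicPreprocessor(config={'keep_tags': True})

print(text_only.preprocess(html, is_url=False))  # Title Some text.
print(with_tags.preprocess(html, is_url=False))  # pruned HTML, tags kept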