abdo-Mansour committed on
Commit f2a2588 · 1 Parent(s): b05b1be

completed mcp v1

.gitignore ADDED
@@ -0,0 +1,2 @@
+ .env
+ test.ipynb
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
+ -----BEGIN CERTIFICATE-----
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+ -----END CERTIFICATE-----
app.py ADDED
@@ -0,0 +1,82 @@
+ import json
+ import pandas as pd
+ import gradio as gr
+ from typing import Dict, Any
+ from web2json.preprocessor import BasicPreprocessor
+ from web2json.ai_extractor import AIExtractor, GeminiLLMClient
+ from web2json.postprocessor import PostProcessor
+ from web2json.pipeline import Pipeline
+ from pydantic import BaseModel, Field
+ import os
+ import dotenv
+
+ dotenv.load_dotenv()
+
+ # Define schemas
+ class Article(BaseModel):
+     title: str = Field(..., description="The title of the article.")
+     author: str = Field(..., description="The author of the article.")
+     content: str = Field(..., description="The main content of the article.")
+
+ class Product(BaseModel):
+     name: str = Field(..., description="The name of the product.")
+     description: str = Field(..., description="A detailed description of the product.")
+     price: float = Field(..., description="The price of the product.")
+
+ class JobPosting(BaseModel):
+     title: str = Field(..., description="The title of the job position.")
+     company: str = Field(..., description="The name of the company offering the job.")
+     location: str = Field(..., description="The location of the job.")
+     description: str = Field(..., description="A detailed description of the job responsibilities.")
+
+ SCHEMA_OPTIONS = {
+     "Article": Article,
+     "Product": Product,
+     "Job Posting": JobPosting,
+ }
+
+ # Core processing function
+ def webpage_to_json(content: str, is_url: bool, schema_name: str) -> Dict[str, Any]:
+     if schema_name not in SCHEMA_OPTIONS:
+         return {"error": f"Invalid schema name: {schema_name}. Choose from: {', '.join(SCHEMA_OPTIONS.keys())}"}
+
+     schema = SCHEMA_OPTIONS[schema_name]
+     prompt_template = "extract the following information: {content} based on schema: {schema}"
+
+     # Initialize pipeline components
+     preprocessor = BasicPreprocessor(config={'keep_tags': False})
+     try:
+         llm = GeminiLLMClient(config={'api_key': os.getenv('GEMINI_API_KEY')})
+     except Exception as e:
+         return {"error": f"Failed to initialize LLM client: {str(e)}"}
+
+     ai_extractor = AIExtractor(llm_client=llm, prompt_template=prompt_template)
+     postprocessor = PostProcessor()
+     pipeline = Pipeline(preprocessor, ai_extractor, postprocessor)
+
+     try:
+         result = pipeline.run(content, is_url, schema)
+         # print("-"*80)
+         # print(f"Processed result: {result}")
+         return result
+     except Exception as e:
+         return {"error": f"Processing error: {str(e)}"}
+
+ # Build Gradio Interface
+ demo = gr.Interface(
+     fn=webpage_to_json,
+     inputs=[
+         gr.Textbox(label="Content (URL or Raw Text)", lines=10,
+                    placeholder="Enter URL or paste raw HTML/text here."),
+         gr.Checkbox(label="Content is URL?", value=False),
+         gr.Dropdown(choices=list(SCHEMA_OPTIONS.keys()),
+                     label="Select Schema", value="Article")
+     ],
+     outputs=gr.JSON(label="Output JSON"),
+     title="Webpage to JSON Converter",
+     description="Convert web pages or raw text into structured JSON using customizable schemas."
+ )
+
+ if __name__ == "__main__":
+     demo.launch(mcp_server=True)
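
Note that `demo.launch(mcp_server=True)` exposes the interface both as a web UI and as an MCP server. For a quick local smoke test you can also bypass Gradio and call the core function directly; a minimal sketch, assuming the repo root is on `PYTHONPATH`, `GEMINI_API_KEY` is set in `.env`, and `https://example.com` stands in for any article-like page:

```python
# Hypothetical smoke test: exercise the core function without the UI.
from app import webpage_to_json

result = webpage_to_json(
    content="https://example.com",
    is_url=True,
    schema_name="Article",
)
print(result)  # dict matching the Article schema, or {"error": ...}
```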
web2json/__pycache__/ai_extractor.cpython-311.pyc ADDED
Binary file (6.46 kB).
 
web2json/__pycache__/pipeline.cpython-311.pyc ADDED
Binary file (2.49 kB).
 
web2json/__pycache__/postprocessor.cpython-311.pyc ADDED
Binary file (1.65 kB).
 
web2json/__pycache__/preprocessor.cpython-311.pyc ADDED
Binary file (5.68 kB).
 
web2json/ai_extractor.py ADDED
@@ -0,0 +1,126 @@
+ import os
+ from abc import ABC, abstractmethod
+ from google import genai
+ from google.genai import types
+ from pydantic import BaseModel
+
+ class LLMClient(ABC):
+     """
+     Abstract base class for calling LLM APIs.
+     """
+     def __init__(self, config: dict = None):
+         """
+         Initializes the LLMClient with a configuration dictionary.
+
+         Args:
+             config (dict): Configuration settings for the LLM client.
+         """
+         self.config = config or {}
+
+     @abstractmethod
+     def call_api(self, prompt: str) -> str:
+         """
+         Call the underlying LLM API with the given prompt.
+
+         Args:
+             prompt (str): The prompt or input text for the LLM.
+
+         Returns:
+             str: The response from the LLM.
+         """
+         pass
+
+
+ class GeminiLLMClient(LLMClient):
+     """
+     Concrete implementation of LLMClient for the Gemini API.
+     """
+
+     def __init__(self, config: dict):
+         """
+         Initializes the GeminiLLMClient with an API key, model name, and optional generation settings.
+
+         Args:
+             config (dict): Configuration containing:
+                 - 'api_key': (optional) API key for Gemini (falls back to GEMINI_API_KEY env var)
+                 - 'model_name': (optional) the model to use (default 'gemini-2.0-flash')
+                 - 'generation_config': (optional) dict of GenerateContentConfig parameters
+         """
+         api_key = config.get("api_key") or os.environ.get("GEMINI_API_KEY")
+         if not api_key:
+             raise ValueError(
+                 "API key for Gemini must be provided in config['api_key'] or GEMINI_API_KEY env var."
+             )
+         self.client = genai.Client(api_key=api_key)
+         self.model_name = config.get("model_name", "gemini-2.0-flash")
+         # Allow custom generation settings, falling back to sensible defaults.
+         gen_conf = config.get("generation_config", {})
+         self.generate_config = types.GenerateContentConfig(
+             response_mime_type=gen_conf.get("response_mime_type", "text/plain"),
+             temperature=gen_conf.get("temperature"),
+             max_output_tokens=gen_conf.get("max_output_tokens"),
+             top_p=gen_conf.get("top_p"),
+             top_k=gen_conf.get("top_k"),
+             # Add any other fields you want to expose.
+         )
+
+     def call_api(self, prompt: str) -> str:
+         """
+         Call the Gemini API with the given prompt (non-streaming).
+
+         Args:
+             prompt (str): The input text for the API.
+
+         Returns:
+             str: The generated text from the Gemini API.
+         """
+         contents = [
+             types.Content(
+                 role="user",
+                 parts=[types.Part.from_text(text=prompt)],
+             )
+         ]
+
+         # A non-streaming call returns a full response object.
+         response = self.client.models.generate_content(
+             model=self.model_name,
+             contents=contents,
+             config=self.generate_config,
+         )
+
+         # The response object exposes the combined output text directly.
+         return response.text
+
+
+ class AIExtractor:
+     def __init__(self, llm_client: LLMClient, prompt_template: str):
+         """
+         Initializes the AIExtractor with a specific LLM client and configuration.
+
+         Args:
+             llm_client (LLMClient): An instance of a class that implements the LLMClient interface.
+             prompt_template (str): The template to use for generating prompts for the LLM.
+                 Should contain placeholders for dynamic content,
+                 e.g., "Extract the following information: {content} based on schema: {schema}"
+         """
+         self.llm_client = llm_client
+         self.prompt_template = prompt_template
+
+     def extract(self, content: str, schema: BaseModel) -> str:
+         """
+         Extracts structured information from the given content based on the provided schema.
+
+         Args:
+             content (str): The raw content to extract information from.
+             schema (BaseModel): A Pydantic model defining the structure of the expected output.
+
+         Returns:
+             str: The structured JSON object as a string.
+         """
+         prompt = self.prompt_template.format(content=content, schema=schema.model_json_schema())
+         # print(f"Generated prompt: {prompt}")
+         response = self.llm_client.call_api(prompt)
+         return response
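
A minimal usage sketch for this module, assuming `GEMINI_API_KEY` is set in the environment (the client falls back to it when `config['api_key']` is absent); the `Product` schema and sample text here are illustrative, not part of the module:

```python
from pydantic import BaseModel, Field
from web2json.ai_extractor import AIExtractor, GeminiLLMClient

class Product(BaseModel):
    name: str = Field(..., description="The name of the product.")
    price: float = Field(..., description="The price of the product.")

llm = GeminiLLMClient(config={})  # picks up GEMINI_API_KEY from the env
extractor = AIExtractor(
    llm_client=llm,
    prompt_template="extract the following information: {content} based on schema: {schema}",
)
# Returns the raw LLM string; PostProcessor (below) turns it into a dict.
raw = extractor.extract("Acme Widget, only $9.99 while stocks last.", Product)
print(raw)
```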
web2json/pipeline.py ADDED
@@ -0,0 +1,43 @@
+ from web2json.ai_extractor import *
+ from web2json.postprocessor import *
+ from web2json.preprocessor import *
+ from pydantic import BaseModel
+
+ class Pipeline:
+
+     def __init__(self,
+                  preprocessor: Preprocessor,
+                  ai_extractor: AIExtractor,
+                  postprocessor: PostProcessor):
+         self.preprocessor = preprocessor
+         self.ai_extractor = ai_extractor
+         self.postprocessor = postprocessor
+
+     def run(self, content: str, is_url: bool, schema: BaseModel) -> dict:
+         """
+         Run the entire pipeline: preprocess, extract, and postprocess.
+
+         Args:
+             content (str): The raw content to process.
+             is_url (bool): Whether the content is a URL or raw text.
+             schema (BaseModel): The schema defining the structure of the expected output.
+
+         Returns:
+             dict: The final structured data after processing.
+         """
+         # Step 1: Preprocess the content
+         preprocessed_content = self.preprocessor.preprocess(content, is_url)
+         # print(f"Preprocessed content: {preprocessed_content[:100]}...")
+
+         # Step 2: Extract structured information using AI
+         extracted_data = self.ai_extractor.extract(preprocessed_content, schema)
+         # print(f"Extracted data: {extracted_data[:100]}...")
+
+         # Step 3: Post-process the extracted data
+         final_output = self.postprocessor.process(extracted_data)
+         # print(f"Final output: {final_output}")
+
+         return final_output
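
Wiring the three stages together mirrors what `app.py` does; a sketch, assuming a valid Gemini key in the environment and `https://example.com` as a stand-in URL:

```python
from pydantic import BaseModel, Field
from web2json.preprocessor import BasicPreprocessor
from web2json.ai_extractor import AIExtractor, GeminiLLMClient
from web2json.postprocessor import PostProcessor
from web2json.pipeline import Pipeline

class Article(BaseModel):
    title: str = Field(..., description="The title of the article.")

pipeline = Pipeline(
    BasicPreprocessor(config={'keep_tags': False}),
    AIExtractor(
        llm_client=GeminiLLMClient(config={}),  # falls back to GEMINI_API_KEY
        prompt_template="extract the following information: {content} based on schema: {schema}",
    ),
    PostProcessor(),
)
data = pipeline.run("https://example.com", is_url=True, schema=Article)
print(data)  # e.g. {"title": "..."} once the LLM responds
```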
web2json/postprocessor.py ADDED
@@ -0,0 +1,27 @@
+ from json_repair import repair_json
+ import json
+
+ class PostProcessor:
+
+     def process(self, response: str) -> dict:
+         json_response = {}
+         try:
+             # Extract the JSON from the generated text, handling variations in output format.
+             json_string = response
+             if "```json" in response:
+                 json_string = response.split("```json")[1].split("```")[0]
+             elif "{" in response and "}" in response:
+                 # Try to grab the JSON object between the outermost braces.
+                 start_index = response.find("{")
+                 end_index = response.rfind("}") + 1
+                 json_string = response[start_index:end_index]
+
+             json_response = json.loads(repair_json(json_string))  # repair_json added for robustness
+         except Exception as e:
+             print(f"Error parsing JSON: {e}")
+             print(f"Generated text: {response}")
+             json_response = {}
+
+         return json_response
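
Because `process` strips ```json fences and otherwise falls back to the outermost braces before running `repair_json`, it tolerates chatty model output. A small self-contained check (the reply string is made up for illustration):

```python
from web2json.postprocessor import PostProcessor

pp = PostProcessor()
# Model replies often wrap JSON in prose; the brace fallback handles this.
reply = 'Sure! Here is the data: {"title": "Hello", "price": 9.99} Hope that helps.'
print(pp.process(reply))  # -> {'title': 'Hello', 'price': 9.99}
```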
web2json/preprocessor.py ADDED
@@ -0,0 +1,138 @@
+ import re
+ import requests
+ from bs4 import BeautifulSoup, Comment
+ from abc import ABC, abstractmethod
+ from typing import Any, Dict, Optional
+
+
+ class Preprocessor(ABC):
+     """
+     Abstract base class for preprocessors.
+     Defines the interface for transforming raw inputs into structured data.
+     """
+
+     def __init__(self, config: Optional[Dict[str, Any]] = None) -> None:
+         """
+         Initialize the preprocessor with optional configuration.
+
+         Args:
+             config: A dictionary of configuration settings.
+                 - keep_tags (bool): If True, keeps HTML tags in the output; otherwise, cleans them.
+         """
+         self.config = config if config is not None else {'keep_tags': False}
+
+     def _fetch_content(self, url: str) -> str:
+         """
+         Fetches the HTML content from a URL.
+
+         Args:
+             url: The URL to fetch content from.
+
+         Returns:
+             The raw HTML of the page.
+
+         Raises:
+             ValueError: If the URL cannot be fetched or processed.
+         """
+         try:
+             # Set browser-like headers, which can help avoid being
+             # blocked by some websites.
+             headers = {
+                 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+                 'Accept-Language': 'en-US,en;q=0.9',
+                 'Accept-Encoding': 'gzip, deflate, br',
+                 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
+                 'Connection': 'keep-alive',
+             }
+
+             # Make the HTTP GET request with a timeout.
+             response = requests.get(url, headers=headers, timeout=15)
+             # Surface HTTP error status codes as exceptions, so they are
+             # reported instead of being cleaned as page text.
+             response.raise_for_status()
+
+             return response.text
+
+         except requests.exceptions.RequestException as e:
+             # Catch any network-related errors (DNS, connection, timeout, etc.)
+             # and re-raise them as a more user-friendly ValueError.
+             raise ValueError(f"Failed to fetch content from URL: {url}. Error: {e}")
+
+     @abstractmethod
+     def preprocess(self, content: str, is_url: bool) -> str:
+         """
+         Take raw content (HTML, text, etc.) and apply preprocessing steps.
+
+         Args:
+             content: The raw data to preprocess.
+             is_url: Whether the content is a URL to fetch or raw HTML/text.
+
+         Returns:
+             The cleaned text, ready for downstream tasks.
+         """
+         pass
+
+ class BasicPreprocessor(Preprocessor):
+     """
+     Base preprocessor with common functionality.
+     Can be extended for specific preprocessing tasks.
+     """
+     # TODO: Might need to think of how to improve this later
+     def _clean_html(self, html_content: str) -> str:
+         """
+         Cleans up the given HTML content by:
+         - Removing <script> and <style> tags and their content.
+         - Removing HTML comments.
+         - Extracting and returning the visible text with normalized whitespace if keep_tags is False.
+
+         Args:
+             html_content (str): The HTML content to clean.
+
+         Returns:
+             str: The cleaned, visible text from the HTML.
+         """
+         # Parse the HTML content
+         soup = BeautifulSoup(html_content, "html.parser")
+
+         # Remove script and style elements
+         for tag in soup(["script", "style"]):
+             tag.decompose()
+
+         # Remove HTML comments
+         for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
+             comment.extract()
+
+         # If keep_tags is True, return the cleaned HTML with tags intact.
+         if self.config.get('keep_tags', False):
+             return str(soup)
+
+         # Extract text and normalize whitespace
+         text = soup.get_text(separator=" ", strip=True)
+         clean_text = re.sub(r'\s+', ' ', text)
+
+         return clean_text
+
+     def preprocess(self, content: str, is_url: bool) -> str:
+         """
+         Take raw content (HTML, text, etc.) and apply preprocessing steps.
+
+         Args:
+             content: The raw data to preprocess.
+             is_url: Whether the content is a URL to fetch or raw HTML/text.
+
+         Returns:
+             The cleaned text content, stripped of leading/trailing whitespace.
+         """
+         html_content = content
+         if is_url:
+             # Fetch content from the URL
+             html_content = self._fetch_content(content)
+
+         # Clean the HTML content
+         cleaned_content = self._clean_html(html_content)
+
+         return cleaned_content.strip()
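
A quick offline check of the cleaning path (no network needed, since `is_url=False` skips `_fetch_content`; the HTML snippet is made up):

```python
from web2json.preprocessor import BasicPreprocessor

pre = BasicPreprocessor(config={'keep_tags': False})
html = """
<html><head><style>p { color: red; }</style></head>
<body><!-- hidden note --><p>Hello   <b>world</b></p>
<script>console.log('gone');</script></body></html>
"""
print(pre.preprocess(html, is_url=False))  # -> "Hello world"
```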