abdo-Mansour's picture
completed mcp v1
f2a2588
raw
history blame
3.11 kB
import json
import pandas as pd
import gradio as gr
from typing import Dict, Any
from web2json.preprocessor import BasicPreprocessor
from web2json.ai_extractor import AIExtractor, GeminiLLMClient
from web2json.postprocessor import PostProcessor
from web2json.pipeline import Pipeline
from pydantic import BaseModel, Field
import os
import dotenv
dotenv.load_dotenv()
# Define schemas
class Article(BaseModel):
    """Target schema for extracting a news/blog article from a page."""
    title: str = Field(..., description="The title of the article.")
    author: str = Field(..., description="The author of the article.")
    content: str = Field(..., description="The main content of the article.")
class Product(BaseModel):
    """Target schema for extracting a product listing from a page."""
    name: str = Field(..., description="The name of the product.")
    description: str = Field(..., description="A detailed description of the product.")
    price: float = Field(..., description="The price of the product.")
class JobPosting(BaseModel):
    """Target schema for extracting a job advertisement from a page."""
    title: str = Field(..., description="The title of the job position.")
    company: str = Field(..., description="The name of the company offering the job.")
    location: str = Field(..., description="The location of the job.")
    description: str = Field(..., description="A detailed description of the job responsibilities.")
# Display-name -> pydantic model mapping; keys populate the UI dropdown and
# are validated against in webpage_to_json.
SCHEMA_OPTIONS: Dict[str, Any] = {
    "Article": Article,
    "Product": Product,
    "Job Posting": JobPosting,
}
# Core processing function
def webpage_to_json(content: str, is_url: bool, schema_name: str) -> Dict[str, Any]:
    """Convert a web page (URL or raw text/HTML) into structured JSON.

    Builds a fresh preprocess -> AI-extract -> postprocess pipeline per call
    and runs it against the schema selected in the UI.

    Args:
        content: A URL or raw HTML/text to extract from.
        is_url: True when ``content`` is a URL that should be fetched first.
        schema_name: Key into SCHEMA_OPTIONS selecting the target schema.

    Returns:
        The extracted structure on success, or a ``{"error": ...}`` dict on
        any failure (this function never raises, so the Gradio UI always
        receives a JSON-renderable value).
    """
    if schema_name not in SCHEMA_OPTIONS:
        return {"error": f"Invalid schema name: {schema_name}. Choose from: {', '.join(SCHEMA_OPTIONS.keys())}"}
    schema = SCHEMA_OPTIONS[schema_name]

    prompt_template = "extract the following information: {content} based on schema: {schema}"

    # Initialize pipeline components
    preprocessor = BasicPreprocessor(config={'keep_tags': False})

    # Fail fast with an actionable message instead of passing None to the
    # client (os.getenv returns None when the variable is unset).
    api_key = os.getenv('GEMINI_API_KEY')
    if not api_key:
        return {"error": "GEMINI_API_KEY environment variable is not set."}
    try:
        llm = GeminiLLMClient(config={'api_key': api_key})
    except Exception as e:
        return {"error": f"Failed to initialize LLM client: {str(e)}"}

    ai_extractor = AIExtractor(llm_client=llm, prompt_template=prompt_template)
    postprocessor = PostProcessor()
    pipeline = Pipeline(preprocessor, ai_extractor, postprocessor)

    try:
        return pipeline.run(content, is_url, schema)
    except Exception as e:
        return {"error": f"Processing error: {str(e)}"}
# Build Gradio Interface
# Build Gradio Interface: one textbox for the content, a checkbox telling the
# pipeline whether to fetch it as a URL, and a dropdown keyed to SCHEMA_OPTIONS.
demo = gr.Interface(
    fn=webpage_to_json,
    inputs=[
        gr.Textbox(label="Content (URL or Raw Text)", lines=10,
                   placeholder="Enter URL or paste raw HTML/text here."),
        gr.Checkbox(label="Content is URL?", value=False),
        gr.Dropdown(choices=list(SCHEMA_OPTIONS.keys()),
                    label="Select Schema", value="Article")
    ],
    outputs=gr.JSON(label="Output JSON"),
    title="Webpage to JSON Converter",
    description="Convert web pages or raw text into structured JSON using customizable schemas."
)
if __name__ == "__main__":
demo.launch(mcp_server=True)