import json
import os
from typing import Dict, Any

import dotenv
import gradio as gr
import pandas as pd
from pydantic import BaseModel, Field

from web2json.preprocessor import BasicPreprocessor
from web2json.ai_extractor import AIExtractor, GeminiLLMClient
from web2json.postprocessor import PostProcessor
from web2json.pipeline import Pipeline

# Load environment variables (e.g. GEMINI_API_KEY) from a local .env file.
dotenv.load_dotenv()


# Extraction schemas: each Pydantic model describes the fields the LLM should
# return for one type of page.
class Article(BaseModel):
    title: str = Field(..., description="The title of the article.")
    author: str = Field(..., description="The author of the article.")
    content: str = Field(..., description="The main content of the article.")


class Product(BaseModel):
    name: str = Field(..., description="The name of the product.")
    description: str = Field(..., description="A detailed description of the product.")
    price: float = Field(..., description="The price of the product.")


class JobPosting(BaseModel):
    title: str = Field(..., description="The title of the job position.")
    company: str = Field(..., description="The name of the company offering the job.")
    location: str = Field(..., description="The location of the job.")
    description: str = Field(..., description="A detailed description of the job responsibilities.")


# Schemas selectable from the UI dropdown.
SCHEMA_OPTIONS = {
    "Article": Article,
    "Product": Product,
    "Job Posting": JobPosting,
}
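
# A further page type could be supported by defining another Pydantic model and
# registering it here. A minimal sketch, assuming a hypothetical "Event" schema
# (not part of the original app):
#
#     class Event(BaseModel):
#         name: str = Field(..., description="The name of the event.")
#         date: str = Field(..., description="The date the event takes place.")
#         venue: str = Field(..., description="The venue hosting the event.")
#
#     SCHEMA_OPTIONS["Event"] = Event
#
# Because the dropdown below is built from SCHEMA_OPTIONS.keys(), a registered
# schema shows up in the UI without further changes.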


def webpage_to_json(content: str, is_url: bool, schema_name: str) -> Dict[str, Any]:
    """Run the web2json pipeline on a URL or raw text and return structured JSON."""
    if schema_name not in SCHEMA_OPTIONS:
        return {"error": f"Invalid schema name: {schema_name}. Choose from: {', '.join(SCHEMA_OPTIONS.keys())}"}

    schema = SCHEMA_OPTIONS[schema_name]
    prompt_template = "Extract the following information: {content} based on schema: {schema}"

    # Build the pipeline: strip markup, extract fields with Gemini, then post-process.
    preprocessor = BasicPreprocessor(config={'keep_tags': False})
    try:
        llm = GeminiLLMClient(config={'api_key': os.getenv('GEMINI_API_KEY')})
    except Exception as e:
        return {"error": f"Failed to initialize LLM client: {str(e)}"}

    ai_extractor = AIExtractor(llm_client=llm, prompt_template=prompt_template)
    postprocessor = PostProcessor()
    pipeline = Pipeline(preprocessor, ai_extractor, postprocessor)

    try:
        return pipeline.run(content, is_url, schema)
    except Exception as e:
        return {"error": f"Processing error: {str(e)}"}
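
# Example call (assumes GEMINI_API_KEY is set in the environment and that the
# page matches the chosen schema); illustrative only, not executed by the app:
#
#     result = webpage_to_json("https://example.com/some-article", is_url=True,
#                              schema_name="Article")
#     print(result)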


# Gradio UI: a textbox for a URL or raw HTML/text, a checkbox indicating whether
# the content is a URL, and a dropdown to pick the extraction schema.
demo = gr.Interface(
    fn=webpage_to_json,
    inputs=[
        gr.Textbox(label="Content (URL or Raw Text)", lines=10,
                   placeholder="Enter a URL or paste raw HTML/text here."),
        gr.Checkbox(label="Content is URL?", value=False),
        gr.Dropdown(choices=list(SCHEMA_OPTIONS.keys()),
                    label="Select Schema", value="Article"),
    ],
    outputs=gr.JSON(label="Output JSON"),
    title="Webpage to JSON Converter",
    description="Convert web pages or raw text into structured JSON using customizable schemas.",
)

if __name__ == "__main__":
    # mcp_server=True also exposes the app as an MCP server on recent Gradio
    # versions with MCP support enabled.
    demo.launch(mcp_server=True)