import json
import pandas as pd
import gradio as gr
from typing import Dict, Any
from web2json.preprocessor import BasicPreprocessor
from web2json.ai_extractor import AIExtractor, GeminiLLMClient
from web2json.postprocessor import PostProcessor
from web2json.pipeline import Pipeline
from pydantic import BaseModel, Field
import os
import dotenv
dotenv.load_dotenv()
# Define schemas
class Article(BaseModel):
    """Extraction schema for a news/blog article page."""

    title: str = Field(..., description="The title of the article.")
    author: str = Field(..., description="The author of the article.")
    content: str = Field(..., description="The main content of the article.")
class Product(BaseModel):
    """Extraction schema for an e-commerce product page."""

    name: str = Field(..., description="The name of the product.")
    description: str = Field(..., description="A detailed description of the product.")
    price: float = Field(..., description="The price of the product.")
class JobPosting(BaseModel):
    """Extraction schema for a job-listing page."""

    title: str = Field(..., description="The title of the job position.")
    company: str = Field(..., description="The name of the company offering the job.")
    location: str = Field(..., description="The location of the job.")
    description: str = Field(..., description="A detailed description of the job responsibilities.")
# Maps the human-readable dropdown label shown in the UI to its Pydantic
# schema class; keys are also used to validate `schema_name` in
# webpage_to_json and to populate the Gradio dropdown choices.
SCHEMA_OPTIONS = {
"Article": Article,
"Product": Product,
"Job Posting": JobPosting,
}
# Core processing function
def webpage_to_json(content: str, is_url: bool, schema_name: str) -> Dict[str, Any]:
    """Convert a web page (URL or raw HTML/text) into structured JSON.

    Builds a preprocess -> LLM-extract -> postprocess pipeline and runs it
    against `content` using the Pydantic schema selected by `schema_name`.

    Args:
        content: A URL or raw HTML/text to extract from.
        is_url: True if `content` is a URL that must be fetched first.
        schema_name: A key of SCHEMA_OPTIONS selecting the output schema.

    Returns:
        The pipeline's result dict on success, or a dict with a single
        "error" key describing the failure (never raises to the caller,
        since this is a Gradio handler).
    """
    if schema_name not in SCHEMA_OPTIONS:
        return {"error": f"Invalid schema name: {schema_name}. Choose from: {', '.join(SCHEMA_OPTIONS.keys())}"}
    schema = SCHEMA_OPTIONS[schema_name]

    prompt_template = "extract the following information: {content} based on schema: {schema}"

    # Initialize pipeline components. Strip tags so the LLM sees plain text.
    preprocessor = BasicPreprocessor(config={'keep_tags': False})

    # Fail fast with a clear message if the key is missing; otherwise the
    # client would receive api_key=None and fail with an opaque error later.
    api_key = os.getenv('GEMINI_API_KEY')
    if not api_key:
        return {"error": "Failed to initialize LLM client: GEMINI_API_KEY environment variable is not set."}
    try:
        llm = GeminiLLMClient(config={'api_key': api_key})
    except Exception as e:
        return {"error": f"Failed to initialize LLM client: {str(e)}"}

    ai_extractor = AIExtractor(llm_client=llm, prompt_template=prompt_template)
    postprocessor = PostProcessor()
    pipeline = Pipeline(preprocessor, ai_extractor, postprocessor)

    try:
        return pipeline.run(content, is_url, schema)
    except Exception as e:
        # Broad catch is deliberate: surface any pipeline failure to the UI
        # as JSON instead of crashing the Gradio handler.
        return {"error": f"Processing error: {str(e)}"}
# Build Gradio Interface
# Gradio UI: three inputs (content, URL flag, schema choice) -> JSON output.
demo = gr.Interface(
    fn=webpage_to_json,
    inputs=[
        gr.Textbox(
            label="Content (URL or Raw Text)",
            lines=10,
            placeholder="Enter URL or paste raw HTML/text here.",
        ),
        gr.Checkbox(label="Content is URL?", value=False),
        gr.Dropdown(
            choices=list(SCHEMA_OPTIONS.keys()),
            label="Select Schema",
            value="Article",
        ),
    ],
    outputs=gr.JSON(label="Output JSON"),
    title="Webpage to JSON Converter",
    description="Convert web pages or raw text into structured JSON using customizable schemas.",
)

if __name__ == "__main__":
    # mcp_server=True additionally exposes the app as an MCP server endpoint.
    demo.launch(mcp_server=True)