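"""Gradio app for web2json.

Converts a webpage (given as a URL or raw HTML/text) into structured JSON
matching a user-selected Pydantic schema via an AI extraction pipeline.
"""
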
import os
from typing import Any, Dict

import dotenv
import gradio as gr
from pydantic import BaseModel, Field

from web2json.ai_extractor import AIExtractor, GeminiLLMClient
from web2json.pipeline import Pipeline
from web2json.postprocessor import PostProcessor
from web2json.preprocessor import BasicPreprocessor

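# Load environment variables (e.g. GEMINI_API_KEY) from a local .env file.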
dotenv.load_dotenv()

# Define schemas
class Article(BaseModel):
    title: str = Field(..., description="The title of the article.")
    author: str = Field(..., description="The author of the article.")
    content: str = Field(..., description="The main content of the article.")

class Product(BaseModel):
    name: str = Field(..., description="The name of the product.")
    description: str = Field(..., description="A detailed description of the product.")
    price: float = Field(..., description="The price of the product.")

class JobPosting(BaseModel):
    title: str = Field(..., description="The title of the job position.")
    company: str = Field(..., description="The name of the company offering the job.")
    location: str = Field(..., description="The location of the job.")
    description: str = Field(..., description="A detailed description of the job responsibilities.")

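# Map the dropdown labels shown in the UI to their Pydantic models. A new
# page type can be added by defining another BaseModel above and registering
# it here (e.g. a hypothetical SCHEMA_OPTIONS["Recipe"] = Recipe).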
SCHEMA_OPTIONS = {
    "Article": Article,
    "Product": Product,
    "Job Posting": JobPosting,
}

# Core processing function
def webpage_to_json(content: str, is_url: bool, schema_name: str) -> Dict[str, Any]:
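    """Run the preprocess -> AI extract -> postprocess pipeline on content.

    content is either a URL or raw HTML/text, as indicated by is_url;
    schema_name must be one of the keys of SCHEMA_OPTIONS. Returns the
    extracted data as a dict, or a dict with an "error" key on failure.
    """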
    if schema_name not in SCHEMA_OPTIONS:
        return {"error": f"Invalid schema name: {schema_name}. Choose from: {', '.join(SCHEMA_OPTIONS.keys())}"}

    schema = SCHEMA_OPTIONS[schema_name]
    prompt_template = ("Extract the following information from the content, "
                       "conforming to the given schema.\n"
                       "Content: {content}\n"
                       "Schema: {schema}")

    # Initialize pipeline components
    preprocessor = BasicPreprocessor(config={'keep_tags': False})
    api_key = os.getenv('GEMINI_API_KEY')
    if not api_key:
        return {"error": "GEMINI_API_KEY is not set; add it to the environment or a .env file."}
    try:
        llm = GeminiLLMClient(config={'api_key': api_key})
    except Exception as e:
        return {"error": f"Failed to initialize LLM client: {str(e)}"}

    ai_extractor = AIExtractor(llm_client=llm, prompt_template=prompt_template)
    postprocessor = PostProcessor()
    pipeline = Pipeline(preprocessor, ai_extractor, postprocessor)

    try:
        return pipeline.run(content, is_url, schema)
    except Exception as e:
        return {"error": f"Processing error: {str(e)}"}

# Build Gradio Interface
demo = gr.Interface(
    fn=webpage_to_json,
    inputs=[
        gr.Textbox(label="Content (URL or Raw Text)", lines=10,
                   placeholder="Enter URL or paste raw HTML/text here."),
        gr.Checkbox(label="Content is URL?", value=False),
        gr.Dropdown(choices=list(SCHEMA_OPTIONS.keys()),
                    label="Select Schema", value="Article")
    ],
    outputs=gr.JSON(label="Output JSON"),
    title="Webpage to JSON Converter",
    description="Convert web pages or raw text into structured JSON using customizable schemas."
)

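# mcp_server=True also exposes webpage_to_json as an MCP tool; this assumes a
# Gradio version with MCP support (e.g. installed via the gradio[mcp] extra).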
if __name__ == "__main__":
    demo.launch(mcp_server=True)