import json
import os
from typing import Dict, Any

import dotenv
import gradio as gr
import pandas as pd
from pydantic import BaseModel, Field

from web2json.preprocessor import BasicPreprocessor
from web2json.ai_extractor import AIExtractor, GeminiLLMClient
from web2json.postprocessor import PostProcessor
from web2json.pipeline import Pipeline

# Load environment variables (e.g. GEMINI_API_KEY) from a local .env file.
dotenv.load_dotenv()


# Extraction schemas: each Pydantic model describes the fields the LLM should
# return for one type of page.
class Article(BaseModel):
    title: str = Field(..., description="The title of the article.")
    author: str = Field(..., description="The author of the article.")
    content: str = Field(..., description="The main content of the article.")


class Product(BaseModel):
    name: str = Field(..., description="The name of the product.")
    description: str = Field(..., description="A detailed description of the product.")
    price: float = Field(..., description="The price of the product.")


class JobPosting(BaseModel):
    title: str = Field(..., description="The title of the job position.")
    company: str = Field(..., description="The name of the company offering the job.")
    location: str = Field(..., description="The location of the job.")
    description: str = Field(..., description="A detailed description of the job responsibilities.")


# Schemas selectable from the UI dropdown.
SCHEMA_OPTIONS = {
    "Article": Article,
    "Product": Product,
    "Job Posting": JobPosting,
}
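
# A further page type could be supported by defining another Pydantic model and
# registering it here. A minimal sketch, assuming a hypothetical "Event" schema
# (not part of the original app):
#
#     class Event(BaseModel):
#         name: str = Field(..., description="The name of the event.")
#         date: str = Field(..., description="The date the event takes place.")
#         venue: str = Field(..., description="The venue hosting the event.")
#
#     SCHEMA_OPTIONS["Event"] = Event
#
# Because the dropdown below is built from SCHEMA_OPTIONS.keys(), a registered
# schema shows up in the UI without further changes.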


def webpage_to_json(content: str, is_url: bool, schema_name: str) -> Dict[str, Any]:
    """Run the web2json pipeline on a URL or raw text and return structured JSON."""
    if schema_name not in SCHEMA_OPTIONS:
        return {"error": f"Invalid schema name: {schema_name}. Choose from: {', '.join(SCHEMA_OPTIONS.keys())}"}

    schema = SCHEMA_OPTIONS[schema_name]
    prompt_template = "Extract the following information: {content} based on schema: {schema}"

    # Build the pipeline: strip markup, extract fields with Gemini, then post-process.
    preprocessor = BasicPreprocessor(config={'keep_tags': False})
    try:
        llm = GeminiLLMClient(config={'api_key': os.getenv('GEMINI_API_KEY')})
    except Exception as e:
        return {"error": f"Failed to initialize LLM client: {str(e)}"}

    ai_extractor = AIExtractor(llm_client=llm, prompt_template=prompt_template)
    postprocessor = PostProcessor()
    pipeline = Pipeline(preprocessor, ai_extractor, postprocessor)

    try:
        return pipeline.run(content, is_url, schema)
    except Exception as e:
        return {"error": f"Processing error: {str(e)}"}
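
# Example call (assumes GEMINI_API_KEY is set in the environment and that the
# page matches the chosen schema); illustrative only, not executed by the app:
#
#     result = webpage_to_json("https://example.com/some-article", is_url=True,
#                              schema_name="Article")
#     print(result)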


# Gradio UI: a textbox for a URL or raw HTML/text, a checkbox indicating whether
# the content is a URL, and a dropdown to pick the extraction schema.
demo = gr.Interface(
    fn=webpage_to_json,
    inputs=[
        gr.Textbox(label="Content (URL or Raw Text)", lines=10,
                   placeholder="Enter a URL or paste raw HTML/text here."),
        gr.Checkbox(label="Content is URL?", value=False),
        gr.Dropdown(choices=list(SCHEMA_OPTIONS.keys()),
                    label="Select Schema", value="Article"),
    ],
    outputs=gr.JSON(label="Output JSON"),
    title="Webpage to JSON Converter",
    description="Convert web pages or raw text into structured JSON using customizable schemas.",
)

if __name__ == "__main__":
    # mcp_server=True also exposes the app as an MCP server on recent Gradio
    # versions with MCP support enabled.
    demo.launch(mcp_server=True)