from web2json.ai_extractor import AIExtractor
from web2json.postprocessor import PostProcessor
from web2json.preprocessor import Preprocessor
from pydantic import BaseModel

class Pipeline:
    """Three-stage extraction pipeline: preprocess the input, extract
    structured data with an AI model, then post-process the result."""

    def __init__(self,
                 preprocessor: Preprocessor,
                 ai_extractor: AIExtractor,
                 postprocessor: PostProcessor):
        self.preprocessor = preprocessor
        self.ai_extractor = ai_extractor
        self.postprocessor = postprocessor

    def run(self, content: str, is_url: bool, schema: BaseModel) -> dict:
        """
        Run the entire pipeline: preprocess, extract, and postprocess.

        Args:
            content (str): The raw content to process.
            is_url (bool): Whether the content is a URL or raw text.
            schema (BaseModel): The schema defining the structure of the expected output.

        Returns:
            dict: The final structured data after processing.
        """
        # Step 1: Preprocess the content
        preprocessed_content = self.preprocessor.preprocess(content, is_url)
        print(f"Preprocessed content: {preprocessed_content[:100]}...")
        print('+'*80)
        # Step 2: Extract structured information using AI
        extracted_data = self.ai_extractor.extract(preprocessed_content, schema)
        print(f"Extracted data: {extracted_data[:100]}...")
        print('+'*80)
        # Step 3: Post-process the extracted data
        final_output = self.postprocessor.process(extracted_data)
        print(f"Final output: {final_output}")
        print('+'*80)
        
        return final_output
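

# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal example of wiring the pipeline together. It assumes that
# web2json provides concrete Preprocessor, AIExtractor, and PostProcessor
# implementations constructible without arguments; ArticleSchema and the
# URL below are hypothetical, defined only for illustration.
if __name__ == "__main__":
    class ArticleSchema(BaseModel):
        title: str
        body: str

    pipeline = Pipeline(
        preprocessor=Preprocessor(),      # assumed concrete implementation
        ai_extractor=AIExtractor(),       # assumed concrete implementation
        postprocessor=PostProcessor(),    # assumed concrete implementation
    )

    # run() annotates schema as BaseModel; depending on the extractor,
    # either the model class or an instance may be expected.
    result = pipeline.run(
        content="https://example.com/some-article",  # hypothetical URL
        is_url=True,
        schema=ArticleSchema,
    )
    print(result)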