from web2json.ai_extractor import AIExtractor
from web2json.postprocessor import PostProcessor
from web2json.preprocessor import Preprocessor
from pydantic import BaseModel

class Pipeline:
    """Three-stage extraction pipeline: preprocess the input, extract
    structured data with an AI model, then post-process the result."""

    def __init__(self,
                 preprocessor: Preprocessor,
                 ai_extractor: AIExtractor,
                 postprocessor: PostProcessor):
        self.preprocessor = preprocessor
        self.ai_extractor = ai_extractor
        self.postprocessor = postprocessor

    def run(self, content: str, is_url: bool, schema: BaseModel) -> dict:
        """
        Run the entire pipeline: preprocess, extract, and postprocess.

        Args:
            content (str): The raw content to process.
            is_url (bool): Whether the content is a URL or raw text.
            schema (BaseModel): The schema defining the structure of the expected output.

        Returns:
            dict: The final structured data after processing.
        """
        # Step 1: Preprocess the content
        preprocessed_content = self.preprocessor.preprocess(content, is_url)
        print(f"Preprocessed content: {preprocessed_content[:100]}...")
        print('+'*80)
        # Step 2: Extract structured information using AI
        extracted_data = self.ai_extractor.extract(preprocessed_content, schema)
        print(f"Extracted data: {extracted_data[:100]}...")
        print('+'*80)
        # Step 3: Post-process the extracted data
        final_output = self.postprocessor.process(extracted_data)
        print(f"Final output: {final_output}")
        print('+'*80)
        
        return final_output
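

# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal example of wiring the pipeline together. It assumes that
# web2json provides concrete Preprocessor, AIExtractor, and PostProcessor
# implementations constructible without arguments; ArticleSchema and the
# URL below are hypothetical, defined only for illustration.
if __name__ == "__main__":
    class ArticleSchema(BaseModel):
        title: str
        body: str

    pipeline = Pipeline(
        preprocessor=Preprocessor(),      # assumed concrete implementation
        ai_extractor=AIExtractor(),       # assumed concrete implementation
        postprocessor=PostProcessor(),    # assumed concrete implementation
    )

    # run() annotates schema as BaseModel; depending on the extractor,
    # either the model class or an instance may be expected.
    result = pipeline.run(
        content="https://example.com/some-article",  # hypothetical URL
        is_url=True,
        schema=ArticleSchema,
    )
    print(result)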