from web2json.ai_extractor import *
from web2json.postprocessor import *
from web2json.preprocessor import *
from pydantic import BaseModel
class Pipeline:
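    """Three-stage extraction pipeline: preprocess the input, extract structured
    data with an AI extractor, then postprocess the result into a dict."""
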
    # constructor
    def __init__(self,
                 preprocessor: Preprocessor,
                 ai_extractor: AIExtractor,
                 postprocessor: PostProcessor):
        self.preprocessor = preprocessor
        self.ai_extractor = ai_extractor
        self.postprocessor = postprocessor

    def run(self, content: str, is_url: bool, schema: BaseModel) -> dict:
        """
        Run the entire pipeline: preprocess, extract, and postprocess.

        Args:
            content (str): The raw content to process.
            is_url (bool): Whether the content is a URL or raw text.
            schema (BaseModel): The schema defining the structure of the expected output.

        Returns:
            dict: The final structured data after processing.
        """
        # Step 1: Preprocess the content
        preprocessed_content = self.preprocessor.preprocess(content, is_url)
        print(f"Preprocessed content: {preprocessed_content[:100]}...")
        print('+' * 80)

        # Step 2: Extract structured information using AI
        extracted_data = self.ai_extractor.extract(preprocessed_content, schema)
        print(f"Extracted data: {extracted_data[:100]}...")
        print('+' * 80)

        # Step 3: Post-process the extracted data
        final_output = self.postprocessor.process(extracted_data)
        print(f"Final output: {final_output}")
        print('+' * 80)

        return final_output
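
if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module: the stub classes
    # and the demo schema below are assumptions made for illustration only.
    # They are duck-typed stand-ins for the real Preprocessor, AIExtractor,
    # and PostProcessor implementations shipped elsewhere in web2json, and
    # only implement the methods that Pipeline.run() actually calls.
    class _StubPreprocessor:
        def preprocess(self, content: str, is_url: bool) -> str:
            return content  # pass the raw content through unchanged

    class _StubExtractor:
        def extract(self, content: str, schema: BaseModel) -> str:
            return '{"title": "example"}'  # pretend the AI returned JSON text

    class _StubPostprocessor:
        def process(self, extracted: str) -> dict:
            return {"title": "example"}  # pretend parsing/validation succeeded

    class _DemoSchema(BaseModel):  # hypothetical target schema for the sketch
        title: str

    demo = Pipeline(_StubPreprocessor(), _StubExtractor(), _StubPostprocessor())
    result = demo.run("<html><h1>example</h1></html>", is_url=False, schema=_DemoSchema)
    print(result)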