from web2json.ai_extractor import AIExtractor
from web2json.postprocessor import PostProcessor
from web2json.preprocessor import Preprocessor
from pydantic import BaseModel


class Pipeline:

    def __init__(self,
                 preprocessor: Preprocessor,
                 ai_extractor: AIExtractor,
                 postprocessor: PostProcessor):
        self.preprocessor = preprocessor
        self.ai_extractor = ai_extractor
        self.postprocessor = postprocessor

    def run(self, content: str, is_url: bool, schema: BaseModel) -> dict:
        """
        Run the entire pipeline: preprocess, extract, and postprocess.

        Args:
            content (str): The raw content to process.
            is_url (bool): Whether the content is a URL or raw text.
            schema (BaseModel): The schema defining the structure of the expected output.

        Returns:
            dict: The final structured data after processing.
        """
        # Step 1: preprocess the raw content (or URL) into clean text.
        preprocessed_content = self.preprocessor.preprocess(content, is_url)
        print(f"Preprocessed content: {preprocessed_content[:100]}...")
        print('+' * 80)

        # Step 2: let the AI extractor pull out data matching the given schema.
        extracted_data = self.ai_extractor.extract(preprocessed_content, schema)
        print(f"Extracted data: {extracted_data[:100]}...")
        print('+' * 80)

        # Step 3: postprocess the extracted data into the final structured output.
        final_output = self.postprocessor.process(extracted_data)
        print(f"Final output: {final_output}")
        print('+' * 80)

        return final_output
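

# Example usage (illustrative sketch, kept commented out): the exact constructor
# arguments for Preprocessor, AIExtractor, and PostProcessor depend on their
# definitions in web2json, and the NewsArticle schema below is an assumption
# made only for this example.
#
# class NewsArticle(BaseModel):
#     title: str
#     author: str
#     body: str
#
# pipeline = Pipeline(
#     preprocessor=Preprocessor(),
#     ai_extractor=AIExtractor(),
#     postprocessor=PostProcessor(),
# )
# result = pipeline.run("https://example.com/article", is_url=True, schema=NewsArticle)
# print(result)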