abdo-Mansour committed on
Commit
2856ca3
·
1 Parent(s): 44fb3b3
Files changed (1) hide show
  1. app.py +212 -39
app.py CHANGED
@@ -1,73 +1,172 @@
1
  import json
2
  import pandas as pd
3
  import gradio as gr
4
- from typing import Dict, Any
5
  from web2json.preprocessor import BasicPreprocessor
6
  from web2json.ai_extractor import AIExtractor, GeminiLLMClient
7
  from web2json.postprocessor import PostProcessor
8
  from web2json.pipeline import Pipeline
9
- from pydantic import BaseModel, Field
10
  import os
11
  import dotenv
12
 
13
  dotenv.load_dotenv()
14
 
15
- # Define schemas
16
- class Article(BaseModel):
17
- title: str = Field(..., description="The title of the article.")
18
- author: str = Field(..., description="The author of the article.")
19
- content: str = Field(..., description="The main content of the article.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
- class Product(BaseModel):
22
- name: str = Field(..., description="The name of the product.")
23
- description: str = Field(..., description="A detailed description of the product.")
24
- price: float = Field(..., description="The price of the product.")
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
- class JobPosting(BaseModel):
27
- title: str = Field(..., description="The title of the job position.")
28
- company: str = Field(..., description="The name of the company offering the job.")
29
- location: str = Field(..., description="The location of the job.")
30
- description: str = Field(..., description="A detailed description of the job responsibilities.")
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
- SCHEMA_OPTIONS = {
33
- "Article": Article,
34
- "Product": Product,
35
- "Job Posting": JobPosting,
36
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
- # Core processing function
 
 
 
 
 
 
 
 
 
 
 
39
 
40
- def webpage_to_json(content: str, is_url: bool, schema_name: str) -> Dict[str, Any]:
41
- if schema_name not in SCHEMA_OPTIONS:
42
- return {"error": f"Invalid schema name: {schema_name}. Choose from: {', '.join(SCHEMA_OPTIONS.keys())}"}
 
 
 
 
 
 
 
 
43
 
44
- schema = SCHEMA_OPTIONS[schema_name]
45
  prompt_template = """Extract the following information from the provided content according to the specified schema.
46
-
47
  Content to analyze:
48
  {content}
49
-
50
  Schema requirements:
51
  {schema}
52
-
53
  Instructions:
54
  - Extract only information that is explicitly present in the content
55
  - Follow the exact structure and data types specified in the schema
56
  - If a required field cannot be found, indicate this clearly
57
  - Preserve the original formatting and context where relevant
58
  - Return the extracted data in the format specified by the schema"""
59
-
60
  # Initialize pipeline components
61
  preprocessor = BasicPreprocessor(config={'keep_tags': False})
62
  try:
63
  llm = GeminiLLMClient(config={'api_key': os.getenv('GEMINI_API_KEY')})
64
  except Exception as e:
65
  return {"error": f"Failed to initialize LLM client: {str(e)}"}
66
-
67
  ai_extractor = AIExtractor(llm_client=llm, prompt_template=prompt_template)
68
  postprocessor = PostProcessor()
69
  pipeline = Pipeline(preprocessor, ai_extractor, postprocessor)
70
-
71
  try:
72
  result = pipeline.run(content, is_url, schema)
73
  print("-"*80)
@@ -76,20 +175,94 @@ def webpage_to_json(content: str, is_url: bool, schema_name: str) -> Dict[str, A
76
  except Exception as e:
77
  return {"error": f"Processing error: {str(e)}"}
78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  # Build Gradio Interface
80
  demo = gr.Interface(
81
- fn=webpage_to_json,
82
  inputs=[
83
- gr.Textbox(label="Content (URL or Raw Text)", lines=10,
84
- placeholder="Enter URL or paste raw HTML/text here."),
 
 
 
85
  gr.Checkbox(label="Content is URL?", value=False),
86
- gr.Dropdown(choices=list(SCHEMA_OPTIONS.keys()),
87
- label="Select Schema", value="Article")
 
 
 
 
88
  ],
89
  outputs=gr.JSON(label="Output JSON"),
90
  title="Webpage to JSON Converter",
91
- description="Convert web pages or raw text into structured JSON using customizable schemas."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  )
93
 
94
  if __name__ == "__main__":
95
- demo.launch(mcp_server=True)
 
1
  import json
2
  import pandas as pd
3
  import gradio as gr
4
+ from typing import Dict, Any, Type
5
  from web2json.preprocessor import BasicPreprocessor
6
  from web2json.ai_extractor import AIExtractor, GeminiLLMClient
7
  from web2json.postprocessor import PostProcessor
8
  from web2json.pipeline import Pipeline
9
+ from pydantic import BaseModel, Field, create_model
10
  import os
11
  import dotenv
12
 
13
  dotenv.load_dotenv()
14
 
15
def parse_schema_input(schema_input: str) -> Type[BaseModel]:
    """
    Convert user schema input to a Pydantic BaseModel.

    The format is chosen by inspecting the stripped text:
    1. JSON schema format        - text starts with '{'
    2. Python class definition   - text contains 'class ' and 'BaseModel'
    3. Simple field definitions  - anything else

    Empty (or missing) input yields a default model with ``title`` and
    ``content`` string fields.

    Raises:
        ValueError: if the text cannot be parsed in the detected format. The
            underlying exception is attached as ``__cause__``.
    """
    # Treat a missing value like an empty string so it falls through to the
    # default schema instead of failing on ``None.strip()``.
    schema_input = (schema_input or "").strip()

    if not schema_input:
        # Default schema if none provided
        return create_model(
            'DefaultSchema',
            title=(str, Field(description="Title of the content")),
            content=(str, Field(description="Main content")),
        )

    try:
        # Try parsing as JSON schema
        if schema_input.startswith('{'):
            schema_dict = json.loads(schema_input)
            return json_schema_to_basemodel(schema_dict)

        # Try parsing as Python class definition
        if 'class ' in schema_input and 'BaseModel' in schema_input:
            return python_class_to_basemodel(schema_input)

        # Fall back to simple field definitions
        return simple_fields_to_basemodel(schema_input)

    except Exception as e:
        # Chain the cause so the original traceback is kept for debugging.
        raise ValueError(
            f"Could not parse schema: {str(e)}. Please check your schema format."
        ) from e
47
 
48
def json_schema_to_basemodel(schema_dict: Dict) -> Type[BaseModel]:
    """Build a ``DynamicSchema`` model from a JSON-schema-style dict.

    Each entry of ``schema_dict['properties']`` becomes one model field. Its
    ``type`` (default ``'string'``) is mapped through ``get_python_type`` and
    its ``description`` (default empty) is attached to the field. Names listed
    under ``schema_dict['required']`` get no default; all other fields
    default to ``None``.
    """
    required_names = schema_dict.get('required', [])
    model_fields = {}

    for prop_name, prop_spec in schema_dict.get('properties', {}).items():
        py_type = get_python_type(prop_spec.get('type', 'string'))
        text = prop_spec.get('description', '')

        if prop_name in required_names:
            definition = Field(description=text)
        else:
            definition = Field(default=None, description=text)

        model_fields[prop_name] = (py_type, definition)

    return create_model('DynamicSchema', **model_fields)
64
 
65
def python_class_to_basemodel(class_definition: str) -> Type[BaseModel]:
    """Convert Python class definition to BaseModel.

    Executes ``class_definition`` and returns the first class in the
    resulting namespace that subclasses ``BaseModel`` (other than
    ``BaseModel`` itself).

    Raises:
        ValueError: if executing the text fails or it defines no BaseModel
            subclass. The underlying exception is attached as ``__cause__``.
    """
    try:
        # SECURITY: exec() runs arbitrary user-supplied code. This namespace
        # only pre-seeds a few convenient names; it is NOT a sandbox, because
        # exec() inserts the full ``__builtins__`` into a globals dict that
        # lacks that key. Do not expose this path to untrusted users without
        # replacing it (e.g. with an ``ast``-based parser of the class body).
        namespace = {'BaseModel': BaseModel, 'Field': Field, 'str': str, 'int': int,
                     'float': float, 'bool': bool, 'list': list, 'dict': dict}
        exec(class_definition, namespace)

        # Find the class that inherits from BaseModel
        for obj in namespace.values():
            if (isinstance(obj, type)
                    and issubclass(obj, BaseModel)
                    and obj is not BaseModel):
                return obj

        raise ValueError("No BaseModel class found in definition")
    except Exception as e:
        raise ValueError(f"Invalid Python class definition: {str(e)}") from e
83
 
84
def simple_fields_to_basemodel(fields_text: str) -> Type[BaseModel]:
    """Build a ``DynamicSchema`` model from line-oriented field definitions.

    One field per line, in the form ``name: type = description``. The type and
    description are optional: a line without ``:`` is a ``str`` field with an
    empty description, and a line without ``=`` has an empty description.
    Blank lines and lines starting with ``#`` are ignored. Surrounding quotes
    are stripped from the description.

    Raises:
        ValueError: if no field lines are found.
    """
    collected = {}

    for raw_line in fields_text.strip().split('\n'):
        entry = raw_line.strip()
        if not entry or entry.startswith('#'):
            continue

        name_part, colon, remainder = entry.partition(':')
        if not colon:
            # Simple field name only
            collected[entry] = (str, Field(description=""))
            continue

        # Everything after the first ':' is "type" or "type = description"
        type_text, equals, desc_text = remainder.strip().partition('=')
        resolved_type = get_python_type(type_text.strip())
        description = desc_text.strip().strip('"\'') if equals else ""

        collected[name_part.strip()] = (resolved_type, Field(description=description))

    if not collected:
        raise ValueError("No valid fields found in schema definition")

    return create_model('DynamicSchema', **collected)
117
 
118
# Lookup from JSON-schema / Python type names (lowercase) to Python types.
_TYPE_MAPPING = {
    'string': str, 'str': str,
    'integer': int, 'int': int,
    'number': float, 'float': float,
    'boolean': bool, 'bool': bool,
    'array': list, 'list': list,
    'object': dict, 'dict': dict,
}


def get_python_type(type_str: str):
    """Convert type string to Python type.

    Matching is case-insensitive and ignores surrounding whitespace. Unknown
    names fall back to ``str``.

    JSON Schema also allows ``type`` to be a list such as
    ``["string", "null"]``; in that case the first non-``"null"`` entry is
    used. Any other non-string value (including ``None``) falls back to
    ``str`` instead of raising.
    """
    if isinstance(type_str, (list, tuple)):
        # Pick the first concrete type name; "null" only marks nullability.
        type_str = next(
            (
                candidate for candidate in type_str
                if isinstance(candidate, str)
                and candidate.strip().lower() != 'null'
            ),
            None,
        )

    if not isinstance(type_str, str):
        return str

    return _TYPE_MAPPING.get(type_str.lower().strip(), str)
130
 
131
def webpage_to_json_wrapper(content: str, is_url: bool, schema_input: str) -> Dict[str, Any]:
    """Wrapper function that converts schema input to BaseModel.

    Parses ``schema_input`` with ``parse_schema_input`` and passes the
    resulting model, together with ``content`` and ``is_url``, to
    ``webpage_to_json``.

    Returns:
        The result of ``webpage_to_json``, or ``{"error": ...}`` if either
        step raises. Schema failures are reported as "Schema parsing error",
        anything escaping the extraction step as "Processing error".
    """
    # Keep the two steps in separate try blocks so a pipeline failure is not
    # mislabelled as a schema problem.
    try:
        schema_model = parse_schema_input(schema_input)
    except Exception as e:
        return {"error": f"Schema parsing error: {str(e)}"}

    try:
        return webpage_to_json(content, is_url, schema_model)
    except Exception as e:
        # webpage_to_json returns error dicts for the failures it guards;
        # this catches anything it lets escape so the caller still gets a
        # dict rather than an exception.
        return {"error": f"Processing error: {str(e)}"}
142
 
143
+ def webpage_to_json(content: str, is_url: bool, schema: BaseModel) -> Dict[str, Any]:
144
  prompt_template = """Extract the following information from the provided content according to the specified schema.
145
+
146
  Content to analyze:
147
  {content}
148
+
149
  Schema requirements:
150
  {schema}
151
+
152
  Instructions:
153
  - Extract only information that is explicitly present in the content
154
  - Follow the exact structure and data types specified in the schema
155
  - If a required field cannot be found, indicate this clearly
156
  - Preserve the original formatting and context where relevant
157
  - Return the extracted data in the format specified by the schema"""
158
+
159
  # Initialize pipeline components
160
  preprocessor = BasicPreprocessor(config={'keep_tags': False})
161
  try:
162
  llm = GeminiLLMClient(config={'api_key': os.getenv('GEMINI_API_KEY')})
163
  except Exception as e:
164
  return {"error": f"Failed to initialize LLM client: {str(e)}"}
165
+
166
  ai_extractor = AIExtractor(llm_client=llm, prompt_template=prompt_template)
167
  postprocessor = PostProcessor()
168
  pipeline = Pipeline(preprocessor, ai_extractor, postprocessor)
169
+
170
  try:
171
  result = pipeline.run(content, is_url, schema)
172
  print("-"*80)
 
175
  except Exception as e:
176
  return {"error": f"Processing error: {str(e)}"}
177
 
178
# Markdown help text listing the three accepted schema formats; passed as the
# schema textbox's ``info``.
example_schemas = """
**Example Schema Formats:**

1. **Simple field definitions:**
```
title: str = Page title
price: float = Product price
description: str = Product description
available: bool = Is available
```

2. **JSON Schema:**
```json
{
"properties": {
"title": {"type": "string", "description": "Page title"},
"price": {"type": "number", "description": "Product price"},
"description": {"type": "string", "description": "Product description"}
},
"required": ["title"]
}
```

3. **Python Class Definition:**
```python
class ProductSchema(BaseModel):
title: str = Field(description="Product title")
price: float = Field(description="Product price")
description: str = Field(description="Product description")
available: bool = Field(default=False, description="Availability status")
```
"""

# Input components, in the positional order webpage_to_json_wrapper expects:
# (content, is_url, schema_input).
_content_box = gr.Textbox(
    label="Content (URL or Raw Text)",
    lines=10,
    placeholder="Enter URL or paste raw HTML/text here.",
)
_is_url_box = gr.Checkbox(label="Content is URL?", value=False)
_schema_box = gr.Textbox(
    label="Schema Definition",
    lines=15,
    placeholder="Define your extraction schema (see examples below)",
    info=example_schemas,
)

# Example rows offered in the UI: one URL with simple field definitions, and
# one raw HTML snippet with a JSON schema.
_url_example = [
    "https://example.com",
    True,
    "title: str = Page title\nprice: float = Product price\ndescription: str = Description",
]
_html_example = [
    "<h1>Sample Product</h1><p>Price: $29.99</p><p>Great quality item</p>",
    False,
    '''{
"type": "object",
"properties": {
"title": {
"type": "string",
"description": "Name of the product"
},
"price": {
"type": "number",
"description": "Price of the product"
},
"description": {
"type": "string",
"description": "Detailed description of the product"
},
"availability": {
"type": "boolean",
"description": "Whether the product is in stock (true) or not (false)"
}
},
"required": ["title", "price"]
}''',
]

# Build Gradio Interface
demo = gr.Interface(
    fn=webpage_to_json_wrapper,
    inputs=[_content_box, _is_url_box, _schema_box],
    outputs=gr.JSON(label="Output JSON"),
    title="Webpage to JSON Converter",
    description="Convert web pages or raw text into structured JSON using customizable schemas. Define your schema using simple field definitions, JSON schema, or Python class syntax.",
    examples=[_url_example, _html_example],
)

if __name__ == "__main__":
    demo.launch(mcp_server=True)