Spaces:

arjunanand13
/

unstructured-to-structured-converter

Sleeping

App Files Community

arjunanand13 committed 22 days ago

Commit

5f40cee

verified ·

1 Parent(s): 91a9da3

Update main.py

Browse files

Files changed (1) hide show

main.py +124 -55

main.py CHANGED Viewed

@@ -78,18 +78,26 @@ class OpenAIClient:
         try:
             response = await self.client.chat.completions.create(
                 model=self.model_name,
-                messages=[{"role": "user", "content": prompt}],
                 max_tokens=max_tokens,
-                temperature=0.1
             )
             content = response.choices[0].message.content
-            confidence = 0.92 if "gpt-4o" in self.model_name else 0.85
             return content, confidence
         except Exception as e:
             logger.error(f"OpenAI API error: {e}")
-            return '{"error": "API call failed"}', 0.1
 class SchemaAnalyzer:
     def analyze_complexity(self, schema: Dict[str, Any]) -> ComplexityMetrics:
@@ -147,12 +155,23 @@ class SchemaAnalyzer:
         )
     def create_extraction_plan(self, schema: Dict[str, Any], complexity: ComplexityMetrics) -> ExtractionPlan:
-        if complexity.complexity_tier == 1:
-            return self._create_simple_plan(schema)
-        elif complexity.complexity_tier == 2:
-            return self._create_medium_plan(schema)
-        else:
-            return self._create_complex_plan(schema)
     def _create_simple_plan(self, schema: Dict[str, Any]) -> ExtractionPlan:
         stages = [ExtractionStage(
@@ -167,7 +186,7 @@ class SchemaAnalyzer:
             stages=stages,
             estimated_cost=0.02,
             estimated_time=5.0,
-            model_assignments={"complete_extraction": "gpt-4o-mini"}
         )
     def _create_medium_plan(self, schema: Dict[str, Any]) -> ExtractionPlan:
@@ -408,38 +427,85 @@ class ExtractionEngine:
         return context
     def _create_extraction_prompt(self, context: str, schema: Dict[str, Any], previous_results: Dict[str, Any]) -> str:
-        return f"""Extract structured data from the following content according to the JSON schema provided.
-Context:
-{context}
-JSON Schema:
-{json.dumps(schema, indent=2)}
-Instructions:
-1. Extract only the fields specified in the schema
-2. Ensure the output is valid JSON
-3. If a field cannot be determined from the content, use null
-4. Be precise and follow the schema constraints exactly
-5. Use previous results as context when relevant
-Output the extracted data as a JSON object:"""
     def _parse_response(self, response: str, expected_fields: List[str]) -> Dict[str, Any]:
         try:
-            data = json.loads(response)
-            return data
-        except json.JSONDecodeError:
             try:
-                json_match = re.search(r'\{.*\}', response, re.DOTALL)
-                if json_match:
-                    data = json.loads(json_match.group())
-                    return data
-            except:
-                pass
-            logger.warning("Failed to parse JSON response, using fallback")
-            return {field: f"extracted_value_for_{field}" for field in expected_fields[:2]}
 class QualityAssessor:
     def assess_extraction(self, result: ExtractionResult, schema: Dict[str, Any]) -> QualityReport:
@@ -448,24 +514,20 @@ class QualityAssessor:
         consistency_score = self._check_consistency(result.data)
         required_fields = schema.get('required', [])
         if field_scores:
-            total_weight = 0
-            weighted_confidence = 0
-            for field, confidence in field_scores.items():
-                weight = 2.0 if field in required_fields else 1.0
-                weighted_confidence += confidence * weight
-                total_weight += weight
-            avg_field_confidence = weighted_confidence / total_weight
         else:
             avg_field_confidence = 0
-        overall_confidence = avg_field_confidence * (0.8 + 0.2 * schema_compliance) * (0.9 + 0.1 * consistency_score)
         overall_confidence = min(overall_confidence, 1.0)
-        review_flags = self._generate_review_flags(field_scores, schema_compliance, overall_confidence, required_fields, result.data)
         review_time = self._estimate_review_time(review_flags, field_scores)
         return QualityReport(
@@ -538,24 +600,31 @@ class QualityAssessor:
         return max(0.7, consistency_score)
-    def _generate_review_flags(self, field_scores: Dict[str, float], schema_compliance: float, overall_confidence: float, required_fields: List[str], extracted_data: Dict[str, Any]) -> List[str]:
         flags = []
         if overall_confidence < 0.6:
-            flags.append("high_priority_review")
         elif overall_confidence < 0.8:
-            flags.append("standard_review")
-        if schema_compliance < 0.8:
-            flags.append("schema_compliance_issues")
-        low_confidence_fields = [field for field, score in field_scores.items() if score < 0.7]
-        if low_confidence_fields:
-            flags.append(f"uncertain_fields: {', '.join(low_confidence_fields[:3])}")
         missing_required = [field for field in required_fields if field not in extracted_data or extracted_data[field] is None]
         if missing_required:
-            flags.append(f"missing_required: {', '.join(missing_required[:3])}")
         return flags

         try:
             response = await self.client.chat.completions.create(
                 model=self.model_name,
+                messages=[
+                    {"role": "system", "content": "You are a precise data extraction specialist. Extract data according to the provided schema and output only valid JSON."},
+                    {"role": "user", "content": prompt}
+                ],
                 max_tokens=max_tokens,
+                temperature=0.1,
+                top_p=0.9
             )
             content = response.choices[0].message.content
+            confidence = 0.9 if "gpt-4o" in self.model_name else 0.8
+            if content and len(content.strip()) > 10:
+                confidence += 0.05
             return content, confidence
         except Exception as e:
             logger.error(f"OpenAI API error: {e}")
+            return '{"error": "API call failed", "details": "' + str(e) + '"}', 0.1
 class SchemaAnalyzer:
     def analyze_complexity(self, schema: Dict[str, Any]) -> ComplexityMetrics:
         )
     def create_extraction_plan(self, schema: Dict[str, Any], complexity: ComplexityMetrics) -> ExtractionPlan:
+        return self._create_single_pass_plan(schema)
+    def _create_single_pass_plan(self, schema: Dict[str, Any]) -> ExtractionPlan:
+        stages = [ExtractionStage(
+            name="complete_extraction",
+            fields=list(schema.get('properties', {}).keys()),
+            schema_subset=schema,
+            complexity=2,
+            estimated_tokens=4000
+        )]
+        return ExtractionPlan(
+            stages=stages,
+            estimated_cost=0.15,
+            estimated_time=15.0,
+            model_assignments={"complete_extraction": "gpt-4o"}
+        )
     def _create_simple_plan(self, schema: Dict[str, Any]) -> ExtractionPlan:
         stages = [ExtractionStage(
             stages=stages,
             estimated_cost=0.02,
             estimated_time=5.0,
+            model_assignments={"complete_extraction": "gpt-4o"}
         )
     def _create_medium_plan(self, schema: Dict[str, Any]) -> ExtractionPlan:
         return context
     def _create_extraction_prompt(self, context: str, schema: Dict[str, Any], previous_results: Dict[str, Any]) -> str:
+        schema_properties = schema.get('properties', {})
+        required_fields = schema.get('required', [])
+        field_descriptions = []
+        for field_name, field_def in schema_properties.items():
+            if isinstance(field_def, dict):
+                field_type = field_def.get('type', 'string')
+                is_required = field_name in required_fields
+                status = "REQUIRED" if is_required else "optional"
+                field_descriptions.append(f"- {field_name} ({field_type}) [{status}]")
+        previous_context = ""
+        if previous_results:
+            previous_context = f"\n\nPreviously extracted data:\n{json.dumps(previous_results, indent=2)}"
+        return f"""Extract ALL specified fields from the document content according to the JSON schema.
+DOCUMENT CONTENT:
+{context[:4000]}
+REQUIRED OUTPUT FIELDS:
+{chr(10).join(field_descriptions)}
+SCHEMA STRUCTURE:
+{json.dumps(schema, indent=2)}{previous_context}
+CRITICAL INSTRUCTIONS:
+1. Extract ALL fields specified in the schema properties
+2. For arrays, extract ALL items found in the content
+3. For objects, extract ALL nested properties
+4. Use null only if data truly cannot be found
+5. Maintain exact schema structure and types
+6. Output ONLY valid JSON, no explanations
+JSON OUTPUT:"""
     def _parse_response(self, response: str, expected_fields: List[str]) -> Dict[str, Any]:
         try:
+            cleaned_response = response.strip()
+            if not cleaned_response.startswith('{'):
+                json_start = cleaned_response.find('{')
+                if json_start != -1:
+                    cleaned_response = cleaned_response[json_start:]
+            if not cleaned_response.endswith('}'):
+                json_end = cleaned_response.rfind('}')
+                if json_end != -1:
+                    cleaned_response = cleaned_response[:json_end + 1]
+            data = json.loads(cleaned_response)
+            if isinstance(data, dict):
+                return data
+            else:
+                logger.warning("Response is not a dictionary")
+                return {}
+        except json.JSONDecodeError as e:
+            logger.warning(f"JSON decode error: {e}")
             try:
+                import re
+                json_pattern = r'\{(?:[^{}]|{(?:[^{}]|{[^{}]*})*})*\}'
+                matches = re.findall(json_pattern, response, re.DOTALL)
+                for match in matches:
+                    try:
+                        data = json.loads(match)
+                        if isinstance(data, dict) and data:
+                            return data
+                    except:
+                        continue
+            except Exception as e:
+                logger.warning(f"Regex parsing failed: {e}")
+            logger.error("All JSON parsing attempts failed")
+            return {}
 class QualityAssessor:
     def assess_extraction(self, result: ExtractionResult, schema: Dict[str, Any]) -> QualityReport:
         consistency_score = self._check_consistency(result.data)
         required_fields = schema.get('required', [])
+        total_expected_fields = len(schema.get('properties', {}))
+        extracted_fields = len([k for k, v in result.data.items() if v is not None])
+        completeness_score = extracted_fields / total_expected_fields if total_expected_fields > 0 else 0
         if field_scores:
+            avg_field_confidence = sum(field_scores.values()) / len(field_scores)
         else:
             avg_field_confidence = 0
+        overall_confidence = completeness_score * 0.6 + schema_compliance * 0.3 + consistency_score * 0.1
         overall_confidence = min(overall_confidence, 1.0)
+        review_flags = self._generate_review_flags(field_scores, schema_compliance, overall_confidence, required_fields, result.data, total_expected_fields, extracted_fields)
         review_time = self._estimate_review_time(review_flags, field_scores)
         return QualityReport(
         return max(0.7, consistency_score)
+    def _generate_review_flags(self, field_scores: Dict[str, float], schema_compliance: float, overall_confidence: float, required_fields: List[str], extracted_data: Dict[str, Any], total_expected: int, extracted_count: int) -> List[str]:
         flags = []
+        completeness_rate = extracted_count / total_expected if total_expected > 0 else 0
+        if completeness_rate < 0.5:
+            flags.append("incomplete_extraction")
+        elif completeness_rate < 0.8:
+            flags.append("partial_extraction")
         if overall_confidence < 0.6:
+            flags.append("low_quality")
         elif overall_confidence < 0.8:
+            flags.append("moderate_quality")
+        if schema_compliance < 0.7:
+            flags.append("schema_violations")
         missing_required = [field for field in required_fields if field not in extracted_data or extracted_data[field] is None]
         if missing_required:
+            flags.append(f"missing_required_fields")
+        empty_fields = [k for k, v in extracted_data.items() if v is None or v == ""]
+        if len(empty_fields) > total_expected * 0.3:
+            flags.append("many_empty_fields")
         return flags