Spaces:

VelaTest
/

PDFExtractor

Sleeping

File size: 10,173 Bytes

f7d4608

# from pydantic import BaseModel
# from openai import OpenAI
# from typing import List, Dict, Optional, Union

# client = OpenAI()

# class GHGParameter(BaseModel):
#     parameter: str
#     data_type: str
#     synonyms: Optional[List[str]] = None
#     uom: Optional[str] = None
#     description: Optional[str] = None
#     value: Union[int, str, None]


# class GHGCategory(BaseModel):
#     category: str
#     parameters: List[GHGParameter]

# SCHEMA = """{
#   "Gas (GHG)": {
#     "Total GHG Emissions": {
#       "data_type": "Numeric",
#       "synonyms": ["Carbon Footprint"],
#       "uom": "Metric Tons CO₂e",
#       "description": "Total greenhouse gases emitted by the organization.",
#       "value": null
#     }"""

# PROMPT = (f"""You are a PDF parsing agent.
#             Fetch the following data from the PDF: {SCHEMA}"""
#             )

# def extract_emissions_data_as_json(api, model, file_input):
#     if api.lower() == "openai":
#         file = client.files.create(
#             file=("uploaded.pdf", file_input),
#             purpose="assistants"
#         )

#         completion = client.beta.chat.completions.parse(
#             model="gpt-4o-2024-08-06",
#             messages=[
#                 {
#                     "role": "user",
#                     "content": [
#                         {
#                             "type": "file",
#                             "file": {
#                                 "file_id": file.id,
#                             }
#                         },
#                         {
#                             "type": "text",
#                             "text":PROMPT,
#                         },
#                     ]
#                 }
#             ],
#             response_format=GHGCategory,
#         )

#         research_paper = completion.choices[0].message.parsed
#         return research_paper

# from pydantic import BaseModel
# from openai import OpenAI

# client = OpenAI()

# class CalendarEvent(BaseModel):
#     name: str
#     date: str
#     participants: list[str]

# def extract_emissions_data_as_json(api, model, file_input):
#     if api.lower() == "openai":
#             file = client.files.create(
#                 file=("uploaded.pdf", file_input),
#                 purpose="assistants"
#             )

#     completion = client.beta.chat.completions.parse(
#         model="gpt-4o-2024-08-06",
#         messages=[
#                     {
#                         "role": "user",
#                         "content": [
#                             {
#                                 "type": "file",
#                                 "file": {
#                                     "file_id": file.id,
#                                 }
#                             },
#                             {
#                                 "type": "text",
#                                 "text":PROMPT,
#                             },
#                         ]
#                     }
#                 ],
#         response_format=GHGCategory,
#     )

#     event = completion.choices[0].message.parsed

# response = client.chat.completions.create(
#     model="gpt-4o-2024-08-06",
#     messages=[
#         {"role": "system", "content": "You are a helpful math tutor. Guide the user through the solution step by step."},
#         {"role": "user", "content": "how can I solve 8x + 7 = -23"}
#     ],
#     response_format={
#         "type": "json_schema",
#         "json_schema": {
#             "name": "GHGCategory",
#             "schema": {
#                 "type": "object",
#                 "properties": {
#                     "steps": {
#                         "type": "array",
#                         "items": {
#                             "type": "object",
#                             "properties": {
#                                 "explanation": {"type": "string"},
#                                 "output": {"type": "string"}
#                             },
#                             "required": ["explanation", "output"],
#                             "additionalProperties": False
#                         }
#                     },
#                     "final_answer": {"type": "string"}
#                 },
#                 "required": ["steps", "final_answer"],
#                 "additionalProperties": False
#             },
#             "strict": True
#         }
#     }
# )

# print(response.choices[0].message.content)


# response = await async_client.responses.create(
#             model="gpt-4o",
#             input=[
#                 {
#                     "role": "user",
#                     "content": [
#                         {
#                             "type": "input_file",
#                             "file_id": uploaded_file.id,
#                         },
#                         {
#                             "type": "input_text",
#                             "text": """
#                             You are an intelligent PDF data extractor designed to extract structured information from Brand Books. A Brand Book contains guidelines and details about a brand's identity, including its logo, colors, typography, messaging, and more.
#                             Ensure the extracted data follows this schema strictly.
#                             Return the extracted brand information in JSON format with no explanation.
#                             For brand_logo and favicon, always provide a direct URL to the image instead of just the image name or a placeholder. If no valid URLs are found, return an empty array.                        """
#                         }
#                     ]
#                 }
#             ],
#             text={
#                 "format": {
#                     "type": "json_schema",
#                     "name": "BrandBook",
#                     "strict": True,
#                     "schema": {
#                         "type": "object",
#                         "properties": {
#                             "brand_url": {
#                                 "type": "string",
#                                 "description": "The URL associated with the brand."
#                             },
#                             "brand_name": {
#                                 "type": "string",
#                                 "description": "The name of the brand."
#                             },
#                             "brand_category": {
#                                 "type": "array",
#                                 "description": "A list of categories that the brand belongs to.",
#                                 "items": {
#                                     "type": "string"
#                                 }
#                             },
#                             "brand_description": {
#                                 "type": "string",
#                                 "description": "A brief description of the brand."
#                             },
#                             "brand_colors": {
#                                 "type": "array",
#                                 "description": "A list of colors associated with the brand.",
#                                 "items": {
#                                     "type": "string"
#                                 }
#                             },
#                             "brand_fonts": {
#                                 "type": "array",
#                                 "description": "A list of fonts used by the brand.",
#                                 "items": {
#                                     "type": "string"
#                                 }
#                             },
#                             "brand_logo": {
#                                 "type": "array",
#                                 "description": "A list of logo URLs associated with the brand.",
#                                 "items": {
#                                     "type": "string"
#                                 }
#                             },
#                             "target_audience": {
#                                 "type": "string",
#                                 "description": "The target audience for the brand."
#                             },
#                             "competitors": {
#                                 "type": "string",
#                                 "description": "The competitors of the brand."
#                             },
#                             "aspirational_brands": {
#                                 "type": "string",
#                                 "description": "Brands that the brand aspires to be like."
#                             },
#                             "favicon": {
#                                 "type": "array",
#                                 "description": "A list of favicon URLs associated with the brand.",
#                                 "items": {
#                                     "type": "string"
#                                 }
#                             }
#                         },
#                         "required": [
#                             "brand_url",
#                             "brand_name",
#                             "brand_category",
#                             "brand_description",
#                             "brand_colors",
#                             "brand_fonts",
#                             "brand_logo",
#                             "target_audience",
#                             "competitors",
#                             "aspirational_brands",
#                             "favicon"
#                         ],
#                         "additionalProperties": False
#                     }
#                 }
#             }
#         )