Spaces:

VelaTest
/

PDFExtractor

Sleeping

PDFExtractor / application /services /openai_model.py

Vela

Created a PdfExtraction application with basic functionality

f7d4608 3 months ago

10.2 kB

	# from pydantic import BaseModel
	# from openai import OpenAI
	# from typing import List, Dict, Optional, Union

	# client = OpenAI()

	# class GHGParameter(BaseModel):
	# parameter: str
	# data_type: str
	# synonyms: Optional[List[str]] = None
	# uom: Optional[str] = None
	# description: Optional[str] = None
	# value: Union[int, str, None]


	# class GHGCategory(BaseModel):
	# category: str
	# parameters: List[GHGParameter]

	# SCHEMA = """{
	# "Gas (GHG)": {
	# "Total GHG Emissions": {
	# "data_type": "Numeric",
	# "synonyms": ["Carbon Footprint"],
	# "uom": "Metric Tons CO₂e",
	# "description": "Total greenhouse gases emitted by the organization.",
	# "value": null
	# }
	# }
	# }"""

	# PROMPT = (f"""You are a PDF parsing agent.
	# Fetch the following data from the PDF: {SCHEMA}"""
	# )

	# def extract_emissions_data_as_json(api, model, file_input):
	# if api.lower() == "openai":
	# file = client.files.create(
	# file=("uploaded.pdf", file_input),
	# purpose="assistants"
	# )

	# completion = client.beta.chat.completions.parse(
	# model="gpt-4o-2024-08-06",
	# messages=[
	# {
	# "role": "user",
	# "content": [
	# {
	# "type": "file",
	# "file": {
	# "file_id": file.id,
	# }
	# },
	# {
	# "type": "text",
	# "text":PROMPT,
	# },
	# ]
	# }
	# ],
	# response_format=GHGCategory,
	# )

	# research_paper = completion.choices[0].message.parsed
	# return research_paper

	# from pydantic import BaseModel
	# from openai import OpenAI

	# client = OpenAI()

	# class CalendarEvent(BaseModel):
	# name: str
	# date: str
	# participants: list[str]

	# def extract_emissions_data_as_json(api, model, file_input):
	# if api.lower() == "openai":
	# file = client.files.create(
	# file=("uploaded.pdf", file_input),
	# purpose="assistants"
	# )

	# completion = client.beta.chat.completions.parse(
	# model="gpt-4o-2024-08-06",
	# messages=[
	# {
	# "role": "user",
	# "content": [
	# {
	# "type": "file",
	# "file": {
	# "file_id": file.id,
	# }
	# },
	# {
	# "type": "text",
	# "text":PROMPT,
	# },
	# ]
	# }
	# ],
	# response_format=GHGCategory,
	# )

	# event = completion.choices[0].message.parsed

	# response = client.chat.completions.create(
	# model="gpt-4o-2024-08-06",
	# messages=[
	# {"role": "system", "content": "You are a helpful math tutor. Guide the user through the solution step by step."},
	# {"role": "user", "content": "how can I solve 8x + 7 = -23"}
	# ],
	# response_format={
	# "type": "json_schema",
	# "json_schema": {
	# "name": "GHGCategory",
	# "schema": {
	# "type": "object",
	# "properties": {
	# "steps": {
	# "type": "array",
	# "items": {
	# "type": "object",
	# "properties": {
	# "explanation": {"type": "string"},
	# "output": {"type": "string"}
	# },
	# "required": ["explanation", "output"],
	# "additionalProperties": False
	# }
	# },
	# "final_answer": {"type": "string"}
	# },
	# "required": ["steps", "final_answer"],
	# "additionalProperties": False
	# },
	# "strict": True
	# }
	# }
	# )

	# print(response.choices[0].message.content)


	# response = await async_client.responses.create(
	# model="gpt-4o",
	# input=[
	# {
	# "role": "user",
	# "content": [
	# {
	# "type": "input_file",
	# "file_id": uploaded_file.id,
	# },
	# {
	# "type": "input_text",
	# "text": """
	# You are an intelligent PDF data extractor designed to extract structured information from Brand Books. A Brand Book contains guidelines and details about a brand's identity, including its logo, colors, typography, messaging, and more.
	# Ensure the extracted data follows this schema strictly.
	# Return the extracted brand information in JSON format with no explanation.
	# For brand_logo and favicon, always provide a direct URL to the image instead of just the image name or a placeholder. If no valid URLs are found, return an empty array. """
	# }
	# ]
	# }
	# ],
	# text={
	# "format": {
	# "type": "json_schema",
	# "name": "BrandBook",
	# "strict": True,
	# "schema": {
	# "type": "object",
	# "properties": {
	# "brand_url": {
	# "type": "string",
	# "description": "The URL associated with the brand."
	# },
	# "brand_name": {
	# "type": "string",
	# "description": "The name of the brand."
	# },
	# "brand_category": {
	# "type": "array",
	# "description": "A list of categories that the brand belongs to.",
	# "items": {
	# "type": "string"
	# }
	# },
	# "brand_description": {
	# "type": "string",
	# "description": "A brief description of the brand."
	# },
	# "brand_colors": {
	# "type": "array",
	# "description": "A list of colors associated with the brand.",
	# "items": {
	# "type": "string"
	# }
	# },
	# "brand_fonts": {
	# "type": "array",
	# "description": "A list of fonts used by the brand.",
	# "items": {
	# "type": "string"
	# }
	# },
	# "brand_logo": {
	# "type": "array",
	# "description": "A list of logo urls associated with the brand.",
	# "items": {
	# "type": "string"
	# }
	# },
	# "target_audience": {
	# "type": "string",
	# "description": "The target audience for the brand."
	# },
	# "competitors": {
	# "type": "string",
	# "description": "The competitors of the brand."
	# },
	# "aspirational_brands": {
	# "type": "string",
	# "description": "Brands that the brand aspires to be like."
	# },
	# "favicon": {
	# "type": "array",
	# "description": "A list of favicon URLs associated with the brand.",
	# "items": {
	# "type": "string"
	# }
	# }
	# },
	# "required": [
	# "brand_url",
	# "brand_name",
	# "brand_category",
	# "brand_description",
	# "brand_colors",
	# "brand_fonts",
	# "brand_logo",
	# "target_audience",
	# "competitors",
	# "aspirational_brands",
	# "favicon"
	# ],
	# "additionalProperties": False
	# }
	# }
	# }
	# )