ShawnRu commited on
Commit
32e142e
·
verified ·
1 Parent(s): 56899fb

Upload 34 files

Browse files
examples/config/BookExtraction.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ # Recommend using ChatGPT or DeepSeek APIs for complex IE task.
3
+ category: ChatGPT # model category, chosen from ChatGPT, DeepSeek, LLaMA, Qwen, ChatGLM, MiniCPM, OneKE.
4
+ model_name_or_path: gpt-4o-mini # model name, chosen from the model list of the selected category.
5
+ api_key: your_api_key # your API key for the model with API service. No need for open-source models.
6
+ base_url: https://api.openai.com/v1 # base URL for the API service. No need for open-source models.
7
+
8
+ extraction:
9
+ task: Base # task type, chosen from Base, NER, RE, EE.
10
+ instruction: Extract main characters and background setting from this chapter. # description for the task. No need for NER, RE, EE task.
11
+ use_file: true # whether to use a file for the input text. Default set to false.
12
+ file_path: ./data/input_files/Harry_Potter_Chapter1.pdf # path to the input file. No need if use_file is set to false.
13
+ mode: quick # extraction mode, chosen from quick, detailed, customized. Default set to quick. See src/config.yaml for more details.
14
+ update_case: false # whether to update the case repository. Default set to false.
15
+ show_trajectory: false # whether to display the extracted intermediate steps
examples/config/EE.yaml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ category: DeepSeek # model category, chosen from ChatGPT, DeepSeek, LLaMA, Qwen, ChatGLM, MiniCPM, OneKE.
3
+ model_name_or_path: deepseek-chat # model name, chosen from the model list of the selected category.
4
+ api_key: your_api_key # your API key for the model with API service. No need for open-source models.
5
+ base_url: https://api.deepseek.com # base URL for the API service. No need for open-source models.
6
+
7
+ extraction:
8
+ task: EE # task type, chosen from Base, NER, RE, EE.
9
+ text: UConn Health , an academic medical center , says in a media statement that it identified approximately 326,000 potentially impacted individuals whose personal information was contained in the compromised email accounts. # input text for the extraction task. No need if use_file is set to true.
10
+ constraint: {"phishing": ["damage amount", "attack pattern", "tool", "victim", "place", "attacker", "purpose", "trusted entity", "time"], "data breach": ["damage amount", "attack pattern", "number of data", "number of victim", "tool", "compromised data", "victim", "place", "attacker", "purpose", "time"], "ransom": ["damage amount", "attack pattern", "payment method", "tool", "victim", "place", "attacker", "price", "time"], "discover vulnerability": ["vulnerable system", "vulnerability", "vulnerable system owner", "vulnerable system version", "supported platform", "common vulnerabilities and exposures", "capabilities", "time", "discoverer"], "patch vulnerability": ["vulnerable system", "vulnerability", "issues addressed", "vulnerable system version", "releaser", "supported platform", "common vulnerabilities and exposures", "patch number", "time", "patch"]} # Specified event type and the corresponding arguments for the event extraction task. Structured as a dictionary with the event type as the key and the list of arguments as the value. Default set to empty.
11
+ use_file: false # whether to use a file for the input text.
12
+ mode: standard # extraction mode, chosen from quick, detailed, customized. Default set to quick. See src/config.yaml for more details.
13
+ update_case: false # whether to update the case repository. Default set to false.
14
+ show_trajectory: false # whether to display the extracted intermediate steps
examples/config/NER.yaml ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ category: LLaMA # model category, chosen from ChatGPT, DeepSeek, LLaMA, Qwen, ChatGLM, MiniCPM, OneKE.
3
+ model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct # model name to download from huggingface or use the local model path.
4
+ vllm_serve: false # whether to use the vllm. Default set to false.
5
+
6
+ extraction:
7
+ task: NER # task type, chosen from Base, NER, RE, EE.
8
+ text: Finally , every other year , ELRA organizes a major conference LREC , the International Language Resources and Evaluation Conference . # input text for the extraction task. No need if use_file is set to true.
9
+ constraint: ["algorithm", "conference", "else", "product", "task", "field", "metrics", "organization", "researcher", "program language", "country", "location", "person", "university"] # Specified entity types for the named entity recognition task. Default set to empty.
10
+ use_file: false # whether to use a file for the input text.
11
+ mode: quick # extraction mode, chosen from quick, detailed, customized. Default set to quick. See src/config.yaml for more details.
12
+ update_case: false # whether to update the case repository. Default set to false.
13
+ show_trajectory: false # whether to display the extracted intermediate steps
examples/config/NewsExtraction.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ category: DeepSeek # model category, chosen from ChatGPT, DeepSeek, LLaMA, Qwen, ChatGLM, MiniCPM, OneKE.
3
+ model_name_or_path: deepseek-chat # model name, chosen from the model list of the selected category.
4
+ api_key: your_api_key # your API key for the model with API service. No need for open-source models.
5
+ base_url: https://api.deepseek.com # base URL for the API service. No need for open-source models.
6
+
7
+ extraction:
8
+ task: Base # task type, chosen from Base, NER, RE, EE.
9
+ instruction: Extract key information from the given text. # description for the task. No need for NER, RE, EE task.
10
+ use_file: true # whether to use a file for the input text. Default set to false.
11
+ file_path: ./data/input_files/Tulsi_Gabbard_News.html # path to the input file. No need if use_file is set to false.
12
+ output_schema: NewsReport # output schema for the extraction task. Selected from the schema repository.
13
+ mode: customized # extraction mode, chosen from quick, detailed, customized. Default set to quick. See src/config.yaml for more details.
14
+ update_case: false # whether to update the case repository. Default set to false.
15
+ show_trajectory: false # whether to display the extracted intermediate steps
examples/config/RE.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ category: ChatGPT # model category, chosen from ChatGPT, DeepSeek, LLaMA, Qwen, ChatGLM, MiniCPM, OneKE.
3
+ model_name_or_path: gpt-4o-mini # model name, chosen from the model list of the selected category.
4
+ api_key: your_api_key # your API key for the model with API service. No need for open-source models.
5
+ base_url: https://api.openai.com/v1 # base URL for the API service. No need for open-source models.
6
+
7
+ extraction:
8
+ task: RE # task type, chosen from Base, NER, RE, EE.
9
+ text: The aid group Doctors Without Borders said that since Saturday , more than 275 wounded people had been admitted and treated at Donka Hospital in the capital of Guinea , Conakry . # input text for the extraction task. No need if use_file is set to true.
10
+ constraint: ["nationality", "country capital", "place of death", "children", "location contains", "place of birth", "place lived", "administrative division of country", "country of administrative divisions", "company", "neighborhood of", "company founders"] # Specified relation types for the relation extraction task. Default set to empty.
11
+ truth: {"relation_list": [{"head": "Guinea", "tail": "Conakry", "relation": "country capital"}]} # Truth data for the relation extraction task. Structured as a dictionary with the list of relation tuples as the value. Required if set update_case to true.
12
+ use_file: false # whether to use a file for the input text.
13
+ mode: quick # extraction mode, chosen from quick, detailed, customized. Default set to quick. See src/config.yaml for more details.
14
+ update_case: true # whether to update the case repository. Default set to false.
15
+ show_trajectory: false # whether to display the extracted intermediate steps
examples/config/Triple2KG.yaml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ # Recommend using ChatGPT or DeepSeek APIs for complex Triple task.
3
+ category: ChatGPT # model category, chosen from ChatGPT, DeepSeek, LLaMA, Qwen, ChatGLM, MiniCPM, OneKE.
4
+ model_name_or_path: gpt-4o-mini # model name, chosen from the model list of the selected category.
5
+ api_key: your_api_key # your API key for the model with API service. No need for open-source models.
6
+ base_url: https://api.openai.com/v1 # base URL for the API service. No need for open-source models.
7
+
8
+ extraction:
9
+ mode: quick # extraction mode, chosen from quick, detailed, customized. Default set to quick. See src/config.yaml for more details.
10
+ task: Triple # task type, chosen from Base, NER, RE, EE. Now newly added task 'Triple'.
11
+ use_file: true # whether to use a file for the input text. Default set to false.
12
+ file_path: ./data/input_files/Artificial_Intelligence_Wikipedia.txt # path to the input file. No need if use_file is set to false.
13
+ constraint: [["Person", "Place", "Event", "Property"], ["Interpersonal", "Located", "Ownership", "Action"]] # Specified entity or relation types for Triple Extraction task. You can write 3 lists for subject, relation and object types. Or you can write 2 lists for entity and relation types. Or you can write 1 list for entity type only.
14
+ update_case: false # whether to update the case repository. Default set to false.
15
+ show_trajectory: false # whether to display the extracted intermediate steps
16
+
17
+ # construct: # (Optional) If you want to construct a Knowledge Graph, you need to set the construct field, or you must delete this field.
18
+ # database: Neo4j # database type, now only support Neo4j.
19
+ # url: neo4j://localhost:7687 # your database URL; Neo4j's default port is 7687.
20
+ # username: your_username # your database username.
21
+ # password: "your_password" # your database password.
examples/example.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ sys.path.append("./src")
3
+ from models import *
4
+ from pipeline import *
5
+ import json
6
+
7
+ # model configuration
8
+ model = ChatGPT(model_name_or_path="your_model_name_or_path", api_key="your_api_key")
9
+ pipeline = Pipeline(model)
10
+
11
+ # extraction configuration
12
+ Task = "NER"
13
+ Text = "Finally , every other year , ELRA organizes a major conference LREC , the International Language Resources and Evaluation Conference."
14
+ Constraint = ["nationality", "country capital", "place of death", "children", "location contains", "place of birth", "place lived", "administrative division of country", "country of administrative divisions", "company", "neighborhood of", "company founders"]
15
+
16
+ # get extraction result
17
+ result, trajectory, frontend_schema, frontend_res = pipeline.get_extract_result(task=Task, text=Text, constraint=Constraint, show_trajectory=True)
examples/results/BookExtraction.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "main_characters": [
3
+ {
4
+ "name": "Mr. Dursley",
5
+ "description": "The director of a firm called Grunnings, a big, beefy man with hardly any neck and a large mustache."
6
+ },
7
+ {
8
+ "name": "Mrs. Dursley",
9
+ "description": "Thin and blonde, with nearly twice the usual amount of neck, spends time spying on neighbors."
10
+ },
11
+ {
12
+ "name": "Dudley Dursley",
13
+ "description": "The small son of Mr. and Mrs. Dursley, considered by them to be the finest boy anywhere."
14
+ },
15
+ {
16
+ "name": "Albus Dumbledore",
17
+ "description": "A tall, thin, and very old man with long silver hair and a purple cloak, who arrives mysteriously."
18
+ },
19
+ {
20
+ "name": "Professor McGonagall",
21
+ "description": "A severe-looking woman who can transform into a cat, wearing an emerald cloak."
22
+ },
23
+ {
24
+ "name": "Voldemort",
25
+ "description": "The dark wizard who has caused fear and chaos, but has mysteriously disappeared."
26
+ },
27
+ {
28
+ "name": "Harry Potter",
29
+ "description": "The young boy who survived Voldemort's attack, becoming a significant figure in the wizarding world."
30
+ },
31
+ {
32
+ "name": "Lily Potter",
33
+ "description": "Harry's mother, who is mentioned as having been killed by Voldemort."
34
+ },
35
+ {
36
+ "name": "James Potter",
37
+ "description": "Harry's father, who is mentioned as having been killed by Voldemort."
38
+ },
39
+ {
40
+ "name": "Hagrid",
41
+ "description": "A giant man who is caring and emotional about Harry's situation."
42
+ }
43
+ ],
44
+ "background_setting": {
45
+ "location": "Number four, Privet Drive, Suburban",
46
+ "time_period": "A dull, gray Tuesday morning, Late 20th Century"
47
+ }
48
+ }
examples/results/EE.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "event_list": [
3
+ {
4
+ "event_type": "data breach",
5
+ "event_trigger": "compromised",
6
+ "event_argument": {
7
+ "number of victim": 326000,
8
+ "compromised data": "personal information contained in email accounts",
9
+ "victim": "individuals whose personal information was compromised"
10
+ }
11
+ }
12
+ ]
13
+ }
examples/results/NER.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "entity_list": [
3
+ {
4
+ "name": "ELRA",
5
+ "type": "organization"
6
+ },
7
+ {
8
+ "name": "LREC",
9
+ "type": "conference"
10
+ },
11
+ {
12
+ "name": "International Language Resources and Evaluation Conference",
13
+ "type": "conference"
14
+ }
15
+ ]
16
+ }
examples/results/NewsExtraction.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "title": "Who is Tulsi Gabbard? Meet Trump's pick for director of national intelligence",
3
+ "summary": "Tulsi Gabbard, President-elect Donald Trump\u2019s choice for director of national intelligence, could face a challenging Senate confirmation battle due to her lack of intelligence experience and controversial views.",
4
+ "publication_date": "December 4, 2024",
5
+ "keywords": [
6
+ "Tulsi Gabbard",
7
+ "Donald Trump",
8
+ "director of national intelligence",
9
+ "confirmation battle",
10
+ "intelligence agencies",
11
+ "Russia",
12
+ "Syria",
13
+ "Bashar al-Assad"
14
+ ],
15
+ "events": [
16
+ {
17
+ "name": "Tulsi Gabbard's nomination for director of national intelligence",
18
+ "people_involved": [
19
+ {
20
+ "name": "Tulsi Gabbard",
21
+ "identity": "Former U.S. Representative",
22
+ "role": "Nominee for director of national intelligence"
23
+ },
24
+ {
25
+ "name": "Donald Trump",
26
+ "identity": "President-elect",
27
+ "role": "Nominator"
28
+ },
29
+ {
30
+ "name": "Tammy Duckworth",
31
+ "identity": "Democratic Senator",
32
+ "role": "Critic of Gabbard's nomination"
33
+ },
34
+ {
35
+ "name": "Olivia Troye",
36
+ "identity": "Former national security official",
37
+ "role": "Commentator on Gabbard's potential impact"
38
+ }
39
+ ],
40
+ "process": "Gabbard's nomination is expected to lead to a Senate confirmation battle."
41
+ }
42
+ ],
43
+ "quotes": {
44
+ "Tammy Duckworth": "The U.S. intelligence community has identified her as having troubling relationships with America\u2019s foes, and so my worry is that she couldn\u2019t pass a background check.",
45
+ "Olivia Troye": "If Gabbard is confirmed, America\u2019s allies may not share as much information with the U.S."
46
+ },
47
+ "viewpoints": [
48
+ "Gabbard's lack of intelligence experience raises concerns about her ability to oversee 18 intelligence agencies.",
49
+ "Her past comments and meetings with foreign adversaries have led to accusations of being a national security risk."
50
+ ]
51
+ }
examples/results/RE.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "relation_list": [
3
+ {
4
+ "head": "Guinea",
5
+ "tail": "Conakry",
6
+ "relation": "country capital"
7
+ }
8
+ ]
9
+ }
examples/results/TripleExtraction.json ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "triple_list": [
3
+ {
4
+ "head": "sea levels",
5
+ "head_type": "Property",
6
+ "relation": "wiped out",
7
+ "relation_type": "Action",
8
+ "tail": "coastal cities",
9
+ "tail_type": "Place"
10
+ },
11
+ {
12
+ "head": "nations",
13
+ "head_type": "Person",
14
+ "relation": "created",
15
+ "relation_type": "Action",
16
+ "tail": "mechas",
17
+ "tail_type": "Property"
18
+ },
19
+ {
20
+ "head": "David",
21
+ "head_type": "Person",
22
+ "relation": "given to",
23
+ "relation_type": "Ownership",
24
+ "tail": "Henry and Monica",
25
+ "tail_type": "Person"
26
+ },
27
+ {
28
+ "head": "Monica",
29
+ "head_type": "Person",
30
+ "relation": "feels uncomfortable",
31
+ "relation_type": "Interpersonal",
32
+ "tail": "David",
33
+ "tail_type": "Person"
34
+ },
35
+ {
36
+ "head": "David",
37
+ "head_type": "Person",
38
+ "relation": "befriends",
39
+ "relation_type": "Interpersonal",
40
+ "tail": "Teddy",
41
+ "tail_type": "Person"
42
+ },
43
+ {
44
+ "head": "Martin",
45
+ "head_type": "Person",
46
+ "relation": "goads",
47
+ "relation_type": "Action",
48
+ "tail": "David",
49
+ "tail_type": "Person"
50
+ },
51
+ {
52
+ "head": "David",
53
+ "head_type": "Person",
54
+ "relation": "blamed for",
55
+ "relation_type": "Action",
56
+ "tail": "incident",
57
+ "tail_type": "Event"
58
+ },
59
+ {
60
+ "head": "Monica",
61
+ "head_type": "Person",
62
+ "relation": "returns David to",
63
+ "relation_type": "Ownership",
64
+ "tail": "creators",
65
+ "tail_type": "Person"
66
+ },
67
+ {
68
+ "head": "David",
69
+ "head_type": "Person",
70
+ "relation": "decides to find",
71
+ "relation_type": "Action",
72
+ "tail": "Blue Fairy",
73
+ "tail_type": "Property"
74
+ },
75
+ {
76
+ "head": "David",
77
+ "head_type": "Person",
78
+ "relation": "pleads for",
79
+ "relation_type": "Action",
80
+ "tail": "his life",
81
+ "tail_type": "Event"
82
+ },
83
+ {
84
+ "head": "David",
85
+ "head_type": "Person",
86
+ "relation": "meets",
87
+ "relation_type": "Interpersonal",
88
+ "tail": "Professor Hobby",
89
+ "tail_type": "Person"
90
+ },
91
+ {
92
+ "head": "David",
93
+ "head_type": "Person",
94
+ "relation": "attempts",
95
+ "relation_type": "Action",
96
+ "tail": "suicide",
97
+ "tail_type": "Event"
98
+ },
99
+ {
100
+ "head": "Joe",
101
+ "head_type": "Person",
102
+ "relation": "rescues",
103
+ "relation_type": "Action",
104
+ "tail": "David",
105
+ "tail_type": "Person"
106
+ },
107
+ {
108
+ "head": "David",
109
+ "head_type": "Person",
110
+ "relation": "asks statue to turn him into",
111
+ "relation_type": "Action",
112
+ "tail": "real boy",
113
+ "tail_type": "Property"
114
+ },
115
+ {
116
+ "head": "humanity",
117
+ "head_type": "Person",
118
+ "relation": "is extinct",
119
+ "relation_type": "Action",
120
+ "tail": "future",
121
+ "tail_type": "Event"
122
+ },
123
+ {
124
+ "head": "Specialists",
125
+ "head_type": "Person",
126
+ "relation": "resurrect",
127
+ "relation_type": "Action",
128
+ "tail": "David and Teddy",
129
+ "tail_type": "Person"
130
+ },
131
+ {
132
+ "head": "Monica",
133
+ "head_type": "Person",
134
+ "relation": "can live for",
135
+ "relation_type": "Property",
136
+ "tail": "one day",
137
+ "tail_type": "Property"
138
+ },
139
+ {
140
+ "head": "David",
141
+ "head_type": "Person",
142
+ "relation": "spends",
143
+ "relation_type": "Action",
144
+ "tail": "happiest day with Monica",
145
+ "tail_type": "Event"
146
+ },
147
+ {
148
+ "head": "Monica",
149
+ "head_type": "Person",
150
+ "relation": "tells",
151
+ "relation_type": "Interpersonal",
152
+ "tail": "David",
153
+ "tail_type": "Person"
154
+ }
155
+ ]
156
+ }
src/config.yaml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ embedding_model: all-MiniLM-L6-v2
3
+
4
+ agent:
5
+ default_schema: The final extraction result should be formatted as a JSON object.
6
+ default_ner: Extract the Named Entities in the given text.
7
+ default_re: Extract Relationships between Named Entities in the given text.
8
+ default_ee: Extract the Events in the given text.
9
+ default_triple: Extract the Triples (subject, relation, object) from the given text, hope that all the relationships for each entity can be extracted.
10
+ chunk_token_limit: 1024
11
+ mode:
12
+ quick:
13
+ schema_agent: get_deduced_schema
14
+ extraction_agent: extract_information_direct
15
+ standard:
16
+ schema_agent: get_deduced_schema
17
+ extraction_agent: extract_information_with_case
18
+ reflection_agent: reflect_with_case
19
+ customized:
20
+ schema_agent: get_retrieved_schema
21
+ extraction_agent: extract_information_direct
src/construct/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .convert import *
src/construct/convert.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import re
3
+ from neo4j import GraphDatabase
4
+
5
+
6
+ def sanitize_string(input_str, max_length=255):
7
+ """
8
+ Process the input string to ensure it meets the database requirements.
9
+ """
10
+ # step1: Replace invalid characters
11
+ input_str = re.sub(r'[^a-zA-Z0-9_]', '_', input_str)
12
+
13
+ # step2: Add prefix if it starts with a digit
14
+ if input_str[0].isdigit():
15
+ input_str = 'num' + input_str
16
+
17
+ # step3: Limit length
18
+ if len(input_str) > max_length:
19
+ input_str = input_str[:max_length]
20
+
21
+ return input_str
22
+
23
+
24
+ def generate_cypher_statements(data):
25
+ """
26
+ Generates Cypher query statements based on the provided JSON data.
27
+ """
28
+ cypher_statements = []
29
+ parsed_data = json.loads(data)
30
+
31
+ def create_statement(triple):
32
+ head = triple.get("head")
33
+ head_type = triple.get("head_type")
34
+ relation = triple.get("relation")
35
+ relation_type = triple.get("relation_type")
36
+ tail = triple.get("tail")
37
+ tail_type = triple.get("tail_type")
38
+
39
+ # head_safe = sanitize_string(head) if head else None
40
+ head_type_safe = sanitize_string(head_type) if head_type else None
41
+ # relation_safe = sanitize_string(relation) if relation else None
42
+ relation_type_safe = sanitize_string(relation_type) if relation_type else None
43
+ # tail_safe = sanitize_string(tail) if tail else None
44
+ tail_type_safe = sanitize_string(tail_type) if tail_type else None
45
+
46
+ statement = ""
47
+ if head:
48
+ if head_type_safe:
49
+ statement += f'MERGE (a:{head_type_safe} {{name: "{head}"}}) '
50
+ else:
51
+ statement += f'MERGE (a:UNTYPED {{name: "{head}"}}) '
52
+ if tail:
53
+ if tail_type_safe:
54
+ statement += f'MERGE (b:{tail_type_safe} {{name: "{tail}"}}) '
55
+ else:
56
+ statement += f'MERGE (b:UNTYPED {{name: "{tail}"}}) '
57
+ if relation:
58
+ if head and tail: # Only create relation if head and tail exist.
59
+ if relation_type_safe:
60
+ statement += f'MERGE (a)-[:{relation_type_safe} {{name: "{relation}"}}]->(b);'
61
+ else:
62
+ statement += f'MERGE (a)-[:UNTYPED {{name: "{relation}"}}]->(b);'
63
+ else:
64
+ statement += ';' if statement != "" else ''
65
+ else:
66
+ if relation_type_safe: # if relation is not provided, create relation by `relation_type`.
67
+ statement += f'MERGE (a)-[:{relation_type_safe} {{name: "{relation_type_safe}"}}]->(b);'
68
+ else:
69
+ statement += ';' if statement != "" else ''
70
+ return statement
71
+
72
+ if "triple_list" in parsed_data:
73
+ for triple in parsed_data["triple_list"]:
74
+ cypher_statements.append(create_statement(triple))
75
+ else:
76
+ cypher_statements.append(create_statement(parsed_data))
77
+
78
+ return cypher_statements
79
+
80
+
81
+ def execute_cypher_statements(uri, user, password, cypher_statements):
82
+ """
83
+ Executes the generated Cypher query statements.
84
+ """
85
+ driver = GraphDatabase.driver(uri, auth=(user, password))
86
+
87
+ with driver.session() as session:
88
+ for statement in cypher_statements:
89
+ session.run(statement)
90
+ print(f"Executed: {statement}")
91
+
92
+ # Write executed Cypher statements to a text file if you want.
93
+ # with open("executed_statements.txt", 'a') as f:
94
+ # for statement in cypher_statements:
95
+ # f.write(statement + '\n')
96
+ # f.write('\n')
97
+
98
+ driver.close()
99
+
100
+
101
+ # Here is a test of your database connection:
102
+ if __name__ == "__main__":
103
+ # test_data 1: Contains a list of triples
104
+ test_data = '''
105
+ {
106
+ "triple_list": [
107
+ {
108
+ "head": "J.K. Rowling",
109
+ "head_type": "Person",
110
+ "relation": "wrote",
111
+ "relation_type": "Actions",
112
+ "tail": "Fantastic Beasts and Where to Find Them",
113
+ "tail_type": "Book"
114
+ },
115
+ {
116
+ "head": "Fantastic Beasts and Where to Find Them",
117
+ "head_type": "Book",
118
+ "relation": "extra section of",
119
+ "relation_type": "Affiliation",
120
+ "tail": "Harry Potter Series",
121
+ "tail_type": "Book"
122
+ },
123
+ {
124
+ "head": "J.K. Rowling",
125
+ "head_type": "Person",
126
+ "relation": "wrote",
127
+ "relation_type": "Actions",
128
+ "tail": "Harry Potter Series",
129
+ "tail_type": "Book"
130
+ },
131
+ {
132
+ "head": "Harry Potter Series",
133
+ "head_type": "Book",
134
+ "relation": "create",
135
+ "relation_type": "Actions",
136
+ "tail": "Dumbledore",
137
+ "tail_type": "Person"
138
+ },
139
+ {
140
+ "head": "Fantastic Beasts and Where to Find Them",
141
+ "head_type": "Book",
142
+ "relation": "mention",
143
+ "relation_type": "Actions",
144
+ "tail": "Dumbledore",
145
+ "tail_type": "Person"
146
+ },
147
+ {
148
+ "head": "Voldemort",
149
+ "head_type": "Person",
150
+ "relation": "afrid",
151
+ "relation_type": "Emotion",
152
+ "tail": "Dumbledore",
153
+ "tail_type": "Person"
154
+ },
155
+ {
156
+ "head": "Voldemort",
157
+ "head_type": "Person",
158
+ "relation": "robs",
159
+ "relation_type": "Actions",
160
+ "tail": "the Elder Wand",
161
+ "tail_type": "Weapon"
162
+ },
163
+ {
164
+ "head": "the Elder Wand",
165
+ "head_type": "Weapon",
166
+ "relation": "belong to",
167
+ "relation_type": "Affiliation",
168
+ "tail": "Dumbledore",
169
+ "tail_type": "Person"
170
+ }
171
+ ]
172
+ }
173
+ '''
174
+
175
+ # test_data 2: Contains a single triple
176
+ # test_data = '''
177
+ # {
178
+ # "head": "Christopher Nolan",
179
+ # "head_type": "Person",
180
+ # "relation": "directed",
181
+ # "relation_type": "Action",
182
+ # "tail": "Inception",
183
+ # "tail_type": "Movie"
184
+ # }
185
+ # '''
186
+
187
+ # Generate Cypher query statements
188
+ cypher_statements = generate_cypher_statements(test_data)
189
+
190
+ # Print the generated Cypher query statements
191
+ for statement in cypher_statements:
192
+ print(statement)
193
+ print("\n")
194
+
195
+ # Execute the generated Cypher query statements
196
+ execute_cypher_statements(
197
+ uri="neo4j://localhost:7687", # your URI
198
+ user="your_username", # your username
199
+ password="your_password", # your password
200
+ cypher_statements=cypher_statements,
201
+ )
src/models/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .llm_def import *
2
+ from .prompt_example import *
3
+ from .prompt_template import *
src/models/llm_def.py ADDED
@@ -0,0 +1,278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Supported Models.
3
+ Supports:
4
+ - Open Source: LLaMA3, Qwen2.5, MiniCPM3, ChatGLM4
5
+ - Closed Source: ChatGPT, DeepSeek
6
+ """
7
+
8
+ from transformers import pipeline
9
+ from transformers import AutoTokenizer, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoConfig, GenerationConfig
10
+ import torch
11
+ import openai
12
+ import os
13
+ from openai import OpenAI
14
+
15
+ # The inferencing code is taken from the official documentation
16
+
17
+ class BaseEngine:
18
+ def __init__(self, model_name_or_path: str):
19
+ self.name = None
20
+ self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
21
+ self.temperature = 0.2
22
+ self.top_p = 0.9
23
+ self.max_tokens = 1024
24
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
25
+
26
+ def get_chat_response(self, prompt):
27
+ raise NotImplementedError
28
+
29
+ def set_hyperparameter(self, temperature: float = 0.2, top_p: float = 0.9, max_tokens: int = 1024):
30
+ self.temperature = temperature
31
+ self.top_p = top_p
32
+ self.max_tokens = max_tokens
33
+
34
+ class LLaMA(BaseEngine):
35
+ def __init__(self, model_name_or_path: str):
36
+ super().__init__(model_name_or_path)
37
+ self.name = "LLaMA"
38
+ self.model_id = model_name_or_path
39
+ self.pipeline = pipeline(
40
+ "text-generation",
41
+ model=self.model_id,
42
+ model_kwargs={"torch_dtype": torch.bfloat16},
43
+ device_map="auto",
44
+ )
45
+ self.terminators = [
46
+ self.pipeline.tokenizer.eos_token_id,
47
+ self.pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
48
+ ]
49
+
50
+ def get_chat_response(self, prompt):
51
+ messages = [
52
+ {"role": "system", "content": "You are a helpful assistant."},
53
+ {"role": "user", "content": prompt},
54
+ ]
55
+ outputs = self.pipeline(
56
+ messages,
57
+ max_new_tokens=self.max_tokens,
58
+ eos_token_id=self.terminators,
59
+ do_sample=True,
60
+ temperature=self.temperature,
61
+ top_p=self.top_p,
62
+ )
63
+ return outputs[0]["generated_text"][-1]['content'].strip()
64
+
65
+ class Qwen(BaseEngine):
66
+ def __init__(self, model_name_or_path: str):
67
+ super().__init__(model_name_or_path)
68
+ self.name = "Qwen"
69
+ self.model_id = model_name_or_path
70
+ self.model = AutoModelForCausalLM.from_pretrained(
71
+ self.model_id,
72
+ torch_dtype="auto",
73
+ device_map="auto"
74
+ )
75
+
76
+ def get_chat_response(self, prompt):
77
+ messages = [
78
+ {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
79
+ {"role": "user", "content": prompt}
80
+ ]
81
+ text = self.tokenizer.apply_chat_template(
82
+ messages,
83
+ tokenize=False,
84
+ add_generation_prompt=True
85
+ )
86
+ model_inputs = self.tokenizer([text], return_tensors="pt").to(self.device)
87
+ generated_ids = self.model.generate(
88
+ **model_inputs,
89
+ temperature=self.temperature,
90
+ top_p=self.top_p,
91
+ max_new_tokens=self.max_tokens
92
+ )
93
+ generated_ids = [
94
+ output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
95
+ ]
96
+ response = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
97
+
98
+ return response
99
+
100
class MiniCPM(BaseEngine):
    """Engine wrapping a locally loaded MiniCPM causal LM."""

    def __init__(self, model_name_or_path: str):
        super().__init__(model_name_or_path)
        self.name = "MiniCPM"
        self.model_id = model_name_or_path
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_id,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            trust_remote_code=True,
        )

    def get_chat_response(self, prompt):
        """Build a chat prompt for MiniCPM, generate, and decode the reply.

        Returns only the newly generated text (prompt tokens stripped).
        """
        chat = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ]
        input_ids = self.tokenizer.apply_chat_template(
            chat,
            return_tensors="pt",
            add_generation_prompt=True,
        ).to(self.device)
        sequences = self.model.generate(
            input_ids,
            temperature=self.temperature,
            top_p=self.top_p,
            max_new_tokens=self.max_tokens,
        )
        # Remove the prompt prefix from each sequence before decoding.
        completions = [
            seq[len(src):] for src, seq in zip(input_ids, sequences)
        ]
        return self.tokenizer.batch_decode(completions, skip_special_tokens=True)[0].strip()
130
+
131
class ChatGLM(BaseEngine):
    """Engine wrapping a locally loaded ChatGLM causal LM."""

    def __init__(self, model_name_or_path: str):
        super().__init__(model_name_or_path)
        self.name = "ChatGLM"
        self.model_id = model_name_or_path
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_id,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            low_cpu_mem_usage=True,
            trust_remote_code=True,
        )

    def get_chat_response(self, prompt):
        """Build a chat prompt for ChatGLM, generate, and decode the reply.

        Returns only the newly generated text (prompt tokens stripped).
        """
        chat = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ]
        encoded = self.tokenizer.apply_chat_template(
            chat,
            return_tensors="pt",
            return_dict=True,
            add_generation_prompt=True,
            tokenize=True,
        ).to(self.device)
        sequences = self.model.generate(
            **encoded,
            temperature=self.temperature,
            top_p=self.top_p,
            max_new_tokens=self.max_tokens,
        )
        # Slice off the prompt tokens; only the continuation is decoded.
        prompt_len = encoded['input_ids'].shape[1]
        completions = sequences[:, prompt_len:]
        return self.tokenizer.batch_decode(completions, skip_special_tokens=True)[0].strip()
160
+
161
class OneKE(BaseEngine):
    """Engine for the OneKE extraction model, loaded with 4-bit quantization."""

    def __init__(self, model_name_or_path: str):
        super().__init__(model_name_or_path)
        self.name = "OneKE"
        self.model_id = model_name_or_path
        config = AutoConfig.from_pretrained(self.model_id, trust_remote_code=True)
        # 4-bit NF4 quantization keeps memory low while computing in bf16.
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            llm_int8_threshold=6.0,
            llm_int8_has_fp16_weight=False,
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
        )
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_id,
            config=config,
            device_map="auto",
            quantization_config=quantization_config,
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
        )

    def get_chat_response(self, prompt):
        """Wrap `prompt` in OneKE's [INST] chat format and return the decoded reply."""
        system_prompt = '<<SYS>>\nYou are a helpful assistant. δ½ ζ˜―δΈ€δΈͺδΉδΊŽεŠ©δΊΊηš„εŠ©ζ‰‹γ€‚\n<</SYS>>\n\n'
        sintruct = '[INST] ' + system_prompt + prompt + '[/INST]'
        # Fix: the original first encoded the bare `prompt` and immediately
        # discarded that tensor; only the [INST]-wrapped instruction is needed.
        input_ids = self.tokenizer.encode(sintruct, return_tensors="pt").to(self.device)
        input_length = input_ids.size(1)
        # NOTE(review): generation limits are hard-coded (1024/512) instead of
        # using self.max_tokens like the other engines — confirm intentional.
        generation_output = self.model.generate(
            input_ids=input_ids,
            generation_config=GenerationConfig(
                max_length=1024,
                max_new_tokens=512,
                return_dict_in_generate=True,
                pad_token_id=self.tokenizer.pad_token_id,
                eos_token_id=self.tokenizer.eos_token_id,
            ),
        )
        # Keep only the continuation tokens, then decode to text.
        generation_output = generation_output.sequences[0]
        generation_output = generation_output[input_length:]
        response = self.tokenizer.decode(generation_output, skip_special_tokens=True)

        return response
196
+
197
class ChatGPT(BaseEngine):
    """Engine backed by the OpenAI chat completions API."""

    def __init__(self, model_name_or_path: str, api_key: str, base_url=openai.base_url):
        self.name = "ChatGPT"
        self.model = model_name_or_path
        self.base_url = base_url
        self.temperature = 0.2
        self.top_p = 0.9
        self.max_tokens = 4096 # Close source model
        # Fall back to the environment variable when no key was supplied.
        self.api_key = api_key if api_key != "" else os.environ["OPENAI_API_KEY"]
        self.client = OpenAI(api_key=self.api_key, base_url=self.base_url)

    def get_chat_response(self, input):
        """Send `input` as a single user message and return the reply text."""
        completion = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": input}],
            stream=False,
            temperature=self.temperature,
            max_tokens=self.max_tokens,
            stop=None,
        )
        return completion.choices[0].message.content
223
+
224
class DeepSeek(BaseEngine):
    """Engine backed by the DeepSeek API (OpenAI-compatible client)."""

    def __init__(self, model_name_or_path: str, api_key: str, base_url="https://api.deepseek.com"):
        self.name = "DeepSeek"
        self.model = model_name_or_path
        self.base_url = base_url
        self.temperature = 0.2
        self.top_p = 0.9
        self.max_tokens = 4096 # Close source model
        # Fall back to the environment variable when no key was supplied.
        self.api_key = api_key if api_key != "" else os.environ["DEEPSEEK_API_KEY"]
        self.client = OpenAI(api_key=self.api_key, base_url=self.base_url)

    def get_chat_response(self, input):
        """Send `input` as a single user message and return the reply text."""
        completion = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": input}],
            stream=False,
            temperature=self.temperature,
            max_tokens=self.max_tokens,
            stop=None,
        )
        return completion.choices[0].message.content
250
+
251
class LocalServer(BaseEngine):
    """Engine that talks to a locally deployed OpenAI-compatible server (e.g. vLLM)."""

    def __init__(self, model_name_or_path: str, base_url="http://localhost:8000/v1"):
        self.name = model_name_or_path.split('/')[-1]
        self.model = model_name_or_path
        self.base_url = base_url
        self.temperature = 0.2
        self.top_p = 0.9
        self.max_tokens = 1024
        # Local servers do not validate keys, but the client requires a value.
        self.api_key = "EMPTY_API_KEY"
        self.client = OpenAI(api_key=self.api_key, base_url=self.base_url)

    def get_chat_response(self, input):
        """Request a completion from the local server.

        Returns the reply text, or None after printing an error (callers rely
        on this best-effort behavior, so failures are not re-raised).
        """
        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "user", "content": input},
                ],
                stream=False,
                temperature=self.temperature,
                max_tokens=self.max_tokens,
                stop=None
            )
            return response.choices[0].message.content
        except ConnectionError:
            # Fix: the old message pointed at port 8080 even though the default
            # base_url uses port 8000; report the configured URL instead.
            print(f"Error: Unable to connect to the server. Please check if the vllm service is running at {self.base_url}.")
        except Exception as e:
            print(f"Error: {e}")
src/models/prompt_example.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Few-shot examples showing the expected JSON output schema for different
# extraction tasks; injected verbatim into schema-deduction prompts.
# Fixes: added the missing "Example1:" label (Example2/3 were labeled),
# removed a stray "Answer:" line, and quoted the `quotes`/`viewpoints` keys
# so Example3 is valid JSON.
json_schema_examples = """
Example1:
**Task**: Please extract all economic policies affecting the stock market between 2015 and 2023 and the exact dates of their implementation.
**Text**: This text is from the field of Economics and represents the genre of Article.
...(example text)...
**Output Schema**:
{
  "economic_policies": [
    {
      "name": null,
      "implementation_date": null
    }
  ]
}

Example2:
**Task**: Tell me the main content of papers related to NLP between 2022 and 2023.
**Text**: This text is from the field of AI and represents the genre of Research Paper.
...(example text)...
**Output Schema**:
{
  "papers": [
    {
      "title": null,
      "content": null
    }
  ]
}

Example3:
**Task**: Extract all the information in the given text.
**Text**: This text is from the field of Political and represents the genre of News Report.
...(example text)...
**Output Schema**:
{
  "news_report":
  {
    "title": null,
    "summary": null,
    "publication_date": null,
    "keywords": [],
    "events": [
      {
        "name": null,
        "time": null,
        "people_involved": [],
        "cause": null,
        "process": null,
        "result": null
      }
    ],
    "quotes": [],
    "viewpoints": []
  }
}
"""
57
+
58
# Few-shot examples showing Pydantic-based output schemas; injected verbatim
# into code-schema deduction prompts.
# Fix: Example3 was missing the "**Output Schema**:" header the other two
# examples have, and ExtractionTarget contained a stray blank line.
code_schema_examples = """
Example1:
**Task**: Extract all the entities in the given text.
**Text**:
...(example text)...
**Output Schema**:
```python
from typing import List, Optional
from pydantic import BaseModel, Field

class Entity(BaseModel):
    label : str = Field(description="The type or category of the entity, such as 'Process', 'Technique', 'Data Structure', 'Methodology', 'Person', etc. ")
    name : str = Field(description="The specific name of the entity. It should represent a single, distinct concept and must not be an empty string. For example, if the entity is a 'Technique', the name could be 'Neural Networks'.")

class ExtractionTarget(BaseModel):
    entity_list : List[Entity] = Field(description="All the entities presented in the context. The entities should encode ONE concept.")
```

Example2:
**Task**: Extract all the information in the given text.
**Text**: This text is from the field of Political and represents the genre of News Article.
...(example text)...
**Output Schema**:
```python
from typing import List, Optional
from pydantic import BaseModel, Field

class Person(BaseModel):
    name: str = Field(description="The name of the person")
    identity: Optional[str] = Field(description="The occupation, status or characteristics of the person.")
    role: Optional[str] = Field(description="The role or function the person plays in an event.")

class Event(BaseModel):
    name: str = Field(description="Name of the event")
    time: Optional[str] = Field(description="Time when the event took place")
    people_involved: Optional[List[Person]] = Field(description="People involved in the event")
    cause: Optional[str] = Field(default=None, description="Reason for the event, if applicable")
    process: Optional[str] = Field(description="Details of the event process")
    result: Optional[str] = Field(default=None, description="Result or outcome of the event")

class NewsReport(BaseModel):
    title: str = Field(description="The title or headline of the news report")
    summary: str = Field(description="A brief summary of the news report")
    publication_date: Optional[str] = Field(description="The publication date of the report")
    keywords: Optional[List[str]] = Field(description="List of keywords or topics covered in the news report")
    events: List[Event] = Field(description="Events covered in the news report")
    quotes: Optional[dict] = Field(default=None, description="Quotes related to the news, with keys as the citation sources and values as the quoted content. ")
    viewpoints: Optional[List[str]] = Field(default=None, description="Different viewpoints regarding the news")
```

Example3:
**Task**: Extract the key information in the given text.
**Text**: This text is from the field of AI and represents the genre of Research Paper.
...(example text)...
**Output Schema**:
```python
from typing import List, Optional
from pydantic import BaseModel, Field

class MetaData(BaseModel):
    title : str = Field(description="The title of the article")
    authors : List[str] = Field(description="The list of the article's authors")
    abstract: str = Field(description="The article's abstract")
    key_words: List[str] = Field(description="The key words associated with the article")

class Baseline(BaseModel):
    method_name : str = Field(description="The name of the baseline method")
    proposed_solution : str = Field(description="the proposed solution in details")
    performance_metrics : str = Field(description="The performance metrics of the method and comparative analysis")

class ExtractionTarget(BaseModel):
    key_contributions: List[str] = Field(description="The key contributions of the article")
    limitation_of_sota : str=Field(description="the summary limitation of the existing work")
    proposed_solution : str = Field(description="the proposed solution in details")
    baselines : List[Baseline] = Field(description="The list of baseline methods and their details")
    performance_metrics : str = Field(description="The performance metrics of the method and comparative analysis")
    paper_limitations : str=Field(description="The limitations of the proposed solution of the paper")
```

"""
src/models/prompt_template.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.prompts import PromptTemplate
2
+ from .prompt_example import *
3
+
4
# ==================================================================== #
#                            SCHEMA AGENT                              #
# ==================================================================== #

# Prompt asking the model to classify/analyze a text before schema deduction.
# Fix: "**Output Shema**" typo corrected to "**Output Schema**".
TEXT_ANALYSIS_INSTRUCTION = """
**Instruction**: Please analyze and categorize the given text.
{examples}
**Text**: {text}

**Output Schema**: {schema}
"""

text_analysis_instruction = PromptTemplate(
    input_variables=["examples", "text", "schema"],
    template=TEXT_ANALYSIS_INSTRUCTION,
)

# Prompt asking the model to deduce a JSON output schema (all values None).
DEDUCE_SCHEMA_JSON_INSTRUCTION = """
**Instruction**: Generate an output format that meets the requirements as described in the task. Pay attention to the following requirements:
- Format: Return your responses in dictionary format as a JSON object.
- Content: Do not include any actual data; all attributes values should be set to None.
- Note: Attributes not mentioned in the task description should be ignored.
{examples}
**Task**: {instruction}

**Text**: {distilled_text}
{text}

Now please deduce the output schema in json format. All attributes values should be set to None.
**Output Schema**:
"""

# Fix: the template has no {schema} placeholder, so "schema" was removed from
# input_variables (langchain validates that these match the template).
deduced_schema_json_instruction = PromptTemplate(
    input_variables=["examples", "instruction", "distilled_text", "text"],
    template=DEDUCE_SCHEMA_JSON_INSTRUCTION,
)

# Prompt asking the model to deduce a Pydantic output schema as Python code.
DEDUCE_SCHEMA_CODE_INSTRUCTION = """
**Instruction**: Based on the provided text and task description, Define the output schema in Python using Pydantic. Name the final extraction target class as 'ExtractionTarget'.
{examples}
**Task**: {instruction}

**Text**: {distilled_text}
{text}

Now please deduce the output schema. Ensure that the output code snippet is wrapped in '```',and can be directly parsed by the Python interpreter.
**Output Schema**: """
deduced_schema_code_instruction = PromptTemplate(
    input_variables=["examples", "instruction", "distilled_text", "text"],
    template=DEDUCE_SCHEMA_CODE_INSTRUCTION,
)


# ==================================================================== #
#                           EXTRACTION AGENT                           #
# ==================================================================== #

# Main extraction prompt. Fix: "extarction" typo corrected to "extraction".
EXTRACT_INSTRUCTION = """
**Instruction**: You are an agent skilled in information extraction. {instruction}
{examples}
**Text**: {text}
{additional_info}
**Output Schema**: {schema}

Now please extract the corresponding information from the text. Ensure that the information you extract has a clear reference in the given text. Set any property not explicitly mentioned in the text to null.
"""

extract_instruction = PromptTemplate(
    input_variables=["instruction", "examples", "text", "schema", "additional_info"],
    template=EXTRACT_INSTRUCTION,
)

# Task-specific system instructions for the OneKE-compatible JSON prompt below.
instruction_mapper = {
    'NER': "You are an expert in named entity recognition. Please extract entities that match the schema definition from the input. Return an empty list if the entity type does not exist. Please respond in the format of a JSON string.",
    'RE': "You are an expert in relationship extraction. Please extract relationship triples that match the schema definition from the input. Return an empty list for relationships that do not exist. Please respond in the format of a JSON string.",
    'EE': "You are an expert in event extraction. Please extract events from the input that conform to the schema definition. Return an empty list for events that do not exist, and return NAN for arguments that do not exist. If an argument has multiple values, please return a list. Respond in the format of a JSON string.",
}

# OneKE expects a single JSON object carrying instruction, schema and input.
EXTRACT_INSTRUCTION_JSON = """
{{
    "instruction": {instruction},
    "schema": {constraint},
    "input": {input},
}}
"""

extract_instruction_json = PromptTemplate(
    input_variables=["instruction", "constraint", "input"],
    template=EXTRACT_INSTRUCTION_JSON,
)

# NOTE(review): a first SUMMARIZE_INSTRUCTION/summarize_instruction pair
# (taking an "examples" variable) used to be defined here but was silently
# shadowed by the definitions in the REFLECTION AGENT section below, so the
# dead first definition has been removed.


# ==================================================================== #
#                           REFLECION AGENT                            #
# ==================================================================== #

# Prompt asking the model to review and possibly correct an extraction result.
REFLECT_INSTRUCTION = """**Instruction**: You are an agent skilled in reflection and optimization based on the original result. Refer to **Reflection Reference** to identify potential issues in the current extraction results.

**Reflection Reference**: {examples}

Now please review each element in the extraction result. Identify and improve any potential issues in the result based on the reflection. NOTE: If the original result is correct, no modifications are needed!

**Task**: {instruction}

**Text**: {text}

**Output Schema**: {schema}

**Original Result**: {result}

"""
reflect_instruction = PromptTemplate(
    input_variables=["instruction", "examples", "text", "schema", "result"],
    template=REFLECT_INSTRUCTION,
)

# Prompt consolidating per-chunk extraction results into one final answer.
SUMMARIZE_INSTRUCTION = """
**Instruction**: Below is a list of results obtained after segmenting and extracting information from a long article. Please consolidate all the answers to generate a final response.

**Task**: {instruction}

**Result List**: {answer_list}
{additional_info}
**Output Schema**: {schema}
Now summarize the information from the Result List.
"""
summarize_instruction = PromptTemplate(
    input_variables=["instruction", "answer_list", "additional_info", "schema"],
    template=SUMMARIZE_INSTRUCTION,
)



# ==================================================================== #
#                           CASE REPOSITORY                            #
# ==================================================================== #

# Prompt asking the model to explain why a known-correct answer is correct.
# Fix: "breif" typo corrected to "brief".
GOOD_CASE_ANALYSIS_INSTRUCTION = """
**Instruction**: Below is an information extraction task and its corresponding correct answer. Provide the reasoning steps that led to the correct answer, along with brief explanation of the answer. Your response should be brief and organized.

**Task**: {instruction}

**Text**: {text}
{additional_info}
**Correct Answer**: {result}

Now please generate the reasoning steps and brief analysis of the **Correct Answer** given above. DO NOT generate your own extraction result.
**Analysis**:
"""
good_case_analysis_instruction = PromptTemplate(
    input_variables=["instruction", "text", "result", "additional_info"],
    template=GOOD_CASE_ANALYSIS_INSTRUCTION,
)

# Prompt asking the model to reflect on why an original answer was wrong.
BAD_CASE_REFLECTION_INSTRUCTION = """
**Instruction**: Based on the task description, compare the original answer with the correct one. Your output should be a brief reflection or concise summarized rules.

**Task**: {instruction}

**Text**: {text}
{additional_info}
**Original Answer**: {original_answer}

**Correct Answer**: {correct_answer}

Now please generate a brief and organized reflection. DO NOT generate your own extraction result.
**Reflection**:
"""

bad_case_reflection_instruction = PromptTemplate(
    input_variables=["instruction", "text", "original_answer", "correct_answer", "additional_info"],
    template=BAD_CASE_REFLECTION_INSTRUCTION,
)
src/models/vllm_serve.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import warnings
3
+ import subprocess
4
+ import sys
5
+ import os
6
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
7
+ from utils import *
8
+
9
def main():
    """Launch a vLLM OpenAI-compatible server for the model named in the config.

    Reads model settings from the YAML file given by --config and starts
    `vllm serve` on port 8000 (the port LocalServer's default base_url expects).
    """
    # Create command-line argument parser
    parser = argparse.ArgumentParser(description='Run the extraction model.')
    parser.add_argument('--config', type=str, required=True,
                        help='Path to the YAML configuration file.')
    parser.add_argument('--tensor-parallel-size', type=int, default=2,
                        help='Tensor parallel size for the VLLM server.')
    parser.add_argument('--max-model-len', type=int, default=32768,
                        help='Maximum model length for the VLLM server.')

    # Parse command-line arguments
    args = parser.parse_args()

    # Load configuration
    config = load_extraction_config(args.config)
    # Model config
    model_config = config['model']
    if not model_config['vllm_serve']:
        warnings.warn("VLLM-deployed model will not be used for extraction. To enable VLLM, set vllm_serve to true in the configuration file.")
    model_name_or_path = model_config['model_name_or_path']
    # Fix: build an argv list and avoid shell=True so a model path containing
    # shell metacharacters cannot be interpreted by the shell.
    command = [
        "vllm", "serve", model_name_or_path,
        "--tensor-parallel-size", str(args.tensor_parallel_size),
        "--max-model-len", str(args.max_model_len),
        "--enforce-eager",
        "--port", "8000",
    ]
    subprocess.run(command)

if __name__ == "__main__":
    main()
src/modules/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from .schema_agent import SchemaAgent
2
+ from .extraction_agent import ExtractionAgent
3
+ from .reflection_agent import ReflectionAgent
4
+ from .knowledge_base.case_repository import CaseRepositoryHandler
src/modules/extraction_agent.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from models import *
2
+ from utils import *
3
+ from .knowledge_base.case_repository import CaseRepositoryHandler
4
+
5
class InformationExtractor:
    """Renders extraction prompts, queries the LLM, and parses JSON replies."""

    def __init__(self, llm: BaseEngine):
        self.llm = llm

    def extract_information(self, instruction="", text="", examples="", schema="", additional_info=""):
        """Run a schema-guided extraction prompt and return the parsed JSON dict."""
        wrapped_examples = good_case_wrapper(examples)
        prompt = extract_instruction.format(
            instruction=instruction,
            examples=wrapped_examples,
            text=text,
            additional_info=additional_info,
            schema=schema,
        )
        raw_reply = self.llm.get_chat_response(prompt)
        return extract_json_dict(raw_reply)

    def extract_information_compatible(self, task="", text="", constraint=""):
        """Run the OneKE-compatible JSON prompt for a NER/RE/EE task."""
        task_instruction = instruction_mapper.get(task)
        prompt = extract_instruction_json.format(
            instruction=task_instruction,
            constraint=constraint,
            input=text,
        )
        raw_reply = self.llm.get_chat_response(prompt)
        return extract_json_dict(raw_reply)

    def summarize_answer(self, instruction="", answer_list="", schema="", additional_info=""):
        """Consolidate per-chunk results into one answer via the summarize prompt."""
        prompt = summarize_instruction.format(
            instruction=instruction,
            answer_list=answer_list,
            schema=schema,
            additional_info=additional_info,
        )
        raw_reply = self.llm.get_chat_response(prompt)
        return extract_json_dict(raw_reply)
28
+
29
class ExtractionAgent:
    """Agent that extracts information from chunked text, optionally augmenting
    prompts with good cases retrieved from the case repository."""

    def __init__(self, llm: BaseEngine, case_repo: CaseRepositoryHandler):
        self.llm = llm
        self.module = InformationExtractor(llm = llm)
        self.case_repo = case_repo
        # Names of the public extraction strategies this agent offers.
        self.methods = ["extract_information_direct", "extract_information_with_case"]

    def __get_constraint(self, data: DataPoint):
        """Rewrite data.constraint into the task-specific prompt fragment.

        Idempotent: returns unchanged if the constraint is empty or already
        wrapped with its marker header. OneKE gets raw/JSON constraints instead
        of prose wrappers.
        """
        if data.constraint == "":
            return data
        if data.task == "NER":
            constraint = json.dumps(data.constraint)
            if "**Entity Type Constraint**" in constraint or self.llm.name == "OneKE":
                return data
            data.constraint = f"\n**Entity Type Constraint**: The type of entities must be chosen from the following list.\n{constraint}\n"
        elif data.task == "RE":
            constraint = json.dumps(data.constraint)
            if "**Relation Type Constraint**" in constraint or self.llm.name == "OneKE":
                return data
            data.constraint = f"\n**Relation Type Constraint**: The type of relations must be chosen from the following list.\n{constraint}\n"
        elif data.task == "EE":
            constraint = json.dumps(data.constraint)
            if "**Event Extraction Constraint**" in constraint:
                return data
            if self.llm.name != "OneKE":
                data.constraint = f"\n**Event Extraction Constraint**: The event type must be selected from the following dictionary keys, and its event arguments should be chosen from its corresponding dictionary values. \n{constraint}\n"
            else:
                # OneKE expects a list of {event_type, trigger, arguments} dicts.
                try:
                    result = [
                        {
                            "event_type": key,
                            "trigger": True,
                            "arguments": value
                        }
                        for key, value in data.constraint.items()
                    ]
                    data.constraint = json.dumps(result)
                except Exception:
                    # Fix: bare `except:` also swallowed KeyboardInterrupt/SystemExit.
                    print("Invalid Constraint: Event Extraction constraint must be a dictionary with event types as keys and lists of arguments as values.", data.constraint)
        elif data.task == "Triple":
            constraint = json.dumps(data.constraint)
            if "**Triple Extraction Constraint**" in constraint:
                return data
            if self.llm.name != "OneKE":
                # The constraint list length selects which element types are
                # restricted: 1 -> entities; 2 -> entities+relations;
                # 3 -> subjects+relations+objects. Empty sub-lists mean "no
                # restriction" for that position.
                if len(data.constraint) == 1: # 1 list means entity
                    data.constraint = f"\n**Triple Extraction Constraint**: Entities type must chosen from following list:\n{constraint}\n"
                elif len(data.constraint) == 2: # 2 list means entity and relation
                    if data.constraint[0] == []:
                        data.constraint = f"\n**Triple Extraction Constraint**: Relation type must chosen from following list:\n{data.constraint[1]}\n"
                    elif data.constraint[1] == []:
                        data.constraint = f"\n**Triple Extraction Constraint**: Entities type must chosen from following list:\n{data.constraint[0]}\n"
                    else:
                        data.constraint = f"\n**Triple Extraction Constraint**: Entities type must chosen from following list:\n{data.constraint[0]}\nRelation type must chosen from following list:\n{data.constraint[1]}\n"
                elif len(data.constraint) == 3: # 3 list means entity, relation and object
                    if data.constraint[0] == []:
                        data.constraint = f"\n**Triple Extraction Constraint**: Relation type must chosen from following list:\n{data.constraint[1]}\nObject Entities must chosen from following list:\n{data.constraint[2]}\n"
                    elif data.constraint[1] == []:
                        data.constraint = f"\n**Triple Extraction Constraint**: Subject Entities must chosen from following list:\n{data.constraint[0]}\nObject Entities must chosen from following list:\n{data.constraint[2]}\n"
                    elif data.constraint[2] == []:
                        data.constraint = f"\n**Triple Extraction Constraint**: Subject Entities must chosen from following list:\n{data.constraint[0]}\nRelation type must chosen from following list:\n{data.constraint[1]}\n"
                    else:
                        data.constraint = f"\n**Triple Extraction Constraint**: Subject Entities must chosen from following list:\n{data.constraint[0]}\nRelation type must chosen from following list:\n{data.constraint[1]}\nObject Entities must chosen from following list:\n{data.constraint[2]}\n"
                else:
                    data.constraint = f"\n**Triple Extraction Constraint**: The type of entities must be chosen from the following list:\n{constraint}\n"
            else:
                print("OneKE does not support Triple Extraction task now, please wait for the next version.")
        # print("data.constraint", data.constraint)
        return data

    def extract_information_direct(self, data: DataPoint):
        """Extract from every chunk without case examples; record the results."""
        data = self.__get_constraint(data)
        result_list = []
        for chunk_text in data.chunk_text_list:
            if self.llm.name != "OneKE":
                extract_direct_result = self.module.extract_information(instruction=data.instruction, text=chunk_text, schema=data.output_schema, examples="", additional_info=data.constraint)
            else:
                extract_direct_result = self.module.extract_information_compatible(task=data.task, text=chunk_text, constraint=data.constraint)
            result_list.append(extract_direct_result)
        function_name = current_function_name()
        data.set_result_list(result_list)
        data.update_trajectory(function_name, result_list)
        return data

    def extract_information_with_case(self, data: DataPoint):
        """Extract from every chunk using good cases retrieved per chunk."""
        data = self.__get_constraint(data)
        result_list = []
        for chunk_text in data.chunk_text_list:
            examples = self.case_repo.query_good_case(data)
            extract_case_result = self.module.extract_information(instruction=data.instruction, text=chunk_text, schema=data.output_schema, examples=examples, additional_info=data.constraint)
            result_list.append(extract_case_result)
        function_name = current_function_name()
        data.set_result_list(result_list)
        data.update_trajectory(function_name, result_list)
        return data

    def summarize_answer(self, data: DataPoint):
        """Merge the per-chunk results into a single final prediction."""
        if len(data.result_list) == 0:
            return data
        if len(data.result_list) == 1:
            # Only one chunk: its result is the final answer, no LLM call needed.
            data.set_pred(data.result_list[0])
            return data
        summarized_result = self.module.summarize_answer(instruction=data.instruction, answer_list=data.result_list, schema=data.output_schema, additional_info=data.constraint)
        function_name = current_function_name()  # fix: variable was misspelled `funtion_name`
        data.set_pred(summarized_result)
        data.update_trajectory(function_name, summarized_result)
        return data
src/modules/knowledge_base/case_repository.json ADDED
The diff for this file is too large to render. See raw diff
 
src/modules/knowledge_base/case_repository.py ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import torch
4
+ import numpy as np
5
+ from utils import *
6
+ from sentence_transformers import SentenceTransformer
7
+ from rapidfuzz import process
8
+ from models import *
9
+ import copy
10
+
11
+ import warnings
12
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
13
+ docker_model_path = "/app/model/all-MiniLM-L6-v2"
14
+ warnings.filterwarnings("ignore", category=FutureWarning, message=r".*clean_up_tokenization_spaces*")
15
+
16
class CaseRepository:
    """Stores good/bad extraction cases per task and retrieves them by a
    hybrid (sentence-embedding + fuzzy-string) similarity.

    The corpus is persisted in case_repository.json next to this module and
    embedded once at start-up.
    """

    def __init__(self):
        # Prefer the model baked into the Docker image; fall back to the
        # embedding model named in the global config.
        try:
            self.embedder = SentenceTransformer(docker_model_path)
        except Exception:  # bare `except:` narrowed — don't swallow SystemExit/KeyboardInterrupt
            self.embedder = SentenceTransformer(config['model']['embedding_model'])
        self.embedder.to(device)
        self.corpus = self.load_corpus()
        self.embedded_corpus = self.embed_corpus()

    def load_corpus(self):
        """Load the JSON case corpus shipped alongside this module."""
        with open(os.path.join(os.path.dirname(__file__), "case_repository.json")) as file:
            corpus = json.load(file)
        return corpus

    def update_corpus(self):
        """Persist the in-memory corpus back to disk (best effort; errors
        are reported but not raised)."""
        try:
            with open(os.path.join(os.path.dirname(__file__), "case_repository.json"), "w") as file:
                json.dump(self.corpus, file, indent=2)
        except Exception as e:
            print(f"Error when updating corpus: {e}")

    def embed_corpus(self):
        """Encode the embed_index of every good/bad case, keyed by task."""
        embedded_corpus = {}
        for key, content in self.corpus.items():
            good_index = [item['index']['embed_index'] for item in content['good']]
            encoded_good_index = self.embedder.encode(good_index, convert_to_tensor=True).to(device)
            bad_index = [item['index']['embed_index'] for item in content['bad']]
            encoded_bad_index = self.embedder.encode(bad_index, convert_to_tensor=True).to(device)
            embedded_corpus[key] = {"good": encoded_good_index, "bad": encoded_bad_index}
        return embedded_corpus

    def get_similarity_scores(self, task: TaskType, embed_index="", str_index="", case_type="", top_k=2):
        """Score all stored cases of `case_type` for `task` against the query.

        Returns (normalized top-k scores, their indices, raw-scale top-k
        scores, their indices). The raw-scale variant is used by the handler
        for its duplicate-detection threshold.
        """
        # (the redundant local re-computation of `device` was removed; the
        # module-level `device` defined above is used throughout)
        # Embedding similarity match
        encoded_embed_query = self.embedder.encode(embed_index, convert_to_tensor=True).to(device)
        embedding_similarity_matrix = self.embedder.similarity(encoded_embed_query, self.embedded_corpus[task][case_type])
        embedding_similarity_scores = embedding_similarity_matrix[0].to(device)

        # String similarity match — RapidFuzz scores are on a 0-100 scale
        str_match_corpus = [item['index']['str_index'] for item in self.corpus[task][case_type]]
        str_similarity_results = process.extract(str_index, str_match_corpus, limit=len(str_match_corpus))
        scores_dict = {match[0]: match[1] for match in str_similarity_results}
        scores_in_order = [scores_dict[candidate] for candidate in str_match_corpus]
        str_similarity_scores = torch.tensor(scores_in_order, dtype=torch.float32).to(device)

        # Min-max normalize both score vectors; fall back when the vector is
        # constant (range == 0) to avoid division by zero.
        embedding_score_range = embedding_similarity_scores.max() - embedding_similarity_scores.min()
        str_score_range = str_similarity_scores.max() - str_similarity_scores.min()
        if embedding_score_range > 0:
            embed_norm_scores = (embedding_similarity_scores - embedding_similarity_scores.min()) / embedding_score_range
        else:
            embed_norm_scores = embedding_similarity_scores
        if str_score_range > 0:
            str_norm_scores = (str_similarity_scores - str_similarity_scores.min()) / str_score_range
        else:
            str_norm_scores = str_similarity_scores / 100

        # Combine the scores with equal weights
        combined_scores = 0.5 * embed_norm_scores + 0.5 * str_norm_scores
        original_combined_scores = 0.5 * embedding_similarity_scores + 0.5 * str_similarity_scores / 100

        scores, indices = torch.topk(combined_scores, k=min(top_k, combined_scores.size(0)))
        original_scores, original_indices = torch.topk(original_combined_scores, k=min(top_k, original_combined_scores.size(0)))
        return scores, indices, original_scores, original_indices

    def query_case(self, task: TaskType, embed_index="", str_index="", case_type="", top_k=2) -> list:
        """Return the contents of the `top_k` most similar stored cases."""
        _, indices, _, _ = self.get_similarity_scores(task, embed_index, str_index, case_type, top_k)
        top_matches = [self.corpus[task][case_type][idx]["content"] for idx in indices]
        return top_matches

    def update_case(self, task: TaskType, embed_index="", str_index="", content="", case_type=""):
        """Append a case to the in-memory corpus and extend its embedding
        matrix (disk persistence is done separately via update_corpus)."""
        self.corpus[task][case_type].append({"index": {"embed_index": embed_index, "str_index": str_index}, "content": content})
        self.embedded_corpus[task][case_type] = torch.cat([self.embedded_corpus[task][case_type], self.embedder.encode([embed_index], convert_to_tensor=True).to(device)], dim=0)
        print(f"A {case_type} case updated for {task} task.")
91
+
92
class CaseRepositoryHandler:
    """Facade over CaseRepository: builds retrieval indices from a
    DataPoint, asks the LLM for case analyses/reflections, and inserts or
    queries cases."""

    def __init__(self, llm: BaseEngine):
        self.repository = CaseRepository()
        self.llm = llm

    def __get_good_case_analysis(self, instruction="", text="", result="", additional_info=""):
        """Ask the LLM why `result` is a good answer (up to 3 attempts).

        NOTE(review): the loop returns the first reply that is NOT a JSON
        dict (i.e. free-form analysis text) and gives up with None
        otherwise — confirm this inverted-looking check is intentional.
        """
        prompt = good_case_analysis_instruction.format(
            instruction=instruction, text=text, result=result, additional_info=additional_info
        )
        for _ in range(3):
            response = self.llm.get_chat_response(prompt)
            response = extract_json_dict(response)
            if not isinstance(response, dict):
                return response
        return None

    def __get_bad_case_reflection(self, instruction="", text="", original_answer="", correct_answer="", additional_info=""):
        """Ask the LLM to reflect on a wrong answer; same retry/return
        convention as __get_good_case_analysis."""
        prompt = bad_case_reflection_instruction.format(
            instruction=instruction, text=text, original_answer=original_answer, correct_answer=correct_answer, additional_info=additional_info
        )
        for _ in range(3):
            response = self.llm.get_chat_response(prompt)
            response = extract_json_dict(response)
            if not isinstance(response, dict):
                return response
        return None

    def __get_index(self, data: DataPoint, case_type: str):
        """Build the (embed_index, str_index) pair used to store or look up
        a case for `data`."""
        # embed_index: distilled text plus the first chunk
        embed_index = f"**Text**: {data.distilled_text}\n{data.chunk_text_list[0]}"

        # str_index: the task instruction (Base) or the constraint (others)
        if data.task == "Base":
            str_index = f"**Task**: {data.instruction}"
        else:
            str_index = f"{data.constraint}"

        if case_type == "bad":
            str_index += f"\n\n**Original Result**: {json.dumps(data.pred)}"

        return embed_index, str_index

    def query_good_case(self, data: DataPoint):
        """Return the most similar good cases for `data`."""
        embed_index, str_index = self.__get_index(data, "good")
        return self.repository.query_case(task=data.task, embed_index=embed_index, str_index=str_index, case_type="good")

    def query_bad_case(self, data: DataPoint):
        """Return the most similar bad cases for `data`."""
        embed_index, str_index = self.__get_index(data, "bad")
        return self.repository.query_case(task=data.task, embed_index=embed_index, str_index=str_index, case_type="bad")

    def update_good_case(self, data: DataPoint):
        """Insert a good case unless data.truth is missing or a near
        duplicate (raw similarity >= 0.9) is already stored."""
        if data.truth == "":
            print("No truth value provided.")
            return
        embed_index, str_index = self.__get_index(data, "good")
        _, _, original_scores, _ = self.repository.get_similarity_scores(data.task, embed_index, str_index, "good", 1)
        original_scores = original_scores.tolist()
        if original_scores[0] >= 0.9:
            print("The similar good case is already in the corpus. Similarity Score: ", original_scores[0])
            return
        # typo fixed: 'good_case_alaysis' -> 'good_case_analysis'
        good_case_analysis = self.__get_good_case_analysis(instruction=data.instruction, text=data.distilled_text, result=data.truth, additional_info=data.constraint)
        wrapped_good_case_analysis = f"**Analysis**: {good_case_analysis}"
        wrapped_instruction = f"**Task**: {data.instruction}"
        wrapped_text = f"**Text**: {data.distilled_text}\n{data.chunk_text_list[0]}"
        wrapped_answer = f"**Correct Answer**: {json.dumps(data.truth)}"
        if data.task == "Base":
            content = f"{wrapped_instruction}\n\n{wrapped_text}\n\n{wrapped_good_case_analysis}\n\n{wrapped_answer}"
        else:
            content = f"{wrapped_text}\n\n{data.constraint}\n\n{wrapped_good_case_analysis}\n\n{wrapped_answer}"
        self.repository.update_case(data.task, embed_index, str_index, content, "good")

    def update_bad_case(self, data: DataPoint):
        """Insert a bad case when the prediction differs from the truth and
        no near duplicate (raw similarity >= 0.9) is already stored."""
        if data.truth == "":
            print("No truth value provided.")
            return
        if normalize_obj(data.pred) == normalize_obj(data.truth):
            return
        embed_index, str_index = self.__get_index(data, "bad")
        _, _, original_scores, _ = self.repository.get_similarity_scores(data.task, embed_index, str_index, "bad", 1)
        original_scores = original_scores.tolist()
        if original_scores[0] >= 0.9:
            print("The similar bad case is already in the corpus. Similarity Score: ", original_scores[0])
            return
        bad_case_reflection = self.__get_bad_case_reflection(instruction=data.instruction, text=data.distilled_text, original_answer=data.pred, correct_answer=data.truth, additional_info=data.constraint)
        wrapped_bad_case_reflection = f"**Reflection**: {bad_case_reflection}"
        wrapper_original_answer = f"**Original Answer**: {json.dumps(data.pred)}"
        wrapper_correct_answer = f"**Correct Answer**: {json.dumps(data.truth)}"
        wrapped_instruction = f"**Task**: {data.instruction}"
        wrapped_text = f"**Text**: {data.distilled_text}\n{data.chunk_text_list[0]}"
        if data.task == "Base":
            content = f"{wrapped_instruction}\n\n{wrapped_text}\n\n{wrapper_original_answer}\n\n{wrapped_bad_case_reflection}\n\n{wrapper_correct_answer}"
        else:
            content = f"{wrapped_text}\n\n{data.constraint}\n\n{wrapper_original_answer}\n\n{wrapped_bad_case_reflection}\n\n{wrapper_correct_answer}"
        self.repository.update_case(data.task, embed_index, str_index, content, "bad")

    def update_case(self, data: DataPoint):
        """Record both the good and bad case for `data`, then persist."""
        self.update_good_case(data)
        self.update_bad_case(data)
        self.repository.update_corpus()
src/modules/knowledge_base/schema_repository.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Optional
2
+ from pydantic import BaseModel, Field
3
+ from langchain_core.output_parsers import JsonOutputParser
4
+
5
+ # ==================================================================== #
6
+ # NER TASK #
7
+ # ==================================================================== #
8
class Entity(BaseModel):
    # One named-entity mention: surface name plus its category.
    name : str = Field(description="The specific name of the entity. ")
    type : str = Field(description="The type or category that the entity belongs to.")
class EntityList(BaseModel):
    # Top-level output schema for the NER task.
    entity_list : List[Entity] = Field(description="Named entities appearing in the text.")
13
+
14
+ # ==================================================================== #
15
+ # RE TASK #
16
+ # ==================================================================== #
17
class Relation(BaseModel):
    # One (head, relation, tail) fact for the RE task.
    head : str = Field(description="The starting entity in the relationship.")
    tail : str = Field(description="The ending entity in the relationship.")
    relation : str = Field(description="The predicate that defines the relationship between the two entities.")

class RelationList(BaseModel):
    # Top-level output schema for the RE task.
    relation_list : List[Relation] = Field(description="The collection of relationships between various entities.")
24
+
25
+ # ==================================================================== #
26
+ # EE TASK #
27
+ # ==================================================================== #
28
class Event(BaseModel):
    # One extracted event for the EE task.
    # NOTE(review): a second, unrelated `Event` class is defined later in
    # this module (News section) and shadows this name at module level;
    # `EventList` below still refers to this class because the reference is
    # resolved here at definition time. Consider renaming one of them.
    event_type : str = Field(description="The type of the event.")
    event_trigger : str = Field(description="A specific word or phrase that indicates the occurrence of the event.")
    event_argument : dict = Field(description="The arguments or participants involved in the event.")

class EventList(BaseModel):
    # Top-level output schema for the EE task.
    event_list : List[Event] = Field(description="The events presented in the text.")
35
+
36
+ # ==================================================================== #
37
+ # Triple TASK #
38
+ # ==================================================================== #
39
class Triple(BaseModel):
    # One typed (head, relation, tail) triple for the Triple task.
    head: str = Field(description="The subject or head of the triple.")
    head_type: str = Field(description="The type of the subject entity.")
    relation: str = Field(description="The predicate or relation between the entities.")
    relation_type: str = Field(description="The type of the relation.")
    tail: str = Field(description="The object or tail of the triple.")
    tail_type: str = Field(description="The type of the object entity.")
class TripleList(BaseModel):
    # Top-level output schema for the Triple task.
    triple_list: List[Triple] = Field(description="The collection of triples and their types presented in the text.")
48
+
49
+ # ==================================================================== #
50
+ # TEXT DESCRIPTION #
51
+ # ==================================================================== #
52
class TextDescription(BaseModel):
    # Coarse characterization of an input text (used by the schema agent's
    # text-analysis step to distill the document before schema deduction).
    field: str = Field(description="The field of the given text, such as 'Science', 'Literature', 'Business', 'Medicine', 'Entertainment', etc.")
    genre: str = Field(description="The genre of the given text, such as 'Article', 'Novel', 'Dialog', 'Blog', 'Manual','Expository', 'News Report', 'Research Paper', etc.")
55
+
56
+ # ==================================================================== #
57
+ # USER DEFINED SCHEMA #
58
+ # ==================================================================== #
59
+
60
+ # --------------------------- Research Paper ----------------------- #
61
# --------------------------- Research Paper ----------------------- #
class MetaData(BaseModel):
    # Bibliographic metadata of a research article.
    title : str = Field(description="The title of the article")
    authors : List[str] = Field(description="The list of the article's authors")
    abstract: str = Field(description="The article's abstract")
    key_words: List[str] = Field(description="The key words associated with the article")

class Baseline(BaseModel):
    # One baseline method an article compares against.
    method_name : str = Field(description="The name of the baseline method")
    proposed_solution : str = Field(description="the proposed solution in details")
    performance_metrics : str = Field(description="The performance metrics of the method and comparative analysis")

class ExtractionTarget(BaseModel):
    # Example user-defined schema: the content to pull from a research paper.

    key_contributions: List[str] = Field(description="The key contributions of the article")
    limitation_of_sota : str=Field(description="the summary limitation of the existing work")
    proposed_solution : str = Field(description="the proposed solution in details")
    baselines : List[Baseline] = Field(description="The list of baseline methods and their details")
    performance_metrics : str = Field(description="The performance metrics of the method and comparative analysis")
    paper_limitations : str=Field(description="The limitations of the proposed solution of the paper")
80
+
81
+ # --------------------------- News ----------------------- #
82
# --------------------------- News ----------------------- #
class Person(BaseModel):
    # A person mentioned in a news event.
    name: str = Field(description="The name of the person")
    identity: Optional[str] = Field(description="The occupation, status or characteristics of the person.")
    role: Optional[str] = Field(description="The role or function the person plays in an event.")

class Event(BaseModel):
    # A news event.
    # NOTE(review): this redefines (shadows) the EE-task `Event` declared
    # earlier in this module — confirm the name collision is intentional.
    name: str = Field(description="Name of the event")
    time: Optional[str] = Field(description="Time when the event took place")
    people_involved: Optional[List[Person]] = Field(description="People involved in the event")
    cause: Optional[str] = Field(default=None, description="Reason for the event, if applicable")
    process: Optional[str] = Field(description="Details of the event process")
    result: Optional[str] = Field(default=None, description="Result or outcome of the event")

class NewsReport(BaseModel):
    # Example user-defined schema: the content to pull from a news report.
    title: str = Field(description="The title or headline of the news report")
    summary: str = Field(description="A brief summary of the news report")
    publication_date: Optional[str] = Field(description="The publication date of the report")
    keywords: Optional[List[str]] = Field(description="List of keywords or topics covered in the news report")
    events: List[Event] = Field(description="Events covered in the news report")
    quotes: Optional[dict] = Field(default=None, description="Quotes related to the news, with keys as the citation sources and values as the quoted content. ")
    viewpoints: Optional[List[str]] = Field(default=None, description="Different viewpoints regarding the news")
103
+
104
+ # --------- You can customize new extraction schemas below -------- #
105
class ChemicalSubstance(BaseModel):
    # Example customized schema: one chemical substance record.
    name: str = Field(description="Name of the chemical substance")
    formula: str = Field(description="Molecular formula")
    appearance: str = Field(description="Physical appearance")
    uses: List[str] = Field(description="Primary uses")
    hazards: str = Field(description="Hazard classification")

class ChemicalList(BaseModel):
    # Top-level container for extracted chemicals.
    chemicals: List[ChemicalSubstance] = Field(description="List of chemicals")
src/modules/reflection_agent.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from models import *
2
+ from utils import *
3
+ from .extraction_agent import ExtractionAgent
4
+ from .knowledge_base.case_repository import CaseRepositoryHandler
5
class ReflectionGenerator:
    """Thin LLM wrapper that produces a reflection on a (possibly wrong)
    extraction result, given the task context and known bad cases."""

    def __init__(self, llm: BaseEngine):
        self.llm = llm

    def get_reflection(self, instruction="", examples="", text="", schema="", result=""):
        """Build the reflection prompt, query the LLM, and parse the reply
        as a JSON dict."""
        serialized_result = json.dumps(result)
        wrapped_examples = bad_case_wrapper(examples)
        prompt = reflect_instruction.format(
            instruction=instruction,
            examples=wrapped_examples,
            text=text,
            schema=schema,
            result=serialized_result,
        )
        raw_reply = self.llm.get_chat_response(prompt)
        return extract_json_dict(raw_reply)
16
+
17
class ReflectionAgent:
    """Agent that re-runs extraction for self-consistency and reflects on
    chunks whose results disagree across runs."""

    def __init__(self, llm: BaseEngine, case_repo: CaseRepositoryHandler):
        self.llm = llm
        self.module = ReflectionGenerator(llm = llm)
        self.extractor = ExtractionAgent(llm = llm, case_repo = case_repo)
        self.case_repo = case_repo
        self.methods = ["reflect_with_case"]

    def __select_result(self, result_list):
        """Pick the 'richest' candidate: prefer dicts; break ties by the
        length of the JSON serialization."""
        dict_objects = [obj for obj in result_list if isinstance(obj, dict)]
        candidates = dict_objects if dict_objects else result_list
        return max(candidates, key=lambda o: len(json.dumps(o)))

    def __self_consistency_check(self, data: DataPoint):
        """Re-run the last extraction step twice at higher temperatures and
        keep, per chunk, any result that appears at least twice.

        Returns the indices of chunks with no majority result (the ones
        that need reflection). Previously this implicitly returned None
        when the last trajectory entry was not an extractor method, which
        crashed the caller's iteration — it now returns [] in that case.
        """
        extract_func_name = list(data.result_trajectory.keys())[-1]
        if hasattr(self.extractor, extract_func_name):
            result_trails = [data.result_list]
            extract_func = getattr(self.extractor, extract_func_name)
            temperatures = [0.5, 1]
            for temperature in temperatures:
                self.module.llm.set_hyperparameter(temperature=temperature)
                data = extract_func(data)
                result_trails.append(data.result_list)
            # restore default hyperparameters
            self.module.llm.set_hyperparameter()
            consistent_result = []
            reflect_index = []
            for index, elements in enumerate(zip(*result_trails)):
                normalized_elements = [normalize_obj(e) for e in elements]
                element_counts = Counter(normalized_elements)
                selected_element = next((elements[i] for i, element in enumerate(normalized_elements)
                                        if element_counts[element] >= 2), None)
                if selected_element is None:
                    # no two runs agreed on this chunk — mark it for reflection
                    selected_element = self.__select_result(elements)
                    reflect_index.append(index)
                consistent_result.append(selected_element)
            data.set_result_list(consistent_result)
            return reflect_index
        return []

    def reflect_with_case(self, data: DataPoint):
        """Reflect on the chunks whose extraction results were inconsistent
        across self-consistency runs, using retrieved bad cases as examples."""
        if not data.result_list:
            return data
        reflect_index = self.__self_consistency_check(data)
        reflected_result_list = data.result_list
        for idx in reflect_index:
            chunk_text = data.chunk_text_list[idx]
            result = data.result_list[idx]
            examples = json.dumps(self.case_repo.query_bad_case(data))
            reflected_res = self.module.get_reflection(instruction=data.instruction, examples=examples, text=chunk_text, schema=data.output_schema, result=result)
            reflected_result_list[idx] = reflected_res
        data.set_result_list(reflected_result_list)
        function_name = current_function_name()
        data.update_trajectory(function_name, data.result_list)
        return data
src/modules/schema_agent.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from models import *
2
+ from utils import *
3
+ from .knowledge_base import schema_repository
4
+ from langchain_core.output_parsers import JsonOutputParser
5
+
6
class SchemaAnalyzer:
    """Serializes pydantic schemas into prompt text and deduces new schemas
    (as code or JSON) from sample text with help from the LLM."""

    def __init__(self, llm: BaseEngine):
        self.llm = llm

    def serialize_schema(self, schema) -> str:
        """Turn a pydantic schema class into a textual description for the
        prompt. Plain containers/strings are returned unchanged."""
        if isinstance(schema, (str, list, dict, set, tuple)):
            return schema
        try:
            parser = JsonOutputParser(pydantic_object = schema)
            schema_description = parser.get_format_instructions()
            # Join the fenced code blocks instead of interpolating the raw
            # list returned by re.findall (the previous code embedded the
            # Python list repr — brackets, quotes and escapes — into the
            # prompt text).
            schema_content = "\n".join(re.findall(r'```(.*?)```', schema_description, re.DOTALL))
            explanation = "For example, for the schema {\"properties\": {\"foo\": {\"title\": \"Foo\", \"description\": \"a list of strings\", \"type\": \"array\", \"items\": {\"type\": \"string\"}}}}, the object {\"foo\": [\"bar\", \"baz\"]} is a well-formatted instance."
            schema = f"{schema_content}\n\n{explanation}"
        except Exception:  # bare `except:` narrowed
            return schema
        return schema

    def redefine_text(self, text_analysis):
        """Turn a {'field': ..., 'genre': ...} analysis dict into a one-line
        prompt; anything else is returned unchanged."""
        try:
            field = text_analysis['field']
            genre = text_analysis['genre']
        except Exception:  # not a dict with the expected keys
            return text_analysis
        prompt = f"This text is from the field of {field} and represents the genre of {genre}."
        return prompt

    def get_text_analysis(self, text: str):
        """Classify the text's field/genre via the LLM and render the result
        as a prompt sentence."""
        output_schema = self.serialize_schema(schema_repository.TextDescription)
        prompt = text_analysis_instruction.format(examples="", text=text, schema=output_schema)
        response = self.llm.get_chat_response(prompt)
        response = extract_json_dict(response)
        response = self.redefine_text(response)
        return response

    def get_deduced_schema_json(self, instruction: str, text: str, distilled_text: str):
        """Fallback: have the LLM deduce the schema directly as JSON."""
        prompt = deduced_schema_json_instruction.format(examples=example_wrapper(json_schema_examples), instruction=instruction, distilled_text=distilled_text, text=text)
        response = self.llm.get_chat_response(prompt)
        response = extract_json_dict(response)
        code = response
        print(f"Deduced Schema in Json: \n{response}\n\n")
        return code, response

    def get_deduced_schema_code(self, instruction: str, text: str, distilled_text: str):
        """Have the LLM deduce the schema as pydantic code; exec it and
        serialize the resulting ExtractionTarget class. Falls back to the
        JSON variant when no usable code block is produced.

        NOTE(review): exec() of LLM output is inherently unsafe; acceptable
        only because the code comes from the configured model, not from
        end-user input.
        """
        prompt = deduced_schema_code_instruction.format(examples=example_wrapper(code_schema_examples), instruction=instruction, distilled_text=distilled_text, text=text)
        response = self.llm.get_chat_response(prompt)
        code_blocks = re.findall(r'```[^\n]*\n(.*?)\n```', response, re.DOTALL)
        if code_blocks:
            try:
                code_block = code_blocks[-1]
                namespace = {}
                exec(code_block, namespace)
                schema = namespace.get('ExtractionTarget')
                if schema is not None:
                    index = code_block.find("class")
                    code = code_block[index:]
                    print(f"Deduced Schema in Code: \n{code}\n\n")
                    schema = self.serialize_schema(schema)
                    return code, schema
            except Exception as e:
                print(e)
                return self.get_deduced_schema_json(instruction, text, distilled_text)
        return self.get_deduced_schema_json(instruction, text, distilled_text)
+
69
class SchemaAgent:
    # Agent that attaches an output schema to the DataPoint before
    # extraction: the config default, one retrieved by name from the schema
    # repository, or one deduced by the LLM from the input text.
    def __init__(self, llm: BaseEngine):
        self.llm = llm
        self.module = SchemaAnalyzer(llm = llm)
        self.schema_repo = schema_repository
        self.methods = ["get_default_schema", "get_retrieved_schema", "get_deduced_schema"]

    def __preprocess_text(self, data: DataPoint):
        # Chunk the input (file or raw string). For the fixed IE tasks, also
        # set print_schema — a human-readable schema string used only for
        # display, mirroring the classes in schema_repository.
        if data.use_file:
            data.chunk_text_list = chunk_file(data.file_path)
        else:
            data.chunk_text_list = chunk_str(data.text)
        if data.task == "NER":
            data.print_schema = """
class Entity(BaseModel):
    name : str = Field(description="The specific name of the entity. ")
    type : str = Field(description="The type or category that the entity belongs to.")
class EntityList(BaseModel):
    entity_list : List[Entity] = Field(description="Named entities appearing in the text.")
"""
        elif data.task == "RE":
            data.print_schema = """
class Relation(BaseModel):
    head : str = Field(description="The starting entity in the relationship.")
    tail : str = Field(description="The ending entity in the relationship.")
    relation : str = Field(description="The predicate that defines the relationship between the two entities.")

class RelationList(BaseModel):
    relation_list : List[Relation] = Field(description="The collection of relationships between various entities.")
"""
        elif data.task == "EE":
            data.print_schema = """
class Event(BaseModel):
    event_type : str = Field(description="The type of the event.")
    event_trigger : str = Field(description="A specific word or phrase that indicates the occurrence of the event.")
    event_argument : dict = Field(description="The arguments or participants involved in the event.")

class EventList(BaseModel):
    event_list : List[Event] = Field(description="The events presented in the text.")
"""
        elif data.task == "Triple":
            data.print_schema = """
class Triple(BaseModel):
    head: str = Field(description="The subject or head of the triple.")
    head_type: str = Field(description="The type of the subject entity.")
    relation: str = Field(description="The predicate or relation between the entities.")
    relation_type: str = Field(description="The type of the relation.")
    tail: str = Field(description="The object or tail of the triple.")
    tail_type: str = Field(description="The type of the object entity.")
class TripleList(BaseModel):
    triple_list: List[Triple] = Field(description="The collection of triples and their types presented in the text.")
"""
        return data

    def get_default_schema(self, data: DataPoint):
        # Attach the fall-back schema from the global config.
        data = self.__preprocess_text(data)
        default_schema = config['agent']['default_schema']
        data.set_schema(default_schema)
        function_name = current_function_name()
        data.update_trajectory(function_name, default_schema)
        return data

    def get_retrieved_schema(self, data: DataPoint):
        # Look up a named pydantic schema in the repository by
        # data.output_schema; fall back to the default schema if unknown.
        self.__preprocess_text(data)
        schema_name = data.output_schema
        schema_class = getattr(self.schema_repo, schema_name, None)
        if schema_class is not None:
            schema = self.module.serialize_schema(schema_class)
            default_schema = config['agent']['default_schema']
            data.set_schema(f"{default_schema}\n{schema}")
            function_name = current_function_name()
            data.update_trajectory(function_name, schema)
        else:
            return self.get_default_schema(data)
        return data

    def get_deduced_schema(self, data: DataPoint):
        # Let the LLM deduce a schema from the first chunk of text.
        self.__preprocess_text(data)
        target_text = data.chunk_text_list[0]
        analysed_text = self.module.get_text_analysis(target_text)
        if len(data.chunk_text_list) > 1:
            # NOTE(review): this overwrites the text analysis computed above
            # with the raw first chunk — confirm the analysis was not meant
            # to be kept for multi-chunk inputs.
            prefix = "Below is a portion of the text to be extracted. "
            analysed_text = f"{prefix}\n{target_text}"
        # redefine_text returns non-dict input unchanged, so for the
        # single-chunk path this is effectively a no-op second call.
        distilled_text = self.module.redefine_text(analysed_text)
        code, deduced_schema = self.module.get_deduced_schema_code(data.instruction, target_text, distilled_text)
        data.print_schema = code
        data.set_distilled_text(distilled_text)
        default_schema = config['agent']['default_schema']
        data.set_schema(f"{default_schema}\n{deduced_schema}")
        function_name = current_function_name()
        data.update_trajectory(function_name, deduced_schema)
        return data
+ return data
src/pipeline.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Literal
2
+ from models import *
3
+ from utils import *
4
+ from modules import *
5
+ from construct import *
6
+
7
+
8
class Pipeline:
    # Orchestrates the three agents (schema -> extraction -> reflection),
    # the final answer summarization, optional knowledge-graph construction,
    # and case-repository updates.
    def __init__(self, llm: BaseEngine):
        self.llm = llm
        self.case_repo = CaseRepositoryHandler(llm = llm)
        self.schema_agent = SchemaAgent(llm = llm)
        self.extraction_agent = ExtractionAgent(llm = llm, case_repo = self.case_repo)
        self.reflection_agent = ReflectionAgent(llm = llm, case_repo = self.case_repo)

    def __check_consistancy(self, llm, task, mode, update_case):
        # The fine-tuned OneKE model only supports quick NER/RE/EE
        # extraction; coerce mode/update_case accordingly, or raise for the
        # unsupported Base/Triple tasks.
        if llm.name == "OneKE":
            if task == "Base" or task == "Triple":
                raise ValueError("The finetuned OneKE only supports quick extraction mode for NER, RE and EE Task.")
            else:
                mode = "quick"
                update_case = False
                print("The fine-tuned OneKE defaults to quick extraction mode without case update.")
                return mode, update_case
        return mode, update_case

    def __init_method(self, data: DataPoint, process_method2):
        # Fill in missing agent methods, then order them
        # schema -> extraction -> reflection.
        # NOTE(review): this mutates the dict passed in — callers pass a
        # .copy() of config modes, but a user-supplied dict is mutated in
        # place; confirm that is acceptable.
        default_order = ["schema_agent", "extraction_agent", "reflection_agent"]
        if "schema_agent" not in process_method2:
            process_method2["schema_agent"] = "get_default_schema"
        if data.task != "Base":
            process_method2["schema_agent"] = "get_retrieved_schema"
        if "extraction_agent" not in process_method2:
            process_method2["extraction_agent"] = "extract_information_direct"
        sorted_process_method = {key: process_method2[key] for key in default_order if key in process_method2}
        return sorted_process_method

    def __init_data(self, data: DataPoint):
        # Attach the default instruction and output schema name for the
        # fixed IE tasks; "Base" keeps the user-provided values.
        if data.task == "NER":
            data.instruction = config['agent']['default_ner']
            data.output_schema = "EntityList"
        elif data.task == "RE":
            data.instruction = config['agent']['default_re']
            data.output_schema = "RelationList"
        elif data.task == "EE":
            data.instruction = config['agent']['default_ee']
            data.output_schema = "EventList"
        elif data.task == "Triple":
            data.instruction = config['agent']['default_triple']
            data.output_schema = "TripleList"
        return data

    # main entry
    # Returns (result, trajectory, frontend_schema, frontend_res).
    # NOTE(review): three_agents/construct use mutable {} defaults — shared
    # across calls; confirm no code path mutates them.
    def get_extract_result(self,
                        task: TaskType,
                        three_agents = {},
                        construct = {},
                        instruction: str = "",
                        text: str = "",
                        output_schema: str = "",
                        constraint: str = "",
                        use_file: bool = False,
                        file_path: str = "",
                        truth: str = "",
                        mode: str = "quick",
                        update_case: bool = False,
                        show_trajectory: bool = False,
                        isgui: bool = False,
                        iskg: bool = False,
                        ):
        # for key, value in locals().items():
        #     print(f"{key}: {value}")

        # Check Consistancy
        mode, update_case = self.__check_consistancy(self.llm, task, mode, update_case)

        # Load Data
        data = DataPoint(task=task, instruction=instruction, text=text, output_schema=output_schema, constraint=constraint, use_file=use_file, file_path=file_path, truth=truth)
        data = self.__init_data(data)
        # Resolve the processing plan: a named mode from config, or a
        # user-supplied {agent: method} dict.
        if mode in config['agent']['mode'].keys():
            process_method = config['agent']['mode'][mode].copy()
        else:
            process_method = mode

        if isgui and mode == "customized":
            process_method = three_agents
            print("Customized 3-Agents: ", three_agents)

        sorted_process_method = self.__init_method(data, process_method)
        print("Process Method: ", sorted_process_method)

        print_schema = False  # whether the schema has been echoed already
        frontend_schema = ""  # schema string returned to the GUI
        frontend_res = ""  # prediction returned to the GUI

        # Information Extract: run each configured agent method in order.
        for agent_name, method_name in sorted_process_method.items():
            agent = getattr(self, agent_name, None)
            if not agent:
                raise AttributeError(f"{agent_name} does not exist.")
            method = getattr(agent, method_name, None)
            if not method:
                raise AttributeError(f"Method '{method_name}' not found in {agent_name}.")
            data = method(data)
            if not print_schema and data.print_schema:
                print("Schema: \n", data.print_schema)
                frontend_schema = data.print_schema
                print_schema = True
        data = self.extraction_agent.summarize_answer(data)

        # show result
        if show_trajectory:
            print("Extraction Trajectory: \n", json.dumps(data.get_result_trajectory(), indent=2))
        extraction_result = json.dumps(data.pred, indent=2)
        print("Extraction Result: \n", extraction_result)

        # construct KG: push the extraction result into the configured graph
        # database as Cypher statements.
        if iskg:
            myurl = construct['url']
            myusername = construct['username']
            mypassword = construct['password']
            print(f"Construct KG in your {construct['database']} now...")
            cypher_statements = generate_cypher_statements(extraction_result)
            execute_cypher_statements(uri=myurl, user=myusername, password=mypassword, cypher_statements=cypher_statements)

        frontend_res = data.pred

        # Case Update: interactively collect a ground truth if none was
        # given, then record the good/bad case.
        if update_case:
            if (data.truth == ""):
                truth = input("Please enter the correct answer you prefer, or just press Enter to accept the current answer: ")
                if truth.strip() == "":
                    data.truth = data.pred
                else:
                    data.truth = extract_json_dict(truth)
            self.case_repo.update_case(data)

        # return result
        result = data.pred
        trajectory = data.get_result_trajectory()

        return result, trajectory, frontend_schema, frontend_res
src/run.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ import yaml
4
+ from pipeline import Pipeline
5
+ from typing import Literal
6
+ import models
7
+ from models import *
8
+ from utils import *
9
+ from modules import *
10
+
11
def main():
    """Command-line entry point: load a YAML config file and run the
    extraction pipeline, optionally constructing a knowledge graph.

    Reads the config path from --config, builds the model backend (vLLM
    server or an API/open-source engine class from `models`), then calls
    Pipeline.get_extract_result with the 'extraction' section. If the config
    contains a 'construct' section, the extracted result is additionally
    written into the configured graph database (iskg=True).
    """
    # Create command-line argument parser
    parser = argparse.ArgumentParser(description='Run the extraction framework.')
    parser.add_argument('--config', type=str, required=True,
                        help='Path to the YAML configuration file.')

    # Parse command-line arguments
    args = parser.parse_args()

    # Load configuration; load_extraction_config returns {} when the file is missing.
    config = load_extraction_config(args.config)
    if not config:
        print(f"Error: failed to load configuration from '{args.config}'.")
        return

    # Model config
    model_config = config['model']
    if model_config.get('vllm_serve'):
        model = LocalServer(model_config['model_name_or_path'])
    else:
        clazz = getattr(models, model_config['category'], None)
        if clazz is None:
            print(f"Error: The model category '{model_config['category']}' is not supported.")
            return
        if model_config['api_key'] == "":
            model = clazz(model_config['model_name_or_path'])
        else:
            model = clazz(model_config['model_name_or_path'], model_config['api_key'], model_config['base_url'])
    pipeline = Pipeline(model)

    # Extraction config, shared by both the KG and the plain-extraction path.
    extraction_config = config['extraction']
    extract_kwargs = dict(
        task=extraction_config['task'],
        instruction=extraction_config['instruction'],
        text=extraction_config['text'],
        output_schema=extraction_config['output_schema'],
        constraint=extraction_config['constraint'],
        use_file=extraction_config['use_file'],
        file_path=extraction_config['file_path'],
        truth=extraction_config['truth'],
        mode=extraction_config['mode'],
        update_case=extraction_config['update_case'],
        show_trajectory=extraction_config['show_trajectory'],
    )

    if 'construct' in config:
        # When 'construct' is provided, 'iskg' is True so the knowledge graph is built.
        result, trajectory, _, _ = pipeline.get_extract_result(
            construct=config['construct'], iskg=True, **extract_kwargs)
    else:
        # 'construct' is optional: without it, run plain extraction only.
        result, trajectory, _, _ = pipeline.get_extract_result(**extract_kwargs)
    return

if __name__ == "__main__":
    main()
src/utils/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from .process import *
2
+ from .data_def import DataPoint, TaskType
src/utils/data_def.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Literal
2
+ from models import *
3
+ from .process import *
4
# predefined processing logic for routine extraction tasks
# "Base" is free-form instruction-driven extraction; NER/RE/EE are the
# schema-guided named-entity / relation / event extraction tasks.
TaskType = Literal["NER", "RE", "EE", "Base"]
6
+
7
class DataPoint:
    """Container for one extraction request plus its intermediate state and results.

    A DataPoint is threaded through the agent pipeline: agents read the task
    fields, write temporary artifacts (schema, distilled text, chunks) and
    accumulate results/trajectory as they process it.
    """
    def __init__(self,
                 task: TaskType = "Base",
                 instruction: str = "",
                 text: str = "",
                 output_schema: str = "",
                 constraint: str = "",
                 use_file: bool = False,
                 file_path: str = "",
                 truth: str = ""):
        """
        Initialize a DataPoint instance.

        Args:
            task: one of "NER", "RE", "EE", "Base".
            instruction: free-form extraction instruction (Base task).
            text: raw input text; ignored when use_file is True.
            output_schema: user-provided output schema, if any.
            constraint: task-specific type constraint (e.g. entity types).
            use_file: when True, read input from file_path instead of text.
            file_path: path of the input document.
            truth: ground-truth answer as a JSON string; parsed on assignment.
        """
        # task information
        self.task = task
        self.instruction = instruction
        self.text = text
        self.output_schema = output_schema
        self.constraint = constraint
        self.use_file = use_file
        self.file_path = file_path
        # normalized to a dict via extract_json_dict (string passthrough on parse failure)
        self.truth = extract_json_dict(truth)
        # temp storage
        self.print_schema = ""        # schema text shown to the user/front-end
        self.distilled_text = ""      # condensed version of the input text
        self.chunk_text_list = []     # input split into token-limited chunks
        # result feedback
        self.result_list = []         # per-chunk extraction results
        self.result_trajectory = {}   # {agent function name: its output}
        self.pred = ""                # final merged prediction

    def set_constraint(self, constraint):
        # Overwrite the task constraint (used when an agent refines it).
        self.constraint = constraint

    def set_schema(self, output_schema):
        # Overwrite the output schema produced by the schema agent.
        self.output_schema = output_schema

    def set_pred(self, pred):
        # Record the final prediction.
        self.pred = pred

    def set_result_list(self, result_list):
        # Record the per-chunk results.
        self.result_list = result_list

    def set_distilled_text(self, distilled_text):
        # Record the distilled input text.
        self.distilled_text = distilled_text

    def update_trajectory(self, function, result):
        # First writer wins: keep the earliest result recorded per function.
        if function not in self.result_trajectory:
            self.result_trajectory.update({function: result})

    def get_result_trajectory(self):
        """Return a summary dict of the request and the full agent trajectory."""
        return {"instruction": self.instruction, "text": self.text, "constraint": self.constraint, "trajectory": self.result_trajectory, "pred": self.pred}
src/utils/process.py ADDED
@@ -0,0 +1,277 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data Processing Functions.
3
+ Supports:
4
+ - Segmentation of long text
5
+ - Segmentation of file content
6
+ """
7
+ from langchain_community.document_loaders import TextLoader, PyPDFLoader, Docx2txtLoader, BSHTMLLoader, JSONLoader
8
+ from nltk.tokenize import sent_tokenize
9
+ from collections import Counter
10
+ import re
11
+ import json
12
+ import yaml
13
+ import os
14
+ import yaml
15
+ import os
16
+ import inspect
17
+ import ast
18
# Load the package-level agent settings (src/config.yaml) once at import time;
# e.g. chunk_str reads config['agent']['chunk_token_limit'] from it.
with open(os.path.join(os.path.dirname(__file__), "..", "config.yaml")) as file:
    config = yaml.safe_load(file)
20
+
21
# Load configuration
def load_extraction_config(yaml_path):
    """Read a YAML config file and normalize it into a plain dict.

    The returned dict always has 'model' and 'extraction' sections (missing
    keys filled with defaults), and a 'construct' section only when the YAML
    file defines one. Returns {} when the file does not exist.

    Args:
        yaml_path: path to the YAML configuration file.

    Returns:
        dict with keys 'model', 'extraction' and optionally 'construct'.
    """
    if not os.path.exists(yaml_path):
        print(f"Error: The config file '{yaml_path}' does not exist.")
        return {}

    with open(yaml_path, 'r') as file:
        # safe_load returns None for an empty file; normalize to {}.
        yaml_config = yaml.safe_load(file) or {}

    model_config = yaml_config.get('model', {})
    extraction_config = yaml_config.get('extraction', {})

    # Build the normalized result once (the original duplicated this dict).
    result = {
        "model": {
            "model_name_or_path": model_config.get('model_name_or_path', ""),
            "category": model_config.get('category', ""),
            "api_key": model_config.get('api_key', ""),
            "base_url": model_config.get('base_url', ""),
            "vllm_serve": model_config.get('vllm_serve', False),
        },
        "extraction": {
            "task": extraction_config.get('task', ""),
            "instruction": extraction_config.get('instruction', ""),
            "text": extraction_config.get('text', ""),
            "output_schema": extraction_config.get('output_schema', ""),
            "constraint": extraction_config.get('constraint', ""),
            "truth": extraction_config.get('truth', ""),
            "use_file": extraction_config.get('use_file', False),
            "file_path": extraction_config.get('file_path', ""),
            "mode": extraction_config.get('mode', "quick"),
            "update_case": extraction_config.get('update_case', False),
            "show_trajectory": extraction_config.get('show_trajectory', False),
        },
    }

    # Construct config (optional: for constructing your knowledge graph)
    if 'construct' in yaml_config:
        construct_config = yaml_config.get('construct', {})
        result["construct"] = {
            "database": construct_config.get('database', ""),
            "url": construct_config.get('url', ""),
            "username": construct_config.get('username', ""),
            "password": construct_config.get('password', ""),
        }

    return result
115
+
116
# Split the string text into chunks
def chunk_str(text):
    """Split `text` into chunks of whole sentences whose whitespace-token
    count stays within config['agent']['chunk_token_limit']."""
    limit = config['agent']['chunk_token_limit']
    chunks = []
    buffer = []
    used = 0

    for sentence in sent_tokenize(text):
        size = len(sentence.split())
        if used + size <= limit:
            # Sentence still fits into the current chunk.
            buffer.append(sentence)
            used += size
        else:
            # Flush the current chunk (if any) and start a new one.
            if buffer:
                chunks.append(' '.join(buffer))
            buffer = [sentence]
            used = size

    if buffer:
        chunks.append(' '.join(buffer))
    return chunks
136
+
137
# Load and split the content of a file
def chunk_file(file_path):
    """Load a document (.pdf/.txt/.docx/.html/.json), concatenate its page
    contents, and return the text split into token-limited chunks."""
    loader_by_suffix = {
        ".pdf": PyPDFLoader,
        ".txt": TextLoader,
        ".docx": Docx2txtLoader,
        ".html": BSHTMLLoader,
        # NOTE(review): langchain's JSONLoader normally also needs a jq_schema
        # argument — confirm this call works for the intended JSON inputs.
        ".json": JSONLoader,
    }

    for suffix, loader_cls in loader_by_suffix.items():
        if file_path.endswith(suffix):
            loader = loader_cls(file_path)
            break
    else:
        raise ValueError("Unsupported file format")  # Inform that the format is unsupported

    # Concatenate all page contents, then chunk by sentence/token budget.
    docs = ''.join(page.page_content for page in loader.load_and_split())
    return chunk_str(docs)
161
+
162
def process_single_quotes(text):
    """Replace single quotes that are not embedded inside a word (i.e. not
    apostrophes like in "it's") with double quotes."""
    return re.sub(r"(?<!\w)'|'(?!\w)", '"', text)
165
+
166
def remove_empty_values(data):
    """Recursively drop dict entries and list items whose value is None, "",
    [] or {} (0 and False are kept); other values pass through unchanged."""
    def _is_empty(value):
        return value is None or value == [] or value == "" or value == {}

    if isinstance(data, dict):
        # Emptiness is checked on the raw value before recursing, matching
        # the list branch below.
        return {key: remove_empty_values(val)
                for key, val in data.items()
                if not _is_empty(val)}
    if isinstance(data, list):
        return [remove_empty_values(item)
                for item in data
                if not _is_empty(item)]
    return data
183
+
184
def extract_json_dict(text):
    """Extract and parse the last JSON object embedded in `text`.

    Returns the input unchanged when it is already a dict or contains no
    braces; returns the raw matched string when JSON parsing fails; returns
    "No valid information found." when the parsed object collapses to None
    after empty-value removal.
    """
    if isinstance(text, dict):
        return text

    # Match balanced-brace objects (up to three nesting levels).
    brace_pattern = r'\{(?:[^{}]|(?:\{(?:[^{}]|(?:\{[^{}]*\})*)*\})*)*\}'
    candidates = re.findall(brace_pattern, text)
    if not candidates:
        return text

    # Take the last candidate and normalize stray single quotes first.
    json_string = process_single_quotes(candidates[-1])
    try:
        parsed = remove_empty_values(json.loads(json_string))
        if parsed is None:
            return "No valid information found."
        return parsed
    except json.JSONDecodeError:
        return json_string
202
+
203
def good_case_wrapper(example: str):
    """Wrap retrieved good-case examples in a prompt preamble; empty or None
    input yields the empty string."""
    if example is None or example == "":
        return ""
    return f"\nHere are some examples:\n{example}\n(END OF EXAMPLES)\nRefer to the reasoning steps and analysis in the examples to help complete the extraction task below.\n\n"
208
+
209
def bad_case_wrapper(example: str):
    """Wrap retrieved bad-case examples in a reflection prompt preamble;
    empty or None input yields the empty string."""
    if example is None or example == "":
        return ""
    return f"\nHere are some examples of bad cases:\n{example}\n(END OF EXAMPLES)\nRefer to the reflection rules and reflection steps in the examples to help optimize the original result below.\n\n"
214
+
215
def example_wrapper(example: str):
    """Wrap plain examples in a minimal prompt preamble; empty or None input
    yields the empty string."""
    if example is None or example == "":
        return ""
    return f"\nHere are some examples:\n{example}\n(END OF EXAMPLES)\n\n"
220
+
221
def remove_redundant_space(s):
    """Collapse whitespace runs to single spaces, then delete spaces around
    common punctuation (, : ( ) . _ ; ' -)."""
    collapsed = ' '.join(s.split())
    return re.sub(r"\s*(,|:|\(|\)|\.|_|;|'|-)\s*", r'\1', collapsed)
225
+
226
def format_string(s):
    """Normalize a string for set-based comparison: tighten spacing, lowercase,
    strip braces, collapse repeated punctuation and unify the apostrophe."""
    s = remove_redundant_space(s)
    s = s.lower().replace('{', '').replace('}', '')
    # Collapse runs of repeated punctuation to a single character.
    s = re.sub(',+', ',', s)
    s = re.sub(r'\.+', '.', s)
    s = re.sub(';+', ';', s)
    return s.replace('’', "'")
235
+
236
def calculate_metrics(y_truth: set, y_pred: set):
    """Compute (precision, recall, f1) between a ground-truth set and a
    prediction set; each metric is 0 when its denominator is 0."""
    tp = len(y_truth & y_pred)   # predicted and correct
    fn = len(y_truth - y_pred)   # missed
    fp = len(y_pred - y_truth)   # spurious
    precision = tp / (tp + fp) if tp + fp > 0 else 0
    recall = tp / (tp + fn) if tp + fn > 0 else 0
    denom = precision + recall
    f1_score = 2 * (precision * recall) / denom if denom > 0 else 0
    return precision, recall, f1_score
244
+
245
def current_function_name():
    """Return the name of the calling function, or None if the caller cannot
    be determined (a message is printed in that case)."""
    try:
        frames = inspect.stack()
        # frames[0] is this function; frames[1] is its caller.
        if len(frames) > 1:
            return frames[1].function
        print("No caller function found")
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
258
+
259
def normalize_obj(value):
    """Recursively convert a value into a hashable, order-insensitive form:
    dicts become frozensets of (key, normalized value) pairs, sequences become
    multiset tuples via Counter, strings are normalized with format_string."""
    if isinstance(value, dict):
        return frozenset((key, normalize_obj(val)) for key, val in value.items())
    if isinstance(value, (list, set, tuple)):
        # Counter makes the result independent of element order but sensitive
        # to element multiplicity.
        return tuple(Counter(map(normalize_obj, value)).items())
    if isinstance(value, str):
        return format_string(value)
    return value
267
+
268
def dict_list_to_set(data_list):
    """Convert a list of dicts into a set of tuples of normalized values.

    On any failure (e.g. a non-dict element) the error is reported and
    whatever was converted so far is returned.
    """
    result_set = set()
    try:
        for entry in data_list:
            result_set.add(tuple(format_string(value) for value in entry.values()))
        return result_set
    except Exception:
        print(f"Failed to convert dictionary list to set: {data_list}")
        return result_set
src/webui.py ADDED
@@ -0,0 +1,401 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ For HuggingFace Space.
3
+ """
4
+
5
+ import gradio as gr
6
+ import json
7
+ import random
8
+ import re
9
+
10
+ from models import *
11
+ from pipeline import Pipeline
12
+
13
+
14
# Built-in demo inputs cycled through by the "Quick Start" button; each dict
# mirrors the UI fields (task, mode, input source, instruction/constraint,
# case-update flag and optional ground truth).
examples = [
    {
        "task": "NER",
        "mode": "quick",
        "use_file": False,
        "text": "Finally, every other year , ELRA organizes a major conference LREC , the International Language Resources and Evaluation Conference .",
        "instruction": "",
        "constraint": """["algorithm", "conference", "else", "product", "task", "field", "metrics", "organization", "researcher", "program language", "country", "location", "person", "university"]""",
        "file_path": None,
        "update_case": False,
        "truth": "",
    },
    {
        "task": "Base",
        "mode": "quick",
        "use_file": True,
        "file_path": "data/input_files/Tulsi_Gabbard_News.html",
        "instruction": "Extract key information from the given text.",
        "constraint": "",
        "text": "",
        "update_case": False,
        "truth": "",
    },
    {
        # RE example also demonstrates case updating with a provided truth.
        "task": "RE",
        "mode": "quick",
        "use_file": False,
        "text": "The aid group Doctors Without Borders said that since Saturday , more than 275 wounded people had been admitted and treated at Donka Hospital in the capital of Guinea , Conakry .",
        "instruction": "",
        "constraint": """["nationality", "country capital", "place of death", "children", "location contains", "place of birth", "place lived", "administrative division of country", "country of administrative divisions", "company", "neighborhood of", "company founders"]""",
        "file_path": None,
        "update_case": True,
        "truth": """{"relation_list": [{"head": "Guinea", "tail": "Conakry", "relation": "country capital"}]}""",
    },
    {
        # EE constraint maps each event type to its argument roles.
        "task": "EE",
        "mode": "standard",
        "use_file": False,
        "text": "The file suggested to the user contains no software related to video streaming and simply carries the malicious payload that later compromises victim \u2019s account and sends out the deceptive messages to all victim \u2019s contacts .",
        "instruction": "",
        "constraint": """{"phishing": ["damage amount", "attack pattern", "tool", "victim", "place", "attacker", "purpose", "trusted entity", "time"], "data breach": ["damage amount", "attack pattern", "number of data", "number of victim", "tool", "compromised data", "victim", "place", "attacker", "purpose", "time"], "ransom": ["damage amount", "attack pattern", "payment method", "tool", "victim", "place", "attacker", "price", "time"], "discover vulnerability": ["vulnerable system", "vulnerability", "vulnerable system owner", "vulnerable system version", "supported platform", "common vulnerabilities and exposures", "capabilities", "time", "discoverer"], "patch vulnerability": ["vulnerable system", "vulnerability", "issues addressed", "vulnerable system version", "releaser", "supported platform", "common vulnerabilities and exposures", "patch number", "time", "patch"]}""",
        "file_path": None,
        "update_case": False,
        "truth": "",
    },
    {
        # Triple constraint: [entity-type list, relation-type list].
        "task": "Triple",
        "mode": "quick",
        "use_file": True,
        "file_path": "data/input_files/Artificial_Intelligence_Wikipedia.txt",
        "instruction": "",
        "constraint": """[["Person", "Place", "Event", "property"], ["Interpersonal", "Located", "Ownership", "Action"]]""",
        "text": "",
        "update_case": False,
        "truth": "",
    },
    {
        "task": "Base",
        "mode": "quick",
        "use_file": True,
        "file_path": "data/input_files/Harry_Potter_Chapter1.pdf",
        "instruction": "Extract main characters and the background setting from this chapter.",
        "constraint": "",
        "text": "",
        "update_case": False,
        "truth": "",
    },
]
# Index of the next example shown; start_with_example() advances and wraps it.
example_start_index = 0
83
+
84
+
85
def create_interface():
    """Build and return the OneKE Gradio demo UI.

    Layout: header HTML, a quick-start example button, model/API settings,
    task & mode selectors (with optional per-agent choices in "customized"
    mode), a text-or-file input area, case-update fields, and two output
    panes (generated schema and final JSON answer). All event callbacks are
    defined inline and wired at the bottom.
    """
    with gr.Blocks(title="OneKE Demo", theme=gr.themes.Glass(text_size="lg")) as demo:
        # Page header: logo, title and external links.
        gr.HTML("""
        <div style="text-align:center;">
            <p align="center">
                <a>
                    <img src="https://raw.githubusercontent.com/zjunlp/OneKE/refs/heads/main/figs/logo.png" width="240"/>
                </a>
            </p>
            <h1>OneKE: A Dockerized Schema-Guided LLM Agent-based Knowledge Extraction System</h1>
            <p>
            🌐[<a href="https://oneke.openkg.cn/" target="_blank">Home</a>]
            πŸ“Ή[<a href="http://oneke.openkg.cn/demo.mp4" target="_blank">Video</a>]
            πŸ“[<a href="https://arxiv.org/abs/2412.20005v2" target="_blank">Paper</a>]
            πŸ’»[<a href="https://github.com/zjunlp/OneKE" target="_blank">Code</a>]
            </p>
        </div>
        """)

        # NOTE: original label ended in mojibake ("... Example ���"); fixed here.
        example_button_gr = gr.Button("🎲 Quick Start with an Example 🎲")

        with gr.Row():
            with gr.Column():
                model_gr = gr.Dropdown(
                    label="πŸͺ„ Select your Model",
                    choices=["deepseek-chat", "deepseek-reasoner",
                             "gpt-3.5-turbo", "gpt-4o-mini", "gpt-4o",
                             ],
                    value="deepseek-chat",
                )
                api_key_gr = gr.Textbox(
                    label="πŸ”‘ Enter your API-Key",
                    placeholder="Please enter your API-Key from ChatGPT or DeepSeek.",
                    type="password",
                )
                base_url_gr = gr.Textbox(
                    label="πŸ”— Enter your Base-URL",
                    placeholder="Please leave this field empty if using the default Base-URL.",
                )
            with gr.Column():
                task_gr = gr.Dropdown(
                    label="🎯 Select your Task",
                    choices=["Base", "NER", "RE", "EE", "Triple"],
                    value="Base",
                )
                mode_gr = gr.Dropdown(
                    label="🧭 Select your Mode",
                    choices=["quick", "standard", "customized"],
                    value="quick",
                )
                # Per-agent dropdowns are only shown in "customized" mode.
                schema_agent_gr = gr.Dropdown(choices=["Not Required", "get_default_schema", "get_deduced_schema"], value="Not Required", label="πŸ€– Select your Schema-Agent", visible=False)
                extraction_Agent_gr = gr.Dropdown(choices=["Not Required", "extract_information_direct", "extract_information_with_case"], value="Not Required", label="πŸ€– Select your Extraction-Agent", visible=False)
                reflection_agent_gr = gr.Dropdown(choices=["Not Required", "reflect_with_case"], value="Not Required", label="πŸ€– Select your Reflection-Agent", visible=False)

        # Input area: file upload and raw text are mutually exclusive.
        use_file_gr = gr.Checkbox(label="πŸ“‚ Use File", value=True)
        file_path_gr = gr.File(label="πŸ“– Upload a File", visible=True)
        text_gr = gr.Textbox(label="πŸ“– Text", lines=5, placeholder="Please enter the text to be processed.", visible=False)
        instruction_gr = gr.Textbox(label="πŸ•ΉοΈ Instruction", lines=3, placeholder="Please enter any type of information you want to extract here, for example: Help me extract all the place names.", visible=True)
        constraint_gr = gr.Textbox(label="πŸ•ΉοΈ Constraint", lines=3, placeholder="Please specify the types of entities, relations, events, or other relevant attributes in list format as per the task requirements.", visible=False)

        update_case_gr = gr.Checkbox(label="πŸ’° Update Case", value=False)
        # update_schema_gr = gr.Checkbox(label="πŸ“Ÿ Update Schema", value=False)
        truth_gr = gr.Textbox(label="πŸͺ™ Truth", lines=2, placeholder="""Please enter the truth you want LLM know, for example: {"relation_list": [{"head": "Guinea", "tail": "Conakry", "relation": "country capital"}]}""", visible=False)
        # selfschema_gr = gr.Textbox(label="πŸ“Ÿ Schema", lines=5, placeholder="Enter your New Schema", visible=False, interactive=True)

        def get_model_category(model_name_or_path):
            # Map a model name to its engine class; unknown names fall back to BaseEngine.
            if model_name_or_path in ["gpt-3.5-turbo", "gpt-4o-mini", "gpt-4o", "o3-mini"]:
                return ChatGPT
            elif model_name_or_path in ["deepseek-chat", "deepseek-reasoner"]:
                return DeepSeek
            elif re.search(r'(?i)llama', model_name_or_path):
                return LLaMA
            elif re.search(r'(?i)qwen', model_name_or_path):
                return Qwen
            elif re.search(r'(?i)minicpm', model_name_or_path):
                return MiniCPM
            elif re.search(r'(?i)chatglm', model_name_or_path):
                return ChatGLM
            else:
                return BaseEngine

        def customized_mode(mode):
            # Show the three agent dropdowns only for "customized" mode.
            if mode == "customized":
                return gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)
            else:
                return gr.update(visible=False, value="Not Required"), gr.update(visible=False, value="Not Required"), gr.update(visible=False, value="Not Required")

        def update_fields(task):
            # Toggle between the free-form instruction box (Base) and the
            # task-specific constraint box (NER/RE/EE/Triple).
            if task == "Base" or task == "":
                return gr.update(visible=True, label="πŸ•ΉοΈ Instruction", lines=3,
                                 placeholder="Please enter any type of information you want to extract here, for example: Help me extract all the place names."), gr.update(visible=False)
            elif task == "NER":
                return gr.update(visible=False), gr.update(visible=True, label="πŸ•ΉοΈ Constraint", lines=3,
                                 placeholder="Please specify the entity types to extract in list format, and all types will be extracted by default if not specified.")
            elif task == "RE":
                return gr.update(visible=False), gr.update(visible=True, label="πŸ•ΉοΈ Constraint", lines=3,
                                 placeholder="Please specify the relation types to extract in list format, and all types will be extracted by default if not specified.")
            elif task == "EE":
                return gr.update(visible=False), gr.update(visible=True, label="πŸ•ΉοΈ Constraint", lines=3,
                                 placeholder="Please specify the event types and their corresponding extraction attributes in dictionary format, and all types and attributes will be extracted by default if not specified.")
            elif task == "Triple":
                return gr.update(visible=False), gr.update(visible=True, label="πŸ•ΉοΈ Constraint", lines=3,
                                 placeholder="Please read the documentation and specify the types of triples in list format.")

        def update_input_fields(use_file):
            # Swap visibility between the text box and the file uploader.
            if use_file:
                return gr.update(visible=False), gr.update(visible=True)
            else:
                return gr.update(visible=True), gr.update(visible=False)

        def update_case(update_case):
            # Show the truth box only when case updating is enabled.
            if update_case:
                return gr.update(visible=True)
            else:
                return gr.update(visible=False)

        # def update_schema(update_schema):
        #     if update_schema:
        #         return gr.update(visible=True)
        #     else:
        #         return gr.update(visible=False)

        def start_with_example():
            # Cycle through the module-level `examples` list, wrapping around.
            global example_start_index
            example = examples[example_start_index]
            example_start_index += 1
            if example_start_index >= len(examples):
                example_start_index = 0

            return (
                gr.update(value=example["task"]),
                gr.update(value=example["mode"]),
                gr.update(value=example["use_file"]),
                gr.update(value=example["file_path"], visible=example["use_file"]),
                gr.update(value=example["text"], visible=not example["use_file"]),
                gr.update(value=example["instruction"], visible=example["task"] == "Base"),
                gr.update(value=example["constraint"], visible=example["task"] in ["NER", "RE", "EE", "Triple"]),
                gr.update(value=example["update_case"]),
                gr.update(value=example["truth"]),  # gr.update(value=example["update_schema"]), gr.update(value=example["selfschema"]),
                gr.update(value="Not Required", visible=False),
                gr.update(value="Not Required", visible=False),
                gr.update(value="Not Required", visible=False),
            )

        def submit(model, api_key, base_url, task, mode, instruction, constraint, text, use_file, file_path, update_case, truth, schema_agent, extraction_Agent, reflection_agent):
            """Run the pipeline with the current UI state.

            Returns (schema_text, result_text, error_box_update); on failure
            the error box is shown and the result panes are cleared.
            """
            try:
                ModelClass = get_model_category(model)
                # Only forward api_key/base_url when the user actually set them.
                if base_url == "Default" or base_url == "":
                    if api_key == "":
                        pipeline = Pipeline(ModelClass(model_name_or_path=model))
                    else:
                        pipeline = Pipeline(ModelClass(model_name_or_path=model, api_key=api_key))
                else:
                    if api_key == "":
                        pipeline = Pipeline(ModelClass(model_name_or_path=model, base_url=base_url))
                    else:
                        pipeline = Pipeline(ModelClass(model_name_or_path=model, api_key=api_key, base_url=base_url))

                # Base tasks use the free-form instruction; other tasks use the constraint.
                if task == "Base":
                    constraint = ""
                else:
                    instruction = ""
                # File input and raw-text input are mutually exclusive.
                if use_file:
                    text = ""
                else:
                    file_path = None
                if not update_case:
                    truth = ""

                # Collect explicitly chosen agents for "customized" mode.
                agent3 = {}
                if mode == "customized":
                    if schema_agent not in ["", "Not Required"]:
                        agent3["schema_agent"] = schema_agent
                    if extraction_Agent not in ["", "Not Required"]:
                        agent3["extraction_agent"] = extraction_Agent
                    if reflection_agent not in ["", "Not Required"]:
                        agent3["reflection_agent"] = reflection_agent

                # use 'Pipeline'
                _, _, ger_frontend_schema, ger_frontend_res = pipeline.get_extract_result(
                    task=task,
                    text=text,
                    use_file=use_file,
                    file_path=file_path,
                    instruction=instruction,
                    constraint=constraint,
                    mode=mode,
                    three_agents=agent3,
                    isgui=True,
                    update_case=update_case,
                    truth=truth,
                    output_schema="",
                    show_trajectory=False,
                )

                ger_frontend_schema = str(ger_frontend_schema)
                ger_frontend_res = json.dumps(ger_frontend_res, ensure_ascii=False, indent=4) if isinstance(ger_frontend_res, dict) else str(ger_frontend_res)
                return ger_frontend_schema, ger_frontend_res, gr.update(value="", visible=False)

            except Exception as e:
                error_message = f"⚠️ Error:\n {str(e)}"
                return "", "", gr.update(value=error_message, visible=True)

        def clear_all():
            # Reset every widget to its initial state.
            return (
                gr.update(value="Not Required", visible=False),  # sechema_agent
                gr.update(value="Not Required", visible=False),  # extraction_Agent
                gr.update(value="Not Required", visible=False),  # reflection_agent
                gr.update(value="Base"),  # task
                gr.update(value="quick"),  # mode
                gr.update(value="", visible=False),  # instruction
                gr.update(value="", visible=False),  # constraint
                gr.update(value=True),  # use_file
                gr.update(value="", visible=False),  # text
                gr.update(value=None, visible=True),  # file_path
                gr.update(value=False),  # update_case
                gr.update(value="", visible=False),  # truth # gr.update(value=False), # update_schema gr.update(value="", visible=False), # selfschema
                gr.update(value=""),  # py_output_gr
                gr.update(value=""),  # json_output_gr
                gr.update(value="", visible=False),  # error_output
            )

        with gr.Row():
            submit_button_gr = gr.Button("Submit", variant="primary", scale=8)
            clear_button = gr.Button("Clear", scale=5)
        gr.HTML("""
        <div style="width: 100%; text-align: center; font-size: 16px; font-weight: bold; position: relative; margin: 20px 0;">
            <span style="position: absolute; left: 0; top: 50%; transform: translateY(-50%); width: 45%; border-top: 1px solid #ccc;"></span>
            <span style="position: relative; z-index: 1; background-color: white; padding: 0 10px;">Output:</span>
            <span style="position: absolute; right: 0; top: 50%; transform: translateY(-50%); width: 45%; border-top: 1px solid #ccc;"></span>
        </div>
        """)
        error_output_gr = gr.Textbox(label="πŸ˜΅β€πŸ’« Ops, an Error Occurred", visible=False, interactive=False)
        with gr.Row():
            with gr.Column(scale=1):
                py_output_gr = gr.Code(label="πŸ€” Generated Schema", language="python", lines=10, interactive=False)
            with gr.Column(scale=1):
                json_output_gr = gr.Code(label="πŸ˜‰ Final Answer", language="json", lines=10, interactive=False)

        # Reactive visibility wiring.
        task_gr.change(fn=update_fields, inputs=task_gr, outputs=[instruction_gr, constraint_gr])
        mode_gr.change(fn=customized_mode, inputs=mode_gr, outputs=[schema_agent_gr, extraction_Agent_gr, reflection_agent_gr])
        use_file_gr.change(fn=update_input_fields, inputs=use_file_gr, outputs=[text_gr, file_path_gr])
        update_case_gr.change(fn=update_case, inputs=update_case_gr, outputs=[truth_gr])
        # update_schema_gr.change(fn=update_schema, inputs=update_schema_gr, outputs=[selfschema_gr])

        example_button_gr.click(
            fn=start_with_example,
            inputs=[],
            outputs=[
                task_gr,
                mode_gr,
                use_file_gr,
                file_path_gr,
                text_gr,
                instruction_gr,
                constraint_gr,
                update_case_gr,
                truth_gr,  # update_schema_gr, selfschema_gr,
                schema_agent_gr,
                extraction_Agent_gr,
                reflection_agent_gr,
            ],
        )
        submit_button_gr.click(
            fn=submit,
            inputs=[
                model_gr,
                api_key_gr,
                base_url_gr,
                task_gr,
                mode_gr,
                instruction_gr,
                constraint_gr,
                text_gr,
                use_file_gr,
                file_path_gr,
                update_case_gr,
                truth_gr,  # update_schema_gr, selfschema_gr,
                schema_agent_gr,
                extraction_Agent_gr,
                reflection_agent_gr,
            ],
            outputs=[py_output_gr, json_output_gr, error_output_gr],
            show_progress=True,
        )
        clear_button.click(
            fn=clear_all,
            outputs=[
                schema_agent_gr,
                extraction_Agent_gr,
                reflection_agent_gr,
                task_gr,
                mode_gr,
                instruction_gr,
                constraint_gr,
                use_file_gr,
                text_gr,
                file_path_gr,
                update_case_gr,
                truth_gr,  # update_schema_gr, selfschema_gr,
                py_output_gr,
                json_output_gr,
                error_output_gr,
            ],
        )

        return demo
396
+
397
+
398
+ # Launch the front-end interface
399
+ if __name__ == "__main__":
400
+ interface = create_interface()
401
+ interface.launch()