github-actions[bot] commited on
Commit
56943c6
·
1 Parent(s): fb9c306

Auto-sync from demo at Thu Aug 28 09:33:37 UTC 2025

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. README_HF.md +43 -0
  2. hf-repo/graphgen/configs/README.md +1 -0
  3. hf-repo/graphgen/configs/aggregated_config.yaml +21 -0
  4. hf-repo/graphgen/configs/atomic_config.yaml +21 -0
  5. hf-repo/graphgen/configs/cot_config.yaml +13 -0
  6. hf-repo/graphgen/configs/multi_hop_config.yaml +21 -0
  7. hf-repo/graphgen/models/community/__init__.py +0 -0
  8. hf-repo/graphgen/models/community/community_detector.py +95 -0
  9. hf-repo/graphgen/models/search/db/__init__.py +0 -0
  10. hf-repo/graphgen/models/search/db/uniprot_search.py +64 -0
  11. hf-repo/graphgen/models/search/kg/__init__.py +0 -0
  12. hf-repo/graphgen/models/search/kg/wiki_search.py +37 -0
  13. hf-repo/graphgen/models/search/web/__init__.py +0 -0
  14. hf-repo/graphgen/models/search/web/bing_search.py +43 -0
  15. hf-repo/graphgen/models/search/web/google_search.py +45 -0
  16. hf-repo/graphgen/models/vis/__init__.py +0 -0
  17. hf-repo/graphgen/models/vis/community_visualizer.py +48 -0
  18. hf-repo/graphgen/operators/generate/__init__.py +0 -0
  19. hf-repo/graphgen/operators/generate/generate_cot.py +117 -0
  20. hf-repo/graphgen/operators/kg/__init__.py +0 -0
  21. hf-repo/graphgen/operators/kg/extract_kg.py +151 -0
  22. hf-repo/graphgen/operators/kg/merge_kg.py +212 -0
  23. hf-repo/graphgen/operators/kg/split_kg.py +381 -0
  24. hf-repo/graphgen/operators/preprocess/__init__.py +0 -0
  25. hf-repo/graphgen/operators/preprocess/resolute_coreference.py +33 -0
  26. hf-repo/graphgen/operators/search/__init__.py +0 -0
  27. hf-repo/graphgen/operators/search/db/__init__.py +0 -0
  28. hf-repo/graphgen/operators/search/db/search_uniprot.py +0 -0
  29. hf-repo/graphgen/operators/search/kg/__init__.py +0 -0
  30. hf-repo/graphgen/operators/search/kg/search_wikipedia.py +58 -0
  31. hf-repo/graphgen/operators/search/search_all.py +82 -0
  32. hf-repo/graphgen/operators/search/web/__init__.py +0 -0
  33. hf-repo/graphgen/operators/search/web/search_bing.py +53 -0
  34. hf-repo/graphgen/operators/search/web/search_google.py +49 -0
  35. hf-repo/graphgen/templates/community/__init__.py +2 -0
  36. hf-repo/graphgen/templates/community/cot_generation.py +87 -0
  37. hf-repo/graphgen/templates/community/cot_template_design.py +107 -0
  38. hf-repo/graphgen/utils/file.py +24 -0
  39. hf-repo/hf-repo/LICENSE +201 -0
  40. hf-repo/hf-repo/app.py +586 -0
  41. hf-repo/hf-repo/graphgen/__init__.py +0 -0
  42. hf-repo/hf-repo/graphgen/evaluate.py +142 -0
  43. hf-repo/hf-repo/graphgen/generate.py +103 -0
  44. hf-repo/hf-repo/graphgen/graphgen.py +395 -0
  45. hf-repo/hf-repo/graphgen/judge.py +60 -0
  46. hf-repo/hf-repo/graphgen/models/__init__.py +45 -0
  47. hf-repo/hf-repo/graphgen/models/embed/__init__.py +0 -0
  48. hf-repo/hf-repo/graphgen/models/embed/embedding.py +29 -0
  49. hf-repo/hf-repo/graphgen/models/evaluate/__init__.py +0 -0
  50. hf-repo/hf-repo/graphgen/models/evaluate/base_evaluator.py +51 -0
README_HF.md ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: GraphGen Demo
3
+ emoji: 📊
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: "4.44.0"
8
+ python_version: "3.10"
9
+ app_file: webui/app.py
10
+ suggested_hardware: cpu-basic
11
+ pinned: false
12
+ short_description: "Interactive knowledge-driven synthetic data generation demo powered by GraphGen & Gradio"
13
+ tags:
14
+ - synthetic-data
15
+ - knowledge-graph
16
+ - gradio-demo
17
+ ---
18
+
19
+ # GraphGen Space 🤖📊
20
+
21
+ This is the **official Hugging Face Space** for [GraphGen](https://github.com/open-sciencelab/GraphGen) – a framework that leverages knowledge graphs to generate high-quality synthetic question–answer pairs for supervised fine-tuning of LLMs.
22
+
23
+ 🔗 Paper: [arXiv 2505.20416](https://arxiv.org/abs/2505.20416)
24
+ 🐙 GitHub: [open-sciencelab/GraphGen](https://github.com/open-sciencelab/GraphGen)
25
+
26
+ ---
27
+
28
+ ## How to use (🖱️ 3 clicks)
29
+
30
+ 1. Open the **Gradio app** above.
31
+ 2. Upload or paste your source text → click **Generate KG**.
32
+ 3. Download the generated QA pairs directly.
33
+
34
+ ---
35
+
36
+ ## Local quick start (optional)
37
+
38
+ ```bash
39
+ git clone https://github.com/open-sciencelab/GraphGen
40
+ cd GraphGen
41
+ uv venv --python 3.10 && uv pip install -r requirements.txt
42
+ uv run webui/app.py # http://localhost:7860
43
+ ```
hf-repo/graphgen/configs/README.md ADDED
@@ -0,0 +1 @@
 
 
1
+ # Configs for GraphGen
hf-repo/graphgen/configs/aggregated_config.yaml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ input_data_type: raw # raw, chunked
2
+ input_file: resources/input_examples/raw_demo.jsonl # input file path, support json, jsonl, txt. See resources/input_examples for examples
3
+ output_data_type: aggregated # atomic, aggregated, multi_hop, cot
4
+ output_data_format: ChatML # Alpaca, Sharegpt, ChatML
5
+ tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
6
+ search: # web search configuration
7
+ enabled: false # whether to enable web search
8
+ search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
9
+ quiz_and_judge_strategy: # quiz and test whether the LLM masters the knowledge points
10
+ enabled: true
11
+ quiz_samples: 2 # number of quiz samples to generate
12
+ re_judge: false # whether to re-judge the existing quiz samples
13
+ traverse_strategy: # strategy for clustering sub-graphs using comprehension loss
14
+ bidirectional: true # whether to traverse the graph in both directions
15
+ edge_sampling: max_loss # edge sampling strategy, support: random, max_loss, min_loss
16
+ expand_method: max_width # expand method, support: max_width, max_depth
17
+ isolated_node_strategy: ignore # strategy for isolated nodes, support: ignore, add
18
+ max_depth: 5 # maximum depth for graph traversal
19
+ max_extra_edges: 20 # max edges per direction (if expand_method="max_width")
20
+ max_tokens: 256 # restricts input length (if expand_method="max_tokens")
21
+ loss_strategy: only_edge # defines loss computation focus, support: only_edge, both
hf-repo/graphgen/configs/atomic_config.yaml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ input_data_type: raw # raw, chunked
2
+ input_file: resources/input_examples/raw_demo.jsonl # input file path, support json, jsonl, txt. See resources/input_examples for examples
3
+ output_data_type: atomic # atomic, aggregated, multi_hop, cot
4
+ output_data_format: Alpaca # Alpaca, Sharegpt, ChatML
5
+ tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
6
+ search: # web search configuration
7
+ enabled: false # whether to enable web search
8
+ search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
9
+ quiz_and_judge_strategy: # quiz and test whether the LLM masters the knowledge points
10
+ enabled: true
11
+ quiz_samples: 2 # number of quiz samples to generate
12
+ re_judge: false # whether to re-judge the existing quiz samples
13
+ traverse_strategy: # strategy for clustering sub-graphs using comprehension loss
14
+ bidirectional: true # whether to traverse the graph in both directions
15
+ edge_sampling: max_loss # edge sampling strategy, support: random, max_loss, min_loss
16
+ expand_method: max_width # expand method, support: max_width, max_depth
17
+ isolated_node_strategy: ignore # strategy for isolated nodes, support: ignore, add
18
+ max_depth: 3 # maximum depth for graph traversal
19
+ max_extra_edges: 5 # max edges per direction (if expand_method="max_width")
20
+ max_tokens: 256 # restricts input length (if expand_method="max_tokens")
21
+ loss_strategy: only_edge # defines loss computation focus, support: only_edge, both
hf-repo/graphgen/configs/cot_config.yaml ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ input_data_type: raw # raw, chunked
2
+ input_file: resources/input_examples/raw_demo.jsonl # input file path, support json, jsonl, txt. See resources/input_examples for examples
3
+ output_data_type: cot # atomic, aggregated, multi_hop, cot
4
+ output_data_format: Sharegpt # Alpaca, Sharegpt, ChatML
5
+ tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
6
+ search: # web search configuration
7
+ enabled: false # whether to enable web search
8
+ search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
9
+ method_params:
10
+ method: leiden
11
+ max_size: 20 # Maximum size of communities
12
+ use_lcc: false
13
+ random_seed: 42
hf-repo/graphgen/configs/multi_hop_config.yaml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ input_data_type: raw # raw, chunked
2
+ input_file: resources/input_examples/raw_demo.jsonl # input file path, support json, jsonl, txt. See resources/input_examples for examples
3
+ output_data_type: multi_hop # atomic, aggregated, multi_hop, cot
4
+ output_data_format: ChatML # Alpaca, Sharegpt, ChatML
5
+ tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
6
+ search: # web search configuration
7
+ enabled: false # whether to enable web search
8
+ search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
9
+ quiz_and_judge_strategy: # quiz and test whether the LLM masters the knowledge points
10
+ enabled: true
11
+ quiz_samples: 2 # number of quiz samples to generate
12
+ re_judge: false # whether to re-judge the existing quiz samples
13
+ traverse_strategy: # strategy for clustering sub-graphs using comprehension loss
14
+ bidirectional: true # whether to traverse the graph in both directions
15
+ edge_sampling: max_loss # edge sampling strategy, support: random, max_loss, min_loss
16
+ expand_method: max_width # expand method, support: max_width, max_depth
17
+ isolated_node_strategy: ignore # strategy for isolated nodes, support: ignore, add
18
+ max_depth: 1 # maximum depth for graph traversal
19
+ max_extra_edges: 2 # max edges per direction (if expand_method="max_width")
20
+ max_tokens: 256 # restricts input length (if expand_method="max_tokens")
21
+ loss_strategy: only_edge # defines loss computation focus, support: only_edge, both
hf-repo/graphgen/models/community/__init__.py ADDED
File without changes
hf-repo/graphgen/models/community/community_detector.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import defaultdict
2
+ from dataclasses import dataclass
3
+ from typing import Any, Dict, List
4
+
5
+ from graphgen.models.storage.networkx_storage import NetworkXStorage
6
+
7
+
8
+ @dataclass
9
+ class CommunityDetector:
10
+ """Class for community detection algorithms."""
11
+
12
+ graph_storage: NetworkXStorage = None
13
+ method: str = "leiden"
14
+ method_params: Dict[str, Any] = None
15
+
16
+ async def detect_communities(self) -> Dict[str, int]:
17
+ if self.method == "leiden":
18
+ return await self._leiden_communities(**self.method_params or {})
19
+ raise ValueError(f"Unknown community detection method: {self.method}")
20
+
21
+ async def get_graph(self):
22
+ return await self.graph_storage.get_graph()
23
+
24
+ async def _leiden_communities(
25
+ self, max_size: int = None, **kwargs
26
+ ) -> Dict[str, int]:
27
+ """
28
+ Detect communities using the Leiden algorithm.
29
+ If max_size is given, any community larger than max_size will be split
30
+ into smaller sub-communities each having at most max_size nodes.
31
+ """
32
+ import igraph as ig
33
+ import networkx as nx
34
+ from leidenalg import ModularityVertexPartition, find_partition
35
+
36
+ graph = await self.get_graph()
37
+ graph.remove_nodes_from(list(nx.isolates(graph)))
38
+
39
+ ig_graph = ig.Graph.TupleList(graph.edges(), directed=False)
40
+
41
+ random_seed = kwargs.get("random_seed", 42)
42
+ use_lcc = kwargs.get("use_lcc", False)
43
+
44
+ communities: Dict[str, int] = {}
45
+ if use_lcc:
46
+ lcc = ig_graph.components().giant()
47
+ partition = find_partition(lcc, ModularityVertexPartition, seed=random_seed)
48
+ for part, cluster in enumerate(partition):
49
+ for v in cluster:
50
+ communities[lcc.vs[v]["name"]] = part
51
+ else:
52
+ offset = 0
53
+ for component in ig_graph.components():
54
+ subgraph = ig_graph.induced_subgraph(component)
55
+ partition = find_partition(
56
+ subgraph, ModularityVertexPartition, seed=random_seed
57
+ )
58
+ for part, cluster in enumerate(partition):
59
+ for v in cluster:
60
+ original_node = subgraph.vs[v]["name"]
61
+ communities[original_node] = part + offset
62
+ offset += len(partition)
63
+
64
+ # split large communities if max_size is specified
65
+ if max_size is None or max_size <= 0:
66
+ return communities
67
+
68
+ return await self._split_communities(communities, max_size)
69
+
70
+ @staticmethod
71
+ async def _split_communities(
72
+ communities: Dict[str, int], max_size: int
73
+ ) -> Dict[str, int]:
74
+ """
75
+ Split communities larger than max_size into smaller sub-communities.
76
+ """
77
+ cid2nodes: Dict[int, List[str]] = defaultdict(list)
78
+ for node, cid in communities.items():
79
+ cid2nodes[cid].append(node)
80
+
81
+ new_communities: Dict[str, int] = {}
82
+ new_cid = 0
83
+ for cid, nodes in cid2nodes.items():
84
+ if len(nodes) <= max_size:
85
+ for n in nodes:
86
+ new_communities[n] = new_cid
87
+ new_cid += 1
88
+ else:
89
+ for start in range(0, len(nodes), max_size):
90
+ sub = nodes[start : start + max_size]
91
+ for n in sub:
92
+ new_communities[n] = new_cid
93
+ new_cid += 1
94
+
95
+ return new_communities
hf-repo/graphgen/models/search/db/__init__.py ADDED
File without changes
hf-repo/graphgen/models/search/db/uniprot_search.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+
3
+ import requests
4
+ from fastapi import HTTPException
5
+
6
+ from graphgen.utils import logger
7
+
8
+ UNIPROT_BASE = "https://rest.uniprot.org/uniprotkb/search"
9
+
10
+
11
+ @dataclass
12
+ class UniProtSearch:
13
+ """
14
+ UniProt Search client to search with UniProt.
15
+ 1) Get the protein by accession number.
16
+ 2) Search with keywords or protein names.
17
+ """
18
+
19
+ def get_entry(self, accession: str) -> dict:
20
+ """
21
+ Get the UniProt entry by accession number(e.g., P04637).
22
+ """
23
+ url = f"{UNIPROT_BASE}/{accession}.json"
24
+ return self._safe_get(url).json()
25
+
26
+ def search(
27
+ self,
28
+ query: str,
29
+ *,
30
+ size: int = 10,
31
+ cursor: str = None,
32
+ fields: list[str] = None,
33
+ ) -> dict:
34
+ """
35
+ Search UniProt with a query string.
36
+ :param query: The search query.
37
+ :param size: The number of results to return.
38
+ :param cursor: The cursor for pagination.
39
+ :param fields: The fields to return in the response.
40
+ :return: A dictionary containing the search results.
41
+ """
42
+ params = {
43
+ "query": query,
44
+ "size": size,
45
+ }
46
+ if cursor:
47
+ params["cursor"] = cursor
48
+ if fields:
49
+ params["fields"] = ",".join(fields)
50
+ url = UNIPROT_BASE
51
+ return self._safe_get(url, params=params).json()
52
+
53
+ @staticmethod
54
+ def _safe_get(url: str, params: dict = None) -> requests.Response:
55
+ r = requests.get(
56
+ url,
57
+ params=params,
58
+ headers={"Accept": "application/json"},
59
+ timeout=10,
60
+ )
61
+ if not r.ok:
62
+ logger.error("Search engine error: %s", r.text)
63
+ raise HTTPException(r.status_code, "Search engine error.")
64
+ return r
hf-repo/graphgen/models/search/kg/__init__.py ADDED
File without changes
hf-repo/graphgen/models/search/kg/wiki_search.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from typing import List, Union
3
+
4
+ import wikipedia
5
+ from wikipedia import set_lang
6
+
7
+ from graphgen.utils import detect_main_language, logger
8
+
9
+
10
+ @dataclass
11
+ class WikiSearch:
12
+ @staticmethod
13
+ def set_language(language: str):
14
+ assert language in ["en", "zh"], "Only support English and Chinese"
15
+ set_lang(language)
16
+
17
+ async def search(self, query: str, num_results: int = 1) -> Union[List[str], None]:
18
+ self.set_language(detect_main_language(query))
19
+ return wikipedia.search(query, results=num_results, suggestion=False)
20
+
21
+ async def summary(self, query: str) -> Union[str, None]:
22
+ self.set_language(detect_main_language(query))
23
+ try:
24
+ result = wikipedia.summary(query, auto_suggest=False, redirect=False)
25
+ except wikipedia.exceptions.DisambiguationError as e:
26
+ logger.error("DisambiguationError: %s", e)
27
+ result = None
28
+ return result
29
+
30
+ async def page(self, query: str) -> Union[str, None]:
31
+ self.set_language(detect_main_language(query))
32
+ try:
33
+ result = wikipedia.page(query, auto_suggest=False, redirect=False).content
34
+ except wikipedia.exceptions.DisambiguationError as e:
35
+ logger.error("DisambiguationError: %s", e)
36
+ result = None
37
+ return result
hf-repo/graphgen/models/search/web/__init__.py ADDED
File without changes
hf-repo/graphgen/models/search/web/bing_search.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+
3
+ import requests
4
+ from fastapi import HTTPException
5
+
6
+ from graphgen.utils import logger
7
+
8
+ BING_SEARCH_V7_ENDPOINT = "https://api.bing.microsoft.com/v7.0/search"
9
+ BING_MKT = "en-US"
10
+
11
+
12
+ @dataclass
13
+ class BingSearch:
14
+ """
15
+ Bing Search client to search with Bing.
16
+ """
17
+
18
+ subscription_key: str
19
+
20
+ def search(self, query: str, num_results: int = 1):
21
+ """
22
+ Search with Bing and return the contexts.
23
+ :param query: The search query.
24
+ :param num_results: The number of results to return.
25
+ :return: A list of search results.
26
+ """
27
+ params = {"q": query, "mkt": BING_MKT, "count": num_results}
28
+ response = requests.get(
29
+ BING_SEARCH_V7_ENDPOINT,
30
+ headers={"Ocp-Apim-Subscription-Key": self.subscription_key},
31
+ params=params,
32
+ timeout=10,
33
+ )
34
+ if not response.ok:
35
+ logger.error("Search engine error: %s", response.text)
36
+ raise HTTPException(response.status_code, "Search engine error.")
37
+ json_content = response.json()
38
+ try:
39
+ contexts = json_content["webPages"]["value"][:num_results]
40
+ except KeyError:
41
+ logger.error("Error encountered: %s", json_content)
42
+ return []
43
+ return contexts
hf-repo/graphgen/models/search/web/google_search.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+
3
+ import requests
4
+ from fastapi import HTTPException
5
+
6
+ from graphgen.utils import logger
7
+
8
+ GOOGLE_SEARCH_ENDPOINT = "https://customsearch.googleapis.com/customsearch/v1"
9
+
10
+
11
+ @dataclass
12
+ class GoogleSearch:
13
+ def __init__(self, subscription_key: str, cx: str):
14
+ """
15
+ Initialize the Google Search client with the subscription key and custom search engine ID.
16
+ :param subscription_key: Your Google API subscription key.
17
+ :param cx: Your custom search engine ID.
18
+ """
19
+ self.subscription_key = subscription_key
20
+ self.cx = cx
21
+
22
+ def search(self, query: str, num_results: int = 1):
23
+ """
24
+ Search with Google and return the contexts.
25
+ :param query: The search query.
26
+ :param num_results: The number of results to return.
27
+ :return: A list of search results.
28
+ """
29
+ params = {
30
+ "key": self.subscription_key,
31
+ "cx": self.cx,
32
+ "q": query,
33
+ "num": num_results,
34
+ }
35
+ response = requests.get(GOOGLE_SEARCH_ENDPOINT, params=params, timeout=10)
36
+ if not response.ok:
37
+ logger.error("Search engine error: %s", response.text)
38
+ raise HTTPException(response.status_code, "Search engine error.")
39
+ json_content = response.json()
40
+ try:
41
+ contexts = json_content["items"][:num_results]
42
+ except KeyError:
43
+ logger.error("Error encountered: %s", json_content)
44
+ return []
45
+ return contexts
hf-repo/graphgen/models/vis/__init__.py ADDED
File without changes
hf-repo/graphgen/models/vis/community_visualizer.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from typing import Dict
3
+
4
+ import matplotlib.pyplot as plt
5
+ import networkx as nx
6
+
7
+
8
+ @dataclass
9
+ class Visualizer:
10
+ """
11
+ Class for visualizing graphs using NetworkX and Matplotlib.
12
+ """
13
+
14
+ graph: nx.Graph = None
15
+ communities: Dict[str, int] = None
16
+ layout: str = "spring"
17
+ max_nodes: int = 1000
18
+ node_size: int = 10
19
+ alpha: float = 0.6
20
+
21
+ def visualize(self, save_path: str = None):
22
+ n = self.graph.number_of_nodes()
23
+ if self.layout == "spring":
24
+ k = max(0.1, 1.0 / (n**0.5))
25
+ pos = nx.spring_layout(self.graph, k=k, seed=42)
26
+ else:
27
+ raise ValueError(f"Unknown layout: {self.layout}")
28
+
29
+ plt.figure(figsize=(10, 10))
30
+
31
+ node_colors = [self.communities.get(node, 0) for node in self.graph.nodes()]
32
+
33
+ nx.draw_networkx_nodes(
34
+ self.graph,
35
+ pos,
36
+ node_size=self.node_size,
37
+ node_color=node_colors,
38
+ cmap=plt.cm.tab20,
39
+ alpha=self.alpha,
40
+ )
41
+ nx.draw_networkx_edges(self.graph, pos, alpha=0.3, width=0.2)
42
+ plt.axis("off")
43
+
44
+ if save_path:
45
+ plt.savefig(save_path, dpi=300, bbox_inches="tight")
46
+ print("Saved to", save_path)
47
+ else:
48
+ plt.show()
hf-repo/graphgen/operators/generate/__init__.py ADDED
File without changes
hf-repo/graphgen/operators/generate/generate_cot.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ from typing import Dict, List, Tuple
3
+
4
+ from tqdm.asyncio import tqdm as tqdm_async
5
+
6
+ from graphgen.models import CommunityDetector, NetworkXStorage, OpenAIModel
7
+ from graphgen.templates import COT_GENERATION_PROMPT, COT_TEMPLATE_DESIGN_PROMPT
8
+ from graphgen.utils import compute_content_hash, detect_main_language
9
+
10
+
11
+ async def generate_cot(
12
+ graph_storage: NetworkXStorage,
13
+ synthesizer_llm_client: OpenAIModel,
14
+ method_params: Dict = None,
15
+ ):
16
+ method = method_params.get("method", "leiden")
17
+ detector = CommunityDetector(
18
+ graph_storage=graph_storage, method=method, method_params=method_params
19
+ )
20
+
21
+ results = await detector.detect_communities()
22
+
23
+ # Convert results to a format suitable for summarization
24
+ communities = {}
25
+ for node, community_id in results.items():
26
+ if community_id not in communities:
27
+ communities[community_id] = []
28
+ communities[community_id].append(node)
29
+
30
+ if not communities:
31
+ return {}
32
+
33
+ semaphore = asyncio.Semaphore(value=1000)
34
+
35
+ async def _generate_from_single_community(
36
+ c_id: int, nodes: List[str]
37
+ ) -> Tuple[int, Tuple[str, str, str]]:
38
+ """Summarize a single community."""
39
+ async with semaphore:
40
+ entities: List[str] = []
41
+ relationships: List[str] = []
42
+
43
+ for n in nodes:
44
+ node_data = await graph_storage.get_node(n)
45
+ if node_data is not None:
46
+ entities.append(f"({n}: {node_data.get('description')})")
47
+
48
+ edges = await graph_storage.get_node_edges(n)
49
+ for edge in edges:
50
+ target = edge[1]
51
+ if target in nodes:
52
+ edge_data = await graph_storage.get_edge(n, target)
53
+ relationships.append(
54
+ f"({n}) - [{edge_data['description']}] -> ({target})"
55
+ )
56
+
57
+ entities_str = "\n".join(entities)
58
+ relationships_str = "\n".join(relationships)
59
+
60
+ language = (
61
+ "English"
62
+ if detect_main_language(entities_str + relationships_str) == "en"
63
+ else "Chinese"
64
+ )
65
+
66
+ prompt = COT_TEMPLATE_DESIGN_PROMPT[language]["TEMPLATE"].format(
67
+ entities=entities_str,
68
+ relationships=relationships_str,
69
+ )
70
+
71
+ cot_template = await synthesizer_llm_client.generate_answer(prompt)
72
+
73
+ if "问题:" in cot_template and "推理路径设计:" in cot_template:
74
+ question = cot_template.split("问题:")[1].split("推理路径设计:")[0].strip()
75
+ reasoning_path = cot_template.split("推理路径设计:")[1].strip()
76
+ elif (
77
+ "Question:" in cot_template and "Reasoning-Path Design:" in cot_template
78
+ ):
79
+ question = (
80
+ cot_template.split("Question:")[1]
81
+ .split("Reasoning-Path Design:")[0]
82
+ .strip()
83
+ )
84
+ reasoning_path = cot_template.split("Reasoning-Path Design:")[1].strip()
85
+ else:
86
+ raise ValueError("COT template format is incorrect.")
87
+
88
+ prompt = COT_GENERATION_PROMPT[language]["TEMPLATE"].format(
89
+ entities=entities_str,
90
+ relationships=relationships_str,
91
+ question=question,
92
+ reasoning_template=reasoning_path,
93
+ )
94
+
95
+ cot_answer = await synthesizer_llm_client.generate_answer(prompt)
96
+
97
+ return c_id, (question, reasoning_path, cot_answer)
98
+
99
+ cid_nodes = list(communities.items())
100
+
101
+ results: Dict = {}
102
+ async for coro in tqdm_async(
103
+ asyncio.as_completed(
104
+ [_generate_from_single_community(cid, nodes) for cid, nodes in cid_nodes]
105
+ ),
106
+ total=len(cid_nodes),
107
+ desc="[Generating COT] Generating CoT data from communities",
108
+ unit="community",
109
+ ):
110
+ cid, (q, r, a) = await coro
111
+ results[compute_content_hash(q)] = {
112
+ "question": q,
113
+ "reasoning_path": r,
114
+ "answer": a,
115
+ }
116
+
117
+ return results
hf-repo/graphgen/operators/kg/__init__.py ADDED
File without changes
hf-repo/graphgen/operators/kg/extract_kg.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import re
3
+ from collections import defaultdict
4
+ from typing import List
5
+
6
+ import gradio as gr
7
+ from tqdm.asyncio import tqdm as tqdm_async
8
+
9
+ from graphgen.models import Chunk, OpenAIModel, Tokenizer
10
+ from graphgen.models.storage.base_storage import BaseGraphStorage
11
+ from graphgen.operators.kg.merge_kg import merge_edges, merge_nodes
12
+ from graphgen.templates import KG_EXTRACTION_PROMPT
13
+ from graphgen.utils import (
14
+ detect_if_chinese,
15
+ handle_single_entity_extraction,
16
+ handle_single_relationship_extraction,
17
+ logger,
18
+ pack_history_conversations,
19
+ split_string_by_multi_markers,
20
+ )
21
+
22
+
23
+ # pylint: disable=too-many-statements
24
+ async def extract_kg(
25
+ llm_client: OpenAIModel,
26
+ kg_instance: BaseGraphStorage,
27
+ tokenizer_instance: Tokenizer,
28
+ chunks: List[Chunk],
29
+ progress_bar: gr.Progress = None,
30
+ max_concurrent: int = 1000,
31
+ ):
32
+ """
33
+ :param llm_client: Synthesizer LLM model to extract entities and relationships
34
+ :param kg_instance
35
+ :param tokenizer_instance
36
+ :param chunks
37
+ :param progress_bar: Gradio progress bar to show the progress of the extraction
38
+ :param max_concurrent
39
+ :return:
40
+ """
41
+
42
+ semaphore = asyncio.Semaphore(max_concurrent)
43
+
44
+ async def _process_single_content(chunk: Chunk, max_loop: int = 3):
45
+ async with semaphore:
46
+ chunk_id = chunk.id
47
+ content = chunk.content
48
+ if detect_if_chinese(content):
49
+ language = "Chinese"
50
+ else:
51
+ language = "English"
52
+ KG_EXTRACTION_PROMPT["FORMAT"]["language"] = language
53
+
54
+ hint_prompt = KG_EXTRACTION_PROMPT[language]["TEMPLATE"].format(
55
+ **KG_EXTRACTION_PROMPT["FORMAT"], input_text=content
56
+ )
57
+
58
+ final_result = await llm_client.generate_answer(hint_prompt)
59
+ logger.info("First result: %s", final_result)
60
+
61
+ history = pack_history_conversations(hint_prompt, final_result)
62
+ for loop_index in range(max_loop):
63
+ if_loop_result = await llm_client.generate_answer(
64
+ text=KG_EXTRACTION_PROMPT[language]["IF_LOOP"], history=history
65
+ )
66
+ if_loop_result = if_loop_result.strip().strip('"').strip("'").lower()
67
+ if if_loop_result != "yes":
68
+ break
69
+
70
+ glean_result = await llm_client.generate_answer(
71
+ text=KG_EXTRACTION_PROMPT[language]["CONTINUE"], history=history
72
+ )
73
+ logger.info("Loop %s glean: %s", loop_index, glean_result)
74
+
75
+ history += pack_history_conversations(
76
+ KG_EXTRACTION_PROMPT[language]["CONTINUE"], glean_result
77
+ )
78
+ final_result += glean_result
79
+ if loop_index == max_loop - 1:
80
+ break
81
+
82
+ records = split_string_by_multi_markers(
83
+ final_result,
84
+ [
85
+ KG_EXTRACTION_PROMPT["FORMAT"]["record_delimiter"],
86
+ KG_EXTRACTION_PROMPT["FORMAT"]["completion_delimiter"],
87
+ ],
88
+ )
89
+
90
+ nodes = defaultdict(list)
91
+ edges = defaultdict(list)
92
+
93
+ for record in records:
94
+ record = re.search(r"\((.*)\)", record)
95
+ if record is None:
96
+ continue
97
+ record = record.group(1) # 提取括号内的内容
98
+ record_attributes = split_string_by_multi_markers(
99
+ record, [KG_EXTRACTION_PROMPT["FORMAT"]["tuple_delimiter"]]
100
+ )
101
+
102
+ entity = await handle_single_entity_extraction(
103
+ record_attributes, chunk_id
104
+ )
105
+ if entity is not None:
106
+ nodes[entity["entity_name"]].append(entity)
107
+ continue
108
+ relation = await handle_single_relationship_extraction(
109
+ record_attributes, chunk_id
110
+ )
111
+ if relation is not None:
112
+ edges[(relation["src_id"], relation["tgt_id"])].append(relation)
113
+ return dict(nodes), dict(edges)
114
+
115
+ results = []
116
+ chunk_number = len(chunks)
117
+ async for result in tqdm_async(
118
+ asyncio.as_completed([_process_single_content(c) for c in chunks]),
119
+ total=len(chunks),
120
+ desc="[2/4]Extracting entities and relationships from chunks",
121
+ unit="chunk",
122
+ ):
123
+ try:
124
+ if progress_bar is not None:
125
+ progress_bar(
126
+ len(results) / chunk_number,
127
+ desc="[3/4]Extracting entities and relationships from chunks",
128
+ )
129
+ results.append(await result)
130
+ if progress_bar is not None and len(results) == chunk_number:
131
+ progress_bar(
132
+ 1, desc="[3/4]Extracting entities and relationships from chunks"
133
+ )
134
+ except Exception as e: # pylint: disable=broad-except
135
+ logger.error(
136
+ "Error occurred while extracting entities and relationships from chunks: %s",
137
+ e,
138
+ )
139
+
140
+ nodes = defaultdict(list)
141
+ edges = defaultdict(list)
142
+ for n, e in results:
143
+ for k, v in n.items():
144
+ nodes[k].extend(v)
145
+ for k, v in e.items():
146
+ edges[tuple(sorted(k))].extend(v)
147
+
148
+ await merge_nodes(nodes, kg_instance, llm_client, tokenizer_instance)
149
+ await merge_edges(edges, kg_instance, llm_client, tokenizer_instance)
150
+
151
+ return kg_instance
hf-repo/graphgen/operators/kg/merge_kg.py ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ from collections import Counter
3
+
4
+ from tqdm.asyncio import tqdm as tqdm_async
5
+
6
+ from graphgen.models import Tokenizer, TopkTokenModel
7
+ from graphgen.models.storage.base_storage import BaseGraphStorage
8
+ from graphgen.templates import KG_EXTRACTION_PROMPT, KG_SUMMARIZATION_PROMPT
9
+ from graphgen.utils import detect_main_language, logger
10
+ from graphgen.utils.format import split_string_by_multi_markers
11
+
12
+
13
+ async def _handle_kg_summary(
14
+ entity_or_relation_name: str,
15
+ description: str,
16
+ llm_client: TopkTokenModel,
17
+ tokenizer_instance: Tokenizer,
18
+ max_summary_tokens: int = 200,
19
+ ) -> str:
20
+ """
21
+ 处理实体或关系的描述信息
22
+
23
+ :param entity_or_relation_name
24
+ :param description
25
+ :param llm_client
26
+ :param tokenizer_instance
27
+ :param max_summary_tokens
28
+ :return: new description
29
+ """
30
+ language = detect_main_language(description)
31
+ if language == "en":
32
+ language = "English"
33
+ else:
34
+ language = "Chinese"
35
+ KG_EXTRACTION_PROMPT["FORMAT"]["language"] = language
36
+
37
+ tokens = tokenizer_instance.encode_string(description)
38
+ if len(tokens) < max_summary_tokens:
39
+ return description
40
+
41
+ use_description = tokenizer_instance.decode_tokens(tokens[:max_summary_tokens])
42
+ prompt = KG_SUMMARIZATION_PROMPT[language]["TEMPLATE"].format(
43
+ entity_name=entity_or_relation_name,
44
+ description_list=use_description.split("<SEP>"),
45
+ **KG_SUMMARIZATION_PROMPT["FORMAT"],
46
+ )
47
+ new_description = await llm_client.generate_answer(prompt)
48
+ logger.info(
49
+ "Entity or relation %s summary: %s", entity_or_relation_name, new_description
50
+ )
51
+ return new_description
52
+
53
+
54
+ async def merge_nodes(
55
+ nodes_data: dict,
56
+ kg_instance: BaseGraphStorage,
57
+ llm_client: TopkTokenModel,
58
+ tokenizer_instance: Tokenizer,
59
+ max_concurrent: int = 1000,
60
+ ):
61
+ """
62
+ Merge nodes
63
+
64
+ :param nodes_data
65
+ :param kg_instance
66
+ :param llm_client
67
+ :param tokenizer_instance
68
+ :param max_concurrent
69
+ :return
70
+ """
71
+
72
+ semaphore = asyncio.Semaphore(max_concurrent)
73
+
74
+ async def process_single_node(entity_name: str, node_data: list[dict]):
75
+ async with semaphore:
76
+ entity_types = []
77
+ source_ids = []
78
+ descriptions = []
79
+
80
+ node = await kg_instance.get_node(entity_name)
81
+ if node is not None:
82
+ entity_types.append(node["entity_type"])
83
+ source_ids.extend(
84
+ split_string_by_multi_markers(node["source_id"], ["<SEP>"])
85
+ )
86
+ descriptions.append(node["description"])
87
+
88
+ # 统计当前节点数据和已有节点数据的entity_type出现次数,取出现次数最多的entity_type
89
+ entity_type = sorted(
90
+ Counter([dp["entity_type"] for dp in node_data] + entity_types).items(),
91
+ key=lambda x: x[1],
92
+ reverse=True,
93
+ )[0][0]
94
+
95
+ description = "<SEP>".join(
96
+ sorted(set([dp["description"] for dp in node_data] + descriptions))
97
+ )
98
+ description = await _handle_kg_summary(
99
+ entity_name, description, llm_client, tokenizer_instance
100
+ )
101
+
102
+ source_id = "<SEP>".join(
103
+ set([dp["source_id"] for dp in node_data] + source_ids)
104
+ )
105
+
106
+ node_data = {
107
+ "entity_type": entity_type,
108
+ "description": description,
109
+ "source_id": source_id,
110
+ }
111
+ await kg_instance.upsert_node(entity_name, node_data=node_data)
112
+ node_data["entity_name"] = entity_name
113
+ return node_data
114
+
115
+ logger.info("Inserting entities into storage...")
116
+ entities_data = []
117
+ for result in tqdm_async(
118
+ asyncio.as_completed(
119
+ [process_single_node(k, v) for k, v in nodes_data.items()]
120
+ ),
121
+ total=len(nodes_data),
122
+ desc="Inserting entities into storage",
123
+ unit="entity",
124
+ ):
125
+ try:
126
+ entities_data.append(await result)
127
+ except Exception as e: # pylint: disable=broad-except
128
+ logger.error("Error occurred while inserting entities into storage: %s", e)
129
+
130
+
131
+ async def merge_edges(
132
+ edges_data: dict,
133
+ kg_instance: BaseGraphStorage,
134
+ llm_client: TopkTokenModel,
135
+ tokenizer_instance: Tokenizer,
136
+ max_concurrent: int = 1000,
137
+ ):
138
+ """
139
+ Merge edges
140
+
141
+ :param edges_data
142
+ :param kg_instance
143
+ :param llm_client
144
+ :param tokenizer_instance
145
+ :param max_concurrent
146
+ :return
147
+ """
148
+
149
+ semaphore = asyncio.Semaphore(max_concurrent)
150
+
151
+ async def process_single_edge(src_id: str, tgt_id: str, edge_data: list[dict]):
152
+ async with semaphore:
153
+ source_ids = []
154
+ descriptions = []
155
+
156
+ edge = await kg_instance.get_edge(src_id, tgt_id)
157
+ if edge is not None:
158
+ source_ids.extend(
159
+ split_string_by_multi_markers(edge["source_id"], ["<SEP>"])
160
+ )
161
+ descriptions.append(edge["description"])
162
+
163
+ description = "<SEP>".join(
164
+ sorted(set([dp["description"] for dp in edge_data] + descriptions))
165
+ )
166
+ source_id = "<SEP>".join(
167
+ set([dp["source_id"] for dp in edge_data] + source_ids)
168
+ )
169
+
170
+ for insert_id in [src_id, tgt_id]:
171
+ if not await kg_instance.has_node(insert_id):
172
+ await kg_instance.upsert_node(
173
+ insert_id,
174
+ node_data={
175
+ "source_id": source_id,
176
+ "description": description,
177
+ "entity_type": "UNKNOWN",
178
+ },
179
+ )
180
+
181
+ description = await _handle_kg_summary(
182
+ f"({src_id}, {tgt_id})", description, llm_client, tokenizer_instance
183
+ )
184
+
185
+ await kg_instance.upsert_edge(
186
+ src_id,
187
+ tgt_id,
188
+ edge_data={"source_id": source_id, "description": description},
189
+ )
190
+
191
+ edge_data = {"src_id": src_id, "tgt_id": tgt_id, "description": description}
192
+ return edge_data
193
+
194
+ logger.info("Inserting relationships into storage...")
195
+ relationships_data = []
196
+ for result in tqdm_async(
197
+ asyncio.as_completed(
198
+ [
199
+ process_single_edge(src_id, tgt_id, v)
200
+ for (src_id, tgt_id), v in edges_data.items()
201
+ ]
202
+ ),
203
+ total=len(edges_data),
204
+ desc="Inserting relationships into storage",
205
+ unit="relationship",
206
+ ):
207
+ try:
208
+ relationships_data.append(await result)
209
+ except Exception as e: # pylint: disable=broad-except
210
+ logger.error(
211
+ "Error occurred while inserting relationships into storage: %s", e
212
+ )
hf-repo/graphgen/operators/kg/split_kg.py ADDED
@@ -0,0 +1,381 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ from collections import defaultdict
3
+
4
+ from tqdm.asyncio import tqdm as tqdm_async
5
+
6
+ from graphgen.models import NetworkXStorage, TraverseStrategy
7
+ from graphgen.utils import logger
8
+
9
+
10
+ async def _get_node_info(
11
+ node_id: str,
12
+ graph_storage: NetworkXStorage,
13
+ ) -> dict:
14
+ """
15
+ Get node info
16
+
17
+ :param node_id: node id
18
+ :param graph_storage: graph storage instance
19
+ :return: node info
20
+ """
21
+ node_data = await graph_storage.get_node(node_id)
22
+ return {"node_id": node_id, **node_data}
23
+
24
+
25
+ def _get_level_n_edges_by_max_width(
26
+ edge_adj_list: dict,
27
+ node_dict: dict,
28
+ edges: list,
29
+ nodes,
30
+ src_edge: tuple,
31
+ max_depth: int,
32
+ bidirectional: bool,
33
+ max_extra_edges: int,
34
+ edge_sampling: str,
35
+ loss_strategy: str = "only_edge",
36
+ ) -> list:
37
+ """
38
+ Get level n edges for an edge.
39
+ n is decided by max_depth in traverse_strategy
40
+
41
+ :param edge_adj_list
42
+ :param node_dict
43
+ :param edges
44
+ :param nodes
45
+ :param src_edge
46
+ :param max_depth
47
+ :param bidirectional
48
+ :param max_extra_edges
49
+ :param edge_sampling
50
+ :return: level n edges
51
+ """
52
+ src_id, tgt_id, _ = src_edge
53
+
54
+ level_n_edges = []
55
+
56
+ start_nodes = {tgt_id} if not bidirectional else {src_id, tgt_id}
57
+
58
+ while max_depth > 0 and max_extra_edges > 0:
59
+ max_depth -= 1
60
+
61
+ candidate_edges = [
62
+ edges[edge_id]
63
+ for node in start_nodes
64
+ for edge_id in edge_adj_list[node]
65
+ if not edges[edge_id][2].get("visited", False)
66
+ ]
67
+
68
+ if not candidate_edges:
69
+ break
70
+
71
+ if len(candidate_edges) >= max_extra_edges:
72
+ if loss_strategy == "both":
73
+ er_tuples = [
74
+ ([nodes[node_dict[edge[0]]], nodes[node_dict[edge[1]]]], edge)
75
+ for edge in candidate_edges
76
+ ]
77
+ candidate_edges = _sort_tuples(er_tuples, edge_sampling)[
78
+ :max_extra_edges
79
+ ]
80
+ elif loss_strategy == "only_edge":
81
+ candidate_edges = _sort_edges(candidate_edges, edge_sampling)[
82
+ :max_extra_edges
83
+ ]
84
+ else:
85
+ raise ValueError(f"Invalid loss strategy: {loss_strategy}")
86
+
87
+ for edge in candidate_edges:
88
+ level_n_edges.append(edge)
89
+ edge[2]["visited"] = True
90
+ break
91
+
92
+ max_extra_edges -= len(candidate_edges)
93
+ new_start_nodes = set()
94
+
95
+ for edge in candidate_edges:
96
+ level_n_edges.append(edge)
97
+ edge[2]["visited"] = True
98
+
99
+ if not edge[0] in start_nodes:
100
+ new_start_nodes.add(edge[0])
101
+ if not edge[1] in start_nodes:
102
+ new_start_nodes.add(edge[1])
103
+
104
+ start_nodes = new_start_nodes
105
+
106
+ return level_n_edges
107
+
108
+
109
+ def _get_level_n_edges_by_max_tokens(
110
+ edge_adj_list: dict,
111
+ node_dict: dict,
112
+ edges: list,
113
+ nodes: list,
114
+ src_edge: tuple,
115
+ max_depth: int,
116
+ bidirectional: bool,
117
+ max_tokens: int,
118
+ edge_sampling: str,
119
+ loss_strategy: str = "only_edge",
120
+ ) -> list:
121
+ """
122
+ Get level n edges for an edge.
123
+ n is decided by max_depth in traverse_strategy.
124
+
125
+ :param edge_adj_list
126
+ :param node_dict
127
+ :param edges
128
+ :param nodes
129
+ :param src_edge
130
+ :param max_depth
131
+ :param bidirectional
132
+ :param max_tokens
133
+ :param edge_sampling
134
+ :return: level n edges
135
+ """
136
+ src_id, tgt_id, src_edge_data = src_edge
137
+
138
+ max_tokens -= (
139
+ src_edge_data["length"]
140
+ + nodes[node_dict[src_id]][1]["length"]
141
+ + nodes[node_dict[tgt_id]][1]["length"]
142
+ )
143
+
144
+ level_n_edges = []
145
+
146
+ start_nodes = {tgt_id} if not bidirectional else {src_id, tgt_id}
147
+ temp_nodes = {src_id, tgt_id}
148
+
149
+ while max_depth > 0 and max_tokens > 0:
150
+ max_depth -= 1
151
+
152
+ candidate_edges = [
153
+ edges[edge_id]
154
+ for node in start_nodes
155
+ for edge_id in edge_adj_list[node]
156
+ if not edges[edge_id][2].get("visited", False)
157
+ ]
158
+
159
+ if not candidate_edges:
160
+ break
161
+
162
+ if loss_strategy == "both":
163
+ er_tuples = [
164
+ ([nodes[node_dict[edge[0]]], nodes[node_dict[edge[1]]]], edge)
165
+ for edge in candidate_edges
166
+ ]
167
+ candidate_edges = _sort_tuples(er_tuples, edge_sampling)
168
+ elif loss_strategy == "only_edge":
169
+ candidate_edges = _sort_edges(candidate_edges, edge_sampling)
170
+ else:
171
+ raise ValueError(f"Invalid loss strategy: {loss_strategy}")
172
+
173
+ for edge in candidate_edges:
174
+ max_tokens -= edge[2]["length"]
175
+ if not edge[0] in temp_nodes:
176
+ max_tokens -= nodes[node_dict[edge[0]]][1]["length"]
177
+ if not edge[1] in temp_nodes:
178
+ max_tokens -= nodes[node_dict[edge[1]]][1]["length"]
179
+
180
+ if max_tokens < 0:
181
+ return level_n_edges
182
+
183
+ level_n_edges.append(edge)
184
+ edge[2]["visited"] = True
185
+ temp_nodes.add(edge[0])
186
+ temp_nodes.add(edge[1])
187
+
188
+ new_start_nodes = set()
189
+ for edge in candidate_edges:
190
+ if not edge[0] in start_nodes:
191
+ new_start_nodes.add(edge[0])
192
+ if not edge[1] in start_nodes:
193
+ new_start_nodes.add(edge[1])
194
+
195
+ start_nodes = new_start_nodes
196
+
197
+ return level_n_edges
198
+
199
+
200
+ def _sort_tuples(er_tuples: list, edge_sampling: str) -> list:
201
+ """
202
+ Sort edges with edge sampling strategy
203
+
204
+ :param er_tuples: [(nodes:list, edge:tuple)]
205
+ :param edge_sampling: edge sampling strategy (random, min_loss, max_loss)
206
+ :return: sorted edges
207
+ """
208
+ if edge_sampling == "random":
209
+ er_tuples = random.sample(er_tuples, len(er_tuples))
210
+ elif edge_sampling == "min_loss":
211
+ er_tuples = sorted(
212
+ er_tuples,
213
+ key=lambda x: sum(node[1]["loss"] for node in x[0]) + x[1][2]["loss"],
214
+ )
215
+ elif edge_sampling == "max_loss":
216
+ er_tuples = sorted(
217
+ er_tuples,
218
+ key=lambda x: sum(node[1]["loss"] for node in x[0]) + x[1][2]["loss"],
219
+ reverse=True,
220
+ )
221
+ else:
222
+ raise ValueError(f"Invalid edge sampling: {edge_sampling}")
223
+ edges = [edge for _, edge in er_tuples]
224
+ return edges
225
+
226
+
227
+ def _sort_edges(edges: list, edge_sampling: str) -> list:
228
+ """
229
+ Sort edges with edge sampling strategy
230
+
231
+ :param edges: total edges
232
+ :param edge_sampling: edge sampling strategy (random, min_loss, max_loss)
233
+ :return: sorted edges
234
+ """
235
+ if edge_sampling == "random":
236
+ random.shuffle(edges)
237
+ elif edge_sampling == "min_loss":
238
+ edges = sorted(edges, key=lambda x: x[2]["loss"])
239
+ elif edge_sampling == "max_loss":
240
+ edges = sorted(edges, key=lambda x: x[2]["loss"], reverse=True)
241
+ else:
242
+ raise ValueError(f"Invalid edge sampling: {edge_sampling}")
243
+ return edges
244
+
245
+
246
+ async def get_batches_with_strategy( # pylint: disable=too-many-branches
247
+ nodes: list,
248
+ edges: list,
249
+ graph_storage: NetworkXStorage,
250
+ traverse_strategy: TraverseStrategy,
251
+ ):
252
+ expand_method = traverse_strategy.expand_method
253
+ if expand_method == "max_width":
254
+ logger.info("Using max width strategy")
255
+ elif expand_method == "max_tokens":
256
+ logger.info("Using max tokens strategy")
257
+ else:
258
+ raise ValueError(f"Invalid expand method: {expand_method}")
259
+
260
+ max_depth = traverse_strategy.max_depth
261
+ edge_sampling = traverse_strategy.edge_sampling
262
+
263
+ # 构建临接矩阵
264
+ edge_adj_list = defaultdict(list)
265
+ node_dict = {}
266
+ processing_batches = []
267
+
268
+ node_cache = {}
269
+
270
+ async def get_cached_node_info(node_id: str) -> dict:
271
+ if node_id not in node_cache:
272
+ node_cache[node_id] = await _get_node_info(node_id, graph_storage)
273
+ return node_cache[node_id]
274
+
275
+ for i, (node_name, _) in enumerate(nodes):
276
+ node_dict[node_name] = i
277
+
278
+ if traverse_strategy.loss_strategy == "both":
279
+ er_tuples = [
280
+ ([nodes[node_dict[edge[0]]], nodes[node_dict[edge[1]]]], edge)
281
+ for edge in edges
282
+ ]
283
+ edges = _sort_tuples(er_tuples, edge_sampling)
284
+ elif traverse_strategy.loss_strategy == "only_edge":
285
+ edges = _sort_edges(edges, edge_sampling)
286
+ else:
287
+ raise ValueError(f"Invalid loss strategy: {traverse_strategy.loss_strategy}")
288
+
289
+ for i, (src, tgt, _) in enumerate(edges):
290
+ edge_adj_list[src].append(i)
291
+ edge_adj_list[tgt].append(i)
292
+
293
+ for edge in tqdm_async(edges, desc="Preparing batches"):
294
+ if "visited" in edge[2] and edge[2]["visited"]:
295
+ continue
296
+
297
+ edge[2]["visited"] = True
298
+
299
+ _process_nodes = []
300
+ _process_edges = []
301
+
302
+ src_id = edge[0]
303
+ tgt_id = edge[1]
304
+
305
+ _process_nodes.extend(
306
+ [await get_cached_node_info(src_id), await get_cached_node_info(tgt_id)]
307
+ )
308
+ _process_edges.append(edge)
309
+
310
+ if expand_method == "max_width":
311
+ level_n_edges = _get_level_n_edges_by_max_width(
312
+ edge_adj_list,
313
+ node_dict,
314
+ edges,
315
+ nodes,
316
+ edge,
317
+ max_depth,
318
+ traverse_strategy.bidirectional,
319
+ traverse_strategy.max_extra_edges,
320
+ edge_sampling,
321
+ traverse_strategy.loss_strategy,
322
+ )
323
+ else:
324
+ level_n_edges = _get_level_n_edges_by_max_tokens(
325
+ edge_adj_list,
326
+ node_dict,
327
+ edges,
328
+ nodes,
329
+ edge,
330
+ max_depth,
331
+ traverse_strategy.bidirectional,
332
+ traverse_strategy.max_tokens,
333
+ edge_sampling,
334
+ traverse_strategy.loss_strategy,
335
+ )
336
+
337
+ for _edge in level_n_edges:
338
+ _process_nodes.append(await get_cached_node_info(_edge[0]))
339
+ _process_nodes.append(await get_cached_node_info(_edge[1]))
340
+ _process_edges.append(_edge)
341
+
342
+ # 去重
343
+ _process_nodes = list(
344
+ {node["node_id"]: node for node in _process_nodes}.values()
345
+ )
346
+ _process_edges = list(
347
+ {(edge[0], edge[1]): edge for edge in _process_edges}.values()
348
+ )
349
+
350
+ processing_batches.append((_process_nodes, _process_edges))
351
+
352
+ logger.info("Processing batches: %d", len(processing_batches))
353
+
354
+ # isolate nodes
355
+ isolated_node_strategy = traverse_strategy.isolated_node_strategy
356
+ if isolated_node_strategy == "add":
357
+ processing_batches = await _add_isolated_nodes(
358
+ nodes, processing_batches, graph_storage
359
+ )
360
+ logger.info(
361
+ "Processing batches after adding isolated nodes: %d",
362
+ len(processing_batches),
363
+ )
364
+
365
+ return processing_batches
366
+
367
+
368
+ async def _add_isolated_nodes(
369
+ nodes: list,
370
+ processing_batches: list,
371
+ graph_storage: NetworkXStorage,
372
+ ) -> list:
373
+ visited_nodes = set()
374
+ for _process_nodes, _process_edges in processing_batches:
375
+ for node in _process_nodes:
376
+ visited_nodes.add(node["node_id"])
377
+ for node in nodes:
378
+ if node[0] not in visited_nodes:
379
+ _process_nodes = [await _get_node_info(node[0], graph_storage)]
380
+ processing_batches.append((_process_nodes, []))
381
+ return processing_batches
hf-repo/graphgen/operators/preprocess/__init__.py ADDED
File without changes
hf-repo/graphgen/operators/preprocess/resolute_coreference.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+
3
+ from graphgen.models import Chunk, OpenAIModel
4
+ from graphgen.templates import COREFERENCE_RESOLUTION_PROMPT
5
+ from graphgen.utils import detect_main_language
6
+
7
+
8
+ async def resolute_coreference(
9
+ llm_client: OpenAIModel, chunks: List[Chunk]
10
+ ) -> List[Chunk]:
11
+ """
12
+ Resolute conference
13
+
14
+ :param llm_client: LLM model
15
+ :param chunks: List of chunks
16
+ :return: List of chunks
17
+ """
18
+
19
+ if len(chunks) == 0:
20
+ return chunks
21
+
22
+ results = [chunks[0]]
23
+
24
+ for _, chunk in enumerate(chunks[1:]):
25
+ language = detect_main_language(chunk.content)
26
+ result = await llm_client.generate_answer(
27
+ COREFERENCE_RESOLUTION_PROMPT[language].format(
28
+ reference=results[0].content, input_sentence=chunk.content
29
+ )
30
+ )
31
+ results.append(Chunk(id=chunk.id, content=result))
32
+
33
+ return results
hf-repo/graphgen/operators/search/__init__.py ADDED
File without changes
hf-repo/graphgen/operators/search/db/__init__.py ADDED
File without changes
hf-repo/graphgen/operators/search/db/search_uniprot.py ADDED
File without changes
hf-repo/graphgen/operators/search/kg/__init__.py ADDED
File without changes
hf-repo/graphgen/operators/search/kg/search_wikipedia.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from tqdm.asyncio import tqdm_asyncio as tqdm_async
2
+
3
+ from graphgen.models import WikiSearch
4
+ from graphgen.utils import logger
5
+
6
+
7
+ async def _process_single_entity(
8
+ entity_name: str,
9
+ wiki_search_client: WikiSearch,
10
+ ) -> str | None:
11
+ """
12
+ Process single entity by searching Wikipedia
13
+ :param entity_name
14
+ :param wiki_search_client
15
+ :return: summary of the entity or None if not found
16
+ """
17
+ search_results = await wiki_search_client.search(entity_name)
18
+ if not search_results:
19
+ return None
20
+
21
+ summary = None
22
+ try:
23
+ summary = await wiki_search_client.summary(search_results[-1])
24
+ logger.info(
25
+ "Entity %s search result: %s summary: %s",
26
+ entity_name,
27
+ str(search_results),
28
+ summary,
29
+ )
30
+ except Exception as e: # pylint: disable=broad-except
31
+ logger.error("Error processing entity %s: %s", entity_name, str(e))
32
+
33
+ return summary
34
+
35
+
36
+ async def search_wikipedia(
37
+ wiki_search_client: WikiSearch,
38
+ entities: set[str],
39
+ ) -> dict:
40
+ """
41
+ Search wikipedia for entities
42
+
43
+ :param wiki_search_client: wiki search client
44
+ :param entities: list of entities to search
45
+ :return: nodes with search results
46
+ """
47
+ wiki_data = {}
48
+
49
+ async for entity in tqdm_async(
50
+ entities, desc="Searching Wikipedia", total=len(entities)
51
+ ):
52
+ try:
53
+ summary = await _process_single_entity(entity, wiki_search_client)
54
+ if summary:
55
+ wiki_data[entity] = summary
56
+ except Exception as e: # pylint: disable=broad-except
57
+ logger.error("Error processing entity %s: %s", entity, str(e))
58
+ return wiki_data
hf-repo/graphgen/operators/search/search_all.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ To use Google Web Search API,
3
+ follow the instructions [here](https://developers.google.com/custom-search/v1/overview)
4
+ to get your Google search api key.
5
+
6
+ To use Bing Web Search API,
7
+ follow the instructions [here](https://www.microsoft.com/en-us/bing/apis/bing-web-search-api)
8
+ and obtain your Bing subscription key.
9
+ """
10
+
11
+ import os
12
+
13
+ from graphgen.utils import logger
14
+
15
+
16
+ async def search_all(
17
+ search_types: dict, search_entities: set[str]
18
+ ) -> dict[str, dict[str, str]]:
19
+ """
20
+ :param search_types
21
+ :param search_entities: list of entities to search
22
+ :return: nodes with search results
23
+ """
24
+
25
+ results = {}
26
+
27
+ for search_type in search_types:
28
+ if search_type == "wikipedia":
29
+ from graphgen.models import WikiSearch
30
+ from graphgen.operators.search.kg.search_wikipedia import search_wikipedia
31
+
32
+ wiki_search_client = WikiSearch()
33
+
34
+ wiki_results = await search_wikipedia(wiki_search_client, search_entities)
35
+ for entity_name, description in wiki_results.items():
36
+ if description:
37
+ results[entity_name] = {"wikipedia": description}
38
+ elif search_type == "google":
39
+ from graphgen.models import GoogleSearch
40
+ from graphgen.operators.search.web.search_google import search_google
41
+
42
+ google_search_client = GoogleSearch(
43
+ subscription_key=os.environ["GOOGLE_SEARCH_API_KEY"],
44
+ cx=os.environ["GOOGLE_SEARCH_CX"],
45
+ )
46
+
47
+ google_results = await search_google(google_search_client, search_entities)
48
+ for entity_name, description in google_results.items():
49
+ if description:
50
+ results[entity_name] = results.get(entity_name, {})
51
+ results[entity_name]["google"] = description
52
+ elif search_type == "bing":
53
+ from graphgen.models import BingSearch
54
+ from graphgen.operators.search.web.search_bing import search_bing
55
+
56
+ bing_search_client = BingSearch(
57
+ subscription_key=os.environ["BING_SEARCH_API_KEY"]
58
+ )
59
+
60
+ bing_results = await search_bing(bing_search_client, search_entities)
61
+ for entity_name, description in bing_results.items():
62
+ if description:
63
+ results[entity_name] = results.get(entity_name, {})
64
+ results[entity_name]["bing"] = description
65
+ elif search_type == "uniprot":
66
+ # from graphgen.models import UniProtSearch
67
+ # from graphgen.operators.search.db.search_uniprot import search_uniprot
68
+ #
69
+ # uniprot_search_client = UniProtSearch()
70
+ #
71
+ # uniprot_results = await search_uniprot(
72
+ # uniprot_search_client, search_entities
73
+ # )
74
+ raise NotImplementedError(
75
+ "Processing of UniProt search results is not implemented yet."
76
+ )
77
+
78
+ else:
79
+ logger.error("Search type %s is not supported yet.", search_type)
80
+ continue
81
+
82
+ return results
hf-repo/graphgen/operators/search/web/__init__.py ADDED
File without changes
hf-repo/graphgen/operators/search/web/search_bing.py ADDED
@@ -0,0 +1,53 @@
1
+ import trafilatura
2
+ from tqdm.asyncio import tqdm_asyncio as tqdm_async
3
+
4
+ from graphgen.models import BingSearch
5
+ from graphgen.utils import logger
6
+
7
+
8
+ async def _process_single_entity(
9
+ entity_name: str, bing_search_client: BingSearch
10
+ ) -> str | None:
11
+ """
12
+ Process single entity by searching Bing.
13
+ :param entity_name: The name of the entity to search.
14
+ :param bing_search_client: The Bing search client.
15
+ :return: Summary of the entity or None if not found.
16
+ """
17
+ search_results = bing_search_client.search(entity_name)
18
+ if not search_results:
19
+ return None
20
+
21
+ # Get more details from the first search result
22
+ first_result = search_results[0]
23
+ content = trafilatura.fetch_url(first_result["url"])
24
+ summary = trafilatura.extract(content, include_comments=False, include_links=False)
25
+ if not summary:  # trafilatura returns None when nothing could be extracted
+ return None
+ summary = summary.strip()
26
+ logger.info(
27
+ "Entity %s search result: %s",
28
+ entity_name,
29
+ summary,
30
+ )
31
+ return summary
32
+
33
+
34
+ async def search_bing(
35
+ bing_search_client: BingSearch,
36
+ entities: set[str],
37
+ ) -> dict[str, str]:
38
+ """
39
+ Search with Bing and return the contexts.
40
+ :return: mapping from entity name to extracted summary text
41
+ """
42
+ bing_data = {}
43
+
44
+ async for entity in tqdm_async(
45
+ entities, desc="Searching Bing", total=len(entities)
46
+ ):
47
+ try:
48
+ summary = await _process_single_entity(entity, bing_search_client)
49
+ if summary:
50
+ bing_data[entity] = summary
51
+ except Exception as e: # pylint: disable=broad-except
52
+ logger.error("Error processing entity %s: %s", entity, str(e))
53
+ return bing_data
hf-repo/graphgen/operators/search/web/search_google.py ADDED
@@ -0,0 +1,49 @@
1
+ import trafilatura
2
+ from tqdm.asyncio import tqdm_asyncio as tqdm_async
3
+
4
+ from graphgen.models import GoogleSearch
5
+ from graphgen.utils import logger
6
+
7
+
8
+ async def _process_single_entity(
9
+ entity_name: str, google_search_client: GoogleSearch
10
+ ) -> str | None:
11
+ search_results = google_search_client.search(entity_name)
12
+ if not search_results:
13
+ return None
14
+
15
+ # Get more details from the first search result
16
+ first_result = search_results[0]
17
+ content = trafilatura.fetch_url(first_result["link"])
18
+ summary = trafilatura.extract(content, include_comments=False, include_links=False)
19
+ if not summary:  # trafilatura returns None when nothing could be extracted
+ return None
+ summary = summary.strip()
20
+ logger.info(
21
+ "Entity %s search result: %s",
22
+ entity_name,
23
+ summary,
24
+ )
25
+ return summary
26
+
27
+
28
+ async def search_google(
29
+ google_search_client: GoogleSearch,
30
+ entities: set[str],
31
+ ) -> dict[str, str]:
32
+ """
33
+ Search with Google and return the contexts.
34
+ :param google_search_client: Google search client
35
+ :param entities: set of entity names to search
36
+ :return: mapping from entity name to extracted summary text
37
+ """
38
+ google_data = {}
39
+
40
+ async for entity in tqdm_async(
41
+ entities, desc="Searching Google", total=len(entities)
42
+ ):
43
+ try:
44
+ summary = await _process_single_entity(entity, google_search_client)
45
+ if summary:
46
+ google_data[entity] = summary
47
+ except Exception as e: # pylint: disable=broad-except
48
+ logger.error("Error processing entity %s: %s", entity, str(e))
49
+ return google_data
hf-repo/graphgen/templates/community/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from .cot_generation import COT_GENERATION_PROMPT
2
+ from .cot_template_design import COT_TEMPLATE_DESIGN_PROMPT
hf-repo/graphgen/templates/community/cot_generation.py ADDED
@@ -0,0 +1,87 @@
1
+ TEMPLATE_ZH = """根据给定的知识图谱原始信息及已生成的推理路径,产出一条符合模板要求、可直接用于下游训练或推理的 CoT 数据。\
2
+ CoT(Chain-of-Thought,思维链)指在回答复杂问题时,把中间推理步骤一步一步显式写出来,使推理过程透明、可追溯,而不是直接给出最终答案。
3
+
4
+ -输入格式-
5
+ [Entities:]
6
+ (实体名:实体描述)
7
+ ...
8
+
9
+ [Relationships:]
10
+ (来源实体)-[关系描述]->(目标实体)
11
+ ...
12
+
13
+ [Question and Reasoning Path:]
14
+ (问题)
15
+ (推理路径)
16
+
17
+ -输出要求-
18
+ 1. 每一步只完成一个不可分割的子任务,并用自然语言衔接,但是要避免生硬的连接词。
19
+ 2. 使用中文。
20
+ 3. 不要使用有序列表或编号。
21
+ 4. 请直接给出答案,不要生成无关信息。
22
+
23
+ -真实数据-
24
+ 输入:
25
+ [Entities:]:
26
+ {entities}
27
+
28
+ [Relationships:]:
29
+ {relationships}
30
+
31
+ [Question:]:
32
+ {question}
33
+
34
+ [Reasoning_Template:]:
35
+ {reasoning_template}
36
+
37
+ 输出:
38
+
39
+ """
40
+
41
+ TEMPLATE_EN = """Given the raw knowledge graph information and the provided reasoning-path, \
42
+ produce one Chain-of-Thought (CoT) sample that strictly follows the template \
43
+ and can be directly used for downstream training or inference.
44
+ CoT (Chain-of-Thought) means that when answering a complex question, the intermediate reasoning steps are \
45
+ explicitly written out one by one, making the reasoning process transparent and traceable instead of giving \
46
+ only the final answer.
47
+
48
+ -Input Format-
49
+ [Entities:]:
50
+ (ENTITY_NAME: ENTITY_DESCRIPTION)
51
+ ...
52
+
53
+ [Relationships:]:
54
+ (ENTITY_SOURCE)-[RELATIONSHIP_DESCRIPTION]->(ENTITY_TARGET)
55
+ ...
56
+
57
+ [Question and Reasoning Path:]:
58
+ (QUESTION)
59
+ (REASONING_PATH)
60
+
61
+ -Output Requirements-
62
+ 1. Each step completes a single, indivisible sub-task and is naturally connected, avoiding abrupt transition words.
63
+ 2. Use English.
64
+ 3. Do not use ordered lists or numbering.
65
+ 4. Do not generate extraneous information, just provide the answer.
66
+
67
+ -Real Data-
68
+ Input:
69
+ [Entities:]:
70
+ {entities}
71
+
72
+ [Relationships:]:
73
+ {relationships}
74
+
75
+ [Question:]:
76
+ {question}
77
+
78
+ [Reasoning_Template:]:
79
+ {reasoning_template}
80
+
81
+ Output:
82
+ """
83
+
84
+ COT_GENERATION_PROMPT = {
85
+ "Chinese": {"TEMPLATE": TEMPLATE_ZH},
86
+ "English": {"TEMPLATE": TEMPLATE_EN},
87
+ }
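
A brief hedged sketch of how these templates are presumably consumed: plain str.format substitution matching the placeholder names above ({entities}, {relationships}, {question}, {reasoning_template}). The toy entities, relation, and question are invented for illustration only.

from graphgen.templates.community import COT_GENERATION_PROMPT

entities = "(Alan Turing: British mathematician and pioneer of computer science)"
relationships = "(Alan Turing)-[proposed]->(Turing machine)"
question = "What foundational model of computation did Alan Turing propose?"
reasoning_template = (
    "Locate the entity describing Turing, follow the relation about what he proposed, "
    "and state the target concept as the answer."
)

# Fill the English template; the Chinese variant is selected via the "Chinese" key.
prompt = COT_GENERATION_PROMPT["English"]["TEMPLATE"].format(
    entities=entities,
    relationships=relationships,
    question=question,
    reasoning_template=reasoning_template,
)
print(prompt[:300])
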
hf-repo/graphgen/templates/community/cot_template_design.py ADDED
@@ -0,0 +1,107 @@
1
+ TEMPLATE_ZH = """你是一位“元推理架构师”。你的任务不是回答问题,\
2
+ 而是根据给定的知识图谱中的实体和关系的名称以及描述信息,设计一条可复用、可泛化的 CoT 推理路径模板。\
3
+
4
+ -步骤-
5
+ 1. 实体识别
6
+ - 准确地识别[Entities:]章节中的实体信息,包括实体名、实体描述信息。
7
+ - 实体信息的一般格式为:
8
+ (实体名:实体描述)
9
+
10
+ 2. 关系识别
11
+ - 准确地识别[Relationships:]章节中的关系信息,包括来源实体名、目标实体名、关系描述信息。
12
+ - 关系信息的一般格式为:
13
+ (来源实体名)-[关系描述]->(目标实体名)
14
+
15
+ 3. 图结构理解
16
+ - 正确地将关系信息中的来源实体名与实体信息关联。
17
+ - 根据提供的关系信息还原出图结构。
18
+
19
+ 4. 问题设计
20
+ - 围绕知识图谱所表达的“核心主题”设计一个问题。
21
+ - 问题必须能在图谱内部通过实体、关系或属性直接验证;避免主观判断。
22
+ - 问题应该能够让模型进行足够的思考,充分利用图谱中的实体和关系,避免过于简单或无关的问题。
23
+
24
+ 5. 推理路径生成
25
+ - 根据问题设计一个**可被后续模型直接执行的推理蓝图**。
26
+ - 保持步骤最小化:每一步只解决一个“不可分割”的子问题。
27
+
28
+ -约束条件-
29
+ 1. 不要在回答中描述你的思考过程,直接给出回复,只给出问题和推理路径设计,不要生成无关信息。
30
+ 2. 如果提供的描述信息相互矛盾,请解决矛盾并提供一个单一、连贯的逻辑。
31
+ 3. 避免使用停用词和过于常见的词汇。
32
+ 4. 不要出现具体数值或结论,不要出现“识别实体”、“识别关系”这类无意义的操作描述。
33
+ 5. 使用中文作为输出语言。
34
+ 6. 输出格式为:
35
+ 问题:
36
+ 推理路径设计:
37
+
38
+ -真实数据-
39
+ 输入:
40
+ [Entities:]:
41
+ {entities}
42
+
43
+ [Relationships:]:
44
+ {relationships}
45
+
46
+ 输出:
47
+ """
48
+
49
+
50
+ TEMPLATE_EN = """You are a “meta-reasoning architect”. \
51
+ Your task is NOT to answer the question, but to design a reusable, generalizable CoT reasoning-path \
52
+ template based solely on the names and descriptions of entities and \
53
+ relationships in the provided knowledge graph.
54
+
55
+ - Steps -
56
+ 1. Entity Recognition
57
+ - Accurately recognize entity information in the [Entities:] section, including entity names and descriptions.
58
+ - The general formats for entity information are:
59
+ (ENTITY_NAME: ENTITY_DESCRIPTION)
60
+
61
+ 2. Relationship Recognition
62
+ - Accurately recognize relationship information in the [Relationships:] section, including source_entity_name, target_entity_name, and relationship descriptions.
63
+ - The general formats for relationship information are:
64
+ (SOURCE_ENTITY_NAME)-[RELATIONSHIP_DESCRIPTION]->(TARGET_ENTITY_NAME)
65
+
66
+ 3. Graph Structure Understanding
67
+ - Correctly associate the source entity name in the relationship information with the entity information.
68
+ - Reconstruct the graph structure based on the provided relationship information.
69
+
70
+ 4. Question Design
71
+ - Design a question around the "core theme" expressed by the knowledge graph.
72
+ - The question must be verifiable directly within the graph through entities, relationships, or attributes; avoid subjective judgments.
73
+ - The question should allow the model to think sufficiently, fully utilizing the entities and relationships in the graph, avoiding overly simple or irrelevant questions.
74
+
75
+ 5. Reasoning-Path Design
76
+ - Output a **blueprint that any later model can directly execute**.
77
+ - Keep steps minimal: each step solves one indivisible sub-problem.
78
+
79
+
80
+ - Constraints -
81
+ 1. Do NOT describe your thinking; output only the question and the reasoning-path design.
82
+ 2. If the provided descriptions are contradictory, resolve conflicts and provide a single coherent logic.
83
+ 3. Avoid using stop words and overly common words.
84
+ 4. Do not include specific numerical values or conclusions, \
85
+ and DO NOT describe meaningless operations such as "Identify the entity" or "Identify the relationship".
86
+ 5. Use English as the output language.
87
+ 6. The output format is:
88
+ Question:
89
+ Reasoning-Path Design:
90
+
91
+ Please summarize the information expressed by the knowledge graph based on the following [Entities:] and [Relationships:] provided.
92
+
93
+ - Real Data -
94
+ Input:
95
+ [Entities:]:
96
+ {entities}
97
+
98
+ [Relationships:]:
99
+ {relationships}
100
+
101
+ Output:
102
+ """
103
+
104
+ COT_TEMPLATE_DESIGN_PROMPT = {
105
+ "Chinese": {"TEMPLATE": TEMPLATE_ZH},
106
+ "English": {"TEMPLATE": TEMPLATE_EN},
107
+ }
hf-repo/graphgen/utils/file.py ADDED
@@ -0,0 +1,24 @@
1
+ import json
2
+
3
+
4
+ def read_file(input_file: str) -> list:
5
+ """
6
+ Read data from a file; the format is inferred from the file extension.
7
+ :param input_file: path to a .jsonl, .json, or .txt file
8
+ :return: list of records parsed from the file
9
+ """
10
+
11
+ if input_file.endswith(".jsonl"):
12
+ with open(input_file, "r", encoding="utf-8") as f:
13
+ data = [json.loads(line) for line in f]
14
+ elif input_file.endswith(".json"):
15
+ with open(input_file, "r", encoding="utf-8") as f:
16
+ data = json.load(f)
17
+ elif input_file.endswith(".txt"):
18
+ with open(input_file, "r", encoding="utf-8") as f:
19
+ data = [line.strip() for line in f if line.strip()]
20
+ data = [{"content": line} for line in data]
21
+ else:
22
+ raise ValueError(f"Unsupported file format: {input_file}")
23
+
24
+ return data
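
A quick usage sketch for the helper above; the file names are illustrative, and read_file is imported from graphgen.utils, matching the import used in graphgen.py later in this commit.

from graphgen.utils import read_file

# .txt  -> one {"content": ...} dict per non-empty line
docs = read_file("corpus.txt")
# .jsonl -> one parsed JSON object per line
records = read_file("data.jsonl")
# .json -> whatever the top-level JSON value holds (expected to be a list)
chunks = read_file("chunks.json")
print(len(docs), len(records), len(chunks))
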
hf-repo/hf-repo/LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
hf-repo/hf-repo/app.py ADDED
@@ -0,0 +1,586 @@
1
+ import json
2
+ import os
3
+ import sys
4
+ import tempfile
5
+
6
+ import gradio as gr
7
+ import pandas as pd
8
+ from base import GraphGenParams
9
+ from cache_utils import cleanup_workspace, setup_workspace
10
+ from count_tokens import count_tokens
11
+ from gradio_i18n import Translate
12
+ from gradio_i18n import gettext as _
13
+ from test_api import test_api_connection
14
+
15
+ # pylint: disable=wrong-import-position
16
+ root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
17
+ sys.path.append(root_dir)
18
+
19
+ from graphgen.graphgen import GraphGen
20
+ from graphgen.models import OpenAIModel, Tokenizer, TraverseStrategy
21
+ from graphgen.models.llm.limitter import RPM, TPM
22
+ from graphgen.utils import set_logger
23
+
24
+ css = """
25
+ .center-row {
26
+ display: flex;
27
+ justify-content: center;
28
+ align-items: center;
29
+ }
30
+ """
31
+
32
+
33
+ def init_graph_gen(config: dict, env: dict) -> GraphGen:
34
+ # Set up working directory
35
+ log_file, working_dir = setup_workspace(os.path.join(root_dir, "cache"))
36
+
37
+ set_logger(log_file, if_stream=False)
38
+ graph_gen = GraphGen(working_dir=working_dir)
39
+
40
+ # Set up LLM clients
41
+ graph_gen.synthesizer_llm_client = OpenAIModel(
42
+ model_name=env.get("SYNTHESIZER_MODEL", ""),
43
+ base_url=env.get("SYNTHESIZER_BASE_URL", ""),
44
+ api_key=env.get("SYNTHESIZER_API_KEY", ""),
45
+ request_limit=True,
46
+ rpm=RPM(env.get("RPM", 1000)),
47
+ tpm=TPM(env.get("TPM", 50000)),
48
+ )
49
+
50
+ graph_gen.trainee_llm_client = OpenAIModel(
51
+ model_name=env.get("TRAINEE_MODEL", ""),
52
+ base_url=env.get("TRAINEE_BASE_URL", ""),
53
+ api_key=env.get("TRAINEE_API_KEY", ""),
54
+ request_limit=True,
55
+ rpm=RPM(env.get("RPM", 1000)),
56
+ tpm=TPM(env.get("TPM", 50000)),
57
+ )
58
+
59
+ graph_gen.tokenizer_instance = Tokenizer(config.get("tokenizer", "cl100k_base"))
60
+
61
+ strategy_config = config.get("traverse_strategy", {})
62
+ graph_gen.traverse_strategy = TraverseStrategy(
63
+ qa_form=strategy_config.get("qa_form"),
64
+ expand_method=strategy_config.get("expand_method"),
65
+ bidirectional=strategy_config.get("bidirectional"),
66
+ max_extra_edges=strategy_config.get("max_extra_edges"),
67
+ max_tokens=strategy_config.get("max_tokens"),
68
+ max_depth=strategy_config.get("max_depth"),
69
+ edge_sampling=strategy_config.get("edge_sampling"),
70
+ isolated_node_strategy=strategy_config.get("isolated_node_strategy"),
71
+ loss_strategy=str(strategy_config.get("loss_strategy")),
72
+ )
73
+
74
+ return graph_gen
75
+
76
+
77
+ # pylint: disable=too-many-statements
78
+ def run_graphgen(params, progress=gr.Progress()):
79
+ def sum_tokens(client):
80
+ return sum(u["total_tokens"] for u in client.token_usage)
81
+
82
+ config = {
83
+ "if_trainee_model": params.if_trainee_model,
84
+ "input_file": params.input_file,
85
+ "tokenizer": params.tokenizer,
86
+ "quiz_samples": params.quiz_samples,
87
+ "traverse_strategy": {
88
+ "qa_form": params.qa_form,
89
+ "bidirectional": params.bidirectional,
90
+ "expand_method": params.expand_method,
91
+ "max_extra_edges": params.max_extra_edges,
92
+ "max_tokens": params.max_tokens,
93
+ "max_depth": params.max_depth,
94
+ "edge_sampling": params.edge_sampling,
95
+ "isolated_node_strategy": params.isolated_node_strategy,
96
+ "loss_strategy": params.loss_strategy,
97
+ },
98
+ "chunk_size": params.chunk_size,
99
+ }
100
+
101
+ env = {
102
+ "SYNTHESIZER_BASE_URL": params.synthesizer_url,
103
+ "SYNTHESIZER_MODEL": params.synthesizer_model,
104
+ "TRAINEE_BASE_URL": params.trainee_url,
105
+ "TRAINEE_MODEL": params.trainee_model,
106
+ "SYNTHESIZER_API_KEY": params.api_key,
107
+ "TRAINEE_API_KEY": params.trainee_api_key,
108
+ "RPM": params.rpm,
109
+ "TPM": params.tpm,
110
+ }
111
+
112
+ # Test API connection
113
+ test_api_connection(
114
+ env["SYNTHESIZER_BASE_URL"],
115
+ env["SYNTHESIZER_API_KEY"],
116
+ env["SYNTHESIZER_MODEL"],
117
+ )
118
+ if config["if_trainee_model"]:
119
+ test_api_connection(
120
+ env["TRAINEE_BASE_URL"], env["TRAINEE_API_KEY"], env["TRAINEE_MODEL"]
121
+ )
122
+
123
+ # Initialize GraphGen
124
+ graph_gen = init_graph_gen(config, env)
125
+ graph_gen.clear()
126
+
127
+ graph_gen.progress_bar = progress
128
+
129
+ try:
130
+ # Load input data
131
+ file = config["input_file"]
132
+ if isinstance(file, list):
133
+ file = file[0]
134
+
135
+ data = []
136
+
137
+ if file.endswith(".jsonl"):
138
+ data_type = "raw"
139
+ with open(file, "r", encoding="utf-8") as f:
140
+ data.extend(json.loads(line) for line in f)
141
+ elif file.endswith(".json"):
142
+ data_type = "chunked"
143
+ with open(file, "r", encoding="utf-8") as f:
144
+ data.extend(json.load(f))
145
+ elif file.endswith(".txt"):
146
+ # read the file, then convert it to raw-format data according to chunk_size
147
+ data_type = "raw"
148
+ content = ""
149
+ with open(file, "r", encoding="utf-8") as f:
150
+ lines = f.readlines()
151
+ for line in lines:
152
+ content += line.strip() + " "
153
+ size = int(config.get("chunk_size", 512))
154
+ chunks = [content[i : i + size] for i in range(0, len(content), size)]
155
+ data.extend([{"content": chunk} for chunk in chunks])
156
+ else:
157
+ raise ValueError(f"Unsupported file type: {file}")
158
+
159
+ # Process the data
160
+ graph_gen.insert(data, data_type)
161
+
162
+ if config["if_trainee_model"]:
163
+ # Generate quiz
164
+ graph_gen.quiz(max_samples=config["quiz_samples"])
165
+
166
+ # Judge statements
167
+ graph_gen.judge()
168
+ else:
169
+ graph_gen.traverse_strategy.edge_sampling = "random"
170
+ # Skip judge statements
171
+ graph_gen.judge(skip=True)
172
+
173
+ # Traverse graph
174
+ graph_gen.traverse(traverse_strategy=graph_gen.traverse_strategy)
175
+
176
+ # Save output
177
+ output_data = graph_gen.qa_storage.data
178
+ with tempfile.NamedTemporaryFile(
179
+ mode="w", suffix=".jsonl", delete=False, encoding="utf-8"
180
+ ) as tmpfile:
181
+ # write one JSON object per line to match the .jsonl suffix
+ for item in output_data:
+ tmpfile.write(json.dumps(item, ensure_ascii=False) + "\n")
182
+ output_file = tmpfile.name
183
+
184
+ synthesizer_tokens = sum_tokens(graph_gen.synthesizer_llm_client)
185
+ trainee_tokens = (
186
+ sum_tokens(graph_gen.trainee_llm_client)
187
+ if config["if_trainee_model"]
188
+ else 0
189
+ )
190
+ total_tokens = synthesizer_tokens + trainee_tokens
191
+
192
+ data_frame = params.token_counter
193
+ try:
194
+ _update_data = [
195
+ [data_frame.iloc[0, 0], data_frame.iloc[0, 1], str(total_tokens)]
196
+ ]
197
+ new_df = pd.DataFrame(_update_data, columns=data_frame.columns)
198
+ data_frame = new_df
199
+
200
+ except Exception as e:
201
+ raise gr.Error(f"DataFrame operation error: {str(e)}")
202
+
203
+ return output_file, gr.DataFrame(
204
+ label="Token Stats",
205
+ headers=["Source Text Token Count", "Expected Token Usage", "Token Used"],
206
+ datatype="str",
207
+ interactive=False,
208
+ value=data_frame,
209
+ visible=True,
210
+ wrap=True,
211
+ )
212
+
213
+ except Exception as e: # pylint: disable=broad-except
214
+ raise gr.Error(f"Error occurred: {str(e)}")
215
+
216
+ finally:
217
+ # Clean up workspace
218
+ cleanup_workspace(graph_gen.working_dir)
219
+
220
+
221
+ with gr.Blocks(title="GraphGen Demo", theme=gr.themes.Glass(), css=css) as demo:
222
+ # Header
223
+ gr.Image(
224
+ value=os.path.join(root_dir, "resources", "images", "logo.png"),
225
+ label="GraphGen Banner",
226
+ elem_id="banner",
227
+ interactive=False,
228
+ container=False,
229
+ show_download_button=False,
230
+ show_fullscreen_button=False,
231
+ )
232
+ lang_btn = gr.Radio(
233
+ choices=[
234
+ ("English", "en"),
235
+ ("简体中文", "zh"),
236
+ ],
237
+ value="en",
238
+ # label=_("Language"),
239
+ render=False,
240
+ container=False,
241
+ elem_classes=["center-row"],
242
+ )
243
+
244
+ gr.HTML(
245
+ """
246
+ <div style="display: flex; gap: 8px; margin-left: auto; align-items: center; justify-content: center;">
247
+ <a href="https://github.com/open-sciencelab/GraphGen/releases">
248
+ <img src="https://img.shields.io/badge/Version-v0.1.0-blue" alt="Version">
249
+ </a>
250
+ <a href="https://graphgen-docs.example.com">
251
+ <img src="https://img.shields.io/badge/Docs-Latest-brightgreen" alt="Documentation">
252
+ </a>
253
+ <a href="https://github.com/open-sciencelab/GraphGen/issues/10">
254
+ <img src="https://img.shields.io/github/stars/open-sciencelab/GraphGen?style=social" alt="GitHub Stars">
255
+ </a>
256
+ <a href="https://arxiv.org/abs/2505.20416">
257
+ <img src="https://img.shields.io/badge/arXiv-pdf-yellow" alt="arXiv">
258
+ </a>
259
+ </div>
260
+ """
261
+ )
262
+ with Translate(
263
+ os.path.join(root_dir, "webui", "translation.json"),
264
+ lang_btn,
265
+ placeholder_langs=["en", "zh"],
266
+ persistant=False, # True to save the language setting in the browser. Requires gradio >= 5.6.0
267
+ ):
268
+ lang_btn.render()
269
+
270
+ gr.Markdown(
271
+ value="# "
272
+ + _("Title")
273
+ + "\n\n"
274
+ + "### [GraphGen](https://github.com/open-sciencelab/GraphGen) "
275
+ + _("Intro")
276
+ )
277
+
278
+ if_trainee_model = gr.Checkbox(
279
+ label=_("Use Trainee Model"), value=False, interactive=True
280
+ )
281
+
282
+ with gr.Accordion(label=_("Model Config"), open=False):
283
+ synthesizer_url = gr.Textbox(
284
+ label="Synthesizer URL",
285
+ value="https://api.siliconflow.cn/v1",
286
+ info=_("Synthesizer URL Info"),
287
+ interactive=True,
288
+ )
289
+ synthesizer_model = gr.Textbox(
290
+ label="Synthesizer Model",
291
+ value="Qwen/Qwen2.5-7B-Instruct",
292
+ info=_("Synthesizer Model Info"),
293
+ interactive=True,
294
+ )
295
+ trainee_url = gr.Textbox(
296
+ label="Trainee URL",
297
+ value="https://api.siliconflow.cn/v1",
298
+ info=_("Trainee URL Info"),
299
+ interactive=True,
300
+ visible=if_trainee_model.value is True,
301
+ )
302
+ trainee_model = gr.Textbox(
303
+ label="Trainee Model",
304
+ value="Qwen/Qwen2.5-7B-Instruct",
305
+ info=_("Trainee Model Info"),
306
+ interactive=True,
307
+ visible=if_trainee_model.value is True,
308
+ )
309
+ trainee_api_key = gr.Textbox(
310
+ label=_("SiliconFlow Token for Trainee Model"),
311
+ type="password",
312
+ value="",
313
+ info="https://cloud.siliconflow.cn/account/ak",
314
+ visible=if_trainee_model.value is True,
315
+ )
316
+
317
+ with gr.Accordion(label=_("Generation Config"), open=False):
318
+ chunk_size = gr.Slider(
319
+ label="Chunk Size",
320
+ minimum=256,
321
+ maximum=4096,
322
+ value=512,
323
+ step=256,
324
+ interactive=True,
325
+ )
326
+ tokenizer = gr.Textbox(
327
+ label="Tokenizer", value="cl100k_base", interactive=True
328
+ )
329
+ qa_form = gr.Radio(
330
+ choices=["atomic", "multi_hop", "aggregated"],
331
+ label="QA Form",
332
+ value="aggregated",
333
+ interactive=True,
334
+ )
335
+ quiz_samples = gr.Number(
336
+ label="Quiz Samples",
337
+ value=2,
338
+ minimum=1,
339
+ interactive=True,
340
+ visible=if_trainee_model.value is True,
341
+ )
342
+ bidirectional = gr.Checkbox(
343
+ label="Bidirectional", value=True, interactive=True
344
+ )
345
+
346
+ expand_method = gr.Radio(
347
+ choices=["max_width", "max_tokens"],
348
+ label="Expand Method",
349
+ value="max_tokens",
350
+ interactive=True,
351
+ )
352
+ max_extra_edges = gr.Slider(
353
+ minimum=1,
354
+ maximum=10,
355
+ value=5,
356
+ label="Max Extra Edges",
357
+ step=1,
358
+ interactive=True,
359
+ visible=expand_method.value == "max_width",
360
+ )
361
+ max_tokens = gr.Slider(
362
+ minimum=64,
363
+ maximum=1024,
364
+ value=256,
365
+ label="Max Tokens",
366
+ step=64,
367
+ interactive=True,
368
+ visible=(expand_method.value != "max_width"),
369
+ )
370
+
371
+ max_depth = gr.Slider(
372
+ minimum=1,
373
+ maximum=5,
374
+ value=2,
375
+ label="Max Depth",
376
+ step=1,
377
+ interactive=True,
378
+ )
379
+ edge_sampling = gr.Radio(
380
+ choices=["max_loss", "min_loss", "random"],
381
+ label="Edge Sampling",
382
+ value="max_loss",
383
+ interactive=True,
384
+ visible=if_trainee_model.value is True,
385
+ )
386
+ isolated_node_strategy = gr.Radio(
387
+ choices=["add", "ignore"],
388
+ label="Isolated Node Strategy",
389
+ value="ignore",
390
+ interactive=True,
391
+ )
392
+ loss_strategy = gr.Radio(
393
+ choices=["only_edge", "both"],
394
+ label="Loss Strategy",
395
+ value="only_edge",
396
+ interactive=True,
397
+ )
398
+
399
+ with gr.Row(equal_height=True):
400
+ with gr.Column(scale=3):
401
+ api_key = gr.Textbox(
402
+ label=_("SiliconFlow Token"),
403
+ type="password",
404
+ value="",
405
+ info="https://cloud.siliconflow.cn/account/ak",
406
+ )
407
+ with gr.Column(scale=1):
408
+ test_connection_btn = gr.Button(_("Test Connection"))
409
+
410
+ with gr.Blocks():
411
+ with gr.Row(equal_height=True):
412
+ with gr.Column():
413
+ rpm = gr.Slider(
414
+ label="RPM",
415
+ minimum=10,
416
+ maximum=10000,
417
+ value=1000,
418
+ step=100,
419
+ interactive=True,
420
+ visible=True,
421
+ )
422
+ with gr.Column():
423
+ tpm = gr.Slider(
424
+ label="TPM",
425
+ minimum=5000,
426
+ maximum=5000000,
427
+ value=50000,
428
+ step=1000,
429
+ interactive=True,
430
+ visible=True,
431
+ )
432
+
433
+ with gr.Blocks():
434
+ with gr.Row(equal_height=True):
435
+ with gr.Column(scale=1):
436
+ upload_file = gr.File(
437
+ label=_("Upload File"),
438
+ file_count="single",
439
+ file_types=[".txt", ".json", ".jsonl"],
440
+ interactive=True,
441
+ )
442
+ examples_dir = os.path.join(root_dir, "webui", "examples")
443
+ gr.Examples(
444
+ examples=[
445
+ [os.path.join(examples_dir, "txt_demo.txt")],
446
+ [os.path.join(examples_dir, "raw_demo.jsonl")],
447
+ [os.path.join(examples_dir, "chunked_demo.json")],
448
+ ],
449
+ inputs=upload_file,
450
+ label=_("Example Files"),
451
+ examples_per_page=3,
452
+ )
453
+ with gr.Column(scale=1):
454
+ output = gr.File(
455
+ label="Output(See Github FAQ)",
456
+ file_count="single",
457
+ interactive=False,
458
+ )
459
+
460
+ with gr.Blocks():
461
+ token_counter = gr.DataFrame(
462
+ label="Token Stats",
463
+ headers=[
464
+ "Source Text Token Count",
465
+ "Estimated Token Usage",
466
+ "Token Used",
467
+ ],
468
+ datatype="str",
469
+ interactive=False,
470
+ visible=False,
471
+ wrap=True,
472
+ )
473
+
474
+ submit_btn = gr.Button(_("Run GraphGen"))
475
+
476
+ # Test Connection
477
+ test_connection_btn.click(
478
+ test_api_connection,
479
+ inputs=[synthesizer_url, api_key, synthesizer_model],
480
+ outputs=[],
481
+ )
482
+
483
+ if if_trainee_model.value:
484
+ test_connection_btn.click(
485
+ test_api_connection,
486
+ inputs=[trainee_url, api_key, trainee_model],
487
+ outputs=[],
488
+ )
489
+
490
+ expand_method.change(
491
+ lambda method: (
492
+ gr.update(visible=method == "max_width"),
493
+ gr.update(visible=method != "max_width"),
494
+ ),
495
+ inputs=expand_method,
496
+ outputs=[max_extra_edges, max_tokens],
497
+ )
498
+
499
+ if_trainee_model.change(
500
+ lambda use_trainee: [gr.update(visible=use_trainee)] * 5,
501
+ inputs=if_trainee_model,
502
+ outputs=[
503
+ trainee_url,
504
+ trainee_model,
505
+ quiz_samples,
506
+ edge_sampling,
507
+ trainee_api_key,
508
+ ],
509
+ )
510
+
511
+ upload_file.change(
512
+ lambda x: (gr.update(visible=True)),
513
+ inputs=[upload_file],
514
+ outputs=[token_counter],
515
+ ).then(
516
+ count_tokens,
517
+ inputs=[upload_file, tokenizer, token_counter],
518
+ outputs=[token_counter],
519
+ )
520
+
521
+ # run GraphGen
522
+ submit_btn.click(
523
+ lambda x: (gr.update(visible=False)),
524
+ inputs=[token_counter],
525
+ outputs=[token_counter],
526
+ )
527
+
528
+ submit_btn.click(
529
+ lambda *args: run_graphgen(
530
+ GraphGenParams(
531
+ if_trainee_model=args[0],
532
+ input_file=args[1],
533
+ tokenizer=args[2],
534
+ qa_form=args[3],
535
+ bidirectional=args[4],
536
+ expand_method=args[5],
537
+ max_extra_edges=args[6],
538
+ max_tokens=args[7],
539
+ max_depth=args[8],
540
+ edge_sampling=args[9],
541
+ isolated_node_strategy=args[10],
542
+ loss_strategy=args[11],
543
+ synthesizer_url=args[12],
544
+ synthesizer_model=args[13],
545
+ trainee_model=args[14],
546
+ api_key=args[15],
547
+ chunk_size=args[16],
548
+ rpm=args[17],
549
+ tpm=args[18],
550
+ quiz_samples=args[19],
551
+ trainee_url=args[20],
552
+ trainee_api_key=args[21],
553
+ token_counter=args[22],
554
+ )
555
+ ),
556
+ inputs=[
557
+ if_trainee_model,
558
+ upload_file,
559
+ tokenizer,
560
+ qa_form,
561
+ bidirectional,
562
+ expand_method,
563
+ max_extra_edges,
564
+ max_tokens,
565
+ max_depth,
566
+ edge_sampling,
567
+ isolated_node_strategy,
568
+ loss_strategy,
569
+ synthesizer_url,
570
+ synthesizer_model,
571
+ trainee_model,
572
+ api_key,
573
+ chunk_size,
574
+ rpm,
575
+ tpm,
576
+ quiz_samples,
577
+ trainee_url,
578
+ trainee_api_key,
579
+ token_counter,
580
+ ],
581
+ outputs=[output, token_counter],
582
+ )
583
+
584
+ if __name__ == "__main__":
585
+ demo.queue(api_open=False, default_concurrency_limit=2)
586
+ demo.launch(server_name="0.0.0.0")
hf-repo/hf-repo/graphgen/__init__.py ADDED
File without changes
hf-repo/hf-repo/graphgen/evaluate.py ADDED
@@ -0,0 +1,142 @@
1
+ """Evaluate the quality of the generated text using various metrics"""
2
+
3
+ import os
4
+ import json
5
+ import argparse
6
+ import pandas as pd
7
+ from dotenv import load_dotenv
8
+ from .models import LengthEvaluator, MTLDEvaluator, RewardEvaluator, TextPair, UniEvaluator
9
+ from .utils import logger, set_logger
10
+
11
+ sys_path = os.path.abspath(os.path.dirname(__file__))
12
+ set_logger(os.path.join(sys_path, "cache", "logs", "evaluate.log"))
13
+
14
+ load_dotenv()
15
+
16
+ def evaluate_length(corpus, tokenizer_name):
17
+ length_evaluator = LengthEvaluator(
18
+ tokenizer_name=tokenizer_name
19
+ )
20
+ logger.info("Length evaluator loaded")
21
+ scores = length_evaluator.get_average_score(corpus)
22
+ logger.info("Length scores: %s", scores)
23
+ return scores
24
+
25
+ def evaluate_mtld(corpus):
26
+ mtld_evaluator = MTLDEvaluator()
27
+ logger.info("MTLD evaluator loaded")
28
+ scores = mtld_evaluator.get_average_score(corpus)
29
+ logger.info("MTLD scores: %s", scores)
30
+ min_max_scores = mtld_evaluator.get_min_max_score(corpus)
31
+ logger.info("MTLD min max scores: %s", min_max_scores)
32
+ return scores, min_max_scores
33
+
34
+ def evaluate_reward(corpus, reward_model_names):
35
+ scores = []
36
+ for reward_name in reward_model_names:
37
+ reward_evaluator = RewardEvaluator(
38
+ reward_name=reward_name
39
+ )
40
+ logger.info("Loaded reward model: %s", reward_name)
41
+ average_score = reward_evaluator.get_average_score(corpus)
42
+ logger.info("%s scores: %s", reward_name, average_score)
43
+ min_max_scores = reward_evaluator.get_min_max_score(corpus)
44
+ logger.info("%s min max scores: %s", reward_name, min_max_scores)
45
+ scores.append({
46
+ 'reward_name': reward_name.split('/')[-1],
47
+ 'score': average_score,
48
+ 'min_max_scores': min_max_scores
49
+ })
50
+ del reward_evaluator
51
+ clean_gpu_cache()
52
+ return scores
53
+
54
+ def evaluate_uni(corpus, uni_model_name):
55
+ uni_evaluator = UniEvaluator(
56
+ model_name=uni_model_name
57
+ )
58
+ logger.info("Uni evaluator loaded with model %s", uni_model_name)
59
+ uni_scores = uni_evaluator.get_average_score(corpus)
60
+ for key, value in uni_scores.items():
61
+ logger.info("Uni %s scores: %s", key, value)
62
+ min_max_scores = uni_evaluator.get_min_max_score(corpus)
63
+ for key, value in min_max_scores.items():
64
+ logger.info("Uni %s min max scores: %s", key, value)
65
+ del uni_evaluator
66
+ clean_gpu_cache()
67
+ return (uni_scores['naturalness'], uni_scores['coherence'], uni_scores['understandability'],
68
+ min_max_scores['naturalness'], min_max_scores['coherence'], min_max_scores['understandability'])
69
+
70
+
71
+ def clean_gpu_cache():
72
+ import torch
73
+ if torch.cuda.is_available():
74
+ torch.cuda.empty_cache()
75
+
76
+
77
+ if __name__ == '__main__':
78
+ import torch.multiprocessing as mp
79
+ parser = argparse.ArgumentParser()
80
+
81
+ parser.add_argument('--folder', type=str, default='cache/data', help='folder to load data')
82
+ parser.add_argument('--output', type=str, default='cache/output', help='path to save output')
83
+
84
+ parser.add_argument('--tokenizer', type=str, default='cl100k_base', help='tokenizer name')
85
+ parser.add_argument('--reward', type=str, default='OpenAssistant/reward-model-deberta-v3-large-v2',
86
+ help='Comma-separated list of reward models')
87
+ parser.add_argument('--uni', type=str, default='MingZhong/unieval-sum', help='uni model name')
88
+
89
+ args = parser.parse_args()
90
+
91
+ if not os.path.exists(args.folder):
92
+ raise ValueError(f"Folder {args.folder} does not exist")
93
+
94
+ if not os.path.exists(args.output):
95
+ os.makedirs(args.output)
96
+
97
+ reward_models = args.reward.split(',')
98
+
99
+
100
+ results = []
101
+
102
+ logger.info("Data loaded from %s", args.folder)
103
+ mp.set_start_method('spawn')
104
+
105
+ for file in os.listdir(args.folder):
106
+ if file.endswith('.json'):
107
+ logger.info("Processing %s", file)
108
+ with open(os.path.join(args.folder, file), 'r', encoding='utf-8') as f:
109
+ data = json.load(f)
110
+ data = [TextPair(
111
+ question=data[key]['question'],
112
+ answer=data[key]['answer']
113
+ ) for key in data]
114
+
115
+ length_scores = evaluate_length(data, args.tokenizer)
116
+ mtld_scores, min_max_mtld_scores = evaluate_mtld(data)
117
+ reward_scores = evaluate_reward(data, reward_models)
118
+ uni_naturalness_scores, uni_coherence_scores, uni_understandability_scores, \
119
+ min_max_uni_naturalness_scores, min_max_uni_coherence_scores, min_max_uni_understandability_scores \
120
+ = evaluate_uni(data, args.uni)
121
+
122
+ result = {
123
+ 'file': file,
124
+ 'number': len(data),
125
+ 'length': length_scores,
126
+ 'mtld': mtld_scores,
127
+ 'mtld_min_max': min_max_mtld_scores,
128
+ 'uni_naturalness': uni_naturalness_scores,
129
+ 'uni_coherence': uni_coherence_scores,
130
+ 'uni_understandability': uni_understandability_scores,
131
+ 'uni_naturalness_min_max': min_max_uni_naturalness_scores,
132
+ 'uni_coherence_min_max': min_max_uni_coherence_scores,
133
+ 'uni_understandability_min_max': min_max_uni_understandability_scores
134
+ }
135
+ for reward_score in reward_scores:
136
+ result[reward_score['reward_name']] = reward_score['score']
137
+ result[f"{reward_score['reward_name']}_min_max"] = reward_score['min_max_scores']
138
+
139
+ results.append(result)
140
+
141
+ results = pd.DataFrame(results)
142
+ results.to_csv(os.path.join(args.output, 'evaluation.csv'), index=False)
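
The evaluation loop above expects every *.json file under --folder to be a dict keyed by sample id, with "question" and "answer" fields per entry. A minimal sketch that writes such a file (contents invented for illustration; the folder matches the script's default) is:

import json
import os

os.makedirs("cache/data", exist_ok=True)
sample = {
    "qa-0": {
        "question": "Which license applies to this repository?",
        "answer": "The Apache License, Version 2.0.",
    },
    "qa-1": {
        "question": "Which tokenizer name does the evaluation script use by default?",
        "answer": "cl100k_base.",
    },
}
with open(os.path.join("cache", "data", "demo.json"), "w", encoding="utf-8") as f:
    json.dump(sample, f, ensure_ascii=False, indent=2)
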
hf-repo/hf-repo/graphgen/generate.py ADDED
@@ -0,0 +1,103 @@
1
+ import argparse
2
+ import os
3
+ import time
4
+ from importlib.resources import files
5
+
6
+ import yaml
7
+ from dotenv import load_dotenv
8
+
9
+ from .graphgen import GraphGen
10
+ from .utils import logger, set_logger
11
+
12
+ sys_path = os.path.abspath(os.path.dirname(__file__))
13
+
14
+ load_dotenv()
15
+
16
+
17
+ def set_working_dir(folder):
18
+ os.makedirs(folder, exist_ok=True)
19
+ os.makedirs(os.path.join(folder, "data", "graphgen"), exist_ok=True)
20
+ os.makedirs(os.path.join(folder, "logs"), exist_ok=True)
21
+
22
+
23
+ def save_config(config_path, global_config):
24
+ if not os.path.exists(os.path.dirname(config_path)):
25
+ os.makedirs(os.path.dirname(config_path))
26
+ with open(config_path, "w", encoding="utf-8") as config_file:
27
+ yaml.dump(
28
+ global_config, config_file, default_flow_style=False, allow_unicode=True
29
+ )
30
+
31
+
32
+ def main():
33
+ parser = argparse.ArgumentParser()
34
+ parser.add_argument(
35
+ "--config_file",
36
+ help="Config parameters for GraphGen.",
37
+ default=files("graphgen").joinpath("configs", "aggregated_config.yaml"),
38
+ type=str,
39
+ )
40
+ parser.add_argument(
41
+ "--output_dir",
42
+ help="Output directory for GraphGen.",
43
+ default=sys_path,
44
+ required=True,
45
+ type=str,
46
+ )
47
+
48
+ args = parser.parse_args()
49
+
50
+ working_dir = args.output_dir
51
+ set_working_dir(working_dir)
52
+
53
+ with open(args.config_file, "r", encoding="utf-8") as f:
54
+ config = yaml.load(f, Loader=yaml.FullLoader)
55
+
56
+ output_data_type = config["output_data_type"]
57
+ unique_id = int(time.time())
58
+ set_logger(
59
+ os.path.join(
60
+ working_dir, "logs", f"graphgen_{output_data_type}_{unique_id}.log"
61
+ ),
62
+ if_stream=True,
63
+ )
64
+ logger.info(
65
+ "GraphGen with unique ID %s logging to %s",
66
+ unique_id,
67
+ os.path.join(
68
+ working_dir, "logs", f"graphgen_{output_data_type}_{unique_id}.log"
69
+ ),
70
+ )
71
+
72
+ graph_gen = GraphGen(working_dir=working_dir, unique_id=unique_id, config=config)
73
+
74
+ graph_gen.insert()
75
+
76
+ if config["search"]["enabled"]:
77
+ graph_gen.search()
78
+
79
+ # Use pipeline according to the output data type
80
+ if output_data_type in ["atomic", "aggregated", "multi_hop"]:
81
+ if "quiz_and_judge_strategy" in config and config[
82
+ "quiz_and_judge_strategy"
83
+ ].get("enabled", False):
84
+ graph_gen.quiz()
85
+ graph_gen.judge()
86
+ else:
87
+ logger.warning(
88
+ "Quiz and Judge strategy is disabled. Edge sampling falls back to random."
89
+ )
90
+ graph_gen.traverse_strategy.edge_sampling = "random"
91
+ graph_gen.traverse()
92
+ elif output_data_type == "cot":
93
+ graph_gen.generate_reasoning(method_params=config["method_params"])
94
+ else:
95
+ raise ValueError(f"Unsupported output data type: {output_data_type}")
96
+
97
+ output_path = os.path.join(working_dir, "data", "graphgen", str(unique_id))
98
+ save_config(os.path.join(output_path, f"config-{unique_id}.yaml"), config)
99
+ logger.info("GraphGen completed successfully. Data saved to %s", output_path)
100
+
101
+
102
+ if __name__ == "__main__":
103
+ main()
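
main() only touches a handful of config keys that are visible in this file and in graphgen.py below (input_file, input_data_type, tokenizer, output_data_type, search, quiz_and_judge_strategy, traverse_strategy). The following is a hedged, minimal stand-in config assembled from those usages, not a copy of the YAML files shipped under graphgen/configs; field values are illustrative.

# Hypothetical minimal config; key names mirror the reads in generate.py / graphgen.py.
import yaml

minimal_config = {
    "input_file": "resources/input.jsonl",        # illustrative path
    "input_data_type": "raw",                     # "raw" or "chunked"
    "tokenizer": "cl100k_base",
    "output_data_type": "aggregated",             # atomic | aggregated | multi_hop | cot
    "search": {"enabled": False, "search_types": ["wikipedia"]},
    "quiz_and_judge_strategy": {"enabled": False, "quiz_samples": 2},
    "traverse_strategy": {
        "qa_form": "aggregated",
        "expand_method": "max_tokens",
        "bidirectional": True,
        "max_extra_edges": 5,
        "max_tokens": 256,
        "max_depth": 2,
        "edge_sampling": "random",
        "isolated_node_strategy": "ignore",
        "loss_strategy": "only_edge",
    },
}

with open("my_config.yaml", "w", encoding="utf-8") as f:
    yaml.safe_dump(minimal_config, f, allow_unicode=True)
# The relative imports suggest running the script as a module, e.g.:
#   python -m graphgen.generate --config_file my_config.yaml --output_dir ./cache
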
hf-repo/hf-repo/graphgen/graphgen.py ADDED
@@ -0,0 +1,395 @@
1
+ import asyncio
2
+ import os
3
+ import time
4
+ from dataclasses import dataclass, field
5
+ from typing import Dict, List, Union, cast
6
+
7
+ import gradio as gr
8
+ from tqdm.asyncio import tqdm as tqdm_async
9
+
10
+ from .models import (
11
+ Chunk,
12
+ JsonKVStorage,
13
+ JsonListStorage,
14
+ NetworkXStorage,
15
+ OpenAIModel,
16
+ Tokenizer,
17
+ TraverseStrategy,
18
+ )
19
+ from .models.storage.base_storage import StorageNameSpace
20
+ from .operators import (
21
+ extract_kg,
22
+ generate_cot,
23
+ judge_statement,
24
+ quiz,
25
+ search_all,
26
+ traverse_graph_atomically,
27
+ traverse_graph_by_edge,
28
+ traverse_graph_for_multi_hop,
29
+ )
30
+ from .utils import (
31
+ compute_content_hash,
32
+ create_event_loop,
33
+ format_generation_results,
34
+ logger,
35
+ read_file,
36
+ )
37
+
38
+ sys_path = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
39
+
40
+
41
+ @dataclass
42
+ class GraphGen:
43
+ unique_id: int = int(time.time())
44
+ working_dir: str = os.path.join(sys_path, "cache")
45
+ config: Dict = field(default_factory=dict)
46
+
47
+ # llm
48
+ tokenizer_instance: Tokenizer = None
49
+ synthesizer_llm_client: OpenAIModel = None
50
+ trainee_llm_client: OpenAIModel = None
51
+
52
+ # text chunking
53
+ # TODO: make it configurable
54
+ chunk_size: int = 1024
55
+ chunk_overlap_size: int = 100
56
+
57
+ # search
58
+ search_config: dict = field(
59
+ default_factory=lambda: {"enabled": False, "search_types": ["wikipedia"]}
60
+ )
61
+
62
+ # traversal
63
+ traverse_strategy: TraverseStrategy = None
64
+
65
+ # webui
66
+ progress_bar: gr.Progress = None
67
+
68
+ def __post_init__(self):
69
+ self.tokenizer_instance: Tokenizer = Tokenizer(
70
+ model_name=self.config["tokenizer"]
71
+ )
72
+ self.synthesizer_llm_client: OpenAIModel = OpenAIModel(
73
+ model_name=os.getenv("SYNTHESIZER_MODEL"),
74
+ api_key=os.getenv("SYNTHESIZER_API_KEY"),
75
+ base_url=os.getenv("SYNTHESIZER_BASE_URL"),
76
+ tokenizer_instance=self.tokenizer_instance,
77
+ )
78
+ self.trainee_llm_client: OpenAIModel = OpenAIModel(
79
+ model_name=os.getenv("TRAINEE_MODEL"),
80
+ api_key=os.getenv("TRAINEE_API_KEY"),
81
+ base_url=os.getenv("TRAINEE_BASE_URL"),
82
+ tokenizer_instance=self.tokenizer_instance,
83
+ )
84
+ self.search_config = self.config["search"]
85
+
86
+ if "traverse_strategy" in self.config:
87
+         self.traverse_strategy = TraverseStrategy(
+             **self.config["traverse_strategy"]
+         )
+
+         self.full_docs_storage: JsonKVStorage = JsonKVStorage(
+             self.working_dir, namespace="full_docs"
+         )
+         self.text_chunks_storage: JsonKVStorage = JsonKVStorage(
+             self.working_dir, namespace="text_chunks"
+         )
+         self.graph_storage: NetworkXStorage = NetworkXStorage(
+             self.working_dir, namespace="graph"
+         )
+         self.search_storage: JsonKVStorage = JsonKVStorage(
+             self.working_dir, namespace="search"
+         )
+         self.rephrase_storage: JsonKVStorage = JsonKVStorage(
+             self.working_dir, namespace="rephrase"
+         )
+         self.qa_storage: JsonListStorage = JsonListStorage(
+             os.path.join(self.working_dir, "data", "graphgen", str(self.unique_id)),
+             namespace=f"qa-{self.unique_id}",
+         )
+
+     async def async_split_chunks(
+         self, data: List[Union[List, Dict]], data_type: str
+     ) -> dict:
+         # TODO: configurable whether to use coreference resolution
+         if len(data) == 0:
+             return {}
+
+         inserting_chunks = {}
+         if data_type == "raw":
+             assert isinstance(data, list) and isinstance(data[0], dict)
+             # compute hash for each document
+             new_docs = {
+                 compute_content_hash(doc["content"], prefix="doc-"): {
+                     "content": doc["content"]
+                 }
+                 for doc in data
+             }
+             _add_doc_keys = await self.full_docs_storage.filter_keys(
+                 list(new_docs.keys())
+             )
+             new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
+             if len(new_docs) == 0:
+                 logger.warning("All docs are already in the storage")
+                 return {}
+             logger.info("[New Docs] inserting %d docs", len(new_docs))
+
+             cur_index = 1
+             doc_number = len(new_docs)
+             async for doc_key, doc in tqdm_async(
+                 new_docs.items(), desc="[1/4]Chunking documents", unit="doc"
+             ):
+                 chunks = {
+                     compute_content_hash(dp["content"], prefix="chunk-"): {
+                         **dp,
+                         "full_doc_id": doc_key,
+                     }
+                     for dp in self.tokenizer_instance.chunk_by_token_size(
+                         doc["content"], self.chunk_overlap_size, self.chunk_size
+                     )
+                 }
+                 inserting_chunks.update(chunks)
+
+                 if self.progress_bar is not None:
+                     self.progress_bar(cur_index / doc_number, f"Chunking {doc_key}")
+                 cur_index += 1
+
+             _add_chunk_keys = await self.text_chunks_storage.filter_keys(
+                 list(inserting_chunks.keys())
+             )
+             inserting_chunks = {
+                 k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys
+             }
+         elif data_type == "chunked":
+             assert isinstance(data, list) and isinstance(data[0], list)
+             new_docs = {
+                 compute_content_hash("".join(chunk["content"]), prefix="doc-"): {
+                     "content": "".join(chunk["content"])
+                 }
+                 for doc in data
+                 for chunk in doc
+             }
+             _add_doc_keys = await self.full_docs_storage.filter_keys(
+                 list(new_docs.keys())
+             )
+             new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
+             if len(new_docs) == 0:
+                 logger.warning("All docs are already in the storage")
+                 return {}
+             logger.info("[New Docs] inserting %d docs", len(new_docs))
+             async for doc in tqdm_async(
+                 data, desc="[1/4]Chunking documents", unit="doc"
+             ):
+                 doc_str = "".join([chunk["content"] for chunk in doc])
+                 for chunk in doc:
+                     chunk_key = compute_content_hash(chunk["content"], prefix="chunk-")
+                     inserting_chunks[chunk_key] = {
+                         **chunk,
+                         "full_doc_id": compute_content_hash(doc_str, prefix="doc-"),
+                     }
+             _add_chunk_keys = await self.text_chunks_storage.filter_keys(
+                 list(inserting_chunks.keys())
+             )
+             inserting_chunks = {
+                 k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys
+             }
+         else:
+             raise ValueError(f"Unknown data type: {data_type}")
+
+         await self.full_docs_storage.upsert(new_docs)
+         await self.text_chunks_storage.upsert(inserting_chunks)
+
+         return inserting_chunks
+
+     def insert(self):
+         loop = create_event_loop()
+         loop.run_until_complete(self.async_insert())
+
+     async def async_insert(self):
+         """
+         insert chunks into the graph
+         """
+
+         input_file = self.config["input_file"]
+         data_type = self.config["input_data_type"]
+         data = read_file(input_file)
+
+         inserting_chunks = await self.async_split_chunks(data, data_type)
+
+         if len(inserting_chunks) == 0:
+             logger.warning("All chunks are already in the storage")
+             return
+         logger.info("[New Chunks] inserting %d chunks", len(inserting_chunks))
+
+         logger.info("[Entity and Relation Extraction]...")
+         _add_entities_and_relations = await extract_kg(
+             llm_client=self.synthesizer_llm_client,
+             kg_instance=self.graph_storage,
+             tokenizer_instance=self.tokenizer_instance,
+             chunks=[
+                 Chunk(id=k, content=v["content"]) for k, v in inserting_chunks.items()
+             ],
+             progress_bar=self.progress_bar,
+         )
+         if not _add_entities_and_relations:
+             logger.warning("No entities or relations extracted")
+             return
+
+         await self._insert_done()
+
+     async def _insert_done(self):
+         tasks = []
+         for storage_instance in [
+             self.full_docs_storage,
+             self.text_chunks_storage,
+             self.graph_storage,
+             self.search_storage,
+         ]:
+             if storage_instance is None:
+                 continue
+             tasks.append(cast(StorageNameSpace, storage_instance).index_done_callback())
+         await asyncio.gather(*tasks)
+
+     def search(self):
+         loop = create_event_loop()
+         loop.run_until_complete(self.async_search())
+
+     async def async_search(self):
+         logger.info(
+             "Search is %s", "enabled" if self.search_config["enabled"] else "disabled"
+         )
+         if self.search_config["enabled"]:
+             logger.info(
+                 "[Search] %s ...", ", ".join(self.search_config["search_types"])
+             )
+             all_nodes = await self.graph_storage.get_all_nodes()
+             all_nodes_names = [node[0] for node in all_nodes]
+             new_search_entities = await self.full_docs_storage.filter_keys(
+                 all_nodes_names
+             )
+             logger.info(
+                 "[Search] Found %d entities to search", len(new_search_entities)
+             )
+             _add_search_data = await search_all(
+                 search_types=self.search_config["search_types"],
+                 search_entities=new_search_entities,
+             )
+             if _add_search_data:
+                 await self.search_storage.upsert(_add_search_data)
+                 logger.info("[Search] %d entities searched", len(_add_search_data))
+
+                 # Format search results for inserting
+                 search_results = []
+                 for _, search_data in _add_search_data.items():
+                     search_results.extend(
+                         [
+                             {"content": search_data[key]}
+                             for key in list(search_data.keys())
+                         ]
+                     )
+                 # TODO: fix insert after search
+                 await self.async_insert()
+
+     def quiz(self):
+         loop = create_event_loop()
+         loop.run_until_complete(self.async_quiz())
+
+     async def async_quiz(self):
+         max_samples = self.config["quiz_and_judge_strategy"]["quiz_samples"]
+         await quiz(
+             self.synthesizer_llm_client,
+             self.graph_storage,
+             self.rephrase_storage,
+             max_samples,
+         )
+         await self.rephrase_storage.index_done_callback()
+
+     def judge(self):
+         loop = create_event_loop()
+         loop.run_until_complete(self.async_judge())
+
+     async def async_judge(self):
+         re_judge = self.config["quiz_and_judge_strategy"]["re_judge"]
+         _update_relations = await judge_statement(
+             self.trainee_llm_client,
+             self.graph_storage,
+             self.rephrase_storage,
+             re_judge,
+         )
+         await _update_relations.index_done_callback()
+
+     def traverse(self):
+         loop = create_event_loop()
+         loop.run_until_complete(self.async_traverse())
+
+     async def async_traverse(self):
+         output_data_type = self.config["output_data_type"]
+
+         if output_data_type == "atomic":
+             results = await traverse_graph_atomically(
+                 self.synthesizer_llm_client,
+                 self.tokenizer_instance,
+                 self.graph_storage,
+                 self.traverse_strategy,
+                 self.text_chunks_storage,
+                 self.progress_bar,
+             )
+         elif output_data_type == "multi_hop":
+             results = await traverse_graph_for_multi_hop(
+                 self.synthesizer_llm_client,
+                 self.tokenizer_instance,
+                 self.graph_storage,
+                 self.traverse_strategy,
+                 self.text_chunks_storage,
+                 self.progress_bar,
+             )
+         elif output_data_type == "aggregated":
+             results = await traverse_graph_by_edge(
+                 self.synthesizer_llm_client,
+                 self.tokenizer_instance,
+                 self.graph_storage,
+                 self.traverse_strategy,
+                 self.text_chunks_storage,
+                 self.progress_bar,
+             )
+         else:
+             raise ValueError(f"Unknown qa_form: {output_data_type}")
+
+         results = format_generation_results(
+             results, output_data_format=self.config["output_data_format"]
+         )
+
+         await self.qa_storage.upsert(results)
+         await self.qa_storage.index_done_callback()
+
+     def generate_reasoning(self, method_params):
+         loop = create_event_loop()
+         loop.run_until_complete(self.async_generate_reasoning(method_params))
+
+     async def async_generate_reasoning(self, method_params):
+         results = await generate_cot(
+             self.graph_storage,
+             self.synthesizer_llm_client,
+             method_params=method_params,
+         )
+
+         results = format_generation_results(
+             results, output_data_format=self.config["output_data_format"]
+         )
+
+         await self.qa_storage.upsert(results)
+         await self.qa_storage.index_done_callback()
+
+     def clear(self):
+         loop = create_event_loop()
+         loop.run_until_complete(self.async_clear())
+
+     async def async_clear(self):
+         await self.full_docs_storage.drop()
+         await self.text_chunks_storage.drop()
+         await self.search_storage.drop()
+         await self.graph_storage.clear()
+         await self.rephrase_storage.drop()
+         await self.qa_storage.drop()
+
+         logger.info("All caches are cleared")
hf-repo/hf-repo/graphgen/judge.py ADDED
@@ -0,0 +1,60 @@
+ import os
+ import argparse
+ import asyncio
+ from dotenv import load_dotenv
+
+ from .models import NetworkXStorage, JsonKVStorage, OpenAIModel
+ from .operators import judge_statement
+
+ sys_path = os.path.abspath(os.path.dirname(__file__))
+
+ load_dotenv()
+
+ def calculate_average_loss(graph: NetworkXStorage):
+     """
+     Calculate the average loss of the graph.
+
+     :param graph: NetworkXStorage
+     :return: float
+     """
+     edges = asyncio.run(graph.get_all_edges())
+     total_loss = 0
+     for edge in edges:
+         total_loss += edge[2]['loss']
+     return total_loss / len(edges)
+
+
+ if __name__ == '__main__':
+     parser = argparse.ArgumentParser()
+     parser.add_argument('--input', type=str, default=os.path.join(sys_path, "cache"), help='path to load input graph')
+     parser.add_argument('--output', type=str, default='cache/output/new_graph.graphml', help='path to save output')
+
+     args = parser.parse_args()
+
+     llm_client = OpenAIModel(
+         model_name=os.getenv("TRAINEE_MODEL"),
+         api_key=os.getenv("TRAINEE_API_KEY"),
+         base_url=os.getenv("TRAINEE_BASE_URL")
+     )
+
+     graph_storage = NetworkXStorage(
+         args.input,
+         namespace="graph"
+     )
+     average_loss = calculate_average_loss(graph_storage)
+     print(f"Average loss of the graph: {average_loss}")
+
+     rephrase_storage = JsonKVStorage(
+         os.path.join(sys_path, "cache"),
+         namespace="rephrase"
+     )
+
+     new_graph = asyncio.run(judge_statement(llm_client, graph_storage, rephrase_storage, re_judge=True))
+
+     graph_file = asyncio.run(graph_storage.get_graph())
+
+     new_graph.write_nx_graph(graph_file, args.output)
+
+     average_loss = calculate_average_loss(new_graph)
+     print(f"Average loss of the graph: {average_loss}")
hf-repo/hf-repo/graphgen/models/__init__.py ADDED
@@ -0,0 +1,45 @@
+ from .community.community_detector import CommunityDetector
+ from .evaluate.length_evaluator import LengthEvaluator
+ from .evaluate.mtld_evaluator import MTLDEvaluator
+ from .evaluate.reward_evaluator import RewardEvaluator
+ from .evaluate.uni_evaluator import UniEvaluator
+ from .llm.openai_model import OpenAIModel
+ from .llm.tokenizer import Tokenizer
+ from .llm.topk_token_model import Token, TopkTokenModel
+ from .search.db.uniprot_search import UniProtSearch
+ from .search.kg.wiki_search import WikiSearch
+ from .search.web.bing_search import BingSearch
+ from .search.web.google_search import GoogleSearch
+ from .storage.json_storage import JsonKVStorage, JsonListStorage
+ from .storage.networkx_storage import NetworkXStorage
+ from .strategy.travserse_strategy import TraverseStrategy
+ from .text.chunk import Chunk
+ from .text.text_pair import TextPair
+
+ __all__ = [
+     # llm models
+     "OpenAIModel",
+     "TopkTokenModel",
+     "Token",
+     "Tokenizer",
+     # storage models
+     "Chunk",
+     "NetworkXStorage",
+     "JsonKVStorage",
+     "JsonListStorage",
+     # search models
+     "WikiSearch",
+     "GoogleSearch",
+     "BingSearch",
+     "UniProtSearch",
+     # evaluate models
+     "TextPair",
+     "LengthEvaluator",
+     "MTLDEvaluator",
+     "RewardEvaluator",
+     "UniEvaluator",
+     # strategy models
+     "TraverseStrategy",
+     # community models
+     "CommunityDetector",
+ ]
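Since the package re-exports these classes, callers can import them from `graphgen.models` directly; a brief sketch mirroring the constructor usage seen elsewhere in this diff (the `"cache"` working directory is an assumption):

```python
from graphgen.models import JsonKVStorage, NetworkXStorage

full_docs = JsonKVStorage("cache", namespace="full_docs")  # assumed working dir
graph = NetworkXStorage("cache", namespace="graph")
```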
hf-repo/hf-repo/graphgen/models/embed/__init__.py ADDED
File without changes
hf-repo/hf-repo/graphgen/models/embed/embedding.py ADDED
@@ -0,0 +1,29 @@
+ from dataclasses import dataclass
+ import asyncio
+ import numpy as np
+
+ class UnlimitedSemaphore:
+     """A context manager that allows unlimited access."""
+
+     async def __aenter__(self):
+         pass
+
+     async def __aexit__(self, exc_type, exc, tb):
+         pass
+
+ @dataclass
+ class EmbeddingFunc:
+     embedding_dim: int
+     max_token_size: int
+     func: callable
+     concurrent_limit: int = 16
+
+     def __post_init__(self):
+         if self.concurrent_limit != 0:
+             self._semaphore = asyncio.Semaphore(self.concurrent_limit)
+         else:
+             self._semaphore = UnlimitedSemaphore()
+
+     async def __call__(self, *args, **kwargs) -> np.ndarray:
+         async with self._semaphore:
+             return await self.func(*args, **kwargs)
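A minimal sketch of wrapping an asynchronous embedding callable with `EmbeddingFunc`; the toy embedder is illustrative only, and any coroutine returning an `np.ndarray` would work in its place:

```python
import asyncio
import numpy as np

async def toy_embed(texts: list[str]) -> np.ndarray:
    # stand-in for a real embedding API call
    return np.zeros((len(texts), 8))

embed = EmbeddingFunc(embedding_dim=8, max_token_size=512, func=toy_embed, concurrent_limit=4)
vectors = asyncio.run(embed(["hello", "world"]))  # the semaphore caps concurrent calls
print(vectors.shape)  # (2, 8)
```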
hf-repo/hf-repo/graphgen/models/evaluate/__init__.py ADDED
File without changes
hf-repo/hf-repo/graphgen/models/evaluate/base_evaluator.py ADDED
@@ -0,0 +1,51 @@
+ import asyncio
+
+ from dataclasses import dataclass
+ from tqdm.asyncio import tqdm as tqdm_async
+ from graphgen.utils import create_event_loop
+ from graphgen.models.text.text_pair import TextPair
+
+ @dataclass
+ class BaseEvaluator:
+     max_concurrent: int = 100
+     results: list[float] = None
+
+     def evaluate(self, pairs: list[TextPair]) -> list[float]:
+         """
+         Evaluate the text and return a score.
+         """
+         return create_event_loop().run_until_complete(self.async_evaluate(pairs))
+
+     async def async_evaluate(self, pairs: list[TextPair]) -> list[float]:
+         semaphore = asyncio.Semaphore(self.max_concurrent)
+
+         async def evaluate_with_semaphore(pair):
+             async with semaphore:  # acquire the semaphore to cap concurrency
+                 return await self.evaluate_single(pair)
+
+         results = []
+         for result in tqdm_async(
+             asyncio.as_completed([evaluate_with_semaphore(pair) for pair in pairs]),
+             total=len(pairs),
+         ):
+             results.append(await result)
+         return results
+
+     async def evaluate_single(self, pair: TextPair) -> float:
+         raise NotImplementedError()
+
+     def get_average_score(self, pairs: list[TextPair]) -> float:
+         """
+         Get the average score of a batch of texts.
+         """
+         results = self.evaluate(pairs)
+         self.results = results
+         return sum(self.results) / len(pairs)
+
+     def get_min_max_score(self, pairs: list[TextPair]) -> tuple[float, float]:
+         """
+         Get the min and max score of a batch of texts.
+         """
+         if self.results is None:
+             self.get_average_score(pairs)
+         return min(self.results), max(self.results)
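Concrete evaluators only need to implement `evaluate_single`; a sketch, assuming `TextPair` exposes `question` and `answer` attributes (hypothetical names, adapt to the real dataclass):

```python
from dataclasses import dataclass

@dataclass
class AnswerLengthEvaluator(BaseEvaluator):
    async def evaluate_single(self, pair: TextPair) -> float:
        # purely illustrative score: answer length relative to question length
        return len(pair.answer) / max(len(pair.question), 1)

evaluator = AnswerLengthEvaluator(max_concurrent=8)
# scores = evaluator.evaluate(pairs)            # pairs: list[TextPair]
# average = evaluator.get_average_score(pairs)
```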