|
import os |
|
import io |
|
import tempfile |
|
import streamlit as st |
|
import pandas as pd |
|
from pypdf import PdfReader |
|
from pyvis.network import Network |
|
|
|
from knowledge_graph_maker import ( |
|
GraphMaker, Ontology, Document, |
|
OpenAIClient, GroqClient |
|
) |
|
|
|
# --- Page chrome -----------------------------------------------------------
st.set_page_config(page_title="Knowledge Graph Maker", layout="wide")
st.title("Knowledge Graph from Text/PDF (Docker Space)")
st.caption(
    "Uses knowledge-graph-maker with OpenAI or Groq. Paste text or upload a PDF; view the interactive graph below."
)

# Provider credentials come from the environment; an empty string means "not set".
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")

# --- Sidebar: model, ontology and relationship settings ---------------------
with st.sidebar:
    st.subheader("Model Settings")
    temperature = st.slider("Temperature", min_value=0.0, max_value=1.0, value=0.1, step=0.05)
    top_p = st.slider("Top-p", min_value=0.0, max_value=1.0, value=0.5, step=0.05)

    # Preselect Groq only when it is the sole provider with a key configured;
    # in every other case the radio starts on OpenAI.
    if OPENAI_API_KEY or not GROQ_API_KEY:
        _default_provider_index = 0
    else:
        _default_provider_index = 1
    provider = st.radio("Provider", ["OpenAI", "Groq"], index=_default_provider_index)

    # Only the model field for the chosen provider is rendered (and defined).
    if provider == "OpenAI":
        oai_model = st.text_input("OpenAI model", value="gpt-3.5-turbo")
    else:
        groq_model = st.text_input("Groq model", value="mixtral-8x7b-32768")

    st.markdown("### Ontology (labels)")
    # Entries are either a bare label name or a one-key dict {name: description}.
    default_labels = [
        {"Person": "Person name without adjectives (may appear as name or pronoun)"},
        {"Object": "Avoid the definite article 'the' in name"},
        {"Event": "Events involving multiple people; no verbs like gives/leaves"},
        "Place",
        "Document",
        "Organisation",
        "Action",
        {"Miscellanous": "Important concept that fits none of the above"},
    ]

    # The text area is pre-filled with the label names only (descriptions are not shown).
    _default_label_names = []
    for _entry in default_labels:
        if isinstance(_entry, str):
            _default_label_names.append(_entry)
        else:
            _default_label_names.append(next(iter(_entry)))
    labels_text = st.text_area(
        "Labels (JSON or comma-separated)",
        value=", ".join(_default_label_names),
        height=80,
    )

    st.markdown("### Relationships focus")
    relationships_text = st.text_input(
        "Relationships (comma-separated)",
        value="Relation between any pair of Entities",
    )
|
|
|
def parse_labels(text):
    """Parse the user's label input into a list of ontology labels.

    Two input formats are accepted:

    * Comma-separated names, e.g. ``"Person, Place, Event"``.
    * JSON, either an array whose items are label names and/or
      ``{name: description}`` objects, or a single object mapping each label
      name to its description (converted to one-key dicts).

    Text that looks like JSON but fails to parse is treated as
    comma-separated input.

    Args:
        text: Raw text from the labels input; may be empty or ``None``.

    Returns:
        A list whose items are label-name strings or one-key dicts. When the
        input is blank or yields no labels, the built-in default label names
        are returned instead of an empty list.
    """
    import json  # imported locally so this function is self-contained

    defaults = [
        "Person", "Object", "Event", "Place",
        "Document", "Organisation", "Action", "Miscellanous",
    ]

    stripped = (text or "").strip()
    if not stripped:
        return defaults

    labels = None
    if stripped[0] in "[{":
        try:
            parsed = json.loads(stripped)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            parsed = None

        # A JSON object is read as {label name: description}.
        if isinstance(parsed, dict):
            parsed = [{name: description} for name, description in parsed.items()]

        if isinstance(parsed, list):
            labels = []
            for item in parsed:
                if isinstance(item, str):
                    if item.strip():
                        labels.append(item.strip())
                elif isinstance(item, dict) and item:
                    labels.append(item)
                # Any other JSON value (numbers, null, nested lists) is ignored.

    if labels is None:
        labels = [part.strip() for part in stripped.split(",") if part.strip()]

    # Never hand back an empty label list; fall back to the defaults instead.
    return labels or defaults
|
|
|
def split_pdf(file) -> str:
    """Extract the text of every page of a PDF and join it with newlines.

    Args:
        file: Whatever ``PdfReader`` accepts as its source (here, the uploaded
            file object).

    Returns:
        The per-page texts joined by ``"\\n"``. A page with no extractable
        text contributes an empty string; a page whose extraction raises is
        skipped entirely (best-effort extraction).
    """
    page_texts = []
    for pdf_page in PdfReader(file).pages:
        try:
            extracted = pdf_page.extract_text()
        except Exception:
            # Best effort: one unreadable page must not abort the whole document.
            continue
        page_texts.append(extracted if extracted else "")
    return "\n".join(page_texts)
|
|
|
def build_graph_documents(text: str, chunk_size: int = 3500) -> "list[Document]":
    """Split *text* into fixed-size character chunks wrapped as ``Document``s.

    The text is cut every ``chunk_size`` characters (no overlap, no attempt to
    respect word or sentence boundaries). Each slice is stripped of
    surrounding whitespace, and slices that end up empty are dropped.

    Args:
        text: The full input text.
        chunk_size: Maximum number of characters per chunk. Defaults to 3500,
            the value this function previously hard-coded.

    Returns:
        One ``Document`` per non-empty chunk. ``metadata["chunk_id"]`` is the
        zero-based index of the slice within the text, so ids may have gaps
        where whitespace-only slices were dropped.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    docs = []
    for start in range(0, len(text), chunk_size):
        chunk = text[start:start + chunk_size].strip()
        if chunk:
            docs.append(Document(text=chunk, metadata={"chunk_id": start // chunk_size}))
    return docs
|
|
|
def edges_to_pyvis(edges):
    """Build an undirected pyvis ``Network`` from knowledge-graph edges.

    Args:
        edges: Iterable of edge objects exposing ``node_1`` and ``node_2``
            (each with ``label`` and ``name``) and ``relationship``.

    Returns:
        A ``Network`` containing one node per distinct ``"label:name"`` pair
        and one edge per input edge, with physics enabled. Nodes display their
        name and carry their label as the hover title; edges carry the
        relationship text (or ``""`` when it is falsy) as the hover title.
    """
    graph = Network(
        height="700px",
        width="100%",
        bgcolor="#ffffff",
        font_color="#222222",
        notebook=False,
        directed=False,
    )

    # Keys of nodes already added, so each entity appears only once.
    seen_keys = set()

    for edge in edges:
        # Resolve both endpoint keys before touching the graph.
        endpoints = [(f"{node.label}:{node.name}", node) for node in (edge.node_1, edge.node_2)]

        for key, node in endpoints:
            if key in seen_keys:
                continue
            graph.add_node(key, label=node.name, title=node.label)
            seen_keys.add(key)

        (source_key, _), (target_key, _) = endpoints
        graph.add_edge(source_key, target_key, title=edge.relationship or "", value=1)

    graph.toggle_physics(True)
    return graph
|
|
|
|
|
# --- Input: pasted text or an uploaded PDF ----------------------------------
tab_text, tab_pdf = st.tabs(["📝 Paste Text", "📄 Upload PDF"])
input_text = ""
with tab_text:
    input_text = st.text_area("Paste your text here", height=220, placeholder="Paste text…")
with tab_pdf:
    pdf_file = st.file_uploader("Upload a PDF", type=["pdf"])
    if pdf_file:
        # An uploaded PDF replaces whatever was pasted in the text tab.
        input_text = split_pdf(pdf_file)

# --- Action: build and render the graph on demand ---------------------------
if st.button("Generate Knowledge Graph", type="primary"):
    if not input_text.strip():
        st.warning("Please provide text or a PDF.")
        st.stop()

    # Instantiate the client for the selected provider; stop early with a
    # visible error when that provider's API key is missing.
    if provider == "OpenAI":
        if not OPENAI_API_KEY:
            st.error("OPENAI_API_KEY is not set in the Space Secrets.")
            st.stop()
        llm = OpenAIClient(model=oai_model, temperature=temperature, top_p=top_p)
    else:
        if not GROQ_API_KEY:
            st.error("GROQ_API_KEY is not set in the Space Secrets.")
            st.stop()
        llm = GroqClient(model=groq_model, temperature=temperature, top_p=top_p)

    # An empty relationships field falls back to the generic default.
    ontology = Ontology(
        labels=parse_labels(labels_text),
        relationships=[r.strip() for r in relationships_text.split(",") if r.strip()]
        or ["Relation between any pair of Entities"],
    )

    st.info("Chunking input and building graph… this may take a bit for longer texts.")

    gm = GraphMaker(ontology=ontology, llm_client=llm, verbose=False)
    docs = build_graph_documents(input_text)

    edges = gm.from_documents(docs, delay_s_between=0)
    st.success(f"Graph built with {len(edges)} edges.")

    # Tabular view of the edges. Columns are passed explicitly so the table
    # keeps its headers even when no edges were extracted.
    edge_columns = ["node_1_label", "node_1", "node_2_label", "node_2", "relationship"]
    rows = [
        {
            "node_1_label": e.node_1.label,
            "node_1": e.node_1.name,
            "node_2_label": e.node_2.label,
            "node_2": e.node_2.name,
            "relationship": e.relationship,
        }
        for e in edges
    ]
    df = pd.DataFrame(rows, columns=edge_columns)
    st.dataframe(df, use_container_width=True)

    # Interactive view: pyvis writes the graph to an HTML file, which is read
    # back and embedded. The file is closed before the temporary directory is
    # removed.
    net = edges_to_pyvis(edges)
    with tempfile.TemporaryDirectory() as td:
        html_path = os.path.join(td, "graph.html")
        net.save_graph(html_path)
        with open(html_path, "r", encoding="utf-8") as html_file:
            html_content = html_file.read()
    st.components.v1.html(html_content, height=750, scrolling=True)

# --- Footer -----------------------------------------------------------------
st.markdown("---")
st.caption("Built with [knowledge-graph-maker](https://github.com/rahulnyk/knowledge_graph_maker).")
|
|