First commit
Browse files
- Dockerfile +25 -6
- README.md +278 -14
- README_HF_SPACES.md +69 -0
- app.py +338 -0
- app_streamlit.py +377 -0
- config/__pycache__/settings.cpython-312.pyc +0 -0
- src/__pycache__/document_processor.cpython-312.pyc +0 -0
- src/__pycache__/graph_builder.cpython-312.pyc +0 -0
- src/__pycache__/llm_extractor.cpython-312.pyc +0 -0
- src/__pycache__/visualizer.cpython-312.pyc +0 -0
- src/document_processor.py +125 -0
- src/graph_builder.py +234 -0
- src/llm_extractor.py +222 -0
- src/streamlit_app.py +0 -40
- src/visualizer.py +497 -0
- test_document.txt +15 -0
Dockerfile
CHANGED
@@ -1,21 +1,40 @@
# Use Python 3.9 slim image as base
FROM python:3.9-slim

# Set working directory
WORKDIR /app

# Install system dependencies needed for PDF processing and matplotlib
RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    software-properties-common \
    git \
    libpoppler-cpp-dev \
    pkg-config \
    poppler-utils \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first for better Docker layer caching
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy the entire application
COPY . .

# Set Python path to include the app directory
ENV PYTHONPATH="${PYTHONPATH}:/app"

# Create necessary directories and set permissions
RUN mkdir -p /tmp && chmod 777 /tmp

# Expose port 7860 for Hugging Face Spaces
EXPOSE 7860

# Health check
HEALTHCHECK CMD curl --fail http://localhost:7860/_stcore/health

# Command to run the Streamlit app
CMD ["streamlit", "run", "app.py", "--server.address=0.0.0.0", "--server.port=7860", "--browser.gatherUsageStats=false", "--server.headless=true"]
README.md
CHANGED
@@ -1,20 +1,284 @@
---
license: mit
title: Generate Knowledge Graphs
sdk: streamlit
emoji: 📉
colorFrom: indigo
colorTo: pink
short_description: Use LLM to generate a knowledge graph from your input data.
---
# 🕸️ Knowledge Graph Extraction App

A complete knowledge graph extraction application using LLMs via OpenRouter, available in both Gradio and Streamlit versions.

## 🚀 Features

- **Multi-format Document Support**: PDF, TXT, DOCX, and JSON files up to 10MB
- **LLM-powered Extraction**: Uses the OpenRouter API with free models (Gemma-2-9B, Llama-3.1-8B)
- **Smart Entity Detection**: Automatically identifies people, organizations, locations, concepts, events, and objects
- **Importance Scoring**: The LLM scores entity importance from 0.0 to 1.0
- **Interactive Visualization**: Multiple graph layout algorithms with filtering options
- **Batch Processing**: Optional processing of multiple documents together
- **Export Capabilities**: JSON, GraphML, and GEXF formats
- **Real-time Statistics**: Graph metrics and centrality analysis

## 📁 Project Structure

```
knowledge-graphs/
├── app.py                    # Main Gradio application (legacy)
├── app_streamlit.py          # Main Streamlit application (recommended)
├── run_streamlit.py          # Simple launcher script
├── requirements.txt          # Python dependencies
├── README.md                 # Project documentation
├── .env.example              # Environment variables template
├── config/
│   └── settings.py           # Configuration management
└── src/
    ├── document_processor.py # Document loading and chunking
    ├── llm_extractor.py      # LLM-based entity extraction
    ├── graph_builder.py      # NetworkX graph construction
    └── visualizer.py         # Graph visualization and export
```

## 🔧 Installation & Setup

### Option 1: Streamlit Version (Recommended)

The Streamlit version is more stable and has better file handling.

**Quick Start:**
```bash
python run_streamlit.py
```

**Manual Setup:**
1. **Install dependencies**:
   ```bash
   pip install -r requirements.txt
   ```

2. **Run the Streamlit app**:
   ```bash
   streamlit run app_streamlit.py --server.address 0.0.0.0 --server.port 8501
   ```

The app will be available at `http://localhost:8501`.

### Option 2: Gradio Version (Legacy)

The Gradio version may have some file caching issues but is provided for compatibility.

1. **Install dependencies**:
   ```bash
   pip install -r requirements.txt
   ```

2. **Set up environment variables** (optional):
   ```bash
   cp .env.example .env
   # Edit .env and add your OpenRouter API key
   ```

3. **Run the application**:
   ```bash
   python app.py
   ```

The app will be available at `http://localhost:7860`.

### HuggingFace Spaces Deployment

For **Streamlit deployment**:
1. Create a new Space on [HuggingFace Spaces](https://huggingface.co/spaces)
2. Choose "Streamlit" as the SDK
3. Upload `app_streamlit.py` as `app.py` (HF Spaces expects this name)
4. Upload all other project files, maintaining the directory structure

For **Gradio deployment**:
1. Create a new Space with "Gradio" as the SDK
2. Upload `app.py` and all other files
3. Note: you may experience file handling issues

## 🔑 API Configuration

### Getting an OpenRouter API Key

1. Visit [OpenRouter.ai](https://openrouter.ai)
2. Sign up for a free account
3. Navigate to the API Keys section
4. Generate a new API key
5. Copy the key and use it in the application

### Free Models Used

- **Primary**: `google/gemma-2-9b-it:free`
- **Backup**: `meta-llama/llama-3.1-8b-instruct:free`

These models are specifically chosen to minimize API costs while maintaining quality.

## 📖 Usage Guide

### Basic Workflow

1. **Upload Documents**:
   - Select one or more files (PDF, TXT, DOCX, JSON)
   - Toggle batch mode to process multiple documents together

2. **Configure API**:
   - Enter your OpenRouter API key
   - The key is stored only for the current session

3. **Customize Settings**:
   - Choose a graph layout algorithm
   - Toggle label visibility options
   - Set a minimum importance threshold
   - Select the entity types to include

4. **Extract Knowledge Graph**:
   - Click the "Extract Knowledge Graph" button
   - Monitor progress through the status updates
   - View results in multiple tabs

5. **Explore Results**:
   - **Graph Visualization**: Interactive graph with nodes colored by entity type
   - **Statistics**: Detailed metrics about the graph structure
   - **Entities**: Complete list of extracted entities with details
   - **Central Nodes**: Most important entities based on centrality measures

6. **Export Data**:
   - Choose an export format (JSON, GraphML, GEXF)
   - Download the structured graph data

### Advanced Features

#### Entity Types
- **PERSON**: Individuals mentioned in the text
- **ORGANIZATION**: Companies, institutions, groups
- **LOCATION**: Places, addresses, geographical entities
- **CONCEPT**: Abstract ideas, theories, methodologies
- **EVENT**: Specific occurrences, meetings, incidents
- **OBJECT**: Physical items, products, artifacts

#### Relationship Types
- **works_at**: Employment relationships
- **located_in**: Geographical associations
- **part_of**: Hierarchical relationships
- **causes**: Causal relationships
- **related_to**: General associations
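
Extraction results are passed around as plain entity and relationship dictionaries. The field names below are the ones `GraphBuilder.build_graph` actually reads (`name`, `type`, `importance`, `description` for entities; `source`, `target`, `relationship`, `description` for relationships); the concrete values are purely illustrative:

```json
{
  "entities": [
    {"name": "Ada Lovelace", "type": "PERSON", "importance": 0.9,
     "description": "Mathematician who wrote about the Analytical Engine"},
    {"name": "Analytical Engine", "type": "OBJECT", "importance": 0.7,
     "description": "Proposed mechanical general-purpose computer"}
  ],
  "relationships": [
    {"source": "Ada Lovelace", "target": "Analytical Engine",
     "relationship": "related_to", "description": "Wrote notes and programs for it"}
  ]
}
```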

#### Filtering Options
- **Importance Threshold**: Show only entities above a specified importance score
- **Entity Types**: Filter by specific entity categories
- **Layout Algorithms**: Spring, circular, shell, Kamada-Kawai, random

## 🛠️ Technical Details

### Architecture Components

1. **Document Processing**:
   - Multi-format file parsing
   - Intelligent text chunking with overlap
   - File size validation

2. **LLM Integration**:
   - OpenRouter API integration
   - Structured prompt engineering
   - Error handling and fallback models

3. **Graph Processing**:
   - NetworkX-based graph construction
   - Entity deduplication and standardization
   - Relationship validation

4. **Visualization**:
   - Matplotlib-based static graphs
   - Interactive HTML visualizations
   - Multiple export formats
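
The same components can be driven outside the UI. Below is a minimal sketch of the pipeline wiring, mirroring the order of operations in `app.py`; it assumes a local `test_document.txt`, an OpenRouter key already available to `Config`, and that `"spring"` is one of the names returned by `get_layout_options()`:

```python
from src.document_processor import DocumentProcessor
from src.llm_extractor import LLMExtractor
from src.graph_builder import GraphBuilder
from src.visualizer import GraphVisualizer

# 1. Load and chunk a single document
doc = DocumentProcessor().process_documents(["test_document.txt"])[0]

# 2. Extract entities and relationships from the chunks via OpenRouter
extraction = LLMExtractor().process_chunks(doc["chunks"])

# 3. Build the NetworkX graph and render a static image
builder = GraphBuilder()
graph = builder.build_graph(extraction.get("entities", []),
                            extraction.get("relationships", []))
image_path = GraphVisualizer().visualize_graph(graph, layout_type="spring")

print(builder.get_graph_statistics())
print("Graph image written to:", image_path)
```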

### Configuration Options

All settings can be modified in `config/settings.py`:

- **Chunk Size**: Default 2000 characters
- **Chunk Overlap**: Default 200 characters
- **Max File Size**: Default 10MB
- **Max Entities**: Default 100 per extraction
- **Max Relationships**: Default 200 per extraction
- **Importance Threshold**: Default 0.3
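
For reference, a sketch of what such a `Config` class can look like. The attribute names `OPENROUTER_API_KEY`, `CHUNK_SIZE`, `CHUNK_OVERLAP`, and `MAX_FILE_SIZE_MB` are the ones the rest of the code reads; the remaining names and the environment-variable fallback are illustrative assumptions, not necessarily the shipped `config/settings.py`:

```python
import os

class Config:
    # Assumed fallback: read the key from the environment when it is not entered in the UI
    OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY", "")

    # Document processing (names used by DocumentProcessor)
    CHUNK_SIZE = 2000       # characters per chunk
    CHUNK_OVERLAP = 200     # characters shared between consecutive chunks
    MAX_FILE_SIZE_MB = 10   # per-file upload limit

    # Extraction limits (illustrative names for the documented defaults)
    MAX_ENTITIES = 100
    MAX_RELATIONSHIPS = 200
    IMPORTANCE_THRESHOLD = 0.3
```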

### Differences Between Versions

**Streamlit Version Advantages:**
- More reliable file handling
- Better progress indicators
- Cleaner UI with sidebar configuration
- More stable caching system
- Built-in download functionality

**Gradio Version Advantages:**
- Simpler deployment to HF Spaces
- More compact interface
- Familiar for ML practitioners

## 🔒 Security & Privacy

- API keys are not stored permanently
- Files are processed temporarily and then discarded
- No data is retained between sessions
- All processing happens server-side

## 🐛 Troubleshooting

### Common Issues

1. **"OpenRouter API key is required"**:
   - Ensure you've entered a valid API key
   - Check that the key has sufficient credits

2. **"No entities extracted"**:
   - The document may be too short or unstructured
   - Try lowering the importance threshold
   - Check whether the document contains meaningful text

3. **File upload issues (Gradio version)**:
   - Known issue with Gradio's file caching system
   - Try the Streamlit version instead
   - Ensure files are valid and not corrupted

4. **Segmentation fault (local development)**:
   - Usually related to the matplotlib backend
   - Try setting the `MPLBACKEND=Agg` environment variable (see the snippet after this list)
   - Install a GUI toolkit if running locally with a display

5. **Module import errors**:
   - Ensure all requirements are installed: `pip install -r requirements.txt`
   - Check Python version compatibility (3.8+)
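
For the segmentation-fault case, forcing matplotlib's non-interactive `Agg` backend before launching usually avoids the crash:

```bash
# Agg renders to files only, so no GUI toolkit is needed
export MPLBACKEND=Agg
streamlit run app_streamlit.py
```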

### Performance Tips

- Use batch mode for related documents
- Adjust the chunk size for very long documents
- Lower the importance threshold for sparse documents
- Use simpler layout algorithms for large graphs

## 🤝 Contributing

1. Fork the repository
2. Create a feature branch
3. Make your changes
4. Test with both the Streamlit and Gradio versions if applicable
5. Add tests if applicable
6. Submit a pull request

## 📄 License

This project is licensed under the MIT License - see the LICENSE file for details.

## 🙏 Acknowledgments

- [OpenRouter](https://openrouter.ai) for LLM API access
- [Streamlit](https://streamlit.io) for the modern web interface framework
- [Gradio](https://gradio.app) for the ML-focused web interface
- [NetworkX](https://networkx.org) for graph processing
- [HuggingFace Spaces](https://huggingface.co/spaces) for hosting
README_HF_SPACES.md
ADDED
@@ -0,0 +1,69 @@
# Hugging Face Spaces Deployment

This guide explains how to deploy the Knowledge Graph Extraction app to Hugging Face Spaces using the provided Dockerfile.

## Deployment Steps

### 1. Create a New Space
1. Go to [Hugging Face Spaces](https://huggingface.co/spaces)
2. Click "Create new Space"
3. Choose a name for your space
4. Select "Docker" as the SDK
5. Choose "Public" or "Private" visibility

### 2. Upload Files
Upload all the project files to your space, including:
- `Dockerfile`
- `app_streamlit.py`
- `requirements.txt`
- `src/` directory with all modules
- `config/` directory
- `.env.example` (optional)

### 3. Environment Variables
In your Hugging Face Space settings, add the following environment variable:
- `OPENROUTER_API_KEY`: Your OpenRouter API key (optional; users can also enter it in the app)

### 4. Docker Configuration
The Dockerfile is already optimized for HF Spaces with:
- Port 7860 (the HF Spaces standard)
- The system dependencies needed for PDF processing
- Streamlit configured for headless deployment
- Health checks enabled
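
Before pushing to a Space, the image can be sanity-checked locally with Docker; the image tag below is arbitrary:

```bash
# Build and run the same container HF Spaces will build
docker build -t knowledge-graphs .
docker run -p 7860:7860 knowledge-graphs
# then open http://localhost:7860
```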

### 5. App Features
Once deployed, users can:
- Upload PDF, TXT, DOCX, or JSON documents
- Enter their OpenRouter API key in the sidebar
- Extract knowledge graphs using LLM processing
- Visualize graphs with various layout options
- Filter entities by type and importance
- Export graphs in JSON, GraphML, or GEXF formats

## Technical Details

### Dependencies
The Dockerfile installs:
- System packages: `poppler-utils` and `libpoppler-cpp-dev` for PDF processing
- Python packages: everything listed in `requirements.txt`

### File Limits
- Default Streamlit file upload limit: 200MB
- Recommended per-file limit in config: 10MB
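
If you want Streamlit's own limit to match the 10MB recommendation, the `server.maxUploadSize` option (in MB) controls it; one way, assuming you are willing to adjust the container's start command, is:

```bash
streamlit run app_streamlit.py --server.maxUploadSize 10
```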

### Models Used
- Primary: `qwen/qwen3-32b` (free model via OpenRouter)
- The app uses cost-effective models to minimize API costs

## Usage Notes
- Users need their own OpenRouter API key
- Processing time depends on document size and complexity
- The app supports batch processing of multiple documents
- Temporary files are automatically cleaned up after processing

## Troubleshooting
If deployment fails:
1. Check that all files are uploaded correctly
2. Verify the Dockerfile syntax
3. Check the HF Spaces logs for specific error messages
4. Ensure `requirements.txt` contains all necessary dependencies
app.py
ADDED
@@ -0,0 +1,338 @@
import streamlit as st
import tempfile
import os
import json
from typing import List, Dict, Any, Optional, Tuple
import traceback

# Import our modules
from src.document_processor import DocumentProcessor
from src.llm_extractor import LLMExtractor
from src.graph_builder import GraphBuilder
from src.visualizer import GraphVisualizer
from config.settings import Config

# Page config
st.set_page_config(
    page_title="Knowledge Graph Extraction",
    page_icon="🕸️",
    layout="wide"
)

# Initialize components
@st.cache_resource
def initialize_components():
    config = Config()
    doc_processor = DocumentProcessor()
    llm_extractor = LLMExtractor()
    graph_builder = GraphBuilder()
    visualizer = GraphVisualizer()
    return config, doc_processor, llm_extractor, graph_builder, visualizer

config, doc_processor, llm_extractor, graph_builder, visualizer = initialize_components()

def process_uploaded_files(uploaded_files, api_key, batch_mode, layout_type,
                           show_labels, show_edge_labels, min_importance, entity_types_filter):
    """Process uploaded files and extract knowledge graph."""

    try:
        # Update API key
        if api_key.strip():
            config.OPENROUTER_API_KEY = api_key.strip()
            llm_extractor.config.OPENROUTER_API_KEY = api_key.strip()
            llm_extractor.headers["Authorization"] = f"Bearer {api_key.strip()}"

        if not config.OPENROUTER_API_KEY:
            st.error("❌ OpenRouter API key is required")
            return None

        if not uploaded_files:
            st.error("❌ Please upload at least one file")
            return None

        progress_bar = st.progress(0)
        status_text = st.empty()

        status_text.text("Loading documents...")
        progress_bar.progress(0.1)

        # Save uploaded files to a temporary location
        file_paths = []
        for uploaded_file in uploaded_files:
            # Create temporary file
            with tempfile.NamedTemporaryFile(delete=False, suffix=f"_{uploaded_file.name}") as tmp_file:
                tmp_file.write(uploaded_file.getvalue())
                file_paths.append(tmp_file.name)

        # Process documents
        doc_results = doc_processor.process_documents(file_paths, batch_mode)

        # Clean up temporary files
        for file_path in file_paths:
            try:
                os.unlink(file_path)
            except:
                pass

        # Check for errors
        failed_files = [r for r in doc_results if r['status'] == 'error']
        if failed_files:
            error_msg = "Failed to process files:\n" + "\n".join([f"- {r['file_path']}: {r['error']}" for r in failed_files])
            if len(failed_files) == len(doc_results):
                st.error(f"❌ {error_msg}")
                return None

        status_text.text("Extracting entities and relationships...")
        progress_bar.progress(0.3)

        # Extract entities and relationships
        all_entities = []
        all_relationships = []
        extraction_errors = []

        for doc_result in doc_results:
            if doc_result['status'] == 'success':
                extraction_result = llm_extractor.process_chunks(doc_result['chunks'])

                if extraction_result.get('errors'):
                    extraction_errors.extend(extraction_result['errors'])

                all_entities.extend(extraction_result.get('entities', []))
                all_relationships.extend(extraction_result.get('relationships', []))

        if not all_entities:
            error_msg = "No entities extracted from documents"
            if extraction_errors:
                error_msg += f"\nExtraction errors: {'; '.join(extraction_errors[:3])}"
            st.error(f"❌ {error_msg}")
            return None

        status_text.text("Building knowledge graph...")
        progress_bar.progress(0.6)

        # Build graph
        graph = graph_builder.build_graph(all_entities, all_relationships)

        if not graph.nodes():
            st.error("❌ No valid knowledge graph could be built")
            return None

        status_text.text("Applying filters...")
        progress_bar.progress(0.7)

        # Apply filters
        filtered_graph = graph
        if entity_types_filter:
            filtered_graph = graph_builder.filter_graph(
                entity_types=entity_types_filter,
                min_importance=min_importance
            )
        elif min_importance > 0:
            filtered_graph = graph_builder.filter_graph(min_importance=min_importance)

        if not filtered_graph.nodes():
            st.error("❌ No entities remain after applying filters")
            return None

        status_text.text("Generating visualizations...")
        progress_bar.progress(0.8)

        # Generate graph visualization
        graph_image_path = visualizer.visualize_graph(
            filtered_graph,
            layout_type=layout_type,
            show_labels=show_labels,
            show_edge_labels=show_edge_labels
        )

        # Get statistics
        stats = graph_builder.get_graph_statistics()
        stats_summary = visualizer.create_statistics_summary(filtered_graph, stats)

        # Get entity list
        entity_list = visualizer.create_entity_list(filtered_graph)

        # Get central nodes
        central_nodes = graph_builder.get_central_nodes()
        central_nodes_text = "## Most Central Entities\n\n"
        for i, (node, score) in enumerate(central_nodes, 1):
            central_nodes_text += f"{i}. **{node}** (centrality: {score:.3f})\n"

        status_text.text("Complete!")
        progress_bar.progress(1.0)

        # Success message
        success_msg = f"✅ Successfully processed {len([r for r in doc_results if r['status'] == 'success'])} document(s)"
        if failed_files:
            success_msg += f"\n⚠️ {len(failed_files)} file(s) failed to process"
        if extraction_errors:
            success_msg += f"\n⚠️ {len(extraction_errors)} extraction error(s) occurred"

        return {
            'success_msg': success_msg,
            'graph_image_path': graph_image_path,
            'stats_summary': stats_summary,
            'entity_list': entity_list,
            'central_nodes_text': central_nodes_text,
            'graph': filtered_graph
        }

    except Exception as e:
        st.error(f"❌ Error: {str(e)}")
        st.error(f"Full traceback:\n{traceback.format_exc()}")
        return None

# Main app
def main():
    st.title("🕸️ Knowledge Graph Extraction")
    st.markdown("""
    Upload documents and extract knowledge graphs using LLMs via OpenRouter.
    Supports PDF, TXT, DOCX, and JSON files.
    """)

    # Sidebar for configuration
    with st.sidebar:
        st.header("📁 Document Upload")
        uploaded_files = st.file_uploader(
            "Choose files",
            type=['pdf', 'txt', 'docx', 'json'],
            accept_multiple_files=True
        )

        batch_mode = st.checkbox(
            "Batch Processing Mode",
            value=False,
            help="Process multiple files together"
        )

        st.header("🔑 API Configuration")
        api_key = st.text_input(
            "OpenRouter API Key",
            type="password",
            placeholder="Enter your OpenRouter API key",
            help="Get your key at openrouter.ai"
        )

        st.header("🎛️ Visualization Settings")
        layout_type = st.selectbox(
            "Layout Algorithm",
            options=visualizer.get_layout_options(),
            index=0
        )

        show_labels = st.checkbox("Show Node Labels", value=True)
        show_edge_labels = st.checkbox("Show Edge Labels", value=False)

        st.header("🔍 Filtering Options")
        min_importance = st.slider(
            "Minimum Entity Importance",
            min_value=0.0,
            max_value=1.0,
            value=0.3,
            step=0.1
        )

        entity_types_filter = st.multiselect(
            "Entity Types Filter",
            options=[],
            help="Filter will be populated after processing"
        )

        process_button = st.button("🚀 Extract Knowledge Graph", type="primary")

    # Main content area
    if process_button and uploaded_files:
        with st.spinner("Processing..."):
            result = process_uploaded_files(
                uploaded_files, api_key, batch_mode, layout_type,
                show_labels, show_edge_labels, min_importance, entity_types_filter
            )

        if result:
            # Store results in session state
            st.session_state['result'] = result

            # Display success message
            st.success(result['success_msg'])

            # Create tabs for results
            tab1, tab2, tab3, tab4 = st.tabs(["📈 Graph Visualization", "📋 Statistics", "📝 Entities", "🎯 Central Nodes"])

            with tab1:
                if result['graph_image_path'] and os.path.exists(result['graph_image_path']):
                    st.image(result['graph_image_path'], caption="Knowledge Graph", use_column_width=True)
                else:
                    st.error("Failed to generate graph visualization")

            with tab2:
                st.markdown(result['stats_summary'])

            with tab3:
                st.markdown(result['entity_list'])

            with tab4:
                st.markdown(result['central_nodes_text'])

            # Export options
            st.header("💾 Export Options")
            col1, col2 = st.columns(2)

            with col1:
                export_format = st.selectbox(
                    "Export Format",
                    options=["json", "graphml", "gexf"],
                    index=0
                )

            with col2:
                if st.button("📥 Export Graph"):
                    try:
                        export_data = graph_builder.export_graph(export_format)
                        st.text_area("Export Data", value=export_data, height=300)

                        # Download button
                        st.download_button(
                            label=f"Download {export_format.upper()} file",
                            data=export_data,
                            file_name=f"knowledge_graph.{export_format}",
                            mime="application/octet-stream"
                        )
                    except Exception as e:
                        st.error(f"Export failed: {str(e)}")

    elif process_button and not uploaded_files:
        st.warning("Please upload at least one file before processing.")

    # Instructions
    st.header("📚 Instructions")

    with st.expander("How to use this app"):
        st.markdown("""
        1. **Upload Documents**: Select one or more files (PDF, TXT, DOCX, JSON) using the file uploader in the sidebar
        2. **Enter API Key**: Get a free API key from [OpenRouter](https://openrouter.ai) and enter it in the sidebar
        3. **Configure Settings**: Adjust visualization and filtering options in the sidebar
        4. **Extract Graph**: Click the "Extract Knowledge Graph" button and wait for processing
        5. **Explore Results**: View the graph, statistics, and entity details in the tabs
        6. **Export**: Download the graph data in various formats
        """)

    with st.expander("Features"):
        st.markdown("""
        - **Multi-format Support**: PDF, TXT, DOCX, JSON files
        - **Batch Processing**: Process multiple documents together
        - **Smart Extraction**: Uses LLM to identify important entities and relationships
        - **Interactive Filtering**: Filter by entity type and importance
        - **Multiple Layouts**: Various graph layout algorithms
        - **Export Options**: JSON, GraphML, GEXF formats
        - **Free Models**: Uses cost-effective OpenRouter models
        """)

    with st.expander("Notes"):
        st.markdown("""
        - File size limit: 10MB per file
        - Free OpenRouter models are used to minimize costs
        - Processing time depends on document size and complexity
        """)

if __name__ == "__main__":
    main()
app_streamlit.py
ADDED
@@ -0,0 +1,377 @@
import streamlit as st
import tempfile
import os
import json
from typing import List, Dict, Any, Optional, Tuple
import traceback

# Import our modules
from src.document_processor import DocumentProcessor
from src.llm_extractor import LLMExtractor
from src.graph_builder import GraphBuilder
from src.visualizer import GraphVisualizer
from config.settings import Config

# Page config
st.set_page_config(
    page_title="Knowledge Graph Extraction",
    page_icon="🕸️",
    layout="wide"
)

# Initialize components
@st.cache_resource
def initialize_components():
    config = Config()
    doc_processor = DocumentProcessor()
    llm_extractor = LLMExtractor()
    graph_builder = GraphBuilder()
    visualizer = GraphVisualizer()
    return config, doc_processor, llm_extractor, graph_builder, visualizer

config, doc_processor, llm_extractor, graph_builder, visualizer = initialize_components()

def process_uploaded_files(uploaded_files, api_key, batch_mode, visualization_type, layout_type,
                           show_labels, show_edge_labels, min_importance, entity_types_filter):
    """Process uploaded files and extract knowledge graph."""

    try:
        # Update API key
        if api_key.strip():
            config.OPENROUTER_API_KEY = api_key.strip()
            llm_extractor.config.OPENROUTER_API_KEY = api_key.strip()
            llm_extractor.headers["Authorization"] = f"Bearer {api_key.strip()}"

        if not config.OPENROUTER_API_KEY:
            st.error("❌ OpenRouter API key is required")
            return None

        if not uploaded_files:
            st.error("❌ Please upload at least one file")
            return None

        progress_bar = st.progress(0)
        status_text = st.empty()

        status_text.text("Loading documents...")
        progress_bar.progress(0.1)

        # Save uploaded files to a temporary location
        file_paths = []
        for uploaded_file in uploaded_files:
            # Create temporary file
            with tempfile.NamedTemporaryFile(delete=False, suffix=f"_{uploaded_file.name}") as tmp_file:
                tmp_file.write(uploaded_file.getvalue())
                file_paths.append(tmp_file.name)

        # Process documents
        doc_results = doc_processor.process_documents(file_paths, batch_mode)

        # Clean up temporary files
        for file_path in file_paths:
            try:
                os.unlink(file_path)
            except:
                pass

        # Check for errors
        failed_files = [r for r in doc_results if r['status'] == 'error']
        if failed_files:
            error_msg = "Failed to process files:\n" + "\n".join([f"- {r['file_path']}: {r['error']}" for r in failed_files])
            if len(failed_files) == len(doc_results):
                st.error(f"❌ {error_msg}")
                return None

        status_text.text("Extracting entities and relationships...")
        progress_bar.progress(0.3)

        # Extract entities and relationships
        all_entities = []
        all_relationships = []
        extraction_errors = []

        for doc_result in doc_results:
            if doc_result['status'] == 'success':
                extraction_result = llm_extractor.process_chunks(doc_result['chunks'])

                if extraction_result.get('errors'):
                    extraction_errors.extend(extraction_result['errors'])

                all_entities.extend(extraction_result.get('entities', []))
                all_relationships.extend(extraction_result.get('relationships', []))

        if not all_entities:
            error_msg = "No entities extracted from documents"
            if extraction_errors:
                error_msg += f"\nExtraction errors: {'; '.join(extraction_errors[:3])}"
            st.error(f"❌ {error_msg}")
            return None

        status_text.text("Building knowledge graph...")
        progress_bar.progress(0.6)

        # Build graph
        graph = graph_builder.build_graph(all_entities, all_relationships)

        if not graph.nodes():
            st.error("❌ No valid knowledge graph could be built")
            return None

        status_text.text("Applying filters...")
        progress_bar.progress(0.7)

        # Apply filters
        filtered_graph = graph
        if entity_types_filter:
            filtered_graph = graph_builder.filter_graph(
                entity_types=entity_types_filter,
                min_importance=min_importance
            )
        elif min_importance > 0:
            filtered_graph = graph_builder.filter_graph(min_importance=min_importance)

        if not filtered_graph.nodes():
            st.error("❌ No entities remain after applying filters")
            return None

        status_text.text("Generating visualizations...")
        progress_bar.progress(0.8)

        # Generate graph visualization based on type
        if visualization_type == "plotly":
            graph_viz = visualizer.create_plotly_interactive(filtered_graph, layout_type)
            graph_image_path = None
        elif visualization_type == "pyvis":
            graph_image_path = visualizer.create_pyvis_interactive(filtered_graph, layout_type)
            graph_viz = None
        elif visualization_type == "vis.js":
            graph_viz = visualizer.create_interactive_html(filtered_graph)
            graph_image_path = None
        else:  # matplotlib
            graph_image_path = visualizer.visualize_graph(
                filtered_graph,
                layout_type=layout_type,
                show_labels=show_labels,
                show_edge_labels=show_edge_labels
            )
            graph_viz = None

        # Get statistics
        stats = graph_builder.get_graph_statistics()
        stats_summary = visualizer.create_statistics_summary(filtered_graph, stats)

        # Get entity list
        entity_list = visualizer.create_entity_list(filtered_graph)

        # Get central nodes
        central_nodes = graph_builder.get_central_nodes()
        central_nodes_text = "## Most Central Entities\n\n"
        for i, (node, score) in enumerate(central_nodes, 1):
            central_nodes_text += f"{i}. **{node}** (centrality: {score:.3f})\n"

        status_text.text("Complete!")
        progress_bar.progress(1.0)

        # Success message
        success_msg = f"✅ Successfully processed {len([r for r in doc_results if r['status'] == 'success'])} document(s)"
        if failed_files:
            success_msg += f"\n⚠️ {len(failed_files)} file(s) failed to process"
        if extraction_errors:
            success_msg += f"\n⚠️ {len(extraction_errors)} extraction error(s) occurred"

        return {
            'success_msg': success_msg,
            'graph_image_path': graph_image_path,
            'graph_viz': graph_viz,
            'visualization_type': visualization_type,
            'stats_summary': stats_summary,
            'entity_list': entity_list,
            'central_nodes_text': central_nodes_text,
            'graph': filtered_graph
        }

    except Exception as e:
        st.error(f"❌ Error: {str(e)}")
        st.error(f"Full traceback:\n{traceback.format_exc()}")
        return None

# Main app
def main():
    st.title("🕸️ Knowledge Graph Extraction")
    st.markdown("""
    Upload documents and extract knowledge graphs using LLMs via OpenRouter.
    Supports PDF, TXT, DOCX, and JSON files.
    """)

    # Sidebar for configuration
    with st.sidebar:
        st.header("📁 Document Upload")
        uploaded_files = st.file_uploader(
            "Choose files",
            type=['pdf', 'txt', 'docx', 'json'],
            accept_multiple_files=True
        )

        batch_mode = st.checkbox(
            "Batch Processing Mode",
            value=False,
            help="Process multiple files together"
        )

        st.header("🔑 API Configuration")
        api_key = st.text_input(
            "OpenRouter API Key",
            type="password",
            placeholder="Enter your OpenRouter API key",
            help="Get your key at openrouter.ai"
        )

        st.header("🎛️ Visualization Settings")
        visualization_type = st.selectbox(
            "Visualization Type",
            options=visualizer.get_visualization_options(),
            index=1,  # Default to plotly for interactivity
            help="Choose visualization method"
        )

        layout_type = st.selectbox(
            "Layout Algorithm",
            options=visualizer.get_layout_options(),
            index=0
        )

        show_labels = st.checkbox("Show Node Labels", value=True)
        show_edge_labels = st.checkbox("Show Edge Labels", value=False)

        st.header("🔍 Filtering Options")
        min_importance = st.slider(
            "Minimum Entity Importance",
            min_value=0.0,
            max_value=1.0,
            value=0.3,
            step=0.1
        )

        entity_types_filter = st.multiselect(
            "Entity Types Filter",
            options=[],
            help="Filter will be populated after processing"
        )

        process_button = st.button("🚀 Extract Knowledge Graph", type="primary")

    # Main content area
    if process_button and uploaded_files:
        with st.spinner("Processing..."):
            result = process_uploaded_files(
                uploaded_files, api_key, batch_mode, visualization_type, layout_type,
                show_labels, show_edge_labels, min_importance, entity_types_filter
            )

        if result:
            # Store results in session state
            st.session_state['result'] = result

            # Display success message
            st.success(result['success_msg'])

            # Create tabs for results
            tab1, tab2, tab3, tab4 = st.tabs(["📈 Graph Visualization", "📋 Statistics", "📝 Entities", "🎯 Central Nodes"])

            with tab1:
                viz_type = result['visualization_type']

                if viz_type == "plotly" and result['graph_viz']:
                    st.plotly_chart(result['graph_viz'], use_container_width=True)
                    st.info("🎯 Interactive Plotly graph: Hover over nodes for details, drag to pan, scroll to zoom")

                elif viz_type == "pyvis" and result['graph_image_path'] and os.path.exists(result['graph_image_path']):
                    # Read HTML file and display
                    with open(result['graph_image_path'], 'r', encoding='utf-8') as f:
                        html_content = f.read()
                    st.components.v1.html(html_content, height=600, scrolling=True)
                    st.info("🎯 Interactive Pyvis graph: Drag nodes to rearrange, hover for details")

                elif viz_type == "vis.js" and result['graph_viz']:
                    st.components.v1.html(result['graph_viz'], height=600, scrolling=True)
                    st.info("🎯 Interactive vis.js graph: Drag nodes, hover for details, use physics simulation")

                elif viz_type == "matplotlib" and result['graph_image_path'] and os.path.exists(result['graph_image_path']):
                    st.image(result['graph_image_path'], caption="Knowledge Graph", use_column_width=True)
                    st.info("📊 Static matplotlib visualization")

                else:
                    st.error("Failed to generate graph visualization")

            with tab2:
                st.markdown(result['stats_summary'])

            with tab3:
                st.markdown(result['entity_list'])

            with tab4:
                st.markdown(result['central_nodes_text'])

            # Export options
            st.header("💾 Export Options")
            col1, col2 = st.columns(2)

            with col1:
                export_format = st.selectbox(
                    "Export Format",
                    options=["json", "graphml", "gexf"],
                    index=0
                )

            with col2:
                if st.button("📥 Export Graph"):
                    try:
                        export_data = graph_builder.export_graph(export_format)
                        st.text_area("Export Data", value=export_data, height=300)

                        # Download button
                        st.download_button(
                            label=f"Download {export_format.upper()} file",
                            data=export_data,
                            file_name=f"knowledge_graph.{export_format}",
                            mime="application/octet-stream"
                        )
                    except Exception as e:
                        st.error(f"Export failed: {str(e)}")

    elif process_button and not uploaded_files:
        st.warning("Please upload at least one file before processing.")

    # Instructions
    st.header("📚 Instructions")

    with st.expander("How to use this app"):
        st.markdown("""
        1. **Upload Documents**: Select one or more files (PDF, TXT, DOCX, JSON) using the file uploader in the sidebar
        2. **Enter API Key**: Get a free API key from [OpenRouter](https://openrouter.ai) and enter it in the sidebar
        3. **Configure Settings**: Adjust visualization and filtering options in the sidebar
        4. **Extract Graph**: Click the "Extract Knowledge Graph" button and wait for processing
        5. **Explore Results**: View the graph, statistics, and entity details in the tabs
        6. **Export**: Download the graph data in various formats
        """)

    with st.expander("Features"):
        st.markdown("""
        - **Multi-format Support**: PDF, TXT, DOCX, JSON files
        - **Batch Processing**: Process multiple documents together
        - **Smart Extraction**: Uses LLM to identify important entities and relationships
        - **Interactive Filtering**: Filter by entity type and importance
        - **Multiple Layouts**: Various graph layout algorithms
        - **Export Options**: JSON, GraphML, GEXF formats
        - **Free Models**: Uses cost-effective OpenRouter models
        """)

    with st.expander("Notes"):
        st.markdown("""
        - File size limit: 10MB per file
        - Free OpenRouter models are used to minimize costs
        - Processing time depends on document size and complexity
        """)

if __name__ == "__main__":
    main()
config/__pycache__/settings.cpython-312.pyc
ADDED
Binary file (1.34 kB)
src/__pycache__/document_processor.cpython-312.pyc
ADDED
Binary file (6.07 kB)
src/__pycache__/graph_builder.cpython-312.pyc
ADDED
Binary file (12.4 kB)
src/__pycache__/llm_extractor.cpython-312.pyc
ADDED
Binary file (10.1 kB)
src/__pycache__/visualizer.cpython-312.pyc
ADDED
Binary file (14.7 kB)
src/document_processor.py
ADDED
@@ -0,0 +1,125 @@
import os
import json
from typing import List, Dict, Any
import pdfplumber
from docx import Document
from config.settings import Config

class DocumentProcessor:
    def __init__(self):
        self.config = Config()

    def validate_file_size(self, file_path: str) -> bool:
        """Validate file size is within limits."""
        file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
        return file_size_mb <= self.config.MAX_FILE_SIZE_MB

    def load_document(self, file_path: str) -> str:
        """Load document content based on file extension."""
        if not self.validate_file_size(file_path):
            raise ValueError(f"File size exceeds {self.config.MAX_FILE_SIZE_MB}MB limit")

        file_ext = os.path.splitext(file_path)[1].lower()

        if file_ext == '.pdf':
            return self._load_pdf(file_path)
        elif file_ext == '.docx':
            return self._load_docx(file_path)
        elif file_ext == '.txt':
            return self._load_txt(file_path)
        elif file_ext == '.json':
            return self._load_json(file_path)
        else:
            raise ValueError(f"Unsupported file format: {file_ext}")

    def _load_pdf(self, file_path: str) -> str:
        """Load PDF content."""
        text = ""
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"
        return text

    def _load_docx(self, file_path: str) -> str:
        """Load DOCX content."""
        doc = Document(file_path)
        text = ""
        for paragraph in doc.paragraphs:
            text += paragraph.text + "\n"
        return text

    def _load_txt(self, file_path: str) -> str:
        """Load TXT content."""
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()

    def _load_json(self, file_path: str) -> str:
        """Load JSON content and convert to text."""
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
            return json.dumps(data, indent=2)

    def chunk_text(self, text: str) -> List[str]:
        """Split text into overlapping chunks for processing."""
        if len(text) <= self.config.CHUNK_SIZE:
            return [text]

        chunks = []
        start = 0

        while start < len(text):
            end = start + self.config.CHUNK_SIZE

            # Try to break at sentence boundaries
            if end < len(text):
                # Look for sentence endings
                sentence_end = text.rfind('.', start, end)
                if sentence_end == -1:
                    sentence_end = text.rfind('!', start, end)
                if sentence_end == -1:
                    sentence_end = text.rfind('?', start, end)

                if sentence_end != -1 and sentence_end > start + self.config.CHUNK_SIZE // 2:
                    end = sentence_end + 1

            chunk = text[start:end].strip()
            if chunk:
                chunks.append(chunk)

            start = end - self.config.CHUNK_OVERLAP
            if start >= len(text):
                break

        return chunks

    def process_documents(self, file_paths: List[str], batch_mode: bool = False) -> List[Dict[str, Any]]:
        """Process multiple documents."""
        results = []

        for file_path in file_paths:
            try:
                content = self.load_document(file_path)
                chunks = self.chunk_text(content)

                results.append({
                    'file_path': file_path,
                    'content': content,
                    'chunks': chunks,
                    'status': 'success'
                })

                if not batch_mode:
                    break  # Process only one file if not in batch mode

            except Exception as e:
                results.append({
                    'file_path': file_path,
                    'content': '',
                    'chunks': [],
                    'status': 'error',
                    'error': str(e)
                })

        return results
src/graph_builder.py
ADDED
@@ -0,0 +1,234 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import networkx as nx
from typing import List, Dict, Any, Tuple
import json

class GraphBuilder:
    def __init__(self):
        self.graph = nx.DiGraph()  # Directed graph for relationships

    def build_graph(self, entities: List[Dict[str, Any]], relationships: List[Dict[str, Any]]) -> nx.DiGraph:
        """Build NetworkX graph from entities and relationships."""
        self.graph.clear()

        # Add entities as nodes
        for entity in entities:
            node_id = entity.get("name", "").strip()
            if node_id:
                self.graph.add_node(
                    node_id,
                    type=entity.get("type", "UNKNOWN"),
                    importance=entity.get("importance", 0.0),
                    description=entity.get("description", ""),
                    size=self._calculate_node_size(entity.get("importance", 0.0))
                )

        # Add relationships as edges
        for relationship in relationships:
            source = relationship.get("source", "").strip()
            target = relationship.get("target", "").strip()
            rel_type = relationship.get("relationship", "related_to")
            description = relationship.get("description", "")

            if source and target and source in self.graph.nodes and target in self.graph.nodes:
                self.graph.add_edge(
                    source,
                    target,
                    relationship=rel_type,
                    description=description,
                    weight=1.0
                )

        return self.graph

    def _calculate_node_size(self, importance: float) -> int:
        """Calculate node size based on importance score."""
        # Map importance (0.0-1.0) to node size (10-50)
        min_size, max_size = 10, 50
        return int(min_size + (max_size - min_size) * importance)

    def get_graph_statistics(self) -> Dict[str, Any]:
        """Get basic statistics about the graph."""
        if not self.graph.nodes():
            return {
                "num_nodes": 0,
                "num_edges": 0,
                "density": 0.0,
                "is_connected": False,
                "num_components": 0
            }

        # Convert to undirected for connectivity analysis
        undirected = self.graph.to_undirected()

        return {
            "num_nodes": self.graph.number_of_nodes(),
            "num_edges": self.graph.number_of_edges(),
            "density": nx.density(self.graph),
            "is_connected": nx.is_connected(undirected),
            "num_components": nx.number_connected_components(undirected),
            "avg_degree": sum(dict(self.graph.degree()).values()) / self.graph.number_of_nodes() if self.graph.number_of_nodes() > 0 else 0
        }

    def get_central_nodes(self, top_k: int = 5) -> List[Tuple[str, float]]:
        """Get most central nodes using various centrality measures."""
        if not self.graph.nodes():
            return []

        centralities = {}

        # Degree centrality
        degree_centrality = nx.degree_centrality(self.graph)

        # Betweenness centrality (if graph has enough nodes)
        if self.graph.number_of_nodes() > 2:
            betweenness_centrality = nx.betweenness_centrality(self.graph)
        else:
            betweenness_centrality = {node: 0.0 for node in self.graph.nodes()}

        # PageRank
        try:
            pagerank = nx.pagerank(self.graph)
        except:
            pagerank = {node: 1.0 / self.graph.number_of_nodes() for node in self.graph.nodes()}

        # Combine centrality measures
        for node in self.graph.nodes():
            importance = self.graph.nodes[node].get('importance', 0.0)
            combined_score = (
                0.3 * degree_centrality.get(node, 0.0) +
                0.3 * betweenness_centrality.get(node, 0.0) +
                0.2 * pagerank.get(node, 0.0) +
                0.2 * importance
            )
            centralities[node] = combined_score

        # Sort by centrality score
        sorted_nodes = sorted(centralities.items(), key=lambda x: x[1], reverse=True)
        return sorted_nodes[:top_k]

    def filter_graph(self,
                     entity_types: List[str] = None,
                     min_importance: float = None,
                     relationship_types: List[str] = None) -> nx.DiGraph:
        """Filter graph by entity types, importance, or relationship types."""
        filtered_graph = self.graph.copy()

        # Filter nodes by type and importance
        nodes_to_remove = []
        for node, data in filtered_graph.nodes(data=True):
            if entity_types and data.get('type') not in entity_types:
                nodes_to_remove.append(node)
            elif min_importance and data.get('importance', 0.0) < min_importance:
                nodes_to_remove.append(node)

        filtered_graph.remove_nodes_from(nodes_to_remove)

        # Filter edges by relationship type
        if relationship_types:
            edges_to_remove = []
            for u, v, data in filtered_graph.edges(data=True):
                if data.get('relationship') not in relationship_types:
                    edges_to_remove.append((u, v))
            filtered_graph.remove_edges_from(edges_to_remove)

        return filtered_graph

    def export_graph(self, format_type: str = "json") -> str:
        """Export graph in various formats."""
        if format_type.lower() == "json":
            return self._export_json()
        elif format_type.lower() == "graphml":
            return self._export_graphml()
        elif format_type.lower() == "gexf":
            return self._export_gexf()
        else:
            raise ValueError(f"Unsupported export format: {format_type}")

    def _export_json(self) -> str:
        """Export graph as JSON."""
        data = {
            "nodes": [],
            "edges": []
        }

        # Export nodes
        for node, attrs in self.graph.nodes(data=True):
            node_data = {"id": node}
            node_data.update(attrs)
            data["nodes"].append(node_data)

        # Export edges
        for u, v, attrs in self.graph.edges(data=True):
            edge_data = {"source": u, "target": v}
            edge_data.update(attrs)
            data["edges"].append(edge_data)

        return json.dumps(data, indent=2)

    def _export_graphml(self) -> str:
        """Export graph as GraphML."""
        import io
        output = io.StringIO()
        nx.write_graphml(self.graph, output)
        return output.getvalue()

    def _export_gexf(self) -> str:
        """Export graph as GEXF."""
        import io
        output = io.StringIO()
        nx.write_gexf(self.graph, output)
        return output.getvalue()

    def get_subgraph_around_node(self, node: str, radius: int = 1) -> nx.DiGraph:
        """Get subgraph within specified radius of a node."""
        if node not in self.graph:
            return nx.DiGraph()

        # Get nodes within radius
        nodes_in_radius = set([node])
        current_nodes = set([node])

        for _ in range(radius):
            next_nodes = set()
            for n in current_nodes:
                # Add neighbors (both incoming and outgoing)
                next_nodes.update(self.graph.successors(n))
                next_nodes.update(self.graph.predecessors(n))

            nodes_in_radius.update(next_nodes)
            current_nodes = next_nodes - nodes_in_radius

            if not current_nodes:
                break

        return self.graph.subgraph(nodes_in_radius).copy()

    def get_shortest_path(self, source: str, target: str) -> List[str]:
        """Get shortest path between two nodes."""
        try:
            # Convert to undirected for path finding
            undirected = self.graph.to_undirected()
            return nx.shortest_path(undirected, source, target)
        except (nx.NetworkXNoPath, nx.NodeNotFound):
            return []

    def get_node_info(self, node: str) -> Dict[str, Any]:
        """Get detailed information about a specific node."""
        if node not in self.graph:
            return {}

        node_data = dict(self.graph.nodes[node])

        # Add connectivity information
        predecessors = list(self.graph.predecessors(node))
        successors = list(self.graph.successors(node))

        node_data.update({
            "in_degree": self.graph.in_degree(node),
            "out_degree": self.graph.out_degree(node),
            "predecessors": predecessors,
            "successors": successors,
            "total_connections": len(predecessors) + len(successors)
        })

        return node_data
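The following is a minimal usage sketch of the GraphBuilder class above. The import path is assumed from this repo layout (PYTHONPATH includes /app), and the sample entities and relationships are made up to match the dictionary shape the extractor is expected to produce.

from src.graph_builder import GraphBuilder  # import path assumed

entities = [
    {"name": "John Smith", "type": "PERSON", "importance": 0.9, "description": "Engineer"},
    {"name": "TechCorp", "type": "ORGANIZATION", "importance": 0.8, "description": "Company"},
]
relationships = [
    {"source": "John Smith", "target": "TechCorp", "relationship": "works_at", "description": "Employment"},
]

builder = GraphBuilder()
graph = builder.build_graph(entities, relationships)

print(builder.get_graph_statistics())      # node/edge counts, density, connectivity
print(builder.get_central_nodes(top_k=3))  # combined centrality ranking
print(builder.export_graph("json")[:200])  # preview of the JSON export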
src/llm_extractor.py
ADDED
@@ -0,0 +1,222 @@
import json
import requests
from typing import List, Dict, Any, Optional
from config.settings import Config

class LLMExtractor:
    def __init__(self):
        self.config = Config()
        self.headers = {
            "Authorization": f"Bearer {self.config.OPENROUTER_API_KEY}",
            "Content-Type": "application/json"
        }

    def extract_entities_and_relationships(self, text: str) -> Dict[str, Any]:
        """Extract entities and relationships from text using LLM."""
        prompt = self._create_extraction_prompt(text)

        try:
            response = self._call_openrouter_api(prompt, self.config.EXTRACTION_MODEL)
            result = self._parse_extraction_response(response)
            return result
        except Exception as e:
            # Try backup model
            try:
                response = self._call_openrouter_api(prompt, self.config.BACKUP_MODEL)
                result = self._parse_extraction_response(response)
                return result
            except Exception as backup_e:
                return {
                    "entities": [],
                    "relationships": [],
                    "error": f"Primary: {str(e)}, Backup: {str(backup_e)}"
                }

    def _create_extraction_prompt(self, text: str) -> str:
        """Create prompt for entity and relationship extraction."""
        return f"""
You are an expert knowledge graph extraction system. Analyze the following text and extract:

1. ENTITIES: Important people, organizations, locations, concepts, events, objects, etc.
2. RELATIONSHIPS: How these entities relate to each other
3. IMPORTANCE SCORES: Rate each entity's importance from 0.0 to 1.0 based on how central it is to the text

For each entity, provide:
- name: The entity name (standardized/canonical form)
- type: The entity type (PERSON, ORGANIZATION, LOCATION, CONCEPT, EVENT, OBJECT, etc.)
- importance: Score from 0.0 to 1.0
- description: Brief description of the entity's role/significance

For each relationship, provide:
- source: Source entity name
- target: Target entity name
- relationship: Type of relationship (works_at, located_in, part_of, causes, etc.)
- description: Brief description of the relationship

Only respond with a valid JSON object with this structure and nothing else. Your response must be valid, parsable JSON!!
=== JSON STRUCTURE FOR RESPONSE / RESPONSE FORMAT ===
{{
  "entities": [
    {{
      "name": "entity_name",
      "type": "ENTITY_TYPE",
      "importance": 0.8,
      "description": "Brief description"
    }}
  ],
  "relationships": [
    {{
      "source": "entity1",
      "target": "entity2",
      "relationship": "relationship_type",
      "description": "Brief description"
    }}
  ]
}}
=== END OF JSON STRUCTURE FOR RESPONSE / END OF RESPONSE FORMAT ===

TEXT TO ANALYZE:
{text}

Reply in valid json using the format above!
JSON OUTPUT:
"""

    def _call_openrouter_api(self, prompt: str, model: str) -> str:
        """Make API call to OpenRouter."""
        if not self.config.OPENROUTER_API_KEY:
            raise ValueError("OpenRouter API key not configured")

        payload = {
            "model": model,
            "messages": [
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            "max_tokens": 2048,
            "temperature": 0.1
        }

        response = requests.post(
            f"{self.config.OPENROUTER_BASE_URL}/chat/completions",
            headers=self.headers,
            json=payload,
            timeout=60
        )

        if response.status_code != 200:
            raise Exception(f"API call failed: {response.status_code} - {response.text}")

        result = response.json()
        if "choices" not in result or not result["choices"]:
            raise Exception("Invalid API response format")

        return result["choices"][0]["message"]["content"]

    def _parse_extraction_response(self, response: str) -> Dict[str, Any]:
        """Parse the LLM response into structured data."""
        try:
            # Try to find JSON in the response
            start_idx = response.find("{")
            end_idx = response.rfind("}") + 1

            if start_idx == -1 or end_idx == 0:
                raise ValueError("No JSON found in response")

            json_str = response[start_idx:end_idx]
            data = json.loads(json_str)

            # Validate structure
            if "entities" not in data:
                data["entities"] = []
            if "relationships" not in data:
                data["relationships"] = []

            # Filter entities by importance threshold
            filtered_entities = [
                entity for entity in data["entities"]
                if entity.get("importance", 0) >= self.config.ENTITY_IMPORTANCE_THRESHOLD
            ]

            # Limit number of entities and relationships
            data["entities"] = filtered_entities[:self.config.MAX_ENTITIES]
            data["relationships"] = data["relationships"][:self.config.MAX_RELATIONSHIPS]

            return data

        except json.JSONDecodeError as e:
            return {
                "entities": [],
                "relationships": [],
                "error": f"JSON parsing error: {str(e)}"
            }
        except Exception as e:
            return {
                "entities": [],
                "relationships": [],
                "error": f"Response parsing error: {str(e)}"
            }

    def process_chunks(self, chunks: List[str]) -> Dict[str, Any]:
        """Process multiple text chunks and combine results."""
        all_entities = []
        all_relationships = []
        errors = []

        for i, chunk in enumerate(chunks):
            try:
                result = self.extract_entities_and_relationships(chunk)

                if "error" in result:
                    errors.append(f"Chunk {i+1}: {result['error']}")
                    continue

                all_entities.extend(result.get("entities", []))
                all_relationships.extend(result.get("relationships", []))

            except Exception as e:
                errors.append(f"Chunk {i+1}: {str(e)}")

        # Deduplicate and standardize entities
        unique_entities = self._deduplicate_entities(all_entities)

        # Validate relationships against existing entities
        valid_relationships = self._validate_relationships(all_relationships, unique_entities)

        return {
            "entities": unique_entities,
            "relationships": valid_relationships,
            "errors": errors if errors else None
        }

    def _deduplicate_entities(self, entities: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Remove duplicate entities and merge similar ones."""
        seen_names = set()
        unique_entities = []

        for entity in entities:
            name = entity.get("name", "").lower().strip()
            if name and name not in seen_names:
                seen_names.add(name)
                unique_entities.append(entity)

        # Sort by importance
        unique_entities.sort(key=lambda x: x.get("importance", 0), reverse=True)

        return unique_entities[:self.config.MAX_ENTITIES]

    def _validate_relationships(self, relationships: List[Dict[str, Any]], entities: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Validate that relationships reference existing entities."""
        entity_names = {entity.get("name", "").lower() for entity in entities}
        valid_relationships = []

        for rel in relationships:
            source = rel.get("source", "").lower()
            target = rel.get("target", "").lower()

            if source in entity_names and target in entity_names:
                valid_relationships.append(rel)

        return valid_relationships[:self.config.MAX_RELATIONSHIPS]
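Here is a minimal sketch of driving LLMExtractor over pre-chunked text. It assumes config/settings.py defines a Config with OPENROUTER_API_KEY, OPENROUTER_BASE_URL, EXTRACTION_MODEL, BACKUP_MODEL, ENTITY_IMPORTANCE_THRESHOLD, MAX_ENTITIES and MAX_RELATIONSHIPS, as referenced by the class above; the chunk contents and import path are placeholders.

from src.llm_extractor import LLMExtractor  # import path assumed

chunks = [
    "John Smith is a software engineer at TechCorp in San Francisco.",
    "TechCorp partners with Stanford University on NLP research.",
]

extractor = LLMExtractor()           # requires a configured OpenRouter API key
result = extractor.process_chunks(chunks)

print(len(result["entities"]), "entities")
print(len(result["relationships"]), "relationships")
if result.get("errors"):
    print("Chunk-level errors:", result["errors"])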
src/streamlit_app.py
DELETED
@@ -1,40 +0,0 @@
import altair as alt
import numpy as np
import pandas as pd
import streamlit as st

"""
# Welcome to Streamlit!

Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
forums](https://discuss.streamlit.io).

In the meantime, below is an example of what you can do with just a few lines of code:
"""

num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
num_turns = st.slider("Number of turns in spiral", 1, 300, 31)

indices = np.linspace(0, 1, num_points)
theta = 2 * np.pi * num_turns * indices
radius = indices

x = radius * np.cos(theta)
y = radius * np.sin(theta)

df = pd.DataFrame({
    "x": x,
    "y": y,
    "idx": indices,
    "rand": np.random.randn(num_points),
})

st.altair_chart(alt.Chart(df, height=700, width=700)
    .mark_point(filled=True)
    .encode(
        x=alt.X("x", axis=None),
        y=alt.Y("y", axis=None),
        color=alt.Color("idx", legend=None, scale=alt.Scale()),
        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
    ))
src/visualizer.py
ADDED
@@ -0,0 +1,497 @@
import matplotlib
matplotlib.use('Agg')  # Use non-interactive backend to avoid GUI issues
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
from typing import Dict, List, Any, Tuple, Optional
import json
import io
import base64
import tempfile
import os
import plotly.graph_objects as go
import plotly.express as px
from pyvis.network import Network

class GraphVisualizer:
    def __init__(self):
        self.color_map = {
            'PERSON': '#FF6B6B',
            'ORGANIZATION': '#4ECDC4',
            'LOCATION': '#45B7D1',
            'CONCEPT': '#96CEB4',
            'EVENT': '#FFEAA7',
            'OBJECT': '#DDA0DD',
            'UNKNOWN': '#95A5A6'
        }

    def visualize_graph(self,
                        graph: nx.DiGraph,
                        layout_type: str = "spring",
                        show_labels: bool = True,
                        show_edge_labels: bool = False,
                        node_size_factor: float = 1.0,
                        figsize: Tuple[int, int] = (12, 8)) -> str:
        """Create a matplotlib visualization of the graph and return file path."""

        if not graph.nodes():
            return self._create_empty_graph_image()

        # Create figure
        plt.figure(figsize=figsize)
        plt.clf()

        # Calculate layout
        pos = self._calculate_layout(graph, layout_type)

        # Get node properties
        node_colors = [self.color_map.get(graph.nodes[node].get('type', 'UNKNOWN'), '#95A5A6')
                       for node in graph.nodes()]
        node_sizes = [graph.nodes[node].get('size', 20) * node_size_factor * 10
                      for node in graph.nodes()]

        # Draw nodes
        nx.draw_networkx_nodes(graph, pos,
                               node_color=node_colors,
                               node_size=node_sizes,
                               alpha=0.8)

        # Draw edges
        nx.draw_networkx_edges(graph, pos,
                               edge_color='gray',
                               arrows=True,
                               arrowsize=20,
                               alpha=0.6,
                               width=1.5)

        # Draw labels
        if show_labels:
            # Create labels with importance scores
            labels = {}
            for node in graph.nodes():
                importance = graph.nodes[node].get('importance', 0.0)
                labels[node] = f"{node}\n({importance:.2f})"

            nx.draw_networkx_labels(graph, pos, labels, font_size=8)

        # Draw edge labels
        if show_edge_labels:
            edge_labels = {(u, v): data.get('relationship', '')
                           for u, v, data in graph.edges(data=True)}
            nx.draw_networkx_edge_labels(graph, pos, edge_labels, font_size=6)

        plt.title("Knowledge Graph", fontsize=16, fontweight='bold')
        plt.axis('off')
        plt.tight_layout()

        # Save to temporary file
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.png')
        plt.savefig(temp_file.name, format='png', dpi=150, bbox_inches='tight')
        plt.close()

        return temp_file.name

    def _calculate_layout(self, graph: nx.DiGraph, layout_type: str) -> Dict[str, Tuple[float, float]]:
        """Calculate node positions using specified layout algorithm."""
        try:
            if layout_type == "spring":
                return nx.spring_layout(graph, k=1, iterations=50)
            elif layout_type == "circular":
                return nx.circular_layout(graph)
            elif layout_type == "shell":
                return nx.shell_layout(graph)
            elif layout_type == "kamada_kawai":
                return nx.kamada_kawai_layout(graph)
            elif layout_type == "random":
                return nx.random_layout(graph)
            else:
                return nx.spring_layout(graph, k=1, iterations=50)
        except:
            # Fallback to simple layout if algorithm fails
            return nx.spring_layout(graph, k=1, iterations=50)

    def _create_empty_graph_image(self) -> str:
        """Create an image for empty graph."""
        plt.figure(figsize=(8, 6))
        plt.text(0.5, 0.5, 'No graph data to display',
                 horizontalalignment='center', verticalalignment='center',
                 fontsize=16, transform=plt.gca().transAxes)
        plt.axis('off')

        # Save to temporary file
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.png')
        plt.savefig(temp_file.name, format='png', dpi=150, bbox_inches='tight')
        plt.close()

        return temp_file.name

    def create_interactive_html(self, graph: nx.DiGraph) -> str:
        """Create an interactive HTML visualization using vis.js."""
        if not graph.nodes():
            return "<div>No graph data to display</div>"

        # Convert graph to vis.js format
        nodes = []
        edges = []

        for node, data in graph.nodes(data=True):
            nodes.append({
                "id": node,
                "label": node,
                "color": self.color_map.get(data.get('type', 'UNKNOWN'), '#95A5A6'),
                "size": data.get('size', 20),
                "title": f"Type: {data.get('type', 'UNKNOWN')}<br>"
                         f"Importance: {data.get('importance', 0.0):.2f}<br>"
                         f"Description: {data.get('description', 'N/A')}"
            })

        for u, v, data in graph.edges(data=True):
            edges.append({
                "from": u,
                "to": v,
                "label": data.get('relationship', ''),
                "title": data.get('description', ''),
                "arrows": {"to": {"enabled": True}}
            })

        html_template = f"""
<!DOCTYPE html>
<html>
<head>
    <script src="https://unpkg.com/vis-network/standalone/umd/vis-network.min.js"></script>
    <style>
        #mynetworkid {{
            width: 100%;
            height: 600px;
            border: 1px solid lightgray;
        }}
    </style>
</head>
<body>
    <div id="mynetworkid"></div>

    <script>
        var nodes = new vis.DataSet({json.dumps(nodes)});
        var edges = new vis.DataSet({json.dumps(edges)});
        var container = document.getElementById('mynetworkid');

        var data = {{
            nodes: nodes,
            edges: edges
        }};

        var options = {{
            nodes: {{
                shape: 'dot',
                scaling: {{
                    min: 10,
                    max: 30
                }},
                font: {{
                    size: 12,
                    face: 'Tahoma'
                }}
            }},
            edges: {{
                font: {{align: 'middle'}},
                color: {{color: 'gray'}},
                arrows: {{to: {{enabled: true, scaleFactor: 1}}}}
            }},
            physics: {{
                enabled: true,
                stabilization: {{enabled: true, iterations: 200}}
            }},
            interaction: {{
                hover: true,
                tooltipDelay: 200
            }}
        }};

        var network = new vis.Network(container, data, options);
    </script>
</body>
</html>
"""

        return html_template

    def create_statistics_summary(self, graph: nx.DiGraph, stats: Dict[str, Any]) -> str:
        """Create a formatted statistics summary."""
        if not graph.nodes():
            return "No graph statistics available."

        # Entity type distribution
        type_counts = {}
        for node, data in graph.nodes(data=True):
            node_type = data.get('type', 'UNKNOWN')
            type_counts[node_type] = type_counts.get(node_type, 0) + 1

        # Relationship type distribution
        rel_counts = {}
        for u, v, data in graph.edges(data=True):
            rel_type = data.get('relationship', 'unknown')
            rel_counts[rel_type] = rel_counts.get(rel_type, 0) + 1

        summary = f"""
## Graph Statistics

**Basic Metrics:**
- Nodes: {stats['num_nodes']}
- Edges: {stats['num_edges']}
- Density: {stats['density']:.3f}
- Connected: {'Yes' if stats['is_connected'] else 'No'}
- Components: {stats['num_components']}
- Average Degree: {stats['avg_degree']:.2f}

**Entity Types:**
"""

        for entity_type, count in sorted(type_counts.items()):
            summary += f"\n- {entity_type}: {count}"

        summary += "\n\n**Relationship Types:**"
        for rel_type, count in sorted(rel_counts.items()):
            summary += f"\n- {rel_type}: {count}"

        return summary

    def create_entity_list(self, graph: nx.DiGraph, sort_by: str = "importance") -> str:
        """Create a formatted list of entities."""
        if not graph.nodes():
            return "No entities found."

        entities = []
        for node, data in graph.nodes(data=True):
            entities.append({
                'name': node,
                'type': data.get('type', 'UNKNOWN'),
                'importance': data.get('importance', 0.0),
                'description': data.get('description', 'N/A'),
                'connections': graph.degree(node)
            })

        # Sort entities
        if sort_by == "importance":
            entities.sort(key=lambda x: x['importance'], reverse=True)
        elif sort_by == "connections":
            entities.sort(key=lambda x: x['connections'], reverse=True)
        elif sort_by == "name":
            entities.sort(key=lambda x: x['name'])

        entity_list = "## Entities\n\n"
        for entity in entities:
            entity_list += f"""
**{entity['name']}** ({entity['type']})
- Importance: {entity['importance']:.2f}
- Connections: {entity['connections']}
- Description: {entity['description']}

"""

        return entity_list

    def get_layout_options(self) -> List[str]:
        """Get available layout options."""
        return ["spring", "circular", "shell", "kamada_kawai", "random"]

    def get_entity_types(self, graph: nx.DiGraph) -> List[str]:
        """Get unique entity types from the graph."""
        types = set()
        for node, data in graph.nodes(data=True):
            types.add(data.get('type', 'UNKNOWN'))
        return sorted(list(types))

    def create_plotly_interactive(self, graph: nx.DiGraph, layout_type: str = "spring") -> go.Figure:
        """Create an interactive Plotly visualization of the graph."""
        if not graph.nodes():
            # Return empty figure
            fig = go.Figure()
            fig.add_annotation(
                text="No graph data to display",
                xref="paper", yref="paper",
                x=0.5, y=0.5, xanchor='center', yanchor='middle',
                showarrow=False, font=dict(size=16)
            )
            return fig

        # Calculate layout
        pos = self._calculate_layout(graph, layout_type)

        # Prepare node data
        node_x = []
        node_y = []
        node_text = []
        node_info = []
        node_colors = []
        node_sizes = []

        for node in graph.nodes():
            x, y = pos[node]
            node_x.append(x)
            node_y.append(y)

            data = graph.nodes[node]
            node_type = data.get('type', 'UNKNOWN')
            importance = data.get('importance', 0.0)
            description = data.get('description', 'N/A')
            connections = graph.degree(node)

            node_text.append(node)
            node_info.append(
                f"<b>{node}</b><br>"
                f"Type: {node_type}<br>"
                f"Importance: {importance:.2f}<br>"
                f"Connections: {connections}<br>"
                f"Description: {description}"
            )
            node_colors.append(self.color_map.get(node_type, '#95A5A6'))
            node_sizes.append(max(10, data.get('size', 20)))

        # Prepare edge data
        edge_x = []
        edge_y = []
        edge_info = []

        for edge in graph.edges():
            x0, y0 = pos[edge[0]]
            x1, y1 = pos[edge[1]]
            edge_x.extend([x0, x1, None])
            edge_y.extend([y0, y1, None])

            edge_data = graph.edges[edge]
            relationship = edge_data.get('relationship', 'connected')
            edge_info.append(f"{edge[0]} → {edge[1]}<br>Relationship: {relationship}")

        # Create edge trace
        edge_trace = go.Scatter(
            x=edge_x, y=edge_y,
            line=dict(width=2, color='gray'),
            hoverinfo='none',
            mode='lines'
        )

        # Create node trace
        node_trace = go.Scatter(
            x=node_x, y=node_y,
            mode='markers+text',
            hoverinfo='text',
            text=node_text,
            hovertext=node_info,
            textposition="middle center",
            marker=dict(
                size=node_sizes,
                color=node_colors,
                line=dict(width=2, color='white')
            )
        )

        # Create figure
        fig = go.Figure(data=[edge_trace, node_trace],
                        layout=go.Layout(
                            title='Interactive Knowledge Graph',
                            titlefont_size=16,
                            showlegend=False,
                            hovermode='closest',
                            margin=dict(b=20, l=5, r=5, t=40),
                            annotations=[dict(
                                text="Hover over nodes for details. Drag to pan, scroll to zoom.",
                                showarrow=False,
                                xref="paper", yref="paper",
                                x=0.005, y=-0.002,
                                xanchor='left', yanchor='bottom',
                                font=dict(color="gray", size=12)
                            )],
                            xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                            yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                            plot_bgcolor='white'
                        ))

        return fig

    def create_pyvis_interactive(self, graph: nx.DiGraph, layout_type: str = "spring") -> str:
        """Create an interactive pyvis visualization and return HTML file path."""
        if not graph.nodes():
            return self._create_empty_pyvis_graph()

        # Create pyvis network
        net = Network(height="600px", width="100%", bgcolor="#ffffff", font_color="black")

        # Configure physics
        net.set_options("""
        {
          "physics": {
            "enabled": true,
            "stabilization": {"enabled": true, "iterations": 200},
            "barnesHut": {
              "gravitationalConstant": -2000,
              "centralGravity": 0.3,
              "springLength": 95,
              "springConstant": 0.04,
              "damping": 0.09
            }
          },
          "interaction": {
            "hover": true,
            "tooltipDelay": 200,
            "hideEdgesOnDrag": false
          }
        }
        """)

        # Add nodes
        for node, data in graph.nodes(data=True):
            node_type = data.get('type', 'UNKNOWN')
            importance = data.get('importance', 0.0)
            description = data.get('description', 'N/A')
            connections = graph.degree(node)

            # Node properties
            color = self.color_map.get(node_type, '#95A5A6')
            size = max(10, data.get('size', 20))

            # Tooltip text
            title = f"""
            <b>{node}</b><br>
            Type: {node_type}<br>
            Importance: {importance:.2f}<br>
            Connections: {connections}<br>
            Description: {description}
            """

            net.add_node(node, label=node, title=title, color=color, size=size)

        # Add edges
        for u, v, data in graph.edges(data=True):
            relationship = data.get('relationship', 'connected')
            title = f"{u} → {v}<br>Relationship: {relationship}"

            net.add_edge(u, v, title=title, arrows="to", color="gray")

        # Save to temporary file
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.html', mode='w')
        net.save_graph(temp_file.name)
        temp_file.close()

        return temp_file.name

    def _create_empty_pyvis_graph(self) -> str:
        """Create an empty pyvis graph."""
        net = Network(height="600px", width="100%", bgcolor="#ffffff", font_color="black")
        net.add_node(1, label="No graph data", color="#cccccc")

        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.html', mode='w')
        net.save_graph(temp_file.name)
        temp_file.close()

        return temp_file.name

    def get_visualization_options(self) -> List[str]:
        """Get available visualization types."""
        return ["matplotlib", "plotly", "pyvis", "vis.js"]

    def get_relationship_types(self, graph: nx.DiGraph) -> List[str]:
        """Get unique relationship types from the graph."""
        types = set()
        for u, v, data in graph.edges(data=True):
            types.add(data.get('relationship', 'unknown'))
        return sorted(list(types))
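Below is a minimal sketch combining GraphBuilder and GraphVisualizer from this commit. The import paths and sample data are assumptions for illustration; the visualizer methods return paths to temporary files or a Plotly figure, as defined above.

from src.graph_builder import GraphBuilder      # import paths assumed
from src.visualizer import GraphVisualizer

builder = GraphBuilder()
graph = builder.build_graph(
    [{"name": "TechCorp", "type": "ORGANIZATION", "importance": 0.8, "description": "Company"},
     {"name": "San Francisco", "type": "LOCATION", "importance": 0.5, "description": "City"}],
    [{"source": "TechCorp", "target": "San Francisco", "relationship": "located_in", "description": ""}],
)

viz = GraphVisualizer()
png_path = viz.visualize_graph(graph, layout_type="spring")  # static matplotlib PNG
fig = viz.create_plotly_interactive(graph)                   # interactive Plotly figure
html_path = viz.create_pyvis_interactive(graph)              # standalone pyvis HTML
print(png_path, html_path)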
test_document.txt
ADDED
@@ -0,0 +1,15 @@
Knowledge Graph Test Document

This is a test document for the knowledge graph extraction application. The document discusses several important entities and their relationships.

John Smith is a software engineer at TechCorp, a technology company located in San Francisco. He works on artificial intelligence projects and collaborates with Sarah Johnson, who is the head of research at the same company.

TechCorp was founded in 2015 and specializes in machine learning solutions. The company has partnerships with several universities, including Stanford University and MIT, for collaborative research on natural language processing.

The artificial intelligence field has seen rapid advancement in recent years, particularly in areas like deep learning and neural networks. These technologies are revolutionizing industries such as healthcare, finance, and autonomous vehicles.

Sarah Johnson previously worked at DataSystems Inc, another technology company, before joining TechCorp in 2018. She holds a PhD in Computer Science from Stanford University and has published numerous papers on machine learning algorithms.

The collaboration between TechCorp and Stanford University has resulted in several breakthrough innovations in conversational AI and knowledge representation systems. These projects are funded by government grants and private investments.

Machine learning and artificial intelligence are closely related concepts that form the foundation of modern data science and automation technologies.
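For reference, a rough end-to-end sketch over test_document.txt using only the classes added in this commit. The paragraph-based splitting below is a stand-in for the document processor's chunking and the import paths are assumptions; running it requires a configured OpenRouter API key.

from src.llm_extractor import LLMExtractor
from src.graph_builder import GraphBuilder
from src.visualizer import GraphVisualizer

# Naive chunking: one chunk per paragraph of the test document.
with open("test_document.txt", "r", encoding="utf-8") as f:
    paragraphs = [p.strip() for p in f.read().split("\n\n") if p.strip()]

extraction = LLMExtractor().process_chunks(paragraphs)
graph = GraphBuilder().build_graph(extraction["entities"], extraction["relationships"])
html_path = GraphVisualizer().create_pyvis_interactive(graph)
print("Interactive graph written to", html_path)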