Spaces:
Paused
Paused
initial commit
Browse files- .gitignore +2 -0
- .python-version +1 -0
- README.md +41 -1
- __pycache__/config.cpython-310.pyc +0 -0
- __pycache__/retrieval.cpython-310.pyc +0 -0
- app.py +17 -0
- config.py +15 -0
- data_ingestion.py +69 -0
- ingest_papers.py +9 -0
- ingestion-flow-diagram.mermaid +15 -0
- requirements.txt +51 -0
- retrieval-flow-diagram.mermaid +26 -0
- retrieval.py +30 -0
.gitignore
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
.venv/
|
2 |
+
.env
|
.python-version
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
3.10
|
README.md
CHANGED
@@ -10,4 +10,44 @@ pinned: false
|
|
10 |
license: cc-by-4.0
|
11 |
---
|
12 |
|
13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
license: cc-by-4.0
|
11 |
---
|
12 |
|
13 |
+
# arXiv RAG System README
|
14 |
+
|
15 |
+
## Key Stakeholder
|
16 |
+
The primary stakeholder for this system is an agentic "System of Agents". This design choice emphasizes the need for modularity, flexibility, and the ability for the system to evolve and improve its own processes.
|
17 |
+
|
18 |
+
## Architectural Vision
|
19 |
+
- The system is designed with modularity in mind, using a microservices architecture to allow easy replacement of specific components (libraries, applications, LLMs).
|
20 |
+
- A Hugging Face dataset is used to store metadata and interim results for retrieved documents, crucial for avoiding repetitive and costly processing.
|
21 |
+
- The system captures and tracks the history of document reviews, summarizations, and evaluations performed by the agents.
|
22 |
+
|
23 |
+
## Key Architectural Choices
|
24 |
+
|
25 |
+
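1. **Document Loading**: PyMuPDF (via LangChain's `PyMuPDFLoader`) for efficient PDF text extraction. Image extraction is part of the design, but it is not currently enabled in `process_pdf`.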
|
26 |
+
|
27 |
+
2. **Text Splitting**: RecursiveCharacterTextSplitter for content chunking.
|
28 |
+
|
29 |
+
3. **Embedding Model**: OpenAI's text-embedding-3-small for generating embeddings.
|
30 |
+
|
31 |
+
4. **Vector Database**: Qdrant for storing and retrieving embeddings.
|
32 |
+
|
33 |
+
5. **Retrieval Mechanism**: Similarity search with cosine similarity threshold of 0.5 and k=5.
|
34 |
+
|
35 |
+
6. **Language Model**: Llama3 70B via Groq API for response generation.
|
36 |
+
|
37 |
+
7. **Orchestration**: LangChain/LCEL for RAG pipeline orchestration.
|
38 |
+
|
39 |
+
8. **User Interface**: Gradio for demonstration purposes (note: primary interface is for the agentic system).
|
40 |
+
|
41 |
+
9. **Logging and Monitoring**: LangSmith for comprehensive logging and LLM operations monitoring.
|
42 |
+
|
43 |
+
10. **Metadata and Results Storage**: Hugging Face dataset for storing document metadata, interim results, and agent review history.
|
44 |
+
|
45 |
+
## Sequence Diagrams
|
46 |
+
|
47 |
+
### 1. Ingestion Flow
|
48 |
+
|
49 |
+
[Mermaid Diagram - Ingestion Flow](ingestion-flow-diagram.mermaid)
|
50 |
+
|
51 |
+
### 2. Retrieval Flow
|
52 |
+
|
53 |
+
[Mermaid Diagram - Retrieval Flow](retrieval-flow-diagram.mermaid)
|
__pycache__/config.cpython-310.pyc
ADDED
Binary file (586 Bytes). View file
|
|
__pycache__/retrieval.cpython-310.pyc
ADDED
Binary file (1.04 kB). View file
|
|
app.py
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# File: app.py
|
2 |
+
import gradio as gr
|
3 |
+
from retrieval import rag_query
|
4 |
+
|
5 |
+
def gradio_interface(query: str) -> str:
    """Gradio callback: answer ``query`` by delegating to ``rag_query``.

    Args:
        query: The question typed into the Gradio text box.

    Returns:
        Whatever ``rag_query`` returns for that question.
    """
    answer = rag_query(query)
    return answer
|
7 |
+
|
8 |
+
# Text-in / text-out demo UI; every submitted query is routed through
# gradio_interface.
iface = gr.Interface(
    title="arXiv RAG System",
    description="Ask questions about arXiv papers",
    fn=gradio_interface,
    inputs="text",
    outputs="text",
)


# Start the Gradio server only when this file is executed as a script, so that
# importing the module does not launch anything.
if __name__ == "__main__":
    iface.launch()
|
config.py
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# File: config.py
|
2 |
+
import os
|
3 |
+
|
4 |
+
# Credentials and endpoints are read from the environment so that no secret is
# committed to the repository. Each value is None when its variable is unset.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
QDRANT_API_URL = os.getenv("QDRANT_API_URL")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

# Qdrant collection that stores the chunk embeddings.
COLLECTION_NAME = "arxiv_papers"
# Hugging Face Hub dataset that mirrors chunk text, metadata and embeddings.
DATASET_NAME = "dwb2023/arxiv-papers-dataset"  # Replace with your actual dataset name

# LangSmith / LangChain Hub settings.
LANGCHAIN_PROJECT = "arxiv_papers"
LANGCHAIN_ENDPOINT = "https://api.smith.langchain.com"
LANGCHAIN_TRACING_V2 = "true"
LANGCHAIN_HUB_PROMPT = "rlm/rag-prompt-llama3"
LANGCHAIN_API_KEY = os.getenv("LANGCHAIN_API_KEY")

# LangSmith picks its configuration up from environment variables, not from
# the module attributes above, so assigning them here alone never turns
# tracing on. Export them, but only when an API key is available (tracing
# without a key would just produce authentication errors) and without
# overriding anything the deployment has already set.
if LANGCHAIN_API_KEY:
    os.environ.setdefault("LANGCHAIN_PROJECT", LANGCHAIN_PROJECT)
    os.environ.setdefault("LANGCHAIN_ENDPOINT", LANGCHAIN_ENDPOINT)
    os.environ.setdefault("LANGCHAIN_TRACING_V2", LANGCHAIN_TRACING_V2)
|
data_ingestion.py
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
# File: data_ingestion.py
|
3 |
+
import arxiv
|
4 |
+
import io
|
5 |
+
import requests
|
6 |
+
from typing import List, Dict, Any
|
7 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
8 |
+
from langchain_openai import OpenAIEmbeddings
|
9 |
+
from langchain_qdrant import Qdrant
|
10 |
+
from datasets import load_dataset, Dataset
|
11 |
+
from langchain_community.document_loaders import PyMuPDFLoader
|
12 |
+
from config import *
|
13 |
+
|
14 |
+
# Shared embedding client for everything this module writes. retrieval.py
# must query with the same model for the stored vectors to be comparable.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Chunking policy applied to the full text of each paper.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
|
16 |
+
|
17 |
+
def fetch_arxiv_metadata(query: str, max_results: int = 10) -> List[Dict[str, Any]]:
    """Search arXiv and return one JSON-friendly metadata dict per paper.

    Args:
        query: arXiv search query string.
        max_results: Upper bound on the number of papers to return. Values
            of zero or below yield an empty list without contacting arXiv.

    Returns:
        A list of dicts with the keys ``title``, ``authors`` (list of names),
        ``published`` and ``updated`` (ISO-8601 strings), ``pdf_url``,
        ``entry_id`` and ``summary``.
    """
    if max_results <= 0:
        # Nothing was requested; this also avoids configuring the client
        # with a non-positive page size.
        return []

    # The arXiv API serves results in slices of at most 2000 per request, so
    # the page size is clamped while max_results still bounds the total.
    arxiv_max_page_size = 2000
    client = arxiv.Client(
        page_size=min(max_results, arxiv_max_page_size),
        delay_seconds=3,
        num_retries=3,
    )
    search = arxiv.Search(query=query, max_results=max_results)

    return [
        {
            "title": result.title,
            "authors": [author.name for author in result.authors],
            # Datetimes are stored as strings so the dict stays serialisable.
            "published": result.published.isoformat(),
            "updated": result.updated.isoformat(),
            "pdf_url": result.pdf_url,
            "entry_id": result.entry_id,
            "summary": result.summary,
        }
        for result in client.results(search)
    ]
|
33 |
+
|
34 |
+
def process_pdf(pdf_url: str) -> str:
    """Load the PDF at ``pdf_url`` with ``PyMuPDFLoader`` and return its text.

    Args:
        pdf_url: Location of the PDF handed to the loader.

    Returns:
        The ``page_content`` of every loaded document, joined with newlines
        in the order the loader produced them.
    """
    pages = PyMuPDFLoader(pdf_url).load()
    page_texts = [doc.page_content for doc in pages]
    return "\n".join(page_texts)
|
38 |
+
|
39 |
+
def ingest_documents(metadata_list: List[Dict[str, Any]]):
    """Chunk, embed and store the papers described by ``metadata_list``.

    For each paper the PDF is downloaded and split into chunks; the chunks
    are written to the Qdrant collection ``COLLECTION_NAME`` and mirrored
    (text, metadata, embedding) into the Hugging Face dataset
    ``DATASET_NAME``.

    Args:
        metadata_list: Paper metadata dicts as produced by
            ``fetch_arxiv_metadata``; each must contain a ``pdf_url`` key.

    Returns:
        None. When there is nothing to ingest the function returns before
        contacting any external service.
    """
    if not metadata_list:
        return

    # The vector store is created from the first non-empty batch instead of
    # from an empty document list: the collection's vector size is derived
    # from the first embedded text, so bootstrapping with [] cannot work.
    # from_texts reuses the collection when it already exists.
    qdrant = None
    records: Dict[str, List[Any]] = {"text": [], "metadata": [], "embedding": []}

    for metadata in metadata_list:
        pdf_text = process_pdf(metadata["pdf_url"])
        chunks = text_splitter.split_text(pdf_text)
        if not chunks:
            # Nothing extractable in this PDF; skip it.
            continue
        chunk_metadatas = [metadata] * len(chunks)

        # Add to Qdrant
        if qdrant is None:
            qdrant = Qdrant.from_texts(
                chunks,
                embeddings,
                metadatas=chunk_metadatas,
                url=QDRANT_API_URL,
                api_key=QDRANT_API_KEY,
                collection_name=COLLECTION_NAME,
            )
        else:
            qdrant.add_texts(chunks, metadatas=chunk_metadatas)

        # Prepare data for Hugging Face dataset. One batched embedding call
        # per paper replaces one API round trip per chunk.
        records["text"].extend(chunks)
        records["metadata"].extend(chunk_metadatas)
        records["embedding"].extend(embeddings.embed_documents(chunks))

    if not records["text"]:
        return

    # Imported here because the module-level ``datasets`` import does not
    # bring this name into scope.
    from datasets import concatenate_datasets

    # Update Hugging Face dataset. A split must be named to get a Dataset
    # rather than a DatasetDict.
    # NOTE(review): assumes the hub dataset already exists with a "train"
    # split whose columns match text/metadata/embedding - confirm.
    new_dataset = Dataset.from_dict(records)
    dataset = load_dataset(DATASET_NAME, split="train")
    dataset = concatenate_datasets([dataset, new_dataset])
    dataset.push_to_hub(DATASET_NAME)
|
ingest_papers.py
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# File: ingest_papers.py
|
2 |
+
from data_ingestion import fetch_arxiv_metadata, ingest_documents
|
3 |
+
|
4 |
+
def main() -> None:
    """Prompt for a query and a paper count, then run the ingestion pipeline.

    Raises:
        SystemExit: If the paper count is not a positive whole number. This
            gives a short message instead of a ``ValueError`` traceback.
    """
    query = input("Enter a query to fetch arXiv papers: ")
    raw_count = input("Enter the maximum number of papers to fetch: ")

    try:
        max_results = int(raw_count)
    except ValueError:
        raise SystemExit(
            f"Expected a whole number of papers, got {raw_count!r}."
        ) from None
    if max_results <= 0:
        raise SystemExit("The maximum number of papers must be positive.")

    metadata_list = fetch_arxiv_metadata(query, max_results)
    ingest_documents(metadata_list)
    print(f"Ingested {len(metadata_list)} papers.")


if __name__ == "__main__":
    main()
|
ingestion-flow-diagram.mermaid
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
sequenceDiagram
|
2 |
+
participant PDF as arXiv PDF Document
|
3 |
+
participant DL as Document Loader (PyMuPDF)
|
4 |
+
participant TS as Text Splitter (RecursiveCharacter)
|
5 |
+
participant EM as Embedding Model (OpenAI)
|
6 |
+
participant VDB as Vector Database (Qdrant)
|
7 |
+
participant DS as Dataset (Hugging Face)
|
8 |
+
|
9 |
+
PDF->>DL: Load document
|
10 |
+
Note over DL: extract_images=True
|
11 |
+
DL->>TS: Pass extracted text
|
12 |
+
TS->>EM: Send text chunks
|
13 |
+
EM->>VDB: Store embeddings
|
14 |
+
DL->>DS: Store metadata
|
15 |
+
DL->>DS: Store extracted text
|
requirements.txt
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#
|
2 |
+
# This file is autogenerated by pip-compile with Python 3.10
|
3 |
+
# by the following command:
|
4 |
+
#
|
5 |
+
# pip-compile --no-annotate --strip-extras
|
6 |
+
#
|
7 |
+
--extra-index-url https://pypi.ngc.nvidia.com
|
8 |
+
--trusted-host pypi.ngc.nvidia.com
|
9 |
+
|
10 |
+
attrs==23.2.0
|
11 |
+
build==1.2.1
|
12 |
+
click==8.1.7
|
13 |
+
colorama==0.4.6
|
14 |
+
fastjsonschema==2.20.0
|
15 |
+
jsonschema==4.22.0
|
16 |
+
jsonschema-specifications==2023.12.1
|
17 |
+
jupyter-core==5.7.2
|
18 |
+
jupytext==1.16.2
|
19 |
+
markdown-it-py==3.0.0
|
20 |
+
mdit-py-plugins==0.4.1
|
21 |
+
mdurl==0.1.2
|
22 |
+
nbformat==5.10.4
|
23 |
+
packaging==24.1
|
24 |
+
platformdirs==4.2.2
|
25 |
+
pyproject-hooks==1.1.0
|
26 |
+
pyyaml==6.0.1
|
27 |
+
referencing==0.35.1
|
28 |
+
rpds-py==0.18.1
|
29 |
+
tomli==2.0.1
|
30 |
+
traitlets==5.14.3
|
31 |
+
wheel==0.43.0
|
32 |
+
|
33 |
+
# The following packages are considered to be unsafe in a requirements file:
|
34 |
+
# pip
|
35 |
+
# setuptools
|
36 |
+
|
37 |
+
langchain==0.2.5
|
38 |
+
langchain-core==0.2.9
|
39 |
+
langchain-groq==0.1.5
|
40 |
+
langchain-openai==0.1.8
|
41 |
+
langchain-qdrant==0.1.1
|
42 |
+
langchain-text-splitters==0.2.1
|
43 |
+
langchainhub==0.1.20
|
44 |
+
langsmith==0.1.81
|
45 |
+
openai==1.34.0
|
46 |
+
qdrant-client==1.9.1
|
47 |
+
arxiv
|
48 |
+
pymupdf
|
49 |
+
datasets
|
50 |
+
gradio
|
51 |
+
langchain-community
|
retrieval-flow-diagram.mermaid
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
sequenceDiagram
|
2 |
+
Actor MOA as Agentic Reasoning System
|
3 |
+
Actor User
|
4 |
+
participant App as Application (Gradio)
|
5 |
+
participant Orch as Orchestration (LangChain/LCEL)
|
6 |
+
participant EM as Embedding Model (OpenAI)
|
7 |
+
participant VDB as Vector Database (Qdrant)
|
8 |
+
participant LLM as LLM (Llama3 70B via Groq)
|
9 |
+
participant HF as Hugging Face Dataset
|
10 |
+
participant Log as Logging (LangSmith)
|
11 |
+
|
12 |
+
MOA->>Orch: Submit query
|
13 |
+
User->>App: Submit query
|
14 |
+
App->>Orch: Process query
|
15 |
+
Orch->>EM: Generate query embedding
|
16 |
+
EM->>VDB: Retrieve similar vectors
|
17 |
+
Note over VDB: similarity threshold: 0.5, k=5
|
18 |
+
VDB->>Orch: Return relevant contexts
|
19 |
+
Orch->>HF: Retrieve additional metadata
|
20 |
+
Orch->>LLM: Send query + contexts + metadata
|
21 |
+
LLM->>Orch: Generate response
|
22 |
+
Orch->>MOA: Return response for analysis
|
23 |
+
MOA->>HF: Store analysis and evaluation
|
24 |
+
Orch->>App: Return response
|
25 |
+
App->>User: Display response
|
26 |
+
Orch->>Log: Log interaction
|
retrieval.py
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# File: retrieval.py
|
2 |
+
from langchain_qdrant import Qdrant
|
3 |
+
from langchain_groq import ChatGroq
|
4 |
+
from langchain_openai import OpenAIEmbeddings
|
5 |
+
from langchain.chains import RetrievalQA
|
6 |
+
from config import *
|
7 |
+
|
8 |
+
# Query-side embedding model. It has to be the same model used at ingestion
# time (data_ingestion.py) so query and document vectors are comparable.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Groq serves Llama 3 70B under the id "llama3-70b-8192"; "llama3-70b-4096"
# is not a model id Groq exposes, so requests with it are rejected.
# NOTE(review): confirm this id is still listed in Groq's current model table.
llm = ChatGroq(model="llama3-70b-8192", temperature=0.3)
|
10 |
+
|
11 |
+
def rag_query(query: str) -> str:
    """Answer ``query`` with retrieval-augmented generation over the arXiv collection.

    Connects to the existing Qdrant collection, retrieves up to five chunks
    whose similarity score is at least 0.5, and passes them together with
    the question to the LLM through a "stuff" RetrievalQA chain.

    Args:
        query: The user's natural-language question.

    Returns:
        The answer text produced by the chain.
    """
    # The embeddings object is passed positionally: the legacy Qdrant class
    # names this parameter ``embeddings``, so the ``embedding=`` keyword is
    # rejected.
    qdrant = Qdrant.from_existing_collection(
        embeddings,
        collection_name=COLLECTION_NAME,
        url=QDRANT_API_URL,
        api_key=QDRANT_API_KEY,
        prefer_grpc=True,
    )

    # Matches the documented retrieval policy (similarity threshold 0.5,
    # k=5); a plain k=5 search would also return weakly related chunks.
    retriever = qdrant.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={"k": 5, "score_threshold": 0.5},
    )

    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True,
    )

    # ``invoke`` replaces the deprecated direct call on the chain object.
    result = qa_chain.invoke({"query": query})
    return result["result"]
|