Spaces:
Paused
Paused
update ingestion flow
Browse files- .gitignore +2 -1
- README.md +6 -4
- __pycache__/config.cpython-310.pyc +0 -0
- arxiv-library-tests.py +43 -0
- data_ingestion.py +6 -0
- ingestion-flow-service-diagram.mermaid +38 -0
.gitignore
CHANGED
@@ -2,6 +2,7 @@
|
|
2 |
.env
|
3 |
__pycache__/
|
4 |
#ignore all files in the __pycache__ directory
|
5 |
-
__pycache__
|
6 |
__pycache__/retrieval.cpython-310.pyc
|
7 |
__pycache__/retrieval.cpython-310.pyc
|
|
|
|
2 |
.env
|
3 |
__pycache__/
|
4 |
#ignore all files in the __pycache__ directory
|
5 |
+
__pycache__/
|
6 |
__pycache__/retrieval.cpython-310.pyc
|
7 |
__pycache__/retrieval.cpython-310.pyc
|
8 |
+
__pycache__/config.cpython-310.pyc
|
README.md
CHANGED
@@ -44,10 +44,12 @@ The primary stakeholder for this system is an agentic "System of Agents". This d
|
|
44 |
|
45 |
## Sequence Diagrams
|
46 |
|
47 |
-
### 1. Ingestion Flow
|
48 |
|
49 |
-
[
|
50 |
|
51 |
-
|
52 |
|
53 |
-
|
|
|
|
|
|
44 |
|
45 |
## Sequence Diagrams
|
46 |
|
47 |
+
### 1. Ingestion Flow - Mermaid Diagrams
|
48 |
|
49 |
+
[Ingestion Flow](ingestion-flow-diagram.mermaid)
|
50 |
|
51 |
+
[Ingestion Flow - Service Diagram](ingestion-flow-service-diagram.mermaid)
|
52 |
|
53 |
+
### 2. Retrieval Flow - Mermaid Diagrams
|
54 |
+
|
55 |
+
[Retrieval Flow](retrieval-flow-diagram.mermaid)
|
__pycache__/config.cpython-310.pyc
CHANGED
Binary files a/__pycache__/config.cpython-310.pyc and b/__pycache__/config.cpython-310.pyc differ
|
|
arxiv-library-tests.py
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import unittest
|
2 |
+
import arxiv
|
3 |
+
|
4 |
+
class TestArxivLibrary(unittest.TestCase):
|
5 |
+
def setUp(self):
|
6 |
+
self.client = arxiv.Client()
|
7 |
+
|
8 |
+
def test_simple_search(self):
|
9 |
+
search = arxiv.Search(query="quantum computing", max_results=5)
|
10 |
+
results = list(self.client.results(search))
|
11 |
+
self.assertEqual(len(results), 5)
|
12 |
+
self.assertTrue(all(isinstance(r, arxiv.Result) for r in results))
|
13 |
+
|
14 |
+
def test_complex_query(self):
|
15 |
+
query = 'au:"John Doe" AND cat:cs.AI AND year:2020'
|
16 |
+
search = arxiv.Search(query=query, max_results=10)
|
17 |
+
results = list(self.client.results(search))
|
18 |
+
# Add assertions to check the results match the query
|
19 |
+
|
20 |
+
def test_empty_query(self):
|
21 |
+
search = arxiv.Search(query="", max_results=5)
|
22 |
+
results = list(self.client.results(search))
|
23 |
+
self.assertEqual(len(results), 0, "Empty query should return no results")
|
24 |
+
|
25 |
+
def test_metadata_extraction(self):
|
26 |
+
search = arxiv.Search(query="physics", max_results=1)
|
27 |
+
result = next(self.client.results(search))
|
28 |
+
self.assertIsNotNone(result.title)
|
29 |
+
self.assertIsNotNone(result.authors)
|
30 |
+
self.assertIsNotNone(result.published)
|
31 |
+
# Add more assertions for other metadata fields
|
32 |
+
|
33 |
+
def test_whitespace_query(self):
|
34 |
+
search = arxiv.Search(query=" ", max_results=5)
|
35 |
+
results = list(self.client.results(search))
|
36 |
+
self.assertEqual(len(results), 0, "Whitespace-only query should return no results")
|
37 |
+
|
38 |
+
def test_network_error(self):
|
39 |
+
# Mock network error and ensure it's handled gracefully
|
40 |
+
pass
|
41 |
+
|
42 |
+
if __name__ == '__main__':
|
43 |
+
unittest.main()
|
data_ingestion.py
CHANGED
@@ -16,6 +16,10 @@ text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20
|
|
16 |
|
17 |
def fetch_arxiv_metadata(query: str, max_results: int = 10) -> List[Dict[str, Any]]:
|
18 |
logging.info(f"Fetching arXiv metadata for query: {query}")
|
|
|
|
|
|
|
|
|
19 |
client = arxiv.Client(page_size=max_results, delay_seconds=3, num_retries=3)
|
20 |
search = arxiv.Search(query=query, max_results=max_results)
|
21 |
results = []
|
@@ -83,6 +87,8 @@ def ingest_documents(metadata_list: List[Dict[str, Any]]) -> str:
|
|
83 |
def run_ingestion_pipeline(query: str, max_results: int = 10) -> str:
|
84 |
try:
|
85 |
metadata_list = fetch_arxiv_metadata(query, max_results)
|
|
|
|
|
86 |
result = ingest_documents(metadata_list)
|
87 |
return result
|
88 |
except Exception as e:
|
|
|
16 |
|
17 |
def fetch_arxiv_metadata(query: str, max_results: int = 10) -> List[Dict[str, Any]]:
|
18 |
logging.info(f"Fetching arXiv metadata for query: {query}")
|
19 |
+
if not query.strip():
|
20 |
+
logging.warning("Empty or whitespace-only query provided")
|
21 |
+
return []
|
22 |
+
|
23 |
client = arxiv.Client(page_size=max_results, delay_seconds=3, num_retries=3)
|
24 |
search = arxiv.Search(query=query, max_results=max_results)
|
25 |
results = []
|
|
|
87 |
def run_ingestion_pipeline(query: str, max_results: int = 10) -> str:
|
88 |
try:
|
89 |
metadata_list = fetch_arxiv_metadata(query, max_results)
|
90 |
+
if not metadata_list:
|
91 |
+
return "No results found for the given query"
|
92 |
result = ingest_documents(metadata_list)
|
93 |
return result
|
94 |
except Exception as e:
|
ingestion-flow-service-diagram.mermaid
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
sequenceDiagram
|
2 |
+
participant API as API (FastAPI)
|
3 |
+
participant DI as Data Ingestion Service
|
4 |
+
participant AM as ArXiv Metadata Fetcher
|
5 |
+
participant PL as PDF Loader (PyMuPDF)
|
6 |
+
participant TS as Text Splitter
|
7 |
+
participant EM as Embedding Model (OpenAI)
|
8 |
+
participant VDB as Vector Database (Qdrant)
|
9 |
+
participant HF as Hugging Face Dataset
|
10 |
+
|
11 |
+
API->>DI: POST /ingest (query, max_results)
|
12 |
+
DI->>AM: fetch_arxiv_metadata(query, max_results)
|
13 |
+
AM-->>DI: Return metadata list
|
14 |
+
alt Successful metadata fetch
|
15 |
+
loop For each metadata item
|
16 |
+
DI->>PL: process_pdf(pdf_url)
|
17 |
+
alt Successful PDF processing
|
18 |
+
PL-->>DI: Return PDF text
|
19 |
+
DI->>TS: split_text(pdf_text)
|
20 |
+
TS-->>DI: Return text chunks
|
21 |
+
loop For each chunk
|
22 |
+
DI->>EM: embed_query(chunk)
|
23 |
+
EM-->>DI: Return embedding
|
24 |
+
DI->>VDB: add_texts(chunk, embedding)
|
25 |
+
DI->>HF: Add chunk and metadata
|
26 |
+
end
|
27 |
+
else PDF processing error
|
28 |
+
PL-->>DI: Raise exception
|
29 |
+
DI->>DI: Log error and continue
|
30 |
+
end
|
31 |
+
end
|
32 |
+
DI-->>API: Return ingestion result
|
33 |
+
else Metadata fetch error
|
34 |
+
AM-->>DI: Raise exception
|
35 |
+
DI-->>API: Return error message
|
36 |
+
end
|
37 |
+
|
38 |
+
Note over API,HF: Logging at each step
|