donb-hf commited on
Commit
e808c52
·
1 Parent(s): b4c442a

update ingestion flow

Browse files
.gitignore CHANGED
@@ -2,6 +2,7 @@
2
  .env
3
  __pycache__/
4
  #ignore all files in the __pycache__ directory
5
- __pycache__/*
6
  __pycache__/retrieval.cpython-310.pyc
7
  __pycache__/retrieval.cpython-310.pyc
 
 
2
  .env
3
  __pycache__/
4
  #ignore all files in the __pycache__ directory
5
+ __pycache__/
6
  __pycache__/retrieval.cpython-310.pyc
7
  __pycache__/retrieval.cpython-310.pyc
8
+ __pycache__/config.cpython-310.pyc
README.md CHANGED
@@ -44,10 +44,12 @@ The primary stakeholder for this system is an agentic "System of Agents". This d
44
 
45
  ## Sequence Diagrams
46
 
47
- ### 1. Ingestion Flow
48
 
49
- [Mermaid Diagram - Ingestion Flow](ingestion-flow-diagram.mermaid)
50
 
51
- ### 2. Retrieval Flow
52
 
53
- [Mermaid Diagram - Retrieval Flow](retrieval-flow-diagram.mermaid)
 
 
 
44
 
45
  ## Sequence Diagrams
46
 
47
+ ### 1. Ingestion Flow - Mermaid Diagrams
48
 
49
+ [Ingestion Flow](ingestion-flow-diagram.mermaid)
50
 
51
+ [Ingestion Flow - Service Diagram](ingestion-flow-service-diagram.mermaid)
52
 
53
+ ### 2. Retrieval Flow - Mermaid Diagrams
54
+
55
+ [Retrieval Flow](retrieval-flow-diagram.mermaid)
__pycache__/config.cpython-310.pyc CHANGED
Binary files a/__pycache__/config.cpython-310.pyc and b/__pycache__/config.cpython-310.pyc differ
 
arxiv-library-tests.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import unittest
2
+ import arxiv
3
+
4
+ class TestArxivLibrary(unittest.TestCase):
5
+ def setUp(self):
6
+ self.client = arxiv.Client()
7
+
8
+ def test_simple_search(self):
9
+ search = arxiv.Search(query="quantum computing", max_results=5)
10
+ results = list(self.client.results(search))
11
+ self.assertEqual(len(results), 5)
12
+ self.assertTrue(all(isinstance(r, arxiv.Result) for r in results))
13
+
14
+ def test_complex_query(self):
15
+ query = 'au:"John Doe" AND cat:cs.AI AND year:2020'
16
+ search = arxiv.Search(query=query, max_results=10)
17
+ results = list(self.client.results(search))
18
+ # Add assertions to check the results match the query
19
+
20
+ def test_empty_query(self):
21
+ search = arxiv.Search(query="", max_results=5)
22
+ results = list(self.client.results(search))
23
+ self.assertEqual(len(results), 0, "Empty query should return no results")
24
+
25
+ def test_metadata_extraction(self):
26
+ search = arxiv.Search(query="physics", max_results=1)
27
+ result = next(self.client.results(search))
28
+ self.assertIsNotNone(result.title)
29
+ self.assertIsNotNone(result.authors)
30
+ self.assertIsNotNone(result.published)
31
+ # Add more assertions for other metadata fields
32
+
33
+ def test_whitespace_query(self):
34
+ search = arxiv.Search(query=" ", max_results=5)
35
+ results = list(self.client.results(search))
36
+ self.assertEqual(len(results), 0, "Whitespace-only query should return no results")
37
+
38
+ def test_network_error(self):
39
+ # Mock network error and ensure it's handled gracefully
40
+ pass
41
+
42
+ if __name__ == '__main__':
43
+ unittest.main()
data_ingestion.py CHANGED
@@ -16,6 +16,10 @@ text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20
16
 
17
  def fetch_arxiv_metadata(query: str, max_results: int = 10) -> List[Dict[str, Any]]:
18
  logging.info(f"Fetching arXiv metadata for query: {query}")
 
 
 
 
19
  client = arxiv.Client(page_size=max_results, delay_seconds=3, num_retries=3)
20
  search = arxiv.Search(query=query, max_results=max_results)
21
  results = []
@@ -83,6 +87,8 @@ def ingest_documents(metadata_list: List[Dict[str, Any]]) -> str:
83
  def run_ingestion_pipeline(query: str, max_results: int = 10) -> str:
84
  try:
85
  metadata_list = fetch_arxiv_metadata(query, max_results)
 
 
86
  result = ingest_documents(metadata_list)
87
  return result
88
  except Exception as e:
 
16
 
17
  def fetch_arxiv_metadata(query: str, max_results: int = 10) -> List[Dict[str, Any]]:
18
  logging.info(f"Fetching arXiv metadata for query: {query}")
19
+ if not query.strip():
20
+ logging.warning("Empty or whitespace-only query provided")
21
+ return []
22
+
23
  client = arxiv.Client(page_size=max_results, delay_seconds=3, num_retries=3)
24
  search = arxiv.Search(query=query, max_results=max_results)
25
  results = []
 
87
  def run_ingestion_pipeline(query: str, max_results: int = 10) -> str:
88
  try:
89
  metadata_list = fetch_arxiv_metadata(query, max_results)
90
+ if not metadata_list:
91
+ return "No results found for the given query"
92
  result = ingest_documents(metadata_list)
93
  return result
94
  except Exception as e:
ingestion-flow-service-diagram.mermaid ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ sequenceDiagram
2
+ participant API as API (FastAPI)
3
+ participant DI as Data Ingestion Service
4
+ participant AM as ArXiv Metadata Fetcher
5
+ participant PL as PDF Loader (PyMuPDF)
6
+ participant TS as Text Splitter
7
+ participant EM as Embedding Model (OpenAI)
8
+ participant VDB as Vector Database (Qdrant)
9
+ participant HF as Hugging Face Dataset
10
+
11
+ API->>DI: POST /ingest (query, max_results)
12
+ DI->>AM: fetch_arxiv_metadata(query, max_results)
13
+ AM-->>DI: Return metadata list
14
+ alt Successful metadata fetch
15
+ loop For each metadata item
16
+ DI->>PL: process_pdf(pdf_url)
17
+ alt Successful PDF processing
18
+ PL-->>DI: Return PDF text
19
+ DI->>TS: split_text(pdf_text)
20
+ TS-->>DI: Return text chunks
21
+ loop For each chunk
22
+ DI->>EM: embed_query(chunk)
23
+ EM-->>DI: Return embedding
24
+ DI->>VDB: add_texts(chunk, embedding)
25
+ DI->>HF: Add chunk and metadata
26
+ end
27
+ else PDF processing error
28
+ PL-->>DI: Raise exception
29
+ DI->>DI: Log error and continue
30
+ end
31
+ end
32
+ DI-->>API: Return ingestion result
33
+ else Metadata fetch error
34
+ AM-->>DI: Raise exception
35
+ DI-->>API: Return error message
36
+ end
37
+
38
+ Note over API,HF: Logging at each step