Spaces:

dwb2023
/

arxiv-rag-mvp

Paused

App Files Community

donb-hf committed on Jul 6, 2024

Commit

e808c52

1 Parent(s): b4c442a

update ingestion flow

Browse files

Files changed (6) hide show

.gitignore +2 -1
README.md +6 -4
__pycache__/config.cpython-310.pyc +0 -0
arxiv-library-tests.py +43 -0
data_ingestion.py +6 -0
ingestion-flow-service-diagram.mermaid +38 -0

.gitignore CHANGED Viewed

@@ -2,6 +2,7 @@
 .env
 __pycache__/
 #ignore all files in the __pycache__ directory
-__pycache__/*
 __pycache__/retrieval.cpython-310.pyc
 __pycache__/retrieval.cpython-310.pyc

 .env
 __pycache__/
 #ignore all files in the __pycache__ directory
+__pycache__/
 __pycache__/retrieval.cpython-310.pyc
 __pycache__/retrieval.cpython-310.pyc
+__pycache__/config.cpython-310.pyc

README.md CHANGED Viewed

@@ -44,10 +44,12 @@ The primary stakeholder for this system is an agentic "System of Agents". This d
 ## Sequence Diagrams
-### 1. Ingestion Flow
-[Mermaid Diagram - Ingestion Flow](ingestion-flow-diagram.mermaid)
-### 2. Retrieval Flow
-[Mermaid Diagram - Retrieval Flow](retrieval-flow-diagram.mermaid)

 ## Sequence Diagrams
+### 1. Ingestion Flow - Mermaid Diagrams
+[Ingestion Flow](ingestion-flow-diagram.mermaid)
+[Ingestion Flow - Service Diagram](ingestion-flow-service-diagram.mermaid)
+### 2. Retrieval Flow - Mermaid Diagrams
+[Retrieval Flow](retrieval-flow-diagram.mermaid)

__pycache__/config.cpython-310.pyc CHANGED Viewed

Binary files a/__pycache__/config.cpython-310.pyc and b/__pycache__/config.cpython-310.pyc differ

arxiv-library-tests.py ADDED Viewed

	@@ -0,0 +1,43 @@

+import unittest
+import arxiv
+class TestArxivLibrary(unittest.TestCase):
+    def setUp(self):
+        self.client = arxiv.Client()
+    def test_simple_search(self):
+        search = arxiv.Search(query="quantum computing", max_results=5)
+        results = list(self.client.results(search))
+        self.assertEqual(len(results), 5)
+        self.assertTrue(all(isinstance(r, arxiv.Result) for r in results))
+    def test_complex_query(self):
+        query = 'au:"John Doe" AND cat:cs.AI AND year:2020'
+        search = arxiv.Search(query=query, max_results=10)
+        results = list(self.client.results(search))
+        # Add assertions to check the results match the query
+    def test_empty_query(self):
+        search = arxiv.Search(query="", max_results=5)
+        results = list(self.client.results(search))
+        self.assertEqual(len(results), 0, "Empty query should return no results")
+    def test_metadata_extraction(self):
+        search = arxiv.Search(query="physics", max_results=1)
+        result = next(self.client.results(search))
+        self.assertIsNotNone(result.title)
+        self.assertIsNotNone(result.authors)
+        self.assertIsNotNone(result.published)
+        # Add more assertions for other metadata fields
+    def test_whitespace_query(self):
+        search = arxiv.Search(query="   ", max_results=5)
+        results = list(self.client.results(search))
+        self.assertEqual(len(results), 0, "Whitespace-only query should return no results")
+    def test_network_error(self):
+        # Mock network error and ensure it's handled gracefully
+        pass
+if __name__ == '__main__':
+    unittest.main()

data_ingestion.py CHANGED Viewed

@@ -16,6 +16,10 @@ text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20
 def fetch_arxiv_metadata(query: str, max_results: int = 10) -> List[Dict[str, Any]]:
     logging.info(f"Fetching arXiv metadata for query: {query}")
     client = arxiv.Client(page_size=max_results, delay_seconds=3, num_retries=3)
     search = arxiv.Search(query=query, max_results=max_results)
     results = []
@@ -83,6 +87,8 @@ def ingest_documents(metadata_list: List[Dict[str, Any]]) -> str:
 def run_ingestion_pipeline(query: str, max_results: int = 10) -> str:
     try:
         metadata_list = fetch_arxiv_metadata(query, max_results)
         result = ingest_documents(metadata_list)
         return result
     except Exception as e:

 def fetch_arxiv_metadata(query: str, max_results: int = 10) -> List[Dict[str, Any]]:
     logging.info(f"Fetching arXiv metadata for query: {query}")
+    if not query.strip():
+        logging.warning("Empty or whitespace-only query provided")
+        return []
     client = arxiv.Client(page_size=max_results, delay_seconds=3, num_retries=3)
     search = arxiv.Search(query=query, max_results=max_results)
     results = []
 def run_ingestion_pipeline(query: str, max_results: int = 10) -> str:
     try:
         metadata_list = fetch_arxiv_metadata(query, max_results)
+        if not metadata_list:
+            return "No results found for the given query"
         result = ingest_documents(metadata_list)
         return result
     except Exception as e:

ingestion-flow-service-diagram.mermaid ADDED Viewed

	@@ -0,0 +1,38 @@

+sequenceDiagram
+    participant API as API (FastAPI)
+    participant DI as Data Ingestion Service
+    participant AM as ArXiv Metadata Fetcher
+    participant PL as PDF Loader (PyMuPDF)
+    participant TS as Text Splitter
+    participant EM as Embedding Model (OpenAI)
+    participant VDB as Vector Database (Qdrant)
+    participant HF as Hugging Face Dataset
+    API->>DI: POST /ingest (query, max_results)
+    DI->>AM: fetch_arxiv_metadata(query, max_results)
+    AM-->>DI: Return metadata list
+    alt Successful metadata fetch
+        loop For each metadata item
+            DI->>PL: process_pdf(pdf_url)
+            alt Successful PDF processing
+                PL-->>DI: Return PDF text
+                DI->>TS: split_text(pdf_text)
+                TS-->>DI: Return text chunks
+                loop For each chunk
+                    DI->>EM: embed_query(chunk)
+                    EM-->>DI: Return embedding
+                    DI->>VDB: add_texts(chunk, embedding)
+                    DI->>HF: Add chunk and metadata
+                end
+            else PDF processing error
+                PL-->>DI: Raise exception
+                DI->>DI: Log error and continue
+            end
+        end
+        DI-->>API: Return ingestion result
+    else Metadata fetch error
+        AM-->>DI: Raise exception
+        DI-->>API: Return error message
+    end
+    Note over API,HF: Logging at each step