Spaces:
Paused
Paused
update functions
Browse files- .gitignore +4 -1
- __pycache__/retrieval.cpython-310.pyc +0 -0
- data_ingestion.py +41 -19
- retrieval.py +29 -19
.gitignore
CHANGED
@@ -1,4 +1,7 @@
|
|
1 |
.venv/
|
2 |
.env
|
3 |
__pycache__/
|
4 |
-
|
|
|
|
|
|
|
|
1 |
.venv/
|
2 |
.env
|
3 |
__pycache__/
|
4 |
+
#ignore all files in the __pycache__ directory
|
5 |
+
__pycache__/*
|
6 |
+
__pycache__/retrieval.cpython-310.pyc
|
7 |
+
__pycache__/retrieval.cpython-310.pyc
|
__pycache__/retrieval.cpython-310.pyc
CHANGED
Binary files a/__pycache__/retrieval.cpython-310.pyc and b/__pycache__/retrieval.cpython-310.pyc differ
|
|
data_ingestion.py
CHANGED
@@ -1,8 +1,5 @@
|
|
1 |
-
|
2 |
# File: data_ingestion.py
|
3 |
import arxiv
|
4 |
-
import io
|
5 |
-
import requests
|
6 |
from typing import List, Dict, Any
|
7 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
8 |
from langchain_openai import OpenAIEmbeddings
|
@@ -10,11 +7,15 @@ from langchain_qdrant import Qdrant
|
|
10 |
from datasets import load_dataset, Dataset
|
11 |
from langchain_community.document_loaders import PyMuPDFLoader
|
12 |
from config import *
|
|
|
|
|
|
|
13 |
|
14 |
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
|
15 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
|
16 |
|
17 |
def fetch_arxiv_metadata(query: str, max_results: int = 10) -> List[Dict[str, Any]]:
|
|
|
18 |
client = arxiv.Client(page_size=max_results, delay_seconds=3, num_retries=3)
|
19 |
search = arxiv.Search(query=query, max_results=max_results)
|
20 |
results = []
|
@@ -29,14 +30,17 @@ def fetch_arxiv_metadata(query: str, max_results: int = 10) -> List[Dict[str, An
|
|
29 |
"summary": result.summary
|
30 |
}
|
31 |
results.append(metadata)
|
|
|
32 |
return results
|
33 |
|
34 |
def process_pdf(pdf_url: str) -> str:
|
|
|
35 |
loader = PyMuPDFLoader(pdf_url)
|
36 |
data = loader.load()
|
37 |
return "\n".join([page.page_content for page in data])
|
38 |
|
39 |
-
def ingest_documents(metadata_list: List[Dict[str, Any]]):
|
|
|
40 |
qdrant = Qdrant.from_documents(
|
41 |
[], # We'll add documents one by one
|
42 |
embeddings,
|
@@ -48,22 +52,40 @@ def ingest_documents(metadata_list: List[Dict[str, Any]]):
|
|
48 |
dataset = load_dataset(DATASET_NAME)
|
49 |
new_data = []
|
50 |
|
51 |
-
for metadata in metadata_list:
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
|
|
|
|
|
|
|
|
65 |
|
66 |
# Update Hugging Face dataset
|
67 |
new_dataset = Dataset.from_dict({k: [d[k] for d in new_data] for k in new_data[0]})
|
68 |
dataset = dataset.add_item(new_dataset)
|
69 |
-
dataset.push_to_hub(DATASET_NAME)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
# File: data_ingestion.py
|
2 |
import arxiv
|
|
|
|
|
3 |
from typing import List, Dict, Any
|
4 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
5 |
from langchain_openai import OpenAIEmbeddings
|
|
|
7 |
from datasets import load_dataset, Dataset
|
8 |
from langchain_community.document_loaders import PyMuPDFLoader
|
9 |
from config import *
|
10 |
+
import logging
|
11 |
+
|
12 |
+
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
13 |
|
14 |
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
|
15 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
|
16 |
|
17 |
def fetch_arxiv_metadata(query: str, max_results: int = 10) -> List[Dict[str, Any]]:
|
18 |
+
logging.info(f"Fetching arXiv metadata for query: {query}")
|
19 |
client = arxiv.Client(page_size=max_results, delay_seconds=3, num_retries=3)
|
20 |
search = arxiv.Search(query=query, max_results=max_results)
|
21 |
results = []
|
|
|
30 |
"summary": result.summary
|
31 |
}
|
32 |
results.append(metadata)
|
33 |
+
logging.info(f"Fetched metadata for {len(results)} papers")
|
34 |
return results
|
35 |
|
36 |
def process_pdf(pdf_url: str) -> str:
|
37 |
+
logging.info(f"Processing PDF from URL: {pdf_url}")
|
38 |
loader = PyMuPDFLoader(pdf_url)
|
39 |
data = loader.load()
|
40 |
return "\n".join([page.page_content for page in data])
|
41 |
|
42 |
+
def ingest_documents(metadata_list: List[Dict[str, Any]]) -> str:
|
43 |
+
logging.info(f"Starting ingestion of {len(metadata_list)} documents")
|
44 |
qdrant = Qdrant.from_documents(
|
45 |
[], # We'll add documents one by one
|
46 |
embeddings,
|
|
|
52 |
dataset = load_dataset(DATASET_NAME)
|
53 |
new_data = []
|
54 |
|
55 |
+
for i, metadata in enumerate(metadata_list):
|
56 |
+
try:
|
57 |
+
pdf_text = process_pdf(metadata["pdf_url"])
|
58 |
+
chunks = text_splitter.split_text(pdf_text)
|
59 |
+
|
60 |
+
# Add to Qdrant
|
61 |
+
qdrant.add_texts(chunks, metadatas=[metadata] * len(chunks))
|
62 |
+
|
63 |
+
# Prepare data for Hugging Face dataset
|
64 |
+
for chunk in chunks:
|
65 |
+
new_data.append({
|
66 |
+
"text": chunk,
|
67 |
+
"metadata": metadata,
|
68 |
+
"embedding": embeddings.embed_query(chunk)
|
69 |
+
})
|
70 |
+
logging.info(f"Processed document {i+1}/{len(metadata_list)}")
|
71 |
+
except Exception as e:
|
72 |
+
logging.error(f"Error processing document {i+1}: {str(e)}")
|
73 |
|
74 |
# Update Hugging Face dataset
|
75 |
new_dataset = Dataset.from_dict({k: [d[k] for d in new_data] for k in new_data[0]})
|
76 |
dataset = dataset.add_item(new_dataset)
|
77 |
+
dataset.push_to_hub(DATASET_NAME)
|
78 |
+
|
79 |
+
result_message = f"Ingested {len(metadata_list)} documents, adding {len(new_data)} chunks to the dataset."
|
80 |
+
logging.info(result_message)
|
81 |
+
return result_message
|
82 |
+
|
83 |
+
def run_ingestion_pipeline(query: str, max_results: int = 10) -> str:
|
84 |
+
try:
|
85 |
+
metadata_list = fetch_arxiv_metadata(query, max_results)
|
86 |
+
result = ingest_documents(metadata_list)
|
87 |
+
return result
|
88 |
+
except Exception as e:
|
89 |
+
error_message = f"Error in ingestion pipeline: {str(e)}"
|
90 |
+
logging.error(error_message)
|
91 |
+
return error_message
|
retrieval.py
CHANGED
@@ -4,27 +4,37 @@ from langchain_groq import ChatGroq
|
|
4 |
from langchain_openai import OpenAIEmbeddings
|
5 |
from langchain.chains import RetrievalQA
|
6 |
from config import *
|
|
|
|
|
|
|
7 |
|
8 |
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
|
9 |
llm = ChatGroq(model="llama3-70b-4096", temperature=0.3)
|
10 |
|
11 |
def rag_query(query: str) -> str:
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
from langchain_openai import OpenAIEmbeddings
|
5 |
from langchain.chains import RetrievalQA
|
6 |
from config import *
|
7 |
+
import logging
|
8 |
+
|
9 |
+
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
10 |
|
11 |
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
|
12 |
llm = ChatGroq(model="llama3-70b-4096", temperature=0.3)
|
13 |
|
14 |
def rag_query(query: str) -> str:
|
15 |
+
logging.info(f"Processing query: {query}")
|
16 |
+
try:
|
17 |
+
qdrant = Qdrant.from_existing_collection(
|
18 |
+
embedding=embeddings,
|
19 |
+
collection_name=COLLECTION_NAME,
|
20 |
+
url=QDRANT_API_URL,
|
21 |
+
api_key=QDRANT_API_KEY,
|
22 |
+
prefer_grpc=True,
|
23 |
+
)
|
24 |
+
|
25 |
+
retriever = qdrant.as_retriever(search_kwargs={"k": 5})
|
26 |
+
|
27 |
+
qa_chain = RetrievalQA.from_chain_type(
|
28 |
+
llm=llm,
|
29 |
+
chain_type="stuff",
|
30 |
+
retriever=retriever,
|
31 |
+
return_source_documents=True
|
32 |
+
)
|
33 |
+
|
34 |
+
result = qa_chain({"query": query})
|
35 |
+
logging.info("Query processed successfully")
|
36 |
+
return result["result"]
|
37 |
+
except Exception as e:
|
38 |
+
error_message = f"Error processing query: {str(e)}"
|
39 |
+
logging.error(error_message)
|
40 |
+
return error_message
|