donb-hf commited on
Commit
b4c442a
·
1 Parent(s): e70a6fe

update functions

Browse files
.gitignore CHANGED
@@ -1,4 +1,7 @@
1
  .venv/
2
  .env
3
  __pycache__/
4
-
 
 
 
 
1
  .venv/
2
  .env
3
  __pycache__/
4
+ #ignore all files in the __pycache__ directory
5
+ __pycache__/*
6
+ __pycache__/retrieval.cpython-310.pyc
7
+ __pycache__/retrieval.cpython-310.pyc
__pycache__/retrieval.cpython-310.pyc CHANGED
Binary files a/__pycache__/retrieval.cpython-310.pyc and b/__pycache__/retrieval.cpython-310.pyc differ
 
data_ingestion.py CHANGED
@@ -1,8 +1,5 @@
1
-
2
  # File: data_ingestion.py
3
  import arxiv
4
- import io
5
- import requests
6
  from typing import List, Dict, Any
7
  from langchain.text_splitter import RecursiveCharacterTextSplitter
8
  from langchain_openai import OpenAIEmbeddings
@@ -10,11 +7,15 @@ from langchain_qdrant import Qdrant
10
  from datasets import load_dataset, Dataset
11
  from langchain_community.document_loaders import PyMuPDFLoader
12
  from config import *
 
 
 
13
 
14
  embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
15
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
16
 
17
  def fetch_arxiv_metadata(query: str, max_results: int = 10) -> List[Dict[str, Any]]:
 
18
  client = arxiv.Client(page_size=max_results, delay_seconds=3, num_retries=3)
19
  search = arxiv.Search(query=query, max_results=max_results)
20
  results = []
@@ -29,14 +30,17 @@ def fetch_arxiv_metadata(query: str, max_results: int = 10) -> List[Dict[str, An
29
  "summary": result.summary
30
  }
31
  results.append(metadata)
 
32
  return results
33
 
34
  def process_pdf(pdf_url: str) -> str:
 
35
  loader = PyMuPDFLoader(pdf_url)
36
  data = loader.load()
37
  return "\n".join([page.page_content for page in data])
38
 
39
- def ingest_documents(metadata_list: List[Dict[str, Any]]):
 
40
  qdrant = Qdrant.from_documents(
41
  [], # We'll add documents one by one
42
  embeddings,
@@ -48,22 +52,40 @@ def ingest_documents(metadata_list: List[Dict[str, Any]]):
48
  dataset = load_dataset(DATASET_NAME)
49
  new_data = []
50
 
51
- for metadata in metadata_list:
52
- pdf_text = process_pdf(metadata["pdf_url"])
53
- chunks = text_splitter.split_text(pdf_text)
54
-
55
- # Add to Qdrant
56
- qdrant.add_texts(chunks, metadatas=[metadata] * len(chunks))
57
-
58
- # Prepare data for Hugging Face dataset
59
- for chunk in chunks:
60
- new_data.append({
61
- "text": chunk,
62
- "metadata": metadata,
63
- "embedding": embeddings.embed_query(chunk)
64
- })
 
 
 
 
65
 
66
  # Update Hugging Face dataset
67
  new_dataset = Dataset.from_dict({k: [d[k] for d in new_data] for k in new_data[0]})
68
  dataset = dataset.add_item(new_dataset)
69
- dataset.push_to_hub(DATASET_NAME)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # File: data_ingestion.py
2
  import arxiv
 
 
3
  from typing import List, Dict, Any
4
  from langchain.text_splitter import RecursiveCharacterTextSplitter
5
  from langchain_openai import OpenAIEmbeddings
 
7
  from datasets import load_dataset, Dataset
8
  from langchain_community.document_loaders import PyMuPDFLoader
9
  from config import *
10
+ import logging
11
+
12
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
13
 
14
  embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
15
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
16
 
17
  def fetch_arxiv_metadata(query: str, max_results: int = 10) -> List[Dict[str, Any]]:
18
+ logging.info(f"Fetching arXiv metadata for query: {query}")
19
  client = arxiv.Client(page_size=max_results, delay_seconds=3, num_retries=3)
20
  search = arxiv.Search(query=query, max_results=max_results)
21
  results = []
 
30
  "summary": result.summary
31
  }
32
  results.append(metadata)
33
+ logging.info(f"Fetched metadata for {len(results)} papers")
34
  return results
35
 
36
  def process_pdf(pdf_url: str) -> str:
37
+ logging.info(f"Processing PDF from URL: {pdf_url}")
38
  loader = PyMuPDFLoader(pdf_url)
39
  data = loader.load()
40
  return "\n".join([page.page_content for page in data])
41
 
42
+ def ingest_documents(metadata_list: List[Dict[str, Any]]) -> str:
43
+ logging.info(f"Starting ingestion of {len(metadata_list)} documents")
44
  qdrant = Qdrant.from_documents(
45
  [], # We'll add documents one by one
46
  embeddings,
 
52
  dataset = load_dataset(DATASET_NAME)
53
  new_data = []
54
 
55
+ for i, metadata in enumerate(metadata_list):
56
+ try:
57
+ pdf_text = process_pdf(metadata["pdf_url"])
58
+ chunks = text_splitter.split_text(pdf_text)
59
+
60
+ # Add to Qdrant
61
+ qdrant.add_texts(chunks, metadatas=[metadata] * len(chunks))
62
+
63
+ # Prepare data for Hugging Face dataset
64
+ for chunk in chunks:
65
+ new_data.append({
66
+ "text": chunk,
67
+ "metadata": metadata,
68
+ "embedding": embeddings.embed_query(chunk)
69
+ })
70
+ logging.info(f"Processed document {i+1}/{len(metadata_list)}")
71
+ except Exception as e:
72
+ logging.error(f"Error processing document {i+1}: {str(e)}")
73
 
74
  # Update Hugging Face dataset
75
  new_dataset = Dataset.from_dict({k: [d[k] for d in new_data] for k in new_data[0]})
76
  dataset = dataset.add_item(new_dataset)
77
+ dataset.push_to_hub(DATASET_NAME)
78
+
79
+ result_message = f"Ingested {len(metadata_list)} documents, adding {len(new_data)} chunks to the dataset."
80
+ logging.info(result_message)
81
+ return result_message
82
+
83
+ def run_ingestion_pipeline(query: str, max_results: int = 10) -> str:
84
+ try:
85
+ metadata_list = fetch_arxiv_metadata(query, max_results)
86
+ result = ingest_documents(metadata_list)
87
+ return result
88
+ except Exception as e:
89
+ error_message = f"Error in ingestion pipeline: {str(e)}"
90
+ logging.error(error_message)
91
+ return error_message
retrieval.py CHANGED
@@ -4,27 +4,37 @@ from langchain_groq import ChatGroq
4
  from langchain_openai import OpenAIEmbeddings
5
  from langchain.chains import RetrievalQA
6
  from config import *
 
 
 
7
 
8
  embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
9
  llm = ChatGroq(model="llama3-70b-4096", temperature=0.3)
10
 
11
  def rag_query(query: str) -> str:
12
- qdrant = Qdrant.from_existing_collection(
13
- embedding=embeddings,
14
- collection_name=COLLECTION_NAME,
15
- url=QDRANT_API_URL,
16
- api_key=QDRANT_API_KEY,
17
- prefer_grpc=True,
18
- )
19
-
20
- retriever = qdrant.as_retriever(search_kwargs={"k": 5})
21
-
22
- qa_chain = RetrievalQA.from_chain_type(
23
- llm=llm,
24
- chain_type="stuff",
25
- retriever=retriever,
26
- return_source_documents=True
27
- )
28
-
29
- result = qa_chain({"query": query})
30
- return result["result"]
 
 
 
 
 
 
 
 
4
  from langchain_openai import OpenAIEmbeddings
5
  from langchain.chains import RetrievalQA
6
  from config import *
7
+ import logging
8
+
9
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
10
 
11
  embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
12
  llm = ChatGroq(model="llama3-70b-4096", temperature=0.3)
13
 
14
  def rag_query(query: str) -> str:
15
+ logging.info(f"Processing query: {query}")
16
+ try:
17
+ qdrant = Qdrant.from_existing_collection(
18
+ embedding=embeddings,
19
+ collection_name=COLLECTION_NAME,
20
+ url=QDRANT_API_URL,
21
+ api_key=QDRANT_API_KEY,
22
+ prefer_grpc=True,
23
+ )
24
+
25
+ retriever = qdrant.as_retriever(search_kwargs={"k": 5})
26
+
27
+ qa_chain = RetrievalQA.from_chain_type(
28
+ llm=llm,
29
+ chain_type="stuff",
30
+ retriever=retriever,
31
+ return_source_documents=True
32
+ )
33
+
34
+ result = qa_chain({"query": query})
35
+ logging.info("Query processed successfully")
36
+ return result["result"]
37
+ except Exception as e:
38
+ error_message = f"Error processing query: {str(e)}"
39
+ logging.error(error_message)
40
+ return error_message