Spaces:
Paused
Paused
File size: 5,509 Bytes
dfe28dc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 |
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Arxiv Metadata Dataset - Loader and Retriever\n",
"\n",
"- Load arXiv metadata from a Hugging Face dataset and load it into Qdrant\n",
"- Use LangGraph to store trace info"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%pip install -qU pymupdf \n",
"%pip install -qU langchain langchain-core langchain-community langchain-text-splitters \n",
"%pip install -qU langchain-openai\n",
"%pip install -qU langchain-groq\n",
"%pip install -qU langchain-qdrant"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Parameterize some stuff\n",
"\n",
"QUESTION = \"What are the emerging patterns for building Systems of Agents that could provide the system the ability to evolve and improve its own processes through learning?\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# All imports for the notebook in one cell, so it survives\n",
"# Restart Kernel -> Run All.\n",
"import os\n",
"from langchain import hub\n",
"from langchain_groq import ChatGroq\n",
"from config import COLLECTION_NAME, DATASET_NAME, OPENAI_API_KEY, QDRANT_API_KEY, QDRANT_API_URL, LANGCHAIN_HUB_PROMPT\n",
"from langchain_community.document_loaders import PyMuPDFLoader\n",
"from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
"from langchain_qdrant import Qdrant\n",
"\n",
"# NOTE(review): OPENAI_API_KEY is imported but never exported; OpenAIEmbeddings\n",
"# reads the OPENAI_API_KEY env var -- confirm config sets it, otherwise:\n",
"# os.environ[\"OPENAI_API_KEY\"] = OPENAI_API_KEY\n",
"# TODO: identify a data loader for HTML documents"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from langchain_openai import OpenAIEmbeddings\n",
"\n",
"embedding = OpenAIEmbeddings(model=\"text-embedding-3-small\")\n",
"prompt = hub.pull(LANGCHAIN_HUB_PROMPT)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# The URL path comes from the dataset; a different loader is needed\n",
"# for HTML documents.\n",
"\n",
"# FIXME: the original line `URL_PATH = ` was a SyntaxError -- the cell could\n",
"# never run. Iterate over the Hugging Face dataset records (DATASET_NAME)\n",
"# and pull each record's PDF URL, then assign it below.\n",
"URL_PATH = \"\"  # e.g. \"https://arxiv.org/pdf/<arxiv-id>\"\n",
"if not URL_PATH:\n",
"    raise ValueError(\"Set URL_PATH from the dataset before running this cell\")\n",
"\n",
"loader = PyMuPDFLoader(URL_PATH, extract_images=True)\n",
"docs = loader.load()\n",
"\n",
"text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)\n",
"splits = text_splitter.split_documents(docs)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Store the chunks in Qdrant\n",
"from_splits = Qdrant.from_documents(\n",
" embedding=embedding,\n",
" collection_name=COLLECTION_NAME,\n",
" url=QDRANT_API_URL,\n",
" api_key=QDRANT_API_KEY,\n",
" prefer_grpc=True, \n",
" documents=splits,\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Retrieve Information using Metadata in Vector Store"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"qdrant = Qdrant.from_existing_collection(\n",
" embedding=embedding,\n",
" collection_name=COLLECTION_NAME,\n",
" url=QDRANT_API_URL,\n",
" api_key=QDRANT_API_KEY,\n",
" prefer_grpc=True, \n",
")\n",
"\n",
"retriever = qdrant.as_retriever(\n",
" search_type=\"similarity_score_threshold\",\n",
" search_kwargs={\"score_threshold\": 0.5, \"k\": 5}\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from operator import itemgetter\n",
"\n",
"# ChatGroq is already imported in the imports cell above; no re-import needed.\n",
"llm = ChatGroq(model=\"llama3-70b-8192\", temperature=0.3)\n",
"\n",
"# RAG chain: fan the question out to the retriever for context, then produce\n",
"# the LLM response while passing the retrieved context through so callers can\n",
"# inspect the source documents. The original middle stage\n",
"# `RunnablePassthrough.assign(context=itemgetter(\"context\"))` reassigned\n",
"# context to itself (a no-op) and has been removed.\n",
"rag_chain = (\n",
"    {\"context\": itemgetter(\"question\") | retriever, \"question\": itemgetter(\"question\")}\n",
"    | {\"response\": prompt | llm, \"context\": itemgetter(\"context\")}\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(rag_chain.get_graph().draw_ascii())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"response = rag_chain.invoke({\"question\" : QUESTION})"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# return the response. filter on the response key AIMessage content element\n",
"print(response[\"response\"].content)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"response[\"context\"]"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
|