AkashDataScience committed on
Commit
dec661d
·
1 Parent(s): 8fa6469

First commit

Browse files
Files changed (4) hide show
  1. app.py +74 -0
  2. data/amazon-10-k-2024.pdf +0 -0
  3. data/goog-10-k-2023.pdf +0 -0
  4. requirements.txt +154 -0
app.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+ from pinecone import Pinecone
4
+ from langchain_chroma import Chroma
5
+ from langchain_core.prompts import PromptTemplate
6
+ from langchain_pinecone import PineconeVectorStore
7
+ from langchain_community.vectorstores import FAISS
8
+ from langchain_community.vectorstores import LanceDB
9
+ from langchain_text_splitters import CharacterTextSplitter
10
+ from langchain_community.document_loaders import PyPDFLoader
11
+ from langchain_google_genai import GoogleGenerativeAIEmbeddings, GoogleGenerativeAI
12
+
13
# Shared Google Generative AI clients, created once when the module is imported.
# NOTE(review): presumably both read their API key from the environment
# (e.g. GOOGLE_API_KEY) — confirm against the deployment configuration.
embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
gemini = GoogleGenerativeAI(model="models/gemini-2.0-flash")

# Prompt layout with two placeholders: {context} and {question}.
# NOTE(review): the "?" directly after {context} looks unintentional; it is
# part of the text sent to the model, so confirm before changing it.
prompt_template = """

Context:\n {context}?\n
Question: \n{question}\n

Answer:
"""

prompt = PromptTemplate(template = prompt_template, input_variables = ["context", "question"])

# Pipe the rendered prompt into the Gemini model.
# NOTE(review): `chain` is not referenced anywhere else in the visible code.
chain = prompt | gemini
27
+
28
def inference(pdf_path, chunk_size, chunk_overlap):
    """Split a PDF into chunks and store their embeddings in four vector stores.

    The chunks are embedded with the module-level ``embeddings`` model and
    written to Chroma (``./chroma_db``), FAISS (``./faiss_db``), LanceDB
    (``./lance_db``) and the remote Pinecone index ``langchain-test-index``.
    The Pinecone index is emptied first so it only holds the current document.

    Args:
        pdf_path: Path of the uploaded PDF; falsy when nothing was uploaded.
        chunk_size: Maximum chunk size passed to ``CharacterTextSplitter``.
        chunk_overlap: Overlap between consecutive chunks.

    Returns:
        A status message for the UI: either the success message or a short
        explanation of why nothing was stored.

    Raises:
        KeyError: If the ``PINECONE_API_KEY`` environment variable is not set.

    NOTE(review): ``PyPDFLoader`` and ``FAISS`` rely on the ``pypdf`` and
    ``faiss-cpu`` packages, which do not appear in the pinned requirements
    list of this commit — confirm they are installed in the runtime.
    """
    # Validate everything up front so that bad input never reaches the
    # destructive Pinecone wipe below.
    if not pdf_path:
        return "Please upload a PDF file before storing embeddings."

    # Sliders may deliver floats; the splitter expects integers.
    chunk_size = int(chunk_size)
    chunk_overlap = int(chunk_overlap)
    if chunk_size <= 0:
        return "Chunk size must be greater than 0."
    if chunk_overlap < 0:
        return "Chunk overlap cannot be negative."
    if chunk_overlap > chunk_size:
        return "Chunk overlap cannot be larger than chunk size."

    raw_documents = PyPDFLoader(pdf_path).load()
    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    documents = text_splitter.split_documents(raw_documents)
    if not documents:
        # Nothing to embed: keep the existing stores (and the remote index) intact.
        return "No text could be extracted from the PDF; nothing was stored."

    # Imported here so the module-level import block stays untouched.
    from pinecone.exceptions import NotFoundException

    pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
    index_name = "langchain-test-index"
    index = pc.Index(host="https://langchain-test-index-la2n80y.svc.aped-4627-b74a.pinecone.io")

    try:
        index.delete(delete_all=True)
    except NotFoundException:
        # The index holds no vectors yet (e.g. first run), so there is
        # nothing to clear; continue with the upload.
        pass

    Chroma.from_documents(documents, embeddings, persist_directory="./chroma_db")
    faiss_db = FAISS.from_documents(documents, embeddings)
    faiss_db.save_local("./faiss_db")
    LanceDB.from_documents(documents, embeddings, uri="./lance_db")
    PineconeVectorStore.from_documents(documents, index_name=index_name,
                                       embedding=embeddings)

    return "All embeddings are stored in vector database"
49
+
50
# Text shown at the top of the page.
title = "PDF Chat"
description = "A simple Gradio interface to query PDFs and compare vector database"
# Sample inputs in the order (pdf path, chunk size, chunk overlap).
# NOTE(review): not wired into the UI in the visible code.
examples = [["data/amazon-10-k-2024.pdf", 1000, 100],
            ["data/goog-10-k-2023.pdf", 1000, 100]]

# Two-column layout: inputs and action buttons on the left, status on the right.
with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
    gr.Markdown(f"# {title}\n{description}")
    with gr.Row():
        with gr.Column():
            pdf = gr.UploadButton(file_types=[".pdf"])
            # Keyword arguments keep these calls valid regardless of which
            # Slider parameters are keyword-only. The chunk size starts at
            # 100 because a chunk size of 0 is never usable by the splitter.
            chunk_size = gr.Slider(minimum=100, maximum=2000, value=1000, step=100,
                                   label="Size of Chunk")
            chunk_overlap = gr.Slider(minimum=0, maximum=1000, value=100, step=100,
                                      label="Size of Chunk Overlap")
            with gr.Row():
                clear_btn = gr.ClearButton(components=[pdf, chunk_size, chunk_overlap])
                submit_btn = gr.Button("Store Embeddings", variant='primary')
        with gr.Column():
            message = gr.Textbox(label="Status", type="text")

    # Run the embedding pipeline and show its status message.
    submit_btn.click(inference, inputs=[pdf, chunk_size, chunk_overlap], outputs=message)

demo.launch()
71
+
72
+
73
+
74
+
data/amazon-10-k-2024.pdf ADDED
The diff for this file is too large to render. See raw diff
 
data/goog-10-k-2023.pdf ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==23.2.1
2
+ aiohappyeyeballs==2.6.1
3
+ aiohttp==3.10.11
4
+ aiosignal==1.3.2
5
+ annotated-types==0.7.0
6
+ anyio==4.8.0
7
+ asgiref==3.8.1
8
+ attrs==25.3.0
9
+ audioop-lts==0.2.1
10
+ backoff==2.2.1
11
+ bcrypt==4.3.0
12
+ build==1.2.2.post1
13
+ cachetools==5.5.2
14
+ certifi==2025.1.31
15
+ charset-normalizer==3.4.1
16
+ chroma-hnswlib==0.7.6
17
+ chromadb==0.6.3
18
+ click==8.1.8
19
+ colorama==0.4.6
20
+ coloredlogs==15.0.1
21
+ dataclasses-json==0.6.7
22
+ Deprecated==1.2.18
23
+ deprecation==2.1.0
24
+ distro==1.9.0
25
+ durationpy==0.9
26
+ fastapi==0.115.11
27
+ ffmpy==0.5.0
28
+ filelock==3.18.0
29
+ filetype==1.2.0
30
+ flatbuffers==25.2.10
31
+ frozenlist==1.5.0
32
+ fsspec==2025.3.0
33
+ google-ai-generativelanguage==0.6.16
34
+ google-api-core==2.24.2
35
+ google-auth==2.38.0
36
+ googleapis-common-protos==1.69.1
37
+ gradio==5.21.0
38
+ gradio_client==1.7.2
39
+ greenlet==3.1.1
40
+ groovy==0.1.2
41
+ grpcio==1.71.0
42
+ grpcio-status==1.71.0
43
+ h11==0.14.0
44
+ httpcore==1.0.7
45
+ httptools==0.6.4
46
+ httpx==0.28.1
47
+ httpx-sse==0.4.0
48
+ huggingface-hub==0.29.3
49
+ humanfriendly==10.0
50
+ idna==3.10
51
+ importlib_metadata==8.6.1
52
+ importlib_resources==6.5.2
53
+ iniconfig==2.0.0
54
+ Jinja2==3.1.6
55
+ jsonpatch==1.33
56
+ jsonpointer==3.0.0
57
+ kubernetes==32.0.1
58
+ lancedb==0.21.1
59
+ langchain==0.3.20
60
+ langchain-chroma==0.2.2
61
+ langchain-community==0.3.19
62
+ langchain-core==0.3.45
63
+ langchain-google-genai==2.1.0
64
+ langchain-pinecone==0.2.3
65
+ langchain-tests==0.3.14
66
+ langchain-text-splitters==0.3.6
67
+ langsmith==0.3.15
68
+ markdown-it-py==3.0.0
69
+ MarkupSafe==2.1.5
70
+ marshmallow==3.26.1
71
+ mdurl==0.1.2
72
+ mmh3==5.1.0
73
+ monotonic==1.6
74
+ mpmath==1.3.0
75
+ multidict==6.1.0
76
+ mypy-extensions==1.0.0
77
+ numpy==1.26.4
78
+ oauthlib==3.2.2
79
+ onnxruntime==1.21.0
80
+ opentelemetry-api==1.31.0
81
+ opentelemetry-exporter-otlp-proto-common==1.31.0
82
+ opentelemetry-exporter-otlp-proto-grpc==1.31.0
83
+ opentelemetry-instrumentation==0.52b0
84
+ opentelemetry-instrumentation-asgi==0.52b0
85
+ opentelemetry-instrumentation-fastapi==0.52b0
86
+ opentelemetry-proto==1.31.0
87
+ opentelemetry-sdk==1.31.0
88
+ opentelemetry-semantic-conventions==0.52b0
89
+ opentelemetry-util-http==0.52b0
90
+ orjson==3.10.15
91
+ overrides==7.7.0
92
+ packaging==24.2
93
+ pandas==2.2.3
94
+ pillow==11.1.0
95
+ pinecone==5.4.2
96
+ pinecone-plugin-inference==3.1.0
97
+ pinecone-plugin-interface==0.0.7
98
+ pluggy==1.5.0
99
+ posthog==3.20.0
100
+ propcache==0.3.0
101
+ proto-plus==1.26.1
102
+ protobuf==5.29.3
103
+ pyarrow==19.0.1
104
+ pyasn1==0.6.1
105
+ pyasn1_modules==0.4.1
106
+ pydantic==2.10.6
107
+ pydantic-settings==2.8.1
108
+ pydantic_core==2.27.2
109
+ pydub==0.25.1
110
+ Pygments==2.19.1
111
+ pylance==0.24.1
112
+ PyPika==0.48.9
113
+ pyproject_hooks==1.2.0
114
+ pyreadline3==3.5.4
115
+ pytest==8.3.5
116
+ pytest-asyncio==0.25.3
117
+ pytest-socket==0.7.0
118
+ python-dateutil==2.9.0.post0
119
+ python-dotenv==1.0.1
120
+ python-multipart==0.0.20
121
+ pytz==2025.1
122
+ PyYAML==6.0.2
123
+ requests==2.32.3
124
+ requests-oauthlib==2.0.0
125
+ requests-toolbelt==1.0.0
126
+ rich==13.9.4
127
+ rsa==4.9
128
+ ruff==0.11.0
129
+ safehttpx==0.1.6
130
+ semantic-version==2.10.0
131
+ shellingham==1.5.4
132
+ six==1.17.0
133
+ sniffio==1.3.1
134
+ SQLAlchemy==2.0.39
135
+ starlette==0.46.1
136
+ sympy==1.13.3
137
+ syrupy==4.9.0
138
+ tenacity==9.0.0
139
+ tokenizers==0.21.1
140
+ tomlkit==0.13.2
141
+ tqdm==4.67.1
142
+ typer==0.15.2
143
+ typing-inspect==0.9.0
144
+ typing_extensions==4.12.2
145
+ tzdata==2025.1
146
+ urllib3==2.3.0
147
+ uvicorn==0.34.0
148
+ watchfiles==1.0.4
149
+ websocket-client==1.8.0
150
+ websockets==15.0.1
151
+ wrapt==1.17.2
152
+ yarl==1.18.3
153
+ zipp==3.21.0
154
+ zstandard==0.23.0