Spaces:

Adarsh-aot
/

pdf_qa

Paused

App Files Files Community

Adarsh-aot committed on Jul 3, 2024

Commit

6aabcd0

verified ·

1 Parent(s): 123356b

Upload 7 files

Browse files

Files changed (7) hide show

csv_app.py +96 -0
data_db/979e4073-abbb-4f24-a4c5-f131b2a82e2d/data_level0.bin +3 -0
data_db/979e4073-abbb-4f24-a4c5-f131b2a82e2d/header.bin +3 -0
data_db/979e4073-abbb-4f24-a4c5-f131b2a82e2d/length.bin +3 -0
data_db/979e4073-abbb-4f24-a4c5-f131b2a82e2d/link_lists.bin +3 -0
data_db/chroma.sqlite3 +0 -0
requirments.txt +155 -0

csv_app.py ADDED Viewed

	@@ -0,0 +1,96 @@

+# import csv
+# # Load sample data (a restaurant menu of items)
+# with open('./data.csv') as file:
+#     lines = csv.reader(file)
+#     # Store the name of the menu items in this array. In Chroma, a "document" is a string i.e. name, sentence, paragraph, etc.
+#     documents = []
+#     # Store the corresponding menu item IDs in this array.
+#     metadatas = []
+#     # Each "document" needs a unique ID. This is like the primary key of a relational database. We'll start at 1 and increment from there.
+#     ids = []
+#     id = 1
+#     # Loop thru each line and populate the 3 arrays.
+#     for i, line in enumerate(lines):
+#         if i==0:
+#             # Skip the first row (the column headers)
+#             continue
+#         documents.append(line[0])
+#         metadatas.append({"item_id": line[1]})
+#         ids.append(str(id))
+#         id+=1
+import chromadb
+from chromadb.utils import embedding_functions
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+from transformers import pipeline
+from langchain.llms import HuggingFacePipeline
+import torch
+# Instantiate chromadb instance. Data is stored in memory only.
+# chroma_client = chromadb.Client()
+# Instantiate chromadb instance. Data is stored on disk (a folder named 'my_vectordb' will be created in the same folder as this file).
+chroma_client = chromadb.PersistentClient(path="vector_db")
+# Select the embedding model to use.
+# List of model names can be found here https://www.sbert.net/docs/pretrained_models.html
+sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-mpnet-base-v2")
+# Use this to delete the database
+# chroma_client.delete_collection(name="my_collection")
+# Create the collection, aka vector database. Or, if database already exist, then use it. Specify the model that we want to use to do the embedding.
+collection = chroma_client.get_or_create_collection(name="my_collection", embedding_function=sentence_transformer_ef)
+# collection.add(
+#     documents=documents,
+#     metadatas=metadatas,
+#     ids=ids
+# )
+results = collection.query(
+    query_texts=["director"],
+    n_results=1,
+    include=['documents', 'distances', 'metadatas']
+)
+print(results['metadatas'])
+tokenizer = AutoTokenizer.from_pretrained("MBZUAI/LaMini-T5-738M")
+model = AutoModelForSeq2SeqLM.from_pretrained("MBZUAI/LaMini-T5-738M")
+pipe = pipeline(
+    "text2text-generation",
+    model=model,
+    tokenizer=tokenizer,
+    max_length=512
+)
+local_llm = HuggingFacePipeline(pipeline=pipe)
+context = results['documents'][0][0]
+question = "director job"
+l = f"""
+Use the following pieces of context to answer the question at the end . If you don't know the answer, just say that you don't know, don't try to make up an answer.
+{context}
+Question: {question}
+Helpful Answer:
+"""
+print(local_llm(l))

data_db/979e4073-abbb-4f24-a4c5-f131b2a82e2d/data_level0.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d3c9fd302f000d7790aa403c2d0d8fec363fe46f30b07d53020b6e33b22435a9
+size 1676000

data_db/979e4073-abbb-4f24-a4c5-f131b2a82e2d/header.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e87a1dc8bcae6f2c4bea6d5dd5005454d4dace8637dae29bff3c037ea771411e
+size 100

data_db/979e4073-abbb-4f24-a4c5-f131b2a82e2d/length.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b85016cc0826ff8a0c94eca60de20011730b672a4b305184a840e95e787a85be
+size 4000

data_db/979e4073-abbb-4f24-a4c5-f131b2a82e2d/link_lists.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
+size 0

data_db/chroma.sqlite3 ADDED Viewed

Binary file (303 kB). View file

requirments.txt ADDED Viewed

	@@ -0,0 +1,155 @@

+aiohttp==3.9.5
+aiosignal==1.3.1
+altair==5.3.0
+annotated-types==0.7.0
+anyio==4.4.0
+asgiref==3.8.1
+async-timeout==4.0.3
+attrs==23.2.0
+backoff==2.2.1
+bcrypt==4.1.3
+blinker==1.8.2
+build==1.2.1
+cachetools==5.3.3
+certifi==2024.6.2
+charset-normalizer==3.3.2
+chroma-hnswlib==0.7.3
+chromadb==0.5.3
+click==8.1.7
+colorama==0.4.6
+coloredlogs==15.0.1
+dataclasses-json==0.6.7
+Deprecated==1.2.14
+dnspython==2.6.1
+email_validator==2.2.0
+exceptiongroup==1.2.1
+fastapi==0.111.0
+fastapi-cli==0.0.4
+filelock==3.15.4
+flatbuffers==24.3.25
+frozenlist==1.4.1
+fsspec==2024.6.0
+gitdb==4.0.11
+GitPython==3.1.43
+google-auth==2.30.0
+googleapis-common-protos==1.63.2
+greenlet==3.0.3
+grpcio==1.64.1
+h11==0.14.0
+httpcore==1.0.5
+httptools==0.6.1
+httpx==0.27.0
+huggingface-hub==0.23.4
+humanfriendly==10.0
+idna==3.7
+importlib_metadata==7.1.0
+importlib_resources==6.4.0
+InstructorEmbedding==1.0.1
+intel-openmp==2021.4.0
+Jinja2==3.1.4
+joblib==1.4.2
+jsonpatch==1.33
+jsonpointer==3.0.0
+jsonschema==4.22.0
+jsonschema-specifications==2023.12.1
+kubernetes==30.1.0
+langchain==0.2.5
+langchain-community==0.2.5
+langchain-core==0.2.9
+langchain-text-splitters==0.2.1
+langsmith==0.1.82
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
+marshmallow==3.21.3
+mdurl==0.1.2
+mkl==2021.4.0
+mmh3==4.1.0
+monotonic==1.6
+mpmath==1.3.0
+multidict==6.0.5
+mypy-extensions==1.0.0
+networkx==3.3
+nltk==3.8.1
+numpy==1.26.4
+oauthlib==3.2.2
+onnxruntime==1.18.0
+opentelemetry-api==1.25.0
+opentelemetry-exporter-otlp-proto-common==1.25.0
+opentelemetry-exporter-otlp-proto-grpc==1.25.0
+opentelemetry-instrumentation==0.46b0
+opentelemetry-instrumentation-asgi==0.46b0
+opentelemetry-instrumentation-fastapi==0.46b0
+opentelemetry-proto==1.25.0
+opentelemetry-sdk==1.25.0
+opentelemetry-semantic-conventions==0.46b0
+opentelemetry-util-http==0.46b0
+orjson==3.10.5
+overrides==7.7.0
+packaging==24.1
+pandas==2.2.2
+pillow==10.3.0
+posthog==3.5.0
+protobuf==4.25.3
+pyarrow==16.1.0
+pyasn1==0.6.0
+pyasn1_modules==0.4.0
+pydantic==2.7.4
+pydantic_core==2.18.4
+pydeck==0.9.1
+Pygments==2.18.0
+pypdf==4.2.0
+PyPika==0.48.9
+pyproject_hooks==1.1.0
+pyreadline3==3.4.1
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+python-multipart==0.0.9
+pytz==2024.1
+PyYAML==6.0.1
+referencing==0.35.1
+regex==2024.5.15
+requests==2.32.3
+requests-oauthlib==2.0.0
+rich==13.7.1
+rpds-py==0.18.1
+rsa==4.9
+safetensors==0.4.3
+scikit-learn==1.5.0
+scipy==1.14.0
+sentence-transformers==2.2.2
+sentencepiece==0.2.0
+shellingham==1.5.4
+six==1.16.0
+smmap==5.0.1
+sniffio==1.3.1
+SQLAlchemy==2.0.31
+starlette==0.37.2
+streamlit==1.36.0
+sympy==1.12.1
+tbb==2021.13.0
+tenacity==8.4.2
+threadpoolctl==3.5.0
+tiktoken==0.7.0
+tokenizers==0.19.1
+toml==0.10.2
+tomli==2.0.1
+toolz==0.12.1
+torch==2.3.1
+torchvision==0.18.1
+tornado==6.4.1
+tqdm==4.66.4
+transformers==4.41.2
+typer==0.12.3
+typing-inspect==0.9.0
+typing_extensions==4.12.2
+tzdata==2024.1
+ujson==5.10.0
+urllib3==2.2.2
+uvicorn==0.30.1
+watchdog==4.0.1
+watchfiles==0.22.0
+websocket-client==1.8.0
+websockets==12.0
+wrapt==1.16.0
+yarl==1.9.4
+zipp==3.19.2