Adarsh-aot committed on
Commit
6aabcd0
·
verified ·
1 Parent(s): 123356b

Upload 7 files

Browse files
csv_app.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Minimal retrieval-augmented QA demo: Chroma lookup + local LaMini-T5.

The script opens a persistent Chroma collection, retrieves the single best
match for a fixed query string, prints that match's metadata, and then asks a
local LaMini-T5 model to answer a fixed question using the retrieved document
as context.
"""

# ---------------------------------------------------------------------------
# Ingestion step (intentionally left commented out).
# NOTE(review): presumably this was run once to populate the collection from
# ./data.csv -- confirm before re-enabling it.
# ---------------------------------------------------------------------------
# import csv
#
# # Load sample data (a restaurant menu of items)
# with open('./data.csv') as file:
#     lines = csv.reader(file)
#
#     # Store the name of the menu items in this array. In Chroma, a "document"
#     # is a string i.e. name, sentence, paragraph, etc.
#     documents = []
#
#     # Store the corresponding menu item IDs in this array.
#     metadatas = []
#
#     # Each "document" needs a unique ID. This is like the primary key of a
#     # relational database. We'll start at 1 and increment from there.
#     ids = []
#     id = 1
#
#     # Loop thru each line and populate the 3 arrays.
#     for i, line in enumerate(lines):
#         if i==0:
#             # Skip the first row (the column headers)
#             continue
#
#         documents.append(line[0])
#         metadatas.append({"item_id": line[1]})
#         ids.append(str(id))
#         id+=1


import chromadb
from chromadb.utils import embedding_functions
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import pipeline
# NOTE(review): `langchain.llms` is a deprecated import location in recent
# LangChain releases; the import is kept unchanged here so that no new
# package is introduced by this edit.
from langchain.llms import HuggingFacePipeline
import torch

COLLECTION_NAME = "my_collection"
MODEL_NAME = "MBZUAI/LaMini-T5-738M"

# Instantiate chromadb instance. Data is stored in memory only.
# chroma_client = chromadb.Client()

# Instantiate chromadb instance. Data is stored on disk, in the folder given
# by `path` ("vector_db"), resolved relative to the current working directory.
# NOTE(review): the commit that added this script ships a "data_db" folder;
# confirm that "vector_db" is the intended location.
chroma_client = chromadb.PersistentClient(path="vector_db")

# Select the embedding model to use.
# List of model names can be found here https://www.sbert.net/docs/pretrained_models.html
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-mpnet-base-v2"
)

# Use this to delete the database
# chroma_client.delete_collection(name="my_collection")

# Create the collection, aka vector database. Or, if the database already
# exists, then use it. Specify the model that we want to use to do the
# embedding.
collection = chroma_client.get_or_create_collection(
    name=COLLECTION_NAME,
    embedding_function=sentence_transformer_ef,
)

# collection.add(
#     documents=documents,
#     metadatas=metadatas,
#     ids=ids
# )

# Retrieve the single closest document for the query text.
results = collection.query(
    query_texts=["director"],
    n_results=1,
    include=['documents', 'distances', 'metadatas'],
)
print(results['metadatas'])

# `documents` holds one list of hits per query text. With an empty collection
# that inner list is empty, so fail with a readable message instead of letting
# `[0][0]` raise a bare IndexError further down.
matches = results['documents'][0] if results['documents'] else []
if not matches:
    raise SystemExit(
        f'No documents matched the query; is the "{COLLECTION_NAME}" '
        "collection populated?"
    )

# Load the local sequence-to-sequence model and wrap it for LangChain.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=512,
)

local_llm = HuggingFacePipeline(pipeline=pipe)

context = matches[0]
question = "director job"

# The prompt text is emitted to the model verbatim; keep it unchanged.
prompt = f"""
Use the following pieces of context to answer the question at the end . If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Helpful Answer:
"""

# `invoke` is the supported entry point; calling the LLM object directly is
# deprecated in current LangChain releases.
print(local_llm.invoke(prompt))
data_db/979e4073-abbb-4f24-a4c5-f131b2a82e2d/data_level0.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3c9fd302f000d7790aa403c2d0d8fec363fe46f30b07d53020b6e33b22435a9
3
+ size 1676000
data_db/979e4073-abbb-4f24-a4c5-f131b2a82e2d/header.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e87a1dc8bcae6f2c4bea6d5dd5005454d4dace8637dae29bff3c037ea771411e
3
+ size 100
data_db/979e4073-abbb-4f24-a4c5-f131b2a82e2d/length.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b85016cc0826ff8a0c94eca60de20011730b672a4b305184a840e95e787a85be
3
+ size 4000
data_db/979e4073-abbb-4f24-a4c5-f131b2a82e2d/link_lists.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
3
+ size 0
data_db/chroma.sqlite3 ADDED
Binary file (303 kB). View file
 
requirments.txt ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiohttp==3.9.5
2
+ aiosignal==1.3.1
3
+ altair==5.3.0
4
+ annotated-types==0.7.0
5
+ anyio==4.4.0
6
+ asgiref==3.8.1
7
+ async-timeout==4.0.3
8
+ attrs==23.2.0
9
+ backoff==2.2.1
10
+ bcrypt==4.1.3
11
+ blinker==1.8.2
12
+ build==1.2.1
13
+ cachetools==5.3.3
14
+ certifi==2024.6.2
15
+ charset-normalizer==3.3.2
16
+ chroma-hnswlib==0.7.3
17
+ chromadb==0.5.3
18
+ click==8.1.7
19
+ colorama==0.4.6
20
+ coloredlogs==15.0.1
21
+ dataclasses-json==0.6.7
22
+ Deprecated==1.2.14
23
+ dnspython==2.6.1
24
+ email_validator==2.2.0
25
+ exceptiongroup==1.2.1
26
+ fastapi==0.111.0
27
+ fastapi-cli==0.0.4
28
+ filelock==3.15.4
29
+ flatbuffers==24.3.25
30
+ frozenlist==1.4.1
31
+ fsspec==2024.6.0
32
+ gitdb==4.0.11
33
+ GitPython==3.1.43
34
+ google-auth==2.30.0
35
+ googleapis-common-protos==1.63.2
36
+ greenlet==3.0.3
37
+ grpcio==1.64.1
38
+ h11==0.14.0
39
+ httpcore==1.0.5
40
+ httptools==0.6.1
41
+ httpx==0.27.0
42
+ huggingface-hub==0.23.4
43
+ humanfriendly==10.0
44
+ idna==3.7
45
+ importlib_metadata==7.1.0
46
+ importlib_resources==6.4.0
47
+ InstructorEmbedding==1.0.1
48
+ intel-openmp==2021.4.0
49
+ Jinja2==3.1.4
50
+ joblib==1.4.2
51
+ jsonpatch==1.33
52
+ jsonpointer==3.0.0
53
+ jsonschema==4.22.0
54
+ jsonschema-specifications==2023.12.1
55
+ kubernetes==30.1.0
56
+ langchain==0.2.5
57
+ langchain-community==0.2.5
58
+ langchain-core==0.2.9
59
+ langchain-text-splitters==0.2.1
60
+ langsmith==0.1.82
61
+ markdown-it-py==3.0.0
62
+ MarkupSafe==2.1.5
63
+ marshmallow==3.21.3
64
+ mdurl==0.1.2
65
+ mkl==2021.4.0
66
+ mmh3==4.1.0
67
+ monotonic==1.6
68
+ mpmath==1.3.0
69
+ multidict==6.0.5
70
+ mypy-extensions==1.0.0
71
+ networkx==3.3
72
+ nltk==3.8.1
73
+ numpy==1.26.4
74
+ oauthlib==3.2.2
75
+ onnxruntime==1.18.0
76
+ opentelemetry-api==1.25.0
77
+ opentelemetry-exporter-otlp-proto-common==1.25.0
78
+ opentelemetry-exporter-otlp-proto-grpc==1.25.0
79
+ opentelemetry-instrumentation==0.46b0
80
+ opentelemetry-instrumentation-asgi==0.46b0
81
+ opentelemetry-instrumentation-fastapi==0.46b0
82
+ opentelemetry-proto==1.25.0
83
+ opentelemetry-sdk==1.25.0
84
+ opentelemetry-semantic-conventions==0.46b0
85
+ opentelemetry-util-http==0.46b0
86
+ orjson==3.10.5
87
+ overrides==7.7.0
88
+ packaging==24.1
89
+ pandas==2.2.2
90
+ pillow==10.3.0
91
+ posthog==3.5.0
92
+ protobuf==4.25.3
93
+ pyarrow==16.1.0
94
+ pyasn1==0.6.0
95
+ pyasn1_modules==0.4.0
96
+ pydantic==2.7.4
97
+ pydantic_core==2.18.4
98
+ pydeck==0.9.1
99
+ Pygments==2.18.0
100
+ pypdf==4.2.0
101
+ PyPika==0.48.9
102
+ pyproject_hooks==1.1.0
103
+ pyreadline3==3.4.1
104
+ python-dateutil==2.9.0.post0
105
+ python-dotenv==1.0.1
106
+ python-multipart==0.0.9
107
+ pytz==2024.1
108
+ PyYAML==6.0.1
109
+ referencing==0.35.1
110
+ regex==2024.5.15
111
+ requests==2.32.3
112
+ requests-oauthlib==2.0.0
113
+ rich==13.7.1
114
+ rpds-py==0.18.1
115
+ rsa==4.9
116
+ safetensors==0.4.3
117
+ scikit-learn==1.5.0
118
+ scipy==1.14.0
119
+ sentence-transformers==2.2.2
120
+ sentencepiece==0.2.0
121
+ shellingham==1.5.4
122
+ six==1.16.0
123
+ smmap==5.0.1
124
+ sniffio==1.3.1
125
+ SQLAlchemy==2.0.31
126
+ starlette==0.37.2
127
+ streamlit==1.36.0
128
+ sympy==1.12.1
129
+ tbb==2021.13.0
130
+ tenacity==8.4.2
131
+ threadpoolctl==3.5.0
132
+ tiktoken==0.7.0
133
+ tokenizers==0.19.1
134
+ toml==0.10.2
135
+ tomli==2.0.1
136
+ toolz==0.12.1
137
+ torch==2.3.1
138
+ torchvision==0.18.1
139
+ tornado==6.4.1
140
+ tqdm==4.66.4
141
+ transformers==4.41.2
142
+ typer==0.12.3
143
+ typing-inspect==0.9.0
144
+ typing_extensions==4.12.2
145
+ tzdata==2024.1
146
+ ujson==5.10.0
147
+ urllib3==2.2.2
148
+ uvicorn==0.30.1
149
+ watchdog==4.0.1
150
+ watchfiles==0.22.0
151
+ websocket-client==1.8.0
152
+ websockets==12.0
153
+ wrapt==1.16.0
154
+ yarl==1.9.4
155
+ zipp==3.19.2