That1BrainCell committed
Commit 26d63da · verified · 1 Parent(s): 6d1c6f2

Delete infridgement_chroma.py

Files changed (1)
  1. infridgement_chroma.py +0 -338
infridgement_chroma.py DELETED
@@ -1,338 +0,0 @@
- import streamlit as st
- import concurrent.futures
- from concurrent.futures import ThreadPoolExecutor, as_completed
- from functools import partial
- import numpy as np
- from io import StringIO
- import sys
- import time
- import pandas as pd
- from pymongo import MongoClient
- import plotly.express as px
- from pinecone import Pinecone, ServerlessSpec
- import chromadb
- import requests
- from io import BytesIO
- from PyPDF2 import PdfReader
- import hashlib
- import os
-
- # File Imports
- from embedding import get_embeddings, get_image_embeddings, get_embed_chroma, imporve_text  # Ensure this file/module is available
- from preprocess import filtering  # Ensure this file/module is available
- from search import *
-
-
- # Chroma Connections
- client = chromadb.PersistentClient(path="embeddings")
- collection = client.get_or_create_collection(name="data", metadata={"hnsw:space": "l2"})
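- # Chroma's "l2" HNSW space is squared Euclidean distance, so smaller query
- # distances mean closer chunks; PersistentClient keeps the index on disk
- # under ./embeddings across runs.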
-
-
- def generate_hash(content):
-     return hashlib.sha256(content.encode('utf-8')).hexdigest()
-
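- # get_key fingerprints a manual by hashing the text of its first and last
- # pages, so the same PDF always maps to the same Chroma key. Caveat: if the
- # download fails, `text` stays empty and every failed URL hashes to the
- # same key.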
- def get_key(link):
-     text = ''
-     try:
-         # Fetch the PDF file from the URL
-         response = requests.get(link)
-         response.raise_for_status()  # Raise an error for bad status codes
-
-         # Use BytesIO to handle the PDF content in memory
-         pdf_file = BytesIO(response.content)
-
-         # Load the PDF file
-         reader = PdfReader(pdf_file)
-         num_pages = len(reader.pages)
-
-         first_page_text = reader.pages[0].extract_text()
-         if first_page_text:
-             text += first_page_text
-
-
-         last_page_text = reader.pages[-1].extract_text()
-         if last_page_text:
-             text += last_page_text
-
-     except requests.exceptions.HTTPError as e:
-         print(f'HTTP error occurred: {e}')
-     except Exception as e:
-         print(f'An error occurred: {e}')
-
-     unique_key = generate_hash(text)
-
-     return unique_key
-
- # Cosine Similarity Function
- def cosine_similarity(vec1, vec2):
-     vec1 = np.array(vec1)
-     vec2 = np.array(vec2)
-
-     dot_product = np.dot(vec1, vec2.T)
-     magnitude_vec1 = np.linalg.norm(vec1)
-     magnitude_vec2 = np.linalg.norm(vec2)
-
-     if magnitude_vec1 == 0 or magnitude_vec2 == 0:
-         return 0.0
-
-     cosine_sim = dot_product / (magnitude_vec1 * magnitude_vec2)
-     return cosine_sim
-
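- # update_chroma stores one record per text chunk; ids are the document key
- # plus the chunk index, so re-upserting the same document overwrites its
- # records in place instead of duplicating them.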
- def update_chroma(product_name, url, key, text, vector, log_area):
-
-     id_list = [key + str(i) for i in range(len(text))]
-
-     metadata_list = [
-         {
-             'key': key,
-             'product_name': product_name,
-             'url': url,
-             'text': item
-         }
-         for item in text
-     ]
-
-     collection.upsert(
-         ids=id_list,
-         embeddings=vector,
-         metadatas=metadata_list
-     )
-
-     # `logger` is the module-level StreamCapture buffer set up in the
-     # Streamlit handler below.
-     logger.write(f"\n\u2713 Updated DB - {url}\n\n")
-     log_area.text(logger.getvalue())
-
-
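- # StreamCapture redirects sys.stdout into an in-memory buffer while active,
- # so print() output from score() is captured for the Streamlit "Console" tab.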
- # Logger class to capture output
- class StreamCapture:
-     def __init__(self):
-         self.output = StringIO()
-         self._stdout = sys.stdout
-
-     def __enter__(self):
-         sys.stdout = self.output
-         return self.output
-
-     def __exit__(self, exc_type, exc_val, exc_tb):
-         sys.stdout = self._stdout
-
- # Main Function
- def score(main_product, main_url, product_count, link_count, search, logger, log_area):
-
-
-     data = {}
-     similar_products = extract_similar_products(main_product)[:product_count]
-
-     print("--> Fetching Manual Links")
-     # Normal Filtering + Embedding -----------------------------------------------
-     if search == 'All':
-
-         def process_product(product, search_function, main_product):
-             search_result = search_function(product)
-             return filtering(search_result, main_product, product, link_count)
-
-
-         search_functions = {
-             'google': search_google,
-             'duckduckgo': search_duckduckgo,
-             # 'archive': search_archive,
-             'github': search_github,
-             'wikipedia': search_wikipedia
-         }
-
-         with ThreadPoolExecutor() as executor:
-             future_to_product_search = {
-                 executor.submit(process_product, product, search_function, main_product): (product, search_name)
-                 for product in similar_products
-                 for search_name, search_function in search_functions.items()
-             }
-
-             for future in as_completed(future_to_product_search):
-                 product, search_name = future_to_product_search[future]
-                 try:
-                     # Merge links from every engine for this product
-                     # (assumes filtering() returns a list of links).
-                     data.setdefault(product, []).extend(future.result())
-                 except Exception as e:
-                     print(f"Error processing product {product} with {search_name}: {e}")
-
-     else:
-
-         for product in similar_products:
-
-             if search == 'google':
-                 data[product] = filtering(search_google(product), main_product, product, link_count)
-             elif search == 'duckduckgo':
-                 data[product] = filtering(search_duckduckgo(product), main_product, product, link_count)
-             elif search == 'archive':
-                 data[product] = filtering(search_archive(product), main_product, product, link_count)
-             elif search == 'github':
-                 data[product] = filtering(search_github(product), main_product, product, link_count)
-             elif search == 'wikipedia':
-                 data[product] = filtering(search_wikipedia(product), main_product, product, link_count)
-
-
-     # Filtered Link -----------------------------------------
-     logger.write("\n\n\u2713 Filtered Links\n")
-     log_area.text(logger.getvalue())
-
-
-     # Main product Embeddings ---------------------------------
-     logger.write("\n\n--> Creating Main product Embeddings\n")
-
-     main_key = get_key(main_url)
-     main_text, main_vector = get_embed_chroma(main_url)
-
-     update_chroma(main_product, main_url, main_key, main_text, main_vector, log_area)
-
-     # log_area.text(logger.getvalue())
-     print("\n\n\u2713 Main Product embeddings Created")
-
-
-     logger.write("\n\n--> Creating Similar product Embeddings\n")
-     log_area.text(logger.getvalue())
-     test_embedding = [0] * 768
-
-     for product in data:
-         for link in data[product]:
-
-             url, _ = link
-             similar_key = get_key(url)
-
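-             # Existence check: query with an all-zero probe vector restricted
-             # to this document's key; an empty result means the manual has not
-             # been embedded into the collection yet.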
-             res = collection.query(
-                 query_embeddings=[test_embedding],
-                 n_results=1,
-                 where={"key": similar_key},
-             )
-
-             if not res['distances'][0]:
-                 similar_text, similar_vector = get_embed_chroma(url)
-                 update_chroma(product, url, similar_key, similar_text, similar_vector, log_area)
-
-
-     logger.write("\n\n\u2713 Similar Product embeddings Created\n")
-     log_area.text(logger.getvalue())
-
-     top_similar = []
-
-     for idx, chunk in enumerate(main_vector):
-         res = collection.query(
-             query_embeddings=[chunk],
-             n_results=1,
-             where={"key": {'$ne': main_key}},
-             include=['metadatas', 'embeddings', 'distances']
-         )
-
-         top_similar.append((main_text[idx], chunk, res, res['distances'][0]))
-
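-     # Each entry's last element is Chroma's one-element distances list, so an
-     # ascending sort keeps the chunks closest to their nearest foreign match.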
-     most_similar_items = sorted(top_similar, key=lambda x: x[3])[:top_similar_count]
-
-
-     logger.write("--------------- DONE -----------------\n")
-     log_area.text(logger.getvalue())
-
-     return most_similar_items
-
-
-
-
-
- # Streamlit Interface
- st.title("Check Infringement")
-
-
- # Inputs
- main_product = st.text_input('Enter Main Product Name', 'Philips led 7w bulb')
- main_url = st.text_input('Enter Main Product Manual URL', 'https://www.assets.signify.com/is/content/PhilipsConsumer/PDFDownloads/Colombia/technical-sheets/ODLI20180227_001-UPD-es_CO-Ficha_Tecnica_LED_MR16_Master_7W_Dim_12V_CRI90.pdf')
- search_method = st.selectbox('Choose Search Engine', ['All', 'duckduckgo', 'google', 'archive', 'github', 'wikipedia'])
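- # 'All' fans the search out over google, duckduckgo, github and wikipedia in
- # parallel inside score(); 'archive' is commented out of that map and only
- # runs when selected individually.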
-
- col1, col2, col3 = st.columns(3)
- with col1:
-     product_count = st.number_input("Number of Similar Products", min_value=1, step=1, format="%i")
- with col2:
-     link_count = st.number_input("Number of Links per product", min_value=1, step=1, format="%i")
- with col3:
-     need_image = st.selectbox("Process Images", ['True', 'False'])
-
- top_similar_count = st.number_input("Top Similarities to be displayed", value=3, min_value=1, step=1, format="%i")
- tag_option = "Complete Document Similarity"
-
-
- if st.button('Check for Infringement'):
-     global log_output  # Placeholder for log output
-
-     tab1, tab2 = st.tabs(["Output", "Console"])
-
-     with tab2:
-         log_output = st.empty()
-
-     with tab1:
-         with st.spinner('Processing...'):
-             with StreamCapture() as logger:
-                 top_similar_values = score(main_product, main_url, product_count, link_count, search_method, logger, log_output)
-
-         st.success('Processing complete!')
-
-         st.subheader("Cosine Similarity Scores")
-
-         for main_text, main_vector, response, _ in top_similar_values:
-             product_name = response['metadatas'][0][0]['product_name']
-             link = response['metadatas'][0][0]['url']
-             similar_text = response['metadatas'][0][0]['text']
-
-             cosine_score = cosine_similarity([main_vector], response['embeddings'][0])[0][0]
-
-             # Display the product information
-             with st.container():
-                 st.markdown(f"### [Product: {product_name}]({link})")
-                 st.markdown(f"#### Cosine Score: {cosine_score:.4f}")
-                 col1, col2 = st.columns(2)
-                 with col1:
-                     st.markdown(f"**Main Text:** \n{imporve_text(main_text)}")
-                 with col2:
-                     st.markdown(f"**Similar Text:** \n{imporve_text(similar_text)}")
-
-             st.markdown("---")
-
-     if need_image == 'True':
-         with st.spinner('Processing Images...'):
-             emb_main = get_image_embeddings(main_product)
-             similar_prod = extract_similar_products(main_product)[0]
-             emb_similar = get_image_embeddings(similar_prod)
-
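-             # Assumes get_image_embeddings returns at least five embeddings
-             # per product; the 5x5 grid below is hard-coded to match.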
-             similarity_matrix = np.zeros((5, 5))
-             for i in range(5):
-                 for j in range(5):
-                     similarity_matrix[i][j] = cosine_similarity([emb_main[i]], [emb_similar[j]])[0][0]
-
-             st.subheader("Image Similarity")
-             # Create an interactive heatmap
-             fig = px.imshow(similarity_matrix,
-                             labels=dict(x=f"{similar_prod} Images", y=f"{main_product} Images", color="Similarity"),
-                             x=[f"Image {i+1}" for i in range(5)],
-                             y=[f"Image {i+1}" for i in range(5)],
-                             color_continuous_scale="Viridis")
-
-             # Add title to the heatmap
-             fig.update_layout(title="Image Similarity Heatmap")
-
-             # Display the interactive heatmap
-             st.plotly_chart(fig)
-
-
-
-
- # main_product = 'Philips led 7w bulb'
- # main_url = 'https://www.assets.signify.com/is/content/PhilipsConsumer/PDFDownloads/Colombia/technical-sheets/ODLI20180227_001-UPD-es_CO-Ficha_Tecnica_LED_MR16_Master_7W_Dim_12V_CRI90.pdf'
- # search_method = 'duckduckgo'
-
- # product_count = 1
- # link_count = 1
- # need_image = False
-
-
- # tag_option = "Field Wise Document Similarity"
-
- # logger = StreamCapture()
- # score(main_product, main_url, product_count, link_count, search_method, logger, st.empty())
-
-
-