Spaces:

notabaka
/

ASRtest

Runtime error

App Files Community

notabaka committed on Feb 23, 2024

Commit

79ecc72

1 Parent(s): 0c7ffdb

tststml

Browse files

Files changed (2) hide show

app.py +37 -27
requirements.txt +4 -1

app.py CHANGED Viewed

@@ -3,6 +3,9 @@ import torch
 import torch.nn.functional as F
 from torch import Tensor
 from transformers import AutoTokenizer, AutoModel
 def last_token_pool(last_hidden_states: Tensor,
                  attention_mask: Tensor) -> Tensor:
@@ -21,37 +24,44 @@ st.title("Text Similarity Model")
 task = 'Given a web search query, retrieve relevant passages that answer the query'
-query1 = st.text_input("Enter first query")
-query2 = st.text_input("Enter second query")
-if query1 and query2:
-    queries = [
-        get_detailed_instruct(task, query1),
-        get_detailed_instruct(task, query2)
-    ]
-    passages = [
-    "To bake a delicious chocolate cake, you'll need the following ingredients: all-purpose flour, sugar, cocoa powder, baking powder, baking soda, salt, eggs, milk, vegetable oil, and vanilla extract. Start by preheating your oven to 350°F (175°C). In a mixing bowl, combine the dry ingredients (flour, sugar, cocoa powder, baking powder, baking soda, and salt). In a separate bowl, whisk together the wet ingredients (eggs, milk, vegetable oil, and vanilla extract). Gradually add the wet mixture to the dry ingredients, stirring until well combined. Pour the batter into a greased cake pan and bake for 30-35 minutes. Let it cool before frosting with your favorite chocolate frosting. Enjoy your homemade chocolate cake!",
-    "The flu, or influenza, is an illness caused by influenza viruses. Common symptoms of the flu include a high fever, chills, cough, sore throat, runny or stuffy nose, body aches, headache, fatigue, and sometimes nausea and vomiting. These symptoms can come on suddenly and are usually more severe than the common cold. It's important to get plenty of rest, stay hydrated, and consult a healthcare professional if you suspect you have the flu. In some cases, antiviral medications can help alleviate symptoms and reduce the duration of the illness."]
-    tokenizer = AutoTokenizer.from_pretrained('Salesforce/SFR-Embedding-Mistral')
-    model = AutoModel.from_pretrained('Salesforce/SFR-Embedding-Mistral')
-    # Get embeddings
-    max_length = 4096
-    input_texts = queries + passages
-    batch_dict = tokenizer(input_texts, max_length=max_length, padding=True, truncation=True, return_tensors="pt")
-    outputs = model(**batch_dict)
-    embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
-    # Normalize embeddings
-    embeddings = F.normalize(embeddings, p=2, dim=1)
-    scores = (embeddings[:2] @ embeddings[2:].T) * 100
-    st.write("Similarity scores:", scores.tolist())

 import torch.nn.functional as F
 from torch import Tensor
 from transformers import AutoTokenizer, AutoModel
+import textract
+import docx2txt
+import pdfplumber
 def last_token_pool(last_hidden_states: Tensor,
                  attention_mask: Tensor) -> Tensor:
 task = 'Given a web search query, retrieve relevant passages that answer the query'
+docs = st.sidebar.file_uploader("Upload documents", accept_multiple_files=True, type=['txt','pdf','xlsx','docx'])
+query = st.text_input("Enter search query")
+click = st.button("Search")
+if click and query:
+    doc_contents = []
+    for doc in docs:
+        # Extract text from each document
+        doc_text = extract_text(doc)
+        doc_contents.append(doc_text)
+    doc_embeddings = get_embeddings(doc_contents)
+    query_embedding = get_embedding(query)
+    scores = compute_similarity(query_embedding, doc_embeddings)
+    ranked_docs = get_ranked_docs(scores)
+    st.write("Most Relevant Documents")
+    for doc, score in ranked_docs:
+        st.write(f"{doc.name} (score: {score:.2f})")
+def extract_text(doc):
+    if doc.type == 'text/plain':
+        return doc.getvalue().decode("utf-8")
+    if doc.type == "application/pdf":
+        with pdfplumber.open(doc) as pdf:
+            pages = [page.extract_text() for page in pdf.pages]
+            return "\n".join(pages)
+    if doc.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
+        return docx2txt.process(doc)
+    if doc.name.endswith(".xlsx"):
+        text = textract.process(doc)
+        return text.decode("utf-8")
+    return None

requirements.txt CHANGED Viewed

@@ -1,2 +1,5 @@
 torch
-transformers

 torch
+transformers
+textract
+docx2txt
+pdfplumber