File size: 2,228 Bytes
011c6b2
 
0c7ffdb
 
 
2524123
79ecc72
 
 
c208ca1
0c7ffdb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79ecc72
 
 
0c7ffdb
0ee4a85
 
 
60eae40
0ee4a85
 
60eae40
9bbbf26
958bbd7
 
 
 
 
 
 
 
 
 
0ee4a85
 
 
 
 
60eae40
79ecc72
 
0c7ffdb
79ecc72
 
 
 
0c7ffdb
79ecc72
 
011c6b2
79ecc72
 
011c6b2
79ecc72
 
 
 
 
0c7ffdb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import streamlit as st
import torch
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoTokenizer, AutoModel
import tempfile
import textract
import docx2txt
import pdfplumber
import io

def last_token_pool(last_hidden_states: Tensor,
                    attention_mask: Tensor) -> Tensor:
    """Pool each sequence to the hidden state of its final attended token.

    When every row's last position is attended (left-padded or unpadded
    batch), the last column can be sliced out directly. Otherwise each
    row's final non-padding position is gathered individually.
    """
    batch_size = attention_mask.shape[0]
    all_last_attended = attention_mask[:, -1].sum() == batch_size
    if all_last_attended:
        return last_hidden_states[:, -1]
    # Index of the last token with attention_mask == 1 in each row.
    last_positions = attention_mask.sum(dim=1) - 1
    row_indices = torch.arange(batch_size, device=last_hidden_states.device)
    return last_hidden_states[row_indices, last_positions]

def get_detailed_instruct(task_description: str, query: str) -> str:
    """Prefix *query* with its task instruction for an instruction-tuned embedder."""
    return 'Instruct: ' + task_description + '\nQuery: ' + query

# --- Streamlit UI: page title, sidebar uploader, query box, search trigger ---
st.title("Text Similarity Model")

# Instruction string passed alongside queries for instruction-tuned embedding models.
task = 'Given a web search query, retrieve relevant passages that answer the query'  

# NOTE(review): 'xlsx' is accepted here but extract_text() below has no
# handler for it, so uploaded spreadsheets produce None — confirm intent.
docs = st.sidebar.file_uploader("Upload documents", accept_multiple_files=True, type=['txt','pdf','xlsx','docx']) 
query = st.text_input("Enter search query")
click = st.button("Search")

# (Removed duplicate mid-file imports of pdfplumber and docx2txt —
# both are already imported at the top of the file.)
def extract_text(doc):
    """Extract plain text from an uploaded file.

    Args:
        doc: A Streamlit UploadedFile-like object exposing ``.name``,
            ``.type`` and ``.read()``.

    Returns:
        The extracted text as a string, or None for unsupported types
        (e.g. xlsx, which the uploader accepts but is not handled here).
    """
    if doc.type == 'text/plain':
        return doc.read().decode('utf-8')

    if doc.name.endswith(".pdf"):
        raw = doc.read()

        # Remove null bytes without decoding
        raw = raw.replace(b'\x00', b'')

        # Bug fix: BytesIO was referenced unqualified (NameError) — only the
        # io module is imported, so qualify it. Use a context manager so the
        # PDF handle is closed, and skip image-only pages, for which
        # page.extract_text() returns None (joining None raised TypeError).
        with pdfplumber.open(io.BytesIO(raw)) as pdf:
            pages = [page.extract_text() for page in pdf.pages]
        return "\n".join(page for page in pages if page)

    if doc.name.endswith('.docx'):
        # Bug fix: docx2txt.process expects a filename or file-like object,
        # not raw bytes — wrap the upload's bytes in a BytesIO.
        return docx2txt.process(io.BytesIO(doc.read()))

    return None

# Run the ranking pipeline when the Search button is clicked with a non-empty query.
if click and query:
    doc_contents = []
    
    for doc in docs:
        # Extract text from each document
        # NOTE(review): extract_text() returns None for unsupported types
        # (e.g. xlsx), so doc_contents may contain None entries — verify the
        # embedding step tolerates that or filter them here.
        doc_text = extract_text(doc)  
        doc_contents.append(doc_text)

    # NOTE(review): get_embeddings, get_embedding, compute_similarity and
    # get_ranked_docs are not defined anywhere in this file — as written this
    # branch raises NameError at runtime. They presumably wrap the
    # AutoTokenizer/AutoModel + last_token_pool machinery imported/defined
    # above; confirm where these helpers should come from.
    doc_embeddings = get_embeddings(doc_contents)  
    query_embedding = get_embedding(query)
    
    scores = compute_similarity(query_embedding, doc_embeddings)
    ranked_docs = get_ranked_docs(scores)
    
    # NOTE(review): ranked_docs is assumed to yield (uploaded_file, score)
    # pairs, since .name is read off each item — confirm against the helper.
    st.write("Most Relevant Documents")
    for doc, score in ranked_docs:
        st.write(f"{doc.name} (score: {score:.2f})")