import gradio as gr
import requests
import os
import re

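# Gradio app: semantic search over an uploaded .txt file using the NASA-SMD
# sentence-transformer model via the Hugging Face Inference API. The token is
# read from the API_TOKEN environment variable (e.g. a Space secret).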
API_TOKEN = os.getenv('API_TOKEN')
API_URL = "https://api-inference.huggingface.co/models/nasa-impact/nasa-smd-ibm-st-v2"
headers = {"Authorization": f"Bearer {API_TOKEN}"}

def query_similarity(source_sentence, sentences):
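    # Call the Inference API's sentence-similarity task: it scores the source
    # sentence against each candidate sentence and returns one float per
    # candidate, e.g. [0.71, 0.42, 0.93].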
    payload = {
        "inputs": {
            "source_sentence": source_sentence,
            "sentences": sentences
        }
    }
    response = requests.post(API_URL, headers=headers, json=payload)
    response.raise_for_status()  # surface HTTP errors (bad token, model still loading, etc.)
    return response.json()

def format_output(scores, sentences):
    # The sentence-similarity endpoint returns one score per candidate, so pair
    # each chunk with its score and sort best-first.
    results = sorted(zip(sentences, scores), key=lambda x: x[1], reverse=True)
    formatted_results = []
    for sentence, score in results:
        formatted_results.append(f"Sentence: {sentence}, Score: {score:.4f}")
    return "\n".join(formatted_results)

def split_into_chunks(text, chunk_size=100):
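    # Greedily pack whole sentences into chunks of at most ~chunk_size words,
    # so long documents are scored chunk-by-chunk instead of as one huge input.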
    sentences = re.split(r'(?<=[.!?]) +', text)  # Split text into sentences
    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        sentence_length = len(sentence.split())
        # Start a new chunk only if the current one is non-empty, so an
        # oversized first sentence does not produce an empty leading chunk.
        if current_chunk and current_length + sentence_length > chunk_size:
            chunks.append(" ".join(current_chunk))
            current_chunk = [sentence]
            current_length = sentence_length
        else:
            current_chunk.append(sentence)
            current_length += sentence_length

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

def semantic_search(query, file):
    if file is None:
        return "Please upload a .txt file."
    # gr.File provides a path to the uploaded file (or an object exposing .name
    # in older Gradio versions), so read it from disk rather than calling .read().
    file_path = file if isinstance(file, str) else file.name
    with open(file_path, 'r', encoding='utf-8') as f:
        document = f.read()
    chunks = split_into_chunks(document)
    scores = query_similarity(query, chunks)
    return format_output(scores, chunks)

# Define Gradio interface
iface = gr.Interface(
    fn=semantic_search,
    inputs=[
        gr.Textbox(lines=2, placeholder="Enter your query here..."),
        gr.File(file_types=['txt'], label="Upload a .txt file")
    ],
    outputs="text",
    title="Document Semantic Search",
    description="Input a query and upload a document (.txt) to find the most semantically similar paragraphs or sentences."
)

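# Start the Gradio server; on Hugging Face Spaces this is the app's entry point.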
iface.launch()