import re

import numpy as np
import openai
import pandas as pd
import streamlit as st
import streamlit_scrollable_textbox as stx
import torch
from gradio_client import Client
from InstructorEmbedding import INSTRUCTOR
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from rank_bm25 import BM25Plus
from transformers import AutoModelForMaskedLM, AutoTokenizer

def get_data():
    data = pd.read_csv("earnings_calls_cleaned_metadata_keywords_indices.csv")
    return data

# Preprocessing for BM25
def tokenizer(string, reg=r"[a-zA-Z'-]+|\d+\.\d+%|\d+%"):
    # Keep words, decimal percentages (e.g. "4.5%") and whole percentages (e.g. "7%")
    string = string.replace("-", " ")
    return " ".join(re.findall(reg, string))

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stop words
    stop_words = set(stopwords.words("english"))
    tokens = [token for token in tokens if token not in stop_words]
    # Stem the tokens
    porter_stemmer = PorterStemmer()
    tokens = [porter_stemmer.stem(token) for token in tokens]
    # Join the tokens back into a single string
    preprocessed_text = " ".join(tokens)
    preprocessed_text = tokenizer(preprocessed_text)
    return preprocessed_text

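# Illustrative only (not executed on import): the input sentence is a made-up
# example, and running it requires the NLTK "punkt" and "stopwords" corpora
# (nltk.download("punkt"); nltk.download("stopwords")).
#
#   preprocess_text("Gross margins expanded compared to last year.")
#   # -> roughly "gross margin expand compar last year"
#   # (lowercased, stop words removed, Porter-stemmed, re-tokenized)
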
# Initialize models from HuggingFace
def get_splade_sparse_embedding_model():
    model_id = "naver/splade-cocondenser-ensembledistil"
    # Move to GPU if available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model_sparse = AutoModelForMaskedLM.from_pretrained(model_id)
    model_sparse.to(device)
    return model_sparse, tokenizer

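# A minimal sketch, not part of the original app code, of how the model and
# tokenizer returned above are typically turned into a SPLADE sparse lexical
# vector: log(1 + relu(logits)), max-pooled over the sequence, following the
# SPLADE formulation. The helper name is hypothetical.
def splade_sparse_vector(text, model_sparse, tokenizer):
    device = next(model_sparse.parameters()).device
    tokens = tokenizer(text, return_tensors="pt", truncation=True).to(device)
    with torch.no_grad():
        logits = model_sparse(**tokens).logits  # (1, seq_len, vocab_size)
    # Mask padding positions so they cannot win the max-pooling
    weights = torch.log1p(torch.relu(logits)) * tokens.attention_mask.unsqueeze(-1)
    return torch.max(weights, dim=1).values.squeeze(0)  # (vocab_size,)
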
def get_instructor_embedding_model():
    model = INSTRUCTOR("hkunlp/instructor-xl")
    return model

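# Illustrative only: INSTRUCTOR models embed (instruction, text) pairs, so
# each query or passage is encoded together with a task instruction. The
# instruction wording below is an assumption, not taken from this file.
#
#   instructor = get_instructor_embedding_model()
#   embedding = instructor.encode(
#       [["Represent the financial statement for retrieval:", "Revenue grew in Q3."]]
#   )  # -> numpy array of shape (1, embedding_dim)
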
def get_instructor_embedding_model_api():
    client = Client("https://awinml-api-instructor-xl-2.hf.space/")
    return client


def get_alpaca_model():
    client = Client("https://awinml-alpaca-cpp.hf.space")
    return client


def get_vicuna_ner_1_model():
    client = Client("https://awinml-api-vicuna-openblas-ner-1.hf.space/")
    return client


def get_vicuna_ner_2_model():
    client = Client("https://awinml-api-vicuna-openblas-ner-2.hf.space/")
    return client


def get_vicuna_text_gen_model():
    client = Client("https://awinml-api-vicuna-openblas-4.hf.space/")
    return client

def get_bm25_model(data):
    corpus = data.Text.tolist()
    corpus_clean = [preprocess_text(x) for x in corpus]
    tokenized_corpus = [doc.split(" ") for doc in corpus_clean]
    bm25 = BM25Plus(tokenized_corpus)
    return corpus, bm25

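# Illustrative only: querying the BM25Plus index built above. The query must
# go through the same preprocessing as the corpus before scoring; get_top_n
# is part of the rank_bm25 API. The query string is a made-up example.
#
#   corpus, bm25 = get_bm25_model(get_data())
#   tokenized_query = preprocess_text("guidance on operating margins").split(" ")
#   top_docs = bm25.get_top_n(tokenized_query, corpus, n=5)
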
# Passthrough helper for the user-supplied OpenAI API key
def save_key(api_key):
    return api_key

# Text Generation
def vicuna_text_generate(prompt, model):
    generated_text = model.predict(prompt, api_name="/predict")
    return generated_text

def gpt_turbo_model(prompt):
    # Uses the legacy (pre-1.0) openai SDK; near-zero temperature keeps the
    # answers close to deterministic
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "user", "content": prompt},
        ],
        temperature=0.01,
        max_tokens=1024,
    )
    return response["choices"][0]["message"]["content"]
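

# Illustrative only: wiring the pieces together. The prompt is a made-up
# example; the legacy openai<1.0 SDK reads the key from the module-level
# attribute set below.
#
#   openai.api_key = save_key("sk-...")
#   answer = gpt_turbo_model("Summarize the key risks mentioned in the call.")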