JoeArmani
summarization, reranker, environment setup, and response quality checker
f7b283c
from setuptools import setup, find_packages
import subprocess
import sys
import platform
# Long description for the package index, taken verbatim from the README.
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

# Runtime dependencies: one requirement per non-blank, non-comment line.
# Each line is stripped *before* the comment test so that an indented
# "# ..." comment is skipped instead of ending up in install_requires.
with open("requirements.txt", "r", encoding="utf-8") as fh:
    requirements = [
        stripped
        for stripped in (raw_line.strip() for raw_line in fh)
        if stripped and not stripped.startswith("#")
    ]
def setup_spacy_models(models=None):
    """
    Download the specified spaCy models.

    Each model is fetched by running ``python -m spacy download <model>`` with
    the current interpreter. A failure for one model is reported on stdout and
    the remaining models are still attempted.

    Args:
        models (Iterable[str] | None): Names of the spaCy models to download.
            When None, ``en_core_web_sm`` and ``en_core_web_md`` are used.
    """
    # A None sentinel avoids sharing one mutable default list across calls.
    if models is None:
        models = ('en_core_web_sm', 'en_core_web_md')

    for model in models:
        print(f"Downloading spaCy model: {model}")
        try:
            subprocess.check_call([sys.executable, "-m", "spacy", "download", model])
        except subprocess.CalledProcessError as e:
            # Report and carry on so one bad model does not skip the others.
            print(f"Error downloading spaCy model: {model}")
            print(e)
        else:
            print(f"Successfully downloaded spaCy model: {model}")
def setup_gpu_dependencies():
    """
    Install the FAISS build that matches the detected hardware.

    CUDA is probed through ``torch``; if torch cannot be imported, CUDA is
    treated as unavailable. With CUDA, ``faiss-gpu`` is tried first and
    ``faiss-cpu`` is installed if that attempt fails. Without CUDA,
    ``faiss-cpu`` is installed directly.

    Raises:
        subprocess.CalledProcessError: If the ``faiss-cpu`` install fails
            (either as the direct install or as the fallback).
    """
    pip_install = [sys.executable, "-m", "pip", "install"]

    # Probe for CUDA; a missing torch simply means "no GPU support".
    try:
        import torch
        has_cuda = torch.cuda.is_available()
    except ImportError:
        has_cuda = False

    # CPU-only machines get faiss-cpu straight away.
    if not has_cuda:
        subprocess.check_call(pip_install + ["faiss-cpu>=1.7.0"])
        return

    try:
        subprocess.check_call(pip_install + ["faiss-gpu>=1.7.0"])
    except subprocess.CalledProcessError:
        print("Failed to install faiss-gpu. Falling back to faiss-cpu")
        # Not guarded: a failure here propagates to the caller.
        subprocess.check_call(pip_install + ["faiss-cpu>=1.7.0"])
    else:
        print("Successfully installed faiss-gpu")
def setup_models():
    """
    Load the other pretrained assets the project needs.

    Every asset is loaded once and the returned object is discarded; the
    function returns nothing. The imports are kept local so the heavy
    third-party packages are only required when this step actually runs.
    """
    import tensorflow_hub as hub
    # NOTE(review): TfidfVectorizer and AutoModel are imported but never used
    # below; they are kept so a missing package still fails at this point.
    from sklearn.feature_extraction.text import TfidfVectorizer
    from transformers import (
        AutoTokenizer,
        AutoModel,
        GPT2TokenizerFast,
        MarianTokenizer,
        DistilBertTokenizer,
        DistilBertModel
    )

    # DistilBERT: tokenizer first, then the model weights.
    distilbert_name = 'distilbert-base-uncased'
    DistilBertTokenizer.from_pretrained(distilbert_name)
    DistilBertModel.from_pretrained(distilbert_name)

    # Universal Sentence Encoder from TF Hub.
    hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')

    # Paraphraser.
    # NOTE(review): only the tokenizer is loaded here, not the model itself.
    AutoTokenizer.from_pretrained('humarin/chatgpt_paraphraser_on_T5_base')

    # Translation chain en -> de -> es -> en: pair each language with the
    # next one, wrapping around at the end (tokenizers only).
    languages = ('en', 'de', 'es')
    for src, dst in zip(languages, languages[1:] + languages[:1]):
        MarianTokenizer.from_pretrained(f'Helsinki-NLP/opus-mt-{src}-{dst}')

    # GPT-2 tokenizer.
    GPT2TokenizerFast.from_pretrained('gpt2')
def setup_nltk():
    """
    Download required NLTK data.

    Fetches the ``wordnet`` and ``averaged_perceptron_tagger_eng`` packages.
    A failed download is reported as a warning on stdout and does not stop
    the remaining packages from being attempted.
    """
    import nltk

    required_packages = (
        'wordnet',
        'averaged_perceptron_tagger_eng',
    )

    for package in required_packages:
        print(f"Downloading {package}...")
        try:
            downloaded = nltk.download(package)
        except Exception as e:
            print(f"Warning: Could not download {package}: {str(e)}")
            continue

        # nltk.download() signals most failures (e.g. network errors) by
        # returning False rather than raising, so the result must be checked
        # before claiming success.
        if downloaded:
            print(f"Successfully downloaded {package}")
        else:
            print(f"Warning: Could not download {package}")
def setup_faiss():
    """
    Install the faiss library.

    On Linux with CUDA available (as reported by ``torch``), ``faiss-gpu`` is
    attempted first. In every other case, or when that attempt fails,
    ``faiss-cpu`` is installed. Install failures are printed, not raised.
    """
    def check_cuda():
        """Return True only if torch imports and reports CUDA as available."""
        try:
            import torch
            return torch.cuda.is_available()
        except Exception:
            # A missing or broken torch means "no CUDA". Unlike a bare
            # ``except:``, this lets KeyboardInterrupt/SystemExit propagate.
            return False

    if platform.system() == "Linux" and check_cuda():
        # Attempt to install faiss-gpu
        try:
            print("Attempting to install faiss-gpu...")
            subprocess.check_call([sys.executable, "-m", "pip", "install", "faiss-gpu>=1.7.0"])
            print("Successfully installed faiss-gpu")
            return
        except subprocess.CalledProcessError:
            print("Failed to install faiss-gpu. Falling back to faiss-cpu.")

    # Install faiss-cpu as the default
    try:
        print("Installing faiss-cpu...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", "faiss-cpu>=1.7.0"])
        print("Successfully installed faiss-cpu")
    except subprocess.CalledProcessError as e:
        print("Error installing faiss-cpu")
        print(e)
# Trove classifiers: maturity, audience, license, supported Pythons, topics.
_CLASSIFIERS = [
    "Development Status :: 3 - Alpha",
    "Intended Audience :: Science/Research",
    "License :: OSI Approved :: MIT License",
    "Operating System :: OS Independent",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.8",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
    "Topic :: Text Processing :: Linguistic",
]

# Optional dependency groups, selected with e.g. `pip install .[dev]`.
_EXTRAS = {
    'dev': [
        'pytest>=7.0.0',
        'black>=22.0.0',
        'isort>=5.10.0',
        'mypy>=1.0.0',
    ],
    'gpu': [
        'faiss-gpu>=1.7.0',
    ],
}

# Command-line entry points exposed by the two packages.
_CONSOLE_SCRIPTS = [
    "dialogue-augment=dialogue_augmenter.main:main",
    "run-chatbot=chatbot.main:main",
]

# Non-Python files shipped inside each package.
_PACKAGE_DATA = {
    "chatbot": ["config/*.yaml"],
    "dialogue_augmenter": ["data/*.json", "config/*.yaml"],
}

# Package definition; long_description and requirements are read from
# README.md and requirements.txt earlier in this file.
setup(
    name="retrieval-chatbot",
    version="0.2.0",
    author="Joe Armani",
    # NOTE(review): this value looks like an obfuscated placeholder rather
    # than a real address -- confirm before publishing.
    author_email="[email protected]",
    description="A retrieval-based chatbot with enhanced validation",
    long_description=long_description,
    long_description_content_type="text/markdown",
    packages=find_packages(),
    classifiers=_CLASSIFIERS,
    python_requires=">=3.8",
    install_requires=requirements,
    extras_require=_EXTRAS,
    entry_points={"console_scripts": _CONSOLE_SCRIPTS},
    include_package_data=True,
    package_data=_PACKAGE_DATA,
)
if __name__ == '__main__':
    # Provisioning steps, run in this fixed order only when the file is
    # executed directly.
    # NOTE(review): setup_gpu_dependencies and setup_faiss both install a
    # FAISS build -- confirm that running both is intended.
    for provision_step in (
        setup_spacy_models,
        setup_gpu_dependencies,
        setup_models,
        setup_nltk,
        setup_faiss,
    ):
        provision_step()