# setup.py for text-data-augmenter
# Provenance: initial commit 3190e1e by JoeArmani
from setuptools import setup, find_packages
import subprocess
import sys
# The PyPI long description is taken verbatim from the project README.
with open("README.md", "r", encoding="utf-8") as readme_file:
    long_description = readme_file.read()

# Collect install requirements, skipping blank lines and comment lines.
# (Comment detection intentionally inspects the raw line, matching the
# original behavior: only lines whose first character is '#' are skipped.)
with open("requirements.txt", "r", encoding="utf-8") as req_file:
    requirements = []
    for raw_line in req_file:
        if raw_line.strip() and not raw_line.startswith("#"):
            requirements.append(raw_line.strip())
def setup_spacy_model():
    """
    Download spaCy model.

    Invokes ``python -m spacy download en_core_web_sm`` in the current
    interpreter; raises CalledProcessError if the download fails.
    """
    download_cmd = [sys.executable, "-m", "spacy", "download", "en_core_web_sm"]
    subprocess.check_call(download_cmd)
def setup_models():
    """
    Download other required models.

    Fetches the Universal Sentence Encoder, the T5-based paraphraser
    tokenizer, the Marian translation tokenizers for the en→de→es→en
    pivot chain, and the GPT-2 tokenizer, so they are cached locally.
    """
    # Heavy third-party imports are kept local so importing this module
    # does not pull them in.
    import tensorflow_hub as hub
    from sklearn.feature_extraction.text import TfidfVectorizer
    from transformers import (
        AutoTokenizer,
        GPT2TokenizerFast,
        MarianTokenizer
    )

    # Universal Sentence Encoder (cached by TF Hub on first load).
    hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')

    # T5-based paraphraser tokenizer.
    AutoTokenizer.from_pretrained('humarin/chatgpt_paraphraser_on_T5_base')

    # Marian translation tokenizers: source→pivot, pivot→target, target→source.
    source_lang, pivot_lang, target_lang = 'en', 'de', 'es'
    marian_model_names = [
        f'Helsinki-NLP/opus-mt-{source_lang}-{pivot_lang}',
        f'Helsinki-NLP/opus-mt-{pivot_lang}-{target_lang}',
        f'Helsinki-NLP/opus-mt-{target_lang}-{source_lang}'
    ]
    for marian_name in marian_model_names:
        MarianTokenizer.from_pretrained(marian_name)

    # GPT-2 tokenizer.
    GPT2TokenizerFast.from_pretrained('gpt2')
def setup_nltk():
    """
    Download required NLTK data.

    Best-effort: a failed download is reported as a warning instead of
    aborting, so the remaining packages are still attempted.
    """
    import nltk
    required_packages = [
        'wordnet',
        'averaged_perceptron_tagger_eng'
    ]
    for package in required_packages:
        try:
            print(f"Downloading {package}...")
            # nltk.download() signals failure by returning False rather than
            # raising, so the return value must be checked explicitly;
            # otherwise a failed download is reported as a success.
            if nltk.download(package):
                print(f"Successfully downloaded {package}")
            else:
                print(f"Warning: Could not download {package}")
        except Exception as e:
            print(f"Warning: Could not download {package}: {str(e)}")
# Package metadata and installation configuration.
# FIX: long_description is read from README.md at the top of this file but
# was never passed to setup(); wire it in (with its content type) so the
# PyPI project page renders the README.
setup(
    name="text-data-augmenter",
    version="0.1.0",
    author="Joe Armani",
    author_email="[email protected]",
    description="A tool for generating high-quality dialogue variations",
    long_description=long_description,
    long_description_content_type="text/markdown",
    packages=find_packages(),
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Science/Research",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
        "Topic :: Text Processing :: Linguistic",
    ],
    python_requires=">=3.8",
    install_requires=requirements,
    entry_points={
        "console_scripts": [
            "dialogue-augment=dialogue_augmenter.main:main",
        ],
    },
    include_package_data=True,
    package_data={
        "dialogue_augmenter": ["data/*.json", "config/*.yaml"],
    },
)
if __name__ == '__main__':
    # Fetch every external resource the augmenter needs, in the same
    # order as before: spaCy model, then other models, then NLTK data.
    for download_step in (setup_spacy_model, setup_models, setup_nltk):
        download_step()