|
from setuptools import setup, find_packages |
|
import subprocess |
|
import sys |
|
|
|
# The PyPI long description is taken verbatim from the README.
with open("README.md", "r", encoding="utf-8") as readme_file:
    long_description = readme_file.read()

# Collect install requirements, skipping blank lines and comment lines.
with open("requirements.txt", "r", encoding="utf-8") as req_file:
    requirements = [
        raw.strip()
        for raw in req_file
        if raw.strip() and not raw.startswith("#")
    ]
|
|
|
def setup_spacy_model():
    """
    Download the small English spaCy model.

    Shells out to ``python -m spacy download`` with the current
    interpreter; raises CalledProcessError if the download fails.
    """
    command = [sys.executable, "-m", "spacy", "download", "en_core_web_sm"]
    subprocess.check_call(command)
|
|
|
def setup_models():
    """
    Pre-download the ML models and tokenizers this package relies on.

    Fetches (and thereby caches locally):
      - the Universal Sentence Encoder v4 from TF Hub,
      - the T5-based paraphraser tokenizer,
      - Marian tokenizers for the en->de, de->es, es->en translation
        models (presumably a back-translation pivot chain -- confirm
        against the augmenter code),
      - the GPT-2 fast tokenizer.

    Imports are kept function-local so merely importing this setup
    module does not pull in the heavy ML dependencies.
    """
    import tensorflow_hub as hub
    # NOTE: the original also imported sklearn's TfidfVectorizer here,
    # but it was never used -- removed as dead code.
    from transformers import (
        AutoTokenizer,
        GPT2TokenizerFast,
        MarianTokenizer
    )

    # Universal Sentence Encoder (cached by TF Hub on first load).
    _ = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')

    # T5-based paraphraser tokenizer.
    _ = AutoTokenizer.from_pretrained('humarin/chatgpt_paraphraser_on_T5_base')

    # Translation tokenizers: en -> de, de -> es, es -> en.
    source_lang, pivot_lang, target_lang = 'en', 'de', 'es'
    model_names = [
        f'Helsinki-NLP/opus-mt-{source_lang}-{pivot_lang}',
        f'Helsinki-NLP/opus-mt-{pivot_lang}-{target_lang}',
        f'Helsinki-NLP/opus-mt-{target_lang}-{source_lang}'
    ]
    for model_name in model_names:
        _ = MarianTokenizer.from_pretrained(model_name)

    # GPT-2 fast tokenizer.
    _ = GPT2TokenizerFast.from_pretrained('gpt2')
|
|
|
def setup_nltk():
    """
    Download required NLTK data packages.

    Best-effort: a failed download prints a warning instead of aborting
    the whole setup. Note that ``nltk.download`` signals most failures
    by returning False rather than raising, so the return value is
    checked in addition to catching exceptions -- the original code
    printed "Successfully downloaded" even when the download failed.
    """
    import nltk

    required_packages = [
        'wordnet',
        'averaged_perceptron_tagger_eng'
    ]

    for package in required_packages:
        try:
            print(f"Downloading {package}...")
            # nltk.download returns False on failure instead of raising.
            if nltk.download(package):
                print(f"Successfully downloaded {package}")
            else:
                print(f"Warning: Could not download {package}")
        except Exception as e:
            print(f"Warning: Could not download {package}: {str(e)}")
|
|
|
setup(
    name="text-data-augmenter",
    version="0.1.0",
    author="Joe Armani",
    author_email="[email protected]",
    description="A tool for generating high-quality dialogue variations",
    # The README was read into `long_description` above but never passed
    # to setup() -- wire it up so PyPI renders the project page.
    long_description=long_description,
    long_description_content_type="text/markdown",
    packages=find_packages(),
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Science/Research",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
        "Topic :: Text Processing :: Linguistic",
    ],
    python_requires=">=3.8",
    install_requires=requirements,
    entry_points={
        "console_scripts": [
            "dialogue-augment=dialogue_augmenter.main:main",
        ],
    },
    include_package_data=True,
    package_data={
        "dialogue_augmenter": ["data/*.json", "config/*.yaml"],
    },
)
|
|
|
if __name__ == '__main__':
    # NOTE: setup() above runs on plain import as well (standard for a
    # setup.py); the model/data downloads below run only when this file
    # is executed directly.
    setup_spacy_model()
    setup_models()
    setup_nltk()