from setuptools import setup, find_packages
import subprocess
import sys

with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

with open("requirements.txt", "r", encoding="utf-8") as fh:
    requirements = [line.strip() for line in fh if line.strip() and not line.startswith("#")]

def setup_spacy_model():
    """
    Download spaCy model.
    """
    subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])

def setup_models():
    """
    Download other required models.
    """
    import tensorflow_hub as hub
    from transformers import (
        AutoTokenizer,
        GPT2TokenizerFast,
        MarianTokenizer
    )
    
    # Download Universal Sentence Encoder
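    # (tensorflow_hub caches the downloaded module locally; TFHUB_CACHE_DIR controls the location)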
    _ = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')
    
    # Download the paraphraser tokenizer
    _ = AutoTokenizer.from_pretrained('humarin/chatgpt_paraphraser_on_T5_base')
    
    # Download the translation model tokenizers
    source_lang, pivot_lang, target_lang = 'en', 'de', 'es'
    model_names = [
        f'Helsinki-NLP/opus-mt-{source_lang}-{pivot_lang}',
        f'Helsinki-NLP/opus-mt-{pivot_lang}-{target_lang}',
        f'Helsinki-NLP/opus-mt-{target_lang}-{source_lang}'
    ]
    for model_name in model_names:
        _ = MarianTokenizer.from_pretrained(model_name)
    
    # Download the GPT-2 tokenizer
    _ = GPT2TokenizerFast.from_pretrained('gpt2')

def setup_nltk():
    """
    Download required NLTK data.
    """
    import nltk
    required_packages = [
        'wordnet',
        'averaged_perceptron_tagger_eng'
    ]
    
    for package in required_packages:
        try:
            print(f"Downloading {package}...")
            nltk.download(package)
            print(f"Successfully downloaded {package}")
        except Exception as e:
            print(f"Warning: Could not download {package}: {str(e)}")

setup(
    name="text-data-augmenter",
    version="0.1.0",
    author="Joe Armani",
    author_email="[email protected]",
    description="A tool for generating high-quality dialogue variations",
    long_description=long_description,
    long_description_content_type="text/markdown",
    packages=find_packages(),
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Science/Research",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
        "Topic :: Text Processing :: Linguistic",
    ],
    python_requires=">=3.8",
    install_requires=requirements,
    entry_points={
        "console_scripts": [
            "dialogue-augment=dialogue_augmenter.main:main",
        ],
    },
    include_package_data=True,
    package_data={
        "dialogue_augmenter": ["data/*.json", "config/*.yaml"],
    },
)

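# Model and data downloads run when setup.py is executed directly,
# after the package metadata above has been processed.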
if __name__ == '__main__':
    setup_spacy_model()
    setup_models()
    setup_nltk()