Spaces:
Sleeping
Sleeping
| """ | |
| Author: Khanh Phan | |
| Date: 2024-12-04 | |
| """ | |
| import os | |
| import nltk | |
| import openai | |
| import torch | |
| from dotenv import load_dotenv | |
| from sentence_transformers import SentenceTransformer | |
# Load environment variables (including any defined in a local .env file).
load_dotenv()

# Each value is None when the corresponding variable is not set.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
SEARCH_ENGINE_ID = os.environ.get("SEARCH_ENGINE_ID")
AZURE_OPENAI_API_KEY = os.environ.get("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_API_VERSION = os.environ.get("AZURE_OPENAI_API_VERSION")
# GPT model names, one setting per task.
GPT_ENTITY_MODEL = "o1-mini"  # either "gpt-4o-mini" or "o1-mini"
GPT_PARAPHRASE_MODELS = ["gpt-4o", "gpt-4o-mini"]
GPT_IMAGE_MODEL = "dall-e-3"

# Module-level Azure OpenAI client built from the AZURE_OPENAI_* settings.
AZUREOPENAI_CLIENT = openai.AzureOpenAI(
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_key=AZURE_OPENAI_API_KEY,
    api_version=AZURE_OPENAI_API_VERSION,
)
# Download the NLTK resources; quiet=True is passed to every download call.
for _nltk_resource in (
    "punkt",  # sentence tokenization
    "punkt_tab",  # tokenization with tab-separated data
    "stopwords",  # a list of stop words
):
    nltk.download(_nltk_resource, quiet=True)
del _nltk_resource  # keep the loop variable out of the module namespace

STOPWORDS_LANG = "english"
# Load PARAPHRASE_MODEL and move it to the GPU when CUDA is available,
# otherwise to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
PARAPHRASE_MODEL = SentenceTransformer("paraphrase-MiniLM-L6-v2")
PARAPHRASE_MODEL.to(DEVICE)
# Model to detect AI-generated text
AI_TEXT_DETECTION_MODEL = "TrustSafeAI/RADAR-Vicuna-7B"
# Misspelled original name, kept as an alias so existing imports keep working.
AI_TEXT_DECTECTION_MODEL = AI_TEXT_DETECTION_MODEL
# Model to classify AI-generated text
AI_TEXT_CLASSIFICATION_MODEL = "ductuan024/gpts-detector"
# Thresholds
# NOTE(review): the PARAPHRASE_THRESHOLD* values look like similarity-score
# cut-offs and the MIN_*/MAX_* values like length/ratio limits; their exact
# meaning is defined by the callers — confirm there.
PARAPHRASE_THRESHOLD_HUMAN: float = 0.963
PARAPHRASE_THRESHOLD_MACHINE: float = 0.8
PARAPHRASE_THRESHOLD: float = 0.8
MIN_SAME_SENTENCE_LEN: int = 6
MIN_PHRASE_SENTENCE_LEN: int = 10
MIN_RATIO_PARAPHRASE_NUM: float = 0.5
MAX_CHAR_SIZE: int = 30_000
# Number of top URLs per search
TOP_URLS_PER_SEARCH = 3
MAX_URL_SIZE = 2 * 1024 * 1024  # ~2 MB

# Search parameters
GOOGLE_ENDPOINT_URL = "https://www.googleapis.com/customsearch/v1"
TOP_SEARCH_RESULTS = 10
# Misspelled original name, kept as an alias so existing imports keep working.
TOP_SEARCH_RESUTLS = TOP_SEARCH_RESULTS
CHUNK_SIZE = 32  # words
NUM_CHUNKS = 3  # number of chunks to search
NUM_FREQUENT_WORDS = 32  # number of top words to return
NUM_KEYWORDS = 5  # number of keywords to return
# Labels
HUMAN, MACHINE, UNKNOWN = "HUMAN", "MACHINE", "UNKNOWN"
PARAPHRASE, NON_PARAPHRASE = "PARAPHRASE", "NON_PARAPHRASE"
# Maps a model name to a label string; presumably the label that model emits
# for human-written text — confirm against the code that reads this mapping.
MODEL_HUMAN_LABEL = {AI_TEXT_DECTECTION_MODEL: "Human"}
# Entity color
# Meaning of the lighten/darken factor:
#   factor > 1: lightens the color.
#   factor = 1: leaves the color unchanged.
#   factor < 1: darkens the color.
#   factor = 0: black.
ENTITY_LIGHTEN_COLOR = 2.2
ENTITY_DARKEN_COLOR = 0.7
ENTITY_SATURATION = 0.65  # Saturation: color's intensity (vividness).
ENTITY_BRIGHTNESS = 0.75  # Color's brightness.
# HTML formatting: inline CSS declaration for word breaking.
WORD_BREAK: str = "word-break: break-all;"

# Prefix for output MACHINE label of text
PREFIX: str = "Partially generated by "