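"""FastAPI service that serves Qwen/Qwen2.5-0.5B-Instruct behind a /ask
endpoint, streaming replies generated with a fixed sales-assistant system
prompt for "Infinity AI"."""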
import os
import asyncio

# Set Hugging Face cache locations before importing transformers, since the
# library reads these variables at import time (TRANSFORMERS_CACHE is the
# legacy name, kept for compatibility with older versions)
cache_dir = "/tmp/hf_home"
os.environ["HF_HOME"] = cache_dir
os.environ["TRANSFORMERS_CACHE"] = cache_dir
os.environ["HUGGINGFACE_HUB_CACHE"] = cache_dir

# Create the cache directory with permissive access, e.g. for containerized
# hosts where the default cache path is not writable
os.makedirs(cache_dir, exist_ok=True)
os.chmod(cache_dir, 0o777)

from fastapi import FastAPI
from pydantic import BaseModel
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load model and tokenizer
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    cache_dir=cache_dir
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    cache_dir=cache_dir,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
)

# Move the model to the available device and switch to eval mode for inference
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Initialize FastAPI
app = FastAPI()

# Enable CORS
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
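# Note: allow_origins=["*"] combined with allow_credentials=True leaves the
# API open to any site; for production, restricting allow_origins to the
# site's own domain is the safer choice.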

# Input model
class Question(BaseModel):
    question: str

# System prompt
SYSTEM_PROMPT = '''You are the helpful, professional, and highly persuasive sales assistant of Infinity AI, a premium web development and AI service website.

Your tone is friendly, respectful, and high-end, making users feel valued. Always treat every visitor as a premium client.

The website offers:

Custom-built 2D and 3D websites based on client needs (pricing: $200 to $600, depending on features and demand).

An AI chatbot for a one-time payment of $119, with free and unlimited use afterward, fully customizable for the user's website.

Your primary goals are:

Drive sales of the website services and chatbots.

Clearly explain the benefits and pricing.

Show extra respect and premium care to users.

Encourage users to take action (book, buy, or ask more).

Behavior Guidelines:

Greet users warmly and thank them for visiting.

Highlight how custom and premium your service is.

Offer to help based on their ideas and needs.

Gently upsell where appropriate, especially emphasizing the one-time AI chatbot offer.

Always respond in a concise, friendly, and confident tone.

Use language that shows appreciation, such as: “We truly value your vision”, “Let’s bring your dream project to life”, or “As a premium client, you deserve the best”.

Important Details to Mention When Needed:

Custom 2D/3D websites starting from $200 to $600 depending on requirements.

Lifetime AI chatbot for $119 – no monthly fees, unlimited use.

Fast development, full support, and high-end quality.

Never say: “I don’t know,” “That’s not possible,” or “Sorry.”
Always say: “I’ll help you with that,” “Here’s what we can do,” or “That’s a great idea!”'''

async def generate_response_chunks(prompt: str):
    # Build the chat messages with the system prompt
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": prompt}
    ]

    # Apply the model's chat template
    qwen_prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # Tokenize, then run the blocking generate() call in a worker thread so
    # it does not stall the event loop while other requests are waiting
    inputs = tokenizer(qwen_prompt, return_tensors="pt").to(device)
    outputs = await asyncio.to_thread(
        model.generate,
        **inputs,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode only the newly generated tokens (everything past the prompt);
    # slicing by token count is more reliable than slicing the decoded
    # string by character length
    new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    # Stream the response word by word
    for word in response.split():
        yield word + " "
        await asyncio.sleep(0.05)
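# Note: the full response is generated before streaming begins, so the
# word-by-word loop above only simulates streaming. For true incremental
# output, one option is transformers' TextIteratorStreamer driven by a
# generate() call in a background thread.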

@app.post("/ask")
async def ask(question: Question):
    return StreamingResponse(
        generate_response_chunks(question.question),
        media_type="text/plain"
    )
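
# A minimal way to try the service (the module name "app" and port 7860 are
# assumptions; adjust to your filename and deployment):
#   uvicorn app:app --host 0.0.0.0 --port 7860
#   curl -N -X POST http://localhost:7860/ask \
#        -H "Content-Type: application/json" \
#        -d '{"question": "How much does a custom 3D website cost?"}'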