Upload 4 files
Browse files- README.md +95 -0
- app.py +89 -0
- packages.txt +1 -0
- requirements.txt +11 -0
README.md
ADDED
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
---
|
3 |
+
title: Scripttt
|
4 |
+
sdk: gradio
|
5 |
+
app_file: app.py
|
6 |
+
colorFrom: blue
|
7 |
+
colorTo: green
|
8 |
+
license: mit
|
9 |
+
tags:
|
10 |
+
- transcription
|
11 |
+
- diarization
|
12 |
+
- whisper
|
13 |
+
- pyannote
|
14 |
+
- video
|
15 |
+
- short-form
|
16 |
+
- gradio
|
17 |
+
- content-creation
|
18 |
+
python_version: "3.10"
|
19 |
+
---
|
20 |
+
|
21 |
+
# Scripttt
|
22 |
+
|
23 |
+
Scripttt is a Python web application that enables content creators to repurpose long-form video content into concise, engaging scripts for short-form platforms such as Instagram Reels and YouTube Shorts. Built with Gradio, Scripttt combines state-of-the-art transcription, speaker diarization, and script generation to deliver production-ready outputs that reflect the tone and style of the original conversation.
|
24 |
+
|
25 |
+
## Features
|
26 |
+
|
27 |
+
- **Video File Uploads Only**
|
28 |
+
Accepts direct uploads of video files (`.mp4`, `.mkv`, and other common formats). Audio-only files and external links are not supported.
|
29 |
+
|
30 |
+
- **Accurate Transcription**
|
31 |
+
Utilizes OpenAI Whisper for high-quality speech-to-text conversion.
|
32 |
+
|
33 |
+
- **Speaker Diarization**
|
34 |
+
Employs Picovoice Falcon to automatically identify and label speakers within the transcript.
|
35 |
+
|
36 |
+
- **Speaker-Tagged Transcript**
|
37 |
+
Generates a clean, speaker-attributed transcript of the input video.
|
38 |
+
|
39 |
+
- **Short-Form Script Generation**
|
40 |
+
Produces a concise, human-like script optimized for viral, short-form video content.
|
41 |
+
|
42 |
+
- **Privacy by Design**
|
43 |
+
All processing occurs locally; no external URLs or remote media are accepted.
|
44 |
+
|
45 |
+
## Installation
|
46 |
+
|
47 |
+
1. **Clone the Repository**
|
48 |
+
```
|
49 |
+
git clone https://github.com/your-username/scripttt.git
|
50 |
+
cd scripttt
|
51 |
+
```
|
52 |
+
|
53 |
+
2. **Set Up a Virtual Environment (Recommended)**
|
54 |
+
```
|
55 |
+
python -m venv venv
|
56 |
+
source venv/bin/activate # On Windows: venv\Scripts\activate
|
57 |
+
```
|
58 |
+
|
59 |
+
3. **Install Dependencies**
|
60 |
+
```
|
61 |
+
pip install -r requirements.txt
|
62 |
+
```
|
63 |
+
|
64 |
+
4. **Configure Environment Variables**
|
65 |
+
- Create a `.env` file in the project root.
|
66 |
+
- Add your Hugging Face and Google API credentials as environment variables.
|
67 |
+
Example:
|
68 |
+
```
|
69 |
+
FALCON_ACCESS_KEY=your_picovoice_access_key
|
70 |
+
GOOGLE_API_KEY=your_google_api_key
|
71 |
+
```
|
72 |
+
|
73 |
+
## Usage
|
74 |
+
|
75 |
+
1. **Run the Application**
|
76 |
+
```
|
77 |
+
python app.py
|
78 |
+
```
|
79 |
+
|
80 |
+
2. **Access the Interface**
|
81 |
+
- Open the local URL provided by Gradio in your browser.
|
82 |
+
- Upload a supported video file and follow the on-screen instructions.
|
83 |
+
|
84 |
+
## Output
|
85 |
+
|
86 |
+
- **Speaker-Tagged Transcript:**
|
87 |
+
A clean, readable transcript with speaker labels.
|
88 |
+
|
89 |
+
- **Short-Form Script:**
|
90 |
+
A new, concise script based on the original video, ready for use in short-form content production.
|
91 |
+
|
92 |
+
## Limitations
|
93 |
+
|
94 |
+
- YouTube links, remote URLs, and audio-only files are **not supported**. Only direct video file uploads are accepted.
|
95 |
+
```
|
app.py
ADDED
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# app.py
#
# Gradio app that transcribes an uploaded video with OpenAI Whisper and
# labels speakers with Picovoice Falcon diarization. Runs fully on CPU.

import os
import subprocess
import tempfile

import gradio as gr
import pvfalcon
import whisper
from dotenv import load_dotenv

# ───────────────────────────────────────────
# 1. ENVIRONMENT
# ───────────────────────────────────────────
# Falcon needs a Picovoice access key; fail fast at import time rather than
# on the first request so misconfiguration is obvious.
load_dotenv()
FALCON_ACCESS_KEY = os.getenv("FALCON_ACCESS_KEY")
if not FALCON_ACCESS_KEY:
    raise RuntimeError(
        "Set FALCON_ACCESS_KEY in your environment or .env file "
        "(get one free at https://console.picovoice.ai)."
    )

# ───────────────────────────────────────────
# 2. MODELS
# ───────────────────────────────────────────
# Loaded once at startup and shared across requests.
whisper_model = whisper.load_model("base")  # "base" keeps CPU inference fast
falcon = pvfalcon.create(access_key=FALCON_ACCESS_KEY)
24 |
+
# βββββββββββββββββββββββββββββββββββββββββββ
|
25 |
+
# 3. CORE LOGIC
|
26 |
+
# βββββββββββββββββββββββββββββββββββββββββββ
|
27 |
+
def process_video(file, language="Auto"):
    """Transcribe an uploaded video and attach speaker labels.

    Args:
        file: Uploaded video — a filesystem path (str, what
            ``gr.File(type="filepath")`` delivers) or an object with a
            ``.name`` attribute (older Gradio tempfile wrappers).
        language: Whisper language name, or "Auto" for auto-detection.

    Returns:
        Tuple ``(speaker_transcript, paragraph_transcript)``: the
        speaker-tagged line-per-segment transcript and the plain
        paragraph transcript. On failure returns an error message and "".
    """
    if file is None:
        return "No file uploaded.", ""

    # BUG FIX: with type="filepath" Gradio passes a str, which has no
    # .name attribute; accept both forms.
    video_path = file if isinstance(file, str) else file.name

    # 3.1 Choose language for Whisper (None triggers auto-detection).
    lang_code = None if language == "Auto" else language.lower()

    # 3.2 Extract mono 16-kHz PCM WAV with ffmpeg — the format both
    # Whisper and Falcon accept.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as wav:
        wav_path = wav.name
    try:
        result = subprocess.run(
            ["ffmpeg", "-y", "-i", video_path,
             "-ar", "16000", "-ac", "1", "-acodec", "pcm_s16le", wav_path],
            stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
        )
        # Check ffmpeg's exit status as well as the output size: a failed
        # run can still leave the zero-byte file tempfile created.
        if result.returncode != 0 or not os.path.getsize(wav_path):
            return "Audio extraction failed.", ""

        # 3.3 Speaker diarization: map Falcon's numeric speaker tags to
        # human-readable "Speaker N" labels in order of first appearance.
        segments = falcon.process_file(wav_path)  # list[pvfalcon.Segment]
        diarized_map, label_map, counter = [], {}, 1
        for seg in segments:
            tag = seg.speaker_tag
            if tag not in label_map:
                label_map[tag] = f"Speaker {counter}"
                counter += 1
            diarized_map.append(
                dict(start=seg.start_sec, end=seg.end_sec, speaker=label_map[tag])
            )

        # 3.4 Transcription (Whisper).
        res = whisper_model.transcribe(wav_path, language=lang_code)
        paragraph_transcript = res["text"]  # plain paragraph

        # 3.5 Merge: assign each Whisper segment the speaker whose diarized
        # window contains the segment's start time ("Unknown" if none).
        speaker_lines = []
        for s in res.get("segments", []):
            speaker = next(
                (m["speaker"] for m in diarized_map
                 if m["start"] <= s["start"] <= m["end"]),
                "Unknown"
            )
            speaker_lines.append(f"{speaker}: {s['text']}")
        speaker_transcript = "\n".join(speaker_lines)

        # 3.6 Return in desired order.
        return speaker_transcript, paragraph_transcript
    finally:
        # delete=False above means we must remove the temp WAV ourselves,
        # otherwise every request leaks a file.
        try:
            os.remove(wav_path)
        except OSError:
            pass
|
70 |
+
|
71 |
+
# βββββββββββββββββββββββββββββββββββββββββββ
|
72 |
+
# 4. GRADIO UI
|
73 |
+
# βββββββββββββββββββββββββββββββββββββββββββ
|
74 |
+
# ───────────────────────────────────────────
# 4. GRADIO UI
# ───────────────────────────────────────────
demo = gr.Interface(
    fn=process_video,
    inputs=[
        # type="filepath" hands process_video a plain str path.
        gr.File(label="Upload Video", type="filepath"),
        gr.Dropdown(["Auto", "English", "Hindi", "Urdu"], label="Language"),
    ],
    outputs=[
        gr.Textbox(label="Speaker-wise Transcript", show_copy_button=True),
        # Fixed stray leading space in the label (" Transcription").
        gr.Textbox(label="Transcription", show_copy_button=True),
    ],
    title="Transcription + Speaker Segmentation",
    description="Whisper + Picovoice Falcon running fully on CPU.",
)

if __name__ == "__main__":
    demo.launch()
|
packages.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
ffmpeg
|
requirements.txt
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
python-dotenv
|
2 |
+
requests
|
3 |
+
openai
|
4 |
+
pandas
|
5 |
+
git+https://github.com/openai/whisper.git
|
6 |
+
ffmpeg-python
|
7 |
+
yt-dlp
|
8 |
+
torch
|
9 |
+
torchaudio
|
10 |
+
gradio
|
11 |
+
pvfalcon # Picovoice Falcon
|