1
0

Refactored code, added Dockerfile, replaced bash scripts with python alternatives, added README with instructions on running a pipeline

This commit is contained in:
2026-04-01 16:56:06 +02:00
parent ca116562fe
commit 686a458905
19 changed files with 1103 additions and 65 deletions

View File

@@ -42,6 +42,7 @@ Thumbs.db
# Local env and logs # Local env and logs
.env .env
.env.* .env.*
!.env.example
*.log *.log
*.pid *.pid
@@ -50,3 +51,13 @@ Thumbs.db
*.mov *.mov
*.avi *.avi
*.mkv *.mkv
# Project generated data and checkpoints
images/
audios/
videos/
merged/
results/
outputs/
ckpts/
HunyuanVideo-1.5/ckpts/

1
.env
View File

@@ -1 +0,0 @@
ELEVENLABS_API_KEY=[REDACTED — a live secret was committed here; deleting the file does not purge it from git history, so this key must be rotated]

19
.env.example Normal file
View File

@@ -0,0 +1,19 @@
# ElevenLabs
ELEVENLABS_API_KEY=
# Hugging Face (required for gated model downloads, e.g. FLUX.1-schnell)
HUGGINGFACE_HUB_TOKEN=
# Hunyuan prompt rewrite endpoints (optional; rewrite is disabled in current generate_videos.py)
T2V_REWRITE_BASE_URL=
T2V_REWRITE_MODEL_NAME=
I2V_REWRITE_BASE_URL=
I2V_REWRITE_MODEL_NAME=
# AWS / S3 (used when initializing S3VideoStorage)
AWS_ACCESS_KEY_ID=
AWS_SECRET_ACCESS_KEY=
AWS_SESSION_TOKEN=
AWS_REGION=
AWS_S3_BUCKET=
AWS_S3_ENDPOINT_URL=

1
.gitignore vendored
View File

@@ -58,6 +58,7 @@ Thumbs.db
# Local environment variables # Local environment variables
.env .env
.env.* .env.*
!.env.example
# Project-specific artifacts # Project-specific artifacts
*.mp4 *.mp4

66
Dockerfile Normal file
View File

@@ -0,0 +1,66 @@
# syntax=docker/dockerfile:1
FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04

# NOTE(review): the image runs as root; acceptable for a disposable GPU
# pipeline container, but add a non-root USER before any production use.
ENV PYTHONUNBUFFERED=1 \
    PIP_NO_CACHE_DIR=1 \
    PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True,max_split_size_mb:128

# Base OS tools + media stack + Python toolchain.
# DEBIAN_FRONTEND is scoped to this RUN only so it does not leak into the
# runtime environment of the final image (Docker best practice).
RUN DEBIAN_FRONTEND=noninteractive apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    ca-certificates \
    curl \
    ffmpeg \
    git \
    git-lfs \
    libgl1 \
    libglib2.0-0 \
    ninja-build \
    pkg-config \
    python3.10 \
    python3.10-dev \
    python3.10-venv \
    python3-pip \
    && rm -rf /var/lib/apt/lists/* \
    && ln -sf /usr/bin/python3.10 /usr/bin/python \
    && ln -sf /usr/bin/pip3 /usr/bin/pip \
    && git lfs install

WORKDIR /app

# Install project Python dependencies first for better layer caching.
# cu121 wheels are minor-version compatible with the CUDA 12.4 runtime.
COPY requirements.txt /app/requirements.txt
RUN python -m pip install --upgrade pip setuptools wheel \
    && pip install --index-url https://download.pytorch.org/whl/cu121 torch torchvision torchaudio \
    && pip install -r /app/requirements.txt \
    && pip install -U accelerate safetensors

# Copy project code.
COPY . /app

# Ensure HunyuanVideo source exists in the image; clone upstream if the
# build context did not ship it.
ARG HUNYUAN_REPO=https://github.com/Tencent-Hunyuan/HunyuanVideo-1.5.git
RUN if [ ! -f /app/HunyuanVideo-1.5/requirements.txt ]; then \
        rm -rf /app/HunyuanVideo-1.5 && \
        git clone --depth 1 "$HUNYUAN_REPO" /app/HunyuanVideo-1.5; \
    fi

# Install HunyuanVideo dependencies from upstream README guidance.
RUN pip install -r /app/HunyuanVideo-1.5/requirements.txt \
    && pip install --upgrade tencentcloud-sdk-python \
    && pip install sgl-kernel==0.3.18

# Optional attention backends from Hunyuan docs.
# Build with: --build-arg INSTALL_OPTIONAL_ATTENTION=1
ARG INSTALL_OPTIONAL_ATTENTION=0
RUN if [ "$INSTALL_OPTIONAL_ATTENTION" = "1" ]; then \
        pip install flash-attn --no-build-isolation && \
        git clone --depth 1 https://github.com/Tencent-Hunyuan/flex-block-attn.git /tmp/flex-block-attn && \
        cd /tmp/flex-block-attn && git submodule update --init --recursive && python setup.py install && \
        git clone --depth 1 https://github.com/cooper1637/SageAttention.git /tmp/SageAttention && \
        cd /tmp/SageAttention && python setup.py install; \
    fi

# Default pipeline entrypoint (exec form so the process receives SIGTERM).
CMD ["python", "run_video_pipeline.py"]

202
README.md Normal file
View File

@@ -0,0 +1,202 @@
# ContentGeneration Pipeline
This project runs a 3-step video pipeline:
1. Generate shot videos from images + prompts.
2. Merge each generated video with its audio.
3. Concatenate merged clips into one final output.
The pipeline entrypoint is `run_video_pipeline.py`.
## Quick Start
Local Python:
```bash
cp .env.example .env
python3 -m venv .venv && source .venv/bin/activate
pip install -r requirements.txt
python run_video_pipeline.py
```
Docker (GPU):
```bash
cp .env.example .env
docker build -t content-generation:latest .
docker run --rm --gpus all --env-file .env -v "$(pwd)":/app -w /app content-generation:latest
```
First run (skip S3 upload):
```bash
python run_video_pipeline.py --skip-s3-upload
```
Docker first run (skip S3 upload):
```bash
docker run --rm --gpus all --env-file .env -v "$(pwd)":/app -w /app content-generation:latest \
python run_video_pipeline.py --skip-s3-upload
```
## Project Layout
- `run_video_pipeline.py`: main entrypoint.
- `src/scripts/`: helper scripts used by the pipeline.
- `HunyuanVideo-1.5/`: Hunyuan inference code and model dependencies.
- `reel_script.json`: required script input with `shots`.
- `images/`, `audios/`, `videos/`, `merged/`, `results/`: working/output folders.
- `.env.example`: environment variable template.
## Prerequisites
1. Linux with NVIDIA GPU and CUDA runtime.
2. `ffmpeg` and `ffprobe` available on PATH.
3. Python 3.10+.
4. Hunyuan model checkpoints under `HunyuanVideo-1.5/ckpts`.
5. If using FLUX local download, access approved for `black-forest-labs/FLUX.1-schnell`.
## Environment Variables
1. Create local env file:
```bash
cp .env.example .env
```
2. Fill required variables in `.env`:
- `ELEVENLABS_API_KEY` for audio generation.
- `HUGGINGFACE_HUB_TOKEN` if gated Hugging Face model access is needed.
- `AWS_S3_BUCKET` (+ optional AWS vars) if you want final output uploaded to S3.
## Run Locally (Python)
1. Create and activate a virtual environment:
```bash
python3 -m venv .venv
source .venv/bin/activate
```
2. Install Python dependencies:
```bash
python -m pip install --upgrade pip
pip install -r requirements.txt
```
3. Install Hunyuan dependencies:
```bash
pip install -r HunyuanVideo-1.5/requirements.txt
pip install --upgrade tencentcloud-sdk-python
pip install sgl-kernel==0.3.18
```
4. Run full pipeline:
```bash
python run_video_pipeline.py
```
5. Common options:
```bash
# Skip generation and only merge + concat
python run_video_pipeline.py --skip-generate
# Skip S3 upload
python run_video_pipeline.py --skip-s3-upload
# Override base directory
python run_video_pipeline.py --base-dir /absolute/path/to/workdir
# Change logging verbosity
python run_video_pipeline.py --log-level DEBUG
```
## Run with Docker
1. Build image:
```bash
docker build -t content-generation:latest .
```
2. Optional build with extra attention backends:
```bash
docker build -t content-generation:latest --build-arg INSTALL_OPTIONAL_ATTENTION=1 .
```
3. Run pipeline in container (GPU required):
```bash
docker run --rm --gpus all \
--env-file .env \
-v "$(pwd)":/app \
-w /app \
content-generation:latest
```
4. Pass extra pipeline args:
```bash
docker run --rm --gpus all \
--env-file .env \
-v "$(pwd)":/app \
-w /app \
content-generation:latest \
python run_video_pipeline.py --skip-s3-upload --log-level DEBUG
```
## Input Expectations
1. `reel_script.json` must exist and contain a `shots` array.
2. `images/shot_<n>.png` and `audios/output_<n>.mp3` should align by shot number.
3. Final output is written by default to `results/final_output.mp4`.
## S3 Upload Behavior
1. If `AWS_S3_BUCKET` is set, the pipeline uploads final output to S3 using `S3VideoStorage`.
2. If `AWS_S3_BUCKET` is missing, upload is skipped with a warning.
3. Disable upload explicitly with `--skip-s3-upload`.
## Troubleshooting
1. `torch.cuda.is_available()` is false in Docker.
- Run with GPU flags: `docker run --gpus all ...`
- Verify NVIDIA Container Toolkit is installed on host.
- Check host GPU visibility: `nvidia-smi`.
2. `ffmpeg` or `ffprobe` not found.
- Local: install ffmpeg with your package manager.
- Docker: ffmpeg is installed in the provided Dockerfile.
3. Hunyuan generate step fails due to missing checkpoints.
- Ensure checkpoints are available under `HunyuanVideo-1.5/ckpts`.
- Confirm mounted project path in Docker includes checkpoints.
4. Hugging Face model download fails (401/403).
- Accept model access terms for gated models (for example FLUX.1-schnell).
- Set `HUGGINGFACE_HUB_TOKEN` in `.env`.
5. S3 upload fails.
- Confirm `AWS_S3_BUCKET` is set.
- If needed, set `AWS_REGION` and credentials (`AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, optional `AWS_SESSION_TOKEN`).
- For S3-compatible providers, set `AWS_S3_ENDPOINT_URL`.
6. Permission issues when running Docker with mounted volumes.
- Use your host user mapping if needed:
`docker run --rm --gpus all -u "$(id -u):$(id -g)" ...`
7. Out-of-memory during video generation.
- Keep `PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True,max_split_size_mb:128`.
- Reduce workload by skipping optional enhancements or lowering resolution/steps in generation scripts.
8. Verify syntax quickly before running.
```bash
python3 -m py_compile run_video_pipeline.py src/scripts/*.py
```

View File

@@ -1,35 +0,0 @@
from elevenlabs.client import ElevenLabs
from elevenlabs.play import play
import os
import json
from dotenv import load_dotenv

load_dotenv()

ELEVENLABS_API_KEY = os.getenv('ELEVENLABS_API_KEY')

if __name__ == '__main__':
    # Read the reel script and synthesise one MP3 voiceover per shot.
    script_path = "reel_script.json"
    with open(script_path, "r") as f:
        reel_data = json.load(f)
    client = ElevenLabs(
        api_key=ELEVENLABS_API_KEY
    )
    for shot in reel_data["shots"]:
        # BUG FIX: the original interpolated shot["shot_number"] with nested
        # double quotes inside an f-string, which is a SyntaxError on
        # Python < 3.12 (the project targets 3.10). Hoist to a variable.
        shot_num = shot["shot_number"]
        print(shot_num, shot["voiceover"])
        prompt = shot["voiceover"]
        audio = client.text_to_speech.convert(
            text=prompt,
            voice_id="JBFqnCBsd6RMkjVDRZzb",
            model_id="eleven_multilingual_v2",
            output_format="mp3_44100_128",
        )
        # convert() yields audio chunks; join into a single payload.
        audio_bytes = b"".join(audio)
        os.makedirs("audios", exist_ok=True)
        with open(f"audios/output_{shot_num}.mp3", "wb") as out:
            out.write(audio_bytes)

View File

@@ -1,28 +0,0 @@
import torch
from diffusers import FluxPipeline
import json
import os

if __name__ == '__main__':
    # Read the reel script and render one FLUX.1-schnell image per shot.
    script_path = "reel_script.json"
    with open(script_path, "r") as f:
        reel_data = json.load(f)
    pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16)
    # CPU offload keeps VRAM usage low at the cost of throughput.
    pipe.enable_model_cpu_offload()
    for shot in reel_data["shots"]:
        # BUG FIX: the original interpolated shot["shot_number"] with nested
        # double quotes inside an f-string, which is a SyntaxError on
        # Python < 3.12 (the project targets 3.10). Hoist to a variable.
        shot_num = shot["shot_number"]
        print(shot_num, shot["image_description"])
        prompt = shot["image_description"]
        image = pipe(
            prompt,
            guidance_scale=0.0,
            num_inference_steps=4,
            max_sequence_length=256,
            generator=torch.Generator("cpu").manual_seed(0)
        ).images[0]
        os.makedirs("images", exist_ok=True)
        image.save(f"images/shot_{shot_num}.png")

17
requirements.txt Normal file
View File

@@ -0,0 +1,17 @@
# Core project dependencies inferred from imports in this workspace
boto3
python-dotenv
elevenlabs
torch
transformers
diffusers
accelerate
safetensors
huggingface-hub
# Optional but commonly required for 4-bit quantization with BitsAndBytesConfig
bitsandbytes
# Notes:
# - ffmpeg/ffprobe are required by video scripts but installed at OS level, not via pip.
# - torchrun is provided by the torch package.

163
run_video_pipeline.py Normal file
View File

@@ -0,0 +1,163 @@
#!/usr/bin/env python3
"""Run the full video pipeline: generate, merge, and concatenate."""
from __future__ import annotations
import argparse
import logging
import os
import subprocess
import sys
from pathlib import Path
from src.scripts.logging_config import configure_logging
from src.scripts.s3_video_storage import S3VideoStorage
PROJECT_ROOT = Path(__file__).resolve().parent
SCRIPT_DIR = PROJECT_ROOT / "src" / "scripts"
DEFAULT_BASE_DIR = PROJECT_ROOT
DEFAULT_HUNYUAN_DIR = DEFAULT_BASE_DIR / "HunyuanVideo-1.5"
DEFAULT_REEL_SCRIPT = DEFAULT_BASE_DIR / "reel_script.json"
DEFAULT_IMAGES_DIR = DEFAULT_BASE_DIR / "images"
DEFAULT_VIDEOS_DIR = DEFAULT_BASE_DIR / "videos"
DEFAULT_AUDIOS_DIR = DEFAULT_BASE_DIR / "audios"
DEFAULT_MERGED_DIR = DEFAULT_BASE_DIR / "merged"
DEFAULT_OUTPUT = DEFAULT_BASE_DIR / "results" / "final_output.mp4"
LOGGER = logging.getLogger(__name__)
def parse_args() -> argparse.Namespace:
    """Define and parse all pipeline CLI options."""
    cli = argparse.ArgumentParser(description=__doc__)
    # Path overrides; sub-paths are re-derived from --base-dir in main().
    cli.add_argument("--base-dir", type=Path, default=DEFAULT_BASE_DIR)
    cli.add_argument("--hunyuan-dir", type=Path, default=DEFAULT_HUNYUAN_DIR)
    cli.add_argument("--reel-script", type=Path, default=DEFAULT_REEL_SCRIPT)
    cli.add_argument("--images-dir", type=Path, default=DEFAULT_IMAGES_DIR)
    cli.add_argument("--videos-dir", type=Path, default=DEFAULT_VIDEOS_DIR)
    cli.add_argument("--audios-dir", type=Path, default=DEFAULT_AUDIOS_DIR)
    cli.add_argument("--merged-dir", type=Path, default=DEFAULT_MERGED_DIR)
    cli.add_argument("--output", type=Path, default=DEFAULT_OUTPUT)
    cli.add_argument("--seed", type=int, default=1)
    # Stage toggles for partial runs.
    cli.add_argument("--skip-generate", action="store_true")
    cli.add_argument("--skip-merge", action="store_true")
    cli.add_argument("--skip-concat", action="store_true")
    cli.add_argument("--skip-s3-upload", action="store_true")
    cli.add_argument("--log-level", default="INFO")
    return cli.parse_args()
def run_step(name: str, cmd: list[str]) -> None:
    """Log and execute one pipeline stage; raises CalledProcessError on failure."""
    LOGGER.info("=== %s ===", name)
    rendered = " ".join(str(part) for part in cmd)
    LOGGER.info("$ %s", rendered)
    subprocess.run(cmd, check=True)
def maybe_upload_to_s3(output_path: Path) -> None:
    """Upload the final video to S3 when AWS_S3_BUCKET is configured.

    Skips with a warning (rather than failing) when the bucket is unset, so
    local runs work without AWS credentials.
    """
    bucket = os.getenv("AWS_S3_BUCKET")
    if not bucket:
        LOGGER.warning("Skipping S3 upload: AWS_S3_BUCKET is not set")
        return
    # Unset optional vars resolve to None — presumably S3VideoStorage/boto3
    # fall back to their defaults; TODO confirm against its constructor.
    storage = S3VideoStorage(
        {
            "bucket_name": bucket,
            "region_name": os.getenv("AWS_REGION"),
            "endpoint_url": os.getenv("AWS_S3_ENDPOINT_URL"),
            "aws_access_key_id": os.getenv("AWS_ACCESS_KEY_ID"),
            "aws_secret_access_key": os.getenv("AWS_SECRET_ACCESS_KEY"),
            "aws_session_token": os.getenv("AWS_SESSION_TOKEN"),
        }
    )
    s3_uri = storage.store_file(output_path)
    LOGGER.info("Uploaded output to %s", s3_uri)
def main() -> int:
    """Run generate -> merge -> concat, then optionally upload to S3.

    Returns a process exit code: 0 on success, the failing subprocess's
    returncode if a pipeline stage fails, 1 if the S3 upload fails.
    """
    args = parse_args()
    configure_logging(args.log_level)
    # If only base-dir is overridden, derive the common subpaths from it.
    if args.base_dir != DEFAULT_BASE_DIR:
        if args.hunyuan_dir == DEFAULT_HUNYUAN_DIR:
            args.hunyuan_dir = args.base_dir / "HunyuanVideo-1.5"
        if args.reel_script == DEFAULT_REEL_SCRIPT:
            args.reel_script = args.base_dir / "reel_script.json"
        if args.images_dir == DEFAULT_IMAGES_DIR:
            args.images_dir = args.base_dir / "images"
        if args.videos_dir == DEFAULT_VIDEOS_DIR:
            args.videos_dir = args.base_dir / "videos"
        if args.audios_dir == DEFAULT_AUDIOS_DIR:
            args.audios_dir = args.base_dir / "audios"
        if args.merged_dir == DEFAULT_MERGED_DIR:
            args.merged_dir = args.base_dir / "merged"
        if args.output == DEFAULT_OUTPUT:
            args.output = args.base_dir / "results" / "final_output.mp4"
    try:
        # Each stage is a child process so heavy GPU state is released
        # between steps; args are forwarded explicitly as CLI flags.
        if not args.skip_generate:
            run_step(
                "Generate Videos",
                [
                    sys.executable,
                    str(SCRIPT_DIR / "generate_videos.py"),
                    "--hunyuan-dir",
                    str(args.hunyuan_dir),
                    "--reel-script",
                    str(args.reel_script),
                    "--images-dir",
                    str(args.images_dir),
                    "--videos-dir",
                    str(args.videos_dir),
                    "--audios-dir",
                    str(args.audios_dir),
                    "--seed",
                    str(args.seed),
                ],
            )
        if not args.skip_merge:
            run_step(
                "Merge Audio + Video",
                [
                    sys.executable,
                    str(SCRIPT_DIR / "merge_audio_video.py"),
                    "--videos-dir",
                    str(args.videos_dir),
                    "--audios-dir",
                    str(args.audios_dir),
                    "--output-dir",
                    str(args.merged_dir),
                ],
            )
        if not args.skip_concat:
            run_step(
                "Concatenate Merged Videos",
                [
                    sys.executable,
                    str(SCRIPT_DIR / "concat_merged.py"),
                    "--merged-dir",
                    str(args.merged_dir),
                    "--output",
                    str(args.output),
                ],
            )
    except subprocess.CalledProcessError as exc:
        # Propagate the failing stage's exit code to the caller/CI.
        LOGGER.exception("Pipeline failed at command: %s", exc.cmd)
        return exc.returncode
    if not args.skip_s3_upload:
        try:
            maybe_upload_to_s3(args.output)
        except Exception:
            # The video was still produced locally; signal the upload
            # failure via exit code 1 without re-raising.
            LOGGER.exception("Failed uploading output to S3")
            return 1
    LOGGER.info("Pipeline complete")
    LOGGER.info("Final output: %s", args.output)
    return 0
if __name__ == "__main__":
raise SystemExit(main())

View File

@@ -0,0 +1,80 @@
#!/usr/bin/env python3
"""Concatenate merged_*.mp4 files into a single output using ffmpeg concat demuxer."""
from __future__ import annotations
import argparse
import logging
import re
import subprocess
import tempfile
from pathlib import Path
from logging_config import configure_logging
SCRIPT_DIR = Path(__file__).resolve().parent
DEFAULT_BASE_DIR = SCRIPT_DIR.parents[1]
DEFAULT_MERGED_DIR = DEFAULT_BASE_DIR / "merged"
DEFAULT_OUTPUT = DEFAULT_BASE_DIR / "results" / "run_3" / "final_output.mp4"
LOGGER = logging.getLogger(__name__)
def shot_number(path: Path) -> int:
    """Extract the numeric index from a merged_<n>.mp4 filename (-1 if absent)."""
    found = re.search(r"merged_(\d+)\.mp4$", path.name)
    if found is None:
        return -1
    return int(found.group(1))
def parse_args() -> argparse.Namespace:
    """Build and parse CLI options for the concat step."""
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--merged-dir", type=Path, default=DEFAULT_MERGED_DIR)
    cli.add_argument("--output", type=Path, default=DEFAULT_OUTPUT)
    cli.add_argument("--log-level", default="INFO")
    return cli.parse_args()
def main() -> int:
    """Concatenate merged clips, in shot order, into one output video.

    Returns 0 on success, 1 when no merged clips are found.
    Raises subprocess.CalledProcessError if ffmpeg fails.
    """
    args = parse_args()
    configure_logging(args.log_level)
    videos = sorted(args.merged_dir.glob("merged_*.mp4"), key=shot_number)
    if not videos:
        LOGGER.warning("No merged videos found in %s", args.merged_dir)
        return 1
    args.output.parent.mkdir(parents=True, exist_ok=True)
    # The concat demuxer needs a list file with one "file '<path>'" per line.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as tmp:
        filelist = Path(tmp.name)
        for video in videos:
            # BUG FIX: the original wrote a literal backslash-n ("\\n"),
            # putting every entry on a single line and producing a malformed
            # concat list; write a real newline instead.
            tmp.write(f"file '{video}'\n")
    try:
        LOGGER.info("Concatenating the following files:\n%s", filelist.read_text().rstrip())
        # Stream copy (-c copy): no re-encode, inputs must share codecs.
        subprocess.run(
            [
                "ffmpeg",
                "-f",
                "concat",
                "-safe",
                "0",
                "-i",
                str(filelist),
                "-c",
                "copy",
                "-y",
                str(args.output),
            ],
            check=True,
        )
    finally:
        # Always remove the temp list, even if ffmpeg fails.
        filelist.unlink(missing_ok=True)
    LOGGER.info("Done")
    return 0
if __name__ == "__main__":
raise SystemExit(main())

View File

@@ -0,0 +1,54 @@
from __future__ import annotations
import json
import logging
import os
from pathlib import Path
from dotenv import load_dotenv
from elevenlabs.client import ElevenLabs
from logging_config import configure_logging
SCRIPT_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = SCRIPT_DIR.parents[1]
load_dotenv(PROJECT_ROOT / ".env")
LOGGER = logging.getLogger(__name__)
def main() -> int:
    """Synthesise one ElevenLabs voiceover MP3 per shot in reel_script.json."""
    configure_logging("INFO")
    api_key = os.getenv("ELEVENLABS_API_KEY")
    if not api_key:
        raise RuntimeError("ELEVENLABS_API_KEY is not set")
    reel_script = PROJECT_ROOT / "reel_script.json"
    audios_dir = PROJECT_ROOT / "audios"
    audios_dir.mkdir(parents=True, exist_ok=True)
    shots = json.loads(reel_script.read_text())["shots"]
    client = ElevenLabs(api_key=api_key)
    for shot in shots:
        shot_num = shot["shot_number"]
        prompt = shot["voiceover"]
        LOGGER.info("Generating audio for shot %s: %s", shot_num, prompt)
        # convert() yields audio chunks; join them into a single MP3 payload.
        stream = client.text_to_speech.convert(
            text=prompt,
            voice_id="JBFqnCBsd6RMkjVDRZzb",
            model_id="eleven_multilingual_v2",
            output_format="mp3_44100_128",
        )
        (audios_dir / f"output_{shot_num}.mp3").write_bytes(b"".join(stream))
    return 0
if __name__ == "__main__":
raise SystemExit(main())

View File

@@ -0,0 +1,50 @@
from __future__ import annotations
import json
import logging
from pathlib import Path
import torch
from diffusers import FluxPipeline
from logging_config import configure_logging
SCRIPT_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = SCRIPT_DIR.parents[1]
LOGGER = logging.getLogger(__name__)
def main() -> int:
    """Render one FLUX.1-schnell image per shot described in reel_script.json."""
    configure_logging("INFO")
    reel_script = PROJECT_ROOT / "reel_script.json"
    images_dir = PROJECT_ROOT / "images"
    images_dir.mkdir(parents=True, exist_ok=True)
    shots = json.loads(reel_script.read_text())["shots"]
    pipe = FluxPipeline.from_pretrained(
        "black-forest-labs/FLUX.1-schnell",
        torch_dtype=torch.bfloat16,
    )
    # CPU offload keeps VRAM usage low at the cost of throughput.
    pipe.enable_model_cpu_offload()
    for shot in shots:
        shot_num = shot["shot_number"]
        prompt = shot["image_description"]
        LOGGER.info("Generating image for shot %s: %s", shot_num, prompt)
        # Fixed CPU seed keeps outputs reproducible across runs.
        result = pipe(
            prompt,
            guidance_scale=0.0,
            num_inference_steps=4,
            max_sequence_length=256,
            generator=torch.Generator("cpu").manual_seed(0),
        )
        result.images[0].save(images_dir / f"shot_{shot_num}.png")
    return 0
if __name__ == "__main__":
raise SystemExit(main())

View File

@@ -1,9 +1,15 @@
import torch import torch
import json import json
import logging
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import re import re
from typing import Optional from typing import Optional
from logging_config import configure_logging
LOGGER = logging.getLogger(__name__)
device = 'cuda' if torch.cuda.is_available() else 'cpu' device = 'cuda' if torch.cuda.is_available() else 'cpu'
MODEL_ID = "Qwen/Qwen3-14B" MODEL_ID = "Qwen/Qwen3-14B"
WORDS_PER_SECOND = 2.5 WORDS_PER_SECOND = 2.5
@@ -174,7 +180,7 @@ def generate_reel_scenario(
inputs = tokenizer(text, return_tensors="pt").to(model.device) inputs = tokenizer(text, return_tensors="pt").to(model.device)
print("Generating reel scenario..") LOGGER.info("Generating reel scenario")
with torch.no_grad(): with torch.no_grad():
output_ids = model.generate( output_ids = model.generate(
**inputs, **inputs,
@@ -330,6 +336,7 @@ def parse_reel_scenario(raw_scenario: str) -> dict:
if __name__ == '__main__': if __name__ == '__main__':
configure_logging("INFO")
with open("topic_description.txt", "r") as f: with open("topic_description.txt", "r") as f:
topic = f.read() topic = f.read()

View File

@@ -0,0 +1,171 @@
#!/usr/bin/env python3
"""Generate shot videos with HunyuanVideo based on reel script and audio durations."""
from __future__ import annotations
import argparse
import json
import logging
import os
import subprocess
from pathlib import Path
from logging_config import configure_logging
SCRIPT_DIR = Path(__file__).resolve().parent
DEFAULT_BASE_DIR = SCRIPT_DIR.parents[1]
DEFAULT_HUNYUAN_DIR = DEFAULT_BASE_DIR / "HunyuanVideo-1.5"
DEFAULT_REEL_SCRIPT = DEFAULT_BASE_DIR / "reel_script.json"
DEFAULT_IMAGES_DIR = DEFAULT_BASE_DIR / "images"
DEFAULT_VIDEOS_DIR = DEFAULT_BASE_DIR / "videos"
DEFAULT_AUDIOS_DIR = DEFAULT_BASE_DIR / "audios"
LOGGER = logging.getLogger(__name__)
def parse_args() -> argparse.Namespace:
    """Parse CLI options for the Hunyuan video-generation step."""
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--hunyuan-dir", type=Path, default=DEFAULT_HUNYUAN_DIR)
    cli.add_argument("--reel-script", type=Path, default=DEFAULT_REEL_SCRIPT)
    cli.add_argument("--images-dir", type=Path, default=DEFAULT_IMAGES_DIR)
    cli.add_argument("--videos-dir", type=Path, default=DEFAULT_VIDEOS_DIR)
    cli.add_argument("--audios-dir", type=Path, default=DEFAULT_AUDIOS_DIR)
    cli.add_argument("--seed", type=int, default=1)
    cli.add_argument("--log-level", default="INFO")
    return cli.parse_args()
def get_audio_duration(audio_path: Path) -> float:
    """Return the duration of an audio file in seconds, as reported by ffprobe.

    Raises subprocess.CalledProcessError if ffprobe fails, ValueError if its
    output is not parseable as a float.
    """
    probe_cmd = [
        "ffprobe",
        "-v",
        "error",
        "-show_entries",
        "format=duration",
        "-of",
        "default=noprint_wrappers=1:nokey=1",
        str(audio_path),
    ]
    probe = subprocess.run(probe_cmd, check=True, text=True, capture_output=True)
    return float(probe.stdout.strip())
def duration_to_video_length(duration: float, fps: int = 24) -> int:
    """Convert an audio duration (seconds) into a frame count for Hunyuan.

    Generalized: the previously hard-coded 24 fps is now a parameter with the
    same default, so callers are unaffected.

    Args:
        duration: clip length in seconds.
        fps: frames per second assumed by the generation step (default 24).

    Returns:
        An odd frame count clamped to [49, 169]. Both bounds are odd, so
        clamping preserves the odd-length requirement.
    """
    frames = int(duration * fps) + 1
    if frames % 2 == 0:
        frames += 1  # video_length must be odd
    return max(49, min(frames, 169))
def main() -> int:
    """Drive Hunyuan i2v generation for every shot in the reel script.

    Skips shots whose output already exists or whose source image is missing;
    frame count per shot is derived from the matching audio's duration.
    Returns 0; a failing subprocess raises CalledProcessError.
    """
    args = parse_args()
    configure_logging(args.log_level)
    model_path = args.hunyuan_dir / "ckpts"
    args.videos_dir.mkdir(parents=True, exist_ok=True)
    env = os.environ.copy()
    # Reduce CUDA allocator fragmentation during long generation runs.
    env["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:128"
    data = json.loads(args.reel_script.read_text())
    shots = data.get("shots", [])
    LOGGER.info("Found %s shots to generate", len(shots))
    for shot in shots:
        shot_number = shot["shot_number"]
        # Tabs/newlines would break the single-line CLI prompt argument.
        prompt = str(shot["image_description"]).replace("\t", " ").replace("\n", " ")
        image_path = args.images_dir / f"shot_{shot_number}.png"
        output_path = args.videos_dir / f"output_{shot_number}.mp4"
        audio_path = args.audios_dir / f"output_{shot_number}.mp3"
        if not audio_path.exists():
            LOGGER.warning("No audio found at %s, falling back to 5s default", audio_path)
            duration = 5.0
        else:
            duration = get_audio_duration(audio_path)
            LOGGER.info("Audio duration for shot %s: %ss", shot_number, duration)
        video_length = duration_to_video_length(duration)
        LOGGER.info("Shot %s | %ss -> %s frames", shot_number, duration, video_length)
        LOGGER.info("Prompt: %s", prompt)
        LOGGER.info("Image: %s", image_path)
        LOGGER.info("Audio: %s", audio_path)
        LOGGER.info("Output: %s", output_path)
        if output_path.exists():
            # Idempotent re-runs: never regenerate an existing shot.
            LOGGER.info("Output path already exists, skipping")
            continue
        if not image_path.exists():
            LOGGER.warning("Image not found at %s, skipped", image_path)
            continue
        # NOTE(review): this empty_cache runs in a *fresh* subprocess, so it
        # cannot free memory held by any other process — it looks like a
        # no-op; confirm intent or drop it.
        subprocess.run(
            [
                "python3",
                "-c",
                "import torch; torch.cuda.empty_cache()",
            ],
            check=True,
            env=env,
        )
        LOGGER.info("GPU cache cleared")
        # Launch Hunyuan's generate.py via torchrun from inside its repo
        # (cwd) so its relative imports/paths resolve.
        subprocess.run(
            [
                "torchrun",
                "--nproc_per_node=1",
                "generate.py",
                "--prompt",
                prompt,
                "--image_path",
                str(image_path),
                "--resolution",
                "480p",
                "--aspect_ratio",
                "16:9",
                "--seed",
                str(args.seed),
                "--video_length",
                str(video_length),
                "--rewrite",
                "false",
                "--cfg_distilled",
                "true",
                "--enable_step_distill",
                "true",
                "--sparse_attn",
                "false",
                "--use_sageattn",
                "true",
                "--enable_cache",
                "false",
                "--overlap_group_offloading",
                "true",
                "--sr",
                "false",
                "--output_path",
                str(output_path),
                "--model_path",
                str(model_path),
            ],
            check=True,
            cwd=args.hunyuan_dir,
            env=env,
        )
        LOGGER.info("Shot %s done", shot_number)
    LOGGER.info("Done")
    return 0
if __name__ == "__main__":
raise SystemExit(main())

View File

@@ -0,0 +1,13 @@
from __future__ import annotations
import logging
# Shared format: timestamp | level | logger name | message.
DEFAULT_LOG_FORMAT = "%(asctime)s | %(levelname)s | %(name)s | %(message)s"


def configure_logging(level: str = "INFO") -> None:
    """Initialise root logging with the shared pipeline format.

    Unknown level names fall back to INFO instead of raising.
    """
    resolved = getattr(logging, level.upper(), logging.INFO)
    logging.basicConfig(level=resolved, format=DEFAULT_LOG_FORMAT)

View File

@@ -0,0 +1,85 @@
#!/usr/bin/env python3
"""Merge videos/output_n.mp4 with audios/output_n.mp3 into merged/merged_n.mp4."""
from __future__ import annotations
import argparse
import logging
import re
import subprocess
from pathlib import Path
from logging_config import configure_logging
SCRIPT_DIR = Path(__file__).resolve().parent
DEFAULT_BASE_DIR = SCRIPT_DIR.parents[1]
DEFAULT_VIDEOS_DIR = DEFAULT_BASE_DIR / "videos"
DEFAULT_AUDIOS_DIR = DEFAULT_BASE_DIR / "audios"
DEFAULT_OUTPUT_DIR = DEFAULT_BASE_DIR / "merged"
LOGGER = logging.getLogger(__name__)
def shot_number(path: Path) -> int:
    """Extract the numeric index from an output_<n>.mp4 filename (-1 if absent)."""
    found = re.search(r"output_(\d+)\.mp4$", path.name)
    if found is None:
        return -1
    return int(found.group(1))
def parse_args() -> argparse.Namespace:
    """Build and parse CLI options for the audio/video merge step."""
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("--videos-dir", type=Path, default=DEFAULT_VIDEOS_DIR)
    cli.add_argument("--audios-dir", type=Path, default=DEFAULT_AUDIOS_DIR)
    cli.add_argument("--output-dir", type=Path, default=DEFAULT_OUTPUT_DIR)
    cli.add_argument("--log-level", default="INFO")
    return cli.parse_args()
def main() -> int:
    """Mux each output_<n>.mp4 with output_<n>.mp3 into merged_<n>.mp4.

    Returns 0 on success, 1 when no input videos exist. Shots lacking audio
    or already merged are skipped.
    """
    args = parse_args()
    configure_logging(args.log_level)
    args.output_dir.mkdir(parents=True, exist_ok=True)
    # Sort numerically by shot index, not lexicographically.
    videos = sorted(args.videos_dir.glob("output_*.mp4"), key=shot_number)
    if not videos:
        LOGGER.warning("No videos found in %s", args.videos_dir)
        return 1
    for video in videos:
        num = shot_number(video)
        audio = args.audios_dir / f"output_{num}.mp3"
        output = args.output_dir / f"merged_{num}.mp4"
        if not audio.exists():
            LOGGER.warning("No audio found for shot %s (%s); skipped", num, audio)
            continue
        if output.exists():
            # Idempotent re-runs: never re-encode an existing merge.
            LOGGER.info("Already exists; skipped shot %s", num)
            continue
        LOGGER.info("Merging shot %s: %s + %s -> %s", num, video, audio, output)
        # Copy video stream untouched, re-encode audio to AAC; -shortest
        # trims output to the shorter of the two streams.
        subprocess.run(
            [
                "ffmpeg",
                "-i",
                str(video),
                "-i",
                str(audio),
                "-c:v",
                "copy",
                "-c:a",
                "aac",
                "-shortest",
                "-y",
                str(output),
            ],
            check=True,
        )
        LOGGER.info("Done: %s", output)
    return 0
if __name__ == "__main__":
raise SystemExit(main())

View File

@@ -0,0 +1,163 @@
#!/usr/bin/env python3
"""Run the full video pipeline: generate, merge, and concatenate."""
from __future__ import annotations
import argparse
import logging
import os
import subprocess
import sys
from pathlib import Path
from logging_config import configure_logging
from s3_video_storage import S3VideoStorage
SCRIPT_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = SCRIPT_DIR.parents[1]
DEFAULT_BASE_DIR = PROJECT_ROOT
DEFAULT_HUNYUAN_DIR = DEFAULT_BASE_DIR / "HunyuanVideo-1.5"
DEFAULT_REEL_SCRIPT = DEFAULT_BASE_DIR / "reel_script.json"
DEFAULT_IMAGES_DIR = DEFAULT_BASE_DIR / "images"
DEFAULT_VIDEOS_DIR = DEFAULT_BASE_DIR / "videos"
DEFAULT_AUDIOS_DIR = DEFAULT_BASE_DIR / "audios"
DEFAULT_MERGED_DIR = DEFAULT_BASE_DIR / "merged"
DEFAULT_OUTPUT = DEFAULT_BASE_DIR / "results" / "final_output.mp4"
LOGGER = logging.getLogger(__name__)
def parse_args() -> argparse.Namespace:
    """Assemble and parse the pipeline's command-line interface."""
    p = argparse.ArgumentParser(description=__doc__)
    # Filesystem locations; main() re-derives these when --base-dir changes.
    for flag, default in (
        ("--base-dir", DEFAULT_BASE_DIR),
        ("--hunyuan-dir", DEFAULT_HUNYUAN_DIR),
        ("--reel-script", DEFAULT_REEL_SCRIPT),
        ("--images-dir", DEFAULT_IMAGES_DIR),
        ("--videos-dir", DEFAULT_VIDEOS_DIR),
        ("--audios-dir", DEFAULT_AUDIOS_DIR),
        ("--merged-dir", DEFAULT_MERGED_DIR),
        ("--output", DEFAULT_OUTPUT),
    ):
        p.add_argument(flag, type=Path, default=default)
    p.add_argument("--seed", type=int, default=1)
    # Stage toggles for partial runs.
    for flag in ("--skip-generate", "--skip-merge", "--skip-concat", "--skip-s3-upload"):
        p.add_argument(flag, action="store_true")
    p.add_argument("--log-level", default="INFO")
    return p.parse_args()
def run_step(name: str, cmd: list[str]) -> None:
    """Announce and run one pipeline stage; CalledProcessError propagates."""
    LOGGER.info("=== %s ===", name)
    LOGGER.info("$ %s", " ".join(map(str, cmd)))
    subprocess.run(cmd, check=True)
def maybe_upload_to_s3(output_path: Path) -> None:
    """Upload the final video to S3 when AWS_S3_BUCKET is configured.

    Skips with a warning (rather than failing) when the bucket is unset, so
    local runs work without AWS credentials.
    """
    bucket = os.getenv("AWS_S3_BUCKET")
    if not bucket:
        LOGGER.warning("Skipping S3 upload: AWS_S3_BUCKET is not set")
        return
    # Unset optional vars resolve to None — presumably S3VideoStorage/boto3
    # fall back to their defaults; TODO confirm against its constructor.
    storage = S3VideoStorage(
        {
            "bucket_name": bucket,
            "region_name": os.getenv("AWS_REGION"),
            "endpoint_url": os.getenv("AWS_S3_ENDPOINT_URL"),
            "aws_access_key_id": os.getenv("AWS_ACCESS_KEY_ID"),
            "aws_secret_access_key": os.getenv("AWS_SECRET_ACCESS_KEY"),
            "aws_session_token": os.getenv("AWS_SESSION_TOKEN"),
        }
    )
    s3_uri = storage.store_file(output_path)
    LOGGER.info("Uploaded output to %s", s3_uri)
def main() -> int:
    """Run generate -> merge -> concat, then optionally upload to S3.

    Returns a process exit code: 0 on success, the failing subprocess's
    returncode if a pipeline stage fails, 1 if the S3 upload fails.
    """
    args = parse_args()
    configure_logging(args.log_level)
    # If only base-dir is overridden, derive the common subpaths from it.
    if args.base_dir != DEFAULT_BASE_DIR:
        if args.hunyuan_dir == DEFAULT_HUNYUAN_DIR:
            args.hunyuan_dir = args.base_dir / "HunyuanVideo-1.5"
        if args.reel_script == DEFAULT_REEL_SCRIPT:
            args.reel_script = args.base_dir / "reel_script.json"
        if args.images_dir == DEFAULT_IMAGES_DIR:
            args.images_dir = args.base_dir / "images"
        if args.videos_dir == DEFAULT_VIDEOS_DIR:
            args.videos_dir = args.base_dir / "videos"
        if args.audios_dir == DEFAULT_AUDIOS_DIR:
            args.audios_dir = args.base_dir / "audios"
        if args.merged_dir == DEFAULT_MERGED_DIR:
            args.merged_dir = args.base_dir / "merged"
        if args.output == DEFAULT_OUTPUT:
            args.output = args.base_dir / "results" / "final_output.mp4"
    try:
        # Each stage runs as a child process so heavy GPU state is released
        # between steps; settings are forwarded explicitly as CLI flags.
        if not args.skip_generate:
            run_step(
                "Generate Videos",
                [
                    sys.executable,
                    str(SCRIPT_DIR / "generate_videos.py"),
                    "--hunyuan-dir",
                    str(args.hunyuan_dir),
                    "--reel-script",
                    str(args.reel_script),
                    "--images-dir",
                    str(args.images_dir),
                    "--videos-dir",
                    str(args.videos_dir),
                    "--audios-dir",
                    str(args.audios_dir),
                    "--seed",
                    str(args.seed),
                ],
            )
        if not args.skip_merge:
            run_step(
                "Merge Audio + Video",
                [
                    sys.executable,
                    str(SCRIPT_DIR / "merge_audio_video.py"),
                    "--videos-dir",
                    str(args.videos_dir),
                    "--audios-dir",
                    str(args.audios_dir),
                    "--output-dir",
                    str(args.merged_dir),
                ],
            )
        if not args.skip_concat:
            run_step(
                "Concatenate Merged Videos",
                [
                    sys.executable,
                    str(SCRIPT_DIR / "concat_merged.py"),
                    "--merged-dir",
                    str(args.merged_dir),
                    "--output",
                    str(args.output),
                ],
            )
    except subprocess.CalledProcessError as exc:
        # Propagate the failing stage's exit code to the caller/CI.
        LOGGER.exception("Pipeline failed at command: %s", exc.cmd)
        return exc.returncode
    if not args.skip_s3_upload:
        try:
            maybe_upload_to_s3(args.output)
        except Exception:
            # The video was still produced locally; signal the upload
            # failure via exit code 1 without re-raising.
            LOGGER.exception("Failed uploading output to S3")
            return 1
    LOGGER.info("Pipeline complete")
    LOGGER.info("Final output: %s", args.output)
    return 0
if __name__ == "__main__":
raise SystemExit(main())