diff --git a/.dockerignore b/.dockerignore index 87188f9..48fda6f 100644 --- a/.dockerignore +++ b/.dockerignore @@ -42,6 +42,7 @@ Thumbs.db # Local env and logs .env .env.* +!.env.example *.log *.pid @@ -50,3 +51,13 @@ Thumbs.db *.mov *.avi *.mkv + +# Project generated data and checkpoints +images/ +audios/ +videos/ +merged/ +results/ +outputs/ +ckpts/ +HunyuanVideo-1.5/ckpts/ diff --git a/.env b/.env deleted file mode 100644 index f22ab82..0000000 --- a/.env +++ /dev/null @@ -1 +0,0 @@ -ELEVENLABS_API_KEY=sk_e343522cb3fd4da2d46844e81e1152e3de2a72cd1430a383 diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..4d22d25 --- /dev/null +++ b/.env.example @@ -0,0 +1,19 @@ +# ElevenLabs +ELEVENLABS_API_KEY= + +# Hugging Face (required for gated model downloads, e.g. FLUX.1-schnell) +HUGGINGFACE_HUB_TOKEN= + +# Hunyuan prompt rewrite endpoints (optional; rewrite is disabled in current generate_videos.py) +T2V_REWRITE_BASE_URL= +T2V_REWRITE_MODEL_NAME= +I2V_REWRITE_BASE_URL= +I2V_REWRITE_MODEL_NAME= + +# AWS / S3 (used when initializing S3VideoStorage) +AWS_ACCESS_KEY_ID= +AWS_SECRET_ACCESS_KEY= +AWS_SESSION_TOKEN= +AWS_REGION= +AWS_S3_BUCKET= +AWS_S3_ENDPOINT_URL= diff --git a/.gitignore b/.gitignore index 297b60a..89090fb 100644 --- a/.gitignore +++ b/.gitignore @@ -58,6 +58,7 @@ Thumbs.db # Local environment variables .env .env.* +!.env.example # Project-specific artifacts *.mp4 diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..09a67ce --- /dev/null +++ b/Dockerfile @@ -0,0 +1,66 @@ +FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 + +ENV DEBIAN_FRONTEND=noninteractive \ + PYTHONUNBUFFERED=1 \ + PIP_NO_CACHE_DIR=1 \ + PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True,max_split_size_mb:128 + +# Base OS tools + media stack + Python toolchain. 
+RUN apt-get update && apt-get install -y --no-install-recommends \ + python3.10 \ + python3-pip \ + python3.10-dev \ + python3.10-venv \ + ffmpeg \ + git \ + git-lfs \ + ca-certificates \ + curl \ + build-essential \ + pkg-config \ + ninja-build \ + libglib2.0-0 \ + libgl1 \ + && rm -rf /var/lib/apt/lists/* \ + && ln -sf /usr/bin/python3.10 /usr/bin/python \ + && ln -sf /usr/bin/pip3 /usr/bin/pip \ + && git lfs install + +WORKDIR /app + +# Install project Python dependencies first for better layer caching. +COPY requirements.txt /app/requirements.txt + +RUN python -m pip install --upgrade pip setuptools wheel \ + && pip install --index-url https://download.pytorch.org/whl/cu121 torch torchvision torchaudio \ + && pip install -r /app/requirements.txt \ + && pip install -U accelerate safetensors + +# Copy project code. +COPY . /app + +# Ensure HunyuanVideo source exists in the image. +ARG HUNYUAN_REPO=https://github.com/Tencent-Hunyuan/HunyuanVideo-1.5.git +RUN if [ ! -f /app/HunyuanVideo-1.5/requirements.txt ]; then \ + rm -rf /app/HunyuanVideo-1.5 && \ + git clone --depth 1 "$HUNYUAN_REPO" /app/HunyuanVideo-1.5; \ + fi + +# Install HunyuanVideo dependencies from upstream README guidance. +RUN pip install -r /app/HunyuanVideo-1.5/requirements.txt \ + && pip install --upgrade tencentcloud-sdk-python \ + && pip install sgl-kernel==0.3.18 + +# Optional attention backends from Hunyuan docs. +# Build with: --build-arg INSTALL_OPTIONAL_ATTENTION=1 +ARG INSTALL_OPTIONAL_ATTENTION=0 +RUN if [ "$INSTALL_OPTIONAL_ATTENTION" = "1" ]; then \ + pip install flash-attn --no-build-isolation && \ + git clone --depth 1 https://github.com/Tencent-Hunyuan/flex-block-attn.git /tmp/flex-block-attn && \ + cd /tmp/flex-block-attn && git submodule update --init --recursive && python setup.py install && \ + git clone --depth 1 https://github.com/cooper1637/SageAttention.git /tmp/SageAttention && \ + cd /tmp/SageAttention && python setup.py install; \ + fi + +# Default pipeline entrypoint. 
+CMD ["python", "run_video_pipeline.py"] diff --git a/README.md b/README.md new file mode 100644 index 0000000..523e134 --- /dev/null +++ b/README.md @@ -0,0 +1,202 @@ +# ContentGeneration Pipeline + +This project runs a 3-step video pipeline: + +1. Generate shot videos from images + prompts. +2. Merge each generated video with its audio. +3. Concatenate merged clips into one final output. + +The pipeline entrypoint is `run_video_pipeline.py`. + +## Quick Start + +Local Python: + +```bash +cp .env.example .env +python3 -m venv .venv && source .venv/bin/activate +pip install -r requirements.txt +python run_video_pipeline.py +``` + +Docker (GPU): + +```bash +cp .env.example .env +docker build -t content-generation:latest . +docker run --rm --gpus all --env-file .env -v "$(pwd)":/app -w /app content-generation:latest +``` + +First run (skip S3 upload): + +```bash +python run_video_pipeline.py --skip-s3-upload +``` + +Docker first run (skip S3 upload): + +```bash +docker run --rm --gpus all --env-file .env -v "$(pwd)":/app -w /app content-generation:latest \ + python run_video_pipeline.py --skip-s3-upload +``` + +## Project Layout + +- `run_video_pipeline.py`: main entrypoint. +- `src/scripts/`: helper scripts used by the pipeline. +- `HunyuanVideo-1.5/`: Hunyuan inference code and model dependencies. +- `reel_script.json`: required script input with `shots`. +- `images/`, `audios/`, `videos/`, `merged/`, `results/`: working/output folders. +- `.env.example`: environment variable template. + +## Prerequisites + +1. Linux with NVIDIA GPU and CUDA runtime. +2. `ffmpeg` and `ffprobe` available on PATH. +3. Python 3.10+. +4. Hunyuan model checkpoints under `HunyuanVideo-1.5/ckpts`. +5. If using FLUX local download, access approved for `black-forest-labs/FLUX.1-schnell`. + +## Environment Variables + +1. Create local env file: + +```bash +cp .env.example .env +``` + +2. Fill required variables in `.env`: +- `ELEVENLABS_API_KEY` for audio generation. 
+- `HUGGINGFACE_HUB_TOKEN` if gated Hugging Face model access is needed. +- `AWS_S3_BUCKET` (+ optional AWS vars) if you want final output uploaded to S3. + +## Run Locally (Python) + +1. Create and activate a virtual environment: + +```bash +python3 -m venv .venv +source .venv/bin/activate +``` + +2. Install Python dependencies: + +```bash +python -m pip install --upgrade pip +pip install -r requirements.txt +``` + +3. Install Hunyuan dependencies: + +```bash +pip install -r HunyuanVideo-1.5/requirements.txt +pip install --upgrade tencentcloud-sdk-python +pip install sgl-kernel==0.3.18 +``` + +4. Run full pipeline: + +```bash +python run_video_pipeline.py +``` + +5. Common options: + +```bash +# Skip generation and only merge + concat +python run_video_pipeline.py --skip-generate + +# Skip S3 upload +python run_video_pipeline.py --skip-s3-upload + +# Override base directory +python run_video_pipeline.py --base-dir /absolute/path/to/workdir + +# Change logging verbosity +python run_video_pipeline.py --log-level DEBUG +``` + +## Run with Docker + +1. Build image: + +```bash +docker build -t content-generation:latest . +``` + +2. Optional build with extra attention backends: + +```bash +docker build -t content-generation:latest --build-arg INSTALL_OPTIONAL_ATTENTION=1 . +``` + +3. Run pipeline in container (GPU required): + +```bash +docker run --rm --gpus all \ + --env-file .env \ + -v "$(pwd)":/app \ + -w /app \ + content-generation:latest +``` + +4. Pass extra pipeline args: + +```bash +docker run --rm --gpus all \ + --env-file .env \ + -v "$(pwd)":/app \ + -w /app \ + content-generation:latest \ + python run_video_pipeline.py --skip-s3-upload --log-level DEBUG +``` + +## Input Expectations + +1. `reel_script.json` must exist and contain a `shots` array. +2. `images/shot_.png` and `audios/output_.mp3` should align by shot number. +3. Final output is written by default to `results/final_output.mp4`. + +## S3 Upload Behavior + +1. 
If `AWS_S3_BUCKET` is set, the pipeline uploads final output to S3 using `S3VideoStorage`. +2. If `AWS_S3_BUCKET` is missing, upload is skipped with a warning. +3. Disable upload explicitly with `--skip-s3-upload`. + +## Troubleshooting + +1. `torch.cuda.is_available()` is false in Docker. +- Run with GPU flags: `docker run --gpus all ...` +- Verify NVIDIA Container Toolkit is installed on host. +- Check host GPU visibility: `nvidia-smi`. + +2. `ffmpeg` or `ffprobe` not found. +- Local: install ffmpeg with your package manager. +- Docker: ffmpeg is installed in the provided Dockerfile. + +3. Hunyuan generate step fails due to missing checkpoints. +- Ensure checkpoints are available under `HunyuanVideo-1.5/ckpts`. +- Confirm mounted project path in Docker includes checkpoints. + +4. Hugging Face model download fails (401/403). +- Accept model access terms for gated models (for example FLUX.1-schnell). +- Set `HUGGINGFACE_HUB_TOKEN` in `.env`. + +5. S3 upload fails. +- Confirm `AWS_S3_BUCKET` is set. +- If needed, set `AWS_REGION` and credentials (`AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, optional `AWS_SESSION_TOKEN`). +- For S3-compatible providers, set `AWS_S3_ENDPOINT_URL`. + +6. Permission issues when running Docker with mounted volumes. +- Use your host user mapping if needed: + `docker run --rm --gpus all -u "$(id -u):$(id -g)" ...` + +7. Out-of-memory during video generation. +- Keep `PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True,max_split_size_mb:128`. +- Reduce workload by skipping optional enhancements or lowering resolution/steps in generation scripts. + +8. Verify syntax quickly before running. 
+ +```bash +python3 -m py_compile run_video_pipeline.py src/scripts/*.py +``` diff --git a/generate_audios.py b/generate_audios.py deleted file mode 100644 index dc88555..0000000 --- a/generate_audios.py +++ /dev/null @@ -1,35 +0,0 @@ -from elevenlabs.client import ElevenLabs -from elevenlabs.play import play -import os -import json -from dotenv import load_dotenv - -load_dotenv() -ELEVENLABS_API_KEY = os.getenv('ELEVENLABS_API_KEY') - - -if __name__ == '__main__': - - script_path = "reel_script.json" - with open(script_path, "r") as f: - reel_data = json.load(f) - - client = ElevenLabs( - api_key=ELEVENLABS_API_KEY - ) - for shot in reel_data["shots"]: - print(shot["shot_number"], shot["voiceover"]) - prompt = shot["voiceover"] - audio = client.text_to_speech.convert( - text=prompt, - voice_id="JBFqnCBsd6RMkjVDRZzb", - model_id="eleven_multilingual_v2", - output_format="mp3_44100_128", - ) - - audio_bytes = b"".join(audio) - - if not os.path.exists("audios"): - os.makedirs("audios") - with open(f"audios/output_{shot["shot_number"]}.mp3", "wb") as f: - f.write(audio_bytes) \ No newline at end of file diff --git a/generate_images.py b/generate_images.py deleted file mode 100644 index 446145b..0000000 --- a/generate_images.py +++ /dev/null @@ -1,28 +0,0 @@ -import torch -from diffusers import FluxPipeline -import json -import os - -if __name__ == '__main__': - - script_path = "reel_script.json" - with open(script_path, "r") as f: - reel_data = json.load(f) - - pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16) - pipe.enable_model_cpu_offload() - - for shot in reel_data["shots"]: - print(shot["shot_number"], shot["image_description"]) - prompt = shot["image_description"] - image = pipe( - prompt, - guidance_scale=0.0, - num_inference_steps=4, - max_sequence_length=256, - generator=torch.Generator("cpu").manual_seed(0) - ).images[0] - - if not os.path.exists("images"): - os.makedirs("images") - 
image.save(f"images/shot_{shot["shot_number"]}.png") \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..e4418b0 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,17 @@ +# Core project dependencies inferred from imports in this workspace +boto3 +python-dotenv +elevenlabs +torch +transformers +diffusers +accelerate +safetensors +huggingface-hub + +# Optional but commonly required for 4-bit quantization with BitsAndBytesConfig +bitsandbytes + +# Notes: +# - ffmpeg/ffprobe are required by video scripts but installed at OS level, not via pip. +# - torchrun is provided by the torch package. diff --git a/run_video_pipeline.py b/run_video_pipeline.py new file mode 100644 index 0000000..1a14bcc --- /dev/null +++ b/run_video_pipeline.py @@ -0,0 +1,163 @@ +#!/usr/bin/env python3 +"""Run the full video pipeline: generate, merge, and concatenate.""" + +from __future__ import annotations + +import argparse +import logging +import os +import subprocess +import sys +from pathlib import Path + +from src.scripts.logging_config import configure_logging +from src.scripts.s3_video_storage import S3VideoStorage + + +PROJECT_ROOT = Path(__file__).resolve().parent +SCRIPT_DIR = PROJECT_ROOT / "src" / "scripts" +DEFAULT_BASE_DIR = PROJECT_ROOT +DEFAULT_HUNYUAN_DIR = DEFAULT_BASE_DIR / "HunyuanVideo-1.5" +DEFAULT_REEL_SCRIPT = DEFAULT_BASE_DIR / "reel_script.json" +DEFAULT_IMAGES_DIR = DEFAULT_BASE_DIR / "images" +DEFAULT_VIDEOS_DIR = DEFAULT_BASE_DIR / "videos" +DEFAULT_AUDIOS_DIR = DEFAULT_BASE_DIR / "audios" +DEFAULT_MERGED_DIR = DEFAULT_BASE_DIR / "merged" +DEFAULT_OUTPUT = DEFAULT_BASE_DIR / "results" / "final_output.mp4" + +LOGGER = logging.getLogger(__name__) + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--base-dir", type=Path, default=DEFAULT_BASE_DIR) + parser.add_argument("--hunyuan-dir", type=Path, default=DEFAULT_HUNYUAN_DIR) + 
parser.add_argument("--reel-script", type=Path, default=DEFAULT_REEL_SCRIPT) + parser.add_argument("--images-dir", type=Path, default=DEFAULT_IMAGES_DIR) + parser.add_argument("--videos-dir", type=Path, default=DEFAULT_VIDEOS_DIR) + parser.add_argument("--audios-dir", type=Path, default=DEFAULT_AUDIOS_DIR) + parser.add_argument("--merged-dir", type=Path, default=DEFAULT_MERGED_DIR) + parser.add_argument("--output", type=Path, default=DEFAULT_OUTPUT) + parser.add_argument("--seed", type=int, default=1) + parser.add_argument("--skip-generate", action="store_true") + parser.add_argument("--skip-merge", action="store_true") + parser.add_argument("--skip-concat", action="store_true") + parser.add_argument("--skip-s3-upload", action="store_true") + parser.add_argument("--log-level", default="INFO") + return parser.parse_args() + + +def run_step(name: str, cmd: list[str]) -> None: + LOGGER.info("=== %s ===", name) + LOGGER.info("$ %s", " ".join(str(part) for part in cmd)) + subprocess.run(cmd, check=True) + + +def maybe_upload_to_s3(output_path: Path) -> None: + bucket = os.getenv("AWS_S3_BUCKET") + if not bucket: + LOGGER.warning("Skipping S3 upload: AWS_S3_BUCKET is not set") + return + + storage = S3VideoStorage( + { + "bucket_name": bucket, + "region_name": os.getenv("AWS_REGION"), + "endpoint_url": os.getenv("AWS_S3_ENDPOINT_URL"), + "aws_access_key_id": os.getenv("AWS_ACCESS_KEY_ID"), + "aws_secret_access_key": os.getenv("AWS_SECRET_ACCESS_KEY"), + "aws_session_token": os.getenv("AWS_SESSION_TOKEN"), + } + ) + s3_uri = storage.store_file(output_path) + LOGGER.info("Uploaded output to %s", s3_uri) + + +def main() -> int: + args = parse_args() + configure_logging(args.log_level) + + # If only base-dir is overridden, derive the common subpaths from it. 
+ if args.base_dir != DEFAULT_BASE_DIR: + if args.hunyuan_dir == DEFAULT_HUNYUAN_DIR: + args.hunyuan_dir = args.base_dir / "HunyuanVideo-1.5" + if args.reel_script == DEFAULT_REEL_SCRIPT: + args.reel_script = args.base_dir / "reel_script.json" + if args.images_dir == DEFAULT_IMAGES_DIR: + args.images_dir = args.base_dir / "images" + if args.videos_dir == DEFAULT_VIDEOS_DIR: + args.videos_dir = args.base_dir / "videos" + if args.audios_dir == DEFAULT_AUDIOS_DIR: + args.audios_dir = args.base_dir / "audios" + if args.merged_dir == DEFAULT_MERGED_DIR: + args.merged_dir = args.base_dir / "merged" + if args.output == DEFAULT_OUTPUT: + args.output = args.base_dir / "results" / "final_output.mp4" + + try: + if not args.skip_generate: + run_step( + "Generate Videos", + [ + sys.executable, + str(SCRIPT_DIR / "generate_videos.py"), + "--hunyuan-dir", + str(args.hunyuan_dir), + "--reel-script", + str(args.reel_script), + "--images-dir", + str(args.images_dir), + "--videos-dir", + str(args.videos_dir), + "--audios-dir", + str(args.audios_dir), + "--seed", + str(args.seed), + ], + ) + + if not args.skip_merge: + run_step( + "Merge Audio + Video", + [ + sys.executable, + str(SCRIPT_DIR / "merge_audio_video.py"), + "--videos-dir", + str(args.videos_dir), + "--audios-dir", + str(args.audios_dir), + "--output-dir", + str(args.merged_dir), + ], + ) + + if not args.skip_concat: + run_step( + "Concatenate Merged Videos", + [ + sys.executable, + str(SCRIPT_DIR / "concat_merged.py"), + "--merged-dir", + str(args.merged_dir), + "--output", + str(args.output), + ], + ) + except subprocess.CalledProcessError as exc: + LOGGER.exception("Pipeline failed at command: %s", exc.cmd) + return exc.returncode + + if not args.skip_s3_upload: + try: + maybe_upload_to_s3(args.output) + except Exception: + LOGGER.exception("Failed uploading output to S3") + return 1 + + LOGGER.info("Pipeline complete") + LOGGER.info("Final output: %s", args.output) + return 0 + + +if __name__ == "__main__": + raise 
SystemExit(main()) diff --git a/src/scripts/concat_merged.py b/src/scripts/concat_merged.py new file mode 100644 index 0000000..ef9c6fa --- /dev/null +++ b/src/scripts/concat_merged.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python3 +"""Concatenate merged_*.mp4 files into a single output using ffmpeg concat demuxer.""" + +from __future__ import annotations + +import argparse +import logging +import re +import subprocess +import tempfile +from pathlib import Path + +from logging_config import configure_logging + + +SCRIPT_DIR = Path(__file__).resolve().parent +DEFAULT_BASE_DIR = SCRIPT_DIR.parents[1] +DEFAULT_MERGED_DIR = DEFAULT_BASE_DIR / "merged" +DEFAULT_OUTPUT = DEFAULT_BASE_DIR / "results" / "run_3" / "final_output.mp4" + +LOGGER = logging.getLogger(__name__) + + +def shot_number(path: Path) -> int: + match = re.search(r"merged_(\d+)\.mp4$", path.name) + return int(match.group(1)) if match else -1 + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--merged-dir", type=Path, default=DEFAULT_MERGED_DIR) + parser.add_argument("--output", type=Path, default=DEFAULT_OUTPUT) + parser.add_argument("--log-level", default="INFO") + return parser.parse_args() + + +def main() -> int: + args = parse_args() + configure_logging(args.log_level) + + videos = sorted(args.merged_dir.glob("merged_*.mp4"), key=shot_number) + if not videos: + LOGGER.warning("No merged videos found in %s", args.merged_dir) + return 1 + + args.output.parent.mkdir(parents=True, exist_ok=True) + + with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as tmp: + filelist = Path(tmp.name) + for video in videos: + tmp.write(f"file '{video}'\n") + + try: + LOGGER.info("Concatenating the following files:\n%s", filelist.read_text().rstrip()) + + subprocess.run( + [ + "ffmpeg", + "-f", + "concat", + "-safe", + "0", + "-i", + str(filelist), + "-c", + "copy", + "-y", + str(args.output), + ], + check=True, + ) + finally: + 
filelist.unlink(missing_ok=True) + + LOGGER.info("Done") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/src/scripts/generate_audios.py b/src/scripts/generate_audios.py new file mode 100644 index 0000000..091ab68 --- /dev/null +++ b/src/scripts/generate_audios.py @@ -0,0 +1,54 @@ +from __future__ import annotations + +import json +import logging +import os +from pathlib import Path + +from dotenv import load_dotenv +from elevenlabs.client import ElevenLabs +from logging_config import configure_logging + + +SCRIPT_DIR = Path(__file__).resolve().parent +PROJECT_ROOT = SCRIPT_DIR.parents[1] + +load_dotenv(PROJECT_ROOT / ".env") + +LOGGER = logging.getLogger(__name__) + + +def main() -> int: + configure_logging("INFO") + api_key = os.getenv("ELEVENLABS_API_KEY") + if not api_key: + raise RuntimeError("ELEVENLABS_API_KEY is not set") + + reel_script = PROJECT_ROOT / "reel_script.json" + audios_dir = PROJECT_ROOT / "audios" + audios_dir.mkdir(parents=True, exist_ok=True) + + reel_data = json.loads(reel_script.read_text()) + client = ElevenLabs(api_key=api_key) + + for shot in reel_data["shots"]: + shot_num = shot["shot_number"] + prompt = shot["voiceover"] + LOGGER.info("Generating audio for shot %s: %s", shot_num, prompt) + + audio = client.text_to_speech.convert( + text=prompt, + voice_id="JBFqnCBsd6RMkjVDRZzb", + model_id="eleven_multilingual_v2", + output_format="mp3_44100_128", + ) + audio_bytes = b"".join(audio) + + out_path = audios_dir / f"output_{shot_num}.mp3" + out_path.write_bytes(audio_bytes) + + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) \ No newline at end of file diff --git a/src/scripts/generate_images.py b/src/scripts/generate_images.py new file mode 100644 index 0000000..40e6e3b --- /dev/null +++ b/src/scripts/generate_images.py @@ -0,0 +1,50 @@ +from __future__ import annotations + +import json +import logging +from pathlib import Path + +import torch +from diffusers import FluxPipeline +from 
logging_config import configure_logging + + +SCRIPT_DIR = Path(__file__).resolve().parent +PROJECT_ROOT = SCRIPT_DIR.parents[1] + +LOGGER = logging.getLogger(__name__) + + +def main() -> int: + configure_logging("INFO") + reel_script = PROJECT_ROOT / "reel_script.json" + images_dir = PROJECT_ROOT / "images" + images_dir.mkdir(parents=True, exist_ok=True) + + reel_data = json.loads(reel_script.read_text()) + + pipe = FluxPipeline.from_pretrained( + "black-forest-labs/FLUX.1-schnell", + torch_dtype=torch.bfloat16, + ) + pipe.enable_model_cpu_offload() + + for shot in reel_data["shots"]: + shot_num = shot["shot_number"] + prompt = shot["image_description"] + LOGGER.info("Generating image for shot %s: %s", shot_num, prompt) + + image = pipe( + prompt, + guidance_scale=0.0, + num_inference_steps=4, + max_sequence_length=256, + generator=torch.Generator("cpu").manual_seed(0), + ).images[0] + image.save(images_dir / f"shot_{shot_num}.png") + + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) \ No newline at end of file diff --git a/generate_script.py b/src/scripts/generate_script.py similarity index 98% rename from generate_script.py rename to src/scripts/generate_script.py index 410b8be..902ab0c 100644 --- a/generate_script.py +++ b/src/scripts/generate_script.py @@ -1,9 +1,15 @@ import torch import json +import logging from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig import re from typing import Optional +from logging_config import configure_logging + + +LOGGER = logging.getLogger(__name__) + device = 'cuda' if torch.cuda.is_available() else 'cpu' MODEL_ID = "Qwen/Qwen3-14B" WORDS_PER_SECOND = 2.5 @@ -174,7 +180,7 @@ def generate_reel_scenario( inputs = tokenizer(text, return_tensors="pt").to(model.device) - print("Generating reel scenario..") + LOGGER.info("Generating reel scenario") with torch.no_grad(): output_ids = model.generate( **inputs, @@ -330,6 +336,7 @@ def parse_reel_scenario(raw_scenario: str) -> dict: if 
__name__ == '__main__': + configure_logging("INFO") with open("topic_description.txt", "r") as f: topic = f.read() diff --git a/src/scripts/generate_videos.py b/src/scripts/generate_videos.py new file mode 100644 index 0000000..1cf89f3 --- /dev/null +++ b/src/scripts/generate_videos.py @@ -0,0 +1,171 @@ +#!/usr/bin/env python3 +"""Generate shot videos with HunyuanVideo based on reel script and audio durations.""" + +from __future__ import annotations + +import argparse +import json +import logging +import os +import subprocess +from pathlib import Path + +from logging_config import configure_logging + + +SCRIPT_DIR = Path(__file__).resolve().parent +DEFAULT_BASE_DIR = SCRIPT_DIR.parents[1] +DEFAULT_HUNYUAN_DIR = DEFAULT_BASE_DIR / "HunyuanVideo-1.5" +DEFAULT_REEL_SCRIPT = DEFAULT_BASE_DIR / "reel_script.json" +DEFAULT_IMAGES_DIR = DEFAULT_BASE_DIR / "images" +DEFAULT_VIDEOS_DIR = DEFAULT_BASE_DIR / "videos" +DEFAULT_AUDIOS_DIR = DEFAULT_BASE_DIR / "audios" + +LOGGER = logging.getLogger(__name__) + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--hunyuan-dir", type=Path, default=DEFAULT_HUNYUAN_DIR) + parser.add_argument("--reel-script", type=Path, default=DEFAULT_REEL_SCRIPT) + parser.add_argument("--images-dir", type=Path, default=DEFAULT_IMAGES_DIR) + parser.add_argument("--videos-dir", type=Path, default=DEFAULT_VIDEOS_DIR) + parser.add_argument("--audios-dir", type=Path, default=DEFAULT_AUDIOS_DIR) + parser.add_argument("--seed", type=int, default=1) + parser.add_argument("--log-level", default="INFO") + return parser.parse_args() + + +def get_audio_duration(audio_path: Path) -> float: + result = subprocess.run( + [ + "ffprobe", + "-v", + "error", + "-show_entries", + "format=duration", + "-of", + "default=noprint_wrappers=1:nokey=1", + str(audio_path), + ], + check=True, + text=True, + capture_output=True, + ) + return float(result.stdout.strip()) + + +def 
duration_to_video_length(duration: float) -> int: + frames = int(duration * 24) + 1 + if frames % 2 == 0: + frames += 1 + return max(49, min(frames, 169)) + + +def main() -> int: + args = parse_args() + configure_logging(args.log_level) + model_path = args.hunyuan_dir / "ckpts" + + args.videos_dir.mkdir(parents=True, exist_ok=True) + + env = os.environ.copy() + env["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:128" + + data = json.loads(args.reel_script.read_text()) + shots = data.get("shots", []) + LOGGER.info("Found %s shots to generate", len(shots)) + + for shot in shots: + shot_number = shot["shot_number"] + prompt = str(shot["image_description"]).replace("\t", " ").replace("\n", " ") + + image_path = args.images_dir / f"shot_{shot_number}.png" + output_path = args.videos_dir / f"output_{shot_number}.mp4" + audio_path = args.audios_dir / f"output_{shot_number}.mp3" + + if not audio_path.exists(): + LOGGER.warning("No audio found at %s, falling back to 5s default", audio_path) + duration = 5.0 + else: + duration = get_audio_duration(audio_path) + LOGGER.info("Audio duration for shot %s: %ss", shot_number, duration) + + video_length = duration_to_video_length(duration) + + LOGGER.info("Shot %s | %ss -> %s frames", shot_number, duration, video_length) + LOGGER.info("Prompt: %s", prompt) + LOGGER.info("Image: %s", image_path) + LOGGER.info("Audio: %s", audio_path) + LOGGER.info("Output: %s", output_path) + + if output_path.exists(): + LOGGER.info("Output path already exists, skipping") + continue + + if not image_path.exists(): + LOGGER.warning("Image not found at %s, skipped", image_path) + continue + + subprocess.run( + [ + "python3", + "-c", + "import torch; torch.cuda.empty_cache()", + ], + check=True, + env=env, + ) + LOGGER.info("GPU cache cleared") + + subprocess.run( + [ + "torchrun", + "--nproc_per_node=1", + "generate.py", + "--prompt", + prompt, + "--image_path", + str(image_path), + "--resolution", + "480p", + 
"--aspect_ratio", + "16:9", + "--seed", + str(args.seed), + "--video_length", + str(video_length), + "--rewrite", + "false", + "--cfg_distilled", + "true", + "--enable_step_distill", + "true", + "--sparse_attn", + "false", + "--use_sageattn", + "true", + "--enable_cache", + "false", + "--overlap_group_offloading", + "true", + "--sr", + "false", + "--output_path", + str(output_path), + "--model_path", + str(model_path), + ], + check=True, + cwd=args.hunyuan_dir, + env=env, + ) + + LOGGER.info("Shot %s done", shot_number) + + LOGGER.info("Done") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/src/scripts/logging_config.py b/src/scripts/logging_config.py new file mode 100644 index 0000000..bb4404d --- /dev/null +++ b/src/scripts/logging_config.py @@ -0,0 +1,13 @@ +from __future__ import annotations + +import logging + + +DEFAULT_LOG_FORMAT = "%(asctime)s | %(levelname)s | %(name)s | %(message)s" + + +def configure_logging(level: str = "INFO") -> None: + logging.basicConfig( + level=getattr(logging, level.upper(), logging.INFO), + format=DEFAULT_LOG_FORMAT, + ) diff --git a/src/scripts/merge_audio_video.py b/src/scripts/merge_audio_video.py new file mode 100644 index 0000000..6ce0963 --- /dev/null +++ b/src/scripts/merge_audio_video.py @@ -0,0 +1,85 @@ +#!/usr/bin/env python3 +"""Merge videos/output_n.mp4 with audios/output_n.mp3 into merged/merged_n.mp4.""" + +from __future__ import annotations + +import argparse +import logging +import re +import subprocess +from pathlib import Path + +from logging_config import configure_logging + + +SCRIPT_DIR = Path(__file__).resolve().parent +DEFAULT_BASE_DIR = SCRIPT_DIR.parents[1] +DEFAULT_VIDEOS_DIR = DEFAULT_BASE_DIR / "videos" +DEFAULT_AUDIOS_DIR = DEFAULT_BASE_DIR / "audios" +DEFAULT_OUTPUT_DIR = DEFAULT_BASE_DIR / "merged" + +LOGGER = logging.getLogger(__name__) + + +def shot_number(path: Path) -> int: + match = re.search(r"output_(\d+)\.mp4$", path.name) + return int(match.group(1)) if 
match else -1 + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--videos-dir", type=Path, default=DEFAULT_VIDEOS_DIR) + parser.add_argument("--audios-dir", type=Path, default=DEFAULT_AUDIOS_DIR) + parser.add_argument("--output-dir", type=Path, default=DEFAULT_OUTPUT_DIR) + parser.add_argument("--log-level", default="INFO") + return parser.parse_args() + + +def main() -> int: + args = parse_args() + configure_logging(args.log_level) + args.output_dir.mkdir(parents=True, exist_ok=True) + + videos = sorted(args.videos_dir.glob("output_*.mp4"), key=shot_number) + if not videos: + LOGGER.warning("No videos found in %s", args.videos_dir) + return 1 + + for video in videos: + num = shot_number(video) + audio = args.audios_dir / f"output_{num}.mp3" + output = args.output_dir / f"merged_{num}.mp4" + + if not audio.exists(): + LOGGER.warning("No audio found for shot %s (%s); skipped", num, audio) + continue + + if output.exists(): + LOGGER.info("Already exists; skipped shot %s", num) + continue + + LOGGER.info("Merging shot %s: %s + %s -> %s", num, video, audio, output) + subprocess.run( + [ + "ffmpeg", + "-i", + str(video), + "-i", + str(audio), + "-c:v", + "copy", + "-c:a", + "aac", + "-shortest", + "-y", + str(output), + ], + check=True, + ) + LOGGER.info("Done: %s", output) + + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/src/scripts/run_video_pipeline.py b/src/scripts/run_video_pipeline.py new file mode 100644 index 0000000..702bdd9 --- /dev/null +++ b/src/scripts/run_video_pipeline.py @@ -0,0 +1,163 @@ +#!/usr/bin/env python3 +"""Run the full video pipeline: generate, merge, and concatenate.""" + +from __future__ import annotations + +import argparse +import logging +import os +import subprocess +import sys +from pathlib import Path + +from logging_config import configure_logging +from s3_video_storage import S3VideoStorage + + +SCRIPT_DIR = 
Path(__file__).resolve().parent +PROJECT_ROOT = SCRIPT_DIR.parents[1] +DEFAULT_BASE_DIR = PROJECT_ROOT +DEFAULT_HUNYUAN_DIR = DEFAULT_BASE_DIR / "HunyuanVideo-1.5" +DEFAULT_REEL_SCRIPT = DEFAULT_BASE_DIR / "reel_script.json" +DEFAULT_IMAGES_DIR = DEFAULT_BASE_DIR / "images" +DEFAULT_VIDEOS_DIR = DEFAULT_BASE_DIR / "videos" +DEFAULT_AUDIOS_DIR = DEFAULT_BASE_DIR / "audios" +DEFAULT_MERGED_DIR = DEFAULT_BASE_DIR / "merged" +DEFAULT_OUTPUT = DEFAULT_BASE_DIR / "results" / "final_output.mp4" + +LOGGER = logging.getLogger(__name__) + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--base-dir", type=Path, default=DEFAULT_BASE_DIR) + parser.add_argument("--hunyuan-dir", type=Path, default=DEFAULT_HUNYUAN_DIR) + parser.add_argument("--reel-script", type=Path, default=DEFAULT_REEL_SCRIPT) + parser.add_argument("--images-dir", type=Path, default=DEFAULT_IMAGES_DIR) + parser.add_argument("--videos-dir", type=Path, default=DEFAULT_VIDEOS_DIR) + parser.add_argument("--audios-dir", type=Path, default=DEFAULT_AUDIOS_DIR) + parser.add_argument("--merged-dir", type=Path, default=DEFAULT_MERGED_DIR) + parser.add_argument("--output", type=Path, default=DEFAULT_OUTPUT) + parser.add_argument("--seed", type=int, default=1) + parser.add_argument("--skip-generate", action="store_true") + parser.add_argument("--skip-merge", action="store_true") + parser.add_argument("--skip-concat", action="store_true") + parser.add_argument("--skip-s3-upload", action="store_true") + parser.add_argument("--log-level", default="INFO") + return parser.parse_args() + + +def run_step(name: str, cmd: list[str]) -> None: + LOGGER.info("=== %s ===", name) + LOGGER.info("$ %s", " ".join(str(part) for part in cmd)) + subprocess.run(cmd, check=True) + + +def maybe_upload_to_s3(output_path: Path) -> None: + bucket = os.getenv("AWS_S3_BUCKET") + if not bucket: + LOGGER.warning("Skipping S3 upload: AWS_S3_BUCKET is not set") + return + + storage 
= S3VideoStorage( + { + "bucket_name": bucket, + "region_name": os.getenv("AWS_REGION"), + "endpoint_url": os.getenv("AWS_S3_ENDPOINT_URL"), + "aws_access_key_id": os.getenv("AWS_ACCESS_KEY_ID"), + "aws_secret_access_key": os.getenv("AWS_SECRET_ACCESS_KEY"), + "aws_session_token": os.getenv("AWS_SESSION_TOKEN"), + } + ) + s3_uri = storage.store_file(output_path) + LOGGER.info("Uploaded output to %s", s3_uri) + + +def main() -> int: + args = parse_args() + configure_logging(args.log_level) + + # If only base-dir is overridden, derive the common subpaths from it. + if args.base_dir != DEFAULT_BASE_DIR: + if args.hunyuan_dir == DEFAULT_HUNYUAN_DIR: + args.hunyuan_dir = args.base_dir / "HunyuanVideo-1.5" + if args.reel_script == DEFAULT_REEL_SCRIPT: + args.reel_script = args.base_dir / "reel_script.json" + if args.images_dir == DEFAULT_IMAGES_DIR: + args.images_dir = args.base_dir / "images" + if args.videos_dir == DEFAULT_VIDEOS_DIR: + args.videos_dir = args.base_dir / "videos" + if args.audios_dir == DEFAULT_AUDIOS_DIR: + args.audios_dir = args.base_dir / "audios" + if args.merged_dir == DEFAULT_MERGED_DIR: + args.merged_dir = args.base_dir / "merged" + if args.output == DEFAULT_OUTPUT: + args.output = args.base_dir / "results" / "final_output.mp4" + + try: + if not args.skip_generate: + run_step( + "Generate Videos", + [ + sys.executable, + str(SCRIPT_DIR / "generate_videos.py"), + "--hunyuan-dir", + str(args.hunyuan_dir), + "--reel-script", + str(args.reel_script), + "--images-dir", + str(args.images_dir), + "--videos-dir", + str(args.videos_dir), + "--audios-dir", + str(args.audios_dir), + "--seed", + str(args.seed), + ], + ) + + if not args.skip_merge: + run_step( + "Merge Audio + Video", + [ + sys.executable, + str(SCRIPT_DIR / "merge_audio_video.py"), + "--videos-dir", + str(args.videos_dir), + "--audios-dir", + str(args.audios_dir), + "--output-dir", + str(args.merged_dir), + ], + ) + + if not args.skip_concat: + run_step( + "Concatenate Merged Videos", + [ + 
sys.executable, + str(SCRIPT_DIR / "concat_merged.py"), + "--merged-dir", + str(args.merged_dir), + "--output", + str(args.output), + ], + ) + except subprocess.CalledProcessError as exc: + LOGGER.exception("Pipeline failed at command: %s", exc.cmd) + return exc.returncode + + if not args.skip_s3_upload: + try: + maybe_upload_to_s3(args.output) + except Exception: + LOGGER.exception("Failed uploading output to S3") + return 1 + + LOGGER.info("Pipeline complete") + LOGGER.info("Final output: %s", args.output) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/s3_video_storage.py b/src/scripts/s3_video_storage.py similarity index 100% rename from s3_video_storage.py rename to src/scripts/s3_video_storage.py