forked from LiveCarta/ContentGeneration
Video generation pipelines files added
This commit is contained in:
1
.env
Normal file
1
.env
Normal file
@@ -0,0 +1 @@
|
|||||||
|
ELEVENLABS_API_KEY=REPLACE_ME  # SECURITY: a live ElevenLabs API key was committed here — revoke/rotate it immediately and keep .env out of version control (.gitignore)
|
||||||
1
HunyuanVideo-1.5
Submodule
1
HunyuanVideo-1.5
Submodule
Submodule HunyuanVideo-1.5 added at 2641c0de73
22
concat_merged.sh
Normal file
22
concat_merged.sh
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
#!/bin/bash

# Concatenates all merged/merged_n.mp4 into final_output.mp4,
# ordered numerically by shot number n.

BASE_DIR="/home/madina/projects/short_videos"
MERGED_DIR="$BASE_DIR/merged"
OUTPUT="$BASE_DIR/results/run_3/final_output.mp4"
FILELIST=$(mktemp /tmp/filelist_XXXXXX.txt)

# With nullglob an unmatched pattern expands to nothing instead of the
# literal string "merged_*.mp4"; fail fast when there is nothing to concat.
shopt -s nullglob
files=("$MERGED_DIR"/merged_*.mp4)
if [ ${#files[@]} -eq 0 ]; then
    echo "ERROR: no merged_*.mp4 files found in $MERGED_DIR" >&2
    rm -f "$FILELIST"
    exit 1
fi

# Build the concat list sorted by shot number. Using the glob array instead
# of `for f in $(ls ...)` avoids word-splitting on paths with whitespace.
# sort -V (version sort) orders merged_2 before merged_10; the original
# `sort -t_ -k2 -n` keyed on "videos/..." (the path itself contains '_'),
# which degraded to lexicographic order for 10+ shots.
while IFS= read -r video; do
    echo "file '$video'" >> "$FILELIST"
done < <(printf '%s\n' "${files[@]}" | sort -V)

echo "Concatenating the following files:"
cat "$FILELIST"

# Ensure the results directory exists before ffmpeg writes into it.
mkdir -p "$(dirname "$OUTPUT")"

ffmpeg -f concat -safe 0 -i "$FILELIST" -c copy -y "$OUTPUT"

rm -f "$FILELIST"
echo ""
echo "Done"
|
||||||
35
generate_audios.py
Normal file
35
generate_audios.py
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
from elevenlabs.client import ElevenLabs
|
||||||
|
from elevenlabs.play import play
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
|
# Load variables from the local .env file into the process environment.
load_dotenv()

# ElevenLabs credential; None when the variable is absent from the environment.
ELEVENLABS_API_KEY = os.getenv('ELEVENLABS_API_KEY')
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':

    # Load the parsed reel script (produced by generate_script.py).
    script_path = "reel_script.json"
    with open(script_path, "r") as f:
        reel_data = json.load(f)

    client = ElevenLabs(
        api_key=ELEVENLABS_API_KEY
    )

    # Create the output directory once, up front; exist_ok avoids the
    # check-then-create race of the os.path.exists + os.makedirs pattern.
    os.makedirs("audios", exist_ok=True)

    # One MP3 voiceover file per shot: audios/output_<shot_number>.mp3
    for shot in reel_data["shots"]:
        print(shot["shot_number"], shot["voiceover"])
        prompt = shot["voiceover"]
        audio = client.text_to_speech.convert(
            text=prompt,
            voice_id="JBFqnCBsd6RMkjVDRZzb",
            model_id="eleven_multilingual_v2",
            output_format="mp3_44100_128",
        )

        # convert() yields the MP3 in chunks; join them into one byte string.
        audio_bytes = b"".join(audio)

        # FIX: the original f-string nested double quotes inside double
        # quotes (shot["shot_number"]), a SyntaxError on Python < 3.12.
        with open(f"audios/output_{shot['shot_number']}.mp3", "wb") as f:
            f.write(audio_bytes)
|
||||||
28
generate_images.py
Normal file
28
generate_images.py
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
import torch
|
||||||
|
from diffusers import FluxPipeline
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
|
||||||
|
if __name__ == '__main__':

    # Load the parsed reel script (produced by generate_script.py).
    script_path = "reel_script.json"
    with open(script_path, "r") as f:
        reel_data = json.load(f)

    # FLUX.1-schnell is a distilled model: guidance is disabled
    # (guidance_scale=0.0) and only a few inference steps are used below.
    pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16)
    pipe.enable_model_cpu_offload()

    # Create the output directory once, up front; exist_ok avoids the
    # check-then-create race of the os.path.exists + os.makedirs pattern.
    os.makedirs("images", exist_ok=True)

    # One PNG per shot: images/shot_<shot_number>.png
    for shot in reel_data["shots"]:
        print(shot["shot_number"], shot["image_description"])
        prompt = shot["image_description"]
        image = pipe(
            prompt,
            guidance_scale=0.0,
            num_inference_steps=4,
            max_sequence_length=256,
            # Fixed CPU seed keeps generation reproducible across runs.
            generator=torch.Generator("cpu").manual_seed(0)
        ).images[0]

        # FIX: the original f-string nested double quotes inside double
        # quotes (shot["shot_number"]), a SyntaxError on Python < 3.12.
        image.save(f"images/shot_{shot['shot_number']}.png")
|
||||||
344
generate_script.py
Normal file
344
generate_script.py
Normal file
@@ -0,0 +1,344 @@
|
|||||||
|
import torch
|
||||||
|
import json
|
||||||
|
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
|
||||||
|
import re
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
# NOTE(review): `device` appears unused in this file — model placement is
# handled by device_map="auto" in load_model(); confirm before removing.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Hugging Face model used to write the reel scenario.
MODEL_ID = "Qwen/Qwen3-14B"
# Assumed speaking rate; used to estimate speech duration from word count.
WORDS_PER_SECOND = 2.5
# Maximum tolerated silence per shot, in seconds.
MAX_DEAD_AIR_SECONDS = 1
# Hard per-shot voiceover ceiling, in seconds.
MAX_VOICEOVER_SECONDS = 5.0
# Derived word-count ceiling: 5.0 s * 2.5 wps = 12 words.
MAX_VOICEOVER_WORDS = int(MAX_VOICEOVER_SECONDS * WORDS_PER_SECOND)
# Lower bound on voiceover words per shot.
MIN_VOICEOVER_WORDS = 5
|
||||||
|
|
||||||
|
|
||||||
|
def load_model(model_id: str = MODEL_ID):
    """Load a causal LM in 4-bit NF4 quantization together with its tokenizer.

    Returns (model, tokenizer); the model is sharded via device_map="auto"
    and switched to eval mode.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

    # 4-bit NF4 with double quantization; compute happens in fp16.
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16,
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=quant_config,
        device_map="auto",
        trust_remote_code=True,
    ).eval()

    return model, tokenizer
|
||||||
|
|
||||||
|
|
||||||
|
def generate_reel_scenario(
    model,
    tokenizer,
    content_summary: str,
    temperature: float = 0.75,
    top_p: float = 0.9,
    repetition_penalty: float = 1.1,
) -> str:
    """
    Generate a shot-by-shot Instagram Reel scenario where every script beat
    has its own image description for AI image/video generation.

    Each shot in the output contains:
    - Timestamp
    - Beat label (HOOK, PROBLEM, etc.)
    - Voiceover line
    - Text on screen
    - Image description (AI-generation-ready prompt)

    The hook question from the topic summary is preserved as the opening beat.
    """

    system_prompt = (
        "You are a professional Instagram Reel director and scriptwriter specializing in EdTech content. "
        "You think like a filmmaker: every line of voiceover has a visual that amplifies it. "
        "You write punchy, cinematic, scroll-stopping reels that feel native to social media. "
        "You write image descriptions like a cinematographer briefing an AI image generator — "
        "specific, vivid, atmospheric, with clear subject, composition, lighting, mood, and style."
    )

    # NOTE: the prompt text below is part of the model contract — its
    # separators ("----", "SHOT LIST", "CAPTION") and field labels
    # ("Timestamp:", "Beat:", ...) are what parse_reel_scenario matches on.
    # Do not reword them without updating the parser.
    user_prompt = f"""You are given a topic summary for an Instagram Reel.
Your job is to direct a complete shot-by-shot reel scenario — like a real filmmaker laying out a storyboard.

## TOPIC SUMMARY
{content_summary}

---

## PACING RULES — read these before writing a single shot

These rules exist because every shot in this reel will be rendered as a real video clip. Timestamps must be tight and honest.

**Speech rate:** spoken voiceover moves at roughly 2.5 words per second in a reel (energetic, not rushed).

**VOICEOVER LENGTH: vary naturally between 2–5 seconds per shot. Diversity is encouraged.**
Different beats call for different rhythms:
- A hook or payoff line can be short and punchy: 2–3 seconds (5–8 words). Let it land.
- A problem or tension beat needs more breath: 4–5 seconds (10–12 words). Build the feeling.
- A CTA can be medium: 3–4 seconds (8–10 words). Direct and warm.

The one hard constraint is the 5-second ceiling — the audio renderer cannot handle more than 5 seconds per shot (12 words). Never exceed this. If an idea needs more than 12 words, split it into two shots.

VARIETY IN ACTION — a good reel sounds like this:
SHOT 1 (HOOK, 2s): "Will your major still matter in five years?" — short, punchy, stops the scroll
SHOT 2 (PROBLEM, 5s): "AI is already replacing writers, designers, and even doctors — fields we thought were safe." — builds tension
SHOT 3 (TENSION, 4s): "Most students have no idea this is already happening to them." — personal, lands hard
SHOT 4 (PAYOFF, 3s): "Will your major still matter? It depends on you." — echo + flip
SHOT 5 (SOLUTION, 5s): "The students who'll thrive are learning skills AI simply cannot replicate yet." — concrete, hopeful
SHOT 6 (CTA, 3s): "Follow us for tips on future-proofing your career." — warm, direct

Notice how the lengths vary. That variation is intentional — it creates rhythm and keeps the viewer engaged.

**Shot duration = voiceover duration + 0–1 second of breathing room.** Match the shot length to what you actually wrote, not to a preset slot.

**Continuity.** The end timestamp of one shot is the start timestamp of the next. No gaps, no overlaps. The reel runs like a stopwatch.

**Total reel length:** 45–60 seconds. Count it up before you finalize. If you're over, trim voiceover. If you're under, add a shot.

**Shot count:** aim for 7–10 shots. More shots = faster pace = more energy. Fewer shots = slower, more contemplative. Match the tone of the topic.

---

## BRAND CONTEXT — read before writing any shot

These reels are published on the **LiveCarta** Instagram account. LiveCarta is an AI-powered EdTech platform for higher education that lets educators build custom coursepacks by mixing and matching chapters from top publishers, adding their own materials, video, and notes — all in one place. Students get flexible, affordable access: buy whole books or individual chapters, read online or via the app. Key features: AI Curriculum Builder, Remix-on-Demand, pay-by-chapter pricing, LMS integration (Canvas, Blackboard), and real-time content updates.

**Brand mention rules:**

1. **CTA beat — always name LiveCarta.** The final CTA shot(s) must mention LiveCarta by name. Reference it naturally in context — e.g. "Follow LiveCarta for more", "Check out LiveCarta", "LiveCarta helps students stay ahead."

2. **Mid-reel mention — only when there's a genuine, obvious overlap.** If the topic directly connects to what LiveCarta does, drop a natural product mention in the SOLUTION beat. Do NOT force it. The overlap is genuine when the topic is about: customizing course content, affordable textbooks, AI in education, flexible learning, keeping up with fast-changing fields, building skills alongside a degree. If the overlap isn't obvious, skip the mid-reel mention entirely — a forced plug feels fake and loses the audience.

3. **Never open with the brand.** The HOOK and PROBLEM beats must earn attention first. LiveCarta enters only once the viewer is already hooked.

Write shots that follow these beats in order. Each beat can be one or more shots — distribute them based on how long the idea takes to say out loud, not on a fixed slot.

HOOK — The hook question from the topic, spoken word-for-word. One punchy sentence. Creates immediate tension or curiosity.
PROBLEM — Build the relatable pain. Show the viewer their own situation. Make it vivid and specific, not abstract.
TENSION / RELATE — Go deeper into the problem. A concrete, personal detail that makes the viewer think "that's exactly me."
HOOK PAYOFF — Echo the opening hook question word-for-word, then immediately flip it with the insight. This is the emotional turning point.
SOLUTION — The concrete, actionable answer. Specific enough to be immediately useful.
CTA — Direct call to action. **Must name LiveCarta by name.** End with a follow or save prompt. Warm, not salesy.

## OUTPUT FORMAT

Output exactly two sections. Start immediately with the first separator — no preamble.

------------------------------------------------------------
SHOT LIST
For each shot use EXACTLY this format:

SHOT [N]
Timestamp: [start]–[end]
Beat: [BEAT LABEL]
Voiceover: [2–5 seconds of speech (5–12 words). Vary length by beat — short for hooks/payoffs, longer for problem/tension/solution. Hard ceiling: 12 words.]
Text on screen: [3–6 word punchy overlay that punches up the voiceover, not just repeats it]
Image description: [Standalone AI image generation prompt. Describe: subject, composition, camera angle, lighting, color palette, mood, visual style. Make it cinematic and specific. 2–4 sentences. No references to "the reel", "the previous shot", or any other shot.]

------------------------------------------------------------
CAPTION

Write a 4–5 line Instagram caption:
Line 1: hook statement (echoes the reel's opening question)
Lines 2–3: expand the insight in 1–2 casual, direct sentences
Line 4: engagement question to the audience
Line 5: 8–10 relevant hashtags

---

## FINAL CHECKS before outputting
- Does the voiceover length vary across shots? Hooks and payoffs should be short (2–3s), problems and solutions longer (4–5s).
- Does any voiceover exceed 12 words? If so, split that shot.
- Does every shot's timestamp connect cleanly to the next?
- Does the total add up to 45–60 seconds?
- Does the CTA beat name LiveCarta explicitly?
- If the topic overlaps with LiveCarta's features, is there a natural mid-reel mention in the SOLUTION beat?
- Does the HOOK PAYOFF echo it verbatim before the flip?
- Is every Image description usable as a standalone AI generation prompt?
"""

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    # Render the chat template to a plain string; the generation prompt is
    # appended so the model continues as the assistant.
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    inputs = tokenizer(text, return_tensors="pt").to(model.device)

    print("Generating reel scenario..")
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=2000,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Drop the prompt tokens and decode only the newly generated continuation.
    generated_ids = output_ids[0][inputs["input_ids"].shape[1]:]
    result = tokenizer.decode(generated_ids, skip_special_tokens=True)
    return result
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_timestamp_seconds(ts_str: str) -> tuple[Optional[int], Optional[int]]:
|
||||||
|
ts_str = re.sub(r'\s*[–—]+\s*', '-', ts_str).strip()
|
||||||
|
|
||||||
|
parts = ts_str.split('-')
|
||||||
|
if len(parts) != 2:
|
||||||
|
return None, None
|
||||||
|
|
||||||
|
def to_seconds(s: str) -> Optional[int]:
|
||||||
|
s = s.strip()
|
||||||
|
if ':' in s:
|
||||||
|
m, sec = s.split(':', 1)
|
||||||
|
try:
|
||||||
|
return int(m) * 60 + int(sec)
|
||||||
|
except ValueError:
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
return int(s)
|
||||||
|
except ValueError:
|
||||||
|
return None
|
||||||
|
|
||||||
|
return to_seconds(parts[0]), to_seconds(parts[1])
|
||||||
|
|
||||||
|
|
||||||
|
def extract_field(label: str, next_label: Optional[str], text: str) -> str:
    """Extract the value that follows "label:" inside a shot block.

    Captures everything after "label:" up to (but excluding) the line that
    starts the next field ("next_label:"), or to the end of the text when
    next_label is None. Matching is case-insensitive and spans newlines.
    Surrounding whitespace and stray double quotes are stripped.
    Returns "" when the label is absent.

    FIX: labels are now passed through re.escape before interpolation, so a
    label containing regex metacharacters is matched literally instead of
    corrupting (or crashing) the pattern. Current labels are plain text, so
    behavior for existing callers is unchanged.
    """
    escaped = re.escape(label)
    if next_label:
        pattern = rf'{escaped}:\s*(.*?)(?=\n{re.escape(next_label)}:|$)'
    else:
        pattern = rf'{escaped}:\s*(.*?)$'
    m = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
    if m:
        return m.group(1).strip().strip('"')
    return ""
|
||||||
|
|
||||||
|
|
||||||
|
def parse_reel_scenario(raw_scenario: str) -> dict:
    """
    Parse the shot-by-shot reel scenario into a structured dict.

    Returns:
        {
            "shots": [
                {
                    "shot_number": 1,
                    "timestamp": "0–4",
                    "start_sec": 0,
                    "end_sec": 4,
                    "duration_sec": 4,
                    "beat": "HOOK",
                    "voiceover": "Will your major still matter in 5 years?",
                    "word_count": 9,
                    "speech_duration_sec": 3.6,  # word_count / WORDS_PER_SECOND
                    "dead_air_sec": 0.4,         # duration_sec - speech_duration_sec
                    "text_on_screen": "Your degree. Obsolete?",
                    "image_description": "Close-up of a university diploma ...",
                },
                ...
            ],
            "caption": {
                "body": "Will your major still matter in 5 years? ...",
                "hashtags": ["#EdTech", "#AIEducation", ...],
            },
            "total_duration_sec": 55,
        }
    """
    result = {
        "shots": [],
        "caption": {"body": "", "hashtags": []},
        "total_duration_sec": 0,
    }
    # Strip any <think>...</think> reasoning block the model may emit, then
    # remove markdown emphasis markers so the "Label: value" regexes below
    # see plain text.
    raw_scenario = re.sub(r'<think>.*?</think>', '', raw_scenario, flags=re.DOTALL).strip()
    cleaned = re.sub(r'\*+', '', raw_scenario)

    # Everything between "SHOT LIST" and the "----... CAPTION" separator.
    shot_section = re.search(
        r'SHOT LIST\s*\n(.*?)(?=-{4,}\s*CAPTION|$)',
        cleaned, re.DOTALL | re.IGNORECASE
    )
    if shot_section:
        shot_text = shot_section.group(1)
        # Split on lines that begin a new "SHOT <n>" header.
        shot_blocks = re.split(r'\n(?=SHOT\s+\d+)', shot_text.strip())

        for block in shot_blocks:
            block = block.strip()
            if not block:
                continue

            shot_num_match = re.match(r'SHOT\s+(\d+)', block)
            if not shot_num_match:
                continue
            shot_number = int(shot_num_match.group(1))

            timestamp = extract_field("Timestamp", "Beat", block)
            beat = extract_field("Beat", "Voiceover", block)
            voiceover = extract_field("Voiceover", "Text on screen", block)
            text_on_screen = extract_field("Text on screen", "Image description", block)
            image_description = extract_field("Image description", None, block)

            # Normalize any dash/hyphen run to a single en dash for display.
            timestamp_display = re.sub(r'\s*[–—-]+\s*', '–', timestamp)

            start_sec, end_sec = _parse_timestamp_seconds(timestamp)
            duration_sec = (end_sec - start_sec) if (start_sec is not None and end_sec is not None) else None
            word_count = len(voiceover.split()) if voiceover else 0
            # Estimated speech time from the assumed speaking rate.
            speech_duration = round(word_count / WORDS_PER_SECOND, 1)
            dead_air = round(duration_sec - speech_duration, 1) if duration_sec is not None else None

            result["shots"].append({
                "shot_number": shot_number,
                "timestamp": timestamp_display,
                "start_sec": start_sec,
                "end_sec": end_sec,
                "duration_sec": duration_sec,
                "beat": beat.upper(),
                "voiceover": voiceover,
                "word_count": word_count,
                "speech_duration_sec": speech_duration,
                "dead_air_sec": dead_air,
                "text_on_screen": text_on_screen,
                "image_description": image_description,
            })

        # FIX: total_duration_sec was documented above but never computed
        # (it always stayed 0). Shots run back-to-back from 0, so the last
        # shot's end timestamp is the reel's total length.
        if result["shots"] and result["shots"][-1]["end_sec"] is not None:
            result["total_duration_sec"] = result["shots"][-1]["end_sec"]

    caption_section = re.search(
        r'CAPTION\s*\n(.*?)$',
        cleaned, re.DOTALL | re.IGNORECASE
    )

    if caption_section:
        caption_text = caption_section.group(1).strip()
        lines = [l.strip() for l in caption_text.splitlines() if l.strip()]
        # The hashtag line is the first line starting with '#'; the rest of
        # the non-empty lines form the caption body.
        hashtag_line = next((l for l in lines if l.startswith("#")), "")
        body_lines = [l for l in lines if not l.startswith("#")]
        result["caption"] = {
            "body": "\n".join(body_lines).strip().strip('"'),
            "hashtags": re.findall(r'#\w+', hashtag_line),
        }

    return result
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':

    # Topic brief: title, tone, hook question, content summary.
    # FIX: open files with an explicit UTF-8 encoding — the topic file and
    # the generated scenario contain en dashes and curly quotes, which crash
    # on platforms whose default locale encoding is not UTF-8.
    with open("topic_description.txt", "r", encoding="utf-8") as f:
        topic = f.read()

    model, tokenizer = load_model()
    scenario_raw = generate_reel_scenario(model, tokenizer, topic)

    parsed = parse_reel_scenario(scenario_raw)

    # Pretty-print and keep non-ASCII characters readable; downstream
    # json.load consumers are unaffected by formatting.
    with open("reel_script.json", "w", encoding="utf-8") as f:
        json.dump(parsed, f, indent=2, ensure_ascii=False)
|
||||||
|
|
||||||
98
generate_videos.sh
Normal file
98
generate_videos.sh
Normal file
@@ -0,0 +1,98 @@
|
|||||||
|
#!/bin/bash

# Generates one video clip per shot with HunyuanVideo-1.5.
# Video length (in frames) is derived from the actual audio file duration.

HUNYUAN_DIR="/home/madina/projects/short_videos/HunyuanVideo-1.5"
REEL_SCRIPT="/home/madina/projects/short_videos/reel_script.json"
IMAGES_DIR="/home/madina/projects/short_videos/images"
VIDEOS_DIR="/home/madina/projects/short_videos/videos"
AUDIOS_DIR="/home/madina/projects/short_videos/audios"
MODEL_PATH="$HUNYUAN_DIR/ckpts"

mkdir -p "$VIDEOS_DIR"

export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True,max_split_size_mb:128

# Write shots to a temp TSV file (tab-separated: shot_number, prompt).
# The script path is passed as argv[1] instead of being interpolated into
# the Python source, so special characters in the path cannot break it.
TMPFILE=$(mktemp /tmp/shots_XXXXXX.tsv)
python3 - "$REEL_SCRIPT" <<'EOF' > "$TMPFILE"
import json
import sys

d = json.load(open(sys.argv[1]))
for shot in d['shots']:
    num = shot['shot_number']
    desc = shot['image_description'].replace('\t', ' ').replace('\n', ' ')
    print(f'{num}\t{desc}')
EOF

NUM_SHOTS=$(wc -l < "$TMPFILE")
echo "Found $NUM_SHOTS shots to generate"

# generate.py must run from inside the Hunyuan checkout; every other path
# used below is absolute, so a single checked cd up front is enough.
cd "$HUNYUAN_DIR" || exit 1

# FIX: read the TSV on fd 3 rather than stdin, so commands inside the loop
# (torchrun, ffprobe, python3) cannot consume the remaining shot lines.
while IFS=$'\t' read -r -u 3 shot_number prompt; do
    IMAGE_PATH="$IMAGES_DIR/shot_${shot_number}.png"
    OUTPUT_PATH="$VIDEOS_DIR/output_${shot_number}.mp4"
    AUDIO_PATH="$AUDIOS_DIR/output_${shot_number}.mp3"

    # Get audio duration; fall back to a 5s default when audio is missing.
    if [ ! -f "$AUDIO_PATH" ]; then
        echo "WARNING: No audio found at $AUDIO_PATH, falling back to 5s default."
        DURATION=5.0
    else
        DURATION=$(ffprobe -v error -show_entries format=duration \
            -of default=noprint_wrappers=1:nokey=1 "$AUDIO_PATH")
        echo "Audio duration for shot $shot_number: ${DURATION}s"
    fi

    # Seconds -> frame count at 24 fps, forced odd and clamped to [49, 169]
    # (presumably the generator's supported range — confirm against
    # HunyuanVideo-1.5 docs).
    VIDEO_LENGTH=$(python3 -c "
duration = float('$DURATION')
frames = int(duration * 24) + 1
if frames % 2 == 0:
    frames += 1
frames = max(49, min(frames, 169))
print(frames)
")

    echo ""
    echo "Shot $shot_number | ${DURATION}s -> ${VIDEO_LENGTH} frames"
    echo "Prompt: $prompt"
    echo "Image: $IMAGE_PATH"
    echo "Audio: $AUDIO_PATH"
    echo "Output: $OUTPUT_PATH"

    if [ -f "$OUTPUT_PATH" ]; then
        echo "OUTPUT_PATH already exists, can skip"
        continue
    fi

    if [ ! -f "$IMAGE_PATH" ]; then
        echo "WARNING: image not found at $IMAGE_PATH, skipped"
        continue
    fi

    python3 -c "import torch; torch.cuda.empty_cache(); print('GPU cache cleared')"

    torchrun --nproc_per_node=1 generate.py \
        --prompt "$prompt" \
        --image_path "$IMAGE_PATH" \
        --resolution 480p \
        --aspect_ratio 16:9 \
        --seed 1 \
        --video_length "$VIDEO_LENGTH" \
        --rewrite false \
        --cfg_distilled true \
        --enable_step_distill true \
        --sparse_attn false \
        --use_sageattn true \
        --enable_cache false \
        --overlap_group_offloading true \
        --sr false \
        --output_path "$OUTPUT_PATH" \
        --model_path "$MODEL_PATH" \
        || echo "ERROR: generation failed for shot $shot_number" >&2

    echo "shot $shot_number done"

done 3< "$TMPFILE"

rm -f "$TMPFILE"

echo "Done"
|
||||||
34
merge_audio_video.sh
Normal file
34
merge_audio_video.sh
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
#!/bin/bash

# Merges videos/output_n.mp4 with audios/output_n.mp3 -> merged/merged_n.mp4
# (the original header comment said "audio_n.mp3", but the actual files are
# named output_n.mp3, matching generate_audios.py).

BASE_DIR="/home/madina/projects/short_videos"
VIDEOS_DIR="$BASE_DIR/videos"
AUDIOS_DIR="$BASE_DIR/audios"
OUTPUT_DIR="$BASE_DIR/merged"

mkdir -p "$OUTPUT_DIR"

# FIX: with nullglob an unmatched pattern expands to nothing instead of the
# literal string "output_*.mp4", which would otherwise be fed to ffmpeg.
shopt -s nullglob

for video in "$VIDEOS_DIR"/output_*.mp4; do
    # Extract the shot number with parameter expansion (no sed subprocess).
    base=$(basename "$video" .mp4)
    num=${base#output_}
    audio="$AUDIOS_DIR/output_${num}.mp3"
    output="$OUTPUT_DIR/merged_${num}.mp4"

    if [ ! -f "$audio" ]; then
        echo "WARNING: No audio found for shot $num ($audio); skipped"
        continue
    fi

    if [ -f "$output" ]; then
        echo "Already exists; skipped the shot $num."
        continue
    fi

    echo "Merging shot $num: $video + $audio -> $output"
    # Stream-copy the video; re-encode the audio to AAC for MP4 containers;
    # -shortest trims the output to the shorter of the two streams.
    ffmpeg -i "$video" -i "$audio" \
        -c:v copy \
        -c:a aac \
        -shortest \
        -y "$output"

    echo "Done: $output"
done
|
||||||
6
topic_description.txt
Normal file
6
topic_description.txt
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
**TITLE:** “I Only Use 3 Chapters Out Of This Book” – And That’s Okay
|
||||||
|
**CATEGORY:** Hot take / Controversial opinion
|
||||||
|
**TONE:** Empowering
|
||||||
|
**HOOK QUESTION:** Why pay for an entire book if you only read 10 pages?
|
||||||
|
**CONTENT SUMMARY:** Argue for customizable learning experiences through platforms like LiveCarta.
|
||||||
|
**TARGET AUDIENCE:** Students
|
||||||
Reference in New Issue
Block a user