1
0

Video generation pipeline files added

This commit is contained in:
Madina
2026-04-01 04:36:27 -07:00
commit de1bb5c23f
9 changed files with 569 additions and 0 deletions

1
.env Normal file
View File

@@ -0,0 +1 @@
# SECURITY: this API key is committed to version control — rotate it and load it from an untracked secrets store instead
ELEVENLABS_API_KEY=sk_e343522cb3fd4da2d46844e81e1152e3de2a72cd1430a383

1
HunyuanVideo-1.5 Submodule

Submodule HunyuanVideo-1.5 added at 2641c0de73

22
concat_merged.sh Normal file
View File

@@ -0,0 +1,22 @@
#!/bin/bash
# Concatenates all merged/merged_<n>.mp4 (in shot-number order) into
# final_output.mp4 using ffmpeg's concat demuxer with stream copy.
set -euo pipefail

BASE_DIR="/home/madina/projects/short_videos"
MERGED_DIR="$BASE_DIR/merged"
OUTPUT="$BASE_DIR/results/run_3/final_output.mp4"

FILELIST=$(mktemp /tmp/filelist_XXXXXX.txt)
# Remove the temp list on any exit path, including errors.
trap 'rm -f "$FILELIST"' EXIT

# Build the concat file list sorted numerically by the shot number embedded
# in merged_<n>.mp4. Glob + sort replaces the old `for f in $(ls …)` loop,
# which word-splits and breaks on unusual filenames.
found=0
while IFS= read -r video; do
  [ -n "$video" ] || continue
  printf "file '%s'\n" "$video" >> "$FILELIST"
  found=1
done < <(printf '%s\n' "$MERGED_DIR"/merged_*.mp4 | sort -t_ -k2 -n)

if [ "$found" -eq 0 ]; then
  echo "No merged_*.mp4 files found in $MERGED_DIR" >&2
  exit 1
fi

echo "Concatenating the following files:"
cat "$FILELIST"

# Ensure the results directory exists before ffmpeg tries to write there.
mkdir -p "$(dirname "$OUTPUT")"
ffmpeg -f concat -safe 0 -i "$FILELIST" -c copy -y "$OUTPUT"

echo ""
echo "Done"

35
generate_audios.py Normal file
View File

@@ -0,0 +1,35 @@
from elevenlabs.client import ElevenLabs
from elevenlabs.play import play
import os
import json
from dotenv import load_dotenv
load_dotenv()
# API key comes from .env; keep it out of version control.
ELEVENLABS_API_KEY = os.getenv('ELEVENLABS_API_KEY')

if __name__ == '__main__':
    # Read the parsed reel script and synthesize one narration MP3 per shot.
    script_path = "reel_script.json"
    with open(script_path, "r") as f:
        reel_data = json.load(f)
    client = ElevenLabs(
        api_key=ELEVENLABS_API_KEY
    )
    # Create the output directory once instead of re-checking every iteration.
    os.makedirs("audios", exist_ok=True)
    for shot in reel_data["shots"]:
        print(shot["shot_number"], shot["voiceover"])
        prompt = shot["voiceover"]
        audio = client.text_to_speech.convert(
            text=prompt,
            voice_id="JBFqnCBsd6RMkjVDRZzb",
            model_id="eleven_multilingual_v2",
            output_format="mp3_44100_128",
        )
        # convert() yields audio chunks; join them into a single byte string.
        audio_bytes = b"".join(audio)
        # BUG FIX: the f-string previously nested double quotes inside double
        # quotes ({shot["shot_number"]}) — a SyntaxError on Python < 3.12.
        with open(f"audios/output_{shot['shot_number']}.mp3", "wb") as out_f:
            out_f.write(audio_bytes)

28
generate_images.py Normal file
View File

@@ -0,0 +1,28 @@
import torch
from diffusers import FluxPipeline
import json
import os
if __name__ == '__main__':
    # Render one still image per shot with FLUX.1-schnell; these stills seed
    # the image-to-video stage of the pipeline.
    script_path = "reel_script.json"
    with open(script_path, "r") as f:
        reel_data = json.load(f)
    pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16)
    # Offload idle submodules to CPU so the pipeline fits in limited VRAM.
    pipe.enable_model_cpu_offload()
    # Create the output directory once, up front.
    os.makedirs("images", exist_ok=True)
    for shot in reel_data["shots"]:
        print(shot["shot_number"], shot["image_description"])
        prompt = shot["image_description"]
        image = pipe(
            prompt,
            guidance_scale=0.0,            # schnell is guidance-distilled
            num_inference_steps=4,
            max_sequence_length=256,
            generator=torch.Generator("cpu").manual_seed(0)  # reproducible output
        ).images[0]
        # BUG FIX: nested double quotes in the f-string were a SyntaxError
        # on Python < 3.12; use single quotes for the dict key instead.
        image.save(f"images/shot_{shot['shot_number']}.png")

344
generate_script.py Normal file
View File

@@ -0,0 +1,344 @@
import torch
import json
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import re
from typing import Optional
# Run on GPU when available; CPU works but generation will be very slow.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Hugging Face model id of the scriptwriting LLM.
MODEL_ID = "Qwen/Qwen3-14B"
# Assumed speaking rate used to convert voiceover word counts into seconds
# (see parse_reel_scenario's speech_duration_sec).
WORDS_PER_SECOND = 2.5
# NOTE(review): MAX_DEAD_AIR_SECONDS, MAX_VOICEOVER_SECONDS,
# MAX_VOICEOVER_WORDS and MIN_VOICEOVER_WORDS are not referenced elsewhere
# in this file — presumably pacing-validation thresholds; confirm intent.
MAX_DEAD_AIR_SECONDS = 1
MAX_VOICEOVER_SECONDS = 5.0
# 5.0 s * 2.5 words/s = 12 words, matching the prompt's per-shot ceiling.
MAX_VOICEOVER_WORDS = int(MAX_VOICEOVER_SECONDS * WORDS_PER_SECOND)
MIN_VOICEOVER_WORDS = 5
def load_model(model_id: str = MODEL_ID):
    """Load the tokenizer and a 4-bit NF4-quantized causal LM.

    The model is placed automatically across available devices
    (device_map="auto") and returned in eval mode.

    Returns:
        (model, tokenizer) tuple ready for generation.
    """
    quant_cfg = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16,
    )
    tok = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    lm = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=quant_cfg,
        device_map="auto",
        trust_remote_code=True,
    )
    return lm.eval(), tok
def generate_reel_scenario(
    model,
    tokenizer,
    content_summary: str,
    temperature: float = 0.75,
    top_p: float = 0.9,
    repetition_penalty: float = 1.1,
) -> str:
    """
    Generate a shot-by-shot Instagram Reel scenario where every script beat
    has its own image description for AI image/video generation.
    Each shot in the output contains:
    - Timestamp
    - Beat label (HOOK, PROBLEM, etc.)
    - Voiceover line
    - Text on screen
    - Image description (AI-generation-ready prompt)
    The hook question from the topic summary is preserved as the opening beat.

    Args:
        model: causal LM returned by load_model().
        tokenizer: matching tokenizer (must support apply_chat_template).
        content_summary: topic brief inserted verbatim into the user prompt.
        temperature, top_p, repetition_penalty: sampling parameters.

    Returns:
        Raw decoded completion text (prompt tokens stripped).
    """
    system_prompt = (
        "You are a professional Instagram Reel director and scriptwriter specializing in EdTech content. "
        "You think like a filmmaker: every line of voiceover has a visual that amplifies it. "
        "You write punchy, cinematic, scroll-stopping reels that feel native to social media. "
        "You write image descriptions like a cinematographer briefing an AI image generator — "
        "specific, vivid, atmospheric, with clear subject, composition, lighting, mood, and style."
    )
    # BUG FIX: the numeric ranges below had lost their en dashes ("25 seconds"
    # instead of "2–5 seconds", "4560" instead of "45–60", etc.), contradicting
    # the stated 5-second per-shot ceiling. Ranges restored with en dashes.
    user_prompt = f"""You are given a topic summary for an Instagram Reel.
Your job is to direct a complete shot-by-shot reel scenario — like a real filmmaker laying out a storyboard.
## TOPIC SUMMARY
{content_summary}
---
## PACING RULES — read these before writing a single shot
These rules exist because every shot in this reel will be rendered as a real video clip. Timestamps must be tight and honest.
**Speech rate:** spoken voiceover moves at roughly 2.5 words per second in a reel (energetic, not rushed).
**VOICEOVER LENGTH: vary naturally between 2–5 seconds per shot. Diversity is encouraged.**
Different beats call for different rhythms:
- A hook or payoff line can be short and punchy: 2–3 seconds (5–8 words). Let it land.
- A problem or tension beat needs more breath: 4–5 seconds (10–12 words). Build the feeling.
- A CTA can be medium: 3–4 seconds (8–10 words). Direct and warm.
The one hard constraint is the 5-second ceiling — the audio renderer cannot handle more than 5 seconds per shot (12 words). Never exceed this. If an idea needs more than 12 words, split it into two shots.
VARIETY IN ACTION — a good reel sounds like this:
SHOT 1 (HOOK, 2s): "Will your major still matter in five years?" — short, punchy, stops the scroll
SHOT 2 (PROBLEM, 5s): "AI is already replacing writers, designers, and even doctors — fields we thought were safe." — builds tension
SHOT 3 (TENSION, 4s): "Most students have no idea this is already happening to them." — personal, lands hard
SHOT 4 (PAYOFF, 3s): "Will your major still matter? It depends on you." — echo + flip
SHOT 5 (SOLUTION, 5s): "The students who'll thrive are learning skills AI simply cannot replicate yet." — concrete, hopeful
SHOT 6 (CTA, 3s): "Follow us for tips on future-proofing your career." — warm, direct
Notice how the lengths vary. That variation is intentional — it creates rhythm and keeps the viewer engaged.
**Shot duration = voiceover duration + 0–1 second of breathing room.** Match the shot length to what you actually wrote, not to a preset slot.
**Continuity.** The end timestamp of one shot is the start timestamp of the next. No gaps, no overlaps. The reel runs like a stopwatch.
**Total reel length:** 45–60 seconds. Count it up before you finalize. If you're over, trim voiceover. If you're under, add a shot.
**Shot count:** aim for 7–10 shots. More shots = faster pace = more energy. Fewer shots = slower, more contemplative. Match the tone of the topic.
---
## BRAND CONTEXT — read before writing any shot
These reels are published on the **LiveCarta** Instagram account. LiveCarta is an AI-powered EdTech platform for higher education that lets educators build custom coursepacks by mixing and matching chapters from top publishers, adding their own materials, video, and notes — all in one place. Students get flexible, affordable access: buy whole books or individual chapters, read online or via the app. Key features: AI Curriculum Builder, Remix-on-Demand, pay-by-chapter pricing, LMS integration (Canvas, Blackboard), and real-time content updates.
**Brand mention rules:**
1. **CTA beat — always name LiveCarta.** The final CTA shot(s) must mention LiveCarta by name. Reference it naturally in context — e.g. "Follow LiveCarta for more", "Check out LiveCarta", "LiveCarta helps students stay ahead."
2. **Mid-reel mention — only when there's a genuine, obvious overlap.** If the topic directly connects to what LiveCarta does, drop a natural product mention in the SOLUTION beat. Do NOT force it. The overlap is genuine when the topic is about: customizing course content, affordable textbooks, AI in education, flexible learning, keeping up with fast-changing fields, building skills alongside a degree. If the overlap isn't obvious, skip the mid-reel mention entirely — a forced plug feels fake and loses the audience.
3. **Never open with the brand.** The HOOK and PROBLEM beats must earn attention first. LiveCarta enters only once the viewer is already hooked.
Write shots that follow these beats in order. Each beat can be one or more shots — distribute them based on how long the idea takes to say out loud, not on a fixed slot.
HOOK — The hook question from the topic, spoken word-for-word. One punchy sentence. Creates immediate tension or curiosity.
PROBLEM — Build the relatable pain. Show the viewer their own situation. Make it vivid and specific, not abstract.
TENSION / RELATE — Go deeper into the problem. A concrete, personal detail that makes the viewer think "that's exactly me."
HOOK PAYOFF — Echo the opening hook question word-for-word, then immediately flip it with the insight. This is the emotional turning point.
SOLUTION — The concrete, actionable answer. Specific enough to be immediately useful.
CTA — Direct call to action. **Must name LiveCarta by name.** End with a follow or save prompt. Warm, not salesy.
## OUTPUT FORMAT
Output exactly two sections. Start immediately with the first separator — no preamble.
------------------------------------------------------------
SHOT LIST
For each shot use EXACTLY this format:
SHOT [N]
Timestamp: [start]–[end]
Beat: [BEAT LABEL]
Voiceover: [2–5 seconds of speech (5–12 words). Vary length by beat — short for hooks/payoffs, longer for problem/tension/solution. Hard ceiling: 12 words.]
Text on screen: [3–6 word punchy overlay that punches up the voiceover, not just repeats it]
Image description: [Standalone AI image generation prompt. Describe: subject, composition, camera angle, lighting, color palette, mood, visual style. Make it cinematic and specific. 2–4 sentences. No references to "the reel", "the previous shot", or any other shot.]
------------------------------------------------------------
CAPTION
Write a 4–5 line Instagram caption:
Line 1: hook statement (echoes the reel's opening question)
Lines 2–3: expand the insight in 1–2 casual, direct sentences
Line 4: engagement question to the audience
Line 5: 8–10 relevant hashtags
---
## FINAL CHECKS before outputting
- Does the voiceover length vary across shots? Hooks and payoffs should be short (2–3s), problems and solutions longer (4–5s).
- Does any voiceover exceed 12 words? If so, split that shot.
- Does every shot's timestamp connect cleanly to the next?
- Does the total add up to 45–60 seconds?
- Does the CTA beat name LiveCarta explicitly?
- If the topic overlaps with LiveCarta's features, is there a natural mid-reel mention in the SOLUTION beat?
- Does the HOOK PAYOFF echo it verbatim before the flip?
- Is every Image description usable as a standalone AI generation prompt?
"""
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    print("Generating reel scenario..")
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=2000,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )
    # Drop the prompt tokens; decode only the newly generated continuation.
    generated_ids = output_ids[0][inputs["input_ids"].shape[1]:]
    result = tokenizer.decode(generated_ids, skip_special_tokens=True)
    return result
def _parse_timestamp_seconds(ts_str: str) -> tuple[Optional[int], Optional[int]]:
ts_str = re.sub(r'\s*[–—]+\s*', '-', ts_str).strip()
parts = ts_str.split('-')
if len(parts) != 2:
return None, None
def to_seconds(s: str) -> Optional[int]:
s = s.strip()
if ':' in s:
m, sec = s.split(':', 1)
try:
return int(m) * 60 + int(sec)
except ValueError:
return None
try:
return int(s)
except ValueError:
return None
return to_seconds(parts[0]), to_seconds(parts[1])
def extract_field(label: str, next_label: Optional[str], text: str) -> str:
    """
    Extract the value of a `Label: value` field from a shot block.

    The value runs from after `label:` up to the line that starts the next
    field (`next_label:`) or, when next_label is None, to the end of the
    block. Matching is case-insensitive and spans newlines; surrounding
    whitespace and double quotes are stripped. Returns "" when absent.
    """
    # ROBUSTNESS: escape the labels so regex metacharacters in a future
    # label (e.g. '(' or '.') cannot corrupt the pattern. Current labels
    # are plain words, so behavior is unchanged for existing callers.
    if next_label:
        pattern = rf'{re.escape(label)}:\s*(.*?)(?=\n{re.escape(next_label)}:|$)'
    else:
        pattern = rf'{re.escape(label)}:\s*(.*?)$'
    m = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
    if m:
        return m.group(1).strip().strip('"')
    return ""
def parse_reel_scenario(raw_scenario: str) -> dict:
    """
    Parse the shot-by-shot reel scenario into a structured dict.
    Returns:
        {
          "shots": [
            {
              "shot_number": 1,
              "timestamp": "0–4",
              "start_sec": 0,
              "end_sec": 4,
              "duration_sec": 4,
              "beat": "HOOK",
              "voiceover": "Will your major still matter in 5 years?",
              "word_count": 9,
              "speech_duration_sec": 3.6,  # word_count / WORDS_PER_SECOND
              "dead_air_sec": 0.4,  # duration_sec - speech_duration_sec
              "text_on_screen": "Your degree. Obsolete?",
              "image_description": "Close-up of a university diploma ...",
            },
            ...
          ],
          "caption": {
            "body": "Will your major still matter in 5 years? ...",
            "hashtags": ["#EdTech", "#AIEducation", ...],
          },
          "total_duration_sec": 55,
        }
    """
    result = {
        "shots": [],
        "caption": {"body": "", "hashtags": []},
        "total_duration_sec": 0,
    }
    # Remove any <think>...</think> reasoning block the model may emit,
    # then strip markdown bold/italic asterisks so field labels line up.
    raw_scenario = re.sub(r'<think>.*?</think>', '', raw_scenario, flags=re.DOTALL).strip()
    cleaned = re.sub(r'\*+', '', raw_scenario)
    # Shot list: everything between "SHOT LIST" and the CAPTION separator.
    shot_section = re.search(
        r'SHOT LIST\s*\n(.*?)(?=-{4,}\s*CAPTION|$)',
        cleaned, re.DOTALL | re.IGNORECASE
    )
    if shot_section:
        shot_text = shot_section.group(1)
        shot_blocks = re.split(r'\n(?=SHOT\s+\d+)', shot_text.strip())
        for block in shot_blocks:
            block = block.strip()
            if not block:
                continue
            shot_num_match = re.match(r'SHOT\s+(\d+)', block)
            if not shot_num_match:
                continue
            shot_number = int(shot_num_match.group(1))
            timestamp = extract_field("Timestamp", "Beat", block)
            beat = extract_field("Beat", "Voiceover", block)
            voiceover = extract_field("Voiceover", "Text on screen", block)
            text_on_screen = extract_field("Text on screen", "Image description", block)
            image_description = extract_field("Image description", None, block)
            # Normalize the range separator to a single en dash ("0–4").
            # BUG FIX: the replacement was previously '' which deleted the
            # separator entirely, turning "0-4" into the ambiguous "04".
            timestamp_display = re.sub(r'\s*[–—-]+\s*', '–', timestamp)
            start_sec, end_sec = _parse_timestamp_seconds(timestamp)
            duration_sec = (end_sec - start_sec) if (start_sec is not None and end_sec is not None) else None
            word_count = len(voiceover.split()) if voiceover else 0
            speech_duration = round(word_count / WORDS_PER_SECOND, 1)
            # Dead air = shot length minus estimated speaking time.
            dead_air = round(duration_sec - speech_duration, 1) if duration_sec is not None else None
            result["shots"].append({
                "shot_number": shot_number,
                "timestamp": timestamp_display,
                "start_sec": start_sec,
                "end_sec": end_sec,
                "duration_sec": duration_sec,
                "beat": beat.upper(),
                "voiceover": voiceover,
                "word_count": word_count,
                "speech_duration_sec": speech_duration,
                "dead_air_sec": dead_air,
                "text_on_screen": text_on_screen,
                "image_description": image_description,
            })
    # BUG FIX: total_duration_sec was documented above but never computed
    # (it always stayed 0). Use the latest end timestamp among parsed shots.
    end_times = [s["end_sec"] for s in result["shots"] if s["end_sec"] is not None]
    if end_times:
        result["total_duration_sec"] = max(end_times)
    # Caption: everything after the CAPTION header; hashtags split out of
    # the first line that starts with '#'.
    caption_section = re.search(
        r'CAPTION\s*\n(.*?)$',
        cleaned, re.DOTALL | re.IGNORECASE
    )
    if caption_section:
        caption_text = caption_section.group(1).strip()
        lines = [l.strip() for l in caption_text.splitlines() if l.strip()]
        hashtag_line = next((l for l in lines if l.startswith("#")), "")
        body_lines = [l for l in lines if not l.startswith("#")]
        result["caption"] = {
            "body": "\n".join(body_lines).strip().strip('"'),
            "hashtags": re.findall(r'#\w+', hashtag_line),
        }
    return result
if __name__ == '__main__':
    # Pipeline entry point: read the topic brief, generate a raw scenario
    # with the LLM, parse it, and persist the structured script for the
    # downstream audio/image/video stages.
    with open("topic_description.txt", "r") as topic_file:
        topic_summary = topic_file.read()
    model, tokenizer = load_model()
    raw_scenario = generate_reel_scenario(model, tokenizer, topic_summary)
    structured = parse_reel_scenario(raw_scenario)
    with open("reel_script.json", "w") as out_file:
        json.dump(structured, out_file)

98
generate_videos.sh Normal file
View File

@@ -0,0 +1,98 @@
#!/bin/bash
# For each shot in reel_script.json, render an image-to-video clip with
# HunyuanVideo-1.5. Video length (frames) is derived from the actual
# narration audio duration, so clip and voiceover line up.
set -uo pipefail

HUNYUAN_DIR="/home/madina/projects/short_videos/HunyuanVideo-1.5"
REEL_SCRIPT="/home/madina/projects/short_videos/reel_script.json"
IMAGES_DIR="/home/madina/projects/short_videos/images"
VIDEOS_DIR="/home/madina/projects/short_videos/videos"
AUDIOS_DIR="/home/madina/projects/short_videos/audios"
MODEL_PATH="$HUNYUAN_DIR/ckpts"

mkdir -p "$VIDEOS_DIR"
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True,max_split_size_mb:128

# Write shots to a temp TSV file (tab-separated: shot_number, prompt).
TMPFILE=$(mktemp /tmp/shots_XXXXXX.tsv)
# Clean up the temp file on any exit path.
trap 'rm -f "$TMPFILE"' EXIT

# The heredoc delimiter is quoted so the shell cannot expand anything inside
# the Python source; the script path is passed via the environment instead
# of being interpolated into code.
REEL_SCRIPT="$REEL_SCRIPT" python3 - <<'EOF' > "$TMPFILE"
import json
import os
with open(os.environ['REEL_SCRIPT']) as fh:
    d = json.load(fh)
for shot in d['shots']:
    num = shot['shot_number']
    desc = shot['image_description'].replace('\t', ' ').replace('\n', ' ')
    print(f'{num}\t{desc}')
EOF

NUM_SHOTS=$(wc -l < "$TMPFILE")
echo "Found $NUM_SHOTS shots to generate"

# generate.py is invoked from inside the Hunyuan repo; all other paths are
# absolute, so a single cd before the loop is enough (checked, not assumed).
cd "$HUNYUAN_DIR" || { echo "ERROR: cannot cd to $HUNYUAN_DIR" >&2; exit 1; }

while IFS=$'\t' read -r shot_number prompt; do
  IMAGE_PATH="$IMAGES_DIR/shot_${shot_number}.png"
  OUTPUT_PATH="$VIDEOS_DIR/output_${shot_number}.mp4"
  AUDIO_PATH="$AUDIOS_DIR/output_${shot_number}.mp3"

  # Skip early: already rendered, or no source image to animate.
  if [ -f "$OUTPUT_PATH" ]; then
    echo "OUTPUT_PATH already exists, can skip"
    continue
  fi
  if [ ! -f "$IMAGE_PATH" ]; then
    echo "WARNING: image not found at $IMAGE_PATH, skipped"
    continue
  fi

  # Get audio duration; fall back to 5s when narration is missing or
  # ffprobe yields nothing.
  if [ ! -f "$AUDIO_PATH" ]; then
    echo "WARNING: No audio found at $AUDIO_PATH, falling back to 5s default."
    DURATION=5.0
  else
    DURATION=$(ffprobe -v error -show_entries format=duration \
      -of default=noprint_wrappers=1:nokey=1 "$AUDIO_PATH")
    if [ -z "$DURATION" ]; then
      echo "WARNING: ffprobe returned no duration for $AUDIO_PATH; using 5s."
      DURATION=5.0
    fi
    echo "Audio duration for shot $shot_number: ${DURATION}s"
  fi

  # Duration -> frame count at 24 fps: force an odd count, clamp to [49, 169].
  VIDEO_LENGTH=$(python3 -c "
duration = float('$DURATION')
frames = int(duration * 24) + 1
if frames % 2 == 0:
    frames += 1
frames = max(49, min(frames, 169))
print(frames)
")

  echo ""
  echo "Shot $shot_number | ${DURATION}s -> ${VIDEO_LENGTH} frames"
  echo "Prompt: $prompt"
  echo "Image: $IMAGE_PATH"
  echo "Audio: $AUDIO_PATH"
  echo "Output: $OUTPUT_PATH"

  # Release cached VRAM between shots to reduce OOM risk.
  python3 -c "import torch; torch.cuda.empty_cache(); print('GPU cache cleared')"

  torchrun --nproc_per_node=1 generate.py \
    --prompt "$prompt" \
    --image_path "$IMAGE_PATH" \
    --resolution 480p \
    --aspect_ratio 16:9 \
    --seed 1 \
    --video_length "$VIDEO_LENGTH" \
    --rewrite false \
    --cfg_distilled true \
    --enable_step_distill true \
    --sparse_attn false \
    --use_sageattn true \
    --enable_cache false \
    --overlap_group_offloading true \
    --sr false \
    --output_path "$OUTPUT_PATH" \
    --model_path "$MODEL_PATH"
  echo "shot $shot_number done"
done < "$TMPFILE"

echo "Done"

34
merge_audio_video.sh Normal file
View File

@@ -0,0 +1,34 @@
#!/bin/bash
# Merges videos/output_<n>.mp4 with audios/output_<n>.mp3 -> merged/merged_<n>.mp4
# (the old header wrongly said audios/audio_<n>.mp3).
set -uo pipefail

BASE_DIR="/home/madina/projects/short_videos"
VIDEOS_DIR="$BASE_DIR/videos"
AUDIOS_DIR="$BASE_DIR/audios"
OUTPUT_DIR="$BASE_DIR/merged"

mkdir -p "$OUTPUT_DIR"

for video in "$VIDEOS_DIR"/output_*.mp4; do
  # With default globbing an unmatched pattern is passed through literally;
  # bail out instead of feeding a nonexistent path to ffmpeg.
  if [ ! -e "$video" ]; then
    echo "No output_*.mp4 files found in $VIDEOS_DIR" >&2
    break
  fi

  # Extract the shot number from output_<n>.mp4 via parameter expansion
  # (no sed subprocess needed).
  num=$(basename "$video")
  num=${num#output_}
  num=${num%.mp4}

  audio="$AUDIOS_DIR/output_${num}.mp3"
  output="$OUTPUT_DIR/merged_${num}.mp4"

  if [ ! -f "$audio" ]; then
    echo "WARNING: No audio found for shot $num ($audio); skipped"
    continue
  fi
  if [ -f "$output" ]; then
    echo "Already exists; skipped the shot $num."
    continue
  fi

  echo "Merging shot $num: $video + $audio -> $output"
  # Copy the video stream, encode audio to AAC, stop at the shorter stream.
  if ffmpeg -i "$video" -i "$audio" \
      -c:v copy \
      -c:a aac \
      -shortest \
      -y "$output"; then
    echo "Done: $output"
  else
    echo "ERROR: ffmpeg failed for shot $num" >&2
  fi
done

6
topic_description.txt Normal file
View File

@@ -0,0 +1,6 @@
**TITLE:** “I Only Use 3 Chapters Out Of This Book” And That's Okay
**CATEGORY:** Hot take / Controversial opinion
**TONE:** Empowering
**HOOK QUESTION:** Why pay for an entire book if you only read 10 pages?
**CONTENT SUMMARY:** Argue for customizable learning experiences through platforms like LiveCarta.
**TARGET AUDIENCE:** Students