commit de1bb5c23f8789a906ff82bb80237c08b7b6d7bb Author: Madina Date: Wed Apr 1 04:36:27 2026 -0700 Video generation pipelines files added diff --git a/.env b/.env new file mode 100644 index 0000000..f22ab82 --- /dev/null +++ b/.env @@ -0,0 +1 @@ +ELEVENLABS_API_KEY=<REDACTED-LEAKED-KEY-ROTATE-IMMEDIATELY> diff --git a/HunyuanVideo-1.5 b/HunyuanVideo-1.5 new file mode 160000 index 0000000..2641c0d --- /dev/null +++ b/HunyuanVideo-1.5 @@ -0,0 +1 @@ +Subproject commit 2641c0de73da0a2d9682fed24af2c0a516527cc1 diff --git a/concat_merged.sh b/concat_merged.sh new file mode 100644 index 0000000..39163c1 --- /dev/null +++ b/concat_merged.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +# Concatenates all merged/merged_n.mp4 into final_output.mp4 + +BASE_DIR="/home/madina/projects/short_videos" +MERGED_DIR="$BASE_DIR/merged" +OUTPUT="$BASE_DIR/results/run_3/final_output.mp4" +FILELIST=$(mktemp /tmp/filelist_XXXXXX.txt) + +# Build file list sorted by shot number +for video in $(ls "$MERGED_DIR"/merged_*.mp4 | sort -t_ -k2 -n); do + echo "file '$video'" >> "$FILELIST" +done + +echo "Concatenating the following files:" +cat "$FILELIST" + +ffmpeg -f concat -safe 0 -i "$FILELIST" -c copy -y "$OUTPUT" + +rm -f "$FILELIST" +echo "" +echo "Done" diff --git a/generate_audios.py b/generate_audios.py new file mode 100644 index 0000000..dc88555 --- /dev/null +++ b/generate_audios.py @@ -0,0 +1,35 @@ +from elevenlabs.client import ElevenLabs +from elevenlabs.play import play +import os +import json +from dotenv import load_dotenv + +load_dotenv() +ELEVENLABS_API_KEY = os.getenv('ELEVENLABS_API_KEY') + + +if __name__ == '__main__': + + script_path = "reel_script.json" + with open(script_path, "r") as f: + reel_data = json.load(f) + 
    client = ElevenLabs( + api_key=ELEVENLABS_API_KEY + ) + for shot in reel_data["shots"]: + print(shot["shot_number"], shot["voiceover"]) + prompt = shot["voiceover"] + audio = client.text_to_speech.convert( + text=prompt, + voice_id="JBFqnCBsd6RMkjVDRZzb", + 
model_id="eleven_multilingual_v2", + output_format="mp3_44100_128", + ) + + audio_bytes = b"".join(audio) + + if not os.path.exists("audios"): + os.makedirs("audios") + with open(f"audios/output_{shot["shot_number"]}.mp3", "wb") as f: + f.write(audio_bytes) \ No newline at end of file diff --git a/generate_images.py b/generate_images.py new file mode 100644 index 0000000..446145b --- /dev/null +++ b/generate_images.py @@ -0,0 +1,28 @@ +import torch +from diffusers import FluxPipeline +import json +import os + +if __name__ == '__main__': + + script_path = "reel_script.json" + with open(script_path, "r") as f: + reel_data = json.load(f) + + pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16) + pipe.enable_model_cpu_offload() + + for shot in reel_data["shots"]: + print(shot["shot_number"], shot["image_description"]) + prompt = shot["image_description"] + image = pipe( + prompt, + guidance_scale=0.0, + num_inference_steps=4, + max_sequence_length=256, + generator=torch.Generator("cpu").manual_seed(0) + ).images[0] + + if not os.path.exists("images"): + os.makedirs("images") + image.save(f"images/shot_{shot["shot_number"]}.png") \ No newline at end of file diff --git a/generate_script.py b/generate_script.py new file mode 100644 index 0000000..410b8be --- /dev/null +++ b/generate_script.py @@ -0,0 +1,344 @@ +import torch +import json +from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig +import re +from typing import Optional + +device = 'cuda' if torch.cuda.is_available() else 'cpu' +MODEL_ID = "Qwen/Qwen3-14B" +WORDS_PER_SECOND = 2.5 +MAX_DEAD_AIR_SECONDS = 1 +MAX_VOICEOVER_SECONDS = 5.0 +MAX_VOICEOVER_WORDS = int(MAX_VOICEOVER_SECONDS * WORDS_PER_SECOND) +MIN_VOICEOVER_WORDS = 5 + + +def load_model(model_id: str = MODEL_ID): + tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + 
bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + ) + model = AutoModelForCausalLM.from_pretrained( + model_id, + quantization_config=bnb_config, + device_map="auto", + trust_remote_code=True, + ).eval() + + return model, tokenizer + + +def generate_reel_scenario( + model, + tokenizer, + content_summary: str, + temperature: float = 0.75, + top_p: float = 0.9, + repetition_penalty: float = 1.1, +) -> str: + """ + Generate a shot-by-shot Instagram Reel scenario where every script beat + has its own image description for AI image/video generation. + + Each shot in the output contains: + - Timestamp + - Beat label (HOOK, PROBLEM, etc.) + - Voiceover line + - Text on screen + - Image description (AI-generation-ready prompt) + + The hook question from the topic summary is preserved as the opening beat. + """ + + system_prompt = ( + "You are a professional Instagram Reel director and scriptwriter specializing in EdTech content. " + "You think like a filmmaker: every line of voiceover has a visual that amplifies it. " + "You write punchy, cinematic, scroll-stopping reels that feel native to social media. " + "You write image descriptions like a cinematographer briefing an AI image generator — " + "specific, vivid, atmospheric, with clear subject, composition, lighting, mood, and style." + ) + + user_prompt = f"""You are given a topic summary for an Instagram Reel. + Your job is to direct a complete shot-by-shot reel scenario — like a real filmmaker laying out a storyboard. + + ## TOPIC SUMMARY + {content_summary} + + --- + + ## PACING RULES — read these before writing a single shot + + These rules exist because every shot in this reel will be rendered as a real video clip. Timestamps must be tight and honest. + + **Speech rate:** spoken voiceover moves at roughly 2.5 words per second in a reel (energetic, not rushed). + + **VOICEOVER LENGTH: vary naturally between 2–5 seconds per shot. 
Diversity is encouraged.** + Different beats call for different rhythms: + - A hook or payoff line can be short and punchy: 2–3 seconds (5–8 words). Let it land. + - A problem or tension beat needs more breath: 4–5 seconds (10–12 words). Build the feeling. + - A CTA can be medium: 3–4 seconds (8–10 words). Direct and warm. + + The one hard constraint is the 5-second ceiling — the audio renderer cannot handle more than 5 seconds per shot (12 words). Never exceed this. If an idea needs more than 12 words, split it into two shots. + + VARIETY IN ACTION — a good reel sounds like this: + SHOT 1 (HOOK, 2s): "Will your major still matter in five years?" — short, punchy, stops the scroll + SHOT 2 (PROBLEM, 5s): "AI is already replacing writers, designers, and even doctors — fields we thought were safe." — builds tension + SHOT 3 (TENSION, 4s): "Most students have no idea this is already happening to them." — personal, lands hard + SHOT 4 (PAYOFF, 3s): "Will your major still matter? It depends on you." — echo + flip + SHOT 5 (SOLUTION, 5s): "The students who'll thrive are learning skills AI simply cannot replicate yet." — concrete, hopeful + SHOT 6 (CTA, 3s): "Follow us for tips on future-proofing your career." — warm, direct + + Notice how the lengths vary. That variation is intentional — it creates rhythm and keeps the viewer engaged. + + **Shot duration = voiceover duration + 0–1 second of breathing room.** Match the shot length to what you actually wrote, not to a preset slot. + + **Continuity.** The end timestamp of one shot is the start timestamp of the next. No gaps, no overlaps. The reel runs like a stopwatch. + + **Total reel length:** 45–60 seconds. Count it up before you finalize. If you're over, trim voiceover. If you're under, add a shot. + + **Shot count:** aim for 7–10 shots. More shots = faster pace = more energy. Fewer shots = slower, more contemplative. Match the tone of the topic. 
+ + --- + + ## BRAND CONTEXT — read before writing any shot + + These reels are published on the **LiveCarta** Instagram account. LiveCarta is an AI-powered EdTech platform for higher education that lets educators build custom coursepacks by mixing and matching chapters from top publishers, adding their own materials, video, and notes — all in one place. Students get flexible, affordable access: buy whole books or individual chapters, read online or via the app. Key features: AI Curriculum Builder, Remix-on-Demand, pay-by-chapter pricing, LMS integration (Canvas, Blackboard), and real-time content updates. + + **Brand mention rules:** + + 1. **CTA beat — always name LiveCarta.** The final CTA shot(s) must mention LiveCarta by name. Reference it naturally in context — e.g. "Follow LiveCarta for more", "Check out LiveCarta", "LiveCarta helps students stay ahead." + + 2. **Mid-reel mention — only when there's a genuine, obvious overlap.** If the topic directly connects to what LiveCarta does, drop a natural product mention in the SOLUTION beat. Do NOT force it. The overlap is genuine when the topic is about: customizing course content, affordable textbooks, AI in education, flexible learning, keeping up with fast-changing fields, building skills alongside a degree. If the overlap isn't obvious, skip the mid-reel mention entirely — a forced plug feels fake and loses the audience. + + 3. **Never open with the brand.** The HOOK and PROBLEM beats must earn attention first. LiveCarta enters only once the viewer is already hooked. + + Write shots that follow these beats in order. Each beat can be one or more shots — distribute them based on how long the idea takes to say out loud, not on a fixed slot. + + HOOK — The hook question from the topic, spoken word-for-word. One punchy sentence. Creates immediate tension or curiosity. + PROBLEM — Build the relatable pain. Show the viewer their own situation. Make it vivid and specific, not abstract. 
+ TENSION / RELATE — Go deeper into the problem. A concrete, personal detail that makes the viewer think "that's exactly me." + HOOK PAYOFF — Echo the opening hook question word-for-word, then immediately flip it with the insight. This is the emotional turning point. + SOLUTION — The concrete, actionable answer. Specific enough to be immediately useful. + CTA — Direct call to action. **Must name LiveCarta by name.** End with a follow or save prompt. Warm, not salesy. + + ## OUTPUT FORMAT + + Output exactly two sections. Start immediately with the first separator — no preamble. + + ------------------------------------------------------------ + SHOT LIST + For each shot use EXACTLY this format: + + SHOT [N] + Timestamp: [start]–[end] + Beat: [BEAT LABEL] + Voiceover: [2–5 seconds of speech (5–12 words). Vary length by beat — short for hooks/payoffs, longer for problem/tension/solution. Hard ceiling: 12 words.] + Text on screen: [3–6 word punchy overlay that punches up the voiceover, not just repeats it] + Image description: [Standalone AI image generation prompt. Describe: subject, composition, camera angle, lighting, color palette, mood, visual style. Make it cinematic and specific. 2–4 sentences. No references to "the reel", "the previous shot", or any other shot.] + + ------------------------------------------------------------ + CAPTION + + Write a 4–5 line Instagram caption: + Line 1: hook statement (echoes the reel's opening question) + Lines 2–3: expand the insight in 1–2 casual, direct sentences + Line 4: engagement question to the audience + Line 5: 8–10 relevant hashtags + + --- + + ## FINAL CHECKS before outputting + - Does the voiceover length vary across shots? Hooks and payoffs should be short (2–3s), problems and solutions longer (4–5s). + - Does any voiceover exceed 12 words? If so, split that shot. + - Does every shot's timestamp connect cleanly to the next? + - Does the total add up to 45–60 seconds? + - Does the CTA beat name LiveCarta explicitly? 
+ - If the topic overlaps with LiveCarta's features, is there a natural mid-reel mention in the SOLUTION beat? + - Does the HOOK PAYOFF echo it verbatim before the flip? + - Is every Image description usable as a standalone AI generation prompt? + """ + + messages = [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_prompt}, + ] + + text = tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + ) + + inputs = tokenizer(text, return_tensors="pt").to(model.device) + + print("Generating reel scenario..") + with torch.no_grad(): + output_ids = model.generate( + **inputs, + max_new_tokens=2000, + temperature=temperature, + top_p=top_p, + repetition_penalty=repetition_penalty, + do_sample=True, + pad_token_id=tokenizer.eos_token_id, + ) + + generated_ids = output_ids[0][inputs["input_ids"].shape[1]:] + result = tokenizer.decode(generated_ids, skip_special_tokens=True) + return result + + +def _parse_timestamp_seconds(ts_str: str) -> tuple[Optional[int], Optional[int]]: + ts_str = re.sub(r'\s*[–—]+\s*', '-', ts_str).strip() + + parts = ts_str.split('-') + if len(parts) != 2: + return None, None + + def to_seconds(s: str) -> Optional[int]: + s = s.strip() + if ':' in s: + m, sec = s.split(':', 1) + try: + return int(m) * 60 + int(sec) + except ValueError: + return None + try: + return int(s) + except ValueError: + return None + + return to_seconds(parts[0]), to_seconds(parts[1]) + + +def extract_field(label: str, next_label: Optional[str], text: str) -> str: + if next_label: + pattern = rf'{label}:\s*(.*?)(?=\n{next_label}:|$)' + else: + pattern = rf'{label}:\s*(.*?)$' + m = re.search(pattern, text, re.DOTALL | re.IGNORECASE) + if m: + return m.group(1).strip().strip('"') + return "" + + +def parse_reel_scenario(raw_scenario: str) -> dict: + """ + Parse the shot-by-shot reel scenario into a structured dict. 
+ + Returns: + { + "shots": [ + { + "shot_number": 1, + "timestamp": "0–4", + "start_sec": 0, + "end_sec": 4, + "duration_sec": 4, + "beat": "HOOK", + "voiceover": "Will your major still matter in 5 years?", + "word_count": 9, + "speech_duration_sec": 3.6, # word_count / WORDS_PER_SECOND + "dead_air_sec": 0.4, # duration_sec - speech_duration_sec + "text_on_screen": "Your degree. Obsolete?", + "image_description": "Close-up of a university diploma ...", + }, + ... + ], + "caption": { + "body": "Will your major still matter in 5 years? ...", + "hashtags": ["#EdTech", "#AIEducation", ...], + }, + "total_duration_sec": 55, + + } + """ + result = { + "shots": [], + "caption": {"body": "", "hashtags": []}, + "total_duration_sec": 0, + } + raw_scenario = re.sub(r'.*?', '', raw_scenario, flags=re.DOTALL).strip() + cleaned = re.sub(r'\*+', '', raw_scenario) + + shot_section = re.search( + r'SHOT LIST\s*\n(.*?)(?=-{4,}\s*CAPTION|$)', + cleaned, re.DOTALL | re.IGNORECASE + ) + if shot_section: + shot_text = shot_section.group(1) + shot_blocks = re.split(r'\n(?=SHOT\s+\d+)', shot_text.strip()) + + for block in shot_blocks: + block = block.strip() + if not block: + continue + + shot_num_match = re.match(r'SHOT\s+(\d+)', block) + if not shot_num_match: + continue + shot_number = int(shot_num_match.group(1)) + + timestamp = extract_field("Timestamp", "Beat", block) + beat = extract_field("Beat", "Voiceover", block) + voiceover = extract_field("Voiceover", "Text on screen", block) + text_on_screen = extract_field("Text on screen", "Image description", block) + image_description = extract_field("Image description", None, block) + + timestamp_display = re.sub(r'\s*[–—-]+\s*', '–', timestamp) + + start_sec, end_sec = _parse_timestamp_seconds(timestamp) + duration_sec = (end_sec - start_sec) if (start_sec is not None and end_sec is not None) else None + word_count = len(voiceover.split()) if voiceover else 0 + speech_duration = round(word_count / WORDS_PER_SECOND, 1) + dead_air = 
round(duration_sec - speech_duration, 1) if duration_sec is not None else None + + result["shots"].append({ + "shot_number": shot_number, + "timestamp": timestamp_display, + "start_sec": start_sec, + "end_sec": end_sec, + "duration_sec": duration_sec, + "beat": beat.upper(), + "voiceover": voiceover, + "word_count": word_count, + "speech_duration_sec": speech_duration, + "dead_air_sec": dead_air, + "text_on_screen": text_on_screen, + "image_description": image_description, + }) + + caption_section = re.search( + r'CAPTION\s*\n(.*?)$', + cleaned, re.DOTALL | re.IGNORECASE + ) + + if caption_section: + caption_text = caption_section.group(1).strip() + lines = [l.strip() for l in caption_text.splitlines() if l.strip()] + hashtag_line = next((l for l in lines if l.startswith("#")), "") + body_lines = [l for l in lines if not l.startswith("#")] + result["caption"] = { + "body": "\n".join(body_lines).strip().strip('"'), + "hashtags": re.findall(r'#\w+', hashtag_line), + } + + return result + + +if __name__ == '__main__': + + with open("topic_description.txt", "r") as f: + topic = f.read() + + model, tokenizer = load_model() + scenario_raw = generate_reel_scenario(model, tokenizer, topic) + + parsed = parse_reel_scenario(scenario_raw) + + with open("reel_script.json", "w") as f: + json.dump(parsed, f) + \ No newline at end of file diff --git a/generate_videos.sh b/generate_videos.sh new file mode 100644 index 0000000..f929ad3 --- /dev/null +++ b/generate_videos.sh @@ -0,0 +1,98 @@ +#!/bin/bash + +# video length is derived from actual audio file duration + +HUNYUAN_DIR="/home/madina/projects/short_videos/HunyuanVideo-1.5" +REEL_SCRIPT="/home/madina/projects/short_videos/reel_script.json" +IMAGES_DIR="/home/madina/projects/short_videos/images" +VIDEOS_DIR="/home/madina/projects/short_videos/videos" +AUDIOS_DIR="/home/madina/projects/short_videos/audios" +MODEL_PATH="$HUNYUAN_DIR/ckpts" + +mkdir -p "$VIDEOS_DIR" + +export 
PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True,max_split_size_mb:128 + +# write shots to a temp TSV file (tab-separated: shot_number, prompt) +TMPFILE=$(mktemp /tmp/shots_XXXXXX.tsv) +python3 - < "$TMPFILE" +import json +d = json.load(open('$REEL_SCRIPT')) +for shot in d['shots']: + num = shot['shot_number'] + desc = shot['image_description'].replace('\t', ' ').replace('\n', ' ') + print(f'{num}\t{desc}') +EOF + +NUM_SHOTS=$(wc -l < "$TMPFILE") +echo "Found $NUM_SHOTS shots to generate" + +while IFS=$'\t' read -r shot_number prompt; do + IMAGE_PATH="$IMAGES_DIR/shot_${shot_number}.png" + OUTPUT_PATH="$VIDEOS_DIR/output_${shot_number}.mp4" + AUDIO_PATH="$AUDIOS_DIR/output_${shot_number}.mp3" + + # get audio duration and convert to frame count + if [ ! -f "$AUDIO_PATH" ]; then + echo "WARNING: No audio found at $AUDIO_PATH, falling back to 5s default." + DURATION=5.0 + else + DURATION=$(ffprobe -v error -show_entries format=duration \ + -of default=noprint_wrappers=1:nokey=1 "$AUDIO_PATH") + echo "Audio duration for shot $shot_number: ${DURATION}s" + fi + + VIDEO_LENGTH=$(python3 -c " +duration = float('$DURATION') +frames = int(duration * 24) + 1 +if frames % 2 == 0: + frames += 1 +frames = max(49, min(frames, 169)) +print(frames) +") + + echo "" + echo "Shot $shot_number | ${DURATION}s -> ${VIDEO_LENGTH} frames" + echo "Prompt: $prompt" + echo "Image: $IMAGE_PATH" + echo "Audio: $AUDIO_PATH" + echo "Output: $OUTPUT_PATH" + + if [ -f "$OUTPUT_PATH" ]; then + echo "OUTPUT_PATH already exists, can skip" + continue + fi + + if [ ! 
-f "$IMAGE_PATH" ]; then + echo "WARNING: image not found at $IMAGE_PATH, skipped" + continue + fi + + python3 -c "import torch; torch.cuda.empty_cache(); print('GPU cache cleared')" + + cd "$HUNYUAN_DIR" + torchrun --nproc_per_node=1 generate.py \ + --prompt "$prompt" \ + --image_path "$IMAGE_PATH" \ + --resolution 480p \ + --aspect_ratio 16:9 \ + --seed 1 \ + --video_length $VIDEO_LENGTH \ + --rewrite false \ + --cfg_distilled true \ + --enable_step_distill true \ + --sparse_attn false \ + --use_sageattn true \ + --enable_cache false \ + --overlap_group_offloading true \ + --sr false \ + --output_path "$OUTPUT_PATH" \ + --model_path "$MODEL_PATH" + + echo "shot $shot_number done" + +done < "$TMPFILE" + +rm -f "$TMPFILE" + +echo "Done" diff --git a/merge_audio_video.sh b/merge_audio_video.sh new file mode 100644 index 0000000..ccb87e1 --- /dev/null +++ b/merge_audio_video.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +# merges videos/output_n.mp4 with audios/output_n.mp3 -> merged/merged_n.mp4 +BASE_DIR="/home/madina/projects/short_videos" +VIDEOS_DIR="$BASE_DIR/videos" +AUDIOS_DIR="$BASE_DIR/audios" +OUTPUT_DIR="$BASE_DIR/merged" + +mkdir -p "$OUTPUT_DIR" + +for video in "$VIDEOS_DIR"/output_*.mp4; do + num=$(basename "$video" | sed 's/output_\([0-9]*\)\.mp4/\1/') + audio="$AUDIOS_DIR/output_${num}.mp3" + output="$OUTPUT_DIR/merged_${num}.mp4" + + if [ ! -f "$audio" ]; then + echo "WARNING: No audio found for shot $num ($audio); skipped" + continue + fi + + if [ -f "$output" ]; then + echo "Already exists; skipped the shot $num." 
+ continue + fi + + echo "Merging shot $num: $video + $audio -> $output" + ffmpeg -i "$video" -i "$audio" \ + -c:v copy \ + -c:a aac \ + -shortest \ + -y "$output" + + echo "Done: $output" +done diff --git a/topic_description.txt b/topic_description.txt new file mode 100644 index 0000000..d307c88 --- /dev/null +++ b/topic_description.txt @@ -0,0 +1,6 @@ +**TITLE:** “I Only Use 3 Chapters Out Of This Book” – And That’s Okay +**CATEGORY:** Hot take / Controversial opinion +**TONE:** Empowering +**HOOK QUESTION:** Why pay for an entire book if you only read 10 pages? +**CONTENT SUMMARY:** Argue for customizable learning experiences through platforms like LiveCarta. +**TARGET AUDIENCE:** Students \ No newline at end of file