forked from LiveCarta/ContentGeneration
Video generation pipelines files added
This commit is contained in:
1
.env
Normal file
1
.env
Normal file
@@ -0,0 +1 @@
|
|||||||
|
ELEVENLABS_API_KEY=REPLACE_ME  # SECURITY: a live ElevenLabs API key was committed here — revoke/rotate it immediately and keep .env out of version control (.gitignore)
|
||||||
1
HunyuanVideo-1.5
Submodule
1
HunyuanVideo-1.5
Submodule
Submodule HunyuanVideo-1.5 added at 2641c0de73
22
concat_merged.sh
Normal file
22
concat_merged.sh
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
#!/bin/bash

# Concatenates all merged/merged_n.mp4 into final_output.mp4,
# ordered numerically by shot number n.

BASE_DIR="/home/madina/projects/short_videos"
MERGED_DIR="$BASE_DIR/merged"
OUTPUT="$BASE_DIR/results/run_3/final_output.mp4"
FILELIST=$(mktemp /tmp/filelist_XXXXXX.txt)

# With nullglob an unmatched pattern expands to nothing instead of the
# literal string "merged_*.mp4"; fail fast when there is nothing to concat.
shopt -s nullglob
files=("$MERGED_DIR"/merged_*.mp4)
if [ ${#files[@]} -eq 0 ]; then
    echo "ERROR: no merged_*.mp4 files found in $MERGED_DIR" >&2
    rm -f "$FILELIST"
    exit 1
fi

# Build the concat list sorted by shot number. Using the glob array instead
# of `for f in $(ls ...)` avoids word-splitting on paths with whitespace.
# sort -V (version sort) orders merged_2 before merged_10; the original
# `sort -t_ -k2 -n` keyed on "videos/..." (the path itself contains '_'),
# which degraded to lexicographic order for 10+ shots.
while IFS= read -r video; do
    echo "file '$video'" >> "$FILELIST"
done < <(printf '%s\n' "${files[@]}" | sort -V)

echo "Concatenating the following files:"
cat "$FILELIST"

# Ensure the results directory exists before ffmpeg writes into it.
mkdir -p "$(dirname "$OUTPUT")"

ffmpeg -f concat -safe 0 -i "$FILELIST" -c copy -y "$OUTPUT"

rm -f "$FILELIST"
echo ""
echo "Done"
|
||||||
35
generate_audios.py
Normal file
35
generate_audios.py
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
from elevenlabs.client import ElevenLabs
|
||||||
|
from elevenlabs.play import play
|
||||||
|
import os
|
||||||
|
import json
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
|
# Load variables from the local .env file into the process environment.
load_dotenv()

# ElevenLabs credential; None when the variable is absent from the environment.
ELEVENLABS_API_KEY = os.getenv('ELEVENLABS_API_KEY')
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':

    # Load the parsed reel script (produced by generate_script.py).
    script_path = "reel_script.json"
    with open(script_path, "r") as f:
        reel_data = json.load(f)

    client = ElevenLabs(
        api_key=ELEVENLABS_API_KEY
    )

    # Create the output directory once, up front; exist_ok avoids the
    # check-then-create race of the os.path.exists + os.makedirs pattern.
    os.makedirs("audios", exist_ok=True)

    # One MP3 voiceover file per shot: audios/output_<shot_number>.mp3
    for shot in reel_data["shots"]:
        print(shot["shot_number"], shot["voiceover"])
        prompt = shot["voiceover"]
        audio = client.text_to_speech.convert(
            text=prompt,
            voice_id="JBFqnCBsd6RMkjVDRZzb",
            model_id="eleven_multilingual_v2",
            output_format="mp3_44100_128",
        )

        # convert() yields the MP3 in chunks; join them into one byte string.
        audio_bytes = b"".join(audio)

        # FIX: the original f-string nested double quotes inside double
        # quotes (shot["shot_number"]), a SyntaxError on Python < 3.12.
        with open(f"audios/output_{shot['shot_number']}.mp3", "wb") as f:
            f.write(audio_bytes)
|
||||||
28
generate_images.py
Normal file
28
generate_images.py
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
import torch
|
||||||
|
from diffusers import FluxPipeline
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
|
||||||
|
if __name__ == '__main__':

    # Load the parsed reel script (produced by generate_script.py).
    script_path = "reel_script.json"
    with open(script_path, "r") as f:
        reel_data = json.load(f)

    # FLUX.1-schnell is a distilled model: guidance is disabled
    # (guidance_scale=0.0) and only a few inference steps are used below.
    pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16)
    pipe.enable_model_cpu_offload()

    # Create the output directory once, up front; exist_ok avoids the
    # check-then-create race of the os.path.exists + os.makedirs pattern.
    os.makedirs("images", exist_ok=True)

    # One PNG per shot: images/shot_<shot_number>.png
    for shot in reel_data["shots"]:
        print(shot["shot_number"], shot["image_description"])
        prompt = shot["image_description"]
        image = pipe(
            prompt,
            guidance_scale=0.0,
            num_inference_steps=4,
            max_sequence_length=256,
            # Fixed CPU seed keeps generation reproducible across runs.
            generator=torch.Generator("cpu").manual_seed(0)
        ).images[0]

        # FIX: the original f-string nested double quotes inside double
        # quotes (shot["shot_number"]), a SyntaxError on Python < 3.12.
        image.save(f"images/shot_{shot['shot_number']}.png")
|
||||||
344
generate_script.py
Normal file
344
generate_script.py
Normal file
@@ -0,0 +1,344 @@
|
|||||||
|
import torch
|
||||||
|
import json
|
||||||
|
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
|
||||||
|
import re
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
# NOTE(review): `device` appears unused in this file — model placement is
# handled by device_map="auto" in load_model(); confirm before removing.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Hugging Face model used to write the reel scenario.
MODEL_ID = "Qwen/Qwen3-14B"
# Assumed speaking rate; used to estimate speech duration from word count.
WORDS_PER_SECOND = 2.5
# Maximum tolerated silence per shot, in seconds.
MAX_DEAD_AIR_SECONDS = 1
# Hard per-shot voiceover ceiling, in seconds.
MAX_VOICEOVER_SECONDS = 5.0
# Derived word-count ceiling: 5.0 s * 2.5 wps = 12 words.
MAX_VOICEOVER_WORDS = int(MAX_VOICEOVER_SECONDS * WORDS_PER_SECOND)
# Lower bound on voiceover words per shot.
MIN_VOICEOVER_WORDS = 5
|
||||||
|
|
||||||
|
|
||||||
|
def load_model(model_id: str = MODEL_ID):
    """Load a causal LM in 4-bit NF4 quantization together with its tokenizer.

    Returns (model, tokenizer); the model is sharded via device_map="auto"
    and switched to eval mode.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

    # 4-bit NF4 with double quantization; compute happens in fp16.
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16,
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=quant_config,
        device_map="auto",
        trust_remote_code=True,
    ).eval()

    return model, tokenizer
|
||||||
|
|
||||||
|
|
||||||
|
def generate_reel_scenario(
    model,
    tokenizer,
    content_summary: str,
    temperature: float = 0.75,
    top_p: float = 0.9,
    repetition_penalty: float = 1.1,
) -> str:
    """
    Generate a shot-by-shot Instagram Reel scenario where every script beat
    has its own image description for AI image/video generation.

    Each shot in the output contains:
    - Timestamp
    - Beat label (HOOK, PROBLEM, etc.)
    - Voiceover line
    - Text on screen
    - Image description (AI-generation-ready prompt)

    The hook question from the topic summary is preserved as the opening beat.
    """

    system_prompt = (
        "You are a professional Instagram Reel director and scriptwriter specializing in EdTech content. "
        "You think like a filmmaker: every line of voiceover has a visual that amplifies it. "
        "You write punchy, cinematic, scroll-stopping reels that feel native to social media. "
        "You write image descriptions like a cinematographer briefing an AI image generator — "
        "specific, vivid, atmospheric, with clear subject, composition, lighting, mood, and style."
    )

    # NOTE: the prompt text below is part of the model contract — its
    # separators ("----", "SHOT LIST", "CAPTION") and field labels
    # ("Timestamp:", "Beat:", ...) are what parse_reel_scenario matches on.
    # Do not reword them without updating the parser.
    user_prompt = f"""You are given a topic summary for an Instagram Reel.
Your job is to direct a complete shot-by-shot reel scenario — like a real filmmaker laying out a storyboard.

## TOPIC SUMMARY
{content_summary}

---

## PACING RULES — read these before writing a single shot

These rules exist because every shot in this reel will be rendered as a real video clip. Timestamps must be tight and honest.

**Speech rate:** spoken voiceover moves at roughly 2.5 words per second in a reel (energetic, not rushed).

**VOICEOVER LENGTH: vary naturally between 2–5 seconds per shot. Diversity is encouraged.**
Different beats call for different rhythms:
- A hook or payoff line can be short and punchy: 2–3 seconds (5–8 words). Let it land.
- A problem or tension beat needs more breath: 4–5 seconds (10–12 words). Build the feeling.
- A CTA can be medium: 3–4 seconds (8–10 words). Direct and warm.

The one hard constraint is the 5-second ceiling — the audio renderer cannot handle more than 5 seconds per shot (12 words). Never exceed this. If an idea needs more than 12 words, split it into two shots.

VARIETY IN ACTION — a good reel sounds like this:
SHOT 1 (HOOK, 2s): "Will your major still matter in five years?" — short, punchy, stops the scroll
SHOT 2 (PROBLEM, 5s): "AI is already replacing writers, designers, and even doctors — fields we thought were safe." — builds tension
SHOT 3 (TENSION, 4s): "Most students have no idea this is already happening to them." — personal, lands hard
SHOT 4 (PAYOFF, 3s): "Will your major still matter? It depends on you." — echo + flip
SHOT 5 (SOLUTION, 5s): "The students who'll thrive are learning skills AI simply cannot replicate yet." — concrete, hopeful
SHOT 6 (CTA, 3s): "Follow us for tips on future-proofing your career." — warm, direct

Notice how the lengths vary. That variation is intentional — it creates rhythm and keeps the viewer engaged.

**Shot duration = voiceover duration + 0–1 second of breathing room.** Match the shot length to what you actually wrote, not to a preset slot.

**Continuity.** The end timestamp of one shot is the start timestamp of the next. No gaps, no overlaps. The reel runs like a stopwatch.

**Total reel length:** 45–60 seconds. Count it up before you finalize. If you're over, trim voiceover. If you're under, add a shot.

**Shot count:** aim for 7–10 shots. More shots = faster pace = more energy. Fewer shots = slower, more contemplative. Match the tone of the topic.

---

## BRAND CONTEXT — read before writing any shot

These reels are published on the **LiveCarta** Instagram account. LiveCarta is an AI-powered EdTech platform for higher education that lets educators build custom coursepacks by mixing and matching chapters from top publishers, adding their own materials, video, and notes — all in one place. Students get flexible, affordable access: buy whole books or individual chapters, read online or via the app. Key features: AI Curriculum Builder, Remix-on-Demand, pay-by-chapter pricing, LMS integration (Canvas, Blackboard), and real-time content updates.

**Brand mention rules:**

1. **CTA beat — always name LiveCarta.** The final CTA shot(s) must mention LiveCarta by name. Reference it naturally in context — e.g. "Follow LiveCarta for more", "Check out LiveCarta", "LiveCarta helps students stay ahead."

2. **Mid-reel mention — only when there's a genuine, obvious overlap.** If the topic directly connects to what LiveCarta does, drop a natural product mention in the SOLUTION beat. Do NOT force it. The overlap is genuine when the topic is about: customizing course content, affordable textbooks, AI in education, flexible learning, keeping up with fast-changing fields, building skills alongside a degree. If the overlap isn't obvious, skip the mid-reel mention entirely — a forced plug feels fake and loses the audience.

3. **Never open with the brand.** The HOOK and PROBLEM beats must earn attention first. LiveCarta enters only once the viewer is already hooked.

Write shots that follow these beats in order. Each beat can be one or more shots — distribute them based on how long the idea takes to say out loud, not on a fixed slot.

HOOK — The hook question from the topic, spoken word-for-word. One punchy sentence. Creates immediate tension or curiosity.
PROBLEM — Build the relatable pain. Show the viewer their own situation. Make it vivid and specific, not abstract.
TENSION / RELATE — Go deeper into the problem. A concrete, personal detail that makes the viewer think "that's exactly me."
HOOK PAYOFF — Echo the opening hook question word-for-word, then immediately flip it with the insight. This is the emotional turning point.
SOLUTION — The concrete, actionable answer. Specific enough to be immediately useful.
CTA — Direct call to action. **Must name LiveCarta by name.** End with a follow or save prompt. Warm, not salesy.

## OUTPUT FORMAT

Output exactly two sections. Start immediately with the first separator — no preamble.

------------------------------------------------------------
SHOT LIST
For each shot use EXACTLY this format:

SHOT [N]
Timestamp: [start]–[end]
Beat: [BEAT LABEL]
Voiceover: [2–5 seconds of speech (5–12 words). Vary length by beat — short for hooks/payoffs, longer for problem/tension/solution. Hard ceiling: 12 words.]
Text on screen: [3–6 word punchy overlay that punches up the voiceover, not just repeats it]
Image description: [Standalone AI image generation prompt. Describe: subject, composition, camera angle, lighting, color palette, mood, visual style. Make it cinematic and specific. 2–4 sentences. No references to "the reel", "the previous shot", or any other shot.]

------------------------------------------------------------
CAPTION

Write a 4–5 line Instagram caption:
Line 1: hook statement (echoes the reel's opening question)
Lines 2–3: expand the insight in 1–2 casual, direct sentences
Line 4: engagement question to the audience
Line 5: 8–10 relevant hashtags

---

## FINAL CHECKS before outputting
- Does the voiceover length vary across shots? Hooks and payoffs should be short (2–3s), problems and solutions longer (4–5s).
- Does any voiceover exceed 12 words? If so, split that shot.
- Does every shot's timestamp connect cleanly to the next?
- Does the total add up to 45–60 seconds?
- Does the CTA beat name LiveCarta explicitly?
- If the topic overlaps with LiveCarta's features, is there a natural mid-reel mention in the SOLUTION beat?
- Does the HOOK PAYOFF echo it verbatim before the flip?
- Is every Image description usable as a standalone AI generation prompt?
"""

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    # Render the chat template to a plain string; the generation prompt is
    # appended so the model continues as the assistant.
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    inputs = tokenizer(text, return_tensors="pt").to(model.device)

    print("Generating reel scenario..")
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=2000,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Drop the prompt tokens and decode only the newly generated continuation.
    generated_ids = output_ids[0][inputs["input_ids"].shape[1]:]
    result = tokenizer.decode(generated_ids, skip_special_tokens=True)
    return result
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_timestamp_seconds(ts_str: str) -> tuple[Optional[int], Optional[int]]:
|
||||||
|
ts_str = re.sub(r'\s*[–—]+\s*', '-', ts_str).strip()
|
||||||
|
|
||||||
|
parts = ts_str.split('-')
|
||||||
|
if len(parts) != 2:
|
||||||
|
return None, None
|
||||||
|
|
||||||
|
def to_seconds(s: str) -> Optional[int]:
|
||||||
|
s = s.strip()
|
||||||
|
if ':' in s:
|
||||||
|
m, sec = s.split(':', 1)
|
||||||
|
try:
|
||||||
|
return int(m) * 60 + int(sec)
|
||||||
|
except ValueError:
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
return int(s)
|
||||||
|
except ValueError:
|
||||||
|
return None
|
||||||
|
|
||||||
|
return to_seconds(parts[0]), to_seconds(parts[1])
|
||||||
|
|
||||||
|
|
||||||
|
def extract_field(label: str, next_label: Optional[str], text: str) -> str:
    """Extract the value that follows "label:" inside a shot block.

    Captures everything after "label:" up to (but excluding) the line that
    starts the next field ("next_label:"), or to the end of the text when
    next_label is None. Matching is case-insensitive and spans newlines.
    Surrounding whitespace and stray double quotes are stripped.
    Returns "" when the label is absent.

    FIX: labels are now passed through re.escape before interpolation, so a
    label containing regex metacharacters is matched literally instead of
    corrupting (or crashing) the pattern. Current labels are plain text, so
    behavior for existing callers is unchanged.
    """
    escaped = re.escape(label)
    if next_label:
        pattern = rf'{escaped}:\s*(.*?)(?=\n{re.escape(next_label)}:|$)'
    else:
        pattern = rf'{escaped}:\s*(.*?)$'
    m = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
    if m:
        return m.group(1).strip().strip('"')
    return ""
|
||||||
|
|
||||||
|
|
||||||
|
def parse_reel_scenario(raw_scenario: str) -> dict:
    """
    Parse the shot-by-shot reel scenario into a structured dict.

    Returns:
        {
            "shots": [
                {
                    "shot_number": 1,
                    "timestamp": "0–4",
                    "start_sec": 0,
                    "end_sec": 4,
                    "duration_sec": 4,
                    "beat": "HOOK",
                    "voiceover": "Will your major still matter in 5 years?",
                    "word_count": 9,
                    "speech_duration_sec": 3.6,  # word_count / WORDS_PER_SECOND
                    "dead_air_sec": 0.4,         # duration_sec - speech_duration_sec
                    "text_on_screen": "Your degree. Obsolete?",
                    "image_description": "Close-up of a university diploma ...",
                },
                ...
            ],
            "caption": {
                "body": "Will your major still matter in 5 years? ...",
                "hashtags": ["#EdTech", "#AIEducation", ...],
            },
            "total_duration_sec": 55,
        }
    """
    result = {
        "shots": [],
        "caption": {"body": "", "hashtags": []},
        "total_duration_sec": 0,
    }
    # Strip any <think>...</think> reasoning block the model may emit, then
    # remove markdown emphasis markers so the "Label: value" regexes below
    # see plain text.
    raw_scenario = re.sub(r'<think>.*?</think>', '', raw_scenario, flags=re.DOTALL).strip()
    cleaned = re.sub(r'\*+', '', raw_scenario)

    # Everything between "SHOT LIST" and the "----... CAPTION" separator.
    shot_section = re.search(
        r'SHOT LIST\s*\n(.*?)(?=-{4,}\s*CAPTION|$)',
        cleaned, re.DOTALL | re.IGNORECASE
    )
    if shot_section:
        shot_text = shot_section.group(1)
        # Split on lines that begin a new "SHOT <n>" header.
        shot_blocks = re.split(r'\n(?=SHOT\s+\d+)', shot_text.strip())

        for block in shot_blocks:
            block = block.strip()
            if not block:
                continue

            shot_num_match = re.match(r'SHOT\s+(\d+)', block)
            if not shot_num_match:
                continue
            shot_number = int(shot_num_match.group(1))

            timestamp = extract_field("Timestamp", "Beat", block)
            beat = extract_field("Beat", "Voiceover", block)
            voiceover = extract_field("Voiceover", "Text on screen", block)
            text_on_screen = extract_field("Text on screen", "Image description", block)
            image_description = extract_field("Image description", None, block)

            # Normalize any dash/hyphen run to a single en dash for display.
            timestamp_display = re.sub(r'\s*[–—-]+\s*', '–', timestamp)

            start_sec, end_sec = _parse_timestamp_seconds(timestamp)
            duration_sec = (end_sec - start_sec) if (start_sec is not None and end_sec is not None) else None
            word_count = len(voiceover.split()) if voiceover else 0
            # Estimated speech time from the assumed speaking rate.
            speech_duration = round(word_count / WORDS_PER_SECOND, 1)
            dead_air = round(duration_sec - speech_duration, 1) if duration_sec is not None else None

            result["shots"].append({
                "shot_number": shot_number,
                "timestamp": timestamp_display,
                "start_sec": start_sec,
                "end_sec": end_sec,
                "duration_sec": duration_sec,
                "beat": beat.upper(),
                "voiceover": voiceover,
                "word_count": word_count,
                "speech_duration_sec": speech_duration,
                "dead_air_sec": dead_air,
                "text_on_screen": text_on_screen,
                "image_description": image_description,
            })

        # FIX: total_duration_sec was documented above but never computed
        # (it always stayed 0). Shots run back-to-back from 0, so the last
        # shot's end timestamp is the reel's total length.
        if result["shots"] and result["shots"][-1]["end_sec"] is not None:
            result["total_duration_sec"] = result["shots"][-1]["end_sec"]

    caption_section = re.search(
        r'CAPTION\s*\n(.*?)$',
        cleaned, re.DOTALL | re.IGNORECASE
    )

    if caption_section:
        caption_text = caption_section.group(1).strip()
        lines = [l.strip() for l in caption_text.splitlines() if l.strip()]
        # The hashtag line is the first line starting with '#'; the rest of
        # the non-empty lines form the caption body.
        hashtag_line = next((l for l in lines if l.startswith("#")), "")
        body_lines = [l for l in lines if not l.startswith("#")]
        result["caption"] = {
            "body": "\n".join(body_lines).strip().strip('"'),
            "hashtags": re.findall(r'#\w+', hashtag_line),
        }

    return result
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':

    # Topic brief: title, tone, hook question, content summary.
    # FIX: open files with an explicit UTF-8 encoding — the topic file and
    # the generated scenario contain en dashes and curly quotes, which crash
    # on platforms whose default locale encoding is not UTF-8.
    with open("topic_description.txt", "r", encoding="utf-8") as f:
        topic = f.read()

    model, tokenizer = load_model()
    scenario_raw = generate_reel_scenario(model, tokenizer, topic)

    parsed = parse_reel_scenario(scenario_raw)

    # Pretty-print and keep non-ASCII characters readable; downstream
    # json.load consumers are unaffected by formatting.
    with open("reel_script.json", "w", encoding="utf-8") as f:
        json.dump(parsed, f, indent=2, ensure_ascii=False)
|
||||||
|
|
||||||
98
generate_videos.sh
Normal file
98
generate_videos.sh
Normal file
@@ -0,0 +1,98 @@
|
|||||||
|
#!/bin/bash

# Generates one video clip per shot with HunyuanVideo-1.5.
# Video length (in frames) is derived from the actual audio file duration.

HUNYUAN_DIR="/home/madina/projects/short_videos/HunyuanVideo-1.5"
REEL_SCRIPT="/home/madina/projects/short_videos/reel_script.json"
IMAGES_DIR="/home/madina/projects/short_videos/images"
VIDEOS_DIR="/home/madina/projects/short_videos/videos"
AUDIOS_DIR="/home/madina/projects/short_videos/audios"
MODEL_PATH="$HUNYUAN_DIR/ckpts"

mkdir -p "$VIDEOS_DIR"

export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True,max_split_size_mb:128

# Write shots to a temp TSV file (tab-separated: shot_number, prompt).
# The script path is passed as argv[1] instead of being interpolated into
# the Python source, so special characters in the path cannot break it.
TMPFILE=$(mktemp /tmp/shots_XXXXXX.tsv)
python3 - "$REEL_SCRIPT" <<'EOF' > "$TMPFILE"
import json
import sys

d = json.load(open(sys.argv[1]))
for shot in d['shots']:
    num = shot['shot_number']
    desc = shot['image_description'].replace('\t', ' ').replace('\n', ' ')
    print(f'{num}\t{desc}')
EOF

NUM_SHOTS=$(wc -l < "$TMPFILE")
echo "Found $NUM_SHOTS shots to generate"

# generate.py must run from inside the Hunyuan checkout; every other path
# used below is absolute, so a single checked cd up front is enough.
cd "$HUNYUAN_DIR" || exit 1

# FIX: read the TSV on fd 3 rather than stdin, so commands inside the loop
# (torchrun, ffprobe, python3) cannot consume the remaining shot lines.
while IFS=$'\t' read -r -u 3 shot_number prompt; do
    IMAGE_PATH="$IMAGES_DIR/shot_${shot_number}.png"
    OUTPUT_PATH="$VIDEOS_DIR/output_${shot_number}.mp4"
    AUDIO_PATH="$AUDIOS_DIR/output_${shot_number}.mp3"

    # Get audio duration; fall back to a 5s default when audio is missing.
    if [ ! -f "$AUDIO_PATH" ]; then
        echo "WARNING: No audio found at $AUDIO_PATH, falling back to 5s default."
        DURATION=5.0
    else
        DURATION=$(ffprobe -v error -show_entries format=duration \
            -of default=noprint_wrappers=1:nokey=1 "$AUDIO_PATH")
        echo "Audio duration for shot $shot_number: ${DURATION}s"
    fi

    # Seconds -> frame count at 24 fps, forced odd and clamped to [49, 169]
    # (presumably the generator's supported range — confirm against
    # HunyuanVideo-1.5 docs).
    VIDEO_LENGTH=$(python3 -c "
duration = float('$DURATION')
frames = int(duration * 24) + 1
if frames % 2 == 0:
    frames += 1
frames = max(49, min(frames, 169))
print(frames)
")

    echo ""
    echo "Shot $shot_number | ${DURATION}s -> ${VIDEO_LENGTH} frames"
    echo "Prompt: $prompt"
    echo "Image: $IMAGE_PATH"
    echo "Audio: $AUDIO_PATH"
    echo "Output: $OUTPUT_PATH"

    if [ -f "$OUTPUT_PATH" ]; then
        echo "OUTPUT_PATH already exists, can skip"
        continue
    fi

    if [ ! -f "$IMAGE_PATH" ]; then
        echo "WARNING: image not found at $IMAGE_PATH, skipped"
        continue
    fi

    python3 -c "import torch; torch.cuda.empty_cache(); print('GPU cache cleared')"

    torchrun --nproc_per_node=1 generate.py \
        --prompt "$prompt" \
        --image_path "$IMAGE_PATH" \
        --resolution 480p \
        --aspect_ratio 16:9 \
        --seed 1 \
        --video_length "$VIDEO_LENGTH" \
        --rewrite false \
        --cfg_distilled true \
        --enable_step_distill true \
        --sparse_attn false \
        --use_sageattn true \
        --enable_cache false \
        --overlap_group_offloading true \
        --sr false \
        --output_path "$OUTPUT_PATH" \
        --model_path "$MODEL_PATH" \
        || echo "ERROR: generation failed for shot $shot_number" >&2

    echo "shot $shot_number done"

done 3< "$TMPFILE"

rm -f "$TMPFILE"

echo "Done"
|
||||||
34
merge_audio_video.sh
Normal file
34
merge_audio_video.sh
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
#!/bin/bash

# Merges videos/output_n.mp4 with audios/output_n.mp3 -> merged/merged_n.mp4
# (the original header comment said "audio_n.mp3", but the actual files are
# named output_n.mp3, matching generate_audios.py).

BASE_DIR="/home/madina/projects/short_videos"
VIDEOS_DIR="$BASE_DIR/videos"
AUDIOS_DIR="$BASE_DIR/audios"
OUTPUT_DIR="$BASE_DIR/merged"

mkdir -p "$OUTPUT_DIR"

# FIX: with nullglob an unmatched pattern expands to nothing instead of the
# literal string "output_*.mp4", which would otherwise be fed to ffmpeg.
shopt -s nullglob

for video in "$VIDEOS_DIR"/output_*.mp4; do
    # Extract the shot number with parameter expansion (no sed subprocess).
    base=$(basename "$video" .mp4)
    num=${base#output_}
    audio="$AUDIOS_DIR/output_${num}.mp3"
    output="$OUTPUT_DIR/merged_${num}.mp4"

    if [ ! -f "$audio" ]; then
        echo "WARNING: No audio found for shot $num ($audio); skipped"
        continue
    fi

    if [ -f "$output" ]; then
        echo "Already exists; skipped the shot $num."
        continue
    fi

    echo "Merging shot $num: $video + $audio -> $output"
    # Stream-copy the video; re-encode the audio to AAC for MP4 containers;
    # -shortest trims the output to the shorter of the two streams.
    ffmpeg -i "$video" -i "$audio" \
        -c:v copy \
        -c:a aac \
        -shortest \
        -y "$output"

    echo "Done: $output"
done
|
||||||
6
topic_description.txt
Normal file
6
topic_description.txt
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
**TITLE:** “I Only Use 3 Chapters Out Of This Book” – And That’s Okay
|
||||||
|
**CATEGORY:** Hot take / Controversial opinion
|
||||||
|
**TONE:** Empowering
|
||||||
|
**HOOK QUESTION:** Why pay for an entire book if you only read 10 pages?
|
||||||
|
**CONTENT SUMMARY:** Argue for customizable learning experiences through platforms like LiveCarta.
|
||||||
|
**TARGET AUDIENCE:** Students
|
||||||
Reference in New Issue
Block a user