# ContentGeneration/generate_script.py

import torch
import json
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import re
from typing import Optional

# Torch device string: "cuda" when a CUDA device is available, else "cpu".
# Not referenced by the code visible in this file (load_model relies on
# device_map="auto" instead).
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Default Hugging Face model id used by load_model().
MODEL_ID = "Qwen/Qwen3-14B"
# Assumed speaking rate. parse_reel_scenario() divides a voiceover's word count
# by this to estimate speech duration; the generation prompt quotes the same
# figure as a literal.
WORDS_PER_SECOND = 2.5
# NOTE(review): not referenced in the visible code — presumably the maximum
# silent padding tolerated per shot by a downstream check; confirm with callers.
MAX_DEAD_AIR_SECONDS = 1
# Per-shot voiceover ceiling in seconds (the prompt states the same 5-second
# limit as a literal). Only used below to derive MAX_VOICEOVER_WORDS.
MAX_VOICEOVER_SECONDS = 5.0
# int(5.0 * 2.5) == 12, matching the 12-word ceiling written into the prompt.
# Not referenced elsewhere in the visible code.
MAX_VOICEOVER_WORDS = int(MAX_VOICEOVER_SECONDS * WORDS_PER_SECOND)
# NOTE(review): not referenced in the visible code; the prompt asks for 5–12
# words per voiceover, so this is presumably the lower bound — confirm.
MIN_VOICEOVER_WORDS = 5


def load_model(model_id: str = MODEL_ID):
    """Load a 4-bit quantized causal LM and its tokenizer.

    Args:
        model_id: Hugging Face model identifier or local path handed to both
            ``from_pretrained`` calls. Defaults to ``MODEL_ID``.

    Returns:
        A ``(model, tokenizer)`` tuple. The model has been switched to eval
        mode and is placed by ``device_map="auto"``.
    """
    # NF4 4-bit weights with double quantization; matmuls computed in float16.
    quantization = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16,
    )

    # NOTE: trust_remote_code=True lets code shipped in the model repository
    # run locally; both loaders use it here.
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        trust_remote_code=True,
        device_map="auto",
        quantization_config=quantization,
    )
    model = model.eval()

    return model, tokenizer


def generate_reel_scenario(
    model,
    tokenizer,
    content_summary: str,
    temperature: float = 0.75,
    top_p: float = 0.9,
    repetition_penalty: float = 1.1,
) -> str:
    """
    Generate a shot-by-shot Instagram Reel scenario where every script beat
    has its own image description for AI image/video generation.

    Each shot in the output contains:
      - Timestamp
      - Beat label (HOOK, PROBLEM, etc.)
      - Voiceover line
      - Text on screen
      - Image description (AI-generation-ready prompt)

    The hook question from the topic summary is preserved as the opening beat.

    Args:
        model: Causal LM exposing ``.device`` and ``.generate(...)``
            (e.g. the model returned by ``load_model``).
        tokenizer: Matching tokenizer; must support ``apply_chat_template``.
        content_summary: Topic summary text, inserted verbatim into the prompt.
        temperature: Sampling temperature passed to ``generate``.
        top_p: Nucleus-sampling threshold passed to ``generate``.
        repetition_penalty: Repetition penalty passed to ``generate``.

    Returns:
        The decoded text of the newly generated tokens only (the prompt is
        sliced off). The text is returned raw; ``parse_reel_scenario`` turns it
        into a structured dict.
    """

    system_prompt = (
        "You are a professional Instagram Reel director and scriptwriter specializing in EdTech content. "
        "You think like a filmmaker: every line of voiceover has a visual that amplifies it. "
        "You write punchy, cinematic, scroll-stopping reels that feel native to social media. "
        "You write image descriptions like a cinematographer briefing an AI image generator — "
        "specific, vivid, atmospheric, with clear subject, composition, lighting, mood, and style."
    )

    # The pacing numbers in this prompt (2.5 words/s, 5-second / 12-word
    # ceiling) are literals that mirror the module-level constants; keep them
    # in sync by hand if those constants change.
    #
    # The worked examples must obey the rules stated around them, because the
    # model imitates examples more readily than it follows prose rules: every
    # example voiceover stays within the 12-word ceiling and the CTA example
    # names LiveCarta.
    user_prompt = f"""You are given a topic summary for an Instagram Reel.
    Your job is to direct a complete shot-by-shot reel scenario — like a real filmmaker laying out a storyboard.

    ## TOPIC SUMMARY
    {content_summary}

    ---

    ## PACING RULES — read these before writing a single shot

    These rules exist because every shot in this reel will be rendered as a real video clip. Timestamps must be tight and honest.

    **Speech rate:** spoken voiceover moves at roughly 2.5 words per second in a reel (energetic, not rushed).

    **VOICEOVER LENGTH: vary naturally between 2–5 seconds per shot. Diversity is encouraged.**
    Different beats call for different rhythms:
    - A hook or payoff line can be short and punchy: 2–3 seconds (5–8 words). Let it land.
    - A problem or tension beat needs more breath: 4–5 seconds (10–12 words). Build the feeling.
    - A CTA can be medium: 3–4 seconds (8–10 words). Direct and warm.

    The one hard constraint is the 5-second ceiling — the audio renderer cannot handle more than 5 seconds per shot (12 words). Never exceed this. If an idea needs more than 12 words, split it into two shots.

    VARIETY IN ACTION — a good reel sounds like this:
      SHOT 1 (HOOK, 2s):      "Will your major still matter in five years?" — short, punchy, stops the scroll
      SHOT 2 (PROBLEM, 5s):   "AI is replacing writers, designers, even doctors. Fields we thought were safe." — builds tension
      SHOT 3 (TENSION, 4s):   "Most students have no idea this is already happening to them." — personal, lands hard
      SHOT 4 (PAYOFF, 3s):    "Will your major still matter? It depends on you." — echo + flip
      SHOT 5 (SOLUTION, 5s):  "The students who'll thrive are learning skills AI simply cannot replicate yet." — concrete, hopeful
      SHOT 6 (CTA, 3s):       "Follow LiveCarta for tips on future-proofing your career." — warm, direct

    Notice how the lengths vary. That variation is intentional — it creates rhythm and keeps the viewer engaged.

    **Shot duration = voiceover duration + 0–1 second of breathing room.** Match the shot length to what you actually wrote, not to a preset slot.

    **Continuity.** The end timestamp of one shot is the start timestamp of the next. No gaps, no overlaps. The reel runs like a stopwatch.

    **Total reel length:** 45–60 seconds. Count it up before you finalize. If you're over, trim voiceover. If you're under, add a shot.

    **Shot count:** aim for 7–10 shots. More shots = faster pace = more energy. Fewer shots = slower, more contemplative. Match the tone of the topic.

    ---

    ## BRAND CONTEXT — read before writing any shot

    These reels are published on the **LiveCarta** Instagram account. LiveCarta is an AI-powered EdTech platform for higher education that lets educators build custom coursepacks by mixing and matching chapters from top publishers, adding their own materials, video, and notes — all in one place. Students get flexible, affordable access: buy whole books or individual chapters, read online or via the app. Key features: AI Curriculum Builder, Remix-on-Demand, pay-by-chapter pricing, LMS integration (Canvas, Blackboard), and real-time content updates.

    **Brand mention rules:**

    1. **CTA beat — always name LiveCarta.** The final CTA shot(s) must mention LiveCarta by name. Reference it naturally in context — e.g. "Follow LiveCarta for more", "Check out LiveCarta", "LiveCarta helps students stay ahead."

    2. **Mid-reel mention — only when there's a genuine, obvious overlap.** If the topic directly connects to what LiveCarta does, drop a natural product mention in the SOLUTION beat. Do NOT force it. The overlap is genuine when the topic is about: customizing course content, affordable textbooks, AI in education, flexible learning, keeping up with fast-changing fields, building skills alongside a degree. If the overlap isn't obvious, skip the mid-reel mention entirely — a forced plug feels fake and loses the audience.

    3. **Never open with the brand.** The HOOK and PROBLEM beats must earn attention first. LiveCarta enters only once the viewer is already hooked.

    Write shots that follow these beats in order. Each beat can be one or more shots — distribute them based on how long the idea takes to say out loud, not on a fixed slot.

      HOOK             — The hook question from the topic, spoken word-for-word. One punchy sentence. Creates immediate tension or curiosity.
      PROBLEM          — Build the relatable pain. Show the viewer their own situation. Make it vivid and specific, not abstract.
      TENSION / RELATE — Go deeper into the problem. A concrete, personal detail that makes the viewer think "that's exactly me."
      HOOK PAYOFF      — Echo the opening hook question word-for-word, then immediately flip it with the insight. This is the emotional turning point.
      SOLUTION         — The concrete, actionable answer. Specific enough to be immediately useful.
      CTA              — Direct call to action. **Must name LiveCarta by name.** End with a follow or save prompt. Warm, not salesy.

    ## OUTPUT FORMAT

    Output exactly two sections. Start immediately with the first separator — no preamble.

    ------------------------------------------------------------
    SHOT LIST
    For each shot use EXACTLY this format:

    SHOT [N]
    Timestamp: [start]–[end]
    Beat: [BEAT LABEL]
    Voiceover: [2–5 seconds of speech (5–12 words). Vary length by beat — short for hooks/payoffs, longer for problem/tension/solution. Hard ceiling: 12 words.]
    Text on screen: [3–6 word punchy overlay that punches up the voiceover, not just repeats it]
    Image description: [Standalone AI image generation prompt. Describe: subject, composition, camera angle, lighting, color palette, mood, visual style. Make it cinematic and specific. 2–4 sentences. No references to "the reel", "the previous shot", or any other shot.]

    ------------------------------------------------------------
    CAPTION

    Write a 4–5 line Instagram caption:
      Line 1: hook statement (echoes the reel's opening question)
      Lines 2–3: expand the insight in 1–2 casual, direct sentences
      Line 4: engagement question to the audience
      Line 5: 8–10 relevant hashtags

    ---

    ## FINAL CHECKS before outputting
    - Does the voiceover length vary across shots? Hooks and payoffs should be short (2–3s), problems and solutions longer (4–5s).
    - Does any voiceover exceed 12 words? If so, split that shot.
    - Does every shot's timestamp connect cleanly to the next?
    - Does the total add up to 45–60 seconds?
    - Does the CTA beat name LiveCarta explicitly?
    - If the topic overlaps with LiveCarta's features, is there a natural mid-reel mention in the SOLUTION beat?
    - Does the HOOK PAYOFF echo the opening hook question verbatim before the flip?
    - Is every Image description usable as a standalone AI generation prompt?
    """

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    # Render the chat into the model's prompt format, ending with the
    # assistant turn opener so generation starts the reply directly.
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    inputs = tokenizer(text, return_tensors="pt").to(model.device)

    print("Generating reel scenario..")
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=2000,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # generate() returns prompt + continuation; keep only the continuation.
    generated_ids = output_ids[0][inputs["input_ids"].shape[1]:]
    result = tokenizer.decode(generated_ids, skip_special_tokens=True)
    return result


def _parse_timestamp_seconds(ts_str: str) -> tuple[Optional[int], Optional[int]]:
    ts_str = re.sub(r'\s*[–—]+\s*', '-', ts_str).strip()

    parts = ts_str.split('-')
    if len(parts) != 2:
        return None, None

    def to_seconds(s: str) -> Optional[int]:
        s = s.strip()
        if ':' in s:
            m, sec = s.split(':', 1)
            try:
                return int(m) * 60 + int(sec)
            except ValueError:
                return None
        try:
            return int(s)
        except ValueError:
            return None

    return to_seconds(parts[0]), to_seconds(parts[1])


def extract_field(label: str, next_label: Optional[str], text: str) -> str:
    """Return the value of a ``Label: value`` field inside ``text``.

    The value starts after ``label:`` and runs up to the line that begins
    (optionally indented) with ``next_label:``, or to the end of ``text`` when
    ``next_label`` is falsy or never appears. Matching is case-insensitive and
    the value may span several lines.

    Args:
        label: Field name to look for, taken literally (regex metacharacters
            are escaped).
        next_label: Name of the field expected to follow, used as the end
            boundary; ``None`` means "read to the end of the text".
        text: Block of text to search.

    Returns:
        The field value with surrounding whitespace and double quotes removed,
        or ``""`` when the label is absent or the field is empty.
    """
    # Only consume spaces/tabs after the colon. Consuming newlines here would
    # step past the boundary of an empty field and make it swallow the
    # following fields; a value that starts on the next line is still
    # captured by the group and trimmed by strip() below.
    head = rf'{re.escape(label)}:[ \t]*'
    if next_label:
        pattern = rf'{head}(.*?)(?=\n[ \t]*{re.escape(next_label)}:|$)'
    else:
        pattern = rf'{head}(.*?)$'
    m = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
    if m:
        return m.group(1).strip().strip('"')
    return ""


def parse_reel_scenario(raw_scenario: str) -> dict:
    """
    Parse the shot-by-shot reel scenario into a structured dict.

    Any ``<think>...</think>`` span and all ``*`` characters (markdown
    emphasis) are removed before parsing. Shots are read from the text between
    ``SHOT LIST`` and the dashed separator preceding ``CAPTION``; blocks that
    do not start with ``SHOT <number>`` are skipped.

    Returns:
    {
        "shots": [
            {
                "shot_number":        1,
                "timestamp":          "0–4",
                "start_sec":          0,      # None if the timestamp is unparseable
                "end_sec":            4,      # None if the timestamp is unparseable
                "duration_sec":       4,      # end_sec - start_sec, or None
                "beat":               "HOOK",
                "voiceover":          "Will your major still matter in 5 years?",
                "word_count":         8,
                "speech_duration_sec": 3.2,   # word_count / WORDS_PER_SECOND
                "dead_air_sec":       0.8,     # duration_sec - speech_duration_sec, or None
                "text_on_screen":     "Your degree. Obsolete?",
                "image_description":  "Close-up of a university diploma ...",
            },
            ...
        ],
        "caption": {
            "body":     "Will your major still matter in 5 years? ...",
            "hashtags": ["#EdTech", "#AIEducation", ...],
        },
        "total_duration_sec": 55,   # latest end_sec among the shots; 0 if none parsed

    }
    """
    result = {
        "shots": [],
        "caption": {"body": "", "hashtags": []},
        "total_duration_sec": 0,
    }
    raw_scenario = re.sub(r'<think>.*?</think>', '', raw_scenario, flags=re.DOTALL).strip()
    cleaned = re.sub(r'\*+', '', raw_scenario)

    shot_section = re.search(
        r'SHOT LIST\s*\n(.*?)(?=-{4,}\s*CAPTION|$)',
        cleaned, re.DOTALL | re.IGNORECASE
    )
    if shot_section:
        shot_text = shot_section.group(1)
        # Split in front of every "SHOT <n>" header, keeping the header with
        # its block.
        shot_blocks = re.split(r'\n(?=SHOT\s+\d+)', shot_text.strip())

        for block in shot_blocks:
            block = block.strip()
            if not block:
                continue

            shot_num_match = re.match(r'SHOT\s+(\d+)', block)
            if not shot_num_match:
                # Preamble or stray text ahead of the first shot.
                continue
            shot_number = int(shot_num_match.group(1))

            timestamp         = extract_field("Timestamp",        "Beat",              block)
            beat              = extract_field("Beat",             "Voiceover",         block)
            voiceover         = extract_field("Voiceover",        "Text on screen",    block)
            text_on_screen    = extract_field("Text on screen",   "Image description", block)
            image_description = extract_field("Image description", None,               block)

            # Display form always uses an en dash, whatever the model wrote.
            timestamp_display = re.sub(r'\s*[–—-]+\s*', '–', timestamp)

            start_sec, end_sec = _parse_timestamp_seconds(timestamp)
            duration_sec       = (end_sec - start_sec) if (start_sec is not None and end_sec is not None) else None
            word_count         = len(voiceover.split()) if voiceover else 0
            speech_duration    = round(word_count / WORDS_PER_SECOND, 1)
            dead_air           = round(duration_sec - speech_duration, 1) if duration_sec is not None else None

            result["shots"].append({
                "shot_number":         shot_number,
                "timestamp":           timestamp_display,
                "start_sec":           start_sec,
                "end_sec":             end_sec,
                "duration_sec":        duration_sec,
                "beat":                beat.upper(),
                "voiceover":           voiceover,
                "word_count":          word_count,
                "speech_duration_sec": speech_duration,
                "dead_air_sec":        dead_air,
                "text_on_screen":      text_on_screen,
                "image_description":   image_description,
            })

    # The reel length is where the last shot ends. Shots whose timestamp could
    # not be parsed carry end_sec=None and are ignored here.
    end_times = [shot["end_sec"] for shot in result["shots"] if shot["end_sec"] is not None]
    if end_times:
        result["total_duration_sec"] = max(end_times)

    caption_section = re.search(
        r'CAPTION\s*\n(.*?)$',
        cleaned, re.DOTALL | re.IGNORECASE
    )

    if caption_section:
        caption_text = caption_section.group(1).strip()
        lines = [line.strip() for line in caption_text.splitlines() if line.strip()]
        # Hashtags may wrap over several lines; collect them from every line
        # that starts with "#", not just the first one.
        hashtag_lines = [line for line in lines if line.startswith("#")]
        body_lines    = [line for line in lines if not line.startswith("#")]
        result["caption"] = {
            "body":     "\n".join(body_lines).strip().strip('"'),
            "hashtags": [tag for line in hashtag_lines for tag in re.findall(r'#\w+', line)],
        }

    return result


if __name__ == '__main__':
    # Script entry point: read the topic summary, generate the raw scenario
    # with the LLM, parse it, and write the structured result as JSON.

    # Read as UTF-8 explicitly: the platform default encoding would mis-decode
    # (or reject) non-ASCII punctuation in the topic text on some systems.
    with open("topic_description.txt", "r", encoding="utf-8") as f:
        topic = f.read()

    model, tokenizer = load_model()
    scenario_raw = generate_reel_scenario(model, tokenizer, topic)

    parsed = parse_reel_scenario(scenario_raw)

    # json.dump keeps its default ensure_ascii=True, so the file content stays
    # pure ASCII and is readable under any encoding.
    with open("reel_script.json", "w", encoding="utf-8") as f:
        json.dump(parsed, f)