#!/usr/bin/env python3 """ xlsx_to_cues.py — Ingest the master script .xlsx and emit a normalized cue list the booth can load directly. Reads exactly six columns from sheet "Script": col 1 Ep → episode (zero-padded string: "01") col 2 Line → line (int) col 5 Character → character (string; "/" is a legitimate name) col 6 Subtitles → phrase (string) col 18 START TC → start_ms (int, parsed from SRT "HH:MM:SS,mmm") col 19 END TC → end_ms (int) Everything else in the spreadsheet is ignored. Writes to: /script.cue.json Usage (from repo root): python3 vo-booth/scripts/xlsx_to_cues.py python3 vo-booth/scripts/xlsx_to_cues.py --input path/to/other.xlsx """ from __future__ import annotations import argparse import datetime as dt import json import sys from collections import Counter from pathlib import Path import openpyxl SCRIPT_DIR_DEFAULT = ( Path(__file__).resolve().parent.parent / "references" / "episodes" / "Script" ) OUTPUT_FILENAME = "script.cue.json" COL_EPISODE = 1 # "Ep " COL_LINE = 2 # "Line" COL_CHARACTER = 5 # "Character" COL_PHRASE = 6 # "Subtitles" COL_START_TC = 18 # "START TC" — SRT "HH:MM:SS,mmm" COL_END_TC = 19 # "END TC" def parse_tc(value) -> int | None: """Parse a timecode cell into milliseconds. Accepts SRT strings ("00:00:01,939") and openpyxl datetime.time values.""" if value is None: return None if isinstance(value, dt.time): return ( value.hour * 3_600_000 + value.minute * 60_000 + value.second * 1_000 + value.microsecond // 1_000 ) if isinstance(value, str): s = value.strip().replace(",", ".") if not s: return None try: h, m, rest = s.split(":") sec, _, ms = rest.partition(".") return ( int(h) * 3_600_000 + int(m) * 60_000 + int(sec) * 1_000 + int(ms.ljust(3, "0")[:3] or 0) ) except ValueError: return None return None def norm_episode(value) -> str | None: """Coerce episode cell to a zero-padded 2-digit string.""" if value is None: return None s = str(value).strip() if not s: return None if s.isdigit(): return s.zfill(2) return s # let non-numeric episode tags pass through untouched def find_xlsx(script_dir: Path) -> Path: xlsx_files = sorted(p for p in script_dir.glob("*.xlsx") if not p.name.startswith("~$")) if not xlsx_files: sys.exit(f"no .xlsx found in {script_dir}") if len(xlsx_files) > 1: sys.exit( f"multiple .xlsx files in {script_dir}; pass --input to pick one:\n " + "\n ".join(str(p) for p in xlsx_files) ) return xlsx_files[0] def main() -> int: ap = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) ap.add_argument("--input", type=Path, help="path to the master .xlsx (defaults to the single .xlsx under references/episodes/Script/)") ap.add_argument("--output", type=Path, help=f"output path (defaults to /{OUTPUT_FILENAME})") args = ap.parse_args() xlsx_path = args.input or find_xlsx(SCRIPT_DIR_DEFAULT) out_path = args.output or xlsx_path.parent / OUTPUT_FILENAME print(f"reading {xlsx_path}", file=sys.stderr) wb = openpyxl.load_workbook(xlsx_path, data_only=True, read_only=True) if "Script" not in wb.sheetnames: sys.exit(f"sheet 'Script' not found in {xlsx_path}; sheets present: {wb.sheetnames}") ws = wb["Script"] cues: list[dict] = [] skipped = Counter() # Skip header row; iterate by 1-indexed column via row tuple [col-1] for row_idx, row in enumerate(ws.iter_rows(min_row=2, values_only=True), start=2): episode = norm_episode(row[COL_EPISODE - 1]) line = row[COL_LINE - 1] character = row[COL_CHARACTER - 1] phrase = row[COL_PHRASE - 1] start_ms = parse_tc(row[COL_START_TC - 1]) end_ms = parse_tc(row[COL_END_TC - 1]) if episode is None: skipped["no episode"] += 1 continue if not isinstance(line, int): skipped["no line number"] += 1 continue if character is None or str(character).strip() == "": skipped["no character"] += 1 continue if start_ms is None or end_ms is None: skipped["missing timecode"] += 1 continue cues.append({ "episode": episode, "line": int(line), "character": str(character).strip(), "phrase": "" if phrase is None else str(phrase), "start_ms": start_ms, "end_ms": end_ms, }) out_path.write_text( json.dumps(cues, ensure_ascii=False, indent=2) + "\n", encoding="utf-8", ) per_ep = Counter(c["episode"] for c in cues) print(f"wrote {out_path}", file=sys.stderr) print(f" {len(cues)} cues across {len(per_ep)} episodes", file=sys.stderr) if skipped: print(f"skipped {dict(skipped)}", file=sys.stderr) top = sorted(per_ep.items())[:10] print(f"first 10 episodes: {top}", file=sys.stderr) return 0 if __name__ == "__main__": sys.exit(main())