#!/usr/bin/env python3
"""
xlsx_to_cues.py — Ingest the master script .xlsx and emit a normalized cue
list the booth can load directly.

Reads exactly six columns from sheet "Script":
    col 1  Ep         → episode  (zero-padded string: "01")
    col 2  Line       → line     (int)
    col 5  Character  → character (string; "/" is a legitimate name)
    col 6  Subtitles  → phrase   (string)
    col 18 START TC   → start_ms (int, parsed from SRT "HH:MM:SS,mmm")
    col 19 END TC     → end_ms   (int)

Everything else in the spreadsheet is ignored.

Writes to:
    <script_dir>/script.cue.json

Usage (from repo root):
    python3 vo-booth/scripts/xlsx_to_cues.py
    python3 vo-booth/scripts/xlsx_to_cues.py --input path/to/other.xlsx
"""

from __future__ import annotations

import argparse
import datetime as dt
import json
import sys
from collections import Counter
from pathlib import Path

import openpyxl

SCRIPT_DIR_DEFAULT = (
    Path(__file__).resolve().parent.parent
    / "references"
    / "episodes"
    / "Script"
)
OUTPUT_FILENAME = "script.cue.json"

COL_EPISODE = 1   # "Ep "
COL_LINE = 2     # "Line"
COL_CHARACTER = 5  # "Character"
COL_PHRASE = 6    # "Subtitles"
COL_START_TC = 18  # "START TC"  — SRT "HH:MM:SS,mmm"
COL_END_TC = 19   # "END TC"


def parse_tc(value) -> int | None:
    """Parse a timecode cell into milliseconds. Accepts SRT strings
    ("00:00:01,939") and openpyxl datetime.time values."""
    if value is None:
        return None
    if isinstance(value, dt.time):
        return (
            value.hour * 3_600_000
            + value.minute * 60_000
            + value.second * 1_000
            + value.microsecond // 1_000
        )
    if isinstance(value, str):
        s = value.strip().replace(",", ".")
        if not s:
            return None
        try:
            h, m, rest = s.split(":")
            sec, _, ms = rest.partition(".")
            return (
                int(h) * 3_600_000
                + int(m) * 60_000
                + int(sec) * 1_000
                + int(ms.ljust(3, "0")[:3] or 0)
            )
        except ValueError:
            return None
    return None


def norm_episode(value) -> str | None:
    """Coerce episode cell to a zero-padded 2-digit string."""
    if value is None:
        return None
    s = str(value).strip()
    if not s:
        return None
    if s.isdigit():
        return s.zfill(2)
    return s  # let non-numeric episode tags pass through untouched


def find_xlsx(script_dir: Path) -> Path:
    xlsx_files = sorted(p for p in script_dir.glob("*.xlsx") if not p.name.startswith("~$"))
    if not xlsx_files:
        sys.exit(f"no .xlsx found in {script_dir}")
    if len(xlsx_files) > 1:
        sys.exit(
            f"multiple .xlsx files in {script_dir}; pass --input to pick one:\n  "
            + "\n  ".join(str(p) for p in xlsx_files)
        )
    return xlsx_files[0]


def main() -> int:
    ap = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument("--input", type=Path, help="path to the master .xlsx (defaults to the single .xlsx under references/episodes/Script/)")
    ap.add_argument("--output", type=Path, help=f"output path (defaults to <input dir>/{OUTPUT_FILENAME})")
    args = ap.parse_args()

    xlsx_path = args.input or find_xlsx(SCRIPT_DIR_DEFAULT)
    out_path = args.output or xlsx_path.parent / OUTPUT_FILENAME

    print(f"reading  {xlsx_path}", file=sys.stderr)
    wb = openpyxl.load_workbook(xlsx_path, data_only=True, read_only=True)
    if "Script" not in wb.sheetnames:
        sys.exit(f"sheet 'Script' not found in {xlsx_path}; sheets present: {wb.sheetnames}")
    ws = wb["Script"]

    cues: list[dict] = []
    skipped = Counter()

    # Skip header row; iterate by 1-indexed column via row tuple [col-1]
    for row_idx, row in enumerate(ws.iter_rows(min_row=2, values_only=True), start=2):
        episode = norm_episode(row[COL_EPISODE - 1])
        line = row[COL_LINE - 1]
        character = row[COL_CHARACTER - 1]
        phrase = row[COL_PHRASE - 1]
        start_ms = parse_tc(row[COL_START_TC - 1])
        end_ms = parse_tc(row[COL_END_TC - 1])

        if episode is None:
            skipped["no episode"] += 1
            continue
        if not isinstance(line, int):
            skipped["no line number"] += 1
            continue
        if character is None or str(character).strip() == "":
            skipped["no character"] += 1
            continue
        if start_ms is None or end_ms is None:
            skipped["missing timecode"] += 1
            continue

        cues.append({
            "episode": episode,
            "line": int(line),
            "character": str(character).strip(),
            "phrase": "" if phrase is None else str(phrase),
            "start_ms": start_ms,
            "end_ms": end_ms,
        })

    out_path.write_text(
        json.dumps(cues, ensure_ascii=False, indent=2) + "\n",
        encoding="utf-8",
    )

    per_ep = Counter(c["episode"] for c in cues)
    print(f"wrote    {out_path}", file=sys.stderr)
    print(f"         {len(cues)} cues across {len(per_ep)} episodes", file=sys.stderr)
    if skipped:
        print(f"skipped  {dict(skipped)}", file=sys.stderr)
    top = sorted(per_ep.items())[:10]
    print(f"first 10 episodes: {top}", file=sys.stderr)
    return 0


if __name__ == "__main__":
    sys.exit(main())
