+
    ui3                        R t ^ RIt^ RIt^ RIt^ RIt^ RIt^ RIt^ RItR tRt	Rt
]
3R R lltR!R R lltR"R	 R
 lltR#R ltR R ltR R ltR$R R lltR^ R^PRRRRRRRRR^/R R lltR t]R8X  d
   ]! 4        R# R# )%u  Detect voice onset in a VO take and align it to the cue start.

Strategy (phase 1 — anchored inside the green focal area):
  1. Detect voice onset in the take file. Cascade silencedetect at a few
     thresholds, then fall back to an RMS envelope on decoded PCM when
     silencedetect doesn't find a plausible leading silence.
  2. Target position inside the file = preroll_ms + anchor_offset_ms, so
     voice lands slightly INSIDE cue.start on the master timeline (user
     asked for "somewhat inside" instead of exactly at the edge).
  3. Before shifting, sanity-check that the detected onset is inside the
     plausible green-zone window (preroll_ms ± shadow_lead on the early side,
     preroll_ms + cue_duration + shadow_trail on the late side). If it's
     outside that window, detection lied — skip the shift entirely and
     leave the take as recorded. Never clobber the take with a nonsense
     pad/trim that pushes voice outside the green zone.
  4. If detection is plausible, trim (actor came in late) or pad (actor came
     in early) so voice lands at target_in_file_ms.

Output is a re-encoded webm (libopus 128k) so edit points are sample-accurate.
Requires ffmpeg on PATH. Uses numpy for the RMS fallback.
Ng?gQ?c          
      T    V ^8  d   QhR\         R\        R\        R\        R\        /# )   
input_paththreshold_dbmin_silence_smin_voice_burst_sreturnstrfloatint)formats   "V/Users/a11/Documents/CLAUDE_ENVIRONMENTS/ATRI-VOX-SUITE/vo-booth/scripts/align_take.py__annotate__r   2   s2     % % %5 %QV %/4%NQ%    c           
        RRRRV RRV RV 2RR	R
.
p\         P                  ! VRRR7      pVP                  p\        P                  ! RV4       Uu. uF  p\        VP                  ^4      4      NK  	  pp\        P                  ! RV4       Uu. uF  p\        VP                  ^4      4      NK  	  p	pV'       d   V^ ,          R8  d	   V	'       g   R# \        R4      p
\        V	4       FW  w  rV^,           \        V4      8  d   W^,           ,          MT
pW,
          V8  g   K<  \        \        VR,          4      4      u # 	  R# u upi u upi )u  Run silencedetect once and return voice-onset ms if the file has
leading silence starting within 10ms of t=0 AND the first non-silent
burst sustains for at least min_voice_burst_s.

Sub-burst transients (record-start clicks, mouse clicks captured a few
ms above threshold) are skipped — the algorithm walks past to the first
silence_end whose subsequent voice burst is genuinely sustained. Without
this gate, a sub-millisecond click at e.g. 80 ms gets returned as voice
onset and the cascade locks in a false positive (B2 from 2026-04-30).

Returns -1 if no silence_end has a sustained voice burst — caller
relaxes the threshold (next cascade step) or falls back to the RMS
envelope detector.ffmpeg-hide_bannerz-nostats-i-afzsilencedetect=noise=zdB:duration=-fnull-T)capture_outputtextzsilence_start:\s*([\d.]+)zsilence_end:\s*([\d.]+)g{Gz?inf  )
subprocessrunstderrrefinditerr   group	enumeratelenr   round)r   r   r   r   cmdoutr    msilence_startssilence_endsINFiend
next_starts   &&&&          r   _silencedetect_onset_msr0   2   s.     	.*j%l^<Ofc	C ..T
=CZZF13=Y[a1bc1bAeAGGAJ'1bNc13=W[a1bc1bAeAGGAJ'1bLc~a047L	 ,CL)./!ec.6I.I^E*s
00uS4Z()) * I dcs   #E#Ec                H    V ^8  d   QhR\         R\        R\        R\        /# )r   r   max_duration_s	window_msr   r	   )r   s   "r   r   r   Z   s*     ++ ++s ++E ++TW ++ad ++r   c                :    ^ RI pRpRRRRRT RTR	 R
RR\        T4      RRR.p \        P                  ! TRRR7      pTP                  TP                  TP                  R7      P                  TP                  4      R,          pTP                  ^ 8X  d   R# \        ^\        YB,          R,          4      4      pTP                  T,          p	T	^8  d   R# TP                  TRY,           P                  Y4      ^,          P!                  ^R7      4      p
\        ^^T,          4      p\#        TP%                  T
RT 4      4      R,           p\        TR,          R4      pY8  pTP'                  4       '       g   R# \)        \+        T4      ^,
          4       F2  pY,          '       g   K  Y^,           ,          '       g   K*  Y,          u # 	  \        TP-                  4       4      T,          #   \         d    Ru # i ; i  \        P
                   d    Ru # i ; i)u   Fallback onset detector — decode the first `max_duration_s` of audio to
mono PCM and find the first sustained rise above the leading noise floor.

Returns onset in ms, or -1 if the envelope is flat / decode failed.Ni>  r   r   	-loglevelerrorr   z-tz.2fz-ac1z-arr   s16ler   T)r   check)dtypeg      @r   )axisgư>g      @g{Gz?r   )numpyImportErrorr
   r   r   CalledProcessError
frombufferstdoutint16astypefloat32sizemaxr   sqrtreshapemeanr   mediananyranger%   argmax)r   r2   r3   npsrr'   r(   pcmwinnrmshead_nfloorthreshabover-   s   &&&             r   _rms_envelope_onset_msrW   Z   s   
 
B.+wj$sE3r7gsCnnSTB --

"((-
3
:
:2::
F
PC
xx1}	
aR^d*+
,CCA1u	
''3x=((0A5;;;C
DCC9$%F"))CL)*T1E d#FLE99;;	 3u:>"88!e=  # u||~**I  	 (( 	s"   G- H  -G=<G= HH皙?c                H    V ^8  d   QhR\         R\        R\        R\        /# )r   r   r   r   r   r	   )r   s   "r   r   r      s*       5 QV ad r   c                ~    \          F  p\        W\        4      pV^ 8  g   K  Vu # 	  \        V 4      pV^ 8  d   V# ^ # )u   Return ms where voice starts within the file. Cascades silence thresholds,
then falls back to RMS envelope. Always returns ≥ 0 (0 = file starts hot).)_SILENCE_THRESHOLDS_DBr0   _SILENCE_MIN_DURATION_SrW   )r   r   r   thronset	rms_onsets   &&&   r   detect_onset_msr`      sC     &'
9PQ19L &
 'z2IA~r   c                      \         P                  ! V R R R R7       R#   \         P                   d7   p\        RT RTP                  P                  4       R,           24      ThRp?ii ; i)T)r9   r   r   zffmpeg failed (z): :Ni  NN)r   r   r>   RuntimeErrorr    strip)argsdescriptiones   && r   _ffmpegrg      s^    ^t44H(( ^_[MQXX^^=Md=S<TUV\]]^s    A)1A$$A)c                <    V ^8  d   QhR\         R\         R\        /# )r   r   output_pathtrim_msr
   r   )r   s   "r   r   r      s!       #  r   c                 N    \        R RRRRRVR,          R RV R	R
RRV.RR7       R# )r   r   r5   r6   -yz-ssr   z.3fr   -c:alibopus-b:a128ktrimre   Nrg   )r   ri   rj   s   &&&r   trim_from_startru      s>    .+w'$,s#j	66 r   c                <    V ^8  d   QhR\         R\         R\        /# )r   r   ri   pad_msrk   )r   s   "r   r   r      s!      S s C r   c                 H    \        R RRRRRV RRV RV 2R	R
RRV.RR7       R# )r   r   r5   r6   rm   r   r   zadelay=|rn   ro   rp   rq   padrs   Nrt   )r   ri   rw   s   &&&r   pad_silence_at_startr{      sB    .+wj&*	66 r   c          
      T    V ^8  d   QhR\         R\         R\        R\        R\        /# )r   r   ri   shift_mstolerance_msr   )r
   r   dict)r   s   "r   r   r      s0      3 S C s [_ r   c           
     *   \        V4      V8:  d   \        P                  ! W4       RpM%V^ 8  d   \        WV4       RpM\	        WV) 4       RpR\
        P                  P                  V 4      R\
        P                  P                  V4      RVRVRV/# )	u  Shift the audio timing by shift_ms (applied ABSOLUTELY to input_path).

shift_ms > 0 → trim shift_ms from the start (voice plays earlier on timeline)
shift_ms < 0 → pad |shift_ms| of silence at the start (voice plays later)
|shift_ms| ≤ tolerance_ms → copy unchanged
copyrr   rz   inputoutputr}   actionr~   )absshutilcopyfileru   r{   ospathabspath)r   ri   r}   r~   r   s   &&&& r   
shift_taker      s     8}$
0	A
:Zxi@ 	,"''//+.H& r   cue_duration_msanchor_offset_msshadow_lead_ms,  shadow_trail_ms  r   r   r~   c                    V ^8  d   QhR\         R\         R\        R\        R\        R\        R\        R\        R	\        R
\        R\        /# )r   r   ri   
preroll_msr   r   r   r   r   r   r~   r   )r
   r   r   r   )r   s   "r   r   r      sv     = === =
 = = = = = = = 
=r   c                  \        WV4      p
\        ^ W%,
          4      pY#^ 8  d   TMR,           V,           pW8  g   W8  dn   \        P                  ! W4       R\        P
                  P                  V 4      R\        P
                  P                  V4      R^ RRRV	RV
R	VR
VRVRW.RRRVRV/# W$,           pW,
          p\        WW4      pWR&   W/R	&   WOR
&   W?R&   WR&   WR&   V# )a;  Detect voice onset and shift so voice lands at preroll_ms + anchor_offset_ms.

cue_duration_ms: width of the green focal area. When > 0, the detected onset
    is sanity-checked to fall within the plausible green-zone window. If it
    falls outside, the shift is skipped entirely (take is left as recorded).
anchor_offset_ms: how far past cue.start to target voice onset. Small positive
    value puts voice "somewhat inside" the cue rather than exactly at the edge.
shadow_lead_ms / shadow_trail_ms: tolerance on either side of the cue window
    for the sanity check.
i  r   r   r}   r   skipr~   detected_onset_msr   r   r   plausible_range_msreasononset_out_of_green_zoner   r   )r`   rE   r   r   r   r   r   r   )r   ri   r   r   r   r   r   r   r   r~   onset_msmin_expectedmax_expectedtarget_in_file_msr}   infos   &&&$$$$$$$      r   align_take_to_cuer      s   . zGH q*56La4GTRUddL("9 	
0RWW__Z0bggook2fL* 0 <">/L]
 	
  #5+HjxFD (	#/	-	')Kr   c                     \         P                  ! \        P                  4       ^ ,          R7      p V P	                  RRR7       V P	                  RRR7       V P	                  R\
        RR	R
7       V P	                  R\
        ^ RR7       V P	                  R\
        ^PRR7       V P	                  R\
        RRR7       V P	                  R\
        RRR7       V P	                  R\        R RR7       V P	                  R\        RRR7       V P	                  R\
        ^RR7       V P                  4       p\        VP                  VP                  VP                  VP                  VP                  VP                  VP                  VP                   VP"                  VP$                  R7
      p\'        \(        P*                  ! V^R7      4       R# )!    rs   r   z Input audio file (webm/wav/etc.))helpr   zOutput webm filez	--prerollTz:Expected voice-onset position in ms (the cue's preroll_ms))typerequiredr   z--cue-durationzKCue duration ms (for green-zone sanity check); 0 = unknown, use wide window)r   defaultr   z--anchor-offsetz5Target voice this many ms past cue.start (default 80)z--shadow-leadr   zEarly-side shadow tolerance msz--shadow-trailr   zLate-side shadow tolerance msz--thresholdz"Silence threshold dB (default -40)z--min-silencerX   z$Min silence duration s (default 0.1)z--toleranceu-   No-op if |shift| ≤ this many ms (default 5))r   r   r   r   r   r   r~   )indentN      D)argparseArgumentParser__doc__
splitlinesadd_argumentr   r   
parse_argsr   r   r   prerollcue_durationanchor_offsetshadow_leadshadow_trail	thresholdmin_silence	toleranceprintjsondumps)aprd   results      r   _clir     sw   		 	 W-?-?-A!-D	EBOOG"DOEOOH#5O6OOKcDU  WOO$3f  hOO%CP  ROOO#sAaObOO$3BaObOOMuCgOhOOO%CiOjOOMQ=lOm==?D

DKK))++''))^^&&^^	F 
$**VA
&'r   __main__r   )r   g     @g      <)g      @   )r   rX   ) )   )r   r   r   r   r!   r   r   sysr[   r\   _MIN_VOICE_BURST_Sr0   rW   r`   rg   ru   r{   r   r   r   __name__ r   r   <module>r      s   *   	 	   

 /     8J%P++\ ^4=
 = = = =  = = =@(: zF r   