
    ui3                     T   d Z ddlZddlZddlZddlZddlZddlZddlZdZdZ	dZ
e
fdededed	ed
ef
dZd(dededed
efdZd)dededed
efdZd*dZdededefdZdededefdZd+dedededed
ef
dZdddddddddeded ed!ed"ed#ed$edededed
efd%Zd& Zed'k(  r e        yy),u  Detect voice onset in a VO take and align it to the cue start.

Strategy (phase 1 — anchored inside the green focal area):
  1. Detect voice onset in the take file. Cascade silencedetect at a few
     thresholds, then fall back to an RMS envelope on decoded PCM when
     silencedetect doesn't find a plausible leading silence.
  2. Target position inside the file = preroll_ms + anchor_offset_ms, so
     voice lands slightly INSIDE cue.start on the master timeline (user
     asked for "somewhat inside" instead of exactly at the edge).
  3. Before shifting, sanity-check that the detected onset is inside the
     plausible green-zone window (preroll_ms ± shadow_lead on the early side,
     preroll_ms + cue_duration + shadow_trail on the late side). If it's
     outside that window, detection lied — skip the shift entirely and
     leave the take as recorded. Never clobber the take with a nonsense
     pad/trim that pushes voice outside the green zone.
  4. If detection is plausible, trim (actor came in late) or pad (actor came
     in early) so voice lands at target_in_file_ms.

Output is a re-encoded webm (libopus 128k) so edit points are sample-accurate.
Requires ffmpeg on PATH. Uses numpy for the RMS fallback.
    N)      Dg     @g      <g?gQ?
input_paththreshold_dbmin_silence_smin_voice_burst_sreturnc           
      >   dddd| dd| d| dd	d
g
}t        j                  |dd      }|j                  }t        j                  d|      D cg c]  }t        |j                  d             }}t        j                  d|      D cg c]  }t        |j                  d             }	}|r
|d   dk  r|	syt        d      }
t        |	      D ]A  \  }}|dz   t        |      k  r||dz      n|
}||z
  |k\  s*t        t        |dz              c S  yc c}w c c}w )u  Run silencedetect once and return voice-onset ms if the file has
    leading silence starting within 10ms of t=0 AND the first non-silent
    burst sustains for at least min_voice_burst_s.

    Sub-burst transients (record-start clicks, mouse clicks captured a few
    ms above threshold) are skipped — the algorithm walks past to the first
    silence_end whose subsequent voice burst is genuinely sustained. Without
    this gate, a sub-millisecond click at e.g. 80 ms gets returned as voice
    onset and the cascade locks in a false positive (B2 from 2026-04-30).

    Returns -1 if no silence_end has a sustained voice burst — caller
    relaxes the threshold (next cascade step) or falls back to the RMS
    envelope detector.ffmpeg-hide_bannerz-nostats-i-afzsilencedetect=noise=zdB:duration=-fnull-T)capture_outputtextzsilence_start:\s*([\d.]+)   zsilence_end:\s*([\d.]+)r   g{Gz?inf  )
subprocessrunstderrrefinditerfloatgroup	enumeratelenintround)r   r   r   r   cmdoutr   msilence_startssilence_endsINFiend
next_starts                 9/home/booth/atri-vox-suite/vo-booth/scripts/align_take.py_silencedetect_onset_msr,   2   s/     	.*j%l^<Ofc	C ..T
=CZZF13=Y[a1bcAeAGGAJ'cNc13=W[a1bcAeAGGAJ'cLc~a047L ,CL) *3./!ec.6I.I^AE*s
00uS4Z())*  dcs   !D
!Dmax_duration_s	window_msc                    	 ddl }d}ddddd	| d
|ddddt        |      dddg}	 t        j                  |dd      }|j                  |j                  |j                        j                  |j                        dz  }|j                  dk(  ryt        dt        ||z  dz              }|j                  |z  }	|	dk  ry|j                  |d|	|z   j                  |	|      dz  j!                  d            }
t        dd|z        }t#        |j%                  |
d|             dz   }t        |dz  d      }|
|kD  }|j'                         syt)        t+        |      dz
        D ]  }||   s	||dz      s||z  c S  t        |j-                               |z  S # t        $ r Y yw xY w# t        j
                  $ r Y yw xY w)u   Fallback onset detector — decode the first `max_duration_s` of audio to
    mono PCM and find the first sustained rise above the leading noise floor.

    Returns onset in ms, or -1 if the envelope is flat / decode failed.r   Nr   i>  r
   r   	-loglevelerrorr   z-tz.2fz-ac1z-arr   s16ler   T)r   check)dtypeg      @r   r         )axis   gư>g      @g{Gz?)numpyImportErrorstrr   r   CalledProcessError
frombufferstdoutint16astypefloat32sizemaxr    sqrtreshapemeanr   mediananyranger   argmax)r   r-   r.   npsrr"   r#   pcmwinnrmshead_nfloorthreshabover(   s                   r+   _rms_envelope_onset_msrV   Z   s   
 
B.+wj$sE3r7gsCnnSTB --

"((-
3
:
:2::
F
PC
xx1}
aR)^d*+
,CCA1u
''3xC=((C0A5;;;C
DCC9$%F"))CL)*T1E d#F&LE99; 3u:>" !8a!ey= ! u||~**I   (( s"   F F* 	F'&F'*G ?G r   皙?c                 r    t         D ]  }t        | |t              }|dkD  s|c S  t        |       }|dk\  r|S y)u   Return ms where voice starts within the file. Cascades silence thresholds,
    then falls back to RMS envelope. Always returns ≥ 0 (0 = file starts hot).r   )_SILENCE_THRESHOLDS_DBr,   _SILENCE_MIN_DURATION_SrV   )r   r   r   thronset	rms_onsets         r+   detect_onset_msr^      sJ     & '
C9PQ19L
 'z2IA~    c           	          	 t        j                  | ddd       y # t         j                  $ r2}t        d| d|j                  j                         d d        |d }~ww xY w)NT)r4   r   r   zffmpeg failed (z): i  )r   r   r=   RuntimeErrorr   strip)argsdescriptiones      r+   _ffmpegrf      s`    ^t44H(( ^_[MQXX^^=Mds=S<TUV\]]^s    A!-AA!output_pathtrim_msc                 D    t        dddddd|dz  dd	| d
ddd|gd       y )Nr
   r   r0   r1   -yz-ssr   z.3fr   -c:alibopus-b:a128ktrimrd   rf   )r   rg   rh   s      r+   trim_from_startrr      s>    .+w'$,s#j	66 r_   pad_msc                 F    t        dddddd| dd| d	| d
ddd|gd       y )Nr
   r   r0   r1   rj   r   r   zadelay=|rk   rl   rm   rn   padrp   rq   )r   rg   rs   s      r+   pad_silence_at_startrw      sB    .+wj&*	66 r_      shift_mstolerance_msc                    t        |      |k  rt        j                  | |       d}n%|dkD  rt        | ||       d}nt	        | ||        d}t
        j                  j                  |       t
        j                  j                  |      |||dS )u#  Shift the audio timing by shift_ms (applied ABSOLUTELY to input_path).

    shift_ms > 0 → trim shift_ms from the start (voice plays earlier on timeline)
    shift_ms < 0 → pad |shift_ms| of silence at the start (voice plays later)
    |shift_ms| ≤ tolerance_ms → copy unchanged
    copyr   ro   rv   )inputoutputry   actionrz   )absshutilcopyfilerr   rw   ospathabspath)r   rg   ry   rz   r   s        r+   
shift_taker      s     8}$
K0	A
K:Zxi@ ,''//+.$ r_   P   ,    cue_duration_msanchor_offset_msshadow_lead_msshadow_trail_msr   r   rz   
preroll_msr   r   r   r   c                   t        | ||      }
t        d||z
        }||dkD  r|ndz   |z   }|
|k  s|
|kD  rbt        j                  | |       t        j
                  j                  |       t        j
                  j                  |      dd|	|
|||||gd||dS ||z   }|
|z
  }t        | |||	      }|
|d<   ||d<   ||d<   ||d	<   ||d
<   ||d<   |S )a[  Detect voice onset and shift so voice lands at preroll_ms + anchor_offset_ms.

    cue_duration_ms: width of the green focal area. When > 0, the detected onset
        is sanity-checked to fall within the plausible green-zone window. If it
        falls outside, the shift is skipped entirely (take is left as recorded).
    anchor_offset_ms: how far past cue.start to target voice onset. Small positive
        value puts voice "somewhat inside" the cue rather than exactly at the edge.
    shadow_lead_ms / shadow_trail_ms: tolerance on either side of the cue window
        for the sanity check.
    r   i  skiponset_out_of_green_zone)r}   r~   ry   r   rz   detected_onset_msr   r   r   plausible_range_msreasonr   r   r   r   r   r   r   r   )r^   rD   r   r   r   r   r   r   )r   rg   r   r   r   r   r   r   r   rz   onset_msmin_expectedmax_expectedtarget_in_file_msry   infos                   r+   align_take_to_cuer      s   . z<GH q*~56LOa4GTRUddL,(\"9 	
K0WW__Z0ggook2(!)$ 0.#/">/(*
 	
  #%55++Hj+xFD (D	#D/D	-D	'D)DKr_   c                     t        j                  t        j                         d         } | j	                  dd       | j	                  dd       | j	                  dt
        d	d
       | j	                  dt
        dd       | j	                  dt
        dd       | j	                  dt
        dd       | j	                  dt
        dd       | j	                  dt        dd       | j	                  dt        dd       | j	                  dt
        dd        | j                         }t        |j                  |j                  |j                  |j                  |j                  |j                  |j                  |j                   |j"                  |j$                  !
      }t'        t)        j*                  |d"#             y )$Nr   rp   r}   z Input audio file (webm/wav/etc.))helpr~   zOutput webm filez	--prerollTz:Expected voice-onset position in ms (the cue's preroll_ms))typerequiredr   z--cue-durationzKCue duration ms (for green-zone sanity check); 0 = unknown, use wide window)r   defaultr   z--anchor-offsetr   z5Target voice this many ms past cue.start (default 80)z--shadow-leadr   zEarly-side shadow tolerance msz--shadow-trailr   zLate-side shadow tolerance msz--thresholdr   z"Silence threshold dB (default -40)z--min-silencerW   z$Min silence duration s (default 0.1)z--tolerancerx   u-   No-op if |shift| ≤ this many ms (default 5)r   r7   )indent)argparseArgumentParser__doc__
splitlinesadd_argumentr    r   
parse_argsr   r}   r~   prerollcue_durationanchor_offsetshadow_leadshadow_trail	thresholdmin_silence	toleranceprintjsondumps)aprc   results      r+   _clir     sw   		 	 W-?-?-A!-D	EBOOG"DOEOOH#5O6OOKcDU  WOO$3f  hOO%CP  ROOO#sAaObOO$3BaObOOMuCgOhOOO%CiOjOOMQ=lOm==?D

DKK))++''))^^&&^^	F 
$**VA
&'r_   __main__)g      @   )r   rW   ) )rx   )r   r   r   r   r   r   r   sysrY   rZ   _MIN_VOICE_BURST_Sr<   r   r    r,   rV   r^   rf   rr   rw   dictr   r   r   __name__ r_   r+   <module>r      s  *   	 	   

 /     8J% %5 %QV %/4%NQ%P++s ++E ++TW ++ad ++\ 5 QV ad  ^ #  S s C 3 S C s [_ > === =
 = = = = = = = 
=@(: zF r_   