
    _LiZ              %          d Z ddlZddlZdZdZdZdZdZdZdZ	dZ
dZdZdZd	Zd
Zd Zd Zd ZdZd Zd ZdefdZd Zeeeeeee	eeeeddefdedededededededededededed ed!ed"ed#ed$ed%ef$d&Zd' Zed(k(  r e        yy))u)  Auto-correct cue start / end timecodes by scanning a reference waveform.

Primary algorithm — **hysteresis edge detection** (requires a clean voice-only
stem, e.g. HTDemucs `voice_waveform.json`):

  - Cue START = silence → voice rising edge closest to the claimed start.
  - Cue END   = voice → silence falling edge closest to the claimed end.

Thresholds are derived dynamically from the search window's peak distribution
so the detector auto-scales across quiet dialogue and shouted scenes.

Fallback — **nearest local minimum** (for intra-phrase cue splits where the
voice is continuous across the bound, so no rising/falling edge exists):

  - Smooth the peaks (box filter), enumerate local minima in the window,
    rank by depth + proximity to the claimed bound, return the best one.

The fallback is what v1 used exclusively when the waveform was mixed audio and
music made edge detection unreliable. Now that we read voice-only stems, edges
are usable and preferred.

Waveform structure: { "sampleRate": 100, "peaks": [0..1, ...] } at 10 ms steps.
    Ni  i  x   i  g333333?(   g      ?   c           	      b   t        dt        t        |dz  |z                    }|dz  }t        |       }dg|z  }t	        | dt        ||dz                }t        ||dz         }t        |      D ]<  }	||z  ||	<   |	|z   dz   }
|	|z
  }|
|k  r|| |
   z  }|dz  }|dk\  s0|| |   z  }|dz  }> |S )N                r   )maxintroundlensumminrange)peakssrsmoothing_msswhalfNout
window_sumcountiadd_irem_is               >/home/booth/atri-vox-suite/vo-booth/scripts/autocorrect_cue.py	_smoothedr   6   s    	QE,-234	5B7DE
A%!)CU1SD1H-./J4!8E1X 3e#AD1D19%,&J
A:%,&J
3 J    c           	         g }d }t        t        |dz   d      t        |t        |       dz
              D ]P  }| |   | |dz
     k  s| |   | |dz      k  s!|||z
  |k  r| |   | |   k  r||d<   |}>|j	                  |       |}R |S )Nr   )r   r   r   r   append)smoothi0i1min_spacingminimaprevks          r   _local_minimar+   H   s    FD3rAvq>3r3v;?#;< '!9va!e}$fQUm)CQX$<!9vd|+!"F2J1DMM!Qd' Mr    c           	      r   t        t        ||z              }t        t        ||z              }	t        d||	z
        }
t        t	        |       ||	z         }||
dz   k  ry t        | ||      }t        ||
|       }|dk  ry t        dt        t        |dz  |z                    }t        ||
||      }||z  }|D cg c]  }||   |k  s| }}|sy t        dt        t        |dz  |z                    }d }t        d      }|D ]*  }t        ||z
        }||z  |z  }||   |z   }||k  s'|}|}, |S c c}w )Nr      r   r   inf)	r   r   r   r   r   r   r+   floatabs)r   r   target_swindow_sr   min_spacing_msdepth_factordistance_bias_mstarget_iwinr%   r&   r$   
window_maxr'   
candidates	depth_capr*   bias_samplesbest
best_scoredistdist_penaltyscores                           r   _find_nearest_minrA   U   si    5B'(H
eHrM"
#C	Q3	B	SZC	(B	R!V|ub,/FVBr]#JQaU>D#82#=>?@Kvr2{;J\)I'B6!9	+A!BJBq#e$4t$;b$@ABCLDuJ )1x< |+z9q	L(:Jq) K Cs   ;D4	D4g?c                     t        |       }|dz  }||z
  }|}||k  r9d}	t        |||z         D ]  }
| |
   |k  s|	dz  }	 |	|k\  r||z
  S |dz  }||k  r9||z
  S )u_  Walk forward from onset_i; return number of samples until sustained silence
    (≥85% of `silence_min` samples below silence_thresh) appears.

    Used for duration-aware candidate filtering: a 200 ms burst at the snap point
    is almost certainly an onomatopoeia / breath / interjection when the cue's
    expected line duration is 2 seconds.
    333333?r   r   )r   r   )r   r   onset_isilence_threshsilence_minr   	threshold	end_limitr   silentjs              r   _voice_burst_duration_samplesrK      s     	E
Ad"IKIA
i-q!k/* 	AQx.(!	 Yw;	Q i- w;r    c           	          | || }|syt        |      }|t        t        |      dz
  t        t        |      dz                 }t	        d|t
        z        }t	        d|t        z        }||fS )N)g{Gz?gQ?r   g?g{Gz?g{Gz?)sortedr   r   r   r   DEFAULT_VOICE_FACTORDEFAULT_SILENCE_FACTOR)r   r%   r&   windowsorted_wp90voice_threshrE   s           r   _dynamic_thresholdsrT      ss    2b\Ff~H
3s8}q(#c(mc.A*BC
DCtS#778L|.DDEN''r    c	           
      \  " t        t        ||z              "t        d"t        t        ||z              z
        }	t        t	        |       "t        t        ||z              z         }
|
|	z
  dk  ryt        | |	|
      \  }}t        dt        t        |dz  |z                    }t        dt        t        |dz  |z                    }g }t        |	|z   |      }t        |
|z
  t	        |       |z
        }t        ||      D ]m  }d}t        |||z         D ]  }| |   |kD  s|dz  } ||dz  k  r0d}t        ||z
  |      D ]  }| |   |k  s|dz  } ||dz  k  r]|j                  |       o |syd}||dk  rt        |"fd	
      }nr||z  }d}t        d      }d}|D ]R  }t        | ||||      }t        |"z
        |z  }|dkD  r||z  nd}||k  r||z
  |z  dz  }nd}||z   } | |k  sM| }|}|}T ||k  rd}|}!|!dkD  r!| |!dz
     |kD  r|!dz  }!|!dkD  r| |!dz
     |kD  r|!|fS )u  Find silence → voice transition index closest to target_s within the
    asymmetric window [target - back_s, target + forward_s].

    Returns (onset_idx, low_confidence) tuple or (None, False) if no candidate.

    Two-pass detection:
      1. Anchor scan — collect ALL positions where sustained voice follows
         sustained silence (robust against mid-phrase peaks / HTDemucs bleed).
      2. Candidate selection:
         - Without expected_duration_s: pick anchor closest to target (legacy).
         - With expected_duration_s: score each anchor by (distance from target
           + onomatopoeia penalty if its voice-burst duration is much shorter
           than the cue's expected line length). Rejects grunts/breaths/ad-libs
           that would otherwise win on proximity alone. If no candidate clears
           the duration floor, the lowest-cost (closest plausible) is returned
           with low_confidence=True.
      3. Onset snap — from the chosen anchor, walk BACKWARD through every
         non-silent sample to find the true first audible sample of that voice
         burst. Removes the "lands late" bias on gradual ramps.
    r      )NFr   r   gffffff?FNc                      t        | z
        S )N)r0   )cr6   s    r   <lambda>z#_find_rising_edge.<locals>.<lambda>   s    SX-> r    )keyr.   r
   g      ?g      @T)r   r   r   r   r   rT   r   r#   r/   rK   r0   )#r   r   r1   back_s	forward_svoice_min_mssilence_min_msexpected_duration_sduration_floor_ratior%   r&   rS   rE   	voice_minrF   r9   lohir   voice_countrJ   silence_countlow_confidencer<   expected_samplesr=   
best_ratioanchorburst
distance_sratio	penalty_sr@   onsetr6   s#                                     @r   _find_rising_edgero      s   0 5B'(H	Q3uVb[122	3B	SZCi"n(=$>>	?B	Bw|#6ub"#E L.As5!4r!9:;<IaU>D#82#=>?@K J	R+{	+B	R)^SZ)3	4B2r] q!i-( 	!AQx,&q 	! S(q;* 	#AQx.("	# ;,,!"  N"&9Q&>:#>?.35\

  	#F1r6>;E Vh./"4J0@10DE,,#E ++1E9=PPSVV		*Ez!"
"
#	#* ,,!N
 E
!)eai(>9
 !)eai(>9.  r    c           
         t        t        ||z              }t        d|t        t        ||z              z
        }t        t	        |       |t        t        ||z              z         }	|	|z
  dk  ryt        | ||	      \  }
}t        dt        t        |dz  |z                    }t        dt        t        |dz  |z                    }d}t        d      }t        ||      }t        |	|z
  t	        |       |z
        }t        ||      D ]{  }d}t        |||z         D ]  }| |   |k  s|dz  } ||dz  k  r0d}t        t        d||z
        |      D ]  }| |   |kD  s| |   } ||
k  rdt        ||z
        }||k  sx|}|}} |yt	        |       }|}||k  r| |   |kD  r|dz  }||k  r	| |   |kD  rt        d||z
        }||kD  r!| |dz
     |k  r|dz  }||kD  r| |dz
     |k  r|S )	u  Find where voice gives way to sustained silence, closest to target_s.

    Mirror of `_find_rising_edge` for the trailing boundary:

      1. Anchor scan — find a position where sustained silence follows recent
         voice activity. Voice endings TAPER (loud → quiet → silent over ~500
         ms), so there's no sharp edge. Anchor detects: (a) sustained silence
         of `silence_min_ms` starting at index i, and (b) voice activity
         somewhere in the preceding `voice_lookback_ms` window. Max-based
         lookback tolerates taper — the phrase can be quiet at i-1 as long as
         it was loud earlier in the lookback.

      2. Offset snap — from the anchor, refine so the returned index is
         exactly "one past the last audible sample". The 85%-silent rule in
         the anchor tolerates a few stray tail peaks inside [i, i+silence_min],
         and the anchor may land a few samples after the true tail because
         the silence needed to sustain. Snap:
           (a) walk FORWARD while peaks[i] > silence_thresh — captures any
               straggler tail peaks the 85% rule allowed past the anchor;
           (b) walk BACKWARD while peaks[i-1] ≤ silence_thresh — pulls back
               past any silent samples inside the anchor region so the result
               lands immediately after the last audible peak.
         The back-walk is bounded to the lookback window so it can't cross a
         real silence gap into a previous voice burst.
    r   rV   Nr   r   r.   rC   r
   )	r   r   r   r   r   rT   r/   r   r0   )r   r   r1   r[   r\   r^   voice_lookback_msr6   r%   r&   rS   rE   rF   voice_lookbackr<   	best_distrb   rc   r   re   rJ   prev_maxr>   r   offset
back_limits                             r   _find_voice_endrw     sQ   6 5B'(H	Q3uVb[122	3B	SZCi"n(=$>>	?B	Bw|#6ub"#E L.aU>D#82#=>?@KC&7$&>&C DEFNDeI	R	 B	R+s5zK7	8B2r] 'q!k/* 	#AQx.("	# ;-- s1a.0115 	$AQx(" 8	$ l"1x< )IQd''* | 	E
AF
1*v7! 1*v7
 Q~-.J
:
%
"3~"E! :
%
"3~"EMr    r
   waveformoriginal_start_soriginal_end_sstart_back_msstart_forward_msend_back_msend_forward_msr]   r^   rq   r   r3   r4   r5   gate_thresholdr_   r`   returnc                    t        | j                  d      xs d      }| d   }|r|dkD  r|D cg c]  }||k\  r|nd }}|dz  }|dz  }|dz  }|dz  }||z   dz  }t        |||||||||	      \  }}|d	nd }|t        |||||
|||      }|d
nd}t	        |||||||	      }|dnd }|t        |||||
|||      }|d
nd}|||z  nd }|||z  nd }||n|} ||n|}!|!| k  rAt        | |z
        }"t        |!|z
        }#|"|#kD  r|} |r| dnd}n|}!|r| dnd}|!| k  r|} |}!||| |!t        t        | |z
  dz              t        t        |!|z
  dz              |d u|d ut        |      ||d|||||||	|
||||||ddS c c}w )N
sampleRated   r   r   r
   g     @@g       @)r_   r`   rising_edge	local_minnone	voice_end	_revertedr   )startendr{   r|   r}   r~   r]   r^   rq   r   r3   r4   r5   r   r_   r`   )original_startoriginal_endcorrected_startcorrected_enddelta_start_msdelta_end_msonset_foundoffset_foundrf   methodparams)r   getro   rA   rw   r0   r   bool)$rx   ry   rz   r{   r|   r}   r~   r]   r^   rq   r   r3   r4   r5   r   r_   r`   r   r   pstart_back_sstart_forward_s
end_back_send_forward_sfallback_window_s	start_idxstart_low_confstart_methodend_idx
end_methodr   r   final_start	final_endstart_delta	end_deltas$                                       r   autocorrect_cuer   [  s    
X\\,'.3	/BWE
 .1,<ABqa>)s2BB#/L&/O!/J$/M%73> !2r#on/1!I~ %.$9=tL%eR1ACT&2N&24DF	 '0&;{eR(-,.?AG !( 3J#E2~?P$0.$02BD %,$7[V
*3*?y2~TO&-&9Wr\tM%4%@/FVK%2%@-nI K+(889)67	"*K9El^956L&I5?J<y1VJ#*K&I +(&$e[3C%Ct$KLMeY>%At$KLM)5'5~.(<* 0&,(,!2(,( 0,#6$8
 w Cs   Fc                  6   t        j                  t        j                         d         } | j	                  d       | j	                  dt
        d       | j	                  dt
        d       | j	                  dt        t        	       | j	                  d
t        t        	       | j	                  dt        t        	       | j	                  dt        t        	       | j	                  dt        t        	       | j	                  dt        t        	       | j	                  dt        t        	       | j	                  dt        t        	       | j	                  dt        t        	       | j	                  dt
        t         	       | j	                  dt        t"        	       | j	                  dt
        dd       | j	                  dt
        d d       | j	                  dt
        t$        d       | j'                         }t)        |j*                        5 }t-        j.                  |      }d d d        t1        |j2                  |j4                  f|j6                  |j8                  |j:                  |j<                  |j>                  |j@                  |jB                  |jD                  |jF                  |jH                  |jJ                  |jL                  |jN                  |jP                  d}tS        t-        jT                  |d             y # 1 sw Y   xY w)Nr   )descriptionrx   z--startT)typerequiredz--endz--start-back-ms)r   defaultz--start-forward-msz--end-back-msz--end-forward-msz--voice-min-msz--silence-min-msz--voice-lookback-msz--smoothing-msz--min-spacing-msz--depth-factorz--distance-bias-msz--gate-thresholdr
   z4pre-zero peaks below this value (0..1); 0 = disabled)r   r   helpz--expected-duration-szucue's expected line duration in seconds; enables duration-aware candidate scoring that rejects onomatopoeia / breathsz--duration-floor-ratiozfreject candidate bursts whose duration is less than this fraction of expected_duration_s (default 0.4)r   r	   )indent)+argparseArgumentParser__doc__
splitlinesadd_argumentr/   r   DEFAULT_START_BACK_MSDEFAULT_START_FORWARD_MSDEFAULT_END_BACK_MSDEFAULT_END_FORWARD_MSDEFAULT_VOICE_MIN_MSDEFAULT_SILENCE_MIN_MSDEFAULT_VOICE_LOOKBACK_MSDEFAULT_SMOOTHING_MSDEFAULT_MIN_SPACING_MSDEFAULT_DEPTH_FACTORDEFAULT_DISTANCE_BIAS_MSDEFAULT_DURATION_FLOOR_RATIO
parse_argsopenrx   jsonloadr   r   r   r{   r|   r}   r~   r]   r^   rq   r   r3   r4   r5   r   r_   r`   printdumps)apargsfwfresults        r   _clir     sv   		 	 W-?-?-A!-D	EBOOJOOIEDO9OOGEDO9OO%s>SOTOO(s>VOWOOOs>QOROO&s>TOUOO$s>ROSOO&s>TOUOO)s>WOXOO$s>ROSOO&s>TOUOO$u>ROSOO(s>VOWOO&ucO  QOO+%Q  R OO,5B^@  A ==?D	dmm	 YYq\
DJJ((..$$**&&**00&&**&&..** 44!66F" 
$**VA
&'' s   LL__main__)r   r   r   r   r   r   r   r   r   r   rN   rO   r   r   r   r   r   r+   rA   r   rK   rT   ro   rw   dictr/   r   r   r   __name__ r    r   <module>r      s  .                   $
P  # .( +/+Gb!JMj *?,D':*@(<*@-F(<*@*>,D,/152Nld le lU l#&l&)l "%l %(	l
 #&l %(l (+l #&l %(l #(l '*l %*l */l +0l TXl^,(^ zF r    