
    i(                     z    d Z ddlZddlmZmZ ddlmZ ddlmZm	Z	m
Z
mZmZmZmZmZmZmZ e G d d             Zy)	zi
Voice Activity Detection (VAD) Module

Energy-based VAD using RMS with adaptive noise floor estimation.
    N)	dataclassfield)Optional   )
SAMPLE_RATE_INBYTES_PER_SAMPLEVAD_FRAME_MSEND_SILENCE_MSMIN_UTTERANCE_MSMIN_RMS_THRESHOLDNOISE_MULTIPLIERNOISE_EMA_ALPHAFRAME_BYTES_8KMIN_PCM_BYTES_8Kc                      e Zd ZU dZeZeed<   eZ	eed<   e
Zeed<   eZeed<   eZeed<   eZeed<    edd	
      Zeed<    edd	
      Zeed<    ed	d	
      Zeed<    edd	
      Zeed<    edd	
      Zeed<    ed	d	
      Zeed<   edefd       Zedefd       Zedefd       Zedefd       Z edefd       Z!d Z"d Z#dede$e%   fdZ&dede'e%   fdZ(de'e%   fdZ)de'e%   fd Z*defd!Z+defd"Z,y#)$VADDetectora6  
    Voice Activity Detector using energy-based RMS thresholding.

    Uses exponential moving average (EMA) for adaptive noise floor estimation.

    Attributes:
        frame_ms: Duration of each analysis frame in milliseconds
        end_silence_ms: Silence duration (ms) to mark end of utterance
        min_utterance_ms: Minimum utterance length (ms) to accept
        min_rms_threshold: Absolute minimum RMS threshold
        noise_multiplier: Factor to multiply noise floor for threshold
        noise_ema_alpha: EMA smoothing factor for noise estimation
    frame_msend_silence_msmin_utterance_msmin_rms_thresholdnoise_multipliernoise_ema_alpha    F)defaultrepr_vad_buffer_speech_buffer	_speakingr   _silence_ms        
_noise_rms_buffer_committedreturnc                 N    t        t        | j                  dz  z  t        z        S )zNumber of bytes per VAD frameg     @@)intr   r   r   selfs    +/home/sas/my/fyp/ringai/ringai/audio/vad.pyframe_byteszVADDetector.frame_bytes8   s#     >T]]V%;<?OOPPr   c                     t         S )z.Minimum PCM bytes for valid utterance (~400ms))r   r&   s    r(   min_pcm_byteszVADDetector.min_pcm_bytes=   s
      r   c                     | j                   S )z$Whether speech is currently detected)r   r&   s    r(   is_speakingzVADDetector.is_speakingB   s     ~~r   c                     | j                   S )z!Current accumulated speech buffer)r   r&   s    r(   speech_bufferzVADDetector.speech_bufferG   s     """r   c                     | j                   S )z(Current silence duration in milliseconds)r   r&   s    r(   
silence_mszVADDetector.silence_msL   s     r   c                 <    d| _         d| _        d| _        d| _        y)z+Reset VAD state for new utterance detectionr   Fr   N)r   r   r   r"   r&   s    r(   resetzVADDetector.resetQ   s!    !!&r   c                 @    | j                          d| _        d| _        y)z%Full reset including noise estimationr   r    N)r3   r   r!   r&   s    r(   	reset_allzVADDetector.reset_allX   s    

r   pcm_8kc                 t   | xj                   |z  c_         g }t        | j                         | j                  k\  r~| j                   d| j                   }| j                   | j                  d | _         | j                  |      }|r|j	                  |       t        | j                         | j                  k\  r~|S )a4  
        Add audio data and process for voice activity.

        Args:
            pcm_8k: 8kHz 16-bit PCM audio data

        Returns:
            List of detected utterances, each containing:
                - pcm_8k: The speech audio buffer
                - duration_ms: Duration in milliseconds
        N)r   lenr)   _process_frameappend)r'   r6   
utterancesframeresults        r(   	add_audiozVADDetector.add_audio^   s     	F"
$""#t'7'77$$%7t'7'78E#//0@0@0BCD((/F!!&) $""#t'7'77 r   r<   c                 z   t        j                  |t              }| j                  s1d| j                  z
  | j
                  z  | j                  |z  z   | _        t        | j                  t        | j
                  | j                  z              }||k\  }|r>| j                  sd| _        d| _
        d| _        | xj                  |z  c_        d| _
        y| j                  r]| xj                  | j                  z  c_
        | xj                  |z  c_        | j                  | j                  k\  r| j                         S y)z
        Process a single audio frame for VAD.

        Returns utterance dict if speech segment completed, None otherwise.
        g      ?Tr   FN)audiooprmsr   r   r   r!   maxr   r%   r   r   r"   r   r   r   _commit_utterance)r'   r<   rA   	threshold	is_speechs        r(   r9   zVADDetector._process_framew   s    kk%!12 ~~t+++t>&&,- O ..DOOdF[F[4[0\]	9$	>>!%#$ ).&5( D  ~~  DMM1 ##u,###t':'::1133r   c                 `   d| _         t        t        | j                        t        t
        z  z  dz        }|| j                  k  r| j                          yt        | j                        | j                  k  r| j                          y| j                  |d}d| _	        d| _        d| _
        |S )z3Commit current speech buffer as completed utteranceFi  N)r6   duration_msTr   r   )r   r%   r8   r   r   r   r   r3   r+   r"   r   )r'   rG   r=   s      r(   rC   zVADDetector._commit_utterance   s    $$%)9N)JKtS
 ...JJLt""#d&8&88JJL ))&

 "&!r   c                 T    | j                   r| j                  ry| j                         S )z
        Force commit current speech buffer (for wall-clock timeout).

        Returns utterance dict if buffer is valid, None otherwise.
        N)r   r"   rC   r&   s    r(   force_commitzVADDetector.force_commit   s'     ""d&<&<%%''r   c                 n    t        | j                  t        | j                  | j                  z              S )z"Get current adaptive RMS threshold)rB   r   r%   r!   r   r&   s    r(   get_thresholdzVADDetector.get_threshold   s)    4))3tAVAV/V+WXXr   c                     | j                   S )zGet current estimated noise RMS)r!   r&   s    r(   get_noise_floorzVADDetector.get_noise_floor   s    r   N)-__name__
__module____qualname____doc__r	   r   r%   __annotations__r
   r   r   r   r   r   r   r   floatr   r   r   r   bytesr   r   boolr   r!   r"   propertyr)   r+   r-   r/   r1   r3   r5   listdictr>   r   r9   rC   rI   rK   rM    r   r(   r   r      s    !Hc (NC(,c,.s..e.,OU, s7K7!#E:NE:E6It6QU3K3c6J6#E>t>QS Q Q  s     T   #u # #  C    ' $t* 2'E 'htn 'R8D> 6	(htn 	(Ys Y r   r   )rQ   r@   dataclassesr   r   typingr   configr   r   r	   r
   r   r   r   r   r   r   r   rY   r   r(   <module>r]      s@     (    r r rr   