
    i                         U d Z ddlZddlZddlZddlZddlmZ ddlmZm	Z	 ddl
mZmZmZmZ  G d d      Zdaee   ed	<   d
efdZy)zf
Dataset Storage Module

Manages call data storage including audio chunks, transcripts, and metadata.
    N)Path)OptionalUnion   )
CALLS_ROOTWHISPER_MODEL_NAMEWHISPER_LANGUAGE	log_audioc                   
   e Zd ZdZefdeeef   fdZdedefdZ	dededefdZ
dedefd	Zdedefd
Z	 ddedededededefdZdedefdZdedefdZdedefdZdedefdZdedee   fdZdee   fdZdedefdZy)CallDataStorezv
    Manager for call dataset storage.

    Handles directory structure, audio chunks, transcripts, and metadata.
    	base_pathc                 ^    t        |      | _        | j                  j                  dd       y)zk
        Initialize data store.

        Args:
            base_path: Root directory for call data
        Tparentsexist_okN)r   r   mkdir)selfr   s     1/home/sas/my/fyp/ringai/ringai/storage/dataset.py__init__zCallDataStore.__init__   s&     iTD9    call_idreturnc                 J    | j                   |z  }|j                  dd       |S )zGet root directory for a callTr   )r   r   r   r   paths      r   get_call_dirzCallDataStore.get_call_dir"   s%    ~~'

4$
/r   
session_idc                 r    | j                  |      dz  dz  t        |      z  }|j                  dd       |S )z&Get audio session directory for a callaudiosessionsTr   )r   strr   )r   r   r   r   s       r   get_audio_session_dirz#CallDataStore.get_audio_session_dir(   s:      )G3j@3z?R

4$
/r   c                 T    | j                  |      dz  }|j                  dd       |S )z#Get transcript directory for a call
transcriptTr   r   r   r   s      r   get_transcript_dirz CallDataStore.get_transcript_dir.   s,      )L8

4$
/r   c                 T    | j                  |      dz  }|j                  dd       |S )z#Get LLM events directory for a callllmTr   r%   r   s      r   get_llm_dirzCallDataStore.get_llm_dir4   s,      )E1

4$
/r   chunk_indexpcm_16ksample_ratec                 ^   | j                  ||      }|d|ddz  }t        j                  t        |      d      5 }|j	                  d       |j                  d       |j                  |       |j                  |       ddd       t        j                  d|||       |S # 1 sw Y   #xY w)	aK  
        Save audio chunk to disk.

        Args:
            call_id: Call identifier
            session_id: Session identifier
            chunk_index: Chunk number
            pcm_16k: 16kHz 16-bit PCM audio data
            sample_rate: Sample rate (default 16kHz)

        Returns:
            Path to saved WAV file
        chunk_04d.wavwb   r   Nzcall=%s chunk=%d saved to %s)
r"   waveopenr!   setnchannelssetsampwidthsetframeratewriteframesr
   info)	r   r   r   r*   r+   r,   session_dir
chunk_pathwfs	            r   save_audio_chunkzCallDataStore.save_audio_chunk:   s    * 00*E VK+<D#AA
YYs:- 	$OOAOOAOOK(NN7#		$ 	5wZX	$ 	$s   AB##B,rowc                 $   | j                  |      dz  }|j                         }t        |ddd      5 }t        j                  ||j                               }|s|j                          |j                  |       ddd       y# 1 sw Y   yxY w)z
        Append metadata row to CSV.

        Args:
            call_id: Call identifier
            row: Metadata dictionary
        zmetadata.csva utf-8)newlineencoding)
fieldnamesN)r   existsr4   csv
DictWriterkeyswriteheaderwriterow)r   r   r>   	meta_pathrF   fwriters          r   save_metadata_rowzCallDataStore.save_metadata_row[   s}     %%g.?	!!#)S"w? 	!1^^A#((*=F""$OOC 		! 	! 	!s   A	BB	utterancec                     | j                  |      dz  }t        |dd      5 }t        j                  ||d       |j	                  d       ddd       y# 1 sw Y   yxY w)	z
        Append utterance to JSONL file.

        Args:
            call_id: Call identifier
            utterance: Utterance dictionary (ts, chunk, utter_ms, text, etc.)
        zutterances.jsonlr@   rB   rD   Fensure_ascii
N)r&   r4   jsondumpwrite)r   r   rP   segments_pathrM   s        r   save_utterancezCallDataStore.save_utterancel   sX     //8;MM-w7 	1IIi7GGDM	 	 	   *AAeventc                     | j                  |      dz  }t        |dd      5 }t        j                  ||d       |j	                  d       ddd       y# 1 sw Y   yxY w)	z
        Append LLM event to JSONL file.

        Args:
            call_id: Call identifier
            event: LLM event dictionary
        events.jsonlr@   rB   rR   FrS   rU   N)r)   r4   rV   rW   rX   )r   r   r\   events_pathrM   s        r   save_llm_eventzCallDataStore.save_llm_eventz   sW     &&w/.@+sW5 	IIeQU3GGDM	 	 	r[   textc                 @   |sy| j                  |      }|dz  }t        |dd      5 }|j                  |dz          ddd       |dz  }t        |dd      5 }t        j                  |t
        t        |d|d	d
       ddd       y# 1 sw Y   LxY w# 1 sw Y   yxY w)z
        Save final transcript (both .txt and .json formats).

        Args:
            call_id: Call identifier
            text: Final transcript text
        N	final.txtwrB   rR   rU   z
final.json)r   modellanguagefinal_transcriptr   F)indentrT   )r&   r4   rX   rV   rW   r   r	   )r   r   ra   transcript_dirtxt_pathrM   	json_paths          r   save_final_transcriptz#CallDataStore.save_final_transcript   s     009 "K/(C'2 	!aGGD4K 	! #\1	)S73 	qII&/ 0(,	 "
	 		! 	!
	 	s   B'BBBc                 T   | j                  |      }|dz  dz  }|dz  dz  }|j                         st        j                  d|       yg }t	        |j                               D ]R  }|j                         st	        |j                               D ]#  }|j                  dk(  s|j                  |       % T |st        j                  d|       yd}|D ]M  }		 t        j                  t        |	      d	      5 }
||
j                  |
j                               z  }ddd       O |sy|j                  j!                  dd       t        j                  t        |      d      5 }
|
j#                  d       |
j%                  d       |
j'                  d       |
j)                  |       ddd       t+        |      dz  }t        j,                  d||t+        |             |S # 1 sw Y   xY w# t        $ r#}t        j                  d
||	|       Y d}~@d}~ww xY w# 1 sw Y   uxY w)z
        Merge all audio sessions into single WAV file.

        Args:
            call_id: Call identifier

        Returns:
            Path to merged WAV file, or None if no audio found
        r   r    
merged.wavz#call=%s no sessions directory foundNr0   zcall=%s no audio chunks foundr   rbz#call=%s failed to read chunk %s: %rTr   r1   r2   r   >  i }  z5call=%s merged audio saved, duration=%.1fs, chunks=%d)r   is_dirr
   warningsortediterdirsuffixappendr3   r4   r!   
readframes
getnframes	Exceptionerrorparentr   r5   r6   r7   r8   lenr9   )r   r   call_dirsessions_dirout_path
all_chunkssession_folder
chunk_filepcmr;   r<   edurations                r   merge_call_audiozCallDataStore.merge_call_audio   s    $$W-')J6g%4""$CWM 
$\%9%9%;< 	2N!((*$^%;%;%=> 2
$$.%%j12	2 =wG $ 	_J_YYs:5 :2==99C:	_  	dT:YYs8}d+ 	 rOOAOOAOOE"NN3		  s8y)CXs:	

 -: : _ EwPZ\]^^_	  	 s=   G/9#G#G/%AH#G,	(G//	H8HHH'c                     | j                   j                         D cg c]  }|j                         s|j                  ! c}S c c}w )zList all call IDs in storage)r   rt   rq   name)r   ds     r   
list_callszCallDataStore.list_calls   s/     $ 6 6 8G1AHHJGGGs
   AAc                     | j                  |      }||dz  dz  j                         |dz  dz  j                         |dz  dz  j                         dS )zGet summary metadata for a callr   rn   r$   rc   r(   r^   )r   has_merged_audiohas_final_transcripthas_llm_events)r   rF   )r   r   r}   s      r   get_call_metadatazCallDataStore.get_call_metadata   sc    $$W- !)G!3l!B J J L%-%<{%J$R$R$T'%/.@HHJ	
 	
r   N)rp   )__name__
__module____qualname____doc__r   r   r!   r   r   r   r"   r&   r)   intbytesr=   dictrO   rZ   r`   rl   r   r   listr   r    r   r   r   r      sD    6@ :%T	"2 :C D S c d # $ 3 4  !  	
   
B! !4 !"c d c $ S  B9 9 9vHDI H	
 	
 	
r   r   _data_storer   c                  .    t         
t               a t         S )zGet global data store instance)r   r   r   r   r   get_data_storer      s     #or   )r   osrG   rV   r3   pathlibr   typingr   r   configr   r   r	   r
   r   r   __annotations__r   r   r   r   <module>r      sN    
 
    " P P`
 `
H (,Xm$ + r   