a
    h@p                     @   s  d Z ddlZddlmZ ddlmZ ddlZddlZddl	m
Z
 ddlm
  mZ ddlmZ ddlmZmZ dd	lmZ eeG d
d deZeeG dd deZeeG dd deZG dd de
jZG dd de
jZG dd de
jZG dd de
jZG dd de
jZG dd de
jZG dd de
jZG dd de
jZ eG d d! d!eZ!ed"d#G d$d% d%e!Z"d%d!gZ#dS )&zTransformers DAC model.    N)	dataclass)Optional   )PreTrainedAudioTokenizerBase)ModelOutputauto_docstring   )	DacConfigc                   @   sl   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eej ed< dZeej ed< dZeej ed< dS )	DacOutputa  
    loss (`torch.Tensor`):
        Loss from the encoder model, comprising the weighted combination of the commitment and codebook losses.
    audio_values (`torch.Tensor` of shape `(batch_size, input_length)`):
        Reconstructed audio data.
    quantized_representation (`torch.Tensor` of shape `(batch_size, dimension, time_steps)`):
        Quantized continuous representation of input.
    audio_codes (`torch.LongTensor` of shape `(batch_size, num_codebooks, time_steps)`):
        Codebook indices for each codebook (quantized discrete representation of input).
    projected_latents (`torch.Tensor` of shape `(batch_size, num_codebooks * dimension, time_steps)`):
        Projected latents (continuous representation of input before quantization).
    Nlossaudio_valuesquantized_representationaudio_codesprojected_latents)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   r   r   Z
LongTensorr    r   r   `/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/dac/modeling_dac.pyr
      s   
r
   c                   @   sZ   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eej ed< dZeej ed< dS )DacEncoderOutputa  
    loss (`torch.Tensor`):
        Loss from the encoder model, comprising the weighted combination of the commitment and codebook losses.
    quantized_representation (`torch.Tensor` of shape `(batch_size, dimension, time_steps)`, *optional*):
        Quantized continuous representation of input.
    audio_codes (`torch.Tensor` of shape `(batch_size, num_codebooks, time_steps)`, *optional*):
        Codebook indices for each codebook (quantized discrete representation of input).
    projected_latents (`torch.Tensor` of shape `(batch_size, num_codebooks * dimension, time_steps)`, *optional*):
        Projected latents (continuous representation of input before quantization).
    Nr   r   r   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   6   s
   
r   c                   @   s$   e Zd ZU dZdZeej ed< dS )DacDecoderOutputz
    audio_values (`torch.FloatTensor`  of shape `(batch_size, input_length)`, *optional*):
        Decoded audio values, obtained using the decoder part of Dac.
    Nr   )	r   r   r   r   r   r   r   r   r   r   r   r   r   r   J   s   
r   c                       s(   e Zd ZdZ fddZdd Z  ZS )Snake1dz;
    A 1-dimensional Snake activation function module.
    c                    s$   t    ttd|d| _d S )Nr   )super__init__nn	Parameterr   onesalpha)self
hidden_dim	__class__r   r   r   [   s    
zSnake1d.__init__c                 C   sR   |j }||d |d d}|| jd  t| j| d  }||}|S )Nr   r   g&.>   )shapereshaper!   Z
reciprocalr   sinpow)r"   hidden_statesr(   r   r   r   forward_   s
    (
zSnake1d.forward)r   r   r   r   r   r-   __classcell__r   r   r$   r   r   V   s   r   c                       s6   e Zd ZdZed fddZdd Zdd Z  ZS )	DacVectorQuantizea  
    Implementation of VQ similar to Karpathy's repo (https://github.com/karpathy/deep-vector-quantization)

    Additionally uses following tricks from improved VQGAN
    (https://huggingface.co/papers/2110.04627):
        1. Factorized codes: Perform nearest neighbor lookup in low-dimensional space
            for improved codebook usage
        2. l2-normalized codes: Converts euclidean distance to cosine similarity which
            improves training stability
    configc                    sL   t    tj|j|jdd| _tj|j|jdd| _t|j	|j| _
d S )Nr   kernel_size)r   r   r   Conv1dhidden_sizecodebook_dimin_projout_proj	Embeddingcodebook_sizecodebookr"   r1   r$   r   r   r   s   s    
zDacVectorQuantize.__init__c                 C   sh   |  |}| |\}}tj|| dd}tj|| dd}|||   }| |}|||||fS )aJ  
        Quantizes the input tensor using a fixed codebook and returns the corresponding codebook vectors.

        Args:
            hidden_state (`torch.FloatTensor` of shape `(batch_size, dimension, time_steps)`):
                Input tensor.

        Returns:
            quantized_representation (`torch.Tensor`of shape `(batch_size, dimension, time_steps)`):
                Quantized continuous representation of input.
            commitment_loss (`torch.FloatTensor`of shape `(1)`):
                Commitment loss to train encoder to predict vectors closer to codebook entries.
            codebook_loss (`torch.FloatTensor`of shape `(1)`):
                Codebook loss to update the codebook.
            audio_codes (`torch.LongTensor` of shape `(batch_size, time_steps)`):
                Codebook indices for each codebook, quantized discrete representation of input.
            projected_latents (torch.FloatTensor of shape `(batch_size, num_codebooks * dimension, time_steps)`):
                Projected latents (continuous representation of input before quantization).
        mean)Z	reduction)r7   decode_latentsFZmse_lossdetachr8   )r"   hidden_stater   r   r   commitment_losscodebook_lossr   r   r   r-   z   s    

zDacVectorQuantize.forwardc                 C   s   |j \}}}|ddd|| |}| jj}t|}t|}|djddd}|d| |	    |djddd	  }|
dd }	|	|dd}	| |	dd}
|
|	fS )Nr   r'   r   T)Zkeepdimr&   )r(   Zpermuter)   r;   weightr?   	normalizer+   sumtmaxsize	transpose)r"   r,   Z
batch_sizer#   Zsequence_length	encodingsr;   Zl2_normdistindicesr   r   r   r   r>      s    

.z DacVectorQuantize.decode_latents)	r   r   r   r   r	   r   r-   r>   r.   r   r   r$   r   r/   g   s    r/   c                       s2   e Zd ZdZd	eed fddZdd Z  ZS )
DacResidualUnitza
    A residual unit composed of Snake1d and weight-normalized Conv1d layers with dilations.
       r   )	dimensiondilationc                    sV   t    d| d }t|| _tj||d||d| _t|| _tj||dd| _d S )N   r'      )r3   rQ   paddingr   r2   )	r   r   r   snake1r   r4   conv1snake2conv2)r"   rP   rQ   padr$   r   r   r      s    


zDacResidualUnit.__init__c                 C   sb   |}|  | |}| | |}|jd |jd  d }|dkrV|d|| f }|| }|S )ar  
        Forward pass through the residual unit.

        Args:
            hidden_state (`torch.Tensor` of shape `(batch_size, channels, time_steps)`):
                Input tensor .

        Returns:
            output_tensor (`torch.Tensor` of shape `(batch_size, channels, time_steps)`):
                Input tensor after passing through the residual unit.
        r&   r'   r   .)rV   rU   rX   rW   r(   )r"   rA   Zoutput_tensorrT   r   r   r   r-      s    zDacResidualUnit.forward)rO   r   )r   r   r   r   intr   r-   r.   r   r   r$   r   rN      s   	rN   c                       s4   e Zd ZdZdeeed fddZdd Z  ZS )	DacEncoderBlockz"Encoder block used in DAC encoder.r   r1   stridestride_indexc              	      s   t    |jd|  }t|d dd| _t|d dd| _t|d dd| _t|d | _t	j
|d |d| |t|d d| _d S )Nr'   r   rQ   r   	   r3   r]   rT   )r   r   encoder_hidden_sizerN   	res_unit1	res_unit2	res_unit3r   rU   r   r4   mathceilrV   )r"   r1   r]   r^   rP   r$   r   r   r      s    
zDacEncoderBlock.__init__c                 C   s2   |  |}| |}| | |}| |}|S N)rc   rd   rU   re   rV   r"   rA   r   r   r   r-      s
    


zDacEncoderBlock.forward)r   r   	r   r   r   r   r	   rZ   r   r-   r.   r   r   r$   r   r[      s   r[   c                       s4   e Zd ZdZdeeed fddZdd Z  ZS )	DacDecoderBlockz"Decoder block used in DAC decoder.r   r\   c              	      s   t    |jd|  }|jd|d   }t|| _tj||d| |t|d d| _	t
|dd| _t
|dd| _t
|dd| _d S )Nr'   r   ra   r_   r   r`   )r   r   decoder_hidden_sizer   rU   r   ConvTranspose1drf   rg   conv_t1rN   rc   rd   re   )r"   r1   r]   r^   Z	input_dim
output_dimr$   r   r   r      s    

zDacDecoderBlock.__init__c                 C   s6   |  |}| |}| |}| |}| |}|S rh   )rU   rn   rc   rd   re   ri   r   r   r   r-      s    




zDacDecoderBlock.forward)r   r   rj   r   r   r$   r   rk      s   rk   c                       sZ   e Zd ZdZed fddZdee dddZe	j
d	d
dZe	j
dddZ  ZS )DacResidualVectorQuantizez
    ResidualVectorQuantize block - Introduced in SoundStream: An end2end neural audio codec (https://huggingface.co/papers/2107.03312)
    r0   c                    sF   t     j} j}|| _t fddt jD | _|| _d S )Nc                    s   g | ]}t  qS r   )r/   ).0ir0   r   r   
<listcomp>      z6DacResidualVectorQuantize.__init__.<locals>.<listcomp>)r   r   n_codebooksquantizer_dropoutr   
ModuleListrange
quantizers)r"   r1   ru   rv   r$   r0   r   r     s    
 z"DacResidualVectorQuantize.__init__N)n_quantizersc                 C   sp  d}|}d}d}g }g }|dur$|n| j }| jrt|jd f| j  d }td| j d |jd f}	t|jd | j }
|	d|
 |d|
< ||j	}t
| jD ]\}}| jdu r||kr qF||\}}}}}tj|jd f||j	d|k }|||ddddf   }|| }||| 7 }||| 7 }|| || qtj|dd}tj|dd}|||||fS )aQ  
        Quantizes the input tensor using a fixed set of codebooks and returns corresponding codebook vectors.
        Args:
            hidden_state (`torch.Tensor` of shape `(batch_size, dimension, time_steps)`):
                Input tensor to be quantized.
            n_quantizers (`int`, *optional*):
                Number of quantizers to use. If specified and `self.quantizer_dropout` is True,
                this argument is ignored during training, and a random number of quantizers is used.

        Returns:
            quantized_representation (`torch.Tensor` of shape `(batch_size, dimension, time_steps)`):
                Quantized continuous representation of input.
            audio_codes (`torch.Tensor` of shape `(batch_size, num_codebooks, time_steps)`):
                Codebook indices for each codebook (quantized discrete representation of input).
            projected_latents (`torch.Tensor` of shape `(batch_size, num_codebooks * dimension, time_steps)`):
                Projected latents (continuous representation of input before quantization).
            commitment_loss (`torch.Tensor` of shape `(1)`):
                Commitment loss to train the encoder to predict vectors closer to codebook entries.
            codebook_loss (`torch.Tensor` of shape `(1)`):
                Codebook loss to update the codebook.
        r   Nr   F)Z
fill_valuedevicedim)ru   Ztrainingr   r    r(   randintrZ   rv   tor{   	enumeratery   fullappendstackcat)r"   rA   rz   r   ZresidualrB   rC   r   r   ZdropoutZ	n_dropoutrr   	quantizerquantized_representation_iZcommitment_loss_iZcodebook_loss_iZ	indices_iprojected_latents_imaskr   r   r   r-     s:    
z!DacResidualVectorQuantize.forward)r   c                 C   s|   d}g }|j d }t|D ]L}| j| |dd|ddf dd}|| || j| |7 }q|tj|dd|fS )a  
        Reconstructs the continuous representation from quantized codes.

        Args:
            audio_codes (`torch.Tensor` of shape `(batch_size, num_codebooks, time_steps)`):
                Quantized discrete representation of input.

        Returns:
            quantized_representation (`torch.Tensor`):
                Quantized continuous representation of input.
            projected_latents (`torch.Tensor`):
                List of projected latents (continuous representations of input before quantization)
                for each codebook.
            audio_codes (`torch.Tensor`):
                Codebook indices for each codebook.
                r   Nr'   r|   )	r(   rx   ry   r;   rJ   r   r8   r   r   )r"   r   r   r   ru   rr   r   r   r   r   
from_codesY  s    
*
z$DacResidualVectorQuantize.from_codes)latentsc                 C   s   d}g }g }t dgdd | jD  }t j|dd}t||jd kd jdddd }t|D ]p}|| ||d   }	}
| j| 	|dd|	|
ddf \}}|
| |
| | j| |}|| }qd|t j|ddfS )	a  Reconstructs the quantized representation from unquantized latents.

        Args:
            latents (`torch.Tensor` of shape `(batch_size, total_latent_dimension, time_steps)`):
                Continuous representation of input after projection.

        Returns:
            quantized_representation (`torch.Tensor` of shape `(batch_size, dimension, time_steps)`):
                Quantized representation of the full-projected space.
            quantized_latents (`torch.Tensor` of shape `(batch_size, dimension, time_steps)`):
                Quantized representation of the latent space (continuous representation before quantization).
        r   c                 S   s   g | ]
}|j qS r   )r6   )rq   qr   r   r   rs     rt   z:DacResidualVectorQuantize.from_latents.<locals>.<listcomp>r|   r   T)ZaxisZkeepdimsN)r   Ztensorry   Zcumsumnpwherer(   rH   rx   r>   r   r8   r   )r"   r   r   Zquantized_latentscodesZcodebook_dims_tensordimsru   rr   Zhidden_dim_jZhidden_dim_kZquantized_latents_iZcodes_ir   r   r   r   from_latentss  s    &*


z&DacResidualVectorQuantize.from_latents)N)r   r   r   r   r	   r   r   rZ   r-   r   Tensorr   r   r.   r   r   r$   r   rp   	  s
   @rp   c                       s.   e Zd ZdZed fddZdd Z  ZS )
DacDecoderzDAC Decoderr0   c           	         s   t    |j}|j}|j}tj||ddd| _g }t|D ]\}}|t	|||g7 }q<t
|| _|jd|d   }t|| _tj|dddd| _t | _d S )NrS   r   r3   rT   r'   r   )r   r   r5   rl   Zupsampling_ratiosr   r4   rV   r   rk   rw   blockr   rU   rX   ZTanhtanh)	r"   r1   Zinput_channelZchannelsstridesr   r^   r]   ro   r$   r   r   r     s    

zDacDecoder.__init__c                 C   s@   |  |}| jD ]}||}q| |}| |}| |}|S rh   )rV   r   rU   rX   r   )r"   rA   layerr   r   r   r-     s    





zDacDecoder.forwardr   r   r   r   r	   r   r-   r.   r   r   r$   r   r     s   r   c                       s.   e Zd ZdZed fddZdd Z  ZS )
DacEncoderzDAC Encoderr0   c                    s   t    |j}tjd|jddd| _g | _t|D ]*\}}|d }|  jt	|||dg7  _q4t
| j| _|jd|  }t|| _tj||jddd| _d S )Nr   rS   r   r   )r]   r^   r'   )r   r   Zdownsampling_ratiosr   r4   rb   rV   r   r   r[   rw   r   rU   r5   rX   )r"   r1   r   r^   r]   Zd_modelr$   r   r   r     s    

zDacEncoder.__init__c                 C   s6   |  |}| jD ]}||}q| |}| |}|S rh   )rV   r   rU   rX   )r"   rA   moduler   r   r   r-     s    




zDacEncoder.forwardr   r   r   r$   r   r     s   r   c                   @   s6   e Zd ZU eed< dZdZdd Zdd Zdd	 Z	d
S )DacPreTrainedModelr1   Zdacinput_valuesc                 C   s   t |tjr0tjj|jdd tj|jd nNt |trJ|j	j
d n4t |tjr`|  nt |tjr~|jj
jddd d S )Ng{Gz?)stdr   g      ?r   )r=   r   )
isinstancer   r4   initZtrunc_normal_rD   Z	constant_Zbiasr   r!   dataZfill_rm   Zreset_parametersr9   Znormal_)r"   r   r   r   r   _init_weights  s    

z DacPreTrainedModel._init_weightsc                 C   s6  t jj}tt jjdr t jjj}| jjD ]}||j ||j q(|| j	j
 || j	j | j	jD ]V}||j
 ||jj
 ||jj ||jj
 ||jj ||jj
 ||jj qb|| jj
 || jj | jjD ]V}||j ||jj
 ||jj ||jj
 ||jj ||jj
 ||jj qd S )Nweight_norm)r   utilsr   hasattrZparametrizationsr   ry   r7   r8   encoderrV   rX   r   rc   rd   re   decoderrn   )r"   r   r   r   r   r   apply_weight_norm  s4    



z$DacPreTrainedModel.apply_weight_normc                 C   sf  | j jD ] }tj|j tj|j qtj| jj tj| jj	 | jj
D ]r}tj|j tj|jj tj|jj	 tj|jj tj|jj	 tj|jj tj|jj	 qRtj| jj tj| jj	 | jj
D ]r}tj|j tj|jj tj|jj	 tj|jj tj|jj	 tj|jj tj|jj	 qd S rh   )r   ry   r   r   remove_weight_normr7   r8   r   rV   rX   r   rc   rd   re   r   rn   )r"   r   r   r   r   r     s.    z%DacPreTrainedModel.remove_weight_normN)
r   r   r   r	   r   Zbase_model_prefixZmain_input_namer   r   r   r   r   r   r   r     s   
!r   z/
    The DAC (Descript Audio Codec) model.
    )Zcustom_introc                       s   e Zd Zed fddZedejee	 ee
 dddZedeej eej ee
 dd	d
Zedejee	 ee
 dddZ  ZS )DacModelr0   c                    sj   t  | || _t|| _t|| _t|| _t	t
| jj| _d| j | jjkr^td|   d S )Nr'   z'The codebook_size must be a power of 2.)r   r   r1   r   r   r   r   rp   r   rZ   rf   log2r:   Zbits_per_codebook
ValueErrorZ	post_initr<   r$   r   r   r   /  s    


zDacModel.__init__N)r   rz   return_dictc           
      C   sj   |dur|n| j j}| |}| ||\}}}}}| j j| | j j|  }	|s\|	|||fS t|	|||S )a  
        input_values (`torch.Tensor of shape `(batch_size, 1, time_steps)`):
            Input audio data to encode,
        n_quantizers (int, *optional*):
            Number of quantizers to use. If None, all quantizers are used. Default is None.
        N)r1   r   r   r   Zcommitment_loss_weightZcodebook_loss_weightr   )
r"   r   rz   r   r   r   r   rB   rC   r   r   r   r   encode?  s    
zDacModel.encode)r   r   r   c                 C   sf   |du r|du rt d|dur$|n| jj}|durD| j|d }| |d}|s^|fS t|S )a  
        quantized_representation (torch.Tensor of shape `(batch_size, dimension, time_steps)`, *optional*):
            Quantized continuous representation of input.
        audio_codes (`torch.Tensor` of shape `(batch_size, num_codebooks, time_steps)`, *optional*):
            The codebook indices for each codebook, representing the quantized discrete
            representation of the input. This parameter should be provided if you want
            to decode directly from the audio codes (it will overwrite quantized_representation).
        return_dict (`bool`, *optional*, defaults to `True`):
            Whether to return a [`DacDecoderOutput`] instead of a plain tuple.
        NzDEither `quantized_representation` or `audio_codes` must be provided.r   r   )r   r1   r   r   r   r   Zsqueezer   )r"   r   r   r   r   r   r   r   decodeZ  s    zDacModel.decodec           
      C   sv   |dur|n| j j}|jd }| j||dd\}}}}| j|ddd dd|f }	|sf||	|||fS t||	|||S )a  
        input_values (`torch.Tensor` of shape `(batch_size, 1, time_steps)`):
            Audio data to encode.
        n_quantizers (`int`, *optional*):
            Number of quantizers to use. If `None`, all quantizers are used. Default is `None`.

        Examples:

        ```python
        >>> from datasets import load_dataset, Audio
        >>> from transformers import DacModel, AutoProcessor
        >>> librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

        >>> model = DacModel.from_pretrained("descript/dac_16khz")
        >>> processor = AutoProcessor.from_pretrained("descript/dac_16khz")
        >>> librispeech_dummy = librispeech_dummy.cast_column("audio", Audio(sampling_rate=processor.sampling_rate))
        >>> audio_sample = librispeech_dummy[-1]["audio"]["array"]
        >>> inputs = processor(raw_audio=audio_sample, sampling_rate=processor.sampling_rate, return_tensors="pt")

        >>> encoder_outputs = model.encode(inputs["input_values"])
        >>> # Get the intermediate audio codes
        >>> audio_codes = encoder_outputs.audio_codes
        >>> # Reconstruct the audio from its quantized representation
        >>> audio_values = model.decode(encoder_outputs.quantized_representation)
        >>> # or the equivalent with a forward pass
        >>> audio_values = model(inputs["input_values"]).audio_values
        ```Nr&   F)r   r   .)r1   r   r(   r   r   r
   )
r"   r   rz   r   lengthr   r   r   r   r   r   r   r   r-   {  s    #
zDacModel.forward)NN)NNN)NN)r   r   r   r	   r   r   r   r   r   rZ   boolr   r   r-   r.   r   r   r$   r   r   )  s4           r   )$r   rf   dataclassesr   typingr   numpyr   r   Ztorch.nnr   Ztorch.nn.functionalZ
functionalr?   Zmodeling_utilsr   r   r   r   Zconfiguration_dacr	   r
   r   r   Moduler   r/   rN   r[   rk   rp   r   r   r   r   __all__r   r   r   r   <module>   sF   	F% %!O~