"""PyTorch SpeechT5 model."""

import math
from typing import Optional, Union

import numpy as np
import torch
import torch.nn as nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, L1Loss

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
from ...generation import GenerationMixin
from ...integrations.deepspeed import is_deepspeed_zero3_enabled
from ...integrations.fsdp import is_fsdp_managed_module
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPastAndCrossAttentions,
    Seq2SeqLMOutput,
    Seq2SeqModelOutput,
    Seq2SeqSpectrogramOutput,
)
from ...modeling_utils import EmbeddingAccessMixin, PreTrainedModel
from ...utils import auto_docstring, logging
from ...utils.deprecation import deprecate_kwarg
from .configuration_speecht5 import SpeechT5Config, SpeechT5HifiGanConfig


logger = logging.get_logger(__name__)

_HIDDEN_STATES_START_POSITION = 1

def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
    """
    Shift input ids one token to the right.
    """
    shifted_input_ids = input_ids.new_zeros(input_ids.shape)
    shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
    shifted_input_ids[:, 0] = decoder_start_token_id

    if pad_token_id is None:
        raise ValueError("self.model.config.pad_token_id has to be defined.")
    # replace possible -100 values in labels by `pad_token_id`
    shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)

    return shifted_input_ids

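# A minimal worked example of the shift above, with made-up token ids: labels
# [[5, 6, -100, -100]] with pad_token_id=1 and decoder_start_token_id=2 become
# decoder inputs [[2, 5, 6, 1]] -- the sequence moves one slot right, the start
# token fills position 0, and -100 label padding that survives the shift is
# replaced by the pad token.
def _example_shift_tokens_right():  # pragma: no cover - documentation sketch
    labels = torch.tensor([[5, 6, -100, -100]])
    return shift_tokens_right(labels, pad_token_id=1, decoder_start_token_id=2)
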
def shift_spectrograms_right(
    input_values: torch.Tensor, reduction_factor: int = 1, attention_mask: Optional[torch.Tensor] = None
):
    """
    Shift input spectrograms one timestep to the right. Also applies the reduction factor to the sequence length.
    """
    # thin out frames for reduction factor
    if reduction_factor > 1:
        input_values = input_values[:, reduction_factor - 1 :: reduction_factor]
        if attention_mask is not None:
            attention_mask = attention_mask[:, reduction_factor - 1 :: reduction_factor]

    shifted_input_values = input_values.new_zeros(input_values.shape)
    shifted_input_values[:, 1:] = input_values[:, :-1].clone()

    # replace possible -100 values in labels by zeros
    shifted_input_values.masked_fill_(shifted_input_values == -100.0, 0.0)

    return shifted_input_values, attention_mask

def _compute_mask_indices(
    shape: tuple[int, int],
    mask_prob: float,
    mask_length: int,
    attention_mask: Optional[torch.LongTensor] = None,
    min_masks: int = 0,
) -> np.ndarray:
    """
    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
    ASR](https://huggingface.co/papers/1904.08779). Note that this method is not optimized to run on TPU and should be run on
    CPU as part of the preprocessing during training.

    Args:
        shape: The shape for which to compute masks. This should be of a tuple of size 2 where
               the first element is the batch size and the second element is the length of the axis to span.
        mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                    independently generated mask spans of length `mask_length` is computed by
                    `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                    actual percentage will be smaller.
        mask_length: size of the mask
        min_masks: minimum number of masked spans
        attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                        each batch dimension.
    """
    batch_size, sequence_length = shape

    if mask_length < 1:
        raise ValueError("`mask_length` has to be bigger than 0.")

    if mask_length > sequence_length:
        raise ValueError(
            f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
            f" and `sequence_length`: {sequence_length}`"
        )

    # epsilon is used for probabilistic rounding
    epsilon = np.random.rand(1).item()

    def compute_num_masked_span(input_length):
        """Given input length, compute how many spans should be masked"""
        num_masked_span = int(mask_prob * input_length / mask_length + epsilon)
        num_masked_span = max(num_masked_span, min_masks)

        # make sure num masked span <= sequence_length
        if num_masked_span * mask_length > sequence_length:
            num_masked_span = sequence_length // mask_length

        # make sure num_masked_span is also <= input_length - (mask_length - 1)
        if input_length - (mask_length - 1) < num_masked_span:
            num_masked_span = max(input_length - (mask_length - 1), 0)

        return num_masked_span

    # compute number of masked spans in batch
    input_lengths = (
        attention_mask.detach().sum(-1).tolist()
        if attention_mask is not None
        else [sequence_length for _ in range(batch_size)]
    )

    # SpecAugment mask to fill
    spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=bool)
    spec_aug_mask_idxs = []

    max_num_masked_span = compute_num_masked_span(sequence_length)

    if max_num_masked_span == 0:
        return spec_aug_mask

    for input_length in input_lengths:
        # compute num of masked spans for this input
        num_masked_span = compute_num_masked_span(input_length)

        # get random indices to mask
        spec_aug_mask_idx = np.random.choice(
            np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
        )

        # pick first sampled index that will serve as a dummy index to pad vector
        # to ensure same dimension for all batches due to probabilistic rounding
        if len(spec_aug_mask_idx) == 0:
            # this case can only happen if `input_length` is strictly smaller than
            # `sequence_length`, in which case the last token has to be a padding
            # token which we can use as a dummy mask id
            dummy_mask_idx = sequence_length - 1
        else:
            dummy_mask_idx = spec_aug_mask_idx[0]

        spec_aug_mask_idx = np.concatenate(
            [spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx]
        )
        spec_aug_mask_idxs.append(spec_aug_mask_idx)

    spec_aug_mask_idxs = np.array(spec_aug_mask_idxs)

    # expand masked indices to masked spans
    spec_aug_mask_idxs = np.broadcast_to(
        spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length)
    )
    spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)

    # add offset to the starting indexes so that indexes now create a span
    offsets = np.arange(mask_length)[None, None, :]
    offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
        batch_size, max_num_masked_span * mask_length
    )
    spec_aug_mask_idxs = spec_aug_mask_idxs + offsets

    # ensure that we cannot have indices larger than sequence_length
    if spec_aug_mask_idxs.max() > sequence_length - 1:
        spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1

    # scatter indices to mask
    np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1)

    return spec_aug_mask


class SpeechT5NoLayerNormConvLayer(nn.Module):
    def __init__(self, config, layer_id=0):
        super().__init__()
        self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
        self.out_conv_dim = config.conv_dim[layer_id]

        self.conv = nn.Conv1d(
            self.in_conv_dim,
            self.out_conv_dim,
            kernel_size=config.conv_kernel[layer_id],
            stride=config.conv_stride[layer_id],
            bias=config.conv_bias,
        )
        self.activation = ACT2FN[config.feat_extract_activation]

    def forward(self, hidden_states):
        hidden_states = self.conv(hidden_states)
        hidden_states = self.activation(hidden_states)
        return hidden_states


class SpeechT5LayerNormConvLayer(nn.Module):
    def __init__(self, config, layer_id=0):
        super().__init__()
        self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
        self.out_conv_dim = config.conv_dim[layer_id]

        self.conv = nn.Conv1d(
            self.in_conv_dim,
            self.out_conv_dim,
            kernel_size=config.conv_kernel[layer_id],
            stride=config.conv_stride[layer_id],
            bias=config.conv_bias,
        )
        self.layer_norm = nn.LayerNorm(self.out_conv_dim, elementwise_affine=True)
        self.activation = ACT2FN[config.feat_extract_activation]

    def forward(self, hidden_states):
        hidden_states = self.conv(hidden_states)

        hidden_states = hidden_states.transpose(-2, -1)
        hidden_states = self.layer_norm(hidden_states)
        hidden_states = hidden_states.transpose(-2, -1)

        hidden_states = self.activation(hidden_states)
        return hidden_states


class SpeechT5GroupNormConvLayer(nn.Module):
    def __init__(self, config, layer_id=0):
        super().__init__()
        self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
        self.out_conv_dim = config.conv_dim[layer_id]

        self.conv = nn.Conv1d(
            self.in_conv_dim,
            self.out_conv_dim,
            kernel_size=config.conv_kernel[layer_id],
            stride=config.conv_stride[layer_id],
            bias=config.conv_bias,
        )
        self.activation = ACT2FN[config.feat_extract_activation]

        self.layer_norm = nn.GroupNorm(num_groups=self.out_conv_dim, num_channels=self.out_conv_dim, affine=True)

    def forward(self, hidden_states):
        hidden_states = self.conv(hidden_states)
        hidden_states = self.layer_norm(hidden_states)
        hidden_states = self.activation(hidden_states)
        return hidden_states


class SpeechT5SinusoidalPositionalEmbedding(nn.Module):
    """This module produces sinusoidal positional embeddings of any length."""

    def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
        super().__init__()
        self.offset = 2
        self.embedding_dim = embedding_dim
        self.padding_idx = padding_idx
        self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)

    def make_weights(self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
        emb_weights = self.get_embedding(num_embeddings, embedding_dim, padding_idx)
        if hasattr(self, "weights"):
            # in forward, put the weights on the correct dtype and device of the param
            emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device)

        self.register_buffer("weights", emb_weights, persistent=False)

    @staticmethod
    def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
        """
        Build sinusoidal embeddings. This matches the implementation in tensor2tensor, but differs slightly from the
        description in Section 3.5 of "Attention Is All You Need".
        """
        half_dim = embedding_dim // 2
        emb = math.log(10000) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb)
        emb = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * emb.unsqueeze(0)
        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
        if embedding_dim % 2 == 1:
            # zero pad
            emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
        if padding_idx is not None:
            emb[padding_idx, :] = 0

        return emb.to(torch.get_default_dtype())

    @torch.no_grad()
    def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0):
        bsz, seq_len = input_ids.size()
        # Create the position ids from the input token ids. Any padded tokens remain padded.
        position_ids = self.create_position_ids_from_input_ids(input_ids, past_key_values_length).to(input_ids.device)

        # expand embeddings if needed
        max_pos = self.padding_idx + 1 + seq_len
        if max_pos > self.weights.size(0):
            self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx)

        return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, -1).detach()

    def create_position_ids_from_input_ids(self, input_ids: torch.Tensor, past_key_values_length: int = 0):
        """
        Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
        symbols are ignored. This is modified from fairseq's `utils.make_positions`.

        Args:
            x: torch.Tensor x:
        Returns: torch.Tensor
        """
        mask = input_ids.ne(self.padding_idx).int()
        incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
        return incremental_indices.long() + self.padding_idx


class SpeechT5PositionalConvEmbedding(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.conv = nn.Conv1d(
            config.hidden_size,
            config.hidden_size,
            kernel_size=config.num_conv_pos_embeddings,
            padding=config.num_conv_pos_embeddings // 2,
            groups=config.num_conv_pos_embedding_groups,
        )

        weight_norm = nn.utils.weight_norm
        if hasattr(nn.utils, "parametrizations"):
            weight_norm = nn.utils.parametrizations.weight_norm

        if is_deepspeed_zero3_enabled():
            import deepspeed

            with deepspeed.zero.GatheredParameters(self.conv.weight, modifier_rank=0):
                self.conv = weight_norm(self.conv, name="weight", dim=2)
            if hasattr(self.conv, "parametrizations"):
                weight_g = self.conv.parametrizations.weight.original0
                weight_v = self.conv.parametrizations.weight.original1
            else:
                weight_g = self.conv.weight_g
                weight_v = self.conv.weight_v
            deepspeed.zero.register_external_parameter(self, weight_v)
            deepspeed.zero.register_external_parameter(self, weight_g)
        else:
            self.conv = weight_norm(self.conv, name="weight", dim=2)

        self.padding = SpeechT5SamePadLayer(config.num_conv_pos_embeddings)
        self.activation = ACT2FN[config.feat_extract_activation]

    def forward(self, hidden_states):
        hidden_states = hidden_states.transpose(1, 2)

        hidden_states = self.conv(hidden_states)
        hidden_states = self.padding(hidden_states)
        hidden_states = self.activation(hidden_states)

        hidden_states = hidden_states.transpose(1, 2)
        return hidden_states


class SpeechT5ScaledPositionalEncoding(nn.Module):
    """
    Scaled positional encoding, see §3.2 in https://huggingface.co/papers/1809.08895
    """

    def __init__(self, dropout, dim, max_len=5000):
        pe = torch.zeros(max_len, dim)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, dim, 2, dtype=torch.int64).float() * -(math.log(10000.0) / dim))
        pe[:, 0::2] = torch.sin(position.float() * div_term)
        pe[:, 1::2] = torch.cos(position.float() * div_term)
        pe = pe.unsqueeze(0)
        super().__init__()
        self.register_buffer("pe", pe, persistent=False)
        self.dropout = nn.Dropout(p=dropout)
        self.dim = dim
        self.alpha = torch.nn.Parameter(torch.tensor(1.0))

    def forward(self, emb):
        emb = emb + self.alpha * self.pe[:, : emb.size(1)]
        emb = self.dropout(emb)
        return emb


class SpeechT5RelativePositionalEncoding(torch.nn.Module):
    def __init__(self, dim, max_length=1000):
        super().__init__()
        self.dim = dim
        self.max_length = max_length
        self.pe_k = torch.nn.Embedding(2 * max_length, dim)

    def forward(self, hidden_states):
        seq_len = hidden_states.shape[1]
        pos_seq = torch.arange(0, seq_len).to(device=hidden_states.device, dtype=torch.int64)
        pos_seq = pos_seq[:, None] - pos_seq[None, :]

        pos_seq[pos_seq < -self.max_length] = -self.max_length
        pos_seq[pos_seq >= self.max_length] = self.max_length - 1
        pos_seq = pos_seq + self.max_length

        return self.pe_k(pos_seq)


class SpeechT5SamePadLayer(nn.Module):
    def __init__(self, num_conv_pos_embeddings):
        super().__init__()
        self.num_pad_remove = 1 if num_conv_pos_embeddings % 2 == 0 else 0

    def forward(self, hidden_states):
        if self.num_pad_remove > 0:
            hidden_states = hidden_states[:, :, : -self.num_pad_remove]
        return hidden_states


class SpeechT5FeatureEncoder(nn.Module):
    """Construct the features from raw audio waveform"""

    def __init__(self, config):
        super().__init__()

        if config.feat_extract_norm == "group":
            conv_layers = [SpeechT5GroupNormConvLayer(config, layer_id=0)] + [
                SpeechT5NoLayerNormConvLayer(config, layer_id=i + 1)
                for i in range(config.num_feat_extract_layers - 1)
            ]
        elif config.feat_extract_norm == "layer":
            conv_layers = [
                SpeechT5LayerNormConvLayer(config, layer_id=i) for i in range(config.num_feat_extract_layers)
            ]
        else:
            raise ValueError(
                f"`config.feat_extract_norm` is {config.feat_extract_norm}, but has to be one of ['group', 'layer']"
            )
        self.conv_layers = nn.ModuleList(conv_layers)
        self.gradient_checkpointing = False
        self._requires_grad = True

    def _freeze_parameters(self):
        for param in self.parameters():
            param.requires_grad = False
        self._requires_grad = False

    def forward(self, input_values):
        hidden_states = input_values[:, None]

        # make sure hidden_states require grad for gradient_checkpointing
        if self._requires_grad and self.training:
            hidden_states.requires_grad = True

        for conv_layer in self.conv_layers:
            hidden_states = conv_layer(hidden_states)

        return hidden_states


class SpeechT5FeatureProjection(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.layer_norm_eps)
        self.projection = nn.Linear(config.conv_dim[-1], config.hidden_size)
        self.dropout = nn.Dropout(config.feat_proj_dropout)

    def forward(self, hidden_states):
        # non-projected hidden states are needed for quantization
        norm_hidden_states = self.layer_norm(hidden_states)
        hidden_states = self.projection(norm_hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states, norm_hidden_states


class SpeechT5SpeechEncoderPrenet(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.feature_encoder = SpeechT5FeatureEncoder(config)
        self.feature_projection = SpeechT5FeatureProjection(config)

        # model only needs masking vector if mask prob is > 0.0
        if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0:
            self.masked_spec_embed = nn.Parameter(torch.Tensor(config.hidden_size).uniform_())

        self.pos_conv_embed = SpeechT5PositionalConvEmbedding(config)
        self.pos_sinusoidal_embed = SpeechT5SinusoidalPositionalEmbedding(
            config.max_speech_positions + config.pad_token_id + 1,
            config.hidden_size,
            config.pad_token_id,
        )

    def freeze_feature_encoder(self):
        self.feature_encoder._freeze_parameters()

    def forward(
        self,
        input_values: torch.Tensor,
        attention_mask: Optional[torch.LongTensor] = None,
        mask_time_indices: Optional[torch.FloatTensor] = None,
    ):
        extract_features = self.feature_encoder(input_values)
        extract_features = extract_features.transpose(1, 2)

        if attention_mask is not None:
            # compute reduced attention_mask corresponding to feature vectors
            attention_mask = self._get_feature_vector_attention_mask(extract_features.shape[1], attention_mask)

        hidden_states, extract_features = self.feature_projection(extract_features)
        hidden_states = self._mask_hidden_states(
            hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
        )

        positional_conv_embedding = self.pos_conv_embed(hidden_states)
        hidden_states = hidden_states + positional_conv_embedding

        if attention_mask is not None:
            padding_mask = attention_mask.ne(1).long()
        else:
            padding_mask = torch.zeros(hidden_states.shape[:2], dtype=torch.long, device=hidden_states.device)

        positional_sinusoidal_embeddings = self.pos_sinusoidal_embed(padding_mask)
        hidden_states = hidden_states + positional_sinusoidal_embeddings

        return hidden_states, attention_mask

    def _get_feature_vector_attention_mask(self, feature_vector_length: int, attention_mask: torch.LongTensor):
        # Effectively attention_mask.sum(-1), but not inplace to be able to run on inference mode
        non_padded_lengths = attention_mask.cumsum(dim=-1)[:, -1]
        output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths).to(torch.long)
        batch_size = attention_mask.shape[0]

        attention_mask = torch.zeros(
            (batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
        )
        # these two operations make sure that all values before the output lengths indices are attended to
        attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1
        attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
        return attention_mask

    def _get_feat_extract_output_lengths(self, input_lengths: Union[torch.LongTensor, int]):
        """
        Computes the output length of the convolutional layers
        """

        def _conv_out_length(input_length, kernel_size, stride):
            return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1

        for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
            input_lengths = _conv_out_length(input_lengths, kernel_size, stride)

        return input_lengths

    def _mask_hidden_states(
        self,
        hidden_states: torch.FloatTensor,
        mask_time_indices: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
    ):
        """
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://huggingface.co/papers/1904.08779).
        """
        # `config.apply_spec_augment` can set masking to False
        if not getattr(self.config, "apply_spec_augment", True):
            return hidden_states

        # generate indices & apply SpecAugment along time axis
        batch_size, sequence_length, hidden_size = hidden_states.size()

        if mask_time_indices is not None:
            # apply SpecAugment along time axis with given mask_time_indices
            hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
        elif self.config.mask_time_prob > 0 and self.training:
            mask_time_indices = _compute_mask_indices(
                (batch_size, sequence_length),
                mask_prob=self.config.mask_time_prob,
                mask_length=self.config.mask_time_length,
                attention_mask=attention_mask,
                min_masks=self.config.mask_time_min_masks,
            )
            mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
            hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)

        if self.config.mask_feature_prob > 0 and self.training:
            # generate indices & apply SpecAugment along feature axis
            mask_feature_indices = _compute_mask_indices(
                (batch_size, hidden_size),
                mask_prob=self.config.mask_feature_prob,
                mask_length=self.config.mask_feature_length,
                min_masks=self.config.mask_feature_min_masks,
            )
            mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
            mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
            hidden_states[mask_feature_indices] = 0

        return hidden_states


class SpeechT5SpeechDecoderPrenet(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config

        self.layers = nn.ModuleList(
            [
                nn.Linear(
                    config.num_mel_bins if i == 0 else config.speech_decoder_prenet_units,
                    config.speech_decoder_prenet_units,
                )
                for i in range(config.speech_decoder_prenet_layers)
            ]
        )

        self.final_layer = nn.Linear(config.speech_decoder_prenet_units, config.hidden_size)
        self.encode_positions = SpeechT5ScaledPositionalEncoding(
            config.positional_dropout,
            config.hidden_size,
            config.max_speech_positions,
        )
        self.speaker_embeds_layer = nn.Linear(config.speaker_embedding_dim + config.hidden_size, config.hidden_size)

    def _consistent_dropout(self, inputs_embeds, p):
        mask = torch.bernoulli(inputs_embeds[0], p=p)
        all_masks = mask.unsqueeze(0).repeat(inputs_embeds.size(0), 1, 1)
        return torch.where(all_masks == 1, inputs_embeds, 0) * 1 / (1 - p)

    def forward(
        self,
        input_values: torch.Tensor,
        speaker_embeddings: Optional[torch.Tensor] = None,
    ):
        # Dropout is always applied, even when evaluating. See §2.2 in https://huggingface.co/papers/1712.05884.
        inputs_embeds = input_values
        for layer in self.layers:
            inputs_embeds = nn.functional.relu(layer(inputs_embeds))
            inputs_embeds = self._consistent_dropout(inputs_embeds, self.config.speech_decoder_prenet_dropout)

        inputs_embeds = self.final_layer(inputs_embeds)
        inputs_embeds = self.encode_positions(inputs_embeds)

        if speaker_embeddings is not None:
            speaker_embeddings = nn.functional.normalize(speaker_embeddings)
            speaker_embeddings = speaker_embeddings.unsqueeze(1).expand(-1, inputs_embeds.size(1), -1)
            inputs_embeds = torch.cat([inputs_embeds, speaker_embeddings], dim=-1)
            inputs_embeds = nn.functional.relu(self.speaker_embeds_layer(inputs_embeds))

        return inputs_embeds


class SpeechT5BatchNormConvLayer(nn.Module):
    def __init__(self, config, layer_id=0):
        super().__init__()

        if layer_id == 0:
            in_conv_dim = config.num_mel_bins
        else:
            in_conv_dim = config.speech_decoder_postnet_units

        if layer_id == config.speech_decoder_postnet_layers - 1:
            out_conv_dim = config.num_mel_bins
        else:
            out_conv_dim = config.speech_decoder_postnet_units

        self.conv = nn.Conv1d(
            in_conv_dim,
            out_conv_dim,
            kernel_size=config.speech_decoder_postnet_kernel,
            stride=1,
            padding=(config.speech_decoder_postnet_kernel - 1) // 2,
            bias=False,
        )
        self.batch_norm = nn.BatchNorm1d(out_conv_dim)

        if layer_id < config.speech_decoder_postnet_layers - 1:
            self.activation = nn.Tanh()
        else:
            self.activation = None

        self.dropout = nn.Dropout(config.speech_decoder_postnet_dropout)

    def forward(self, hidden_states):
        hidden_states = self.conv(hidden_states)
        hidden_states = self.batch_norm(hidden_states)
        if self.activation is not None:
            hidden_states = self.activation(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states


class SpeechT5SpeechDecoderPostnet(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config

        self.feat_out = nn.Linear(config.hidden_size, config.num_mel_bins * config.reduction_factor)
        self.prob_out = nn.Linear(config.hidden_size, config.reduction_factor)

        self.layers = nn.ModuleList(
            [SpeechT5BatchNormConvLayer(config, i) for i in range(config.speech_decoder_postnet_layers)]
        )

    def forward(self, hidden_states: torch.Tensor):
        outputs_before_postnet = self.feat_out(hidden_states).view(hidden_states.size(0), -1, self.config.num_mel_bins)
        outputs_after_postnet = self.postnet(outputs_before_postnet)
        logits = self.prob_out(hidden_states).view(hidden_states.size(0), -1)
        return outputs_before_postnet, outputs_after_postnet, logits

    def postnet(self, hidden_states: torch.Tensor):
        layer_output = hidden_states.transpose(1, 2)
        for layer in self.layers:
            layer_output = layer(layer_output)
        return hidden_states + layer_output.transpose(1, 2)


class SpeechT5TextEncoderPrenet(nn.Module, EmbeddingAccessMixin):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
        self.encode_positions = SpeechT5ScaledPositionalEncoding(
            config.positional_dropout,
            config.hidden_size,
            config.max_text_positions,
        )

    def forward(self, input_ids: torch.Tensor):
        inputs_embeds = self.embed_tokens(input_ids)
        inputs_embeds = self.encode_positions(inputs_embeds)
        return inputs_embeds


class SpeechT5TextDecoderPrenet(nn.Module, EmbeddingAccessMixin):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.dropout = nn.Dropout(config.positional_dropout)
        self.embed_scale = math.sqrt(config.hidden_size) if config.scale_embedding else 1.0

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)

        self.embed_positions = SpeechT5SinusoidalPositionalEmbedding(
            config.max_text_positions + config.pad_token_id + 1,
            config.hidden_size,
            config.pad_token_id,
        )

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
    ):
        if input_ids is not None:
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
        else:
            raise ValueError("You have to specify `decoder_input_ids`")

        if past_key_values is not None:
            past_key_values_length = (
                past_key_values[0][0].shape[2]
                if not isinstance(past_key_values, Cache)
                else past_key_values.get_seq_length()
            )
        else:
            past_key_values_length = 0

        positions = self.embed_positions(input_ids, past_key_values_length)

        inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
        inputs_embeds += positions
        inputs_embeds = self.dropout(inputs_embeds)

        return inputs_embeds, attention_mask


class SpeechT5TextDecoderPostnet(nn.Module, EmbeddingAccessMixin):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

    def forward(self, hidden_states: torch.Tensor):
        return self.lm_head(hidden_states)

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

class SpeechT5Attention(nn.Module):
    """
    Multi-headed attention from 'Attention Is All You Need' paper with relative position bias (see
    https://aclanthology.org/N18-2074.pdf)
    """

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
        layer_idx: Optional[int] = None,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        if self.head_dim * num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {num_heads})."
            )
        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder
        self.layer_idx = layer_idx

        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        past_key_values: Optional[Cache] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        position_bias: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        cache_position: Optional[torch.Tensor] = None,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Input shape: Batch x Time x Channel"""

        # if key_value_states are provided this layer is used as a cross-attention layer for the decoder
        is_cross_attention = key_value_states is not None

        bsz, tgt_len, _ = hidden_states.size()

        # get query proj
        query_states = self.q_proj(hidden_states) * self.scaling

        if past_key_values is not None:
            if isinstance(past_key_values, EncoderDecoderCache):
                is_updated = past_key_values.is_updated.get(self.layer_idx)
                if is_cross_attention:
                    # after the first generated id, we can subsequently re-use all key/value_states from cache
                    curr_past_key_value = past_key_values.cross_attention_cache
                else:
                    curr_past_key_value = past_key_values.self_attention_cache
            else:
                curr_past_key_value = past_key_values

        current_states = key_value_states if is_cross_attention else hidden_states
        if is_cross_attention and past_key_values is not None and is_updated:
            # reuse k, v, cross_attentions
            key_states = curr_past_key_value.layers[self.layer_idx].keys
            value_states = curr_past_key_value.layers[self.layer_idx].values
        else:
            key_states = self.k_proj(current_states).view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)
            value_states = self.v_proj(current_states).view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)

            if past_key_values is not None:
                # save all key/value_states to cache to be re-used for fast auto-regressive generation
                cache_position = cache_position if not is_cross_attention else None
                key_states, value_states = curr_past_key_value.update(
                    key_states, value_states, self.layer_idx, {"cache_position": cache_position}
                )
                # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
                if is_cross_attention:
                    past_key_values.is_updated[self.layer_idx] = True

        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
        query_states = query_states.view(bsz, tgt_len, self.num_heads, self.head_dim).transpose(1, 2)
        query_states = query_states.reshape(*proj_shape)
        key_states = key_states.reshape(*proj_shape)
        value_states = value_states.reshape(*proj_shape)

        src_len = key_states.size(1)
        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))

        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
                f" {attn_weights.size()}"
            )

        # relative attention bias
        if position_bias is not None:
            reshape_q = query_states.contiguous().view(bsz * self.num_heads, -1, self.head_dim).transpose(0, 1)
            rel_pos_bias = torch.matmul(reshape_q, position_bias.transpose(-2, -1))
            rel_pos_bias = rel_pos_bias.transpose(0, 1).view(
                bsz * self.num_heads, position_bias.size(0), position_bias.size(1)
            )
            attn_weights += rel_pos_bias

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        if layer_head_mask is not None:
            if layer_head_mask.size() != (self.num_heads,):
                raise ValueError(
                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
                    f" {layer_head_mask.size()}"
                )
            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        if output_attentions:
            # keep a reshaped view so that attn_weights keeps its gradient
            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
        else:
            attn_weights_reshaped = None

        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        attn_output = torch.bmm(attn_probs, value_states)

        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights_reshaped


class SpeechT5FeedForward(nn.Module):
    def __init__(self, config, intermediate_size):
        super().__init__()
        self.intermediate_dropout = nn.Dropout(config.activation_dropout)

        self.intermediate_dense = nn.Linear(config.hidden_size, intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

        self.output_dense = nn.Linear(intermediate_size, config.hidden_size)
        self.output_dropout = nn.Dropout(config.hidden_dropout)

    def forward(self, hidden_states):
        hidden_states = self.intermediate_dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        hidden_states = self.intermediate_dropout(hidden_states)

        hidden_states = self.output_dense(hidden_states)
        hidden_states = self.output_dropout(hidden_states)
        return hidden_states


class SpeechT5EncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: SpeechT5Config):
        super().__init__()
        self.attention = SpeechT5Attention(
            embed_dim=config.hidden_size,
            num_heads=config.encoder_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=False,
        )
        self.dropout = nn.Dropout(config.hidden_dropout)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.feed_forward = SpeechT5FeedForward(config, config.encoder_ffn_dim)
        self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        position_bias: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ):
        """
        Args:
            hidden_states (`torch.FloatTensor`):
                input to the layer of shape `(batch, seq_len, hidden_size)`
            attention_mask (`torch.FloatTensor`):
                attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very
                large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(config.encoder_attention_heads,)`.
            position_bias (`torch.FloatTensor`):
                relative position embeddings of size `(seq_len, seq_len, hidden_size // encoder_attention_heads)`
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states
        hidden_states, attn_weights = self.attention(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            position_bias=position_bias,
            output_attentions=output_attentions,
        )

        hidden_states = self.dropout(hidden_states)
        hidden_states = residual + hidden_states

        hidden_states = self.layer_norm(hidden_states)
        hidden_states = hidden_states + self.feed_forward(hidden_states)
        hidden_states = self.final_layer_norm(hidden_states)

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs


class SpeechT5DecoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: SpeechT5Config, layer_idx=None):
        super().__init__()
        self.self_attn = SpeechT5Attention(
            embed_dim=config.hidden_size,
            num_heads=config.decoder_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=True,
            layer_idx=layer_idx,
        )
        self.dropout = nn.Dropout(config.hidden_dropout)
        self.self_attn_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        self.encoder_attn = SpeechT5Attention(
            config.hidden_size,
            config.decoder_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=True,
            layer_idx=layer_idx,
        )
        self.encoder_attn_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        self.feed_forward = SpeechT5FeedForward(config, config.decoder_ffn_dim)
        self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = True,
        cache_position: Optional[torch.Tensor] = None,
    ):
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, hidden_size)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            encoder_hidden_states (`torch.FloatTensor`):
                cross attention input to the layer of shape `(batch, seq_len, hidden_size)`
            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
                size `(decoder_attention_heads,)`.
            past_key_values (`Cache`): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        # Self Attention
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
            cache_position=cache_position,
        )
        hidden_states = self.dropout(hidden_states)
        hidden_states = residual + hidden_states
        hidden_states = self.self_attn_layer_norm(hidden_states)

        # Cross-Attention Block
        cross_attn_weights = None
        if encoder_hidden_states is not None:
            residual = hidden_states
            hidden_states, cross_attn_weights = self.encoder_attn(
                hidden_states=hidden_states,
                key_value_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                layer_head_mask=cross_attn_layer_head_mask,
                past_key_values=past_key_values,
                output_attentions=output_attentions,
                cache_position=cache_position,
            )
            hidden_states = self.dropout(hidden_states)
            hidden_states = residual + hidden_states
            hidden_states = self.encoder_attn_layer_norm(hidden_states)

        # Fully Connected
        hidden_states = hidden_states + self.feed_forward(hidden_states)
        hidden_states = self.final_layer_norm(hidden_states)

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights, cross_attn_weights)

        return outputs


@auto_docstring
class SpeechT5PreTrainedModel(PreTrainedModel):
    config: SpeechT5Config
    base_model_prefix = "speecht5"
    main_input_name = "input_values"
    supports_gradient_checkpointing = True

    def _init_weights(self, module: nn.Module):
        """Initialize the weights"""
        std = self.config.initializer_range
        if isinstance(module, SpeechT5PositionalConvEmbedding):
            nn.init.normal_(
                module.conv.weight,
                mean=0,
                std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)),
            )
            nn.init.constant_(module.conv.bias, 0)
        elif isinstance(module, SpeechT5ScaledPositionalEncoding):
            module.alpha.data.fill_(1.0)
        elif isinstance(module, SpeechT5FeatureProjection):
            k = math.sqrt(1 / module.projection.in_features)
            nn.init.uniform_(module.projection.weight, a=-k, b=k)
            nn.init.uniform_(module.projection.bias, a=-k, b=k)
        elif isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, (nn.LayerNorm, nn.GroupNorm, nn.BatchNorm1d)):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, nn.Conv1d):
            nn.init.kaiming_normal_(module.weight)
            if module.bias is not None:
                k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
                nn.init.uniform_(module.bias, a=-k, b=k)
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        if hasattr(module, "masked_spec_embed"):
            nn.init.uniform_(module.masked_spec_embed)

class SpeechT5Encoder(SpeechT5PreTrainedModel):
    """
    Transformer encoder consisting of *config.encoder_layers* layers. Each layer is a [`SpeechT5EncoderLayer`].
    """

    def __init__(self, config: SpeechT5Config):
        super().__init__(config)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout)
        self.layerdrop = config.encoder_layerdrop

        self.layers = nn.ModuleList([SpeechT5EncoderLayer(config) for _ in range(config.encoder_layers)])

        self.embed_positions = SpeechT5RelativePositionalEncoding(
            config.hidden_size // config.encoder_attention_heads, config.encoder_max_relative_position
        )

        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutput]:
        """
        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, feature_size)`):
                Features extracted from the speech or text input by the encoder prenet.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
                `[0, 1]`: 1 for tokens that are **not masked**, 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
            head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
                1 indicates the head is **not masked**, 0 indicates the head is **masked**.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if attention_mask is not None:
            # expand attention_mask to [bsz, 1, tgt_seq_len, src_seq_len]
            attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)

        hidden_states = self.layer_norm(hidden_states)
        hidden_states = self.dropout(hidden_states)

        position_bias = self.embed_positions(hidden_states)

        synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self)

        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        # check if head_mask has a correct number of layers specified if desired
        if head_mask is not None and head_mask.size()[0] != len(self.layers):
            raise ValueError(
                f"The head_mask should be specified for {len(self.layers)} layers, but it is for"
                f" {head_mask.size()[0]}."
            )

        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
            skip_the_layer = False
            if self.training:
                dropout_probability = torch.rand([])
                skip_the_layer = dropout_probability < self.layerdrop

            if not skip_the_layer or synced_gpus:
                layer_outputs = encoder_layer(
                    hidden_states,
                    attention_mask=attention_mask,
                    position_bias=position_bias,
                    layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                    output_attentions=output_attentions,
                )
                hidden_states = layer_outputs[0]

            if skip_the_layer:
                layer_outputs = (None, None)

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)

        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


class SpeechT5EncoderWithSpeechPrenet(SpeechT5PreTrainedModel):
    """
    Wrapper around SpeechT5Encoder that applies SpeechT5SpeechEncoderPrenet to convert the audio waveform data to
    hidden features.
    """

    def __init__(self, config: SpeechT5Config):
        super().__init__(config)
        self.prenet = SpeechT5SpeechEncoderPrenet(config)
        self.wrapped_encoder = SpeechT5Encoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_values: torch.FloatTensor,
        attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutput]:
        hidden_states, attention_mask = self.prenet(input_values, attention_mask)

        outputs = self.wrapped_encoder(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        return outputs


class SpeechT5EncoderWithTextPrenet(SpeechT5PreTrainedModel):
    """
    Wrapper around SpeechT5Encoder that applies SpeechT5TextEncoderPrenet to convert the input_ids to hidden features.
    """

    def __init__(self, config: SpeechT5Config):
        super().__init__(config)
        self.prenet = SpeechT5TextEncoderPrenet(config)
        self.wrapped_encoder = SpeechT5Encoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.prenet.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.prenet.set_input_embeddings(value)

    def forward(
        self,
        input_values: torch.FloatTensor,
        attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutput]:
        hidden_states = self.prenet(input_values)

        outputs = self.wrapped_encoder(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        return outputs


class SpeechT5EncoderWithoutPrenet(SpeechT5PreTrainedModel):
    """
    This wrapper class is a helper class to correctly load pretrained checkpoints when used in combination with
    [`SpeechT5Model`].
    """

    def __init__(self, config: SpeechT5Config):
        super().__init__(config)
        self.wrapped_encoder = SpeechT5Encoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_values: torch.FloatTensor,
        attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutput]:
        return self.wrapped_encoder(
            hidden_states=input_values,
            attention_mask=attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

class SpeechT5Decoder(SpeechT5PreTrainedModel):
    """
    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`SpeechT5DecoderLayer`]
    """

    def __init__(self, config: SpeechT5Config):
        super().__init__(config)
        self.layerdrop = config.decoder_layerdrop

        self.layers = nn.ModuleList([SpeechT5DecoderLayer(config, layer_idx=i) for i in range(config.decoder_layers)])

        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        hidden_states: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.Tensor] = None,
    ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
        """
        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, feature_size)`):
                Features extracted from the speech or text input by the decoder prenet.
            attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
                1 for tokens that are **not masked**, 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
                of the decoder.
            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
                Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
                selected in `[0, 1]`.
            head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`.
            cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
                `[0, 1]`.
            past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True`
                is passed or when `config.use_cache=True`):
                Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
                blocks) that can be used to speed up sequential decoding. If `past_key_values` are used, the user can
                optionally input only the last `decoder_input_ids` of shape `(batch_size, 1)` instead of all
                `decoder_input_ids` of shape `(batch_size, sequence_length)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        input_shape = hidden_states.size()[:-1]

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        if use_cache and past_key_values is None:
            past_key_values = EncoderDecoderCache(DynamicCache(), DynamicCache())
        elif use_cache and isinstance(past_key_values, tuple):
            logger.warning_once(
                "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. "
                "You should pass an instance of `EncoderDecoderCache` instead, e.g. "
                "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`."
            )
            past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)

        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0

        attention_mask = _prepare_4d_causal_attention_mask(
            attention_mask, input_shape, hidden_states, past_key_values_length
        )

        # expand encoder attention mask
        if encoder_hidden_states is not None and encoder_attention_mask is not None:
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            encoder_attention_mask = _prepare_4d_attention_mask(
                encoder_attention_mask, hidden_states.dtype, tgt_len=input_shape[-1]
            )

        synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self)

        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None

        # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
        for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
            if attn_mask is not None and attn_mask.size()[0] != len(self.layers):
                raise ValueError(
                    f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
                    f" {attn_mask.size()[0]}."
                )

        for idx, decoder_layer in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
            skip_the_layer = False
            if self.training:
                dropout_probability = torch.rand([])
                skip_the_layer = dropout_probability < self.layerdrop
            if skip_the_layer and not synced_gpus:
                continue

            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=attention_mask,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                cross_attn_layer_head_mask=(cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None),
                past_key_values=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
            )
            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

                if encoder_hidden_states is not None:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, past_key_values, all_hidden_states, all_self_attentions, all_cross_attentions]
                if v is not None
            )

        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )


class SpeechT5DecoderWithSpeechPrenet(SpeechT5PreTrainedModel):
    """
    Wrapper around SpeechT5Decoder that applies SpeechT5SpeechDecoderPrenet to convert log-mel filterbanks to hidden
    features.
    """

    def __init__(self, config: SpeechT5Config):
        super().__init__(config)
        self.prenet = SpeechT5SpeechDecoderPrenet(config)
        self.wrapped_decoder = SpeechT5Decoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.LongTensor] = None,
        speaker_embeddings: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.Tensor] = None,
    ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
        decoder_hidden_states = self.prenet(input_values, speaker_embeddings)

        outputs = self.wrapped_decoder(
            hidden_states=decoder_hidden_states,
            attention_mask=attention_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            head_mask=head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        return outputs


class SpeechT5DecoderWithTextPrenet(SpeechT5PreTrainedModel):
    """
    Wrapper around SpeechT5Decoder that applies SpeechT5TextDecoderPrenet to convert input tokens to hidden features.
    """

    def __init__(self, config: SpeechT5Config):
        super().__init__(config)
        self.prenet = SpeechT5TextDecoderPrenet(config)
        self.wrapped_decoder = SpeechT5Decoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.prenet.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.prenet.set_input_embeddings(value)

    def forward(
        self,
        input_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.Tensor] = None,
    ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
        decoder_hidden_states, attention_mask = self.prenet(input_values, attention_mask, past_key_values)

        outputs = self.wrapped_decoder(
            hidden_states=decoder_hidden_states,
            attention_mask=attention_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            head_mask=head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        return outputs


class SpeechT5DecoderWithoutPrenet(SpeechT5PreTrainedModel):
    """
    This wrapper class is a helper class to correctly load pretrained checkpoints when used in combination with
    [`SpeechT5Model`].
    """

    def __init__(self, config: SpeechT5Config):
        super().__init__(config)
        self.wrapped_decoder = SpeechT5Decoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.Tensor] = None,
    ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
        outputs = self.wrapped_decoder(
            hidden_states=input_values,
            attention_mask=attention_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            head_mask=head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )
        return outputs

class SpeechT5GuidedMultiheadAttentionLoss(nn.Module):
    """
    Guided attention loss from the paper [Efficiently Trainable Text-to-Speech System Based on Deep Convolutional
    Networks with Guided Attention](https://huggingface.co/papers/1710.08969), adapted for multi-head attention.
    """

    def __init__(self, config: SpeechT5Config):
        super().__init__()
        self.sigma = config.guided_attention_loss_sigma
        self.scale = config.guided_attention_loss_scale

    def forward(
        self, attentions: torch.FloatTensor, input_masks: torch.BoolTensor, output_masks: torch.BoolTensor
    ) -> torch.Tensor:
        """
        Compute the attention loss.

        Args:
            attentions (`torch.FloatTensor` of shape `(batch_size, layers * heads, output_sequence_length, input_sequence_length)`):
                Batch of multi-head attention weights
            input_masks (`torch.BoolTensor` of shape `(batch_size, input_sequence_length)`):
                Input attention mask as booleans.
            output_masks (`torch.BoolTensor` of shape `(batch_size, output_sequence_length)`):
                Target attention mask as booleans.

        Returns:
            `torch.Tensor` with the loss value
        """
        guided_attn_masks = self._make_guided_attention_masks(input_masks, output_masks, attentions.device)
        masks = output_masks.unsqueeze(-1) & input_masks.unsqueeze(-2)
        masks = masks.to(attentions.device).unsqueeze(1)

        losses = guided_attn_masks * attentions
        loss = torch.mean(losses.masked_select(masks))
        return self.scale * loss

    def _make_guided_attention_masks(self, input_masks, output_masks, device):
        input_lengths = input_masks.sum(-1)
        output_lengths = output_masks.sum(-1)

        guided_attn_masks = torch.zeros((len(input_masks), output_masks.shape[1], input_masks.shape[1]), device=device)

        for idx, (ilen, olen) in enumerate(zip(input_lengths, output_lengths)):
            guided_attn_masks[idx, :olen, :ilen] = self._make_guided_attention_mask(ilen, olen, self.sigma, device)

        return guided_attn_masks.unsqueeze(1)

    @staticmethod
    def _make_guided_attention_mask(input_length, output_length, sigma, device):
        grid_y, grid_x = torch.meshgrid(
            torch.arange(input_length, device=device),
            torch.arange(output_length, device=device),
            indexing="xy",
        )
        grid_x = grid_x.float() / output_length
        grid_y = grid_y.float() / input_length
        return 1.0 - torch.exp(-((grid_y - grid_x) ** 2) / (2 * sigma**2))

class SpeechT5SpectrogramLoss(nn.Module):
    """
    Loss computation used by SpeechT5ForTextToSpeech.
    """

    def __init__(self, config: SpeechT5Config):
        super().__init__()
        self.use_guided_attention_loss = config.use_guided_attention_loss
        self.guided_attention_loss_num_heads = config.guided_attention_loss_num_heads
        self.reduction_factor = config.reduction_factor

        self.l1_criterion = L1Loss()
        self.bce_criterion = BCEWithLogitsLoss(pos_weight=torch.tensor(5.0))

        if self.use_guided_attention_loss:
            self.attn_criterion = SpeechT5GuidedMultiheadAttentionLoss(config)

    def forward(
        self,
        attention_mask: torch.LongTensor,
        outputs_before_postnet: torch.FloatTensor,
        outputs_after_postnet: torch.FloatTensor,
        logits: torch.FloatTensor,
        labels: torch.FloatTensor,
        cross_attentions: Optional[torch.FloatTensor] = None,
    ) -> torch.Tensor:
        padding_mask = labels != -100.0

        # mask out the padded portions
        labels = labels.masked_select(padding_mask)
        outputs_before_postnet = outputs_before_postnet.masked_select(padding_mask)
        outputs_after_postnet = outputs_after_postnet.masked_select(padding_mask)

        # spectrogram loss
        l1_loss = self.l1_criterion(outputs_after_postnet, labels) + self.l1_criterion(outputs_before_postnet, labels)

        # construct stop labels from the padding mask
        masks = padding_mask[:, :, 0]
        stop_labels = torch.cat([~masks * 1.0, torch.ones(masks.size(0), 1).to(masks.device)], dim=1)
        stop_labels = stop_labels[:, 1:].masked_select(masks)
        logits = logits.masked_select(masks)

        # stop token loss
        bce_loss = self.bce_criterion(logits, stop_labels)

        # combined loss
        loss = l1_loss + bce_loss

        # guided attention loss
        if self.use_guided_attention_loss:
            attn = torch.cat([x[:, : self.guided_attention_loss_num_heads] for x in cross_attentions], dim=1)
            input_masks = attention_mask == 1
            output_masks = padding_mask[:, :, 0]
            if self.reduction_factor > 1:
                output_masks = output_masks[:, self.reduction_factor - 1 :: self.reduction_factor]
            attn_loss = self.attn_criterion(attn, input_masks, output_masks)
            loss += attn_loss

        return loss

    The bare SpeechT5 Encoder-Decoder Model outputting raw hidden-states without any specific pre- or post-nets.
    Zcustom_introc                       s   e Zd Zdeeej eej d fddZdd Zdd Z	d	d
 Z
dd Zedeej eej eej eej eej eej eej eeeej   eeeej   ee eej ee ee ee eej eeej ef dddZ  ZS )SpeechT5ModelN)ro   encoderdecoderc                    sJ   t  | || _|du r"t|n|| _|du r8t|n|| _|   dS )z
        encoder (`PreTrainedModel`, *optional*):
            The encoder model to use.
        decoder (`PreTrainedModel`, *optional*):
            The decoder model to use.
        N)ra   rb   ro   r  r  r  r  r  )rn   ro   r  r  rq   r)   r*   rb     s
    zSpeechT5Model.__init__c                 C   s4   t | jtr| j S t | jtr,| j S td S rs   )r-  r  r  r  r  r  NotImplementedErrorr   r)   r)   r*   r    s
    

z"SpeechT5Model.get_input_embeddingsc                 C   s4   t | jtr| j| t | jtr0| j| d S rs   )r-  r  r  r  r  r  r  r)   r)   r*   r    s    z"SpeechT5Model.set_input_embeddingsc                 C   s   | j S rs   )r  r   r)   r)   r*   get_encoder  s    zSpeechT5Model.get_encoderc                 C   s   t | jtr| jj  dS z
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        N)r-  r  r  r  r   r   r)   r)   r*   r     s    z$SpeechT5Model.freeze_feature_encoder)r,   r.   decoder_input_valuesdecoder_attention_maskr  decoder_head_maskr  encoder_outputsr,  ro  r  rJ  r  r  rK  r5   c                 C   sl  |dur|n| j j}|dur |n| j j}|
dur4|
n| j j}
|durH|n| j j}|du rp| j||||||d}nH|rt|tst|d t|dkr|d ndt|dkr|d ndd}|durt| jt	r| jj
|d jd |}n|}t| jtrd|i}ni }| jf |||d ||||	|
||||d|}|sD|| S t|j|j|j|j|j|j|j|jd	S )
a  
        input_values (`torch.Tensor` of shape `(batch_size, sequence_length)`):
            Depending on which encoder is being used, the `input_values` are either: float values of the input raw
            speech waveform, or indices of input sequence tokens in the vocabulary, or hidden states.
        decoder_input_values (`torch.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Depending on which decoder is being used, the `decoder_input_values` are either: float values of log-mel
            filterbank features extracted from the raw speech waveform, or indices of decoder input sequence tokens in
            the vocabulary, or hidden states.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_values`. Causal mask will
            also be used by default.

            If you want to change padding behavior, you should read [`SpeechT5Decoder._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
            Tensor containing the speaker embeddings.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_values=input_values,
                attention_mask=attention_mask,
                head_mask=head_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        # downsample encoder attention mask (only for encoders with a speech prenet)
        if attention_mask is not None and isinstance(self.encoder, SpeechT5EncoderWithSpeechPrenet):
            encoder_attention_mask = self.encoder.prenet._get_feature_vector_attention_mask(
                encoder_outputs[0].shape[1], attention_mask
            )
        else:
            encoder_attention_mask = attention_mask

        if isinstance(self.decoder, SpeechT5DecoderWithSpeechPrenet):
            decoder_args = {"speaker_embeddings": speaker_embeddings}
        else:
            decoder_args = {}

        decoder_outputs = self.decoder(
            input_values=decoder_input_values,
            attention_mask=decoder_attention_mask,
            encoder_hidden_states=encoder_outputs[0],
            encoder_attention_mask=encoder_attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            **decoder_args,
        )

        if not return_dict:
            return decoder_outputs + encoder_outputs

        return Seq2SeqModelOutput(
            last_hidden_state=decoder_outputs.last_hidden_state,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )
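# A minimal usage sketch of the bare model (illustrative; the checkpoint name is an assumption and
# the expected decoder inputs depend on which pre-nets the checkpoint was built with):
#
#     model = SpeechT5Model.from_pretrained("microsoft/speecht5_asr")
#     outputs = model(input_values=input_values, decoder_input_values=decoder_input_ids)
#     outputs.last_hidden_state  # (batch_size, target_sequence_length, hidden_size)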
zSpeechT5Model.forward)NN)NNNNNNNNNNNNNNN)rx   ry   rz   r   r   r   r  rb   r  r  r  r   r   r   r   r  r  rU  rP   r   r   rv   r{   r)   r)   rq   r*   r    sZ   	                 r  zB
    SpeechT5 Model with a speech encoder and a text decoder.
    """
)
class SpeechT5ForSpeechToText(SpeechT5PreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["text_decoder_postnet.lm_head.weight"]

    def __init__(self, config: SpeechT5Config):
        super().__init__(config)

        if config.vocab_size is None:
            raise ValueError(
                f"You are trying to instantiate {self.__class__} with a configuration that does not define the"
                " vocabulary size of the language model head. Please instantiate the model as follows:"
                " `SpeechT5ForSpeechToText.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of"
                " your model's configuration."
            )

        speech_encoder = SpeechT5EncoderWithSpeechPrenet(config)
        text_decoder = SpeechT5DecoderWithTextPrenet(config)
        self.speecht5 = SpeechT5Model(config, speech_encoder, text_decoder)

        self.text_decoder_postnet = SpeechT5TextDecoderPostnet(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_encoder(self):
        return self.speecht5.get_encoder()

    def get_decoder(self):
        return self.speecht5.get_decoder()

    def freeze_feature_encoder(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        """
        self.get_encoder().prenet.freeze_feature_encoder()

    def get_output_embeddings(self):
        return self.text_decoder_postnet.get_output_embeddings()

    def set_output_embeddings(self, new_embeddings):
        self.text_decoder_postnet.set_output_embeddings(new_embeddings)

    @auto_docstring
    def forward(
        self,
        input_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        decoder_head_mask: Optional[torch.FloatTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None,
        past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: Optional[torch.LongTensor] = None,
    ) -> Union[tuple, Seq2SeqLMOutput]:
        r"""
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Float values of input raw speech waveform. Values can be obtained by loading a *.flac* or *.wav* audio file
            into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
            (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
            To prepare the array into `input_values`, the [`SpeechT5Processor`] should be used for padding
            and conversion into a tensor of type `torch.FloatTensor`. See [`SpeechT5Processor.__call__`] for details.
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`SpeechT5Tokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            SpeechT5 uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_values`. Causal mask will
            also be used by default.

            If you want to change padding behavior, you should read [`SpeechT5Decoder._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the language modeling loss. Indices should either be in `[0, ..., config.vocab_size]`
            or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked), the loss is
            only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

            Label indices can be obtained using [`SpeechT5Tokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

        Example:

        ```python
        >>> from transformers import SpeechT5Processor, SpeechT5ForSpeechToText
        >>> from datasets import load_dataset

        >>> dataset = load_dataset(
        ...     "hf-internal-testing/librispeech_asr_demo", "clean", split="validation"
        ... )  # doctest: +IGNORE_RESULT
        >>> dataset = dataset.sort("id")
        >>> sampling_rate = dataset.features["audio"].sampling_rate

        >>> processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_asr")
        >>> model = SpeechT5ForSpeechToText.from_pretrained("microsoft/speecht5_asr")

        >>> # audio file is decoded on the fly
        >>> inputs = processor(audio=dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")
        >>> predicted_ids = model.generate(**inputs, max_length=100)

        >>> # transcribe speech
        >>> transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
        >>> transcription[0]
        'mister quilter is the apostle of the middle classes and we are glad to welcome his gospel'
        ```

        ```python
        >>> inputs["labels"] = processor(text_target=dataset[0]["text"], return_tensors="pt").input_ids

        >>> # compute loss
        >>> loss = model(**inputs).loss
        >>> round(loss.item(), 2)
        19.68
        ```
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if labels is not None:
            if decoder_input_ids is None:
                decoder_input_ids = shift_tokens_right(
                    labels, self.config.pad_token_id, self.config.decoder_start_token_id
                )

        outputs = self.speecht5(
            input_values=input_values,
            attention_mask=attention_mask,
            decoder_input_values=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            encoder_outputs=encoder_outputs,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )

        logits = self.text_decoder_postnet(outputs[0])

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return Seq2SeqLMOutput(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )
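# The helper below is the shared autoregressive spectrogram-generation loop used by both the
# text-to-speech and speech-to-speech models: each step feeds the spectrogram generated so far
# through the decoder prenet, predicts `reduction_factor` new mel frames plus stop-token logits,
# and a batch element is considered finished once the summed sigmoid of its stop logits reaches
# `threshold` (or the `maxlenratio`-derived length cap is hit).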
def _generate_speech(
    model: SpeechT5PreTrainedModel,
    input_values: torch.FloatTensor,
    speaker_embeddings: Optional[torch.FloatTensor] = None,
    attention_mask: Optional[torch.LongTensor] = None,
    threshold: float = 0.5,
    minlenratio: float = 0.0,
    maxlenratio: float = 20.0,
    vocoder: Optional[nn.Module] = None,
    output_cross_attentions: bool = False,
    return_output_lengths: bool = False,
) -> Union[torch.FloatTensor, tuple[torch.FloatTensor, torch.FloatTensor]]:
    if speaker_embeddings is None:
        raise ValueError(
            """`speaker_embeddings` must be specified. For example, you can use a speaker embeddings by following
                    the code snippet provided in this link:
                    https://huggingface.co/datasets/Matthijs/cmu-arctic-xvectors
                    """
        )

    if attention_mask is None:
        encoder_attention_mask = 1 - (input_values == model.config.pad_token_id).int()
    else:
        encoder_attention_mask = attention_mask

    bsz = input_values.size(0)

    encoder_out = model.speecht5.encoder(
        input_values=input_values,
        attention_mask=encoder_attention_mask,
        return_dict=True,
    )

    encoder_last_hidden_state = encoder_out.last_hidden_state

    # downsample encoder attention mask
    if isinstance(model.speecht5.encoder, SpeechT5EncoderWithSpeechPrenet):
        encoder_attention_mask = model.speecht5.encoder.prenet._get_feature_vector_attention_mask(
            encoder_out[0].shape[1], encoder_attention_mask
        )

    maxlen = int(encoder_last_hidden_state.size(1) * maxlenratio / model.config.reduction_factor)
    minlen = int(encoder_last_hidden_state.size(1) * minlenratio / model.config.reduction_factor)

    # Start the output sequence with a mel spectrum that is all zeros.
    output_sequence = encoder_last_hidden_state.new_zeros(bsz, 1, model.config.num_mel_bins)

    spectrogram = []
    cross_attentions = []
    past_key_values = None
    idx = 0
    result_spectrogram = {}

    while True:
        idx += 1

        # Run the decoder prenet on the entire output sequence.
        decoder_hidden_states = model.speecht5.decoder.prenet(output_sequence, speaker_embeddings)
        # Run the decoder layers on the last element of the prenet output.
        decoder_out = model.speecht5.decoder.wrapped_decoder(
            hidden_states=decoder_hidden_states[:, -1:],
            attention_mask=None,
            encoder_hidden_states=encoder_last_hidden_state,
            encoder_attention_mask=encoder_attention_mask,
            past_key_values=past_key_values,
            use_cache=True,
            output_attentions=output_cross_attentions,
            return_dict=True,
        )

        if output_cross_attentions:
            cross_attentions.append(torch.cat(decoder_out.cross_attentions, dim=0))

        last_decoder_output = decoder_out.last_hidden_state.squeeze(1)
        past_key_values = decoder_out.past_key_values

        # Predict the new mel spectrum for this step in the sequence.
        spectrum = model.speech_decoder_postnet.feat_out(last_decoder_output)
        spectrum = spectrum.view(bsz, model.config.reduction_factor, model.config.num_mel_bins)
        spectrogram.append(spectrum)

        # Extend the output sequence with the new mel spectrum.
        new_spectrogram = spectrum[:, -1, :].view(bsz, 1, model.config.num_mel_bins)
        output_sequence = torch.cat((output_sequence, new_spectrogram), dim=1)

        # Predict the probability that this is the stop token.
        prob = torch.sigmoid(model.speech_decoder_postnet.prob_out(last_decoder_output))

        if idx < minlen:
            continue

        # If the generation loop is below the maximum length, check which batch elements have met
        # the probability threshold. Otherwise, assume all have met it and fill in the remaining
        # spectrograms for the batch.
        if idx < maxlen:
            meet_thresholds = torch.sum(prob, dim=-1) >= threshold
            meet_indexes = torch.where(meet_thresholds)[0].tolist()
        else:
            meet_indexes = range(len(prob))
        meet_indexes = [i for i in meet_indexes if i not in result_spectrogram]

        if len(meet_indexes) > 0:
            spectrograms = torch.stack(spectrogram)
            spectrograms = spectrograms.transpose(0, 1).flatten(1, 2)
            spectrograms = model.speech_decoder_postnet.postnet(spectrograms)
            for meet_index in meet_indexes:
                result_spectrogram[meet_index] = spectrograms[meet_index]

        if len(result_spectrogram) >= bsz:
            break

    spectrograms = [result_spectrogram[i] for i in range(len(result_spectrogram))]

    if not return_output_lengths:
        spectrogram = spectrograms[0] if bsz == 1 else torch.nn.utils.rnn.pad_sequence(spectrograms, batch_first=True)
        if vocoder is not None:
            outputs = vocoder(spectrogram)
        else:
            outputs = spectrogram
        if output_cross_attentions:
            cross_attentions = torch.cat(cross_attentions, dim=2)
            if bsz > 1:
                cross_attentions = cross_attentions.view(
                    bsz, int(cross_attentions.size(0) / bsz), *cross_attentions.size()[-3:]
                )
            outputs = (outputs, cross_attentions)
    else:
        spectrogram_lengths = [spectrograms[i].size(0) for i in range(len(spectrograms))]
        spectrograms = torch.nn.utils.rnn.pad_sequence(spectrograms, batch_first=True)
        if vocoder is None:
            outputs = (spectrograms, spectrogram_lengths)
        else:
            waveforms = vocoder(spectrograms)
            waveform_lengths = [
                int(waveforms.size(1) / max(spectrogram_lengths)) * spectrogram_lengths[i]
                for i in range(len(spectrogram_lengths))
            ]
            outputs = (waveforms, waveform_lengths)
        if output_cross_attentions:
            cross_attentions = torch.cat(cross_attentions, dim=2)
            cross_attentions = cross_attentions.view(
                bsz, int(cross_attentions.size(0) / bsz), *cross_attentions.size()[-3:]
            )
            outputs = (*outputs, cross_attentions)

    return outputs


@auto_docstring(
    custom_intro="""
    SpeechT5 Model with a text encoder and a speech decoder.
    """
)
class SpeechT5ForTextToSpeech(SpeechT5PreTrainedModel):
    main_input_name = "input_ids"

    def __init__(self, config: SpeechT5Config):
        super().__init__(config)

        if config.vocab_size is None:
            raise ValueError(
                f"You are trying to instantiate {self.__class__} with a configuration that does not define the"
                " vocabulary size of the language model head. Please instantiate the model as follows:"
                " `SpeechT5ForTextToSpeech.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of"
                " your model's configuration."
            )

        text_encoder = SpeechT5EncoderWithTextPrenet(config)
        speech_decoder = SpeechT5DecoderWithSpeechPrenet(config)
        self.speecht5 = SpeechT5Model(config, text_encoder, speech_decoder)

        self.speech_decoder_postnet = SpeechT5SpeechDecoderPostnet(config)

        # Initialize weights and apply final processing
        self.post_init()

    @classmethod
    def can_generate(cls) -> bool:
        # This model defines its own `generate` below instead of using `GenerationMixin.generate`,
        # so it is explicitly marked as generation-capable.
        return True

    def get_encoder(self):
        return self.speecht5.get_encoder()

    def get_decoder(self):
        return self.speecht5.get_decoder()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        decoder_input_values: Optional[torch.FloatTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        decoder_head_mask: Optional[torch.FloatTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None,
        past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        speaker_embeddings: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.FloatTensor] = None,
        stop_labels: Optional[torch.Tensor] = None,
    ) -> Union[tuple, Seq2SeqSpectrogramOutput]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`SpeechT5Tokenizer`]. See [`~PreTrainedTokenizer.encode`] and
            [`~PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        decoder_input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`):
            Float values of input mel spectrogram.

            SpeechT5 uses an all-zero spectrum as the starting token for `decoder_input_values` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_values` have to be input (see
            `past_key_values`).
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_values`. Causal mask will
            also be used by default.

            If you want to change padding behavior, you should read [`SpeechT5Decoder._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
            Tensor containing the speaker embeddings.
        labels (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`, *optional*):
            Float values of target mel spectrogram. Timesteps set to `-100.0` are ignored (masked) for the loss
            computation. Spectrograms can be obtained using [`SpeechT5Processor`]. See [`SpeechT5Processor.__call__`]
            for details.
        stop_labels (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Binary tensor indicating the position of the stop token in the sequence.

        Example:

        ```python
        >>> from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan, set_seed
        >>> import torch

        >>> processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
        >>> model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
        >>> vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

        >>> inputs = processor(text="Hello, my dog is cute", return_tensors="pt")
        >>> speaker_embeddings = torch.zeros((1, 512))  # or load xvectors from a file

        >>> set_seed(555)  # make deterministic

        >>> # generate speech
        >>> speech = model.generate(inputs["input_ids"], speaker_embeddings=speaker_embeddings, vocoder=vocoder)
        >>> speech.shape
        torch.Size([15872])
        ```
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if labels is not None:
            if decoder_input_values is None:
                decoder_input_values, decoder_attention_mask = shift_spectrograms_right(
                    labels, self.config.reduction_factor, decoder_attention_mask
                )
            if self.config.use_guided_attention_loss:
                output_attentions = True

        outputs = self.speecht5(
            input_values=input_ids,
            attention_mask=attention_mask,
            decoder_input_values=decoder_input_values,
            decoder_attention_mask=decoder_attention_mask,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            encoder_outputs=encoder_outputs,
            past_key_values=past_key_values,
            use_cache=use_cache,
            speaker_embeddings=speaker_embeddings,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )

        outputs_before_postnet, outputs_after_postnet, logits = self.speech_decoder_postnet(outputs[0])

        loss = None
        if labels is not None:
            criterion = SpeechT5SpectrogramLoss(self.config)
            loss = criterion(
                attention_mask,
                outputs_before_postnet,
                outputs_after_postnet,
                logits,
                labels,
                outputs.cross_attentions,
            )

        if not return_dict:
            output = (outputs_after_postnet,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return Seq2SeqSpectrogramOutput(
            loss=loss,
            spectrogram=outputs_after_postnet,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )

    @torch.no_grad()
    def generate(
        self,
        input_ids: torch.LongTensor,
        attention_mask: Optional[torch.LongTensor] = None,
        speaker_embeddings: Optional[torch.FloatTensor] = None,
        threshold: float = 0.5,
        minlenratio: float = 0.0,
        maxlenratio: float = 20.0,
        vocoder: Optional[nn.Module] = None,
        output_cross_attentions: bool = False,
        return_output_lengths: bool = False,
        **kwargs,
    ) -> Union[torch.FloatTensor, tuple[torch.FloatTensor, torch.FloatTensor]]:
        r"""
        Converts a sequence of input tokens into a sequence of mel spectrograms, which are subsequently turned into a
        speech waveform using a vocoder.

        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary.

                Indices can be obtained using [`SpeechT5Tokenizer`]. See [`~PreTrainedTokenizer.encode`] and
                [`~PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Attention mask from the tokenizer, required for batched inference to signal to the model where to
                ignore padded tokens from the input_ids.
            speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
                Tensor containing the speaker embeddings.
            threshold (`float`, *optional*, defaults to 0.5):
                The generated sequence ends when the predicted stop token probability exceeds this value.
            minlenratio (`float`, *optional*, defaults to 0.0):
                Used to calculate the minimum required length for the output sequence.
            maxlenratio (`float`, *optional*, defaults to 20.0):
                Used to calculate the maximum allowed length for the output sequence.
            vocoder (`nn.Module`, *optional*):
                The vocoder that converts the mel spectrogram into a speech waveform. If `None`, the output is the mel
                spectrogram.
            output_cross_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of the decoder's cross-attention layers.
            return_output_lengths (`bool`, *optional*, defaults to `False`):
                Whether or not to return the concrete spectrogram/waveform lengths.

        Returns:
            `tuple(torch.FloatTensor)` comprising various elements depending on the inputs:
            - when `return_output_lengths` is False
                - **spectrogram** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrogram.
                - **waveform** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(num_frames,)` -- The predicted speech waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
            - when `return_output_lengths` is True
                - **spectrograms** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrograms that
                are padded to the maximum length.
                - **spectrogram_lengths** (*optional*, returned when no `vocoder` is provided) `list[Int]` -- A list of
                all the concrete lengths for each spectrogram.
                - **waveforms** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, num_frames)` -- The predicted speech waveforms that are padded to the maximum length.
                - **waveform_lengths** (*optional*, returned when a `vocoder` is provided) `list[Int]` -- A list of all
                the concrete lengths for each waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(batch_size, config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
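
        Example (an illustrative sketch of batched generation; the speaker embedding below is a
        dummy all-zero vector rather than a real x-vector):

        ```python
        >>> from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
        >>> import torch

        >>> processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
        >>> model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")

        >>> inputs = processor(text=["hello", "how are you"], padding=True, return_tensors="pt")
        >>> speaker_embeddings = torch.zeros((1, 512))  # broadcast to both batch entries

        >>> spectrograms, lengths = model.generate(
        ...     inputs["input_ids"],
        ...     attention_mask=inputs["attention_mask"],
        ...     speaker_embeddings=speaker_embeddings,
        ...     return_output_lengths=True,
        ... )
        ```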
        """
        if speaker_embeddings is not None:
            batch_size = input_ids.size(0)
            if speaker_embeddings.size(0) != batch_size:
                if speaker_embeddings.size(0) == 1:
                    speaker_embeddings = speaker_embeddings.repeat(batch_size, 1)
                else:
                    raise ValueError(
                        "The first dimension of speaker_embeddings must be either 1 or the same as batch_size."
                    )

        return _generate_speech(
            self,
            input_ids,
            speaker_embeddings,
            attention_mask,
            threshold,
            minlenratio,
            maxlenratio,
            vocoder,
            output_cross_attentions,
            return_output_lengths,
        )

    @torch.no_grad()
    def generate_speech(
        self,
        input_ids: torch.LongTensor,
        speaker_embeddings: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        threshold: float = 0.5,
        minlenratio: float = 0.0,
        maxlenratio: float = 20.0,
        vocoder: Optional[nn.Module] = None,
        output_cross_attentions: bool = False,
        return_output_lengths: bool = False,
    ) -> Union[torch.FloatTensor, tuple[torch.FloatTensor, torch.FloatTensor]]:
        r"""
        Converts a sequence of input tokens into a sequence of mel spectrograms, which are subsequently turned into a
        speech waveform using a vocoder.

        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary.

                Indices can be obtained using [`SpeechT5Tokenizer`]. See [`~PreTrainedTokenizer.encode`] and
                [`~PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
                Tensor containing the speaker embeddings.
            attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
                `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            threshold (`float`, *optional*, defaults to 0.5):
                The generated sequence ends when the predicted stop token probability exceeds this value.
            minlenratio (`float`, *optional*, defaults to 0.0):
                Used to calculate the minimum required length for the output sequence.
            maxlenratio (`float`, *optional*, defaults to 20.0):
                Used to calculate the maximum allowed length for the output sequence.
            vocoder (`nn.Module`, *optional*, defaults to `None`):
                The vocoder that converts the mel spectrogram into a speech waveform. If `None`, the output is the mel
                spectrogram.
            output_cross_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of the decoder's cross-attention layers.
            return_output_lengths (`bool`, *optional*, defaults to `False`):
                Whether or not to return the concrete spectrogram/waveform lengths.

        Returns:
            `tuple(torch.FloatTensor)` comprising various elements depending on the inputs:
            - when `return_output_lengths` is False
                - **spectrogram** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrogram.
                - **waveform** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(num_frames,)` -- The predicted speech waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
            - when `return_output_lengths` is True
                - **spectrograms** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrograms that
                are padded to the maximum length.
                - **spectrogram_lengths** (*optional*, returned when no `vocoder` is provided) `list[Int]` -- A list of
                all the concrete lengths for each spectrogram.
                - **waveforms** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, num_frames)` -- The predicted speech waveforms that are padded to the maximum length.
                - **waveform_lengths** (*optional*, returned when a `vocoder` is provided) `list[Int]` -- A list of all
                the concrete lengths for each waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(batch_size, config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
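
        Example (an illustrative sketch; the speaker embedding is a dummy all-zero vector):

        ```python
        >>> from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
        >>> import torch

        >>> processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
        >>> model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
        >>> vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

        >>> inputs = processor(text="Hello, my dog is cute", return_tensors="pt")
        >>> speech = model.generate_speech(
        ...     inputs["input_ids"], speaker_embeddings=torch.zeros((1, 512)), vocoder=vocoder
        ... )
        ```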
        """
        if speaker_embeddings is not None:
            batch_size = input_ids.size(0)
            if speaker_embeddings.size(0) != batch_size:
                if speaker_embeddings.size(0) == 1:
                    speaker_embeddings = speaker_embeddings.repeat(batch_size, 1)
                else:
                    raise ValueError(
                        "The first dimension of speaker_embeddings must be either 1 or the same as batch size."
                    )

        return _generate_speech(
            self,
            input_ids,
            speaker_embeddings,
            attention_mask,
            threshold,
            minlenratio,
            maxlenratio,
            vocoder,
            output_cross_attentions,
            return_output_lengths,
        )


@auto_docstring(
    custom_intro="""
    SpeechT5 Model with a speech encoder and a speech decoder.
    """
)
class SpeechT5ForSpeechToSpeech(SpeechT5PreTrainedModel):
    def __init__(self, config: SpeechT5Config):
        super().__init__(config)

        speech_encoder = SpeechT5EncoderWithSpeechPrenet(config)
        speech_decoder = SpeechT5DecoderWithSpeechPrenet(config)
        self.speecht5 = SpeechT5Model(config, speech_encoder, speech_decoder)

        self.speech_decoder_postnet = SpeechT5SpeechDecoderPostnet(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_encoder(self):
        return self.speecht5.get_encoder()

    def get_decoder(self):
        return self.speecht5.get_decoder()

    def freeze_feature_encoder(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        """
        self.get_encoder().prenet.freeze_feature_encoder()

    @auto_docstring
    def forward(
        self,
        input_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        decoder_input_values: Optional[torch.FloatTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        decoder_head_mask: Optional[torch.FloatTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None,
        past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        speaker_embeddings: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.FloatTensor] = None,
        stop_labels: Optional[torch.Tensor] = None,
    ) -> Union[tuple, Seq2SeqSpectrogramOutput]:
        r"""
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Float values of input raw speech waveform. Values can be obtained by loading a *.flac* or *.wav* audio file
            into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
            (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
            To prepare the array into `input_values`, the [`SpeechT5Processor`] should be used for padding and conversion into
            a tensor of type `torch.FloatTensor`. See [`SpeechT5Processor.__call__`] for details.
        decoder_input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`):
            Float values of input mel spectrogram.

            SpeechT5 uses an all-zero spectrum as the starting token for `decoder_input_values` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_values` have to be input (see
            `past_key_values`).
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_values`. Causal mask will
            also be used by default.

            If you want to change padding behavior, you should read [`SpeechT5Decoder._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
            information on the default strategy.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
            Tensor containing the speaker embeddings.
        labels (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`, *optional*):
            Float values of target mel spectrogram. Spectrograms can be obtained using [`SpeechT5Processor`]. See
            [`SpeechT5Processor.__call__`] for details.
        stop_labels (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Binary tensor indicating the position of the stop token in the sequence.

        Example:

        ```python
        >>> from transformers import SpeechT5Processor, SpeechT5ForSpeechToSpeech, SpeechT5HifiGan, set_seed
        >>> from datasets import load_dataset
        >>> import torch

        >>> dataset = load_dataset(
        ...     "hf-internal-testing/librispeech_asr_demo", "clean", split="validation"
        ... )  # doctest: +IGNORE_RESULT
        >>> dataset = dataset.sort("id")
        >>> sampling_rate = dataset.features["audio"].sampling_rate

        >>> processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_vc")
        >>> model = SpeechT5ForSpeechToSpeech.from_pretrained("microsoft/speecht5_vc")
        >>> vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

        >>> # audio file is decoded on the fly
        >>> inputs = processor(audio=dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")

        >>> speaker_embeddings = torch.zeros((1, 512))  # or load xvectors from a file

        >>> set_seed(555)  # make deterministic

        >>> # generate speech
        >>> speech = model.generate_speech(inputs["input_values"], speaker_embeddings, vocoder=vocoder)
        >>> speech.shape
        torch.Size([77824])
        ```
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if labels is not None:
            if decoder_input_values is None:
                decoder_input_values, decoder_attention_mask = shift_spectrograms_right(
                    labels, self.config.reduction_factor, decoder_attention_mask
                )

        outputs = self.speecht5(
            input_values=input_values,
            attention_mask=attention_mask,
            decoder_input_values=decoder_input_values,
            decoder_attention_mask=decoder_attention_mask,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            encoder_outputs=encoder_outputs,
            past_key_values=past_key_values,
            use_cache=use_cache,
            speaker_embeddings=speaker_embeddings,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )

        _, spectrogram, logits = self.speech_decoder_postnet(outputs[0])

        loss = None

        if not return_dict:
            output = (spectrogram,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return Seq2SeqSpectrogramOutput(
            loss=loss,
            spectrogram=spectrogram,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )

    @torch.no_grad()
    def generate_speech(
        self,
        input_values: torch.FloatTensor,
        speaker_embeddings: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        threshold: float = 0.5,
        minlenratio: float = 0.0,
        maxlenratio: float = 20.0,
        vocoder: Optional[nn.Module] = None,
        output_cross_attentions: bool = False,
        return_output_lengths: bool = False,
    ) -> torch.FloatTensor:
        r"""
        Converts a raw speech waveform into a sequence of mel spectrograms, which are subsequently turned back into a
        speech waveform using a vocoder.

        Args:
            input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
                Float values of input raw speech waveform.

                Values can be obtained by loading a *.flac* or *.wav* audio file into an array of type `list[float]`,
                a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`)
                or the soundfile library (`pip install soundfile`).
                To prepare the array into `input_values`, the [`SpeechT5Processor`] should be used for padding and
                conversion into a tensor of type `torch.FloatTensor`. See [`SpeechT5Processor.__call__`] for details.
            speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
                Tensor containing the speaker embeddings.
            attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
                `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            threshold (`float`, *optional*, defaults to 0.5):
                The generated sequence ends when the predicted stop token probability exceeds this value.
            minlenratio (`float`, *optional*, defaults to 0.0):
                Used to calculate the minimum required length for the output sequence.
            maxlenratio (`float`, *optional*, defaults to 20.0):
                Used to calculate the maximum allowed length for the output sequence.
            vocoder (`nn.Module`, *optional*, defaults to `None`):
                The vocoder that converts the mel spectrogram into a speech waveform. If `None`, the output is the mel
                spectrogram.
            output_cross_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of the decoder's cross-attention layers.
            return_output_lengths (`bool`, *optional*, defaults to `False`):
                Whether or not to return the concrete spectrogram/waveform lengths.

        Returns:
            `tuple(torch.FloatTensor)` comprising various elements depending on the inputs:
            - when `return_output_lengths` is False
                - **spectrogram** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrogram.
                - **waveform** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(num_frames,)` -- The predicted speech waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
            - when `return_output_lengths` is True
                - **spectrograms** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrograms that
                are padded to the maximum length.
                - **spectrogram_lengths** (*optional*, returned when no `vocoder` is provided) `list[Int]` -- A list of
                all the concrete lengths for each spectrogram.
                - **waveforms** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, num_frames)` -- The predicted speech waveforms that are padded to the maximum length.
                - **waveform_lengths** (*optional*, returned when a `vocoder` is provided) `list[Int]` -- A list of all
                the concrete lengths for each waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(batch_size, config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
        N)r   i   r   )r   rO   r   r  )
rn   r,   r  r.   r  r  r  r  r  r  r)   r)   r*   r    s    Jz)SpeechT5ForSpeechToSpeech.generate_speech)NNNNNNNNNNNNNNNNN)NNr  r0   r  NFF)rx   ry   rz   r   rb   r  r  r   r   r   r   r  r  r   rU  rP   r   r   rv   r   r   r   r  r  r{   r)   r)   rq   r*   r  
  s                    
         r  c                       s@   e Zd Zd fdd	ZdddZd	d
 Zdd Zdd Z  ZS )HifiGanResidualBlockr   r   r      皙?c                    sb   t    |_t fddttD _t fddttD _d S )Nc                    s2   g | ]*}t j  d | | dqS r   )r_   dilationr   r   rf   get_paddingr   channelsr  r^   rn   r)   r*   rB     s   	z1HifiGanResidualBlock.__init__.<locals>.<listcomp>c                    s*   g | ]"}t j  d d d dqS r  r  r?   )r  r^   rn   r)   r*   rB     s   	
)	ra   rb   leaky_relu_sloper   r   rN   rS   convs1convs2)rn   r  r^   r  r  rq   r  r*   rb     s    
	
	
zHifiGanResidualBlock.__init__r   c                 C   s   || | d S r   r)   )rn   r^   r  r)   r)   r*   r    s    z HifiGanResidualBlock.get_paddingc                 C   sL   t jj}tt jjdr t jjj}| jD ]}|| q&| jD ]}|| q:d S Nr   )r   r   r   r   r   r  r  rn   r   r   r)   r)   r*   apply_weight_norm  s    



z&HifiGanResidualBlock.apply_weight_normc                 C   s4   | j D ]}tj| q| jD ]}tj| qd S rs   )r  r   r   remove_weight_normr  rn   r   r)   r)   r*   r    s    

z'HifiGanResidualBlock.remove_weight_normc                 C   sX   t | j| jD ]D\}}|}tj|| j}||}tj|| j}||}|| }q|S rs   )r  r  r  r   r  
leaky_relur  )rn   ru   Zconv1Zconv2re  r)   r)   r*   rv     s    
zHifiGanResidualBlock.forward)r   r  r
@auto_docstring(
    custom_intro="""
    HiFi-GAN vocoder.
    """
)
class SpeechT5HifiGan(PreTrainedModel):
    config: SpeechT5HifiGanConfig
    main_input_name = "spectrogram"

    def __init__(self, config: SpeechT5HifiGanConfig):
        super().__init__(config)
        self.num_kernels = len(config.resblock_kernel_sizes)
        self.num_upsamples = len(config.upsample_rates)
        self.conv_pre = nn.Conv1d(
            config.model_in_dim,
            config.upsample_initial_channel,
            kernel_size=7,
            stride=1,
            padding=3,
        )

        self.upsampler = nn.ModuleList()
        for i, (upsample_rate, kernel_size) in enumerate(zip(config.upsample_rates, config.upsample_kernel_sizes)):
            self.upsampler.append(
                nn.ConvTranspose1d(
                    config.upsample_initial_channel // (2**i),
                    config.upsample_initial_channel // (2 ** (i + 1)),
                    kernel_size=kernel_size,
                    stride=upsample_rate,
                    padding=(kernel_size - upsample_rate) // 2,
                )
            )

        self.resblocks = nn.ModuleList()
        for i in range(len(self.upsampler)):
            channels = config.upsample_initial_channel // (2 ** (i + 1))
            for kernel_size, dilation in zip(config.resblock_kernel_sizes, config.resblock_dilation_sizes):
                self.resblocks.append(HifiGanResidualBlock(channels, kernel_size, dilation, config.leaky_relu_slope))

        self.conv_post = nn.Conv1d(channels, 1, kernel_size=7, stride=1, padding=3)

        self.register_buffer("mean", torch.zeros(config.model_in_dim))
        self.register_buffer("scale", torch.ones(config.model_in_dim))

        # Initialize weights and apply final processing
        self.post_init()

    def _init_weights(self, module: nn.Module):
        """Initialize the weights."""
        if isinstance(module, (nn.Linear, nn.Conv1d)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()

    def apply_weight_norm(self):
        weight_norm = nn.utils.weight_norm
        if hasattr(nn.utils.parametrizations, "weight_norm"):
            weight_norm = nn.utils.parametrizations.weight_norm

        weight_norm(self.conv_pre)
        for layer in self.upsampler:
            weight_norm(layer)
        for layer in self.resblocks:
            layer.apply_weight_norm()
        weight_norm(self.conv_post)

    def remove_weight_norm(self):
        nn.utils.remove_weight_norm(self.conv_pre)
        for layer in self.upsampler:
            nn.utils.remove_weight_norm(layer)
        for layer in self.resblocks:
            layer.remove_weight_norm()
        nn.utils.remove_weight_norm(self.conv_post)

    @auto_docstring(
        custom_intro="""
        Converts a log-mel spectrogram into a speech waveform. Passing a batch of log-mel spectrograms returns a batch
        of speech waveforms. Passing a single, un-batched log-mel spectrogram returns a single, un-batched speech
        waveform.
        """
    )
    def forward(self, spectrogram: torch.FloatTensor) -> torch.FloatTensor:
        r"""
        spectrogram (`torch.FloatTensor`):
            Tensor containing the log-mel spectrograms. Can be batched and of shape `(batch_size, sequence_length,
            config.model_in_dim)`, or un-batched and of shape `(sequence_length, config.model_in_dim)`.

        Returns:
            `torch.FloatTensor`: Tensor containing the speech waveform. If the input spectrogram is batched, will be of
            shape `(batch_size, num_frames,)`. If un-batched, will be of shape `(num_frames,)`.
        """
        if self.config.normalize_before:
            spectrogram = (spectrogram - self.mean) / self.scale

        is_batched = spectrogram.dim() == 3
        if not is_batched:
            spectrogram = spectrogram.unsqueeze(0)

        hidden_states = spectrogram.transpose(2, 1)

        hidden_states = self.conv_pre(hidden_states)
        for i in range(self.num_upsamples):
            hidden_states = nn.functional.leaky_relu(hidden_states, self.config.leaky_relu_slope)
            hidden_states = self.upsampler[i](hidden_states)

            res_state = self.resblocks[i * self.num_kernels](hidden_states)
            for j in range(1, self.num_kernels):
                res_state += self.resblocks[i * self.num_kernels + j](hidden_states)
            hidden_states = res_state / self.num_kernels

        hidden_states = nn.functional.leaky_relu(hidden_states)
        hidden_states = self.conv_post(hidden_states)
        hidden_states = torch.tanh(hidden_states)

        if not is_batched:
            # remove the batch dim and collapse to a 1-d audio waveform
            waveform = hidden_states.squeeze(0).flatten()
        else:
            # remove the channel dim, which collapses to 1 after conv_post
            waveform = hidden_states.squeeze(1)

        return waveform


__all__ = [
    "SpeechT5ForSpeechToSpeech",
    "SpeechT5ForSpeechToText",
    "SpeechT5ForTextToSpeech",
    "SpeechT5HifiGan",
    "SpeechT5Model",
    "SpeechT5PreTrainedModel",
]