# coding=utf-8
"""PyTorch UniSpeech model."""

import math
import warnings
from dataclasses import dataclass
from typing import Callable, Optional, Union

import numpy as np
import torch
from torch import nn
from torch.nn import CrossEntropyLoss

from ...activations import ACT2FN
from ...integrations.deepspeed import is_deepspeed_zero3_enabled
from ...integrations.fsdp import is_fsdp_managed_module
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_attention_mask_for_sdpa
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutput,
    CausalLMOutput,
    ModelOutput,
    SequenceClassifierOutput,
    Wav2Vec2BaseModelOutput,
)
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import auto_docstring, is_torch_flex_attn_available, logging
from .configuration_unispeech import UniSpeechConfig


if is_torch_flex_attn_available():
    from ...integrations.flex_attention import make_flex_block_causal_mask


logger = logging.get_logger(__name__)


@dataclass
@auto_docstring(
    custom_intro="""
    Output type of [`UniSpeechForPreTrainingOutput`], with potential hidden states and attentions.
    """
)
class UniSpeechForPreTrainingOutput(ModelOutput):
    r"""
    loss (*optional*, returned when model is in train mode, `torch.FloatTensor` of shape `(1,)`):
        Total loss as the sum of the contrastive loss (L_m) and the diversity loss (L_d) as stated in the [official
        paper](https://huggingface.co/papers/2006.11477).
    projected_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
        Hidden-states of the model projected to *config.proj_codevector_dim* that can be used to predict the masked
        projected quantized states.
    projected_quantized_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
        Quantized extracted feature vectors projected to *config.proj_codevector_dim* representing the positive
        target vectors for contrastive loss.
    codevector_perplexity (`torch.FloatTensor` of shape `(1,)`):
        The perplexity of the codevector distribution, used to measure the diversity of the codebook.
    Nlossprojected_statesprojected_quantized_statescodevector_perplexityhidden_states
attentions)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   r    r!   r"   tupler#    r,   r,   l/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/unispeech/modeling_unispeech.pyr   :   s   
r   c                       s$   e Zd Z fddZdd Z  ZS )UniSpeechSamePadLayerc                    s$   t    |d dkrdnd| _d S )N   r   r   )super__init__num_pad_remove)selfnum_conv_pos_embeddings	__class__r,   r-   r1   X   s    
zUniSpeechSamePadLayer.__init__c                 C   s,   | j dkr(|d d d d d | j  f }|S )Nr   )r2   r3   r"   r,   r,   r-   forward\   s    
zUniSpeechSamePadLayer.forwardr$   r%   r&   r1   r8   __classcell__r,   r,   r5   r-   r.   W   s   r.   c                       s$   e Zd Z fddZdd Z  ZS ) UniSpeechPositionalConvEmbeddingc                    s$  t    tj|j|j|j|jd |jd| _tjj	}t
tjjdrNtjjj	}t rdd l}|jj| jjdd" || jddd| _W d    n1 s0    Y  t
| jdr| jjjj}| jjjj}n| jj}| jj}|j| | |j| | n|| jddd| _t|j| _t|j | _d S )	Nr/   )kernel_sizepaddinggroupsweight_normr   )Zmodifier_rankweight)namedimparametrizations)r0   r1   nnConv1dhidden_sizer4   Znum_conv_pos_embedding_groupsconvutilsr?   hasattrrC   r	   	deepspeedzeroZGatheredParametersr@   Z	original0Z	original1weight_gweight_vZregister_external_parameterr.   r=   r   feat_extract_activation
activation)r3   configr?   rJ   rL   rM   r5   r,   r-   r1   c   s2    

0z)UniSpeechPositionalConvEmbedding.__init__c                 C   s:   | dd}| |}| |}| |}| dd}|S )Nr   r/   )	transposerG   r=   rO   r7   r,   r,   r-   r8      s    


z(UniSpeechPositionalConvEmbedding.forwardr9   r,   r,   r5   r-   r;   b   s   !r;   c                       s&   e Zd Zd fdd	Zdd Z  ZS )UniSpeechNoLayerNormConvLayerr   c                    sj   t    |dkr |j|d  nd| _|j| | _tj| j| j|j| |j| |j	d| _
t|j | _d S )Nr   r   r<   stridebias)r0   r1   conv_dimin_conv_dimout_conv_dimrD   rE   conv_kernelconv_stride	conv_biasrG   r   rN   rO   r3   rP   layer_idr5   r,   r-   r1      s    
z&UniSpeechNoLayerNormConvLayer.__init__c                 C   s   |  |}| |}|S N)rG   rO   r7   r,   r,   r-   r8      s    

z%UniSpeechNoLayerNormConvLayer.forward)r   r9   r,   r,   r5   r-   rR      s   rR   c                       s&   e Zd Zd fdd	Zdd Z  ZS )UniSpeechLayerNormConvLayerr   c                    s|   t    |dkr |j|d  nd| _|j| | _tj| j| j|j| |j| |j	d| _
tj| jdd| _t|j | _d S )Nr   r   rS   T)Zelementwise_affine)r0   r1   rV   rW   rX   rD   rE   rY   rZ   r[   rG   	LayerNorm
layer_normr   rN   rO   r\   r5   r,   r-   r1      s    
z$UniSpeechLayerNormConvLayer.__init__c                 C   s:   |  |}|dd}| |}|dd}| |}|S )N)rG   rQ   ra   rO   r7   r,   r,   r-   r8      s    


z#UniSpeechLayerNormConvLayer.forward)r   r9   r,   r,   r5   r-   r_      s   r_   c                       s&   e Zd Zd fdd	Zdd Z  ZS )UniSpeechGroupNormConvLayerr   c                    s   t    |dkr |j|d  nd| _|j| | _tj| j| j|j| |j| |j	d| _
t|j | _tj| j| jdd| _d S )Nr   r   rS   T)
num_groupsZnum_channelsZaffine)r0   r1   rV   rW   rX   rD   rE   rY   rZ   r[   rG   r   rN   rO   	GroupNormra   r\   r5   r,   r-   r1      s    
z$UniSpeechGroupNormConvLayer.__init__c                 C   s"   |  |}| |}| |}|S r^   )rG   ra   rO   r7   r,   r,   r-   r8      s    


z#UniSpeechGroupNormConvLayer.forward)r   r9   r,   r,   r5   r-   rd      s   rd   c                       s0   e Zd ZdZ fddZdd Zdd Z  ZS )UniSpeechFeatureEncoderz.Construct the features from raw audio waveformc                    s   t     jdkr@t ddg fddt jd D  }n6 jdkrd fddt jD }ntd	 j d
t|| _	d| _
d| _d S )Ngroupr   r]   c                    s   g | ]}t  |d  dqS )r   ri   )rR   .0irP   r,   r-   
<listcomp>   s   z4UniSpeechFeatureEncoder.__init__.<locals>.<listcomp>r   layerc                    s   g | ]}t  |d qS )ri   )r_   rj   rm   r,   r-   rn      s   z`config.feat_extract_norm` is z), but has to be one of ['group', 'layer']FT)r0   r1   Zfeat_extract_normrd   rangeZnum_feat_extract_layers
ValueErrorrD   
ModuleListconv_layersgradient_checkpointing_requires_grad)r3   rP   rs   r5   rm   r-   r1      s    




z UniSpeechFeatureEncoder.__init__c                 C   s   |   D ]
}d|_qd| _d S )NF)
parametersrequires_gradru   r3   paramr,   r,   r-   _freeze_parameters   s    z*UniSpeechFeatureEncoder._freeze_parametersc                 C   s:   |d d d f }| j r"| jr"d|_| jD ]}||}q(|S )NT)ru   trainingrw   rs   )r3   input_valuesr"   Z
conv_layerr,   r,   r-   r8      s    

zUniSpeechFeatureEncoder.forward)r$   r%   r&   r'   r1   rz   r8   r:   r,   r,   r5   r-   rg      s   rg   c                       s$   e Zd Z fddZdd Z  ZS )UniSpeechFeatureProjectionc                    sJ   t    tj|jd |jd| _t|jd |j| _	t
|j| _d S )Nrc   eps)r0   r1   rD   r`   rV   layer_norm_epsra   LinearrF   
projectionDropoutZfeat_proj_dropoutdropoutr3   rP   r5   r,   r-   r1     s    
z#UniSpeechFeatureProjection.__init__c                 C   s&   |  |}| |}| |}||fS r^   )ra   r   r   )r3   r"   Znorm_hidden_statesr,   r,   r-   r8     s    


z"UniSpeechFeatureProjection.forwardr9   r,   r,   r5   r-   r}      s   r}           )modulequerykeyvalueattention_maskscalingr   	head_maskc                 K   s   |d u r| dd }t||dd| }	|d ur>|	| }	tjj|	dd}	|d urj|	|dddd }	tjj|	|| j	d}	t|	|}
|
dd
 }
|
|	fS )Nrc         r/   r   rB   r   )pr{   )sizer(   matmulrQ   rD   
functionalsoftmaxviewr   r{   
contiguous)r   r   r   r   r   r   r   r   kwargsattn_weightsattn_outputr,   r,   r-   eager_attention_forward  s    r   c                       s   e Zd ZdZdeeeeeeee d fddZ	de
jee
j ee
j ee
j ee ee ee
jee
j eee
j  f d	d
dZ  ZS )UniSpeechAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr   FTN)	embed_dim	num_headsr   
is_decoderrU   	is_causalrP   c                    s   t    || _|| _|| _|| | _|| _| j| | jkrTtd| j d| d| jd | _|| _	|| _
tj|||d| _tj|||d| _tj|||d| _tj|||d| _d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).r   )rU   )r0   r1   r   r   r   head_dimrP   rq   r   r   r   rD   r   k_projv_projq_projout_proj)r3   r   r   r   r   rU   r   rP   r5   r,   r-   r1   0  s&    



zUniSpeechAttention.__init__)r"   key_value_statesr   layer_head_maskoutput_attentionsr   returnc                 K   s  |du}|j dd \}}	|r(|j d n|	}
||	d| jf}||
d| jf}| |j| dd}|rh|n|}| |j| dd}| |j| dd}t}| jj	dkrt
| jj	 }|| ||||f| jsdn| j| j||d|\}}|||	d }| |}||dfS )z#Input shape: Batch x Time x ChannelNrc   r   r/   eagerr   )r   r   r   r   )shaper   r   r   rQ   r   r   r   rP   _attn_implementationr   r{   r   r   reshaper   r   )r3   r"   r   r   r   r   r   Zis_cross_attentionZbszZtgt_lenZsrc_lenZq_input_shapeZkv_input_shapeZquery_statesZcurrent_statesZ
key_statesZvalue_statesZattention_interfacer   r   r,   r,   r-   r8   O  s:    


zUniSpeechAttention.forward)r   FTFN)NNNF)r$   r%   r&   r'   intfloatboolr   r   r1   r(   Tensorr   r   r+   r8   r:   r,   r,   r5   r-   r   -  s8        "    r   c                       s$   e Zd Z fddZdd Z  ZS )UniSpeechFeedForwardc                    sp   t    t|j| _t|j|j| _	t
|jtrDt|j | _n|j| _t|j|j| _t|j| _d S r^   )r0   r1   rD   r   Zactivation_dropoutintermediate_dropoutr   rF   Zintermediate_sizeintermediate_dense
isinstanceZ
hidden_actstrr   intermediate_act_fnoutput_densehidden_dropoutoutput_dropoutr   r5   r,   r-   r1     s    
zUniSpeechFeedForward.__init__c                 C   s6   |  |}| |}| |}| |}| |}|S r^   )r   r   r   r   r   r7   r,   r,   r-   r8     s    




zUniSpeechFeedForward.forwardr9   r,   r,   r5   r-   r     s   r   c                       s&   e Zd Z fddZdddZ  ZS )UniSpeechEncoderLayerc                    sh   t    t|j|j|jd|d| _t|j	| _
tj|j|jd| _t|| _tj|j|jd| _d S )NFr   r   r   r   rP   r~   )r0   r1   r   rF   num_attention_headsattention_dropout	attentionrD   r   r   r   r`   r   ra   r   feed_forwardfinal_layer_normr   r5   r,   r-   r1     s    

zUniSpeechEncoderLayer.__init__NFc                 C   sf   |}| j |||d\}}}| |}|| }| |}|| | }| |}|f}|rb||f7 }|S Nr   r   )r   r   ra   r   r   r3   r"   r   r   Zattn_residualr   _outputsr,   r,   r-   r8     s    



zUniSpeechEncoderLayer.forward)NFr9   r,   r,   r5   r-   r     s   r   c                       sX   e Zd Z fddZdejeej eeedddZ	e
ejdf ejd	d
dZ  ZS )UniSpeechEncoderc                    sf   t     | _t | _tj j jd| _	t
 j| _t fddt jD | _d| _d S )Nr~   c                    s   g | ]}t  qS r,   )r   rk   r   rm   r,   r-   rn         z-UniSpeechEncoder.__init__.<locals>.<listcomp>Fr0   r1   rP   r;   pos_conv_embedrD   r`   rF   r   ra   r   r   r   rr   rp   num_hidden_layerslayersrt   r   r5   rm   r-   r1     s    

 zUniSpeechEncoder.__init__NFT)r"   r   r   output_hidden_statesreturn_dictc                 C   s.  |rdnd }|rdnd }|d urD| ddd|jd }d|| < | ||}| |}	||	 }| |}| |}t pt| }
| j	D ]f}|r||f }t
g }| jo|| jjk }|r|
r||||d}|d }|rd}|r||d f }q|r||f }|s tdd	 |||fD S t|||d
S )Nr,   rc   r   r/   r   r   NNc                 s   s   | ]}|d ur|V  qd S r^   r,   rk   vr,   r,   r-   	<genexpr>   r   z+UniSpeechEncoder.forward.<locals>.<genexpr>last_hidden_stater"   r#   )	unsqueezerepeatr   _update_full_maskr   ra   r   r	   r
   r   r(   randr{   rP   	layerdropr+   r   r3   r"   r   r   r   r   Zall_hidden_statesZall_self_attentionsZexpand_attention_maskZposition_embeddingsZsynced_gpusro   Zdropout_probabilityZskip_the_layerZlayer_outputsr,   r,   r-   r8     sJ    







zUniSpeechEncoder.forwardr   inputs_embedsc                 C   sv   |d urr| j jdkr&d|v r |nd }nL| j jdkr@t||j}n2| j jdkrft|tjrrt|dd}nt||j}|S NZflash_attention_2r   ZsdpaZflex_attentionF)r   	rP   r   r   dtyper   r(   r   r   r   r3   r   r   r,   r,   r-   r     s    z"UniSpeechEncoder._update_full_mask)NFFT)r$   r%   r&   r1   r(   tensorr   r   r   r8   r   r   r:   r,   r,   r5   r-   r     s       >r   c                       s,   e Zd Z fddZejdddZ  ZS )UniSpeechAttnAdapterLayerc                    sZ   t    |j| _|j| _t| j| _t	| j| j| _
t | _t	| j| j| _dS )z
        Implements adapter modules directly with 3D tensor weight as parameters and without using ModuleList to speed
        up training throughput.
        """
        super().__init__()
        self.input_dim = config.adapter_attn_dim
        self.hidden_dim = config.hidden_size

        self.norm = nn.LayerNorm(self.hidden_dim)
        self.linear_1 = nn.Linear(self.hidden_dim, self.input_dim)
        self.act_fn = nn.ReLU()
        self.linear_2 = nn.Linear(self.input_dim, self.hidden_dim)

    def forward(self, hidden_states: torch.FloatTensor):
        hidden_states = self.norm(hidden_states)

        hidden_states = self.linear_1(hidden_states)
        hidden_states = self.act_fn(hidden_states)
        hidden_states = self.linear_2(hidden_states)

        return hidden_states


class UniSpeechEncoderLayerStableLayerNorm(GradientCheckpointingLayer):
    def __init__(self, config):
        super().__init__()
        self.attention = UniSpeechAttention(
            embed_dim=config.hidden_size,
            num_heads=config.num_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=False,
            config=config,
        )
        self.dropout = nn.Dropout(config.hidden_dropout)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.feed_forward = UniSpeechFeedForward(config)
        self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        if getattr(config, "adapter_attn_dim", None) is not None:
            self.adapter_layer = UniSpeechAttnAdapterLayer(config)
        else:
            self.adapter_layer = None

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ):
        attn_residual = hidden_states
        hidden_states = self.layer_norm(hidden_states)
        hidden_states, attn_weights, _ = self.attention(
            hidden_states, attention_mask=attention_mask, output_attentions=output_attentions
        )
        hidden_states = self.dropout(hidden_states)
        hidden_states = attn_residual + hidden_states
        hidden_states = hidden_states + self.feed_forward(self.final_layer_norm(hidden_states))

        if self.adapter_layer is not None:
            hidden_states = hidden_states + self.adapter_layer(hidden_states)

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs


class UniSpeechEncoderStableLayerNorm(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.pos_conv_embed = UniSpeechPositionalConvEmbedding(config)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout)
        self.layers = nn.ModuleList(
            [UniSpeechEncoderLayerStableLayerNorm(config) for _ in range(config.num_hidden_layers)]
        )
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
    ):
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        if attention_mask is not None:
            # make sure padded tokens are not attended to
            expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2])
            hidden_states[~expand_attention_mask] = 0

            attention_mask = self._update_full_mask(attention_mask, hidden_states)

        position_embeddings = self.pos_conv_embed(hidden_states)
        hidden_states = hidden_states + position_embeddings
        hidden_states = self.dropout(hidden_states)

        synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self)

        for layer in self.layers:
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
            dropout_probability = torch.rand([])

            skip_the_layer = self.training and (dropout_probability < self.config.layerdrop)
            if not skip_the_layer or synced_gpus:
                # under fsdp or deepspeed zero3 all gpus must run in sync
                layer_outputs = layer(
                    hidden_states, attention_mask=attention_mask, output_attentions=output_attentions
                )
                hidden_states = layer_outputs[0]

            if skip_the_layer:
                layer_outputs = (None, None)

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        hidden_states = self.layer_norm(hidden_states)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )

    def _update_full_mask(
        self,
        attention_mask: Union[torch.Tensor, None],
        inputs_embeds: torch.Tensor,
    ):
        if attention_mask is not None:
            if self.config._attn_implementation == "flash_attention_2":
                attention_mask = attention_mask if 0 in attention_mask else None
            elif self.config._attn_implementation == "sdpa":
                attention_mask = _prepare_4d_attention_mask_for_sdpa(attention_mask, inputs_embeds.dtype)
            elif self.config._attn_implementation == "flex_attention":
                if isinstance(attention_mask, torch.Tensor):
                    attention_mask = make_flex_block_causal_mask(attention_mask, is_causal=False)
            else:
                attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype)

        return attention_mask


class UniSpeechGumbelVectorQuantizer(nn.Module):
    """
    Vector quantization using gumbel softmax. See `[CATEGORICAL REPARAMETERIZATION WITH
    GUMBEL-SOFTMAX](https://huggingface.co/papers/1611.01144) for more information.
    """

    def __init__(self, config):
        super().__init__()
        self.num_groups = config.num_codevector_groups
        self.num_vars = config.num_codevectors_per_group

        if config.codevector_dim % self.num_groups != 0:
            raise ValueError(
                f"`config.codevector_dim {config.codevector_dim} must be divisible by"
                f" `config.num_codevector_groups` {self.num_groups} for concatenation"
            )

        # storage for codebook variables (codewords)
        self.codevectors = nn.Parameter(
            torch.FloatTensor(1, self.num_groups * self.num_vars, config.codevector_dim // self.num_groups)
        )
        self.weight_proj = nn.Linear(config.conv_dim[-1], self.num_groups * self.num_vars)

        # can be decayed for training
        self.temperature = 2

    @staticmethod
    def _compute_perplexity(probs):
        marginal_probs = probs.mean(dim=0)
        perplexity = torch.exp(-torch.sum(marginal_probs * torch.log(marginal_probs + 1e-7), dim=-1)).sum()
        return perplexity

    def forward(self, hidden_states):
        batch_size, sequence_length, hidden_size = hidden_states.shape

        # project to codevector dim
        hidden_states = self.weight_proj(hidden_states)
        hidden_states = hidden_states.view(batch_size * sequence_length * self.num_groups, -1)

        if self.training:
            # sample code vector probs via gumbel in differentiable way
            codevector_probs = nn.functional.gumbel_softmax(
                hidden_states.float(), tau=self.temperature, hard=True
            ).type_as(hidden_states)

            # compute perplexity
            codevector_soft_dist = torch.softmax(
                hidden_states.view(batch_size * sequence_length, self.num_groups, -1).float(), dim=-1
            )
            perplexity = self._compute_perplexity(codevector_soft_dist)
        else:
            # take argmax in non-differentiable way
            # compute hard codevector distribution (one hot)
            codevector_idx = hidden_states.argmax(dim=-1)
            codevector_probs = hidden_states.new_zeros(hidden_states.shape).scatter_(
                -1, codevector_idx.view(-1, 1), 1.0
            )
            codevector_probs = codevector_probs.view(batch_size * sequence_length, self.num_groups, -1)

            perplexity = self._compute_perplexity(codevector_probs)

        codevector_probs = codevector_probs.view(batch_size * sequence_length, -1)
        # use probs to retrieve codevectors
        codevectors_per_group = codevector_probs.unsqueeze(-1) * self.codevectors
        codevectors = codevectors_per_group.view(batch_size * sequence_length, self.num_groups, self.num_vars, -1)
        codevectors = codevectors.sum(-2).view(batch_size, sequence_length, -1)

        return codevectors, perplexity


@auto_docstring
class UniSpeechPreTrainedModel(PreTrainedModel):
    config: UniSpeechConfig
    base_model_prefix = "unispeech"
    main_input_name = "input_values"
    supports_gradient_checkpointing = True
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True

    def _init_weights(self, module):
        """Initialize the weights"""
        # gumbel softmax requires special init
        if isinstance(module, UniSpeechGumbelVectorQuantizer):
            module.weight_proj.weight.data.normal_(mean=0.0, std=1)
            module.weight_proj.bias.data.zero_()
            nn.init.uniform_(module.codevectors)
        elif isinstance(module, UniSpeechPositionalConvEmbedding):
            nn.init.normal_(
                module.conv.weight,
                mean=0,
                std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)),
            )
            nn.init.constant_(module.conv.bias, 0)
        elif isinstance(module, UniSpeechFeatureProjection):
            k = math.sqrt(1 / module.projection.in_features)
            nn.init.uniform_(module.projection.weight, a=-k, b=k)
            nn.init.uniform_(module.projection.bias, a=-k, b=k)
        elif isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, nn.Conv1d):
            nn.init.kaiming_normal_(module.weight)
            if module.bias is not None:
                k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
                nn.init.uniform_(module.bias, a=-k, b=k)

    def _get_feat_extract_output_lengths(self, input_lengths: Union[torch.LongTensor, int]):
        """
        Computes the output length of the convolutional layers
        """

        def _conv_out_length(input_length, kernel_size, stride):
            # 1D convolutional layer output length formula taken
            # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
            return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1

        for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
            input_lengths = _conv_out_length(input_lengths, kernel_size, stride)

        return input_lengths

    def _get_feature_vector_attention_mask(self, feature_vector_length: int, attention_mask: torch.LongTensor):
        # effectively attention_mask.sum(-1), but not in-place to be able to run in inference mode
        non_padded_lengths = attention_mask.cumsum(dim=-1)[:, -1]
        output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths).to(torch.long)
        batch_size = attention_mask.shape[0]

        attention_mask = torch.zeros(
            (batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
        )
        # these two operations make sure that all values before the output lengths indices are attended to
        attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1
        attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
        return attention_mask
def _compute_mask_indices(
    shape: tuple[int, int],
    mask_prob: float,
    mask_length: int,
    attention_mask: Optional[torch.LongTensor] = None,
    min_masks: int = 0,
) -> np.ndarray:
    """
    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
    ASR](https://huggingface.co/papers/1904.08779). Note that this method is not optimized to run on TPU and should be run on
    CPU as part of the preprocessing during training.

    Args:
        shape: The shape for which to compute masks. This should be of a tuple of size 2 where
               the first element is the batch size and the second element is the length of the axis to span.
        mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                    independently generated mask spans of length `mask_length` is computed by
                    `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                    actual percentage will be smaller.
        mask_length: size of the mask
        min_masks: minimum number of masked spans
        attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                        each batch dimension.
    """
    batch_size, sequence_length = shape

    if mask_length < 1:
        raise ValueError("`mask_length` has to be bigger than 0.")

    if mask_length > sequence_length:
        raise ValueError(
            f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
            f" and `sequence_length`: {sequence_length}`"
        )

    # epsilon is used for probabilistic rounding
    epsilon = np.random.rand(1).item()

    def compute_num_masked_span(input_length):
        """Given input length, compute how many spans should be masked"""
        num_masked_span = int(mask_prob * input_length / mask_length + epsilon)
        num_masked_span = max(num_masked_span, min_masks)

        # make sure num masked span <= sequence_length
        if num_masked_span * mask_length > sequence_length:
            num_masked_span = sequence_length // mask_length

        # make sure num_masked span is also <= input_length - (mask_length - 1)
        if input_length - (mask_length - 1) < num_masked_span:
            num_masked_span = max(input_length - (mask_length - 1), 0)

        return num_masked_span

    # compute number of masked spans in batch
    input_lengths = (
        attention_mask.detach().sum(-1).tolist()
        if attention_mask is not None
        else [sequence_length for _ in range(batch_size)]
    )

    # SpecAugment mask to fill
    spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=bool)
    spec_aug_mask_idxs = []

    max_num_masked_span = compute_num_masked_span(sequence_length)

    if max_num_masked_span == 0:
        return spec_aug_mask

    for input_length in input_lengths:
        # compute num of masked spans for this input
        num_masked_span = compute_num_masked_span(input_length)

        # get random indices to mask
        spec_aug_mask_idx = np.random.choice(
            np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
        )

        # pick first sampled index that will serve as a dummy index to pad vector
        # to ensure same dimension for all batches due to probabilistic rounding
        if len(spec_aug_mask_idx) == 0:
            # this case can only happen if `input_length` is strictly smaller than `sequence_length`,
            # in which case the last token has to be a padding token which we can use as a dummy mask id
            dummy_mask_idx = sequence_length - 1
        else:
            dummy_mask_idx = spec_aug_mask_idx[0]

        spec_aug_mask_idx = np.concatenate(
            [spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx]
        )
        spec_aug_mask_idxs.append(spec_aug_mask_idx)

    spec_aug_mask_idxs = np.array(spec_aug_mask_idxs)

    # expand masked indices to masked spans
    spec_aug_mask_idxs = np.broadcast_to(
        spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length)
    )
    spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)

    # add offset to the starting indexes so that indexes now create a span
    offsets = np.arange(mask_length)[None, None, :]
    offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
        batch_size, max_num_masked_span * mask_length
    )
    spec_aug_mask_idxs = spec_aug_mask_idxs + offsets

    # ensure that we cannot have indices larger than sequence_length
    if spec_aug_mask_idxs.max() > sequence_length - 1:
        spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1

    # scatter indices to mask
    np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1)

    return spec_aug_mask


UniSpeechBaseModelOutput = Wav2Vec2BaseModelOutput


@auto_docstring
class UniSpeechModel(UniSpeechPreTrainedModel):
    def __init__(self, config: UniSpeechConfig):
        super().__init__(config)
        self.config = config
        self.feature_extractor = UniSpeechFeatureEncoder(config)
        self.feature_projection = UniSpeechFeatureProjection(config)

        if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0:
            self.masked_spec_embed = nn.Parameter(torch.FloatTensor(config.hidden_size).uniform_())

        if config.do_stable_layer_norm:
            self.encoder = UniSpeechEncoderStableLayerNorm(config)
        else:
            self.encoder = UniSpeechEncoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def _mask_hidden_states(
        self,
        hidden_states: torch.FloatTensor,
        mask_time_indices: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
    ):
        """
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://huggingface.co/papers/1904.08779).
        """

        # `config.apply_spec_augment` can set masking to False
        if not getattr(self.config, "apply_spec_augment", True):
            return hidden_states

        # generate indices & apply SpecAugment along time axis
        batch_size, sequence_length, hidden_size = hidden_states.size()

        if mask_time_indices is not None:
            # apply SpecAugment along time axis with given mask_time_indices
            hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
        elif self.config.mask_time_prob > 0 and self.training:
            mask_time_indices = _compute_mask_indices(
                (batch_size, sequence_length),
                mask_prob=self.config.mask_time_prob,
                mask_length=self.config.mask_time_length,
                attention_mask=attention_mask,
                min_masks=self.config.mask_time_min_masks,
            )
            mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
            hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)

        if self.config.mask_feature_prob > 0 and self.training:
            # generate indices & apply SpecAugment along feature axis
            mask_feature_indices = _compute_mask_indices(
                (batch_size, hidden_size),
                mask_prob=self.config.mask_feature_prob,
                mask_length=self.config.mask_feature_length,
                min_masks=self.config.mask_feature_min_masks,
            )
            mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
            mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
            hidden_states[mask_feature_indices] = 0

        return hidden_states

    @auto_docstring
    def forward(
        self,
        input_values: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        mask_time_indices: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, UniSpeechBaseModelOutput]:
        r"""
        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
            masked extracted features in *config.proj_codevector_dim* space.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        extract_features = self.feature_extractor(input_values)
        extract_features = extract_features.transpose(1, 2)

        if attention_mask is not None:
            # compute reduced attention_mask corresponding to feature vectors
            attention_mask = self._get_feature_vector_attention_mask(extract_features.shape[1], attention_mask)

        hidden_states, extract_features = self.feature_projection(extract_features)
        hidden_states = self._mask_hidden_states(
            hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
        )

        encoder_outputs = self.encoder(
            hidden_states,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = encoder_outputs[0]

        if not return_dict:
            return (hidden_states, extract_features) + encoder_outputs[1:]

        return UniSpeechBaseModelOutput(
            last_hidden_state=hidden_states,
            extract_features=extract_features,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )
    c                
       s   e Zd Zed fddZedddZdd Zd	d
 Ze	de
je
je
jedddZedee
j ee
j ee ee ee eeef dddZ  ZS )UniSpeechForPreTrainingrm   c                    s~   t  | t|| _t|j| _t|| _	t
|j|j| _t
|j|j| _t
|j|j| _t|j| _|   d S r^   )r0   r1   r3  r  rD   r   Zfeat_quantizer_dropoutdropout_featuresr   	quantizerr   r   Zproj_codevector_dim	project_qrF   project_hidZnum_ctc_classesctc_projfinal_dropoutr   r:  r   r5   r,   r-   r1   M  s    

z UniSpeechForPreTraining.__init__)r   c                 C   s   || j _dS )zb
        Set the Gumbel softmax temperature to a given value. Only necessary for training
        N)rD  r   )r3   r   r,   r,   r-   set_gumbel_temperature\  s    z.UniSpeechForPreTraining.set_gumbel_temperaturec                 C   s   t dt |   dS z
        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
        not be updated during training.
        The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. Please use the equivalent `freeze_feature_encoder` method instead.NwarningswarnFutureWarningfreeze_feature_encoderr3   r,   r,   r-   freeze_feature_extractorb  s
    z0UniSpeechForPreTraining.freeze_feature_extractorc                 C   s   | j j  dS 
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        Nr  r4  rz   rQ  r,   r,   r-   rP  n  s    z.UniSpeechForPreTraining.freeze_feature_encoderr   )target_featuresnegative_featurespredicted_featuresr   c                 C   s@   t j| |gdd} t j| |  dd}|| }|| }|S )z
        Compute logits for contrastive loss based using cosine similarity as the distance measure between
        `[positive_feature, negative_features]` and `[predicted_features]`. Additionally, temperature can be applied.
        r   r   rc   )r(   catZcosine_similarityr   r   )rV  rW  rX  r   logitsr,   r,   r-   compute_contrastive_logitsu  s
    
z2UniSpeechForPreTraining.compute_contrastive_logitsN)r|   r   r   r   r   r   c                 C   sN  |dur|n| j j}| j|||||d}|d }| |d }| |\}	}
| |	| jjj}	| 	|	}	t
|d|d| j j}|dd}t
| |j}|dd}|d}||d|	| d }| |}| |}d}|s6|dur |||	|
f|dd  S ||	|
f|dd  S t|||	|
|j|jdS )	a  
        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoFeatureExtractor, UniSpeechForPreTraining

        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/unispeech-large-1500h-cv")
        >>> model = UniSpeechForPreTraining.from_pretrained("microsoft/unispeech-large-1500h-cv")
        >>> # TODO: Add full pretraining example
        ```Nr>  r   r   rc   r   r/   )r   r   r    r!   r"   r#   )rP   r@  r  rC  rD  rE  r  r@   r   rF  r(   emptyr   r	  Zreplace_probrQ   Z	bernoullir   r  r   Zmasked_fillr   rG  r   r"   r#   )r3   r|   r   r   r   r   r   Ztransformer_featuresr?  Zquantized_featuresr!   Zprob_replace_matrixZsampled_replace_matrixrZ  r   r,   r,   r-   r8     sL    





zUniSpeechForPreTraining.forward)r   )NNNN)r$   r%   r&   r   r1   r   rI  rR  rP  r   r(   r)   r[  r   r   r   r   r   r+   r   r8   r:   r,   r,   r5   r-   rB  G  s2        
rB  r/   zq
    UniSpeech Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).
    c                       s   e Zd Zdee d fddZdd Zdd Zd	d
 Zdd Z	e
deej eej ee ee ee eej eeef dddZ  ZS )UniSpeechForCTCN)target_langc                    s~   t  | t|| _t|j| _|| _|j	du rFt
d| j dt|dr\|jr\|jn|j}t||j	| _|   dS )a3  
        target_lang (`str`, *optional*):
            Language id of adapter weights. Adapter weights are stored in the format adapter.<lang>.safetensors or
            adapter.<lang>.bin. Only relevant when using an instance of [`UniSpeechForCTC`] with adapters. Uses 'eng' by
            default.
        NzYou are trying to instantiate z with a configuration that does not define the vocabulary size of the language model head. Please instantiate the model as follows: `UniSpeechForCTC.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's configuration.add_adapter)r0   r1   r3  r  rD   r   rH  r   r^  
vocab_sizerq   r6   rI   r_  output_hidden_sizerF   r   lm_headr:  )r3   rP   r^  ra  r5   r,   r-   r1     s    

zUniSpeechForCTC.__init__c                 C   sr   | j }|dur2t| jdddu r2td| dn<|du rXt| jdddurXtd n|durn| j|dd dS )a'  
        This method overwrites [`~PreTrainedModel.tie_weights`] so that adapter weights can be correctly loaded when
        passing `target_lang=...` to `from_pretrained(...)`.

        This method is **not** supposed to be called by the user and is prone to be changed in the future.
        Nr   zCannot pass `target_lang`: z- if `config.adapter_attn_dim` is not defined.z)By default `target_lang` is set to 'eng'.T)Z
force_load)r^  r   rP   rq   loggerinfoZload_adapter)r3   r^  r,   r,   r-   tie_weights  s    zUniSpeechForCTC.tie_weightsc                 C   s   t dt |   dS )rT  rK  NrL  rQ  r,   r,   r-   rR    s
    z(UniSpeechForCTC.freeze_feature_extractorc                 C   s   | j j  dS rS  rU  rQ  r,   r,   r-   rP    s    z&UniSpeechForCTC.freeze_feature_encoderc                 C   s   | j  D ]
}d|_q
dS z
        Calling this function will disable the gradient computation for the base model so that its parameters will not
        be updated during training. Only the classification head will be updated.
        FNr  rv   rw   rx   r,   r,   r-   freeze_base_model  s    z!UniSpeechForCTC.freeze_base_modelr|   r   r   r   r   labelsr   c              
   C   s  |dur|n| j j}|dur>| | j jkr>td| j j | j|||||d}|d }| |}| |}	d}
|dur@|dur|ntj	|tj
d}| |dtj
}|dk}|d}||}tjj|	dtjddd}tjjjd	d
6 tjj||||| j j| j j| j jd}
W d   n1 s60    Y  |sp|	f|td  }|
durl|
f| S |S t|
|	|j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
            Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
            the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
            All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
            config.vocab_size - 1]`.
        Nz$Label values must be <= vocab_size: r>  r   r%  rc   )rB   r   r   F)Zenabled)blankZ	reductionZzero_infinityr   rZ  r"   r#   )rP   r@  r   r`  rq   r  r   rb  r(   Z	ones_liker  r  r   r  Zmasked_selectrD   r   Zlog_softmaxZfloat32rQ   backendsZcudnnflagsZctc_lossZpad_token_idZctc_loss_reductionZctc_zero_infinity_HIDDEN_STATES_START_POSITIONr   r"   r#   )r3   r|   r   r   r   r   rj  r   r"   rZ  r   r  Zlabels_maskZtarget_lengthsZflattened_targetsZ	log_probsoutputr,   r,   r-   r8   '  sL    




&
zUniSpeechForCTC.forward)N)NNNNN)r$   r%   r&   r   r   r1   re  rR  rP  rh  r   r(   r   r   r   r+   r   r8   r:   r,   r,   r5   r-   r]    s(        
r]  z
    UniSpeech Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like
    SUPERB Keyword Spotting.
    c                       sz   e Zd Z fddZdd Zdd Zdd Zedee	j
 ee	j
 ee ee ee ee	j
 eeef d
ddZ  ZS )"UniSpeechForSequenceClassificationc                    s   t  | t|dr$|jr$tdt|| _|jd }|jrTt	
t|| | _t	|j|j| _t	|j|j| _|   d S )Nr_  z`Sequence classification does not support the use of UniSpeech adapters (config.add_adapter=True)r   )r0   r1   rI   r_  rq   r3  r  r   use_weighted_layer_sumrD   r   r(   r.  layer_weightsr   rF   Zclassifier_proj_size	projector
num_labels
classifierr:  )r3   rP   Z
num_layersr5   r,   r-   r1   v  s    

z+UniSpeechForSequenceClassification.__init__c                 C   s   t dt |   dS rJ  rL  rQ  r,   r,   r-   rR    s
    z;UniSpeechForSequenceClassification.freeze_feature_extractorc                 C   s   | j j  dS rS  rU  rQ  r,   r,   r-   rP    s    z9UniSpeechForSequenceClassification.freeze_feature_encoderc                 C   s   | j  D ]
}d|_q
dS rf  rg  rx   r,   r,   r-   rh    s    z4UniSpeechForSequenceClassification.freeze_base_modelNri  c                 C   s  |dur|n| j j}| j jr dn|}| j|||||d}| j jr|t }tj|dd}tjj	| j
dd}	||	ddd jdd}n|d }| |}|du r|jdd}
nV| |jd |}|ddd|jd }d	|| < |jdd|jdddd }
| |
}d}|dur<t }||d| j j|d}|sl|f|td  }|durh|f| S |S t|||j|jd
S )a  
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
            into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
            (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
            To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion
            into a tensor of type `torch.FloatTensor`. See [`UniSpeechProcessor.__call__`] for details.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        NTr>  r   r   rc   r   r/   r   rl  )rP   r@  rr  r  ro  r(   stackrD   r   r   rs  r   r   rt  r   r  r   r   r   rv  r   ru  r   r"   r#   )r3   r|   r   r   r   r   rj  r   r"   Znorm_weightsZpooled_outputZpadding_maskZexpand_padding_maskrZ  r   Zloss_fctrp  r,   r,   r-   r8     sH    

 

z*UniSpeechForSequenceClassification.forward)NNNNN)r$   r%   r&   r1   rR  rP  rh  r   r   r(   r   r   r   r+   r   r8   r:   r,   r,   r5   r-   rq  o  s&        
rq  )r]  rB  rq  r3  r   )Nr   N)Nr   )Rr  rM  dataclassesr   typingr   r   r   numpyr'  r(   Ztorch.nnrD   r   Zactivationsr   Zintegrations.deepspeedr	   Zintegrations.fsdpr
   Zmodeling_attn_mask_utilsr   r   Zmodeling_flash_attention_utilsr   Zmodeling_layersr   Zmodeling_outputsr   r   r   r   r   Zmodeling_utilsr   r   Zprocessing_utilsr   rH   r   r   r   Zconfiguration_unispeechr   Zintegrations.flex_attentionr   Z
get_loggerr$   rc  r   Moduler.   r;   rR   r_   rd   rg   r}   r   r   r   r   r   r   r   r   r   r   r   r   r+   r   r  Zndarrayr2  rA  r3  rB  ro  r]  rq  __all__r,   r,   r,   r-   <module>   s   
-)   X$].aFM  
wv  s