a
    h_                  
   @   s  d dl Z d dlmZmZmZ d dlZd dlZd dlm	Z	 d dlm
Z
 ddlmZ ddlmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZmZ ddlm Z  ddl!m"Z"m#Z#m$Z$ ddl%m&Z& e# rddl'm(Z( e$)e*Z+G dd de	j,Z-G dd de	j,Z.G dd deZ/G dd deZ0G dd deZ1G dd de	j,Z2G dd de	j,Z3dAe	j,ej4ej4ej4eej4 ee5 e5eej4 d!d"d#Z6G d$d% d%e	j,Z7G d&d' d'e	j,Z8G d(d) d)eZ9G d*d+ d+e	j,Z:G d,d- d-e	j,Z;G d.d/ d/eZ<G d0d1 d1e	j,Z=e"G d2d3 d3eZ>dBe?e@e@f e5e@eejA e@ejBd4d5d6ZCe"G d7d8 d8e>ZDdZEe"d9d:G d;d< d<e>ZFe"d=d:G d>d? d?e>ZGg d@ZHdS )C    N)CallableOptionalUnion)CrossEntropyLoss   )ACT2FN)is_deepspeed_zero3_enabled)is_fsdp_managed_module)_prepare_4d_attention_mask#_prepare_4d_attention_mask_for_sdpa)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputCausalLMOutputSequenceClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringis_torch_flex_attn_availablelogging   )HubertConfig)make_flex_block_causal_maskc                       s$   e Zd Z fddZdd Z  ZS )HubertPositionalConvEmbeddingc                    sB  t    tj|j|j|j|jd |jd| _d | _|j	rJt
|j| _ntjj}ttjjdrjtjjj}t rdd l}|jj| jjdd" || jddd| _W d    n1 s0    Y  t| jdr| jjjj}| jjjj}n| jj}| jj}|j| | |j| | n|| jddd| _t|j| _t|j | _d S )	N   )kernel_sizepaddinggroupsweight_normr   Zmodifier_rankweight)namedimparametrizations)super__init__nnConv1dhidden_sizenum_conv_pos_embeddingsZnum_conv_pos_embedding_groupsconv
batch_normZconv_pos_batch_normBatchNorm1dutilsr   hasattrr$   r   	deepspeedzeroGatheredParametersr!   Z	original0Z	original1weight_gweight_vZregister_external_parameterHubertSamePadLayerr   r   feat_extract_activation
activation)selfconfigr   r0   r3   r4   	__class__ f/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/hubert/modeling_hubert.pyr&   3   s8    

0z&HubertPositionalConvEmbedding.__init__c                 C   sN   | dd}| jd ur | |}| |}| |}| |}| dd}|S )Nr   r   )	transposer,   r+   r   r7   r8   hidden_statesr<   r<   r=   forwardX   s    




z%HubertPositionalConvEmbedding.forward__name__
__module____qualname__r&   rA   __classcell__r<   r<   r:   r=   r   2   s   %r   c                       s$   e Zd Z fddZdd Z  ZS )r5   c                    s$   t    |d dkrdnd| _d S )Nr   r   r   )r%   r&   num_pad_remove)r8   r*   r:   r<   r=   r&   e   s    
zHubertSamePadLayer.__init__c                 C   s,   | j dkr(|d d d d d | j  f }|S )Nr   )rG   r?   r<   r<   r=   rA   i   s    
zHubertSamePadLayer.forwardrB   r<   r<   r:   r=   r5   d   s   r5   c                       s&   e Zd Zd fdd	Zdd Z  ZS )HubertNoLayerNormConvLayerr   c                    sj   t    |dkr |j|d  nd| _|j| | _tj| j| j|j| |j| |j	d| _
t|j | _d S )Nr   r   r   stridebias)r%   r&   conv_dimin_conv_dimout_conv_dimr'   r(   conv_kernelconv_stride	conv_biasr+   r   r6   r7   r8   r9   layer_idr:   r<   r=   r&   p   s    
z#HubertNoLayerNormConvLayer.__init__c                 C   s   |  |}| |}|S N)r+   r7   r?   r<   r<   r=   rA   ~   s    

z"HubertNoLayerNormConvLayer.forward)r   rB   r<   r<   r:   r=   rH   o   s   rH   c                       s&   e Zd Zd fdd	Zdd Z  ZS )HubertLayerNormConvLayerr   c                    s|   t    |dkr |j|d  nd| _|j| | _tj| j| j|j| |j| |j	d| _
tj| jdd| _t|j | _d S )Nr   r   rI   T)Zelementwise_affine)r%   r&   rL   rM   rN   r'   r(   rO   rP   rQ   r+   	LayerNorm
layer_normr   r6   r7   rR   r:   r<   r=   r&      s    
z!HubertLayerNormConvLayer.__init__c                 C   s:   |  |}|dd}| |}|dd}| |}|S )N)r+   r>   rW   r7   r?   r<   r<   r=   rA      s    


z HubertLayerNormConvLayer.forward)r   rB   r<   r<   r:   r=   rU      s   rU   c                       s&   e Zd Zd fdd	Zdd Z  ZS )HubertGroupNormConvLayerr   c                    s   t    |dkr |j|d  nd| _|j| | _tj| j| j|j| |j| |j	d| _
t|j | _tj| j| jdd| _d S )Nr   r   rI   T)Z
num_groupsZnum_channelsZaffine)r%   r&   rL   rM   rN   r'   r(   rO   rP   rQ   r+   r   r6   r7   	GroupNormrW   rR   r:   r<   r=   r&      s    
z!HubertGroupNormConvLayer.__init__c                 C   s"   |  |}| |}| |}|S rT   )r+   rW   r7   r?   r<   r<   r=   rA      s    


z HubertGroupNormConvLayer.forward)r   rB   r<   r<   r:   r=   rZ      s   rZ   c                       s0   e Zd ZdZ fddZdd Zdd Z  ZS )HubertFeatureEncoderz.Construct the features from raw audio waveformc                    s   t     jdkr@t ddg fddt jd D  }n6 jdkrd fddt jD }ntd	 j d
t|| _	d| _
d| _d S )Ngroupr   rS   c                    s   g | ]}t  |d  dqS )r   r^   )rH   .0ir9   r<   r=   
<listcomp>   s   z1HubertFeatureEncoder.__init__.<locals>.<listcomp>r   layerc                    s   g | ]}t  |d qS )r^   )rU   r_   rb   r<   r=   rc          z`config.feat_extract_norm` is z), but has to be one of ['group', 'layer']FT)r%   r&   Zfeat_extract_normrZ   rangeZnum_feat_extract_layers
ValueErrorr'   
ModuleListconv_layersgradient_checkpointing_requires_grad)r8   r9   ri   r:   rb   r=   r&      s    



zHubertFeatureEncoder.__init__c                 C   s   |   D ]
}d|_qd| _d S )NF)
parametersrequires_gradrk   r8   paramr<   r<   r=   _freeze_parameters   s    z'HubertFeatureEncoder._freeze_parametersc                 C   s:   |d d d f }| j r"| jr"d|_| jD ]}||}q(|S )NT)rk   trainingrm   ri   )r8   input_valuesr@   Z
conv_layerr<   r<   r=   rA      s    

zHubertFeatureEncoder.forward)rC   rD   rE   __doc__r&   rp   rA   rF   r<   r<   r:   r=   r\      s   r\   c                       s$   e Zd Z fddZdd Z  ZS )HubertFeatureProjectionc                    sX   t    |j| _| jr0tj|jd |jd| _t|jd |j	| _
t|j| _d S )NrY   eps)r%   r&   feat_proj_layer_normr'   rV   rL   layer_norm_epsrW   Linearr)   
projectionDropoutZfeat_proj_dropoutdropoutr8   r9   r:   r<   r=   r&      s    
z HubertFeatureProjection.__init__c                 C   s(   | j r| |}| |}| |}|S rT   )rw   rW   rz   r|   r?   r<   r<   r=   rA      s
    


zHubertFeatureProjection.forwardrB   r<   r<   r:   r=   rt      s   rt           )modulequerykeyvalueattention_maskscalingr|   	head_maskc                 K   s   |d u r| dd }t||dd| }	|d ur>|	| }	tjj|	dd}	|d urj|	|dddd }	tjj|	|| j	d}	t|	|}
|
dd
 }
|
|	fS )NrY         r   r   r#   r   )prq   )sizetorchmatmulr>   r'   
functionalsoftmaxviewr|   rq   
contiguous)r   r   r   r   r   r   r|   r   kwargsattn_weightsattn_outputr<   r<   r=   eager_attention_forward   s    r   c                       s   e Zd ZdZdeeeeeeee d fddZ	de
jee
j ee
j ee
j ee ee ee
jee
j eee
j  f d	d
dZ  ZS )HubertAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr~   FTN)	embed_dim	num_headsr|   
is_decoderrK   	is_causalr9   c                    s   t    || _|| _|| _|| | _|| _| j| | jkrTtd| j d| d| jd | _|| _	|| _
tj|||d| _tj|||d| _tj|||d| _tj|||d| _d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).r   )rK   )r%   r&   r   r   r|   head_dimr9   rg   r   r   r   r'   ry   k_projv_projq_projout_proj)r8   r   r   r|   r   rK   r   r9   r:   r<   r=   r&     s&    



zHubertAttention.__init__)r@   key_value_statesr   layer_head_maskoutput_attentionsr   returnc                 K   s  |du}|j dd \}}	|r(|j d n|	}
||	d| jf}||
d| jf}| |j| dd}|rh|n|}| |j| dd}| |j| dd}t}| jj	dkrt
| jj	 }|| ||||f| jsdn| j| j||d|\}}|||	d }| |}||dfS )z#Input shape: Batch x Time x ChannelNrY   r   r   eagerr~   )r|   r   r   r   )shaper   r   r   r>   r   r   r   r9   _attn_implementationr   rq   r|   r   reshaper   r   )r8   r@   r   r   r   r   r   Zis_cross_attentionZbszZtgt_lenZsrc_lenZq_input_shapeZkv_input_shapeZquery_statesZcurrent_statesZ
key_statesZvalue_statesZattention_interfacer   r   r<   r<   r=   rA   /  s:    


zHubertAttention.forward)r~   FTFN)NNNF)rC   rD   rE   rs   intfloatboolr   r   r&   r   Tensorr   r   tuplerA   rF   r<   r<   r:   r=   r     s8        "    r   c                       s$   e Zd Z fddZdd Z  ZS )HubertFeedForwardc                    sp   t    t|j| _t|j|j| _	t
|jtrDt|j | _n|j| _t|j|j| _t|j| _d S rT   )r%   r&   r'   r{   Zactivation_dropoutintermediate_dropoutry   r)   Zintermediate_sizeintermediate_dense
isinstanceZ
hidden_actstrr   intermediate_act_fnoutput_densehidden_dropoutoutput_dropoutr}   r:   r<   r=   r&   f  s    
zHubertFeedForward.__init__c                 C   s6   |  |}| |}| |}| |}| |}|S rT   )r   r   r   r   r   r?   r<   r<   r=   rA   s  s    




zHubertFeedForward.forwardrB   r<   r<   r:   r=   r   e  s   r   c                       s&   e Zd Z fddZdddZ  ZS )HubertEncoderLayerc                    sh   t    t|j|j|jd|d| _t|j	| _
tj|j|jd| _t|| _tj|j|jd| _d S )NFr   r   r|   r   r9   ru   )r%   r&   r   r)   num_attention_headsattention_dropout	attentionr'   r{   r   r|   rV   rx   rW   r   feed_forwardfinal_layer_normr}   r:   r<   r=   r&   ~  s    

zHubertEncoderLayer.__init__NFc                 C   sf   |}| j |||d\}}}| |}|| }| |}|| | }| |}|f}|rb||f7 }|S Nr   r   )r   r|   rW   r   r   r8   r@   r   r   Zattn_residualr   _outputsr<   r<   r=   rA     s    



zHubertEncoderLayer.forward)NFrB   r<   r<   r:   r=   r   }  s   r   c                       sX   e Zd Z fddZdejeej eeedddZ	e
ejdf ejd	d
dZ  ZS )HubertEncoderc                    sf   t     | _t | _tj j jd| _	t
 j| _t fddt jD | _d| _d S )Nru   c                    s   g | ]}t  qS r<   )r   r`   r   rb   r<   r=   rc     re   z*HubertEncoder.__init__.<locals>.<listcomp>Fr%   r&   r9   r   pos_conv_embedr'   rV   r)   rx   rW   r{   r   r|   rh   rf   num_hidden_layerslayersrj   r}   r:   rb   r=   r&     s    

 zHubertEncoder.__init__NFT)r@   r   r   output_hidden_statesreturn_dictc                 C   s.  |rdnd }|rdnd }|d urD| ddd|jd }d|| < | ||}| |}	||	 }| |}| |}t pt| }
| j	D ]f}|r||f }t
g }| jo|| jjk }|r|
r||||d}|d }|rd}|r||d f }q|r||f }|s tdd	 |||fD S t|||d
S )Nr<   rY   r   r   r   r   NNc                 s   s   | ]}|d ur|V  qd S rT   r<   r`   vr<   r<   r=   	<genexpr>  re   z(HubertEncoder.forward.<locals>.<genexpr>Zlast_hidden_stater@   
attentions)	unsqueezerepeatr   _update_full_maskr   rW   r|   r   r	   r   r   randrq   r9   	layerdropr   r   r8   r@   r   r   r   r   Zall_hidden_statesZall_self_attentionsZexpand_attention_maskZposition_embeddingsZsynced_gpusrd   Zdropout_probabilityZskip_the_layerZlayer_outputsr<   r<   r=   rA     sJ    







zHubertEncoder.forwardr   inputs_embedsc                 C   sv   |d urr| j jdkr&d|v r |nd }nL| j jdkr@t||j}n2| j jdkrft|tjrrt|dd}nt||j}|S NZflash_attention_2r   ZsdpaZflex_attentionF)r   	r9   r   r   dtyper   r   r   r   r
   r8   r   r   r<   r<   r=   r     s    zHubertEncoder._update_full_mask)NFFT)rC   rD   rE   r&   r   tensorr   r   r   rA   r   r   rF   r<   r<   r:   r=   r     s       >r   c                       s,   e Zd Z fddZejdddZ  ZS )HubertAttnAdapterLayerc                    sZ   t    |j| _|j| _t| j| _t	| j| j| _
t | _t	| j| j| _dS )z
        Implements adapter modules directly with 3D tensor weight as parameters and without using ModuleList to speed
        up training throughput.
        N)r%   r&   adapter_attn_dimZ	input_dimr)   Z
hidden_dimr'   rV   normry   linear_1ZReLUact_fnlinear_2r}   r:   r<   r=   r&     s    

zHubertAttnAdapterLayer.__init__)r@   c                 C   s,   |  |}| |}| |}| |}|S rT   )r   r   r   r   r?   r<   r<   r=   rA     s
    



zHubertAttnAdapterLayer.forward)rC   rD   rE   r&   r   FloatTensorrA   rF   r<   r<   r:   r=   r     s   r   c                       s8   e Zd Z fddZdejeej edddZ  Z	S )	!HubertEncoderLayerStableLayerNormc                    s   t    t|j|j|jd|d| _t|j	| _
tj|j|jd| _t|| _tj|j|jd| _t|dd d urt|| _nd | _d S )NFr   ru   r   )r%   r&   r   r)   r   r   r   r'   r{   r   r|   rV   rx   rW   r   r   r   getattrr   adapter_layerr}   r:   r<   r=   r&     s    

z*HubertEncoderLayerStableLayerNorm.__init__NF)r@   r   r   c                 C   sz   |}|  |}| j|||d\}}}| |}|| }|| | | }| jd urb|| | }|f}|rv||f7 }|S r   )rW   r   r|   r   r   r   r   r<   r<   r=   rA   +  s    



z)HubertEncoderLayerStableLayerNorm.forward)NF)
rC   rD   rE   r&   r   r   r   r   rA   rF   r<   r<   r:   r=   r     s     r   c                       sB   e Zd Z fddZdddZeejdf ejdd	d
Z  Z	S )HubertEncoderStableLayerNormc                    sf   t     | _t | _tj j jd| _	t
 j| _t fddt jD | _d| _d S )Nru   c                    s   g | ]}t  qS r<   )r   r   rb   r<   r=   rc   M  re   z9HubertEncoderStableLayerNorm.__init__.<locals>.<listcomp>Fr   r}   r:   rb   r=   r&   F  s    

z%HubertEncoderStableLayerNorm.__init__NFTc                 C   s.  |rdnd }|rdnd }|d urD| ddd|jd }d|| < | ||}| |}	||	 }| |}t pxt| }
| jD ]f}|r||f }t	
g }| jo|| jjk }|r|
r||||d}|d }|rd}|r||d f }q| |}|r||f }|s tdd	 |||fD S t|||d
S )Nr<   rY   r   r   r   r   r   c                 s   s   | ]}|d ur|V  qd S rT   r<   r   r<   r<   r=   r     re   z7HubertEncoderStableLayerNorm.forward.<locals>.<genexpr>r   )r   r   r   r   r   r|   r   r	   r   r   r   rq   r9   r   rW   r   r   r   r<   r<   r=   rA   Q  sJ    







z$HubertEncoderStableLayerNorm.forwardr   c                 C   sv   |d urr| j jdkr&d|v r |nd }nL| j jdkr@t||j}n2| j jdkrft|tjrrt|dd}nt||j}|S r   r   r   r<   r<   r=   r     s    z.HubertEncoderStableLayerNorm._update_full_mask)NFFT)
rC   rD   rE   r&   rA   r   r   r   r   rF   r<   r<   r:   r=   r   E  s       
@r   c                   @   s`   e Zd ZU eed< dZdZdZdZdZ	dZ
dd Zeejef ddd	Zeejd
ddZdS )HubertPreTrainedModelr9   hubertrr   Tc                 C   s  t |tjr<|jjjd| jjd |jdur8|jj	  nxt |tj
tjtjfrp|jj	  |jjd nDt |tjr\t r2ddl}t|drt|dr|jj|j|jgdd  tj|jj W d   n1 s0    Y  nD|jj|jdd  tj|jj W d   n1 s&0    Y  ntj|jj |jdur|jj	  nXt |trt|d	r|jj  n2t |trt|d
r|jjd| jjd   dS )zInitialize the weightsr~   )meanZstdNg      ?r   r4   r3   r    masked_spec_embedlayer_weightsr   )r   r'   ry   r!   dataZnormal_r9   Zinitializer_rangerK   Zzero_rV   r[   r-   Zfill_r(   r   r0   r/   r1   r2   r4   r3   initZkaiming_normal_HubertModelr   uniform_HubertForSequenceClassificationr   r   )r8   r   r0   r<   r<   r=   _init_weights  s0    
02z#HubertPreTrainedModel._init_weights)input_lengthsc                 C   s4   dd }t | jj| jjD ]\}}||||}q|S )zH
        Computes the output length of the convolutional layers
        c                 S   s   t j| | |ddd S )Nfloor)Zrounding_moder   )r   div)input_lengthr   rJ   r<   r<   r=   _conv_out_length  s    zPHubertPreTrainedModel._get_feat_extract_output_lengths.<locals>._conv_out_length)zipr9   rO   rP   )r8   r   r   r   rJ   r<   r<   r=    _get_feat_extract_output_lengths  s    z6HubertPreTrainedModel._get_feat_extract_output_lengths)feature_vector_lengthr   c                 C   s~   |  |dtj}|jd }tj||f|j|jd}d|tj	|jd |jd|d f< |
dgd
dg }|S )NrY   r   )r   devicer   )r   )r   sumtor   longr   zerosr   r   arangeflipZcumsumr   )r8   r   r   Zoutput_lengths
batch_sizer<   r<   r=   "_get_feature_vector_attention_mask  s    
"z8HubertPreTrainedModel._get_feature_vector_attention_maskN)rC   rD   rE   r   __annotations__Zbase_model_prefixZmain_input_nameZsupports_gradient_checkpointingZ_supports_flash_attnZ_supports_sdpaZ_supports_flex_attnr   r   r   
LongTensorr   r   r  r<   r<   r<   r=   r     s   
!r   )r   	mask_probmask_lengthr   	min_masksr   c                    s  | \}dk rt dkr6t d d dtjd   fdd}|durt| d	 nfd
dt|D }tj	|ft
d}g }	|}
|
dkr|S |D ]v}||}tjjt|d  |dd}t|dkrd }n|d }t|tj|
| tjd| g}|	| qt|	}	t|	dddddf ||
f}	|	||
 }	tddddf }t|||
f||
 }|	| }	|	 d kr҈d |	|	d k< t||	dd	 |S )an  
    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
    ASR](https://huggingface.co/papers/1904.08779). Note that this method is not optimized to run on TPU and should be run on
    CPU as part of the preprocessing during training.

    Args:
        shape: The shape for which to compute masks. This should be of a tuple of size 2 where
               the first element is the batch size and the second element is the length of the axis to span.
        mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                    independently generated mask spans of length `mask_length` is computed by
                    `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                    actual percentage will be smaller.
        mask_length: size of the mask
        min_masks: minimum number of masked spans
        attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                        each batch dimension.
    r   z&`mask_length` has to be bigger than 0.zO`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: z and `sequence_length`: `c                    sX   t |     }t|}| kr2 }| d  |k rTt| d  d}|S )z;Given input length, compute how many spans should be maskedr   r   )r   max)r   num_masked_spanepsilonr  r  r  sequence_lengthr<   r=   compute_num_masked_span  s    
z6_compute_mask_indices.<locals>.compute_num_masked_spanNrY   c                    s   g | ]} qS r<   r<   r   )r  r<   r=   rc   &  re   z)_compute_mask_indices.<locals>.<listcomp>r   r   F)replace)rg   nprandomr   itemdetachr   tolistrf   r   r   choicer   lenZconcatenateonesZint32appendarrayZbroadcast_tor   r  Zput_along_axis)r   r  r  r   r  r   r  r   Zspec_aug_maskZspec_aug_mask_idxsZmax_num_masked_spanr   r	  Zspec_aug_mask_idxZdummy_mask_idxoffsetsr<   r
  r=   _compute_mask_indices  s\    

r  c                       s   e Zd Zed fddZdejeej eej dddZ	e
deej eej eej ee ee ee eeef dd	d
Z  ZS )r   rb   c                    sz   t  | || _t|| _t|| _|jdks:|jdkrRt	
t|j | _|jrdt|| _n
t|| _|   d S )Nr~   )r%   r&   r9   r\   feature_extractorrt   feature_projectionmask_time_probmask_feature_probr'   	Parameterr   r   r)   r   r   Zdo_stable_layer_normr   encoderr   	post_initr}   r:   r<   r=   r&   f  s    


zHubertModel.__init__N)r@   mask_time_indicesr   c                 C   s  t | jdds|S | \}}}|dur<| j|j||< nZ| jjdkr| jrt||f| jj| jj	|| jj
d}tj||jtjd}| j|j||< | jjdkr| jrt||f| jj| jj| jjd}tj||jtjd}|dddf d|d}d||< |S )	z
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://huggingface.co/papers/1904.08779).
        Zapply_spec_augmentTNr   )r  r  r   r  )r   r   )r  r  r  rY   )r   r9   r   r   r   r   r  rq   r  Zmask_time_lengthZmask_time_min_masksr   r   r   r   r  Zmask_feature_lengthZmask_feature_min_masksexpand)r8   r@   r#  r   r   r  r)   Zmask_feature_indicesr<   r<   r=   _mask_hidden_statesx  s4    zHubertModel._mask_hidden_states)rr   r   r#  r   r   r   r   c           
      C   s   |dur|n| j j}|dur |n| j j}|dur4|n| j j}| |}|dd}|durl| |jd |}| |}| j	||d}| j
|||||d}	|	d }|s|f|	dd  S t||	j|	jdS )a1  
        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
            masked extracted features in *config.proj_codevector_dim* space.

        Example:

        ```python
        >>> from transformers import AutoProcessor, HubertModel
        >>> from datasets import load_dataset

        >>> processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
        >>> model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")


        >>> def map_to_array(example):
        ...     example["speech"] = example["audio"]["array"]
        ...     return example


        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> ds = ds.map(map_to_array)

        >>> input_values = processor(ds["speech"][0], return_tensors="pt").input_values  # Batch size 1
        >>> hidden_states = model(input_values).last_hidden_state
        ```Nr   r   )r#  r   r   r   r   r   r   )r9   r   r   use_return_dictr  r>   r  r   r  r%  r!  r   r@   r   )
r8   rr   r   r#  r   r   r   Zextract_featuresr@   Zencoder_outputsr<   r<   r=   rA     s2    $

zHubertModel.forward)NN)NNNNN)rC   rD   rE   r   r&   r   r   r   r  r%  r   r   r   r   r   r   rA   rF   r<   r<   r:   r=   r   d  s.     .     
r   zn
    Hubert Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).
    )Zcustom_introc                       s   e Zd Zdee d fddZdd Zdd Zd	d
 Zdd Z	e
deej eej ee ee ee eej eeef dddZ  ZS )HubertForCTCN)target_langc                    s~   t  | t|| _t|j| _|| _|j	du rFt
d| j dt|dr\|jr\|jn|j}t||j	| _|   dS )a0  
        target_lang (`str`, *optional*):
            Language id of adapter weights. Adapter weights are stored in the format adapter.<lang>.safetensors or
            adapter.<lang>.bin. Only relevant when using an instance of [`HubertForCTC`] with adapters. Uses 'eng' by
            default.
        NzYou are trying to instantiate z with a configuration that does not define the vocabulary size of the language model head. Please instantiate the model as follows: `HubertForCTC.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's configuration.add_adapter)r%   r&   r   r   r'   r{   Zfinal_dropoutr|   r)  
vocab_sizerg   r;   r/   r*  output_hidden_sizer)   ry   lm_headr"  )r8   r9   r)  r,  r:   r<   r=   r&     s    

zHubertForCTC.__init__c                 C   sr   | j }|dur2t| jdddu r2td| dn<|du rXt| jdddurXtd n|durn| j|dd dS )a'  
        This method overwrites [`~PreTrainedModel.tie_weights`] so that adapter weights can be correctly loaded when
        passing `target_lang=...` to `from_pretrained(...)`.

        This method is **not** supposed to be called by the user and is prone to be changed in the future.
        Nr   zCannot pass `target_lang`: z- if `config.adapter_attn_dim` is not defined.z)By default `target_lang` is set to 'eng'.T)Z
force_load)r)  r   r9   rg   loggerinfoZload_adapter)r8   r)  r<   r<   r=   tie_weights  s    zHubertForCTC.tie_weightsc                 C   s   t dt |   dS )
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. Please use the equivalent `freeze_feature_encoder` method instead.NwarningswarnFutureWarningfreeze_feature_encoderr8   r<   r<   r=   freeze_feature_extractor)  s
    z%HubertForCTC.freeze_feature_extractorc                 C   s   | j j  dS r1  Nr   r  rp   r8  r<   r<   r=   r7  5  s    z#HubertForCTC.freeze_feature_encoderc                 C   s   | j  D ]
}d|_q
dS z
        Calling this function will disable the gradient computation for the base model so that its parameters will not
        be updated during training. Only the classification head will be updated.
        FNr   rl   rm   rn   r<   r<   r=   freeze_base_model<  s    zHubertForCTC.freeze_base_modelrr   r   r   r   r   labelsr   c              
   C   s  |dur|n| j j}|dur>| | j jkr>td| j j | j|||||d}|d }| |}| |}	d}
|dur@|dur|ntj	|tj
d}| |dtj
}|dk}|d}||}tjj|	dtjddd}tjjjd	d
6 tjj||||| j j| j j| j jd}
W d   n1 s60    Y  |sp|	f|td  }|
durl|
f| S |S t|
|	|j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
            Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
            the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
            All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
            config.vocab_size - 1]`.
        Nz$Label values must be <= vocab_size: r&  r   r  rY   )r#   r   r   F)Zenabled)blankZ	reductionZzero_infinitylosslogitsr@   r   )r9   r'  r  r+  rg   r   r|   r-  r   Z	ones_liker   r   r   r   Zmasked_selectr'   r   Zlog_softmaxZfloat32r>   backendsZcudnnflagsZctc_lossZpad_token_idZctc_loss_reductionZctc_zero_infinity_HIDDEN_STATES_START_POSITIONr   r@   r   )r8   rr   r   r   r   r   r@  r   r@   rD  rC  r   Zlabels_maskZtarget_lengthsZflattened_targetsZ	log_probsoutputr<   r<   r=   rA   D  sL    




&
zHubertForCTC.forward)N)NNNNN)rC   rD   rE   r   r   r&   r0  r9  r7  r>  r   r   r   r   r   r   r   rA   rF   r<   r<   r:   r=   r(    s(        
r(  z
    Hubert Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like
    SUPERB Keyword Spotting.
    c                       sz   e Zd Z fddZdd Zdd Zdd Zedee	j
 ee	j
 ee ee ee ee	j
 eeef d
ddZ  ZS )r   c                    s   t  | t|dr$|jr$tdt|| _|jd }|jrTt	
t|| | _t	|j|j| _t	|j|j| _|   d S )Nr*  z]Sequence classification does not support the use of Hubert adapters (config.add_adapter=True)r   )r%   r&   r/   r*  rg   r   r   r   use_weighted_layer_sumr'   r   r   r  r   ry   r)   Zclassifier_proj_size	projector
num_labels
classifierr"  )r8   r9   Z
num_layersr:   r<   r=   r&     s    

z(HubertForSequenceClassification.__init__c                 C   s   t dt |   dS )z
        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
        not be updated during training.
        r2  Nr3  r8  r<   r<   r=   r9    s
    z8HubertForSequenceClassification.freeze_feature_extractorc                 C   s   | j j  dS r:  r;  r8  r<   r<   r=   r7    s    z6HubertForSequenceClassification.freeze_feature_encoderc                 C   s   | j  D ]
}d|_q
dS r<  r=  rn   r<   r<   r=   r>    s    z1HubertForSequenceClassification.freeze_base_modelNr?  c                 C   s  |dur|n| j j}| j jr dn|}| j|||||d}| j jr|t }tj|dd}tjj	| j
dd}	||	ddd jdd}n|d }| |}|du r|jdd}
nV| |jd |}|ddd|jd }d	|| < |jdd|jdddd }
| |
}d}|dur<t }||d| j j|d}|sl|f|td  }|durh|f| S |S t|||j|jd
S )a
  
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
            into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
            (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
            To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion
            into a tensor of type `torch.FloatTensor`. See [`HubertProcessor.__call__`] for details.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        NTr&  r   r   rY   r   r   r~   rB  )r9   r'  rI  r   rG  r   stackr'   r   r   r   r   r   rJ  r   r  r   r   r   rL  r   rK  r   r@   r   )r8   rr   r   r   r   r   r@  r   r@   Znorm_weightsZpooled_outputZpadding_maskZexpand_padding_maskrD  rC  Zloss_fctrH  r<   r<   r=   rA     sH    

 

z'HubertForSequenceClassification.forward)NNNNN)rC   rD   rE   r&   r9  r7  r>  r   r   r   r   r   r   r   r   rA   rF   r<   r<   r:   r=   r     s&        
r   )r(  r   r   r   )Nr~   N)Nr   )Ir4  typingr   r   r   numpyr  r   Ztorch.nnr'   r   Zactivationsr   Zintegrations.deepspeedr   Zintegrations.fsdpr	   Zmodeling_attn_mask_utilsr
   r   Zmodeling_flash_attention_utilsr   Zmodeling_layersr   Zmodeling_outputsr   r   r   Zmodeling_utilsr   r   Zprocessing_utilsr   r.   r   r   r   Zconfiguration_hubertr   Zintegrations.flex_attentionr   Z
get_loggerrC   r.  Moduler   r5   rH   rU   rZ   r\   rt   r   r   r   r   r   r   r   r   r   r   r   r   r   r  Zndarrayr  r   rG  r(  r   __all__r<   r<   r<   r=   <module>   s   
2&   X$].aJ  
w 
 s