a
    h.                     @   sN  d Z ddlmZmZ ddlZddlmZ ddlmZ ddl	m
Z
 ddlmZ ddlmZ dd	lmZ d
dlmZmZmZmZmZmZmZ ddlmZ dZG dd dejZG dd deZG dd deZG dd dejZ G dd deZ!G dd deZ"eG dd deZ#G dd dee#Z$G dd deZ%G d d! d!eZ&g d"Z'dS )#zPyTorch Hubert model.    )OptionalUnionN   )ACT2FN)is_deepspeed_zero3_enabled)BaseModelOutput)PreTrainedModel)auto_docstring   )Wav2Vec2EncoderWav2Vec2EncoderStableLayerNormWav2Vec2FeatureEncoderWav2Vec2ForCTC!Wav2Vec2ForSequenceClassificationWav2Vec2ModelWav2Vec2SamePadLayer   )HubertConfigc                       s$   e Zd Z fddZdd Z  ZS )HubertPositionalConvEmbeddingc                    sB  t    tj|j|j|j|jd |jd| _d | _|j	rJt
|j| _ntjj}ttjjdrjtjjj}t rdd l}|jj| jjdd" || jddd| _W d    n1 s0    Y  t| jdr| jjjj}| jjjj}n| jj}| jj}|j| | |j| | n|| jddd| _t|j| _t|j | _d S )	Nr
   )kernel_sizepaddinggroupsweight_normr   Zmodifier_rankweight)namedimparametrizations)super__init__nnConv1dhidden_sizeZnum_conv_pos_embeddingsZnum_conv_pos_embedding_groupsconv
batch_normZconv_pos_batch_normBatchNorm1dutilsr   hasattrr   r   	deepspeedzeroGatheredParametersr   Z	original0Z	original1weight_gweight_vZregister_external_parameterHubertSamePadLayerr   r   Zfeat_extract_activation
activation)selfconfigr   r(   r+   r,   	__class__ e/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/hubert/modular_hubert.pyr   +   s8    

0z&HubertPositionalConvEmbedding.__init__c                 C   sN   | dd}| jd ur | |}| |}| |}| |}| dd}|S )Nr   r
   )	transposer$   r#   r   r.   r/   hidden_statesr3   r3   r4   forwardP   s    




z%HubertPositionalConvEmbedding.forward__name__
__module____qualname__r   r8   __classcell__r3   r3   r1   r4   r   *   s   %r   c                   @   s   e Zd ZdS )r-   Nr:   r;   r<   r3   r3   r3   r4   r-   \   s   r-   c                   @   s   e Zd ZdS )HubertFeatureEncoderNr>   r3   r3   r3   r4   r?   `   s   r?   c                       s$   e Zd Z fddZdd Z  ZS )HubertFeatureProjectionc                    sX   t    |j| _| jr0tj|jd |jd| _t|jd |j	| _
t|j| _d S )N)eps)r   r   feat_proj_layer_normr    	LayerNormZconv_dimZlayer_norm_eps
layer_normLinearr"   
projectionZDropoutZfeat_proj_dropoutdropoutr/   r0   r1   r3   r4   r   e   s    
z HubertFeatureProjection.__init__c                 C   s(   | j r| |}| |}| |}|S )N)rC   rE   rG   rH   r6   r3   r3   r4   r8   m   s
    


zHubertFeatureProjection.forwardr9   r3   r3   r1   r4   r@   d   s   r@   c                   @   s   e Zd ZdS )HubertEncoderNr>   r3   r3   r3   r4   rJ   v   s   rJ   c                   @   s   e Zd ZdS )HubertEncoderStableLayerNormNr>   r3   r3   r3   r4   rK   z   s   rK   c                   @   s`   e Zd ZU eed< dZdZdZdZdZ	dZ
dd Zeejef ddd	Zeejd
ddZdS )HubertPreTrainedModelr0   Zhubertinput_valuesTc                 C   s  t |tjr<|jjjd| jjd |jdur8|jj	  nxt |tj
tjtjfrp|jj	  |jjd nDt |tjr\t r2ddl}t|drt|dr|jj|j|jgdd  tj|jj W d   n1 s0    Y  nD|jj|jdd  tj|jj W d   n1 s&0    Y  ntj|jj |jdur|jj	  nXt |trt|d	r|jj  n2t |trt|d
r|jjd| jjd   dS )zInitialize the weights        )meanZstdNg      ?r   r,   r+   r   masked_spec_embedlayer_weightsr   )
isinstancer    rF   r   dataZnormal_r0   Zinitializer_rangeZbiasZzero_rD   Z	GroupNormr%   Zfill_r!   r   r(   r'   r)   r*   r,   r+   initZkaiming_normal_HubertModelrP   uniform_HubertForSequenceClassificationrQ   Znum_hidden_layers)r/   moduler(   r3   r3   r4   _init_weights   s0    
02z#HubertPreTrainedModel._init_weights)input_lengthsc                 C   s4   dd }t | jj| jjD ]\}}||||}q|S )zH
        Computes the output length of the convolutional layers
        c                 S   s   t j| | |ddd S )Nfloor)Zrounding_moder   )torchdiv)Zinput_lengthr   strider3   r3   r4   _conv_out_length   s    zPHubertPreTrainedModel._get_feat_extract_output_lengths.<locals>._conv_out_length)zipr0   Zconv_kernelZconv_stride)r/   rZ   r_   r   r^   r3   r3   r4    _get_feat_extract_output_lengths   s    z6HubertPreTrainedModel._get_feat_extract_output_lengths)feature_vector_lengthattention_maskc                 C   s~   |  |dtj}|jd }tj||f|j|jd}d|tj	|jd |jd|d f< |
dgd
dg }|S )NrA   r   )dtypedevicer   )re   )ra   sumtor\   longshapeZzerosrd   re   ZarangeflipZcumsumbool)r/   rb   rc   Zoutput_lengthsZ
batch_sizer3   r3   r4   "_get_feature_vector_attention_mask   s    
"z8HubertPreTrainedModel._get_feature_vector_attention_maskN)r:   r;   r<   r   __annotations__Zbase_model_prefixZmain_input_nameZsupports_gradient_checkpointingZ_supports_flash_attnZ_supports_sdpaZ_supports_flex_attnrY   r   r\   Z
LongTensorintra   rl   r3   r3   r3   r4   rL   ~   s   
!rL   c                
       st   e Zd Zed fddZdd Zdd Zdeej	 eej	 eej
 ee ee ee eeef d	d
dZ  ZS )rU   )r0   c                    s~   t  | || _t|| _t|| _|jdks:|jdkrRt	
t|j | _|jrdt|| _n
t|| _|   | `d S )NrN   )r   r   r0   r?   feature_extractorr@   feature_projectionZmask_time_probZmask_feature_probr    	Parameterr\   Tensorr"   rV   rP   Zdo_stable_layer_normrK   encoderrJ   Z	post_initadapterrI   r1   r3   r4   r      s    


zHubertModel.__init__c                 C   s   t dd S NzNot needed for HubertAttributeErrorr/   r3   r3   r4   freeze_feature_extractor   s    z$HubertModel.freeze_feature_extractorc                 C   s   t dd S ru   rv   rx   r3   r3   r4   freeze_feature_encoder   s    z"HubertModel.freeze_feature_encoderN)rM   rc   mask_time_indicesoutput_attentionsoutput_hidden_statesreturn_dictreturnc           
      C   s   |dur|n| j j}|dur |n| j j}|dur4|n| j j}| |}|dd}|durl| |jd |}| |}| j	||d}| j
|||||d}	|	d }|s|f|	dd  S t||	j|	jdS )a1  
        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
            masked extracted features in *config.proj_codevector_dim* space.

        Example:

        ```python
        >>> from transformers import AutoProcessor, HubertModel
        >>> from datasets import load_dataset

        >>> processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
        >>> model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")


        >>> def map_to_array(example):
        ...     example["speech"] = example["audio"]["array"]
        ...     return example


        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> ds = ds.map(map_to_array)

        >>> input_values = processor(ds["speech"][0], return_tensors="pt").input_values  # Batch size 1
        >>> hidden_states = model(input_values).last_hidden_state
        ```Nr   r
   )r{   )rc   r|   r}   r~   r   )Zlast_hidden_stater7   
attentions)r0   r|   r}   Zuse_return_dictro   r5   rl   ri   rp   Z_mask_hidden_statesrs   r   r7   r   )
r/   rM   rc   r{   r|   r}   r~   Zextract_featuresr7   Zencoder_outputsr3   r3   r4   r8      s2    #

zHubertModel.forward)NNNNN)r:   r;   r<   r   r   ry   rz   r   r\   rr   ZFloatTensorrk   r   tupler   r8   r=   r3   r3   r1   r4   rU      s"        
rU   c                   @   s   e Zd ZdS )HubertForCTCNr>   r3   r3   r3   r4   r   &  s   r   c                   @   s   e Zd ZdS )rW   Nr>   r3   r3   r3   r4   rW   *  s   rW   )r   rW   rU   rL   )(__doc__typingr   r   r\   Ztorch.nnr    Zactivationsr   Zintegrations.deepspeedr   Zmodeling_outputsr   Zmodeling_utilsr   r&   r	   Zwav2vec2.modeling_wav2vec2r   r   r   r   r   r   r   Zconfiguration_hubertr   Z_HIDDEN_STATES_START_POSITIONModuler   r-   r?   r@   rJ   rK   rL   rU   r   rW   __all__r3   r3   r3   r4   <module>   s.   $	2Fa