a
    ¾ÀhG  ã                   @   s   d Z ddlZddlZddlmZ ddlmZmZ ddlZddl	m
Z
 ddlmZmZ ddlmZ ddlmZmZ d	d
lmZmZmZmZmZmZmZmZmZ ddlmZ e e ¡Z!eeddG dd„ deƒƒƒZ"G dd„ deƒZ#G dd„ deƒZ$G dd„ deƒZ%G dd„ deƒZ&G dd„ deƒZ'G dd„ deƒZ(eG dd„ deƒƒZ)eZ*G dd „ d e)eƒZ+ed!dG d"d#„ d#e)ƒƒZ,G d$d%„ d%eƒZ-G d&d'„ d'eƒZ.g d(¢Z/dS ))zPyTorch UniSpeech model.é    N)Ú	dataclass)ÚOptionalÚUnioné   )ÚModelOutputÚWav2Vec2BaseModelOutput)ÚPreTrainedModel)Úauto_docstringÚloggingé   )	ÚWav2Vec2EncoderÚWav2Vec2EncoderStableLayerNormÚWav2Vec2FeatureEncoderÚWav2Vec2FeatureProjectionÚWav2Vec2ForCTCÚ!Wav2Vec2ForSequenceClassificationÚWav2Vec2GumbelVectorQuantizerÚWav2Vec2ModelÚWav2Vec2PositionalConvEmbeddingé   )ÚUniSpeechConfigzh
    Output type of [`UniSpeechForPreTrainingOutput`], with potential hidden states and attentions.
    )Zcustom_introc                   @   s†   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eej ed< dZeej ed< dZeeej  ed< dZeeej  ed< dS )	ÚUniSpeechForPreTrainingOutputaÝ  
    loss (*optional*, returned when model is in train mode, `torch.FloatTensor` of shape `(1,)`):
        Total loss as the sum of the contrastive loss (L_m) and the diversity loss (L_d) as stated in the [official
        paper](https://huggingface.co/papers/2006.11477).
    projected_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
        Hidden-states of the model projected to *config.proj_codevector_dim* that can be used to predict the masked
        projected quantized states.
    projected_quantized_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
        Quantized extracted feature vectors projected to *config.proj_codevector_dim* representing the positive
        target vectors for contrastive loss.
    codevector_perplexity (`torch.FloatTensor` of shape `(1,)`):
        The perplexity of the codevector distribution, used to measure the diversity of the codebook.
    NÚlossÚprojected_statesÚprojected_quantized_statesÚcodevector_perplexityÚhidden_statesÚ
attentions)Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   r   ÚtorchÚFloatTensorÚ__annotations__r   r   r   r   Útupler   © r&   r&   úk/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/unispeech/modular_unispeech.pyr   -   s   
r   c                   @   s   e Zd ZdS )Ú UniSpeechPositionalConvEmbeddingN©r   r   r    r&   r&   r&   r'   r(   J   s   r(   c                   @   s   e Zd ZdS )ÚUniSpeechFeatureEncoderNr)   r&   r&   r&   r'   r*   N   s   r*   c                   @   s   e Zd ZdS )ÚUniSpeechFeatureProjectionNr)   r&   r&   r&   r'   r+   R   s   r+   c                   @   s   e Zd ZdS )ÚUniSpeechEncoderNr)   r&   r&   r&   r'   r,   V   s   r,   c                   @   s   e Zd ZdS )ÚUniSpeechEncoderStableLayerNormNr)   r&   r&   r&   r'   r-   Z   s   r-   c                   @   s    e Zd Zedd„ ƒZdd„ ZdS )ÚUniSpeechGumbelVectorQuantizerc                 C   s8   | j dd}t tj|t |d ¡ dd ¡ ¡ }|S )Nr   ©ÚdimgH¯¼šò×z>éÿÿÿÿ)Úmeanr"   ÚexpÚsumÚlog)ZprobsZmarginal_probsÚ
perplexityr&   r&   r'   Ú_compute_perplexity_   s    (z2UniSpeechGumbelVectorQuantizer._compute_perplexityc                 C   s  |j \}}}|  |¡}| || | j d¡}| jr~tjj| ¡ | j	dd 
|¡}tj| || | jd¡ ¡ dd}|  |¡}nH|jdd}|j|j Ž  d| dd¡d¡}| || | jd¡}|  |¡}| || d¡}| d¡| j }	|	 || | j| jd¡}
|
 d¡ ||d¡}
|
|fS )Nr1   T)ÚtauÚhardr/   r   ç      ð?éþÿÿÿ)ÚshapeÚweight_projÚviewZ
num_groupsZtrainingÚnnZ
functionalZgumbel_softmaxÚfloatÚtemperatureÚtype_asr"   Zsoftmaxr7   ZargmaxZ	new_zerosZscatter_Ú	unsqueezeÚcodevectorsZnum_varsr4   )Úselfr   Ú
batch_sizeZsequence_lengthÚhidden_sizeZcodevector_probsZcodevector_soft_distr6   Zcodevector_idxZcodevectors_per_grouprD   r&   r&   r'   Úforwarde   s0    
ÿþÿÿ
z&UniSpeechGumbelVectorQuantizer.forwardN)r   r   r    Ústaticmethodr7   rH   r&   r&   r&   r'   r.   ^   s   
r.   c                   @   s`   e Zd ZU eed< dZdZdZdZdZ	dZ
dd„ Zeejef dœdd	„Zeejd
œdd„ZdS )ÚUniSpeechPreTrainedModelÚconfigÚ	unispeechÚinput_valuesTc              	   C   s¬  t |tƒr>|jjjjddd |jjj ¡  tj	 
|j¡ njt |tƒr’tj	j|jjddt d|jjd |jj  ¡ d tj	 |jjd¡ nt |tƒràt d|jj ¡}tj	j
|jj| |d tj	j
|jj| |d nÈt |tjƒr|jjjd| jjd |jdur¨|jj ¡  nŠt |tjtjfƒrN|jj ¡  |jj d¡ nZt |tjƒr¨tj	 |j¡ |jdur¨t |j|j|jd   ¡}tj	j
|j| |d dS )	zInitialize the weightsç        r   )r2   Zstdr   r   )ÚaÚbNr:   )Ú
isinstancer.   r=   ÚweightÚdataZnormal_ZbiasZzero_r?   ÚinitÚuniform_rD   r(   ÚconvÚmathÚsqrtÚkernel_sizeZin_channelsZ	constant_r+   Z
projectionZin_featuresÚLinearrK   Zinitializer_rangeZ	LayerNormZ	GroupNormÚfill_ZConv1dZkaiming_normal_Úgroups)rE   ÚmoduleÚkr&   r&   r'   Ú_init_weights•   s6    

 ý
z&UniSpeechPreTrainedModel._init_weights)Úinput_lengthsc                 C   s4   dd„ }t | jj| jjƒD ]\}}||||ƒ}q|S )zH
        Computes the output length of the convolutional layers
        c                 S   s   t j| | |ddd S )NÚfloor)Zrounding_moder   )r"   Údiv)Zinput_lengthrY   Ústrider&   r&   r'   Ú_conv_out_length»   s    zSUniSpeechPreTrainedModel._get_feat_extract_output_lengths.<locals>._conv_out_length)ÚziprK   Zconv_kernelZconv_stride)rE   r`   rd   rY   rc   r&   r&   r'   Ú _get_feat_extract_output_lengths¶   s    z9UniSpeechPreTrainedModel._get_feat_extract_output_lengths)Úfeature_vector_lengthÚattention_maskc                 C   s   |j ddd d …df }|  |¡ tj¡}|jd }tj||f|j|jd}d|tj	|jd |jd|d f< | 
dg¡  d¡ 
dg¡ ¡ }|S )Nr1   r/   r   )ÚdtypeÚdevicer   )rj   )Zcumsumrf   Útor"   Úlongr<   Zzerosri   rj   ZarangeÚflipÚbool)rE   rg   rh   Znon_padded_lengthsZoutput_lengthsrF   r&   r&   r'   Ú"_get_feature_vector_attention_maskÅ   s    
ÿ"z;UniSpeechPreTrainedModel._get_feature_vector_attention_maskN)r   r   r    r   r$   Zbase_model_prefixZmain_input_nameZsupports_gradient_checkpointingZ_supports_flash_attnZ_supports_sdpaZ_supports_flex_attnr_   r   r"   Z
LongTensorÚintrf   ro   r&   r&   r&   r'   rJ   ‹   s   
!rJ   c                
   @   sl   e Zd Zedœdd„Zdd„ Zdd„ Zdeej	 eej	 eej
 ee ee ee eeef d	œd
d„ZdS )ÚUniSpeechModel©rK   c                 C   sz   t  | |¡ || _t|ƒ| _t|ƒ| _|jdks:|jdkrRt	 
t |j¡ ¡ ¡| _|jrdt|ƒ| _n
t|ƒ| _|  ¡  d S )NrN   )rJ   Ú__init__rK   r*   Úfeature_extractorr+   Úfeature_projectionZmask_time_probZmask_feature_probr?   Ú	Parameterr"   ÚTensorrG   rU   Zmasked_spec_embedZdo_stable_layer_normr-   Úencoderr,   Ú	post_init©rE   rK   r&   r&   r'   rs   Ù   s    


zUniSpeechModel.__init__c                 C   s   t dƒ‚d S ©NzNot needed for UniSpeech©ÚAttributeError©rE   r&   r&   r'   Úfreeze_feature_extractorê   s    z'UniSpeechModel.freeze_feature_extractorc                 C   s   t dƒ‚d S r{   r|   r~   r&   r&   r'   Úfreeze_feature_encoderí   s    z%UniSpeechModel.freeze_feature_encoderN)rM   rh   Úmask_time_indicesÚoutput_attentionsÚoutput_hidden_statesÚreturn_dictÚreturnc           
      C   sÒ   |dur|n| j j}|dur |n| j j}|dur4|n| j j}|  |¡}| dd¡}|durl|  |jd |¡}|  |¡\}}| j	|||d}| j
|||||d}	|	d }|s¾||f|	dd…  S t|||	j|	jdS )a/  
        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
            masked extracted features in *config.proj_codevector_dim* space.
        Nr   r   )r   rh   ©rh   r‚   rƒ   r„   r   )Zlast_hidden_stateÚextract_featuresr   r   )rK   r‚   rƒ   Úuse_return_dictrt   Ú	transposero   r<   ru   Z_mask_hidden_statesrx   ÚUniSpeechBaseModelOutputr   r   )
rE   rM   rh   r   r‚   rƒ   r„   r‡   r   Zencoder_outputsr&   r&   r'   rH   ð   s8    ÿ
ÿûüzUniSpeechModel.forward)NNNNN)r   r   r    r   rs   r   r€   r   r"   rw   r#   rn   r   r%   rŠ   rH   r&   r&   r&   r'   rq   Ø   s"        ù
ørq   zZ
    UniSpeech Model with a vector-quantization module and ctc loss for pre-training.
    c                
       sž   e Zd Zedœ‡ fdd„Zedœdd„Zdd„ Zd	d
„ Ze	de
je
je
jedœdd„ƒZedee
j ee
j ee ee ee eeef dœdd„ƒZ‡  ZS )ÚUniSpeechForPreTrainingrr   c                    s~   t ƒ  |¡ t|ƒ| _t |j¡| _t|ƒ| _	t 
|j|j¡| _t 
|j|j¡| _t 
|j|j¡| _t |j¡| _|  ¡  d S )N)Úsuperrs   rq   rL   r?   ZDropoutZfeat_quantizer_dropoutÚdropout_featuresr.   Ú	quantizerrZ   Zcodevector_dimZproj_codevector_dimÚ	project_qrG   Úproject_hidZnum_ctc_classesÚctc_projZfinal_dropoutÚdropoutry   rz   ©Ú	__class__r&   r'   rs   +  s    

z UniSpeechForPreTraining.__init__)rA   c                 C   s   || j _dS )zb
        Set the Gumbel softmax temperature to a given value. Only necessary for training
        N)rŽ   rA   )rE   rA   r&   r&   r'   Úset_gumbel_temperature:  s    z.UniSpeechForPreTraining.set_gumbel_temperaturec                 C   s   t  dt¡ |  ¡  dS )z©
        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
        not be updated during training.
        zžThe method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. Please use the equivalent `freeze_feature_encoder` method instead.N)ÚwarningsÚwarnÚFutureWarningr€   r~   r&   r&   r'   r   @  s
    ýz0UniSpeechForPreTraining.freeze_feature_extractorc                 C   s   | j j ¡  dS )z¨
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        N)rL   rt   Z_freeze_parametersr~   r&   r&   r'   r€   L  s    z.UniSpeechForPreTraining.freeze_feature_encoderr   )Útarget_featuresÚnegative_featuresÚpredicted_featuresrA   c                 C   s@   t j| |gdd} t j| ¡ |  ¡ dd}| | ¡}|| }|S )zé
        Compute logits for contrastive loss based using cosine similarity as the distance measure between
        `[positive_feature, negative_features]` and `[predicted_features]`. Additionally, temperature can be applied.
        r   r/   r1   )r"   ÚcatZcosine_similarityr@   rB   )r™   rš   r›   rA   Úlogitsr&   r&   r'   Úcompute_contrastive_logitsS  s
    
z2UniSpeechForPreTraining.compute_contrastive_logitsN)rM   rh   r‚   rƒ   r„   r…   c                 C   sN  |dur|n| j j}| j|||||d}|d }|  |d ¡}|  |¡\}	}
|  |	 | jjj¡¡}	|  	|	¡}	t
 | d¡| d¡¡ | j j¡}| dd¡}t
 |¡ ¡  |j¡}| dd¡}| d¡}| |d¡|	 | d¡ }|  |¡}|  |¡}d}|s6|dur |||	|
f|dd…  S ||	|
f|dd…  S t|||	|
|j|jdS )	a›  
        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoFeatureExtractor, UniSpeechForPreTraining

        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/unispeech-large-1500h-cv")
        >>> model = UniSpeechForPreTraining.from_pretrained("microsoft/unispeech-large-1500h-cv")
        >>> # TODO: Add full pretraining example
        ```Nr†   r   r   r1   rN   r   )r   r   r   r   r   r   )rK   rˆ   rL   r   rŽ   r   rk   rR   ri   r   r"   ÚemptyÚsizer[   Zreplace_probr‰   Z	bernoullirn   rj   rC   Zmasked_fillr’   r‘   r   r   r   )rE   rM   rh   r‚   rƒ   r„   ÚoutputsZtransformer_featuresr‡   Zquantized_featuresr   Zprob_replace_matrixZsampled_replace_matrixr   r   r&   r&   r'   rH   g  sL    û
ÿ

ÿ


úzUniSpeechForPreTraining.forward)r   )NNNN)r   r   r    r   rs   rp   r•   r   r€   rI   r"   r#   rž   r	   r   rw   rn   r   r%   r   rH   Ú__classcell__r&   r&   r“   r'   r‹   %  s2    üü    ú
ùr‹   c                   @   s   e Zd ZdS )ÚUniSpeechForCTCNr)   r&   r&   r&   r'   r£   ¯  s   r£   c                   @   s   e Zd ZdS )Ú"UniSpeechForSequenceClassificationNr)   r&   r&   r&   r'   r¤   ³  s   r¤   )r£   r‹   r¤   rq   rJ   )0r!   rW   r–   Údataclassesr   Útypingr   r   r"   Ztorch.nnr?   Zmodeling_outputsr   r   Zmodeling_utilsr   Úutilsr	   r
   Zwav2vec2.modeling_wav2vec2r   r   r   r   r   r   r   r   r   Zconfiguration_unispeechr   Z
get_loggerr   Úloggerr   r(   r*   r+   r,   r-   r.   rJ   rŠ   rq   r‹   r£   r¤   Ú__all__r&   r&   r&   r'   Ú<module>   sF   ,
ÿ-IMÿ 