from typing import Callable, Optional, Union

import numpy as np
import torch
import torch.nn as nn

from transformers.utils.generic import OutputRecorder, check_model_inputs

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
from ...generation import GenerationMixin
from ...masking_utils import create_causal_mask
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_attention_mask_for_sdpa
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPast,
    BaseModelOutputWithPastAndCrossAttentions,
    Seq2SeqLMOutput,
    Seq2SeqModelOutput,
)
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
from ...utils.deprecation import deprecate_kwarg
from .configuration_moonshine import MoonshineConfig


class MoonshineEncoderMLP(nn.Module):
    def __init__(self, config, hidden_act):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class MoonshineDecoderMLP(nn.Module):
    def __init__(self, config, hidden_act):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[hidden_act]
        # Gated MLP: fc1 projects to twice the intermediate size, then the output is
        # split into a value branch and a gate branch.
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size * 2)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states, gate = hidden_states.chunk(2, dim=-1)
        hidden_states = self.activation_fn(gate) * hidden_states
        hidden_states = self.fc2(hidden_states)
        return hidden_states


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    key_states = repeat_kv(key, module.num_key_value_groups)
    value_states = repeat_kv(value, module.num_key_value_groups)

    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
    if attention_mask is not None:
        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
        attn_weights = attn_weights + causal_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value_states)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., 0::2]
    x2 = x[..., 1::2]
    return torch.stack((-x2, x1), dim=-1).flatten(-2)
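# Illustrative note (added sketch, not part of the upstream module): unlike the common
# rotate-half convention that splits the head dimension into two contiguous halves,
# `rotate_half` above pairs adjacent (even, odd) channels, matching the interleaved
# cos/sin layout built in `apply_rotary_pos_emb` below:
#
#     x = torch.tensor([[1.0, 2.0, 3.0, 4.0]])
#     rotate_half(x)  # -> tensor([[-2., 1., -4., 3.]])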
|| t||  }|	| t|	|  }tj||gdd}tj||
gdd}||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
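
    Example (illustrative, values are random; the output shows shapes only):

    ```python
    >>> import torch
    >>> q = torch.randn(1, 8, 10, 64)  # (batch, num_heads, seq_len, head_dim)
    >>> k = torch.randn(1, 8, 10, 64)
    >>> cos = torch.randn(1, 10, 32)  # (batch, seq_len, rotary_dim)
    >>> sin = torch.randn(1, 10, 32)
    >>> q_embed, k_embed = apply_rotary_pos_emb(q, k, cos, sin)
    >>> q_embed.shape, k_embed.shape
    (torch.Size([1, 8, 10, 64]), torch.Size([1, 8, 10, 64]))
    ```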
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)

    # Interleave the rotary frequencies instead of the usual two-halves layout.
    cos = cos[..., : cos.shape[-1] // 2].repeat_interleave(2, dim=-1)
    sin = sin[..., : sin.shape[-1] // 2].repeat_interleave(2, dim=-1)

    # Only the first `rotary_dim` channels are rotated; the remainder pass through.
    rotary_dim = cos.shape[-1]
    q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:]
    k_rot, k_pass = k[..., :rotary_dim], k[..., rotary_dim:]

    q_embed = (q_rot * cos) + (rotate_half(q_rot) * sin)
    k_embed = (k_rot * cos) + (rotate_half(k_rot) * sin)

    q_embed = torch.cat([q_embed, q_pass], dim=-1)
    k_embed = torch.cat([k_embed, k_pass], dim=-1)
    return q_embed, k_embed


class MoonshineAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(
        self,
        config: MoonshineConfig,
        layer_idx: int,
        is_causal: bool,
        num_attention_heads: int,
        num_key_value_heads: int,
    ):
        super().__init__()
        config.update({"num_attention_heads": num_attention_heads, "num_key_value_heads": num_key_value_heads})
        self.config = config
        self.layer_idx = layer_idx
        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
        self.scaling = self.head_dim**-0.5
        self.attention_dropout = config.attention_dropout
        self.is_causal = is_causal

        self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias)
        self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias)
        self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias)
        self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)

        # Optionally pad the head dimension up to the next multiple, e.g. for attention
        # kernels that only support specific head sizes.
        if self.config.pad_head_dim_to_multiple_of is not None:
            target_multiple = self.config.pad_head_dim_to_multiple_of
            target_head_dim = target_multiple * ((self.head_dim + target_multiple - 1) // target_multiple)
            self.head_dim_padding = target_head_dim - self.head_dim
        else:
            self.head_dim_padding = 0

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        key_value_states: Optional[torch.Tensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        bsz, q_len = hidden_states.shape[:-1]

        query_states = (
            self.q_proj(hidden_states).view(bsz, q_len, self.config.num_attention_heads, self.head_dim).transpose(1, 2)
        )

        # If `key_value_states` is provided, this layer is used as a cross-attention layer.
        is_cross_attention = key_value_states is not None
        if past_key_values is not None:
            is_updated = past_key_values.is_updated.get(self.layer_idx)
            if is_cross_attention:
                # After the first generated id, we can subsequently re-use all key/value states from the cache.
                past_key_values.is_updated[self.layer_idx] = True
                curr_past_key_values = past_key_values.cross_attention_cache
            else:
                curr_past_key_values = past_key_values.self_attention_cache
        current_states = key_value_states if key_value_states is not None else hidden_states

        if is_cross_attention and past_key_values is not None and is_updated:
            # Re-use cached cross-attention keys/values.
            key_states = curr_past_key_values.layers[self.layer_idx].keys
            value_states = curr_past_key_values.layers[self.layer_idx].values
        else:
            key_states = (
                self.k_proj(current_states).view(bsz, -1, self.config.num_key_value_heads, self.head_dim).transpose(1, 2)
            )
            value_states = (
                self.v_proj(current_states).view(bsz, -1, self.config.num_key_value_heads, self.head_dim).transpose(1, 2)
            )
            if is_cross_attention and past_key_values is not None:
                key_states, value_states = curr_past_key_values.update(
                    key_states, value_states, self.layer_idx, {"cache_position": cache_position}
                )

        if not is_cross_attention:
            cos, sin = position_embeddings
            query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
            if past_key_values is not None:
                cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
                key_states, value_states = curr_past_key_values.update(
                    key_states, value_states, self.layer_idx, cache_kwargs
                )

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        is_causal = self.is_causal and attention_mask is None and q_len > 1

        if self.head_dim_padding > 0:
            query_states = torch.nn.functional.pad(query_states, (0, self.head_dim_padding))
            key_states = torch.nn.functional.pad(key_states, (0, self.head_dim_padding))
            value_states = torch.nn.functional.pad(value_states, (0, self.head_dim_padding))

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            is_causal=is_causal,
            **kwargs,
        )

        if self.head_dim_padding > 0:
            attn_output = attn_output[..., : -self.head_dim_padding]

        attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights
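# Worked example (illustrative sketch, not from the upstream module): with head_dim=36
# and pad_head_dim_to_multiple_of=8, the padding logic above right-pads q/k/v with zero
# channels before the attention kernel and slices them off the output again:
#
#     target_head_dim  = 8 * ((36 + 8 - 1) // 8)   # -> 40
#     head_dim_padding = target_head_dim - 36      # -> 4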
  ZS )	MoonshineRotaryEmbeddinginv_freqNr'   c                    s   t    t|dr:t|jtr:|jd|jd| _nd| _|j| _	|j| _
|| _t| j | _| | j|\}| _| jd|dd | j| _d S )Nrope_scaling	rope_typetypedefaultr   F)
persistent)r%   r&   hasattr
isinstancer   dictr   r   Zmax_position_embeddingsZmax_seq_len_cachedZoriginal_max_seq_lenr'   r   Zrope_init_fnattention_scalingZregister_bufferr   Zoriginal_inv_freq)r/   r'   devicer   r0   r2   r3   r&   -  s    
z!MoonshineRotaryEmbedding.__init__c           
      C   s   | j d d d d f  |jd dd|j}|d d d d d f  }t|jjtrl|jjdkrl|jjnd}t	j
|ddV | |  dd}t	j||fdd	}| | j }| | j }	W d    n1 s0    Y  |j|jd
|	j|jd
fS )Nr   rA   r    ZmpscpuF)device_typeZenabledr@   rB   rX   )r   floatrG   rF   r_   r   r   r   strr<   Zautocastr]   rm   rp   r   rq   rX   )
r/   ri   rr   Zinv_freq_expandedZposition_ids_expandedr   ZfreqsZembrp   rq   r2   r2   r3   r7   >  s    0&,z MoonshineRotaryEmbedding.forward)N)r9   r:   r;   r<   r=   __annotations__r!   r&   Zno_gradr   r7   r>   r2   r2   r0   r3   r   *  s
   

class MoonshineEncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: MoonshineConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

        self.self_attn = MoonshineAttention(
            config=config,
            layer_idx=layer_idx,
            is_causal=False,
            num_attention_heads=config.encoder_num_attention_heads,
            num_key_value_heads=config.encoder_num_key_value_heads,
        )

        self.mlp = MoonshineEncoderMLP(config, config.encoder_hidden_act)
        self.input_layernorm = nn.LayerNorm(config.hidden_size, bias=False)
        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, bias=False)

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.Tensor:
        # Pre-norm residual block: self-attention, then MLP.
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)
        hidden_states, _ = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
            **kwargs,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        return hidden_states
zMoonshineEncoderLayer.forward)NNNFNN)r9   r:   r;   r!   r   r&   r   r<   r=   r   r   r	   r   r   r   r   r7   r>   r2   r2   r0   r3   r   N  s&         r   c                       s   e Zd Zdeee d fddZedddddej	eej	 eej	 eej	 eej
 eej
 ee ee eej
 eeej	ej	f  eeej	ej	f  ee eejeeejejf  f d
ddZ  ZS )MoonshineDecoderLayerNr   c                    s   t    |j| _t||d|j|jd| _t||d|j|jd| _t||j	| _
tj|jdd| _tj|jdd| _tj|jdd| _d S )NTru   Fry   )r%   r&   r+   rt   Zdecoder_num_attention_headsZdecoder_num_key_value_headsr   encoder_attnr?   Zdecoder_hidden_actr   r)   r   r   r   final_layernormr   r0   r2   r3   r&     s(    
zMoonshineDecoderLayer.__init__r   r   r   r   F)r5   rS   encoder_hidden_statesencoder_attention_maskrr   encoder_position_idsr   r   r   r   encoder_position_embeddingsrV   r6   c              
   K   s   |}|  |}| jf ||||||	|
d|\}}|| }|d urp|}| |}| j|||||d\}}|| }|}| |}| |}|| }|S )Nr   )r5   r   rS   r   r   )r   r   r   r   r   r   )r/   r5   rS   r   r   rr   r   r   r   r   r   r   rV   r   r   r2   r2   r3   r7     s<    






zMoonshineDecoderLayer.forward)N)
@auto_docstring
class MoonshinePreTrainedModel(PreTrainedModel):
    config: MoonshineConfig
    base_model_prefix = "model"
    main_input_name = "input_values"
    supports_gradient_checkpointing = True
    _no_split_modules = ["MoonshineEncoderLayer", "MoonshineDecoderLayer"]
    _supports_flash_attn = True
    _supports_sdpa = True
    _can_compile_fullgraph = True

    def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor):
        """
        Computes the output length of the convolutional layers
        """
        output_conv1_length = int((input_lengths - 127) / 64 + 1)
        output_conv2_length = int((output_conv1_length - 7) / 3 + 1)
        output_conv3_length = int((output_conv2_length - 3) / 2 + 1)
        return output_conv3_length
class MoonshineEncoder(MoonshinePreTrainedModel):
    """
    Transformer encoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MoonshineEncoderLayer`]

    Args:
        config: MoonshineConfig
    """

    main_input_name = "input_values"
    _can_record_outputs = {
        "attentions": MoonshineAttention,
        "hidden_states": MoonshineEncoderLayer,
    }

    def __init__(self, config: MoonshineConfig):
        super().__init__(config)
        self.config = config
        embed_dim = config.hidden_size

        self.conv1 = nn.Conv1d(1, embed_dim, kernel_size=127, stride=64, bias=False)
        self.conv2 = nn.Conv1d(embed_dim, 2 * embed_dim, kernel_size=7, stride=3)
        self.conv3 = nn.Conv1d(2 * embed_dim, embed_dim, kernel_size=3, stride=2)
        self.groupnorm = nn.GroupNorm(num_groups=1, num_channels=embed_dim, eps=1e-5)
        self.rotary_emb = MoonshineRotaryEmbedding(config=config)
        self.layers = nn.ModuleList(
            [MoonshineEncoderLayer(config, idx) for idx in range(config.encoder_num_hidden_layers)]
        )
        self.layer_norm = nn.LayerNorm(embed_dim, bias=False)

        self.gradient_checkpointing = False
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.conv1

    def set_input_embeddings(self, value: nn.Module):
        self.conv1 = value

    def _freeze_parameters(self):
        for param in self.parameters():
            param.requires_grad = False

    @check_model_inputs
    def forward(
        self,
        input_values: torch.FloatTensor,
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutput:
        r"""
        Args:
            input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
                Float values of the raw speech waveform. Raw speech waveform can be
                obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
                `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
                the soundfile library (`pip install soundfile`). To prepare the array into
                `input_values`, the [`AutoFeatureExtractor`] should be used for padding
                and conversion into a tensor of type `torch.FloatTensor`.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding indices in `input_values`. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
        """
        hidden_states = input_values.unsqueeze(1)

        hidden_states = nn.functional.tanh(self.conv1(hidden_states))
        hidden_states = self.groupnorm(hidden_states)
        hidden_states = nn.functional.gelu(self.conv2(hidden_states))
        hidden_states = nn.functional.gelu(self.conv3(hidden_states))
        hidden_states = hidden_states.permute(0, 2, 1)

        if attention_mask is not None:
            # Downsample the attention mask to the resolution of the conv stack output.
            mask_len = self._get_feat_extract_output_lengths(attention_mask.shape[-1])
            downsample_stride = 64 * 3 * 2  # conv strides
            attention_mask = attention_mask[..., ::downsample_stride][..., :mask_len]
            if self.config._attn_implementation == "flash_attention_2":
                attention_mask = attention_mask if (attention_mask == 0.0).any() else None
            elif self.config._attn_implementation == "sdpa":
                attention_mask = _prepare_4d_attention_mask_for_sdpa(attention_mask, hidden_states.dtype)
            else:
                attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)

        position_ids = torch.arange(0, hidden_states.shape[1], device=hidden_states.device).unsqueeze(0)
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        for encoder_layer in self.layers:
            hidden_states = encoder_layer(
                hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                position_embeddings=position_embeddings,
                **kwargs,
            )

        hidden_states = self.layer_norm(hidden_states)
        return BaseModelOutput(last_hidden_state=hidden_states)


@auto_docstring
class MoonshineDecoder(MoonshinePreTrainedModel):
    main_input_name = "input_ids"
    _can_record_outputs = {
        "attentions": OutputRecorder(MoonshineAttention, index=1, layer_name="self_attn"),
        "hidden_states": MoonshineDecoderLayer,
        "cross_attentions": OutputRecorder(MoonshineAttention, index=1, layer_name="encoder_attn"),
    }

    def __init__(self, config: MoonshineConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList(
            [MoonshineDecoderLayer(config, idx) for idx in range(config.decoder_num_hidden_layers)]
        )
        self.norm = nn.LayerNorm(config.hidden_size, bias=False)
        self.rotary_emb = MoonshineRotaryEmbedding(config=config)
        self.gradient_checkpointing = False

        self.post_init()

    @check_model_inputs
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
        r"""
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
            of the decoder.
        encoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding indices in `encoder_hidden_states`. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)
        """
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if use_cache and past_key_values is None:
            past_key_values = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config))

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = create_causal_mask(
            config=self.config,
            input_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            past_key_values=past_key_values,
            position_ids=position_ids,
        )

        hidden_states = inputs_embeds
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        if encoder_attention_mask is not None:
            # Downsample the encoder attention mask to the encoder output resolution.
            mask_len = encoder_hidden_states.shape[-2]
            downsample_stride = 64 * 3 * 2  # conv strides
            encoder_attention_mask = encoder_attention_mask[..., ::downsample_stride][..., :mask_len]
            if self.config._attn_implementation == "flash_attention_2":
                encoder_attention_mask = encoder_attention_mask if (encoder_attention_mask == 0.0).any() else None
            elif self.config._attn_implementation == "sdpa":
                encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa(
                    encoder_attention_mask, inputs_embeds.dtype, tgt_len=inputs_embeds.shape[-2]
                )
            else:
                encoder_attention_mask = _prepare_4d_attention_mask(
                    encoder_attention_mask, inputs_embeds.dtype, tgt_len=inputs_embeds.shape[-2]
                )

        for decoder_layer in self.layers:
            hidden_states = decoder_layer(
                hidden_states,
                causal_mask,
                encoder_hidden_states,
                encoder_attention_mask,
                position_ids=position_ids,
                past_key_values=past_key_values,
                use_cache=use_cache,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
                **kwargs,
            )

        hidden_states = self.norm(hidden_states)
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values if use_cache else None,
        )


def _compute_mask_indices(
    shape: tuple[int, int],
    mask_prob: float,
    mask_length: int,
    attention_mask: Optional[torch.LongTensor] = None,
    min_masks: int = 0,
) -> np.ndarray:
    """
    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
    ASR](https://huggingface.co/papers/1904.08779). Note that this method is not optimized to run on TPU and should be run on
    CPU as part of the preprocessing during training.

    Args:
        shape: The shape for which to compute masks. This should be of a tuple of size 2 where
               the first element is the batch size and the second element is the length of the axis to span.
        mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                    independently generated mask spans of length `mask_length` is computed by
                    `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                    actual percentage will be smaller.
        mask_length: size of the mask
        min_masks: minimum number of masked spans
        attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                        each batch dimension.
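
    Example (illustrative; the sampled spans are random, so only the shape is checked):

    ```python
    >>> mask = _compute_mask_indices(shape=(2, 100), mask_prob=0.05, mask_length=10)
    >>> mask.shape
    (2, 100)
    ```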
    """
    batch_size, sequence_length = shape

    if mask_length < 1:
        raise ValueError("`mask_length` has to be bigger than 0.")

    if mask_length > sequence_length:
        raise ValueError(
            f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
            f" and `sequence_length`: {sequence_length}`"
        )

    # epsilon is used for probabilistic rounding
    epsilon = np.random.rand(1).item()

    def compute_num_masked_span(input_length):
        """Given input length, compute how many spans should be masked"""
        num_masked_span = int(mask_prob * input_length / mask_length + epsilon)
        num_masked_span = max(num_masked_span, min_masks)

        # make sure num masked span <= sequence_length
        if num_masked_span * mask_length > sequence_length:
            num_masked_span = sequence_length // mask_length

        # make sure num_masked span is also <= input_length - (mask_length - 1)
        if input_length - (mask_length - 1) < num_masked_span:
            num_masked_span = max(input_length - (mask_length - 1), 0)

        return num_masked_span

    # compute number of masked spans in batch
    input_lengths = (
        attention_mask.detach().sum(-1).tolist()
        if attention_mask is not None
        else [sequence_length for _ in range(batch_size)]
    )

    # SpecAugment mask to fill
    spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=bool)
    spec_aug_mask_idxs = []

    max_num_masked_span = compute_num_masked_span(sequence_length)

    if max_num_masked_span == 0:
        return spec_aug_mask

    for input_length in input_lengths:
        # compute num of masked spans for this input
        num_masked_span = compute_num_masked_span(input_length)

        # get random indices to mask
        spec_aug_mask_idx = np.random.choice(
            np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
        )

        # pick first sampled index as a dummy index to pad the vector so that all
        # batch rows have the same dimension despite probabilistic rounding
        if len(spec_aug_mask_idx) == 0:
            # this case can only happen if `input_length` is strictly smaller than
            # `sequence_length`, in which case the last token has to be a padding
            # token which can be used as a dummy mask id
            dummy_mask_idx = sequence_length - 1
        else:
            dummy_mask_idx = spec_aug_mask_idx[0]

        spec_aug_mask_idx = np.concatenate(
            [spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx]
        )
        spec_aug_mask_idxs.append(spec_aug_mask_idx)

    spec_aug_mask_idxs = np.array(spec_aug_mask_idxs)

    # expand masked indices to masked spans
    spec_aug_mask_idxs = np.broadcast_to(
        spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length)
    )
    spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)

    # add offset to the starting indexes so that indexes now create a span
    offsets = np.arange(mask_length)[None, None, :]
    offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
        batch_size, max_num_masked_span * mask_length
    )
    spec_aug_mask_idxs = spec_aug_mask_idxs + offsets

    # ensure that the indices cannot be larger than sequence_length - 1
    if spec_aug_mask_idxs.max() > sequence_length - 1:
        spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1

    # scatter indices to the mask
    np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1)

    return spec_aug_mask


@auto_docstring
class MoonshineModel(MoonshinePreTrainedModel):
    def __init__(self, config: MoonshineConfig):
        super().__init__(config)
        self.encoder = MoonshineEncoder(config)
        self.decoder = MoonshineDecoder(config)
        # Initialize weights and apply final processing
        self.post_init()
zMoonshineModel.__init__c                 C   s   | j jS r#   r  r   r   r2   r2   r3   r   <  s    z#MoonshineModel.get_input_embeddingsc                 C   s   || j _d S r#   r  r   r2   r2   r3   r   ?  s    z#MoonshineModel.set_input_embeddingsc                 C   s   | j S r#   )r  r   r2   r2   r3   get_encoderB  s    zMoonshineModel.get_encoderc                 C   s   | j   dS )z
        Calling this function will disable the gradient computation for the Moonshine encoder so that its parameters will
        not be updated during training.
        N)r  Z_freeze_parametersr   r2   r2   r3   freeze_encoderE  s    zMoonshineModel.freeze_encoderN)input_featuresrS   c                 C   s   t | jdds|S | \}}}| jjdkr| jrt||f| jj| jj|| jjd}tj	||j
tjd}|dddf d|d}d||< | jjdkr| jrt||f| jj| jj| jjd}tj	||j
tjd}d||< |S )	z
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://huggingface.co/papers/1904.08779).
        Zapply_spec_augmentTr   )r   r   rS   r   )r   rX   NrA   )r   r   r   )r|   r'   sizeZmask_time_probrZ   r  Zmask_time_lengthZmask_time_min_masksr<   Ztensorr   r   rG   Zmask_feature_probZmask_feature_lengthZmask_feature_min_masks)r/   r  rS   r  r+   r  Zmask_time_indicesZmask_feature_indicesr2   r2   r3   _mask_input_featuresL  s0    z#MoonshineModel._mask_input_features)r   rS   decoder_input_idsdecoder_attention_maskencoder_outputsr   decoder_inputs_embedsdecoder_position_idsr   r   rV   r6   c                 K   sl   |du r| j |fd|i|}| jf ||||j||||	|
d	|}t|j|j|j|j|j|j|j|jdS )a
  
        input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
            Float values of the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
            `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
            the soundfile library (`pip install soundfile`). To prepare the array into
            `input_values`, the [`AutoFeatureExtractor`] should be used for padding
            and conversion into a tensor of type `torch.FloatTensor`.
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoFeatureExtractor, MoonshineModel
        >>> from datasets import load_dataset

        >>> model = MoonshineModel.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> inputs = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt")
        >>> input_values = inputs.input_values
        >>> decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id
        >>> last_hidden_state = model(input_values, decoder_input_ids=decoder_input_ids).last_hidden_state
        >>> list(last_hidden_state.shape)
        [1, 2, 288]
        ```
        """
        if encoder_outputs is None:
            encoder_outputs = self.encoder(input_values, attention_mask=attention_mask, **kwargs)

        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            encoder_hidden_states=encoder_outputs.last_hidden_state,
            encoder_attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=decoder_inputs_embeds,
            position_ids=decoder_position_ids,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )

        return Seq2SeqModelOutput(
            last_hidden_state=decoder_outputs.last_hidden_state,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )


def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
    """
    Shift input ids one token to the right.
    """
    shifted_input_ids = input_ids.new_zeros(input_ids.shape)
    shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
    shifted_input_ids[:, 0] = decoder_start_token_id

    if pad_token_id is None:
        raise ValueError("self.model.config.pad_token_id has to be defined.")
    # replace possible -100 values in labels by `pad_token_id`
    shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)

    return shifted_input_ids


@auto_docstring(
    custom_intro="""
    The Moonshine Model with a language modeling head. Can be used for automatic speech recognition.
    """
)
class MoonshineForConditionalGeneration(MoonshinePreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["proj_out.weight"]

    def __init__(self, config: MoonshineConfig):
        super().__init__(config)
        self.model = MoonshineModel(config)
        self.proj_out = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()
jdddZeedeej eej eej eej eeeej   eeeeej f  eeej  eeej  ee eej eej ee edddZ  ZS )!MoonshineForConditionalGenerationzproj_out.weightr   c                    s8   t  | t|| _tj|j|jdd| _| 	  d S )NFry   )
r%   r&   r  r   r)   r*   r+   r   proj_outr   r   r0   r2   r3   r&     s    
z*MoonshineForConditionalGeneration.__init__c                 C   s
   | j  S r#   )r   r  r   r2   r2   r3   r    s    z-MoonshineForConditionalGeneration.get_encoderc                 C   s
   | j  S r#   )r   get_decoderr   r2   r2   r3   r+    s    z-MoonshineForConditionalGeneration.get_decoderc                 C   s   | j S r#   r*  r   r2   r2   r3   get_output_embeddings  s    z7MoonshineForConditionalGeneration.get_output_embeddingsc                 C   s
   || _ d S r#   r,  )r/   Znew_embeddingsr2   r2   r3   set_output_embeddings  s    z7MoonshineForConditionalGeneration.set_output_embeddingsr   c                 C   s
   | j  S r#   )r   r   r   r2   r2   r3   r     s    z6MoonshineForConditionalGeneration.get_input_embeddingsN)r   rS   r  r  r  r   r   r!  r   r   labelsrV   r6   c                 K   s   |dur,|du r,|du r,t || jj| jj}| j|f||||||||	|
d	|}| |j}d}|dur~| j||| jjd}t	|||j
|j|j|j|j|j|jd	S )a0  
        input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
            Float values of the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
            `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
            the soundfile library (`pip install soundfile`). To prepare the array into
            `input_values`, the [`AutoFeatureExtractor`] should be used for padding
            and conversion into a tensor of type `torch.FloatTensor`.
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, MoonshineForConditionalGeneration
        >>> from datasets import load_dataset

        >>> processor = AutoProcessor.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> model = MoonshineForConditionalGeneration.from_pretrained("UsefulSensors/moonshine-tiny")

        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

        >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt")
        >>> input_values = inputs.input_values

        >>> generated_ids = model.generate(input_values, max_new_tokens=100)

        >>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> transcription
        'Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.'
        ```
        """
        if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
            decoder_input_ids = shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)

        outputs: Seq2SeqModelOutput = self.model(
            input_values,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            encoder_outputs=encoder_outputs,
            past_key_values=past_key_values,
            decoder_inputs_embeds=decoder_inputs_embeds,
            decoder_position_ids=decoder_position_ids,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )
        logits = self.proj_out(outputs.last_hidden_state)

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size)

        return Seq2SeqLMOutput(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )


__all__ = ["MoonshineModel", "MoonshinePreTrainedModel", "MoonshineForConditionalGeneration"]