from typing import Callable, Optional, Union

import torch
from torch import nn

from transformers.utils.generic import OutputRecorder, check_model_inputs

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
from ...configuration_utils import PretrainedConfig
from ...generation import GenerationMixin
from ...masking_utils import create_causal_mask
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_attention_mask_for_sdpa
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPast,
    BaseModelOutputWithPastAndCrossAttentions,
    Seq2SeqLMOutput,
    Seq2SeqModelOutput,
)
from ...modeling_rope_utils import rope_config_validation
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
from ...utils.deprecation import deprecate_kwarg
from ..glm.modeling_glm import GlmAttention, GlmRotaryEmbedding, apply_rotary_pos_emb
from ..llama.modeling_llama import LlamaDecoderLayer, LlamaModel, eager_attention_forward
from ..whisper.modeling_whisper import WhisperModel, shift_tokens_right


logger = logging.get_logger(__name__)


class MoonshineConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`MoonshineModel`]. It is used to instantiate a Moonshine
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the Moonshine
    [UsefulSensors/moonshine-tiny](https://huggingface.co/UsefulSensors/moonshine-tiny).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 32768):
            Vocabulary size of the Moonshine model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`MoonshineModel`].
        hidden_size (`int`, *optional*, defaults to 288):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 1152):
            Dimension of the MLP representations.
        encoder_num_hidden_layers (`int`, *optional*, defaults to 6):
            Number of hidden layers in the Transformer encoder.
        decoder_num_hidden_layers (`int`, *optional*, defaults to 6):
            Number of hidden layers in the Transformer decoder.
        encoder_num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer encoder.
        decoder_num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer decoder.
        encoder_num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `encoder_num_key_value_heads=encoder_num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `encoder_num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
            `encoder_num_attention_heads`.
        decoder_num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `decoder_num_key_value_heads=decoder_num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `decoder_num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
            `decoder_num_attention_heads`.
        pad_head_dim_to_multiple_of (`int`, *optional*):
            Pad head dimension in encoder and decoder to the next multiple of this value. Necessary for using certain
            optimized attention implementations.
        encoder_hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder.
        decoder_hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 512):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        decoder_start_token_id (`int`, *optional*, defaults to 1):
            Corresponds to the "<|startoftranscript|>" token, which is automatically used when no `decoder_input_ids`
            are provided to the `generate` function. It is used to guide the model's generation process depending on
            the task.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
            accordingly.
            Expected contents:
                `rope_type` (`str`):
                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                    'llama3'], with 'default' being the original RoPE implementation.
                `factor` (`float`, *optional*):
                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                    original maximum pre-trained length.
                `original_max_position_embeddings` (`int`, *optional*):
                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                    pretraining.
                `attention_factor` (`float`, *optional*):
                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                    computation. If unspecified, it defaults to value recommended by the implementation, using the
                    `factor` field to infer the suggested value.
                `beta_fast` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                    ramp function. If unspecified, it defaults to 32.
                `beta_slow` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                    ramp function. If unspecified, it defaults to 1.
                `short_factor` (`list[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `long_factor` (`list[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to long contexts (>
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `low_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
                `high_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
        partial_rotary_factor (`float`, *optional*, defaults to 0.9):
            Percentage of the query and keys which will have rotary embedding.
        is_encoder_decoder (`bool`, *optional*, defaults to `True`):
            Whether the model is used as an encoder/decoder or not.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        bos_token_id (`int`, *optional*, defaults to 1):
            Denotes beginning of sequences token id.
        eos_token_id (`int`, *optional*, defaults to 2):
            Denotes end of sequences token id.

    Example:

    ```python
    >>> from transformers import MoonshineModel, MoonshineConfig

    >>> # Initializing a Moonshine style configuration
    >>> configuration = MoonshineConfig().from_pretrained("UsefulSensors/moonshine-tiny")

    >>> # Initializing a model from the configuration
    >>> model = MoonshineModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
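
    >>> # Hypothetical customization (illustrative values only, not a released checkpoint): use grouped-query
    >>> # attention in the decoder by giving it fewer key/value heads than attention heads
    >>> custom_configuration = MoonshineConfig(decoder_num_attention_heads=8, decoder_num_key_value_heads=2)
    >>> custom_configuration.decoder_num_key_value_heads
    2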
    ```"""

    model_type = "moonshine"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {
        "num_key_value_heads": "encoder_num_key_value_heads",
        "num_attention_heads": "encoder_num_attention_heads",
        "num_hidden_layers": "encoder_num_hidden_layers",
    }

    def __init__(
        self,
        vocab_size=32768,
        hidden_size=288,
        intermediate_size=1152,
        encoder_num_hidden_layers=6,
        decoder_num_hidden_layers=6,
        encoder_num_attention_heads=8,
        decoder_num_attention_heads=8,
        encoder_num_key_value_heads=None,
        decoder_num_key_value_heads=None,
        pad_head_dim_to_multiple_of=None,
        encoder_hidden_act="gelu",
        decoder_hidden_act="silu",
        max_position_embeddings=512,
        initializer_range=0.02,
        decoder_start_token_id=1,
        use_cache=True,
        rope_theta=10000.0,
        rope_scaling=None,
        partial_rotary_factor=0.9,
        is_encoder_decoder=True,
        attention_bias=False,
        attention_dropout=0.0,
        bos_token_id=1,
        eos_token_id=2,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.encoder_num_hidden_layers = encoder_num_hidden_layers
        self.decoder_num_hidden_layers = decoder_num_hidden_layers
        self.encoder_num_attention_heads = encoder_num_attention_heads
        self.decoder_num_attention_heads = decoder_num_attention_heads

        if encoder_num_key_value_heads is None:
            encoder_num_key_value_heads = encoder_num_attention_heads
        self.encoder_num_key_value_heads = encoder_num_key_value_heads

        if decoder_num_key_value_heads is None:
            decoder_num_key_value_heads = decoder_num_attention_heads
        self.decoder_num_key_value_heads = decoder_num_key_value_heads

        self.pad_head_dim_to_multiple_of = pad_head_dim_to_multiple_of
        self.encoder_hidden_act = encoder_hidden_act
        self.decoder_hidden_act = decoder_hidden_act
        self.max_position_embeddings = max_position_embeddings
        self.initializer_range = initializer_range
        self.decoder_start_token_id = decoder_start_token_id
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.partial_rotary_factor = partial_rotary_factor
        self.is_encoder_decoder = is_encoder_decoder
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout

        # Validate the correctness of rotary position embeddings parameters
        rope_config_validation(self)

        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            is_encoder_decoder=is_encoder_decoder,
            decoder_start_token_id=decoder_start_token_id,
            **kwargs,
        )


class MoonshineEncoderMLP(nn.Module):
    def __init__(self, config, hidden_act):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class MoonshineDecoderMLP(nn.Module):
    def __init__(self, config, hidden_act):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size * 2)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states, gate = hidden_states.chunk(2, dim=-1)
        hidden_states = self.activation_fn(gate) * hidden_states
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class MoonshineAttention(GlmAttention):
    def __init__(
        self,
        config: MoonshineConfig,
        layer_idx: int,
        is_causal: bool,
        num_attention_heads: int,
        num_key_value_heads: int,
    ):
        config.update({"num_attention_heads": num_attention_heads, "num_key_value_heads": num_key_value_heads})
        super().__init__(config, layer_idx)
        self.is_causal = is_causal
        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)

        # Pad the head dimension to the next multiple of `pad_head_dim_to_multiple_of`, if requested
        if self.config.pad_head_dim_to_multiple_of is not None:
            target_multiple = self.config.pad_head_dim_to_multiple_of
            target_head_dim = target_multiple * ((self.head_dim + target_multiple - 1) // target_multiple)
            self.head_dim_padding = target_head_dim - self.head_dim
        else:
            self.head_dim_padding = 0

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        key_value_states: Optional[torch.Tensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        bsz, q_len = hidden_states.shape[:-1]

        query_states = (
            self.q_proj(hidden_states).view(bsz, q_len, self.config.num_attention_heads, self.head_dim).transpose(1, 2)
        )

        is_cross_attention = key_value_states is not None
        if past_key_values is not None:
            is_updated = past_key_values.is_updated.get(self.layer_idx)
            if is_cross_attention:
                # After the first generated id, we can subsequently re-use all key/value states from the cache
                past_key_values.is_updated[self.layer_idx] = True
                current_past_key_value = past_key_values.cross_attention_cache
            else:
                current_past_key_value = past_key_values.self_attention_cache

        current_states = key_value_states if is_cross_attention else hidden_states
        if is_cross_attention and past_key_values is not None and is_updated:
            # Re-use the cached cross-attention keys and values
            key_states = current_past_key_value.layers[self.layer_idx].keys
            value_states = current_past_key_value.layers[self.layer_idx].values
        else:
            key_states = (
                self.k_proj(current_states)
                .view(bsz, -1, self.config.num_key_value_heads, self.head_dim)
                .transpose(1, 2)
            )
            value_states = (
                self.v_proj(current_states)
                .view(bsz, -1, self.config.num_key_value_heads, self.head_dim)
                .transpose(1, 2)
            )
            if is_cross_attention and past_key_values is not None:
                key_states, value_states = current_past_key_value.update(
                    key_states, value_states, self.layer_idx, {"cache_position": cache_position}
                )

        if not is_cross_attention:
            cos, sin = position_embeddings
            query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
            if past_key_values is not None:
                cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
                key_states, value_states = current_past_key_value.update(
                    key_states, value_states, self.layer_idx, cache_kwargs
                )

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        is_causal = True if self.is_causal and attention_mask is None and q_len > 1 else False

        if self.head_dim_padding > 0:
            query_states = torch.nn.functional.pad(query_states, (0, self.head_dim_padding))
            key_states = torch.nn.functional.pad(key_states, (0, self.head_dim_padding))
            value_states = torch.nn.functional.pad(value_states, (0, self.head_dim_padding))

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            is_causal=is_causal,
            **kwargs,
        )

        if self.head_dim_padding > 0:
            attn_output = attn_output[..., : -self.head_dim_padding]

        attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights


class MoonshineRotaryEmbedding(GlmRotaryEmbedding):
    pass


class MoonshineEncoderLayer(LlamaDecoderLayer):
    def __init__(self, config: MoonshineConfig, layer_idx: int):
        super().__init__(config, layer_idx)

        self.self_attn = MoonshineAttention(
            config=config,
            layer_idx=layer_idx,
            is_causal=False,
            num_attention_heads=config.encoder_num_attention_heads,
            num_key_value_heads=config.encoder_num_key_value_heads,
        )

        self.mlp = MoonshineEncoderMLP(config, config.encoder_hidden_act)
        self.input_layernorm = nn.LayerNorm(config.hidden_size, bias=False)
        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, bias=False)


class MoonshineDecoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: MoonshineConfig, layer_idx: Optional[int] = None):
        super().__init__()
        self.hidden_size = config.hidden_size

        self.self_attn = MoonshineAttention(
            config=config,
            layer_idx=layer_idx,
            is_causal=True,
            num_attention_heads=config.decoder_num_attention_heads,
            num_key_value_heads=config.decoder_num_key_value_heads,
        )
        self.encoder_attn = MoonshineAttention(
            config=config,
            layer_idx=layer_idx,
            is_causal=False,
            num_attention_heads=config.decoder_num_attention_heads,
            num_key_value_heads=config.decoder_num_key_value_heads,
        )

        self.mlp = MoonshineDecoderMLP(config, config.decoder_hidden_act)
        self.input_layernorm = nn.LayerNorm(config.hidden_size, bias=False)
        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, bias=False)
        self.final_layernorm = nn.LayerNorm(config.hidden_size, bias=False)

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        encoder_position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
        encoder_position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)

        # Self attention
        hidden_states, _ = self.self_attn(
            hidden_states=hidden_states,
            position_embeddings=position_embeddings,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )
        hidden_states = residual + hidden_states

        # Cross attention (only when encoder hidden states are provided)
        if encoder_hidden_states is not None:
            residual = hidden_states
            hidden_states = self.post_attention_layernorm(hidden_states)
            hidden_states, _ = self.encoder_attn(
                hidden_states=hidden_states,
                key_value_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                past_key_values=past_key_values,
                use_cache=use_cache,
            )
            hidden_states = residual + hidden_states

        # Fully connected
        residual = hidden_states
        hidden_states = self.final_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        return hidden_states


@auto_docstring
class MoonshinePreTrainedModel(PreTrainedModel):
    config: MoonshineConfig
    base_model_prefix = "model"
    main_input_name = "input_values"
    supports_gradient_checkpointing = True
    _no_split_modules = ["MoonshineEncoderLayer", "MoonshineDecoderLayer"]
    _supports_flash_attn = True
    _supports_sdpa = True
    _can_compile_fullgraph = True

    def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor):
        """
        Computes the output length of the convolutional layers
        """
        output_conv1_length = int((input_lengths - 127) / 64 + 1)
        output_conv2_length = int((output_conv1_length - 7) / 3 + 1)
        output_conv3_length = int((output_conv2_length - 3) / 2 + 1)
        return output_conv3_length


class MoonshineEncoder(MoonshinePreTrainedModel):
    """
    Transformer encoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MoonshineEncoderLayer`]

    Args:
        config: MoonshineConfig
    """

    main_input_name = "input_values"
    _can_record_outputs = {
        "attentions": MoonshineAttention,
        "hidden_states": MoonshineEncoderLayer,
    }

    def __init__(self, config: MoonshineConfig):
        super().__init__(config)
        self.config = config
        embed_dim = config.hidden_size

        self.conv1 = nn.Conv1d(1, embed_dim, kernel_size=127, stride=64, bias=False)
        self.conv2 = nn.Conv1d(embed_dim, 2 * embed_dim, kernel_size=7, stride=3)
        self.conv3 = nn.Conv1d(2 * embed_dim, embed_dim, kernel_size=3, stride=2)
        self.groupnorm = nn.GroupNorm(num_groups=1, num_channels=embed_dim, eps=1e-5)
        self.rotary_emb = MoonshineRotaryEmbedding(config=config)
        self.layers = nn.ModuleList(
            [MoonshineEncoderLayer(config, idx) for idx in range(config.encoder_num_hidden_layers)]
        )
        self.layer_norm = nn.LayerNorm(embed_dim, bias=False)

        self.gradient_checkpointing = False
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.conv1

    def set_input_embeddings(self, value: nn.Module):
        self.conv1 = value

    @check_model_inputs
    def forward(
        self,
        input_values: torch.FloatTensor,
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutput:
        r"""
        Args:
            input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
                Float values of the raw speech waveform. Raw speech waveform can be
                obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
                `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
                the soundfile library (`pip install soundfile`). To prepare the array into
                `input_values`, the [`AutoFeatureExtractor`] should be used for padding
                and conversion into a tensor of type `torch.FloatTensor`.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding indices in `input_values`. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
        """
        hidden_states = input_values.unsqueeze(1)
        hidden_states = nn.functional.tanh(self.conv1(hidden_states))
        hidden_states = self.groupnorm(hidden_states)
        hidden_states = nn.functional.gelu(self.conv2(hidden_states))
        hidden_states = nn.functional.gelu(self.conv3(hidden_states))
        hidden_states = hidden_states.permute(0, 2, 1)

        if attention_mask is not None:
            # Downsample the attention mask to the sequence length produced by the convolutional frontend
            mask_len = self._get_feat_extract_output_lengths(attention_mask.shape[-1])
            downsample_stride = 64 * 3 * 2  # conv strides
            attention_mask = attention_mask[..., ::downsample_stride][..., :mask_len]
            if self.config._attn_implementation == "flash_attention_2":
                attention_mask = attention_mask if (attention_mask == 0.0).any() else None
            elif self.config._attn_implementation == "sdpa":
                attention_mask = _prepare_4d_attention_mask_for_sdpa(attention_mask, hidden_states.dtype)
            else:
                attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)

        position_ids = torch.arange(0, hidden_states.shape[1], device=hidden_states.device).unsqueeze(0)
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        for encoder_layer in self.layers:
            hidden_states = encoder_layer(
                hidden_states,
                attention_mask=attention_mask,
                position_embeddings=position_embeddings,
                **kwargs,
            )

        hidden_states = self.layer_norm(hidden_states)
        return BaseModelOutput(last_hidden_state=hidden_states)


class MoonshineDecoder(LlamaModel):
    main_input_name = "input_ids"

    _can_record_outputs = {
        "attentions": OutputRecorder(MoonshineAttention, index=1, layer_name="self_attn"),
        "hidden_states": MoonshineDecoderLayer,
        "cross_attentions": OutputRecorder(MoonshineAttention, index=1, layer_name="encoder_attn"),
    }

    def __init__(self, config: MoonshineConfig):
        super().__init__(config)
        self.norm = nn.LayerNorm(config.hidden_size, bias=False)
        self.layers = nn.ModuleList(
            [MoonshineDecoderLayer(config, idx) for idx in range(config.decoder_num_hidden_layers)]
        )

    @check_model_inputs
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
        r"""
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
            of the decoder.
        encoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding indices in `encoder_hidden_states`. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)
        """
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if use_cache and past_key_values is None:
            past_key_values = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config))

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = create_causal_mask(
            config=self.config,
            input_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            past_key_values=past_key_values,
            position_ids=position_ids,
        )

        hidden_states = inputs_embeds

        # Position embeddings are created once and shared across all decoder layers
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        if encoder_attention_mask is not None:
            # Downsample the encoder attention mask to the encoder's output length
            mask_len = encoder_hidden_states.shape[-2]
            downsample_stride = 64 * 3 * 2  # conv strides
            encoder_attention_mask = encoder_attention_mask[..., ::downsample_stride][..., :mask_len]
            if self.config._attn_implementation == "flash_attention_2":
                encoder_attention_mask = encoder_attention_mask if (encoder_attention_mask == 0.0).any() else None
            elif self.config._attn_implementation == "sdpa":
                encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa(
                    encoder_attention_mask, inputs_embeds.dtype, tgt_len=inputs_embeds.shape[1]
                )
            else:
                encoder_attention_mask = _prepare_4d_attention_mask(
                    encoder_attention_mask, inputs_embeds.dtype, tgt_len=inputs_embeds.shape[1]
                )

        for decoder_layer in self.layers:
            hidden_states = decoder_layer(
                hidden_states,
                causal_mask,
                encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                position_ids=position_ids,
                past_key_values=past_key_values,
                use_cache=use_cache,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
                **kwargs,
            )

        hidden_states = self.norm(hidden_states)
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values if use_cache else None,
        )


class MoonshineModel(WhisperModel):
    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None,
        past_key_values: Optional[Union[EncoderDecoderCache, tuple[torch.FloatTensor]]] = None,
        decoder_inputs_embeds: Optional[tuple[torch.FloatTensor]] = None,
        decoder_position_ids: Optional[tuple[torch.LongTensor]] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Seq2SeqModelOutput:
        r"""
        input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
            Float values of the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
            `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
            the soundfile library (`pip install soundfile`). To prepare the array into
            `input_values`, the [`AutoFeatureExtractor`] should be used for padding
            and conversion into a tensor of type `torch.FloatTensor`.
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.max_position_embeddings`.

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoFeatureExtractor, MoonshineModel
        >>> from datasets import load_dataset

        >>> model = MoonshineModel.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> inputs = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt")
        >>> input_values = inputs.input_values
        >>> decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id
        >>> last_hidden_state = model(input_values, decoder_input_ids=decoder_input_ids).last_hidden_state
        >>> list(last_hidden_state.shape)
        [1, 2, 288]
        ```
        """
        if encoder_outputs is None:
            encoder_outputs = self.encoder(input_values, attention_mask=attention_mask, **kwargs)

        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            position_ids=decoder_position_ids,
            encoder_hidden_states=encoder_outputs.last_hidden_state,
            encoder_attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )

        return Seq2SeqModelOutput(
            last_hidden_state=decoder_outputs.last_hidden_state,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    The Moonshine Model with a language modeling head. Can be used for automatic speech recognition.
    """
)
class MoonshineForConditionalGeneration(MoonshinePreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["proj_out.weight"]

    def __init__(self, config: MoonshineConfig):
        super().__init__(config)
        self.model = MoonshineModel(config)
        self.proj_out = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_encoder(self):
        return self.model.get_encoder()

    def get_decoder(self):
        return self.model.get_decoder()

    def get_output_embeddings(self):
        return self.proj_out

    def set_output_embeddings(self, new_embeddings):
        self.proj_out = new_embeddings

    def get_input_embeddings(self) -> nn.Module:
        return self.model.get_input_embeddings()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None,
        past_key_values: Optional[Union[EncoderDecoderCache, tuple[torch.FloatTensor]]] = None,
        decoder_inputs_embeds: Optional[tuple[torch.FloatTensor]] = None,
        decoder_position_ids: Optional[tuple[torch.LongTensor]] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Seq2SeqLMOutput:
        r"""
        input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
            Float values of the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
            `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
            the soundfile library (`pip install soundfile`). To prepare the array into
            `input_values`, the [`AutoFeatureExtractor`] should be used for padding
            and conversion into a tensor of type `torch.FloatTensor`.
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.max_position_embeddings`.

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, MoonshineForConditionalGeneration
        >>> from datasets import load_dataset

        >>> processor = AutoProcessor.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> model = MoonshineForConditionalGeneration.from_pretrained("UsefulSensors/moonshine-tiny")

        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

        >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt")
        >>> input_values = inputs.input_values

        >>> generated_ids = model.generate(input_values, max_new_tokens=100)

        >>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> transcription
        'Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.'
        ```
        """
        if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
            decoder_input_ids = shift_tokens_right(
                labels, self.config.pad_token_id, self.config.decoder_start_token_id
            )

        outputs: Seq2SeqModelOutput = self.model(
            input_values,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            encoder_outputs=encoder_outputs,
            past_key_values=past_key_values,
            decoder_inputs_embeds=decoder_inputs_embeds,
            decoder_position_ids=decoder_position_ids,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )
        logits = self.proj_out(outputs.last_hidden_state)

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size)

        return Seq2SeqLMOutput(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )


__all__ = ["MoonshineConfig", "MoonshineModel", "MoonshinePreTrainedModel", "MoonshineForConditionalGeneration"]