from typing import Optional, Union

import torch
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache
from ...generation import GenerationMixin
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPast, CausalLMOutputWithPast
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
from ...utils.generic import check_model_inputs
from ..auto import AutoModel, AutoModelForCausalLM
from ..qwen2_audio.modeling_qwen2_audio import (
    Qwen2AudioAttention,
    Qwen2AudioEncoder,
    Qwen2AudioEncoderLayer,
    Qwen2AudioPreTrainedModel,
)
from .configuration_voxtral import VoxtralConfig


class VoxtralAttention(Qwen2AudioAttention):
    pass


class VoxtralEncoderLayer(Qwen2AudioEncoderLayer):
    pass


class VoxtralPreTrainedModel(Qwen2AudioPreTrainedModel):
    _supports_flex_attn = True
    _supports_cache_class = True
    _supports_attention_backend = True
    _can_compile_fullgraph = True
    _no_split_modules = None


@auto_docstring(
    custom_intro="""
    The Voxtral encoder, which is a Whisper encoder.
    """
)
class VoxtralEncoder(Qwen2AudioEncoder):
    _can_record_outputs = {
        "attentions": VoxtralAttention,
        "hidden_states": VoxtralEncoderLayer,
    }

    @check_model_inputs
    def forward(self, input_features, attention_mask=None, **kwargs: Unpack[TransformersKwargs]):
        r"""
        Args:
            input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, sequence_length)`):
                Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
                obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
                `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
                `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
                and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
            attention_mask (`torch.Tensor`, *optional*):
                Voxtral does not support masking of the `input_features`; this argument is preserved for compatibility,
                but it is not used. By default, silence in the input log mel spectrogram is ignored.
        """
        expected_seq_length = self.config.max_source_positions * self.conv1.stride[1] * self.conv2.stride[1]
        if input_features.shape[-1] != expected_seq_length:
            raise ValueError(
                f"Qwen2Audio expects the mel input features to be of length {expected_seq_length}, but found "
                f"{input_features.shape[-1]}. Make sure to pad the input mel features to {expected_seq_length}."
            )

        # run the two strided convolutions in the dtype/device of their weights
        input_features = input_features.to(dtype=self.conv1.weight.dtype, device=self.conv1.weight.device)

        inputs_embeds = nn.functional.gelu(self.conv1(input_features))
        inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds))

        # (batch, channels, seq) -> (batch, seq, channels)
        inputs_embeds = inputs_embeds.permute(0, 2, 1)
        embed_pos = self.embed_positions.weight

        hidden_states = (inputs_embeds + embed_pos).to(inputs_embeds.dtype)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        for idx, encoder_layer in enumerate(self.layers):
            layer_outputs = encoder_layer(
                hidden_states,
                attention_mask=None,
                layer_head_mask=None,
            )
            hidden_states = layer_outputs[0]

        hidden_states = self.layer_norm(hidden_states)
        return BaseModelOutput(last_hidden_state=hidden_states)


class VoxtralMultiModalProjector(nn.Module):
    def __init__(self, config: VoxtralConfig):
        super().__init__()
        self.linear_1 = nn.Linear(config.audio_config.intermediate_size, config.text_config.hidden_size, bias=False)
        self.act = ACT2FN[config.projector_hidden_act]
        self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=False)

    def forward(self, audio_features):
        hidden_states = self.linear_1(audio_features)
        hidden_states = self.act(hidden_states)
        hidden_states = self.linear_2(hidden_states)
        return hidden_states
@auto_docstring(
    custom_intro="""
    The Voxtral model, which consists of a Whisper encoder, a multi-modal projector and a Llama language model.
    """
)
class VoxtralForConditionalGeneration(VoxtralPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]
    _tp_plan = {"lm_head": "colwise_rep"}
    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
    # keep the audio position embeddings in fp32 for numerical stability
    _keep_in_fp32_modules_strict = ["embed_positions"]

    def __init__(self, config):
        super().__init__(config)
        self.vocab_size = config.text_config.vocab_size
        self.audio_tower = AutoModel.from_config(config.audio_config)
        self.language_model = AutoModelForCausalLM.from_config(config.text_config)
        self.multi_modal_projector = VoxtralMultiModalProjector(config)

        self.post_init()

    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    def get_output_embeddings(self):
        return self.language_model.get_output_embeddings()

    def set_output_embeddings(self, new_embeddings):
        self.language_model.set_output_embeddings(new_embeddings)

    def set_decoder(self, decoder):
        self.language_model.set_decoder(decoder)

    def get_decoder(self):
        return self.language_model.get_decoder()

    def get_audio_embeds(self, input_features: torch.FloatTensor):
        r"""
        Computes the audio embeddings from the input features (a log mel spectrogram) by running the audio encoder followed by the multi-modal projector.
        Args:
            input_features (`torch.FloatTensor`):
                Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
                obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
                `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
                `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
                and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]

        Returns:
            `torch.FloatTensor`:
                The audio embeddings.
        """
        audio_outputs = self.audio_tower(input_features)
        audio_hidden_states = audio_outputs.last_hidden_state

        # collapse the encoder output into rows of size `audio_config.intermediate_size` before projecting
        audio_hidden_states = audio_hidden_states.reshape(-1, self.config.audio_config.intermediate_size)
        audio_embeds = self.multi_modal_projector(audio_hidden_states)
        return audio_embeds

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        input_features: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> CausalLMOutputWithPast:
        r"""
        Example:

        ```python
        >>> from transformers import VoxtralForConditionalGeneration, AutoProcessor
        >>> import torch

        >>> device = "cuda" if torch.cuda.is_available() else "cpu"
        >>> repo_id = "mistralai/Voxtral-Mini-3B-2507"

        >>> processor = AutoProcessor.from_pretrained(repo_id)
        >>> model = VoxtralForConditionalGeneration.from_pretrained(repo_id, dtype=torch.bfloat16, device_map=device)

        >>> conversation = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "audio",
                        "url": "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/dude_where_is_my_car.wav",
                    },
                    {"type": "text", "text": "What can you tell me about this audio?"},
                ],
            }
        ]

        >>> inputs = processor.apply_chat_template(conversation)
        >>> inputs = inputs.to(device, dtype=torch.bfloat16)

        >>> outputs = model.generate(**inputs, max_new_tokens=30)
        >>> processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
        ["This audio is a humorous conversation between two friends, likely in English, where one of them is trying to figure out what the other's tattoo says."]
        ```"""
        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        if input_features is not None:
            audio_embeds = self.get_audio_embeds(input_features)

            # replace the audio placeholder tokens with the projected audio embeddings
            audio_token_mask = input_ids == self.config.audio_token_id
            inputs_embeds[audio_token_mask] = audio_embeds

        outputs = self.language_model(
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            cache_position=cache_position,
            logits_to_keep=logits_to_keep,
            **kwargs,
        )
        return outputs

    def prepare_inputs_for_generation(self, *args, **kwargs):
        input_features = kwargs.pop("input_features", None)
        cache_position = kwargs.get("cache_position")

        model_inputs = super().prepare_inputs_for_generation(*args, **kwargs)

        if cache_position is not None and cache_position[0] == 0:
            # input_features are only forwarded on the prefill step; later steps reuse the cache
            model_inputs["input_features"] = input_features

        return model_inputs


__all__ = ["VoxtralPreTrainedModel", "VoxtralEncoder", "VoxtralForConditionalGeneration"]