"""PyTorch Dia model."""

from typing import Callable, Optional, Union

import torch
from torch import nn

from ...cache_utils import DynamicCache, EncoderDecoderCache
from ...masking_utils import create_causal_mask
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_attention_mask_for_sdpa
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPastAndCrossAttentions,
    Seq2SeqLMOutput,
    Seq2SeqModelOutput,
)
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import (
    auto_docstring,
    can_return_tuple,
    is_torch_flex_attn_available,
    is_torchdynamo_compiling,
    logging,
)
from ..llama.modeling_llama import LlamaAttention, LlamaRMSNorm, LlamaRotaryEmbedding, eager_attention_forward
from ..phi3.modeling_phi3 import Phi3MLP
from .configuration_dia import DiaConfig, DiaDecoderConfig, DiaEncoderConfig
from .generation_dia import DiaGenerationMixin


if is_torch_flex_attn_available():
    from ...integrations.flex_attention import make_flex_block_causal_mask


logger = logging.get_logger(__name__)


@auto_docstring
class DiaPreTrainedModel(PreTrainedModel):
    config: DiaConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _can_compile_fullgraph = True
    main_input_name = "input_ids"
    _no_split_modules = ["DiaEncoderLayer", "DiaDecoderLayer"]


class DiaMultiChannelEmbedding(nn.Module):
    """In order to efficiently compute the audio embedding from the 9 different channels,
    we vectorize the embedding process by using a single embedding layer and an offset.
    Example:
    - num_embeds = 4
    - vocab_size = 8
    - num_channels = 3
    We would have offsets = [0, 8, 16]
    If audio_codes = [0, 1, 2, 3], [1, 3, 4, 7], [5, 6, 7, 8],
    then tokens = audio_codes + offsets
                = [0, 1, 2, 3, 9, 11, 12, 15, 21, 22, 23, 24]
    This allows us to use a single embedding layer for all channels.
    """

    def __init__(self, config: DiaDecoderConfig):
        super().__init__()
        self.embed = nn.Embedding(config.num_channels * config.vocab_size, config.hidden_size)
        self.hidden_size = config.hidden_size
        self.num_channels = config.num_channels
        offsets = torch.arange(config.num_channels, dtype=torch.long) * config.vocab_size
        self.register_buffer("offsets", offsets, persistent=False)

    def forward(self, audio_codes: torch.Tensor) -> torch.Tensor:
        tokens = (audio_codes + self.offsets.to(audio_codes.device)).squeeze(1)
        embeds = self.embed(tokens).view(tokens.shape[0], tokens.shape[1], -1, self.hidden_size)
        return embeds.sum(dim=2)


class DiaMLP(Phi3MLP):
    pass


class DiaRMSNorm(LlamaRMSNorm):
    pass


class DiaRotaryEmbedding(LlamaRotaryEmbedding):
    pass


class DiaSelfAttention(LlamaAttention):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: Union[DiaEncoderConfig, DiaDecoderConfig], layer_idx: int, is_causal: bool = False):
        nn.Module.__init__(self)
        self.config = config
        self.layer_idx = layer_idx
        self.hidden_size = config.hidden_size
        self.num_heads = self.config.num_attention_heads
        self.num_key_value_heads = self.config.num_key_value_heads or self.num_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.head_dim = getattr(config, "head_dim", config.hidden_size // self.num_heads)
        self.scaling = 1
        self.attention_dropout = 0.0
        self.is_causal = is_causal
        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)


class DiaCrossAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: DiaDecoderConfig, layer_idx: int):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.hidden_size = config.hidden_size
        self.cross_hidden_size = config.cross_hidden_size
        self.num_heads = self.config.cross_num_attention_heads
        self.num_key_value_heads = self.config.cross_num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.head_dim = config.cross_head_dim
        self.scaling = 1
        self.attention_dropout = 0.0
        self.is_causal = False
        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(self.cross_hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(self.cross_hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)

    def forward(
        self,
        hidden_states: torch.Tensor,
        cross_attention_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[EncoderDecoderCache] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)
        cross_shape = (*cross_attention_states.shape[:-1], -1, self.head_dim)

        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)

        is_updated = past_key_values.is_updated.get(self.layer_idx) if past_key_values is not None else False
        if past_key_values is not None and is_updated:
            # Reuse the cross-attention key/value states computed on the first forward pass
            key_states = past_key_values.cross_attention_cache.layers[self.layer_idx].keys
            value_states = past_key_values.cross_attention_cache.layers[self.layer_idx].values
        else:
            key_states = self.k_proj(cross_attention_states).view(cross_shape).transpose(1, 2)
            value_states = self.v_proj(cross_attention_states).view(cross_shape).transpose(1, 2)

            if past_key_values is not None:
                key_states, value_states = past_key_values.cross_attention_cache.update(
                    key_states, value_states, self.layer_idx
                )
                past_key_values.is_updated[self.layer_idx] = True

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            scaling=self.scaling,
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights


class DiaEncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: DiaEncoderConfig, layer_idx: int):
        super().__init__()
        self.pre_sa_norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps)
        self.self_attention = DiaSelfAttention(config, layer_idx, is_causal=False)
        self.post_sa_norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps)
        self.mlp = DiaMLP(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        residual = hidden_states
        normed_states = self.pre_sa_norm(hidden_states)
        self_attn_output, self_attn_weights = self.self_attention(
            normed_states,
            position_embeddings=position_embeddings,
            attention_mask=attention_mask,
            **kwargs,
        )
        hidden_states = residual + self_attn_output

        residual = hidden_states
        normed_states = self.post_sa_norm(hidden_states)
        mlp_out = self.mlp(normed_states)
        hidden_states = residual + mlp_out

        return hidden_states, self_attn_weights


class DiaEncoder(DiaPreTrainedModel):
    def __init__(self, config: DiaEncoderConfig):
        super().__init__(config)
        self.config = config
        self.embedding = nn.Embedding(config.vocab_size, config.hidden_size)
        self.layers = nn.ModuleList(
            [DiaEncoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps)
        self.rotary_embeddings = DiaRotaryEmbedding(config)

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Union[BaseModelOutput, tuple]:
        hidden_states = self.embedding(input_ids)

        # RoPE
        position_ids = torch.arange(input_ids.shape[-1], device=input_ids.device)[None, :]
        position_embeddings = self.rotary_embeddings(hidden_states, position_ids)

        attention_mask = self._update_full_mask(attention_mask, hidden_states)

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None
        for encoder_layer in self.layers:
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)

            layer_outputs = encoder_layer(
                hidden_states,
                position_embeddings=position_embeddings,
                attention_mask=attention_mask,
                **kwargs,
            )
            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        if output_hidden_states:
            encoder_states += (hidden_states,)

        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=encoder_states,
            attentions=all_attentions,
        )

    def _update_full_mask(
        self,
        attention_mask: Union[torch.Tensor, None],
        inputs_embeds: torch.Tensor,
    ):
        if attention_mask is not None:
            if self.config._attn_implementation == "flash_attention_2":
                attention_mask = attention_mask if 0 in attention_mask else None
            elif self.config._attn_implementation == "sdpa":
                attention_mask = _prepare_4d_attention_mask_for_sdpa(attention_mask, inputs_embeds.dtype)
            elif self.config._attn_implementation == "flex_attention":
                if isinstance(attention_mask, torch.Tensor):
                    attention_mask = make_flex_block_causal_mask(attention_mask, is_causal=False)
            else:
                attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype)

        return attention_mask


class DiaDecoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: DiaDecoderConfig, layer_idx: int):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attention = DiaSelfAttention(config, layer_idx, is_causal=True)
        self.cross_attention = DiaCrossAttention(config, layer_idx)
        self.pre_sa_norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps)
        self.pre_ca_norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps)
        self.pre_mlp_norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps)
        self.mlp = DiaMLP(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[EncoderDecoderCache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
        self_attn_cache = past_key_values
        if isinstance(self_attn_cache, EncoderDecoderCache):
            self_attn_cache = self_attn_cache.self_attention_cache

        residual = hidden_states
        normed_states = self.pre_sa_norm(hidden_states)
        self_attn_output, self_attn_weights = self.self_attention(
            normed_states,
            position_embeddings,
            attention_mask,
            self_attn_cache,
            cache_position=cache_position,
            **kwargs,
        )
        hidden_states = residual + self_attn_output

        residual = hidden_states
        normed_states = self.pre_ca_norm(hidden_states)
        cross_states, cross_attn_weights = self.cross_attention(
            normed_states,
            encoder_hidden_states,
            attention_mask=encoder_attention_mask,
            past_key_values=past_key_values,
            **kwargs,
        )
        hidden_states = residual + cross_states

        residual = hidden_states
        normed_states = self.pre_mlp_norm(hidden_states)
        mlp_out = self.mlp(normed_states)
        hidden_states = residual + mlp_out

        return hidden_states, self_attn_weights, cross_attn_weights


class DiaDecoder(DiaPreTrainedModel):
    """Transformer Decoder Stack using DenseGeneral."""

    def __init__(self, config: DiaDecoderConfig):
        super().__init__(config)
        self.num_channels = config.num_channels
        self.vocab_size = config.vocab_size
        self.embeddings = DiaMultiChannelEmbedding(config)
        self.rotary_embeddings = DiaRotaryEmbedding(config)
        self.layers = nn.ModuleList(
            [DiaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps)

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[EncoderDecoderCache] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Union[BaseModelOutputWithPastAndCrossAttentions, tuple]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks)`):
            The original `decoder_input_ids` in 3D shape to facilitate more efficient computations.

            [What are input IDs?](../glossary#input-ids)
        """
        batch_size, seq_length = input_ids.size()[:-1]
        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
        if cache_position is None:
            cache_position = torch.arange(
                past_key_values_length, past_key_values_length + seq_length, device=input_ids.device
            )
        if position_ids is None:
            position_ids = cache_position[None, :]

        hidden_states = self.embeddings(input_ids)
        position_embeddings = self.rotary_embeddings(hidden_states, position_ids)

        if attention_mask is None and not is_torchdynamo_compiling():
            # Required mask seq length can be calculated via length of past cache
            mask_seq_length = past_key_values_length + seq_length
            attention_mask = torch.ones(batch_size, mask_seq_length, device=input_ids.device)

        attention_mask = create_causal_mask(
            config=self.config,
            input_embeds=hidden_states,
            attention_mask=attention_mask,
            cache_position=cache_position,
            past_key_values=past_key_values,
            position_ids=position_ids,
        )
        encoder_attention_mask = self._update_cross_attn_mask(
            encoder_hidden_states,
            encoder_attention_mask,
            hidden_states.shape[:2],
            hidden_states,
        )

        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None

        for layer in self.layers:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            layer_outputs = layer(
                hidden_states,
                position_embeddings,
                attention_mask,
                encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                past_key_values=past_key_values,
                cache_position=cache_position,
                **kwargs,
            )
            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)
                if encoder_hidden_states is not None:
                    all_cross_attentions += (layer_outputs[2],)

        hidden_states = self.norm(hidden_states)
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            cross_attentions=all_cross_attentions,
        )

    def _update_cross_attn_mask(
        self,
        encoder_hidden_states: Union[torch.Tensor, None],
        encoder_attention_mask: Union[torch.Tensor, None],
        input_shape: torch.Size,
        inputs_embeds: torch.Tensor,
    ):
        # Expand encoder attention mask
        if encoder_hidden_states is not None and encoder_attention_mask is not None:
            if self.config._attn_implementation == "flash_attention_2":
                encoder_attention_mask = encoder_attention_mask if 0 in encoder_attention_mask else None
            elif self.config._attn_implementation == "sdpa":
                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
                encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa(
                    encoder_attention_mask,
                    inputs_embeds.dtype,
                    tgt_len=input_shape[-1],
                )
            elif self.config._attn_implementation == "flex_attention":
                if isinstance(encoder_attention_mask, torch.Tensor):
                    encoder_attention_mask = make_flex_block_causal_mask(
                        encoder_attention_mask,
                        query_length=input_shape[-1],
                        is_causal=False,
                    )
            else:
                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
                encoder_attention_mask = _prepare_4d_attention_mask(
                    encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
                )

        return encoder_attention_mask


@auto_docstring(
    custom_intro="""
    The bare Dia model outputting raw hidden-states without any specific head on top.
    """
)
class DiaModel(DiaPreTrainedModel):
    def __init__(self, config: DiaConfig):
        super().__init__(config)
        self.config = config
        self.encoder = DiaEncoder(config.encoder_config)
        self.decoder = DiaDecoder(config.decoder_config)
        self.post_init()

    def get_encoder(self):
        return self.encoder

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_position_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        encoder_outputs: Optional[Union[tuple, BaseModelOutput]] = None,
        past_key_values: Optional[EncoderDecoderCache] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Union[Seq2SeqModelOutput, tuple]:
        r"""
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length)
        or (batch_size, target_sequence_length, num_codebooks)`, *optional*):
            1. (batch_size * num_codebooks, target_sequence_length): corresponds to the general use case where
            the audio input codebooks are flattened into the batch dimension. This also aligns with the
            flattened audio logits which are used to calculate the loss.

            2. (batch_size, sequence_length, num_codebooks): corresponds to the internally used shape of
            Dia to calculate embeddings and subsequent steps more efficiently.

            If no `decoder_input_ids` are provided, it will create a tensor of `bos_token_id` with shape
            `(batch_size, 1, num_codebooks)`. Indices can be obtained using the [`DiaProcessor`]. See
            [`DiaProcessor.__call__`] for more details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`.

            [What are position IDs?](../glossary#position-ids)
        """
        if input_ids is None and encoder_outputs is None:
            raise ValueError(
                "You should either provide text ids or the cached text encodings. Neither has been found."
            )

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        if self.is_gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        if use_cache and past_key_values is None:
            past_key_values = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config))

        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                **kwargs,
            )
        elif not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        # On default initialization with bos (if nothing is provided)
        bsz, seq_len, channels = encoder_outputs[0].shape[0], -1, self.config.decoder_config.num_channels
        if decoder_input_ids is None:
            decoder_input_ids = torch.full(
                size=(bsz, 1, channels), fill_value=self.config.bos_token_id, device=self.device
            )
        # Ensure the 3D shape (batch_size, sequence_length, num_channels) used internally
        if decoder_input_ids.ndim == 2:
            decoder_input_ids = decoder_input_ids.reshape(bsz, channels, -1).transpose(1, 2)

        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            position_ids=decoder_position_ids,
            attention_mask=decoder_attention_mask,
            encoder_hidden_states=encoder_outputs[0],
            encoder_attention_mask=attention_mask,
            past_key_values=past_key_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            cache_position=cache_position,
            **kwargs,
        )

        return Seq2SeqModelOutput(
            last_hidden_state=decoder_outputs.last_hidden_state,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs[0],
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    The Dia model consisting of a (byte) text encoder and audio decoder with a prediction head on top.
    """
)
class DiaForConditionalGeneration(DiaPreTrainedModel, DiaGenerationMixin):
    base_model_prefix = "model"

    def __init__(self, config: DiaConfig):
        super().__init__(config)
        self.config = config
        self.model = DiaModel(config)

        self.num_channels = config.decoder_config.num_channels
        self.vocab_size = config.decoder_config.vocab_size
        self.logits_dense = nn.Linear(
            config.decoder_config.hidden_size, self.num_channels * self.vocab_size, bias=False
        )
        self.loss_type = "ForMaskedLM"

        self.post_init()

    def get_encoder(self):
        return self.model.get_encoder()

    def get_decoder(self):
        return self.model.get_decoder()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_position_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        encoder_outputs: Optional[Union[tuple, BaseModelOutput]] = None,
        past_key_values: Optional[EncoderDecoderCache] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        labels: Optional[torch.LongTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Union[Seq2SeqLMOutput, tuple]:
        r"""
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length)
        or (batch_size, target_sequence_length, num_codebooks)`, *optional*):
            1. (batch_size * num_codebooks, target_sequence_length): corresponds to the general use case where
            the audio input codebooks are flattened into the batch dimension. This also aligns with the
            flattened audio logits which are used to calculate the loss.

            2. (batch_size, sequence_length, num_codebooks): corresponds to the internally used shape of
            Dia to calculate embeddings and subsequent steps more efficiently.

            If no `decoder_input_ids` are provided, it will create a tensor of `bos_token_id` with shape
            `(batch_size, 1, num_codebooks)`. Indices can be obtained using the [`DiaProcessor`]. See
            [`DiaProcessor.__call__`] for more details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`.

            [What are position IDs?](../glossary#position-ids)
        labels (`torch.LongTensor` of shape `(batch_size * num_codebooks,)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in
            `[0, ..., config.decoder_config.vocab_size - 1]` or -100. Tokens with indices set to `-100`
            are ignored (masked).
        """
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_position_ids=decoder_position_ids,
            decoder_attention_mask=decoder_attention_mask,
            encoder_outputs=encoder_outputs,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            cache_position=cache_position,
            **kwargs,
        )

        last_hidden_state = outputs[0]
        batch_size = last_hidden_state.shape[0]
        # Project to (batch_size * num_channels, seq_len, vocab_size) to align with the flattened labels
        audio_logits = (
            self.logits_dense(last_hidden_state)
            .view((batch_size, -1, self.num_channels, self.vocab_size))
            .transpose(1, 2)
            .contiguous()
            .view(batch_size * self.num_channels, -1, self.vocab_size)
        )

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=audio_logits, labels=labels, vocab_size=self.vocab_size, **kwargs)

        return Seq2SeqLMOutput(
            loss=loss,
            logits=audio_logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )


__all__ = ["DiaModel", "DiaPreTrainedModel", "DiaForConditionalGeneration"]