a
    hK                  
   @   s  d dl mZmZmZ d dlZd dlmZ ddlmZ ddlm	Z	m
Z
mZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZmZmZmZ ddlmZmZ ddlm Z m!Z! ddl"m#Z# ddl$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z* ddl+m,Z, ddl-m.Z.m/Z/m0Z0 ddl1m2Z2 e( r,ddl3m4Z4 e*5e6Z7e&G dd de!Z8G dd dej9Z:G dd dej9Z;edG dd dej9Z<G dd  d ej9Z=d!d" Z>d@d#d$Z?ej@eAej@d%d&d'ZBdAej9ej@ej@ej@eej@ eCeCe#e% d)d*d+ZDG d,d- d-ej9ZEG d.d/ d/ej9ZFG d0d1 d1eZGG d2d3 d3e8ZHG d4d5 d5eZIG d6d7 d7e8ZJe&d8d9G d:d; d;e8ZKe&d<d9G d=d> d>e8e2ZLg d?ZMdS )B    )CallableOptionalUnionN)nn   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)use_kernel_forward_from_hub)create_causal_mask)_prepare_4d_attention_mask#_prepare_4d_attention_mask_for_sdpa)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleis_torch_flex_attn_availableis_torchdynamo_compilinglogging)deprecate_kwarg   )	DiaConfigDiaDecoderConfigDiaEncoderConfig)DiaGenerationMixin)make_flex_block_causal_maskc                   @   s:   e Zd ZU eed< dZdZdZdZdZ	dZ
dZddgZdS )DiaPreTrainedModelconfigmodelT	input_idsDiaEncoderLayerDiaDecoderLayerN)__name__
__module____qualname__r"   __annotations__base_model_prefixZsupports_gradient_checkpointingZ_supports_flash_attnZ_supports_sdpaZ_supports_flex_attnZ_can_compile_fullgraphZmain_input_nameZ_no_split_modules r2   r2   `/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/dia/modeling_dia.pyr'   ?   s   
r'   c                       s:   e Zd ZdZed fddZejejdddZ  Z	S )DiaMultiChannelEmbeddinga  In order to efficiently compute the audio embedding from the 9 different channels,
    we vectorize the embedding process by using a single embedding layer and an offset.
    Example:
    - num_embeds = 4
    - vocab_size = 8
    - num_channels = 3
    We would have offsets = [0, 8, 16]
    If audio_codes = [0, 1, 2, 3], [1, 3, 4, 7], [5, 6, 7, 8],
    then tokens = audio_codes + offsets
                = [0, 1, 2, 3, 9, 11, 12, 15, 21, 22, 23, 24]
    This allows us to use a single embedding layer for all channels.
    r(   c                    s^   t    t|j|j |j| _|j| _|j| _tj	|jtj
d|j }| jd|dd d S )NdtypeoffsetsF
persistent)super__init__r   	Embedding
vocab_sizenum_channelshidden_sizeembedtorcharangelongregister_buffer)selfr(   r8   	__class__r2   r3   r<   Z   s    
z!DiaMultiChannelEmbedding.__init__)audio_codesreturnc                 C   sH   || j |j d}| ||jd |jd d| j}|jddS )Nr!   r      dim)	r8   todeviceZsqueezerA   viewshaper@   sum)rF   rI   tokensZembedsr2   r2   r3   forwardb   s    $z DiaMultiChannelEmbedding.forward)
r-   r.   r/   __doc__r#   r<   rB   TensorrU   __classcell__r2   r2   rG   r3   r4   L   s   r4   c                       s0   e Zd Z fddZejejdddZ  ZS )DiaMLPc                    sP   t    || _tj|jd|j dd| _tj|j|jdd| _t	|j
 | _d S )NrL   FZbias)r;   r<   r(   r   Linearr@   Zintermediate_sizegate_up_proj	down_projr   Z
hidden_actactivation_fnrF   r(   rG   r2   r3   r<   i   s
    
zDiaMLP.__init__)hidden_statesrJ   c                 C   s4   |  |}|jddd\}}|| | }| |S )NrL   rK   rM   )r\   chunkr^   r]   )rF   r`   Z	up_statesZgater2   r2   r3   rU   q   s    
zDiaMLP.forward)r-   r.   r/   r<   rB   FloatTensorrU   rX   r2   r2   rG   r3   rY   h   s   rY   ZRMSNormc                       s.   e Zd Zd fdd	Zdd Zdd Z  ZS )	
DiaRMSNormư>c                    s&   t    tt|| _|| _dS )z9
        DiaRMSNorm is equivalent to T5LayerNorm
        N)r;   r<   r   	ParameterrB   onesweightvariance_epsilon)rF   r@   epsrG   r2   r3   r<   |   s    
zDiaRMSNorm.__init__c                 C   sJ   |j }|tj}|djddd}|t|| j  }| j|| S )NrL   rK   T)Zkeepdim)	r7   rO   rB   float32powmeanZrsqrtrh   rg   )rF   r`   Zinput_dtypeZvariancer2   r2   r3   rU      s
    zDiaRMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)tuplerg   rR   rh   rF   r2   r2   r3   
extra_repr   s    zDiaRMSNorm.extra_repr)rd   )r-   r.   r/   r<   rU   ro   rX   r2   r2   rG   r3   rc   z   s   rc   c                       sD   e Zd ZU ejed< ded fddZe e	dd Z
  ZS )	DiaRotaryEmbeddinginv_freqNr5   c                    s   t    t|dr:t|jtr:|jd|jd| _nd| _|j| _	|j| _
|| _t| j | _| | j|\}| _| jd|dd | j| _d S )Nrope_scaling	rope_typetypedefaultrq   Fr9   )r;   r<   hasattr
isinstancerr   dictgetrs   Zmax_position_embeddingsZmax_seq_len_cachedZoriginal_max_seq_lenr(   r   Zrope_init_fnattention_scalingrE   rq   Zoriginal_inv_freq)rF   r(   rP   rq   rG   r2   r3   r<      s    
zDiaRotaryEmbedding.__init__c           
      C   s   | j d d d d f  |jd dd|j}|d d d d d f  }t|jjtrl|jjdkrl|jjnd}t	j
|ddV | |  dd}t	j||fdd	}| | j }| | j }	W d    n1 s0    Y  |j|jd
|	j|jd
fS )Nr   rK   r!   ZmpscpuF)device_typeZenabledrL   rM   r6   )rq   floatexpandrR   rO   rP   rw   rt   strrB   Zautocast	transposecatcosrz   sinr7   )
rF   xposition_idsZinv_freq_expandedZposition_ids_expandedr|   ZfreqsZembr   r   r2   r2   r3   rU      s    0&,zDiaRotaryEmbedding.forward)N)r-   r.   r/   rB   rW   r0   r"   r<   Zno_gradr   rU   rX   r2   r2   rG   r3   rp      s
   

rp   c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..NrK   rL   rM   )rR   rB   r   )r   x1Zx2r2   r2   r3   rotate_half   s    r   c                 C   sD   | |}| |}| | t| |  }|| t||  }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )Z	unsqueezer   )qkr   r   r   Zunsqueeze_dimZq_embedZk_embedr2   r2   r3   apply_rotary_pos_emb   s
    

r   )r`   n_reprJ   c                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r!   N)rR   r~   reshape)r`   r   batchnum_key_value_headsslenhead_dimr2   r2   r3   	repeat_kv   s
    0r           )modulequerykeyvalueattention_maskscalingdropoutkwargsc                 K   s   t || j}t || j}	t||dd| }
|d urf|d d d d d d d |jd f }|
| }
tjj|
dtj	d
|j}
tjj|
|| jd}
t|
|	}|dd }||
fS )NrL   r   rK   )rN   r7   )ptrainingr!   )r   num_key_value_groupsrB   matmulr   rR   r   Z
functionalZsoftmaxrj   rO   r7   r   r   
contiguous)r   r   r   r   r   r   r   r   
key_statesvalue_statesattn_weightsZcausal_maskattn_outputr2   r2   r3   eager_attention_forward   s    
&r   c                       s   e Zd ZdZdeeef eed fddZ	e
dddd	dejeejejf eej ee eej ee eejejf dddZ  ZS )DiaSelfAttention=Multi-headed attention from 'Attention Is All You Need' paperF)r(   	layer_idx	is_causalc                    s   t    || _|| _|j| _| jj| _| jjp4| j| _| j| j | _t	|d|j| j | _
d| _d| _|| _tj| j| j| j
 dd| _tj| j| j| j
 dd| _tj| j| j| j
 dd| _tj| j| j
 | jdd| _d S )Nr   r!   r   FrZ   )r;   r<   r(   r   r@   Znum_attention_heads	num_headsr   r   getattrr   r   attention_dropoutr   r   r[   q_projk_projv_projo_proj)rF   r(   r   r   rG   r2   r3   r<      s    

zDiaSelfAttention.__init__Zpast_key_valuepast_key_valuesz4.58)new_nameversionN)r`   position_embeddingsr   r   cache_positionr   rJ   c                 K   s$  |j d d }g |d| jR }| ||dd}	| ||dd}
| ||dd}|\}}t|	|
||\}	}
|d ur|||d}||
|| j	|\}
}t
}| jjdkrt| jj }|| |	|
||f| jsdn| j| jd|\}}|jg |dR   }| |}||fS )NrK   r!   rL   )r   r   r   eagerr   )r   r   )rR   r   r   rQ   r   r   r   r   updater   r   r(   _attn_implementationr   r   r   r   r   r   r   )rF   r`   r   r   r   r   r   input_shapehidden_shapequery_statesr   r   r   r   Zcache_kwargsattention_interfacer   r   r2   r2   r3   rU     s8    


zDiaSelfAttention.forward)F)NN)r-   r.   r/   rV   r   r$   r#   intboolr<   r    rB   rW   rm   r   r   
LongTensorr   r   rU   rX   r2   r2   rG   r3   r      s      r   c                
       sd   e Zd ZdZeed fddZd	ejeje	ej e	e
 ee eeje	ej f dddZ  ZS )
DiaCrossAttentionr   r(   r   c                    s   t    || _|| _|j| _|j| _| jj| _| jj| _	| j| j	 | _
|j| _d| _d| _d| _tj| j| j| j dd| _tj| j| j	| j dd| _tj| j| j	| j dd| _tj| j| j | jdd| _d S )Nr!   r   FrZ   )r;   r<   r(   r   r@   Zcross_hidden_sizeZcross_num_attention_headsr   Zcross_num_key_value_headsr   r   Zcross_head_dimr   r   r   r   r   r[   r   r   r   r   rF   r(   r   rG   r2   r3   r<   @  s     


zDiaCrossAttention.__init__N)r`   cross_attention_statesr   r   r   rJ   c                 K   sd  |j d d }g |d| jR }g |j d d d| jR }| ||dd}	|d urn|j| jnd}
|d ur|
r|jj	| j j
}|jj	| j j}nZ| ||dd}| ||dd}|d ur|j||| j\}}d|j| j< t}| jjdkrt| jj }|| |	|||fd| ji|\}}|g |dR  }| |}||fS )NrK   r!   rL   FTr   r   )rR   r   r   rQ   r   
is_updatedry   r   Zcross_attention_cachelayerskeysvaluesr   r   r   r   r(   r   r   r   r   r   r   )rF   r`   r   r   r   r   r   r   Zcross_shaper   r   r   r   r   r   r   r2   r2   r3   rU   S  sD    


zDiaCrossAttention.forward)NN)r-   r.   r/   rV   r#   r   r<   rB   rW   r   r
   r   r   rm   rU   rX   r2   r2   rG   r3   r   =  s     r   c                	       sh   e Zd Zeed fddZdejee	ejejf  eej e
e e	ejeej f dddZ  ZS )	r+   r   c                    sL   t    t|j|jd| _t||dd| _t|j|jd| _t	|| _
d S )Nri   Fr   )r;   r<   rc   r@   norm_epspre_sa_normr   self_attentionpost_sa_normrY   mlpr   rG   r2   r3   r<     s
    
zDiaEncoderLayer.__init__N)r`   r   r   r   rJ   c           
      K   sZ   |}|  |}| j|f||d|\}}|| }|}| |}| |}	||	 }||fS )Nr   r   )r   r   r   r   )
rF   r`   r   r   r   residualnormed_statesself_attn_outputself_attn_weightsmlp_outr2   r2   r3   rU     s     



zDiaEncoderLayer.forward)NN)r-   r.   r/   r$   r   r<   rB   rW   r   rm   r   r   rU   rX   r2   r2   rG   r3   r+     s   
  r+   c                       s|   e Zd Zed fddZeedeje	ej e	e
 e	e
 ee eeef dddZeejdf ejd	d
dZ  ZS )
DiaEncoderr5   c                    sd   t     | _t j j| _t fddt	 j
D | _t j jd| _t | _d S )Nc                    s   g | ]}t  |qS r2   )r+   .0r   r5   r2   r3   
<listcomp>      z'DiaEncoder.__init__.<locals>.<listcomp>r   )r;   r<   r(   r   r=   r>   r@   	embedding
ModuleListrangenum_hidden_layersr   rc   r   normrp   rotary_embeddingsr_   rG   r5   r3   r<     s    zDiaEncoder.__init__NF)r*   r   output_attentionsoutput_hidden_statesr   rJ   c                 K   s   |  |}tj|jd |jdd d d f }| ||}| ||}|rLdnd }	|rXdnd }
| jD ]B}|rt|	|f }	||f||d|}|d }|rb|
|d f }
qb| |}|r|	|f7 }	t	||	|
dS )NrK   rP   r2   r   r   r!   last_hidden_stater`   
attentions)
r   rB   rC   rR   rP   r   _update_full_maskr   r   r   )rF   r*   r   r   r   r   r`   r   r   Zencoder_statesZall_attentionsZencoder_layerlayer_outputsr2   r2   r3   rU     s:    

"



zDiaEncoder.forward)r   inputs_embedsc                 C   sv   |d urr| j jdkr&d|v r |nd }nL| j jdkr@t||j}n2| j jdkrft|tjrrt|dd}nt||j}|S )Nflash_attention_2r   sdpaflex_attentionFr   	r(   r   r   r7   rw   rB   rW   r&   r   )rF   r   r   r2   r2   r3   r     s    zDiaEncoder._update_full_mask)NFF)r-   r.   r/   r$   r<   r   r   rB   rW   r   r   r   r   r   r   rm   rU   r   rX   r2   r2   rG   r3   r     s"      
3r   c                       s   e Zd Zeed fddZdejee	ejejf  eej eej eej ee
 eej e	ejeej eej f dddZ  ZS )	r,   r   c                    sr   t    |j| _t||dd| _t||| _t|j|j	d| _
t|j|j	d| _t|j|j	d| _t|| _d S )NTr   r   )r;   r<   r@   Z	embed_dimr   r   r   cross_attentionrc   r   r   pre_ca_normpre_mlp_normrY   r   r   rG   r2   r3   r<     s    
zDiaDecoderLayer.__init__N)r`   r   r   encoder_hidden_statesencoder_attention_maskr   r   rJ   c                 K   s   |}	t |	tr|	j}	|}
| |}| j||||	fd|i|\}}|
| }|}
| |}| j||f||d|\}}|
| }|}
| |}| |}|
| }|||fS )Nr   )r   r   )	rw   r
   Zself_attention_cacher   r   r   r   r   r   )rF   r`   r   r   r   r   r   r   r   Zself_attn_cacher   r   r   r   Zcross_statesZcross_attn_weightsr   r2   r2   r3   rU   	  sB    







zDiaDecoderLayer.forward)NNNNNN)r-   r.   r/   r#   r   r<   rB   rW   r   rm   r
   r   rU   rX   r2   r2   rG   r3   r,     s"         r,   c                       s   e Zd ZdZed fddZeedej	e
ej e
ej	 e
ej e
ej e
e e
e e
e e
ej eeef d
dd	Zeej	df eej	df ejej	d
ddZ  ZS )
DiaDecoderz-Transformer Decoder Stack using DenseGeneral.r5   c                    sf   t     j| _ j| _t | _t | _t	 fddt
 jD | _t j jd| _d S )Nc                    s   g | ]}t  |qS r2   )r,   r   r5   r2   r3   r   C  r   z'DiaDecoder.__init__.<locals>.<listcomp>r   )r;   r<   r?   r>   r4   
embeddingsrp   r   r   r   r   r   r   rc   r@   r   r   r_   rG   r5   r3   r<   <  s    

zDiaDecoder.__init__NF)
r*   r   r   r   r   r   r   r   r   rJ   c
                 K   s  |  dd \}}|dur$| nd}|	du rFtj||| |jd}	|du r^|	dddf }| |}| ||}|du rt s|| }tj|||jd}t	| j
|||	||d}| |||jdd |}|rdnd}|rdnd}|r|durdnd}| jD ]b}|r||f7 }|||||f|||	d|
}|d }|r||d	 f }|dur||d f }q| |}|r|||f7 }t|||||d
S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks)`):
            The original `decoder_input_ids` in 3D shape to facilitate more efficient computations.

            [What are input IDs?](../glossary#input-ids)
        NrK   r   r   )r(   Zinput_embedsr   r   r   r   rL   r2   )r   r   r   r!   )r   r   r`   r   cross_attentions)sizeZget_seq_lengthrB   rC   rP   r   r   r   rf   r   r(   _update_cross_attn_maskrR   r   r   r   )rF   r*   r   r   r   r   r   r   r   r   r   
batch_sizeZ
seq_lengthZpast_key_values_lengthr`   r   Zmask_seq_lengthZall_hidden_statesZall_self_attnsZall_cross_attentionslayerr   r2   r2   r3   rU   G  sx    





zDiaDecoder.forward)r   r   r   r   c                 C   s   |d ur|d ur| j jdkr.d|v r(|nd }nb| j jdkrPt||j|d d}n@| j jdkr|t|tjrt||d dd}nt||j|d d}|S )	Nr   r   r   rK   )Ztgt_lenr   F)Zquery_lengthr   r   )rF   r   r   r   r   r2   r2   r3   r     s(    z"DiaDecoder._update_cross_attn_mask)NNNNNFFN)r-   r.   r/   rV   r#   r<   r   r   rB   rW   r   r   rb   r
   r   r   r   rm   rU   Sizer   rX   r2   r2   rG   r3   r   9  s:           
_r   z[
    The bare Dia model outputting raw hidden-states without any specific head on top.
    )Zcustom_introc                       s   e Zd Zed fddZdd Zeed
ee	j
 ee	j
 ee	j
 ee	j
 ee	j
 eeeef  ee ee ee ee ee	j
 eeef ddd	Z  ZS )DiaModelr5   c                    s6   t  | || _t|j| _t|j| _| 	  d S N)
r;   r<   r(   r   Zencoder_configencoderr   decoder_configdecoder	post_initr_   rG   r2   r3   r<     s
    zDiaModel.__init__c                 C   s   | j S r   )r   rn   r2   r2   r3   get_encoder  s    zDiaModel.get_encoderN)r*   r   decoder_input_idsdecoder_position_idsdecoder_attention_maskencoder_outputsr   	use_cacher   r   r   rJ   c                 K   s  |du r|du rt d|	dur$|	n| jj}	|
dur8|
n| jj}
|durL|n| jj}| jrr| jrr|rrtd d}|r|du rt	t
| jdt
| jd}|du r| jf |||	|
d|}nFt|tst|d t|dkr|d ndt|d	kr|d	 ndd
}|d jd d| jjj  }}}|du rHtj|d|f| jj| jd}|jd	krj||||dd	}| jf ||||d |||	|
||d
|}t|j|j|j|j|j|d |j|jdS )a\  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length)
        or (batch_size, target_sequence_length, num_codebooks)`, *optional*):
            1. (batch_size * num_codebooks, target_sequence_length): corresponds to the general use case where
            the audio input codebooks are flattened into the batch dimension. This also aligns with the flat-
            tened audio logits which are used to calculate the loss.

            2. (batch_size, sequence_length, num_codebooks): corresponds to the internally used shape of
            Dia to calculate embeddings and subsequent steps more efficiently.

            If no `decoder_input_ids` are provided, it will create a tensor of `bos_token_id` with shape
            `(batch_size, 1, num_codebooks)`. Indices can be obtained using the [`DiaProcessor`]. See
            [`DiaProcessor.__call__`] for more details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`.

            [What are position IDs?](../glossary#position-ids)
        NzXYou should either provide text ids or the cached text encodings. Neither has been found.zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fr5   )r*   r   r   r   r   r!   rL   r   rK   )r   Z
fill_valuerP   )
r*   r   r   r   r   r   r   r   r  r   )r   r   decoder_hidden_statesdecoder_attentionsr   encoder_last_hidden_stater   encoder_attentions) 
ValueErrorr(   r   r   r  Zis_gradient_checkpointingr   loggerZwarning_oncer
   r	   r   rw   r   lenrR   r   r?   rB   fullZbos_token_idrP   ndimr   r   r   r   r   r   r`   r   r   )rF   r*   r   r  r  r  r  r   r  r   r   r   r   ZbszZseq_lenZchannelsZdecoder_outputsr2   r2   r3   rU     s|    ' 
zDiaModel.forward)NNNNNNNNNNN)r-   r.   r/   r"   r<   r   r   r   r   rB   r   r   r   rm   r
   r   r   rU   rX   r2   r2   rG   r3   r     s:              
r   zl
    The Dia model consisting of a (byte) text encoder and audio decoder with a prediction head on top.
    c                       s   e Zd ZdZed fddZdd Zdd Zee	de
ej e
ej e
ej e
ej e
ej e
eeef  e
e e
e e
e e
e e
ej e
ej eeef d
ddZ  ZS )DiaForConditionalGenerationr)   r5   c                    s`   t  | || _t|| _|jj| _|jj| _tj	|jj
| j| j dd| _d| _|   d S )NFrZ   ZForMaskedLM)r;   r<   r(   r   r)   r   r?   r>   r   r[   r@   logits_denseZ	loss_typer   r_   rG   r2   r3   r<   R  s    


z$DiaForConditionalGeneration.__init__c                 C   s
   | j  S r   )r)   r   rn   r2   r2   r3   r   a  s    z'DiaForConditionalGeneration.get_encoderc                 C   s
   | j  S r   )r)   get_decoderrn   r2   r2   r3   r  d  s    z'DiaForConditionalGeneration.get_decoderN)r*   r   r  r  r  r  r   r  r   r   labelsr   rJ   c                 K   s   | j f |||||||||	|
|d|}|d }|jd }| ||d| j| jfdd || j d| j}d}|dur| jf ||| jd|}t	|||j
|j|j|j|j|j|jd	S )	a  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length)
        or (batch_size, target_sequence_length, num_codebooks)`, *optional*):
            1. (batch_size * num_codebooks, target_sequence_length): corresponds to the general use case where
            the audio input codebooks are flattened into the batch dimension. This also aligns with the flat-
            tened audio logits which are used to calculate the loss.

            2. (batch_size, sequence_length, num_codebooks): corresponds to the internally used shape of
            Dia to calculate embeddings and subsequent steps more efficiently.

            If no `decoder_input_ids` are provided, it will create a tensor of `bos_token_id` with shape
            `(batch_size, 1, num_codebooks)`. Indices can be obtained using the [`DiaProcessor`]. See
            [`DiaProcessor.__call__`] for more details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`.

            [What are position IDs?](../glossary#position-ids)
        labels (`torch.LongTensor` of shape `(batch_size * num_codebooks,)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in
            `[0, ..., config.decoder_config.vocab_size - 1]` or -100. Tokens with indices set to `-100`
            are ignored (masked).
        )r*   r   r  r  r  r  r   r  r   r   r   r   rK   r!   rL   N)logitsr  r>   )	lossr  r   r  r  r   r  r   r	  )r)   rR   r  rQ   r?   r>   r   r   Zloss_functionr   r   r  r  r   r  r   r	  )rF   r*   r   r  r  r  r  r   r  r   r   r  r   r   outputsr   r   Zaudio_logitsr  r2   r2   r3   rU   g  sN    ,

z#DiaForConditionalGeneration.forward)NNNNNNNNNNNN)r-   r.   r/   r1   r"   r<   r   r  r   r   r   rB   r   r   r   rm   r
   r   r   rU   rX   r2   r2   rG   r3   r  J  sB               
r  )r   r'   r  )Nr!   )r   )Ntypingr   r   r   rB   r   Zactivationsr   Zcache_utilsr   r	   r
   Zintegrationsr   Zmasking_utilsr   Zmodeling_attn_mask_utilsr   r   Zmodeling_flash_attention_utilsr   Zmodeling_layersr   Zmodeling_outputsr   r   r   r   Zmodeling_rope_utilsr   r   Zmodeling_utilsr   r   Zprocessing_utilsr   utilsr   r   r   r   r   r   Zutils.deprecationr    Zconfiguration_diar"   r#   r$   Zgeneration_diar%   Zintegrations.flex_attentionr&   Z
get_loggerr-   r  r'   Moduler4   rY   rc   rp   r   r   rW   r   r   r}   r   r   r   r+   r   r,   r   r   r  __all__r2   r2   r2   r3   <module>   sr    
$
 BJ!V; {o