from dataclasses import dataclass
from typing import Callable, Optional, Union

import torch
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache
from ...generation import GenerationMixin
from ...integrations import use_kernel_forward_from_hub
from ...masking_utils import create_causal_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple, logging
from ...utils.deprecation import deprecate_kwarg
from ...utils.generic import check_model_inputs
from ..auto import AutoModel
from .configuration_csm import CsmConfig, CsmDepthDecoderConfig
from .generation_csm import CsmGenerationMixin


logger = logging.get_logger(__name__)


@auto_docstring(
    custom_intro="""
    Base class for the model autoregressive outputs.
    """
)
@dataclass
class CsmOutputWithPast(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    depth_decoder_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction) of the depth decoder model.
    depth_decoder_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the depth decoder (scores for each vocabulary token before SoftMax).
    depth_decoder_past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)
    depth_decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    depth_decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.
    backbone_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction) of the backbone model.
    Nlosslogitspast_key_values.hidden_states
attentionsdepth_decoder_lossdepth_decoder_logitsdepth_decoder_past_key_valuesdepth_decoder_hidden_statesdepth_decoder_attentionsbackbone_loss)__name__
__module____qualname____doc__r#   r   torchFloatTensor__annotations__r$   r%   tupler&   r'   r(   r)   r*   r+   r,   r-    r6   r6   `/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/csm/modeling_csm.pyr"   2   s   
r"   ZRMSNormc                       s.   e Zd Zd fdd	Zdd Zdd Z  ZS )	
CsmRMSNormư>c                    s&   t    tt|| _|| _dS )z9
        CsmRMSNorm is equivalent to T5LayerNorm
        N)super__init__nn	Parameterr2   onesweightvariance_epsilon)selfhidden_sizeeps	__class__r6   r7   r;   f   s    
zCsmRMSNorm.__init__c                 C   sJ   |j }|tj}|djddd}|t|| j  }| j|| S )Nr   T)Zkeepdim)	dtypetor2   float32powmeanZrsqrtr@   r?   )rA   r&   Zinput_dtypeZvariancer6   r6   r7   forwardn   s
    zCsmRMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)r5   r?   shaper@   rA   r6   r6   r7   
extra_repru   s    zCsmRMSNorm.extra_repr)r9   )r.   r/   r0   r;   rL   rO   __classcell__r6   r6   rD   r7   r8   d   s   r8   c                       sD   e Zd ZU ejed< ded fddZe e	dd Z
  ZS )	CsmRotaryEmbeddinginv_freqNconfigc                    s   t    t|dr:t|jtr:|jd|jd| _nd| _|j| _	|j| _
|| _t| j | _| | j|\}| _| jd|dd | j| _d S )Nrope_scaling	rope_typetypedefaultrR   F
persistent)r:   r;   hasattr
isinstancerU   dictgetrV   Zmax_position_embeddingsZmax_seq_len_cachedZoriginal_max_seq_lenrT   r   Zrope_init_fnattention_scalingregister_bufferrR   Zoriginal_inv_freq)rA   rT   devicerR   rD   r6   r7   r;   |   s    
zCsmRotaryEmbedding.__init__c           
      C   s   | j d d d d f  |jd dd|j}|d d d d d f  }t|jjtrl|jjdkrl|jjnd}t	j
|ddV | |  dd}t	j||fdd	}| | j }| | j }	W d    n1 s0    Y  |j|jd
|	j|jd
fS )Nr   rF   r   ZmpscpuF)device_typeZenabledr   dim)rG   )rR   floatexpandrM   rH   ra   r\   rW   strr2   Zautocast	transposecatcosr_   sinrG   )
rA   xposition_idsZinv_freq_expandedZposition_ids_expandedrc   ZfreqsZembrk   rl   r6   r6   r7   rL      s    0&,zCsmRotaryEmbedding.forward)N)r.   r/   r0   r2   Tensorr4   r   r;   no_gradr   rL   rP   r6   r6   rD   r7   rQ   y   s
   

rQ   c                       s$   e Zd Z fddZdd Z  ZS )CsmMLPc                    sx   t    || _|j| _|j| _tj| j| j|jd| _tj| j| j|jd| _	tj| j| j|jd| _
t|j | _d S )NZbias)r:   r;   rT   rB   Zintermediate_sizer<   LinearZmlp_bias	gate_projup_proj	down_projr   Z
hidden_actact_fnrA   rT   rD   r6   r7   r;      s    
zCsmMLP.__init__c                 C   s$   |  | | || | }|S N)rv   rw   rt   ru   )rA   rm   rv   r6   r6   r7   rL      s     zCsmMLP.forwardr.   r/   r0   r;   rL   rP   r6   r6   rD   r7   rq      s   
rq   c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..NrF   r   rd   )rM   r2   rj   )rm   x1Zx2r6   r6   r7   rotate_half   s    r|   c                 C   sD   | |}| |}| | t| |  }|| t||  }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
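
    Example (a minimal shape sketch; the tensor values are illustrative only):

        >>> q = torch.randn(2, 8, 16, 64)  # (batch_size, num_heads, seq_len, head_dim)
        >>> k = torch.randn(2, 8, 16, 64)
        >>> cos = torch.randn(2, 16, 64)   # (batch_size, seq_len, head_dim)
        >>> sin = torch.randn(2, 16, 64)
        >>> q_embed, k_embed = apply_rotary_pos_emb(q, k, cos, sin)  # unsqueeze_dim=1 broadcasts over heads
        >>> tuple(q_embed.shape)
        (2, 8, 16, 64)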
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
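
# A minimal shape sketch of how `repeat_kv` is used (illustrative values only): with
# num_key_value_heads=2 and n_rep=4, a key/value tensor of shape (batch=1, 2, seq=3, head_dim=8)
# becomes (1, 8, 3, 8), matching the 8 query heads of grouped-query attention, e.g.
#
#     kv = torch.randn(1, 2, 3, 8)
#     assert repeat_kv(kv, n_rep=4).shape == (1, 8, 3, 8)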
|d urf|d d d d d d d |jd f }|
| }
tjj|
dtj	d
|j}
tjj|
|| jd}
t|
|	}|dd }||
fS )Nr   r   rF   )re   rG   )ptrainingr   )r   num_key_value_groupsr2   matmulri   rM   r<   
functionalZsoftmaxrI   rH   rG   r   r   
contiguous)r   r   r   r   r   r   r   r   
key_statesvalue_statesattn_weightscausal_maskattn_outputr6   r6   r7   eager_attention_forward   s    
&r   c                       s   e Zd ZdZeed fddZedddddej	e
ej	ej	f eej	 ee eej ee e
ej	ej	f d
ddZ  ZS )CsmAttentionz=Multi-headed attention from 'Attention Is All You Need' paperrT   	layer_idxc                    s   t    || _|| _t|d|j|j | _|j|j | _	| jd | _
|j| _d| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j| j |j|jd| _d S )Nr   g      Trr   )r:   r;   rT   r   getattrrB   Znum_attention_headsr   r   r   r   attention_dropoutZ	is_causalr<   rs   Zattention_biasq_projk_projv_projo_projrA   rT   r   rD   r6   r7   r;      s(    
zCsmAttention.__init__past_key_valuer%   4.58new_nameversionN)r&   position_embeddingsr   r%   cache_positionr   r   c                 K   s$  |j d d }g |d| jR }| ||dd}	| ||dd}
| ||dd}|\}}t|	|
||\}	}
|d ur|||d}||
|| j	|\}
}t
}| jjdkrt| jj }|| |	|
||f| jsdn| j| jd|\}}|jg |dR   }| |}||fS )NrF   r   r   )rl   rk   r   eagerr   )r   r   )rM   r   r   viewri   r   r   r   updater   r   rT   Z_attn_implementationr   r   r   r   r   r   r   )rA   r&   r   r   r%   r   r   Zinput_shapeZhidden_shapeZquery_statesr   r   rk   rl   Zcache_kwargsZattention_interfacer   r   r6   r6   r7   rL     s8    


zCsmAttention.forward)NN)r.   r/   r0   r1   r   intr;   r   r2   ro   r5   r   r	   
LongTensorr   r   rL   rP   r6   r6   rD   r7   r      s     r   c                       s   e Zd Zeed fddZedddddeje	ej e	ej
 e	e e	e e	ej
 e	eejejf  ee ejd
	ddZ  ZS )CsmDecoderLayerr   c                    sR   t    |j| _t||d| _t|| _t|j|jd| _	t|j|jd| _
d S )Nr   rC   )r:   r;   rB   r   	self_attnrq   mlpr8   rms_norm_epsinput_layernormpost_attention_layernormr   rD   r6   r7   r;   =  s    

zCsmDecoderLayer.__init__r   r%   r   r   NF)	r&   r   rn   r%   	use_cacher   r   r   r   c              
   K   s^   |}	|  |}| jf |||||||d|\}}
|	| }|}	| |}| |}|	| }|S )N)r&   r   rn   r%   r   r   r   )r   r   r   r   )rA   r&   r   rn   r%   r   r   r   r   Zresidual_r6   r6   r7   rL   G  s&    




zCsmDecoderLayer.forward)NNNFNN)r.   r/   r0   r   r   r;   r   r2   ro   r   r   r	   boolr5   r   r   rL   rP   r6   r6   rD   r7   r   <  s&   
      r   z[
    The bare Csm Model outputting raw hidden-states without any specific head on top.
    """
)
class CsmPreTrainedModel(PreTrainedModel):
    config: CsmConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["CsmDecoderLayer"]
    _skip_keys_device_placement = ["past_key_values"]
    _supports_flash_attn = True
    _supports_sdpa = True
    _can_compile_fullgraph = True
    _supports_attention_backend = True
    _can_record_outputs = {
        "hidden_states": CsmDecoderLayer,
        "attentions": CsmAttention,
    }

    def _init_weights(self, module):
        super()._init_weights(module)
        if isinstance(module, CsmCodebooksHead):
            num_codebooks = module.num_codebooks
            for i in range(num_codebooks - 1):
                module.weight.data[i].normal_(mean=0.0, std=self.config.initializer_range)


@auto_docstring
class CsmDepthDecoderModel(CsmPreTrainedModel):
    config: CsmDepthDecoderConfig

    def __init__(self, config):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size
        self.embed_tokens = nn.Embedding(config.num_codebooks * config.vocab_size, config.backbone_hidden_size)
        self.layers = nn.ModuleList(
            [CsmDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.norm = CsmRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.rotary_emb = CsmRotaryEmbedding(config=config)
        self.gradient_checkpointing = False
        self.inputs_embeds_projector = nn.Linear(config.backbone_hidden_size, config.hidden_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    @check_model_inputs
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        backbone_last_hidden_state: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, BaseModelOutputWithPast]:
        r"""
        backbone_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, backbone_hidden_size)`, *optional*):
            The last hidden state of the backbone model. Such input is required when the first codebook token (the one generated by the backbone model)
            is provided in the `input_ids` argument.
        """
        if not torch.compiler.is_compiling() and position_ids is not None:
            logger.warning_once(
                "Custom `position_ids` were provided but will be ignored. CSM depth decoder automatically determines "
                "position_ids from `cache_position` and as it requires them to be identical across the batch, the "
                "provided position_ids will be ignored."
            )
        position_ids = None

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds.")

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache(config=self.config)

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            inputs_seq_length = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
            device = input_ids.device if input_ids is not None else inputs_embeds.device
            cache_position = torch.arange(past_seen_tokens, past_seen_tokens + inputs_seq_length, device=device)

        if inputs_embeds is None:
            codebook_idxs = torch.clamp(cache_position - 1, min=0)
            offset = codebook_idxs * self.vocab_size
            inputs_embeds = self.embed_tokens(input_ids + offset)

            input_ids_are_first_codebook = cache_position[0] == 0
            if backbone_last_hidden_state is not None:
                inputs_embeds[:, 0, :] = backbone_last_hidden_state
            elif not torch.compiler.is_compiling() and input_ids_are_first_codebook:
                logger.warning(
                    "When the first codebook token is provided, `backbone_last_hidden_state` should also be provided "
                    "for correct inference."
                )

        inputs_embeds = self.inputs_embeds_projector(inputs_embeds)

        causal_mask = create_causal_mask(
            config=self.config,
            input_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            past_key_values=past_key_values,
            position_ids=position_ids,
        )

        hidden_states = inputs_embeds
        position_ids = cache_position.unsqueeze(0)
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
            hidden_states = decoder_layer(
                hidden_states,
                attention_mask=causal_mask,
                position_ids=position_ids,
                past_key_values=past_key_values,
                use_cache=use_cache,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
                **kwargs,
            )

        hidden_states = self.norm(hidden_states)
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values if use_cache else None,
        )


class CsmCodebooksHead(nn.Module):
    def __init__(self, hidden_size, num_codebooks, vocab_size):
        super().__init__()
        self.num_codebooks = num_codebooks
        self.weight = nn.Parameter(torch.empty(self.num_codebooks - 1, hidden_size, vocab_size))

    def forward(self, hidden_states, cache_position=None):
        if cache_position is None:
            seq_length = hidden_states.shape[1]
            codebook_weight = self.weight[torch.arange(seq_length)]
        else:
            codebook_idxs = cache_position - 1
            codebook_weight = self.weight[codebook_idxs]

        hidden_states = [
            nn.functional.linear(hidden_states[:, codebook_idx, :], codebook_weight[codebook_idx].T)
            for codebook_idx in range(codebook_weight.shape[0])
        ]
        hidden_states = torch.stack(hidden_states, dim=1)
        return hidden_states


@auto_docstring(
    custom_intro="""
    The CsmDepthDecoder Model transformer, with a [`CsmCodebooksHead`] on top,
    which can be seen a position-specific language modeling head, allowing to use a different linear layer for each codebook
    (e.g. position 0 is the first codebook and uses the first codebook head, etc.)
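
    A minimal sketch of the dispatch (illustrative shapes only): with a weight tensor of shape
    `(num_codebooks - 1, hidden_size, vocab_size)`, position `i` of the hidden states is projected as
    `logits_i = hidden_states[:, i, :] @ weight[i]`, so each codebook position gets its own head.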
    """
)
class CsmDepthDecoderForCausalLM(CsmPreTrainedModel, GenerationMixin):
    _tied_weights_keys = None
    _tp_plan = None
    _pp_plan = None

    def __init__(self, config):
        super().__init__(config)
        self.model = CsmDepthDecoderModel(config)
        self.vocab_size = config.vocab_size
        self.codebooks_head = CsmCodebooksHead(config.hidden_size, config.num_codebooks, config.vocab_size)

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        backbone_last_hidden_state: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, CausalLMOutputWithPast]:
        r"""
        backbone_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, backbone_hidden_size)`, *optional*):
            The last hidden state of the backbone model. Such input is required when the first codebook token (the one generated by the backbone model)
            is provided in the `input_ids` argument.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        """
        outputs = self.model(
            input_ids=input_ids,
            backbone_last_hidden_state=backbone_last_hidden_state,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )

        hidden_states = outputs[0]
        if isinstance(logits_to_keep, int):
            if logits_to_keep == 0:
                # skip idx 0 logits since it's for the concatenated backbone last hidden state
                slice_indices = slice(1, None)
            else:
                slice_indices = slice(-logits_to_keep, None)
        else:
            slice_indices = logits_to_keep

        logits = self.codebooks_head(
            hidden_states[:, slice_indices, :], cache_position[slice_indices] if cache_position is not None else None
        )
        logits = logits.contiguous()

        loss = None
        if labels is not None:
            shift_labels = labels[..., 1:].contiguous()
            loss = self.loss_function(
                logits=logits, labels=None, vocab_size=self.config.vocab_size, shift_labels=shift_labels, **kwargs
            )

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids: torch.LongTensor,
        past_key_values: Optional[Cache] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ):
        model_inputs = super().prepare_inputs_for_generation(
            input_ids, past_key_values, attention_mask, inputs_embeds, cache_position, **kwargs
        )

        # the backbone hidden state is only needed on the first generation step; position_ids are
        # inferred from cache_position and must be identical across the batch, so they are dropped
        is_first_generation_step = model_inputs["cache_position"][0] == 0
        if not is_first_generation_step:
            model_inputs.pop("backbone_last_hidden_state")
        model_inputs.pop("position_ids")

        return model_inputs


class CsmBackboneModelEmbeddings(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.embed_audio_tokens = nn.Embedding(config.num_codebooks * config.vocab_size, config.hidden_size)
        self.register_buffer(
            "audio_tokens_offsets", torch.arange(config.num_codebooks) * config.vocab_size, persistent=False
        )

    def forward(self, input_ids):
        input_embeds = self.embed_audio_tokens(input_ids + self.audio_tokens_offsets)
        input_embeds = input_embeds.sum(dim=2)
        return input_embeds


@auto_docstring
class CsmBackboneModel(CsmPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size
        self.embed_tokens = CsmBackboneModelEmbeddings(config)
        self.layers = nn.ModuleList(
            [CsmDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.norm = CsmRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.rotary_emb = CsmRotaryEmbedding(config=config)
        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    @check_model_inputs
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPast:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks) or (batch_size, sequence_length)`):
            1. (batch_size, sequence_length): corresponds to the input sequence prepared with the processor from the text prompt. Such input
            requires `input_values` to be provided so that audio can be encoded in codebook tokens and then merged with the text tokens.

            2. (batch_size, sequence_length, num_codebooks): codebook tokens generated during the autoregressive decoding. Such input is not meant to be used by end users.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
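
            A shape sketch of the embedding step (illustrative only): codebook tokens of shape
            `(batch_size, seq_len, num_codebooks)` are embedded per codebook and summed over the
            codebook dimension, giving `inputs_embeds` of shape `(batch_size, seq_len, hidden_size)`.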
        """
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache(config=self.config)

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = create_causal_mask(
            config=self.config,
            input_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            past_key_values=past_key_values,
            position_ids=position_ids,
        )

        hidden_states = inputs_embeds
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
            hidden_states = decoder_layer(
                hidden_states,
                attention_mask=causal_mask,
                position_ids=position_ids,
                past_key_values=past_key_values,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
                **kwargs,
            )

        hidden_states = self.norm(hidden_states)
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values if use_cache else None,
        )


@auto_docstring(
    custom_intro="""
    The Csm model consists of two llama-like auto-regressive transformer models: a backbone model that predicts the first codebook token and a depth decoder that predicts the other codebook tokens.
    """
)
class CsmForConditionalGeneration(CsmPreTrainedModel, CsmGenerationMixin):
    _tied_weights_keys = [
        "backbone_model.embed_tokens.embed_audio_tokens.weight",
        "depth_decoder.model.embed_tokens.weight",
    ]

    def __init__(self, config):
        super().__init__(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.embed_text_tokens = nn.Embedding(config.text_vocab_size, config.hidden_size)
        self.backbone_model = CsmBackboneModel._from_config(config)
        self.depth_decoder = CsmDepthDecoderForCausalLM._from_config(config.depth_decoder_config)
        self.codec_model = AutoModel.from_config(config.codec_config)
        self.post_init()

    def get_input_embeddings(self):
        return self.backbone_model.embed_tokens

    def set_input_embeddings(self, value):
        self.backbone_model.embed_tokens = value

    def _tie_weights(self):
        if self.config.tie_codebooks_embeddings:
            self._tie_or_clone_weights(
                self.backbone_model.embed_tokens.embed_audio_tokens,
                self.depth_decoder.model.embed_tokens,
            )

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        if kwargs.get("output_loading_info", False):
            model, loading_info = super().from_pretrained(*args, **kwargs)
        else:
            model = super().from_pretrained(*args, **kwargs)

        # copy depth-decoder-prefixed generation attributes to the depth decoder generation config
        prefix = "depth_decoder_"
        prefix_len = len(prefix)
        depth_decoder_attrs = {
            attr[prefix_len:]: value
            for attr, value in vars(model.generation_config).items()
            if attr.startswith(prefix)
        }
        vars(model.depth_decoder.generation_config).update({"_from_model_config": False, **depth_decoder_attrs})
        for attr in depth_decoder_attrs:
            delattr(model.generation_config, prefix + attr)

        if "output_loading_info" in kwargs:
            return model, loading_info
        else:
            return model

    def save_pretrained(self, *args, **kwargs):
        # store the depth decoder generation config attrs on the model generation config, with a prefix
        prefix = "depth_decoder_"
        depth_decoder_attrs = self.depth_decoder.generation_config.to_diff_dict()
        depth_decoder_attrs.pop("transformers_version", None)
        for attr, value in depth_decoder_attrs.items():
            setattr(self.generation_config, prefix + attr, value)

        super().save_pretrained(*args, **kwargs)

    def _merge_input_ids_with_input_values(
        self,
        input_ids: Optional[torch.Tensor] = None,
        input_values: Optional[torch.Tensor] = None,
        input_values_cutoffs: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
    ):
        """
        Merges the input_ids and input_values to produce a single inputs_embeds tensor:
        1 - Infers the codec model on the input_values to retreive codebook token.
        2 - Embeds codebook tokens and places them at the correct positions in the inputs_embeds tensor.
        3 - If labels are provided, expands them to match codebook dimensions and position the target codebook tokens in the inputs_embeds tensor.

        Args:
            input_ids (`torch.Tensor` of shape `(batch_size, sequence_length)`):
                The input ids to embed.
            input_values (`torch.Tensor` of shape `(batch_size, channels, audio_sequence_length)`):
                The audio input values to embed.
            input_values_cutoffs (`torch.Tensor` of shape `(batch_size, max_num_audio)`):
                The cutoffs of the audio input values relative to its batch index, padded with -1 when no audio.
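
        Example (an illustrative sketch of `input_values_cutoffs` for two batch entries):

            >>> # entry 0 has two audio segments of lengths l1 and l2; entry 1 has one segment of length l3
            >>> l1, l2, l3 = 24000, 12000, 16000
            >>> input_values_cutoffs = torch.tensor([[l1, l1 + l2], [l3, -1]])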
        """
        inputs_embeds = self.embed_text_tokens(input_ids)

        if input_values is not None:
            # infer input_values_mask from the cutoffs
            input_values_cutoffs = nn.functional.pad(input_values_cutoffs, (1, 0))
            audio_lengths = input_values_cutoffs[input_values_cutoffs >= 0].diff()
            audio_lengths = audio_lengths[audio_lengths > 0]
            input_values_mask = torch.arange(input_values_cutoffs.max(), device=input_values.device).expand(
                len(audio_lengths), -1
            )
            input_values_mask = input_values_mask < audio_lengths.unsqueeze(1)

            # encode the audio segments one by one with the codec model
            with torch.no_grad():
                audio_tokens_list = []
                for batch_input_values, batch_input_values_cutoffs in zip(input_values, input_values_cutoffs):
                    batch_input_values_cutoffs = batch_input_values_cutoffs[batch_input_values_cutoffs >= 0]
                    for i in range(batch_input_values_cutoffs.shape[0] - 1):
                        start_idx = batch_input_values_cutoffs[i]
                        end_idx = batch_input_values_cutoffs[i + 1]
                        audio_batch = batch_input_values[..., start_idx:end_idx]
                        codec_outputs = self.codec_model.encode(audio_batch.unsqueeze(0))
                        codebook_ids = codec_outputs.audio_codes.transpose(1, -1)
                        audio_tokens_list.append(codebook_ids[0])

                max_audio_frames = max(el.shape[0] for el in audio_tokens_list)
                batched_audio_token_ids = torch.stack(
                    [nn.functional.pad(el, (0, 0, 0, max_audio_frames - el.shape[0])) for el in audio_tokens_list]
                )
                audio_codes_mask = self.codec_model.get_audio_codes_mask(input_values_mask)

            audio_token_id = self.config.audio_token_id
            audio_token_mask = input_ids == audio_token_id

            audio_embeds = self.backbone_model.embed_tokens(batched_audio_token_ids)
            inputs_embeds[audio_token_mask] = audio_embeds[audio_codes_mask]

            # same embedding for every audio eos token
            audio_eos_frame_ids = (
                torch.ones((1, 1, self.config.num_codebooks), device=input_ids.device, dtype=torch.long)
                * self.config.codebook_eos_token_id
            )
            audio_eos_embeds = self.backbone_model.embed_tokens(audio_eos_frame_ids).squeeze(1)

            audio_eos_token_mask = input_ids == self.config.audio_eos_token_id
            inputs_embeds[audio_eos_token_mask] = audio_eos_embeds.repeat(audio_eos_token_mask.sum(), 1)

            if labels is not None:
                # expand the labels to the codebook dimension and place the target codebook tokens
                labels_expanded = labels.unsqueeze(-1).repeat(1, 1, self.config.num_codebooks)
                labels_expanded[audio_token_mask] = batched_audio_token_ids[audio_codes_mask]
                labels_expanded[audio_eos_token_mask] = audio_eos_frame_ids
                # frames flagged with -101 are used only by the backbone: mask them for the depth decoder
                depth_decoder_ignore_frames_idxs = (labels == -101).nonzero(as_tuple=True)
                labels_expanded[depth_decoder_ignore_frames_idxs[0], depth_decoder_ignore_frames_idxs[1], 1:] = -100
                labels = labels_expanded

        return {"inputs_embeds": inputs_embeds, "labels": labels}

    def prepare_inputs_for_generation(
        self,
        input_ids: torch.LongTensor,
        past_key_values: Optional[Cache] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ):
        model_inputs = super().prepare_inputs_for_generation(
            input_ids,
            past_key_values,
            attention_mask,
            inputs_embeds,
            cache_position,
            **kwargs,
        )

        if input_ids is not None and input_ids.ndim == 2 and model_inputs.get("inputs_embeds") is None:
            merged_inputs = self._merge_input_ids_with_input_values(
                input_ids=input_ids,
                input_values=kwargs.get("input_values"),
                input_values_cutoffs=kwargs.get("input_values_cutoffs"),
                labels=kwargs.get("labels"),
            )
            model_inputs.update({**merged_inputs, "input_ids": None})

        return model_inputs

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        input_values: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        input_values_cutoffs: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, CsmOutputWithPast]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks) or (batch_size, sequence_length)`):
            1. (batch_size, sequence_length): corresponds to the input sequence prepared with the processor from the text prompt. Such input
            requires `input_values` to be provided so that audio can be encoded in codebook tokens and then merged with the text tokens.

            2. (batch_size, sequence_length, num_codebooks): codebook tokens generated during the autoregressive decoding. Such input is not meant to be used by end users.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        input_values_cutoffs (`torch.Tensor` of shape `(batch_size, max_num_audio)`, *optional*):
            Specify the end positions of audio segments within each batch entry, relative to the concatenated audio input.
            If a batch entry has fewer segments than the maximum, it is padded with -1. For example, in a batch of 2 sequences
            where the first contains 2 audio segments of length l1, and the second contains 1 audio segment of length l2,
            the input_values_cutoffs would be: [[l1, 2 * l1], [l2, -1]].
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[config.audio_token_id, -100, -101]`.
            Requires targeted `input_values` to be provided as audio tokens will be inferred from it using the `codec_model`.
            - `config.audio_token_id` indicates an audio frames (considering sequence length elements as frames)
            - `-100` will be ignored in the loss computation
            - `-101` indicates the audio frame will be used only for the backbone model (using the first codebook token as labels)

            Such labels can be prepared using `output_labels=True` when calling [`CsmProcessor`].
        logits_to_keep (`int` or `torch.Tensor`, *optional*):
            Kept for compatibility. Does not support another value than:
            1. `0`, which is equivalent to keeping all logits, used in the training regime
            2. `1`, which is equivalent to keeping only the last logit, used in the generation regime

        Example:

        ```python
        >>> import torch
        >>> from transformers import CsmForConditionalGeneration, AutoProcessor
        >>> from datasets import load_dataset, Audio

        >>> model_id = "sesame/csm-1b"
        >>> torch_device = "cuda" if torch.cuda.is_available() else "cpu"

        >>> processor = AutoProcessor.from_pretrained(model_id)

        >>> ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
        >>> # ensure the audio is 24kHz
        >>> ds = ds.cast_column("audio", Audio(sampling_rate=24000))

        >>> conversation = []
        >>> # prepare a conversation with text and corresponding audio
        >>> for text, audio, speaker_id in zip(ds[:4]["text"], ds[:4]["audio"], ds[:4]["speaker_id"]):
        ...     conversation.append(
        ...         {
        ...             "role": f"{speaker_id}",
        ...             "content": [{"type": "text", "text": text}, {"type": "audio", "path": audio["array"]}],
        ...         }
        ...     )

        >>> inputs = processor.apply_chat_template(
        ...     conversation,
        ...     tokenize=True,
        ...     return_dict=True,
        ...     output_labels=True,
        ... ).to(torch_device)

        >>> model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=torch_device)
        >>> output = model(**inputs)
        >>> output.loss.backward()
        """
        if input_ids is not None and input_ids.ndim == 2:
            merged_inputs = self._merge_input_ids_with_input_values(
                input_ids, input_values, input_values_cutoffs, labels
            )
            inputs_embeds = merged_inputs["inputs_embeds"]
            labels = merged_inputs["labels"]
            input_ids = None

        backbone_outputs = self.backbone_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )

        backbone_hidden_states = backbone_outputs[0]
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        backbone_logits = self.lm_head(backbone_hidden_states[:, slice_indices, :])

        loss = None
        backbone_loss = None
        depth_decoder_loss = None
        depth_decoder_outputs = None
        if labels is not None:
            # select the first codebook as labels for the backbone model
            backbone_labels = labels[:, :, 0]
            backbone_loss = self.loss_function(
                logits=backbone_logits, labels=backbone_labels, vocab_size=self.config.vocab_size, **kwargs
            )

            # for the depth decoder, train only on frames that are not uniformly ignored along codebooks 1:
            train_mask = ~(labels[:, :, 1:] == -100).all(dim=-1)
            train_idxs = train_mask.nonzero(as_tuple=True)

            depth_decoder_input_ids = labels[train_idxs[0], train_idxs[1], : self.config.num_codebooks - 1]
            # add a placeholder in position 0 that will be replaced by the backbone_last_hidden_state
            depth_decoder_input_ids = nn.functional.pad(depth_decoder_input_ids, (1, 0), value=0)

            backbone_last_hidden_states = backbone_hidden_states[train_idxs[0], train_idxs[1] - 1, :]
            depth_decoder_labels = labels[train_idxs[0], train_idxs[1]]

            depth_decoder_outputs = self.depth_decoder(
                input_ids=depth_decoder_input_ids,
                backbone_last_hidden_state=backbone_last_hidden_states,
                use_cache=use_cache,
                return_dict=True,
                labels=depth_decoder_labels,
                **kwargs,
            )

            depth_decoder_loss = depth_decoder_outputs.loss
            loss = backbone_loss + depth_decoder_loss

        return CsmOutputWithPast(
            loss=loss,
            backbone_loss=backbone_loss,
            depth_decoder_loss=depth_decoder_loss,
            logits=backbone_logits,
            past_key_values=backbone_outputs.past_key_values,
            hidden_states=backbone_outputs.hidden_states,
            attentions=backbone_outputs.attentions,
            depth_decoder_logits=depth_decoder_outputs.logits if depth_decoder_outputs is not None else None,
            depth_decoder_past_key_values=depth_decoder_outputs.past_key_values
            if depth_decoder_outputs is not None
            else None,
            depth_decoder_hidden_states=depth_decoder_outputs.hidden_states
            if depth_decoder_outputs is not None
            else None,
            depth_decoder_attentions=depth_decoder_outputs.attentions if depth_decoder_outputs is not None else None,
        )


__all__ = [
    "CsmForConditionalGeneration",
    "CsmBackboneModel",
    "CsmDepthDecoderForCausalLM",
    "CsmDepthDecoderModel",
    "CsmPreTrainedModel",
]