from dataclasses import dataclass
from typing import Optional, Union

import torch
import torch.nn as nn

from transformers.utils.generic import check_model_inputs

from ...cache_utils import Cache, DynamicCache
from ...generation import GenerationMixin
from ...masking_utils import create_causal_mask
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from ...modeling_utils import PreTrainedModel
from ...processing_utils import Unpack
from ...utils import ModelOutput, auto_docstring, can_return_tuple, logging
from ..auto import AutoModel
from ..llama.modeling_llama import (
    LlamaAttention,
    LlamaDecoderLayer,
    LlamaForCausalLM,
    LlamaMLP,
    LlamaModel,
    LlamaRMSNorm,
    LlamaRotaryEmbedding,
    TransformersKwargs,
)
from .configuration_csm import CsmConfig, CsmDepthDecoderConfig
from .generation_csm import CsmGenerationMixin


logger = logging.get_logger(__name__)
@dataclass
@auto_docstring(
    custom_intro="""
    Base class for the model autoregressive outputs.
    """
)
class CsmOutputWithPast(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`.

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    depth_decoder_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction) of the depth decoder model.
    depth_decoder_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the depth decoder (scores for each vocabulary token before SoftMax).
    depth_decoder_past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`.
    depth_decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    depth_decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.
    backbone_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction) of the backbone model.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None
    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
    depth_decoder_loss: Optional[torch.FloatTensor] = None
    depth_decoder_logits: torch.FloatTensor = None
    depth_decoder_past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None
    depth_decoder_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
    depth_decoder_attentions: Optional[tuple[torch.FloatTensor, ...]] = None
    backbone_loss: Optional[torch.FloatTensor] = None
CsmRMSNormNr-   r.   r/   r5   r5   r5   r6   r7   d   s   r7   c                   @   s   e Zd ZdS )CsmRotaryEmbeddingNr8   r5   r5   r5   r6   r9   h   s   r9   c                   @   s   e Zd ZdS )CsmMLPNr8   r5   r5   r5   r6   r:   l   s   r:   c                   @   s   e Zd ZdS )CsmAttentionNr8   r5   r5   r5   r6   r;   p   s   r;   c                   @   s   e Zd ZdS )CsmDecoderLayerNr8   r5   r5   r5   r6   r<   t   s   r<   z[
@auto_docstring(
    custom_intro="""
    The bare Csm Model outputting raw hidden-states without any specific head on top.
    """
)
class CsmPreTrainedModel(PreTrainedModel):
    config: CsmConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["CsmDecoderLayer"]
    _skip_keys_device_placement = ["past_key_values"]
    _supports_flash_attn = True
    _supports_sdpa = True
    _can_compile_fullgraph = True
    _supports_attention_backend = True
    _can_record_outputs = {
        "hidden_states": CsmDecoderLayer,
        "attentions": CsmAttention,
    }

    def _init_weights(self, module):
        super()._init_weights(module)
        if isinstance(module, CsmCodebooksHead):
            num_codebooks = module.num_codebooks
            for i in range(num_codebooks - 1):
                module.weight.data[i].normal_(mean=0.0, std=self.config.initializer_range)
ej e
ej e
ej	 e
e e
ej e
e e
ej	 ee eeef d
ddZ  ZS )	CsmDepthDecoderModelr>   c                    s>   t  | t|j|j |j| _tj|j|j	dd| _
d S NF)Zbias)rA   __init__nn	EmbeddingrE   
vocab_sizeZbackbone_hidden_sizeembed_tokensLinearhidden_sizeinputs_embeds_projectorrI   r>   rL   r5   r6   rQ      s    zCsmDepthDecoderModel.__init__N)
	input_idsbackbone_last_hidden_stateattention_maskposition_idsr$   inputs_embeds	use_cachecache_positionkwargsreturnc	              
   K   s  |dur t j s td d}|du |duA r8td|rP|du rPt| jd}|du r|durh| nd}
|dur~|j	d n|j	d }|dur|j
n|j
}t j|
|
| |d}|du r(t j|d dd}|| j }| || }|d dk}|dur||dddf< nt j s(|r(td	 | |}t| j|||||d
}|}|d}| ||}| jd| jj D ]$}||f||||||d|	}qt| |}t||r|nddS )aJ  
        backbone_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, backbone_hidden_size)`, *optional*):
            The last hidden state of the backbone model. Such input is required when the first codebook token (the one generated by the backbone model)
            is provided in the `input_ids` argument.
        NzCustom `position_ids` were provided but will be ignored. CSM depth decoder automatically determines position_ids from `cache_position` and as it requires them to be identical across the batch, the provided position_ids will be ignored.z;You must specify exactly one of input_ids or inputs_embeds.)r>   r   r   device)minzvWhen the first codebook token is provided, `backbone_last_hidden_state` should also be provided for correct inference.)r>   input_embedsr\   r`   r$   r]   )r\   r]   r$   r_   r`   position_embeddings)Zlast_hidden_stater$   )r1   compilerZis_compilingloggerZwarning_once
ValueErrorr   r>   Zget_seq_lengthshaperd   arangeclamprT   rU   warningrX   r
   	unsqueezeZ
rotary_embZlayersZnum_hidden_layersZnormr   )rI   rZ   r[   r\   r]   r$   r^   r_   r`   ra   Zpast_seen_tokensZinputs_seq_lengthrd   codebook_idxsoffsetZinput_ids_are_first_codebookZcausal_maskr%   rg   Zdecoder_layerr5   r5   r6   forward   sn    



	


zCsmDepthDecoderModel.forward)NNNNNNNN)r-   r.   r/   r   r3   rQ   r   r   r1   
LongTensorr   r2   Tensorr   boolr   r   r   r4   r   rr   rN   r5   r5   rL   r6   rO      s0   
        
rO   c                       s&   e Zd Z fddZdddZ  ZS )rD   c                    s0   t    || _tt| jd ||| _d S )Nr   )rA   rQ   rE   rR   	Parameterr1   emptyrG   )rI   rW   rE   rT   rL   r5   r6   rQ      s    
zCsmCodebooksHead.__init__Nc                    sf   |d u r$j d }| jt|  n|d }| j|   fddt j d D tjddS )Nr   c              	      s2   g | ]*}t jd d |d d f  | jqS N)rR   
functionalZlinearT).0Zcodebook_idxZcodebook_weightr%   r5   r6   
<listcomp>  s   z,CsmCodebooksHead.forward.<locals>.<listcomp>r   dim)rk   rG   r1   rl   rF   stack)rI   r%   r`   Z
seq_lengthrp   r5   r|   r6   rr      s    

zCsmCodebooksHead.forward)Nr-   r.   r/   rQ   rr   rN   r5   r5   rL   r6   rD      s   rD   a$  
@auto_docstring(
    custom_intro="""
    The CsmDepthDecoder Model transformer, with a [`CsmCodebooksHead`] on top,
    which can be seen as a position-specific language modeling head, allowing to use a different linear layer for each codebook
    (e.g. position 0 is the first codebook and uses the first codebook head, etc.)
    """
)
class CsmDepthDecoderForCausalLM(LlamaForCausalLM, GenerationMixin):
    _tied_weights_keys = None
    _tp_plan = None
    _pp_plan = None

    def __init__(self, config):
        super().__init__(config)
        del self.lm_head
        self.codebooks_head = CsmCodebooksHead(config.hidden_size, config.num_codebooks, config.vocab_size)
        self.model = CsmDepthDecoderModel(config)

    def prepare_inputs_for_generation(
        self,
        input_ids: torch.LongTensor,
        past_key_values: Optional[Cache] = None,
        attention_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        cache_position: Optional[torch.Tensor] = None,
        **kwargs,
    ):
        model_inputs = super().prepare_inputs_for_generation(
            input_ids, past_key_values, attention_mask, inputs_embeds, cache_position, **kwargs
        )

        # the backbone last hidden state is only needed on the first generation step
        is_first_generation_step = model_inputs["cache_position"][0] == 0
        if not is_first_generation_step:
            model_inputs.pop("backbone_last_hidden_state")

        # position_ids are inferred from cache_position inside the depth decoder model
        model_inputs.pop("position_ids")

        return model_inputs

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        backbone_last_hidden_state: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, CausalLMOutputWithPast]:
        r"""
        backbone_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, backbone_hidden_size)`, *optional*):
            The last hidden state of the backbone model. Such input is required when the first codebook token (the one generated by the backbone model)
            is provided in the `input_ids` argument.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        """
        outputs = self.model(
            input_ids=input_ids,
            backbone_last_hidden_state=backbone_last_hidden_state,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )

        hidden_states = outputs[0]

        if isinstance(logits_to_keep, int):
            if logits_to_keep == 0:
                # skip position 0, which holds the backbone last hidden state rather than a codebook token
                slice_indices = slice(1, None)
            else:
                slice_indices = slice(-logits_to_keep, None)
        else:
            slice_indices = logits_to_keep

        logits = self.codebooks_head(
            hidden_states[:, slice_indices, :], cache_position[slice_indices] if cache_position is not None else None
        )
        logits = logits.contiguous()

        loss = None
        if labels is not None:
            shift_labels = labels[..., 1:].contiguous()
            loss = self.loss_function(
                logits=logits, labels=None, vocab_size=self.config.vocab_size, shift_labels=shift_labels, **kwargs
            )

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
r   c                       s$   e Zd Z fddZdd Z  ZS )CsmBackboneModelEmbeddingsc                    sD   t    t|j|j |j| _| jdt	
|j|j dd d S )Naudio_tokens_offsetsF)
persistent)rA   rQ   rR   rS   rE   rT   rW   embed_audio_tokensZregister_bufferr1   rl   rY   rL   r5   r6   rQ   }  s
    
z#CsmBackboneModelEmbeddings.__init__c                 C   s    |  || j }|jdd}|S )Nr   r~   )r   r   sum)rI   rZ   rf   r5   r5   r6   rr     s    z"CsmBackboneModelEmbeddings.forwardr   r5   r5   rL   r6   r   |  s   r   c                       s0   e Zd Z fddZee fddZ  ZS )CsmBackboneModelc                    s   t  | t|| _d S rx   )rA   rQ   r   rU   rY   rL   r5   r6   rQ     s    zCsmBackboneModel.__init__c                    s   t  jf i |S )a&  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks) or (batch_size, sequence_length)`):
            1. (batch_size, sequence_length): corresponds to the input sequence prepared with the processor from the text prompt. Such input
            requires `input_values` to be provided so that audio can be encoded in codebook tokens and then merged with the text tokens.

            2. (batch_size, sequence_length, num_codebooks): codebook tokens generated during the autoregressive decoding. Such input is not meant to be used by end users.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        )rA   rr   )rI   Zsuper_kwargsrL   r5   r6   rr     s    zCsmBackboneModel.forward)r-   r.   r/   rQ   r   r   rr   rN   r5   r5   rL   r6   r     s   r   z
@auto_docstring(
    custom_intro="""
    The Csm model consists of two llama-like auto-regressive transformer models: a backbone model that predicts the first codebook token and a depth decoder that predicts the other codebook tokens.
    """
)
class CsmForConditionalGeneration(CsmPreTrainedModel, CsmGenerationMixin):
    _tied_weights_keys = [
        "backbone_model.embed_tokens.embed_audio_tokens.weight",
        "depth_decoder.model.embed_tokens.weight",
    ]

    def __init__(self, config):
        super().__init__(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.embed_text_tokens = nn.Embedding(config.text_vocab_size, config.hidden_size)
        self.backbone_model = CsmBackboneModel._from_config(config)
        self.depth_decoder = CsmDepthDecoderForCausalLM._from_config(config.depth_decoder_config)
        self.codec_model = AutoModel.from_config(config.codec_config)
        self.post_init()

    def get_input_embeddings(self):
        return self.backbone_model.embed_tokens

    def set_input_embeddings(self, value):
        self.backbone_model.embed_tokens = value

    def _tie_weights(self):
        if self.config.tie_codebooks_embeddings:
            self._tie_or_clone_weights(
                self.backbone_model.embed_tokens.embed_audio_tokens,
                self.depth_decoder.model.embed_tokens,
            )

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        if kwargs.get("output_loading_info", False):
            model, loading_info = super().from_pretrained(*args, **kwargs)
        else:
            model = super().from_pretrained(*args, **kwargs)

        # move the `depth_decoder_` prefixed attributes of the model generation config
        # onto the depth decoder generation config
        prefix = "depth_decoder_"
        prefix_len = len(prefix)
        depth_decoder_attrs = {
            attr[prefix_len:]: value
            for attr, value in vars(model.generation_config).items()
            if attr.startswith(prefix)
        }
        vars(model.depth_decoder.generation_config).update({"_from_model_config": False, **depth_decoder_attrs})
        for attr in depth_decoder_attrs:
            delattr(model.generation_config, prefix + attr)

        if "output_loading_info" in kwargs:
            return model, loading_info
        else:
            return model

    def save_pretrained(self, *args, **kwargs):
        # copy the depth decoder generation config attributes onto the model generation config
        # (with the `depth_decoder_` prefix) so that they are serialized together
        prefix = "depth_decoder_"
        depth_decoder_attrs = self.depth_decoder.generation_config.to_diff_dict()
        depth_decoder_attrs.pop("transformers_version", None)
        for attr, value in depth_decoder_attrs.items():
            setattr(self.generation_config, prefix + attr, value)

        super().save_pretrained(*args, **kwargs)

    def _merge_input_ids_with_input_values(
        self,
        input_ids: Optional[torch.Tensor] = None,
        input_values: Optional[torch.Tensor] = None,
        input_values_cutoffs: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
    ):
        """
        Merges the input_ids and input_values to produce a single inputs_embeds tensor:
        1 - Infers the codec model on the input_values to retrieve codebook tokens.
        2 - Embeds codebook tokens and places them at the correct positions in the inputs_embeds tensor.
        3 - If labels are provided, expands them to match codebook dimensions and positions the target codebook tokens in the inputs_embeds tensor.

        Args:
            input_ids (`torch.Tensor` of shape `(batch_size, sequence_length)`):
                The input ids to embed.
            input_values (`torch.Tensor` of shape `(batch_size, channels, audio_sequence_length)`):
                The audio input values to embed.
            input_values_cutoffs (`torch.Tensor` of shape `(batch_size, max_num_audio)`):
                The cutoffs of the audio input values relative to its batch index, padded with -1 when no audio.
        """
        inputs_embeds = self.embed_text_tokens(input_ids)

        if input_values is not None:
            # infer the mask of valid audio samples from the cutoffs
            input_values_cutoffs = nn.functional.pad(input_values_cutoffs, (1, 0))
            audio_lengths = input_values_cutoffs[input_values_cutoffs >= 0].diff()
            audio_lengths = audio_lengths[audio_lengths > 0]
            input_values_mask = torch.arange(input_values_cutoffs.max(), device=input_values.device).expand(
                len(audio_lengths), -1
            )
            input_values_mask = input_values_mask < audio_lengths.unsqueeze(1)

            with torch.no_grad():
                # encode each audio segment separately with the codec model
                audio_tokens_list = []
                for batch_input_values, batch_input_values_cutoffs in zip(input_values, input_values_cutoffs):
                    batch_input_values_cutoffs = batch_input_values_cutoffs[batch_input_values_cutoffs >= 0]
                    for i in range(batch_input_values_cutoffs.shape[0] - 1):
                        start_idx = batch_input_values_cutoffs[i]
                        end_idx = batch_input_values_cutoffs[i + 1]
                        audio_batch = batch_input_values[..., start_idx:end_idx]
                        codec_outputs = self.codec_model.encode(audio_batch.unsqueeze(0))
                        codebook_ids = codec_outputs.audio_codes.transpose(1, -1)
                        audio_tokens_list.append(codebook_ids[0])

                max_audio_frames = max(el.shape[0] for el in audio_tokens_list)
                batched_audio_token_ids = torch.stack(
                    [nn.functional.pad(el, (0, 0, 0, max_audio_frames - el.shape[0])) for el in audio_tokens_list]
                )
                audio_codes_mask = self.codec_model.get_audio_codes_mask(input_values_mask)

            # place the audio frame embeddings at the audio token positions
            audio_token_id = self.config.audio_token_id
            audio_token_mask = input_ids == audio_token_id

            audio_embeds = self.backbone_model.embed_tokens(batched_audio_token_ids)
            inputs_embeds[audio_token_mask] = audio_embeds[audio_codes_mask]

            # same for the audio eos frame
            audio_eos_frame_ids = (
                torch.ones((1, 1, self.config.num_codebooks), device=input_ids.device, dtype=torch.long)
                * self.config.codebook_eos_token_id
            )
            audio_eos_embeds = self.backbone_model.embed_tokens(audio_eos_frame_ids).squeeze(1)

            audio_eos_token_mask = input_ids == self.config.audio_eos_token_id
            inputs_embeds[audio_eos_token_mask] = audio_eos_embeds.repeat(audio_eos_token_mask.sum(), 1)

            # if labels are provided, expand them to (batch_size, sequence_length, num_codebooks)
            if labels is not None:
                labels_expanded = labels.unsqueeze(-1).repeat(1, 1, self.config.num_codebooks)
                labels_expanded[audio_token_mask] = batched_audio_token_ids[audio_codes_mask]
                labels_expanded[audio_eos_token_mask] = audio_eos_frame_ids
                # `-101` frames are used by the backbone only: mask them for the depth decoder codebooks
                depth_decoder_ignore_frames_idxs = (labels == -101).nonzero(as_tuple=True)
                labels_expanded[depth_decoder_ignore_frames_idxs[0], depth_decoder_ignore_frames_idxs[1], 1:] = -100
                labels = labels_expanded

        return {"inputs_embeds": inputs_embeds, "labels": labels}

    def prepare_inputs_for_generation(
        self,
        input_ids: torch.LongTensor,
        past_key_values: Optional[Cache] = None,
        attention_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        cache_position: Optional[torch.Tensor] = None,
        **kwargs,
    ):
        model_inputs = super().prepare_inputs_for_generation(
            input_ids=input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            cache_position=cache_position,
            **kwargs,
        )

        # 2D input_ids come from the processor (text prompt) and must be merged with the audio input values
        if input_ids is not None and input_ids.ndim == 2 and model_inputs.get("inputs_embeds") is None:
            merged_inputs = self._merge_input_ids_with_input_values(
                input_ids=input_ids,
                input_values=kwargs.get("input_values"),
                input_values_cutoffs=kwargs.get("input_values_cutoffs"),
                labels=kwargs.get("labels"),
            )
            model_inputs.update(
                {"inputs_embeds": merged_inputs["inputs_embeds"], "labels": merged_inputs["labels"], "input_ids": None}
            )

        return model_inputs

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        input_values: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        input_values_cutoffs: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, CsmOutputWithPast]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks) or (batch_size, sequence_length)`):
            1. (batch_size, sequence_length): corresponds to the input sequence prepared with the processor from the text prompt. Such input
            requires `input_values` to be provided so that audio can be encoded in codebook tokens and then merged with the text tokens.

            2. (batch_size, sequence_length, num_codebooks): codebook tokens generated during the autoregressive decoding. Such input is not meant to be used by end users.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        input_values_cutoffs (`torch.Tensor` of shape `(batch_size, max_num_audio)`, *optional*):
            Specify the end positions of audio segments within each batch entry, relative to the concatenated audio input.
            If a batch entry has fewer segments than the maximum, it is padded with -1. For example, in a batch of 2 sequences
            where the first contains 2 audio segments of length l1, and the second contains 1 audio segment of length l2,
            the input_values_cutoffs would be: [[l1, 2 * l1], [l2, -1]].
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[config.audio_token_id, -100, -101]`.
            Requires targeted `input_values` to be provided as audio tokens will be inferred from it using the `codec_model`.
            - `config.audio_token_id` indicates an audio frame (considering sequence length elements as frames)
            - `-100` will be ignored in the loss computation
            - `-101` indicates the audio frame will be used only for the backbone model (using the first codebook token as labels)

            Such labels can be prepared using `output_labels=True` when calling [`CsmProcessor`].
        logits_to_keep (`int` or `torch.Tensor`, *optional*):
            Kept for compatibility. Does not support another value than:
            1. `0`, which is equivalent to keeping all logits, used in the training regime
            2. `1`, which is equivalent to keeping only the last logit, used in the generation regime

        Example:

        ```python
        >>> import torch
        >>> from transformers import CsmForConditionalGeneration, AutoProcessor
        >>> from datasets import load_dataset, Audio

        >>> model_id = "sesame/csm-1b"
        >>> torch_device = "cuda" if torch.cuda.is_available() else "cpu"

        >>> processor = AutoProcessor.from_pretrained(model_id)

        >>> ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
        >>> # ensure the audio is 24kHz
        >>> ds = ds.cast_column("audio", Audio(sampling_rate=24000))

        >>> conversation = []
        >>> # prepare a conversation with text and corresponding audio
        >>> for text, audio, speaker_id in zip(ds[:4]["text"], ds[:4]["audio"], ds[:4]["speaker_id"]):
        ...     conversation.append(
        ...         {
        ...             "role": f"{speaker_id}",
        ...             "content": [{"type": "text", "text": text}, {"type": "audio", "path": audio["array"]}],
        ...         }
        ...     )

        >>> inputs = processor.apply_chat_template(
        ...     conversation,
        ...     tokenize=True,
        ...     return_dict=True,
        ...     output_labels=True,
        ... ).to(torch_device)

        >>> model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=torch_device)
        >>> output = model(**inputs)
        >>> output.loss.backward()
        ```"""
        if input_ids is not None and input_ids.ndim == 2:
            merged_inputs = self._merge_input_ids_with_input_values(
                input_ids, input_values, input_values_cutoffs, labels
            )
            inputs_embeds = merged_inputs["inputs_embeds"]
            labels = merged_inputs["labels"]
            input_ids = None

        backbone_outputs = self.backbone_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )

        backbone_hidden_states = backbone_outputs[0]
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        backbone_logits = self.lm_head(backbone_hidden_states[:, slice_indices, :])

        loss = None
        backbone_loss = None
        depth_decoder_loss = None
        depth_decoder_outputs = None
        if labels is not None:
            # the backbone model is trained on the first codebook only
            backbone_labels = labels[:, :, 0]
            backbone_loss = self.loss_function(
                logits=backbone_logits, labels=backbone_labels, vocab_size=self.config.vocab_size, **kwargs
            )

            # the depth decoder is trained on frames whose remaining codebooks are not fully masked
            train_mask = ~(labels[:, :, 1:] == -100).all(dim=-1)
            depth_decoder_input_ids = labels[train_mask][..., : self.config.num_codebooks - 1]
            # add a placeholder at position 0 that will be replaced by the backbone last hidden state
            depth_decoder_input_ids = nn.functional.pad(depth_decoder_input_ids, (1, 0), value=0)

            train_idxs = train_mask.nonzero(as_tuple=True)
            backbone_last_hidden_states = backbone_hidden_states[train_idxs[0], train_idxs[1] - 1, :]
            depth_decoder_labels = labels[train_mask]

            depth_decoder_outputs = self.depth_decoder(
                input_ids=depth_decoder_input_ids,
                backbone_last_hidden_state=backbone_last_hidden_states,
                use_cache=use_cache,
                return_dict=True,
                labels=depth_decoder_labels,
                **kwargs,
            )
            depth_decoder_loss = depth_decoder_outputs.loss
            loss = backbone_loss + depth_decoder_loss

        return CsmOutputWithPast(
            loss=loss,
            backbone_loss=backbone_loss,
            depth_decoder_loss=depth_decoder_loss,
            logits=backbone_logits,
            past_key_values=backbone_outputs.past_key_values,
            hidden_states=backbone_outputs.hidden_states,
            attentions=backbone_outputs.attentions,
            depth_decoder_logits=depth_decoder_outputs.logits if depth_decoder_outputs is not None else None,
            depth_decoder_past_key_values=depth_decoder_outputs.past_key_values
            if depth_decoder_outputs is not None
            else None,
            depth_decoder_hidden_states=depth_decoder_outputs.hidden_states
            if depth_decoder_outputs is not None
            else None,
            depth_decoder_attentions=depth_decoder_outputs.attentions
            if depth_decoder_outputs is not None
            else None,
        )
r   )r=   r   rO   r   r   )?dataclassesr   typingr   r   r1   Ztorch.nnrR   Ztransformers.utils.genericr   Zcache_utilsr   r   Z
generationr	   Zmasking_utilsr
   Zmodeling_outputsr   r   Zmodeling_utilsr   Zprocessing_utilsr   utilsr   r   r   r   autor   Zllama.modeling_llamar   r   r   r   r   r   r   r   Zconfiguration_csmr   r   Zgeneration_csmr    Z
get_loggerr-   ri   r!   r7   r9   r:   r;   r<   r=   rO   ModulerD   r   r   r   r   __all__r5   r5   r5   r6   <module>   s`   (

-_f  U