"""PyTorch Zamba model."""

import math
from typing import Any, Callable, Optional, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...cache_utils import Cache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import (
    BaseModelOutputWithPast,
    CausalLMOutputWithPast,
    SequenceClassifierOutputWithPast,
)
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import auto_docstring, logging
from ...utils.deprecation import deprecate_kwarg
from ...utils.import_utils import is_causal_conv1d_available, is_mamba_ssm_available
from .configuration_zamba import ZambaConfig


if is_mamba_ssm_available():
    from mamba_ssm.ops.selective_scan_interface import mamba_inner_fn, selective_scan_fn
    from mamba_ssm.ops.triton.selective_state_update import selective_state_update
else:
    selective_state_update, selective_scan_fn, mamba_inner_fn = None, None, None

if is_causal_conv1d_available():
    from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
else:
    causal_conv1d_update, causal_conv1d_fn = None, None

is_fast_path_available = all(
    (selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)
)


logger = logging.get_logger(__name__)


class ZambaRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        ZambaRMSNorm is equivalent to T5LayerNorm
        N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__ d/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/zamba/modeling_zamba.pyr%   A   s    
zZambaRMSNorm.__init__c                 C   sJ   |j }|tj}|djddd}|t|| j  }| j|| S )N   T)Zkeepdim)	dtypetor'   float32powmeanZrsqrtr*   r)   )r+   hidden_statesZinput_dtypeZvariancer0   r0   r1   forwardI   s
    zZambaRMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)tupler)   shaper*   r+   r0   r0   r1   
extra_reprP   s    zZambaRMSNorm.extra_repr)r#   )__name__
__module____qualname__r%   r:   r>   __classcell__r0   r0   r.   r1   r"   @   s   r"   )r9   n_repreturnc                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


class ZambaHybridDynamicCache:
    """
    A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
    (which has a constant shape regardless of seq_len).

    This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
    and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors, with the following expected shapes.
    For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
    while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors).
    For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors),
    while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`,
    and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`.
    """

    is_compileable = False

    def __init__(self, config, batch_size, dtype=torch.float16, device=None):
        self.dtype = dtype
        self.layers_block_type = config.layers_block_type
        self.has_previous_state = False
        self.intermediate_size = config.mamba_expand * config.hidden_size
        self.ssm_state_size = config.mamba_d_state
        self.conv_kernel_size = config.mamba_d_conv
        self.n_mamba_heads = config.n_mamba_heads
        self.conv_states = []
        self.ssm_states = []
        self.transformer_layers = []
        self._modules = {}
        self._parameters = {}
        self._buffers = {}
        for i in range(config.num_hidden_layers):
            self.conv_states += [
                torch.zeros(batch_size, self.intermediate_size, self.conv_kernel_size, device=device, dtype=dtype)
            ]
            cache_shape = (
                batch_size,
                self.n_mamba_heads,
                self.intermediate_size // self.n_mamba_heads,
                self.ssm_state_size,
            )
            self.ssm_states += [torch.zeros(cache_shape, device=device, dtype=dtype)]
            if self.layers_block_type[i] == "hybrid":
                self.transformer_layers.append(i)
        self.key_cache = [torch.tensor([[]] * batch_size, device=device) for _ in range(config.num_hidden_layers)]
        self.value_cache = [torch.tensor([[]] * batch_size, device=device) for _ in range(config.num_hidden_layers)]

    def __len__(self):
        return len(self.key_cache)

    def __getitem__(self, layer_idx: int) -> tuple[torch.Tensor, torch.Tensor]:
        return self.key_cache[layer_idx], self.value_cache[layer_idx]

    def update(
        self,
        key_states: torch.Tensor,
        value_states: torch.Tensor,
        layer_idx: int,
        cache_kwargs: Optional[dict[str, Any]] = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        if self.key_cache[layer_idx].shape[-1] == 0:
            self.key_cache[layer_idx] = key_states
            self.value_cache[layer_idx] = value_states
        else:
            self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=2)
            self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=2)
        return self.key_cache[layer_idx], self.value_cache[layer_idx]

    def reorder_cache(self, beam_idx: torch.LongTensor):
        """Reorders the cache for beam search, given the selected beam indices."""
        for layer_idx in range(len(self.key_cache)):
            device = self.key_cache[layer_idx].device
            self.key_cache[layer_idx] = self.key_cache[layer_idx].index_select(0, beam_idx.to(device))
            device = self.value_cache[layer_idx].device
            self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(0, beam_idx.to(device))
            device = self.conv_states[layer_idx].device
            self.conv_states[layer_idx] = self.conv_states[layer_idx].index_select(0, beam_idx.to(device))
            device = self.ssm_states[layer_idx].device
            self.ssm_states[layer_idx] = self.ssm_states[layer_idx].index_select(0, beam_idx.to(device))

    def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
        """Returns the sequence length of the cached states. A layer index can be optionally passed."""
        layer_idx = self.transformer_layers[0] if layer_idx not in self.transformer_layers else layer_idx
        if len(self.key_cache) <= layer_idx:
            return 0
        return self.key_cache[layer_idx].shape[-2]
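
# Illustrative sketch, not part of the original module: it only mirrors the per-layer shapes that the
# ZambaHybridDynamicCache docstring above describes. `config` is assumed to be a `ZambaConfig` instance
# with a `layers_block_type` list; the helper is never called by the model code.
def _print_hybrid_cache_shapes(config, batch_size=1):  # pragma: no cover - illustrative only
    cache = ZambaHybridDynamicCache(config, batch_size=batch_size, dtype=torch.float32)
    for layer_idx, layer_type in enumerate(config.layers_block_type):
        # Mamba layers keep fixed-size conv/ssm states; hybrid (attention) layers start from empty
        # key/value tensors that grow along the sequence dimension as tokens are decoded.
        print(
            layer_idx,
            layer_type,
            tuple(cache.conv_states[layer_idx].shape),
            tuple(cache.ssm_states[layer_idx].shape),
            tuple(cache.key_cache[layer_idx].shape),
        )
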
def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    key_states = repeat_kv(key, module.num_key_value_groups)
    value_states = repeat_kv(value, module.num_key_value_groups)

    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
    if attention_mask is not None:
        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
        attn_weights = attn_weights + causal_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value_states)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights
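
# Illustrative sketch, not part of the original module: ZambaAttention below passes
# `scaling = (head_dim / 2) ** -0.5` to the attention implementation, which is the same thing as
# dividing the attention logits by sqrt(head_dim / 2) as stated in its docstring. The tensors here
# are random placeholders used only to demonstrate that equivalence; the helper is never called.
def _check_zamba_attention_scaling(head_dim: int = 64) -> bool:  # pragma: no cover - illustrative only
    query = torch.randn(1, 2, 4, head_dim)
    key = torch.randn(1, 2, 4, head_dim)
    scaled = torch.matmul(query, key.transpose(2, 3)) * (head_dim / 2) ** -0.5
    divided = torch.matmul(query, key.transpose(2, 3)) / math.sqrt(head_dim / 2)
    return torch.allclose(scaled, divided)
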
class ZambaAttention(nn.Module):
    """
    Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
    and "Generating Long Sequences with Sparse Transformers".

    Adapted from transformers.models.mistral.modeling_mistral.MistralAttention:
    The input dimension here is attention_hidden_size = 2 * hidden_size, and head_dim = attention_hidden_size // num_heads.
    The extra factor of 2 comes from the input being the concatenation of original_hidden_states with the output of the previous (mamba) layer
    (see fig. 2 in https://huggingface.co/papers/2405.16712).
    Additionally, replaced
    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) with
    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim/2)
    """

    def __init__(self, config: ZambaConfig, layer_idx: int):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.attention_hidden_size = config.attention_hidden_size
        self.head_dim = config.attention_head_dim
        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.scaling = (self.head_dim / 2) ** -0.5
        self.is_causal = True
        self.attention_dropout = config.attention_dropout

        self.q_proj = nn.Linear(config.attention_hidden_size, config.num_attention_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(config.attention_hidden_size, config.num_key_value_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(config.attention_hidden_size, config.num_key_value_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        layer_idx: int,
        attention_mask: Optional[torch.Tensor],
        past_key_values: Optional[ZambaHybridDynamicCache] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)

        if past_key_values is not None:
            key_states, value_states = past_key_values.update(key_states, value_states, layer_idx)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights


class ZambaMambaMixer(nn.Module):
    """
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)

    This module differs from `transformers.models.mamba.modeling_mamba.MambaMixer` in two ways:
    - Added multi-head: the output of `self.in_proj` is split into `self.n_mamba_heads` heads, and each head
    undergoes an independent forward pass, identical to the original `MambaMixer`, up until the pre-activations of
    `self.out_proj`. The pre-activations, coming from different mamba heads, are then concatenated and fed into `self.out_proj`.
    """

    def __init__(self, config: ZambaConfig, layer_idx):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.hidden_size = config.hidden_size
        self.ssm_state_size = config.mamba_d_state
        self.conv_kernel_size = config.mamba_d_conv
        self.intermediate_size = config.mamba_expand * config.hidden_size
        self.time_step_rank = config.mamba_dt_rank
        self.n_mamba_heads = config.n_mamba_heads
        self.mamba_head_dim = self.intermediate_size // self.n_mamba_heads
        self.use_conv_bias = config.mamba_conv_bias
        self.use_bias = config.mamba_proj_bias
        self.conv1d = nn.Conv1d(
            in_channels=self.intermediate_size,
            out_channels=self.intermediate_size,
            bias=self.use_conv_bias,
            kernel_size=self.conv_kernel_size,
            groups=self.intermediate_size,
            padding=self.conv_kernel_size - 1,
        )

        self.activation = config.hidden_mamba_act
        self.act = ACT2FN[config.hidden_mamba_act]
        self.use_fast_kernels = config.use_mamba_kernels

        # projection of the input hidden states
        self.in_proj = nn.Linear(self.hidden_size, self.intermediate_size * 2, bias=self.use_bias)
        # weight associated to the selective projection used to make dt, B and C input dependent;
        # each mamba head is processed independently
        self.x_proj_weight = nn.Parameter(
            torch.zeros(self.n_mamba_heads, self.time_step_rank + self.ssm_state_size * 2, self.mamba_head_dim)
        )
        # time step projection (discretization)
        self.dt_proj_weight = nn.Parameter(
            (torch.zeros(self.n_mamba_heads, self.mamba_head_dim, self.time_step_rank) - 0.5)
            * 2
            / self.time_step_rank**0.5
        )
        self.dt_proj_bias = nn.Parameter(torch.zeros(self.n_mamba_heads, self.mamba_head_dim))

        # S4D real initialization. These are not discretized!
        # The core is to load them, compute the discrete states, then write the updated state.
        A = torch.arange(1, self.ssm_state_size + 1, dtype=torch.float32)[None, :]
        A = A.expand(self.intermediate_size, -1).contiguous()

        self.A_log = nn.Parameter(torch.log(A.reshape(self.n_mamba_heads, self.mamba_head_dim, -1)))
        self.D = nn.Parameter(torch.ones(self.n_mamba_heads, self.mamba_head_dim))
        self.out_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=self.use_bias)

        if not is_fast_path_available:
            logger.warning_once(
                "The fast path is not available because one of `(selective_state_update, selective_scan_fn,"
                " causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. To install follow"
                " https://github.com/state-spaces/mamba/#installation and"
                " https://github.com/Dao-AILab/causal-conv1d. If you want to use the naive implementation, set"
                " `use_mamba_kernels=False` in the model config"
            )

    def cuda_kernels_forward(
        self, hidden_states: torch.Tensor, cache_params: Optional[ZambaHybridDynamicCache] = None, attention_mask=None
    ):
        batch_size, seq_len, _ = hidden_states.shape
        use_precomputed_states = cache_params is not None and cache_params.has_previous_state and seq_len == 1

        # 1. Gated MLP's linear projection
        projected_states = self.in_proj(hidden_states).transpose(1, 2)
        hidden_states, gate = projected_states.view(batch_size, -1, 2, seq_len).chunk(2, dim=2)
        hidden_states = hidden_states.squeeze(2)
        gate = gate.squeeze(2)
        gate = gate.reshape(batch_size, self.n_mamba_heads, -1, seq_len).transpose(0, 1)

        # 2. Convolution sequence transformation
        conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0), self.conv1d.weight.size(2))
        if use_precomputed_states:
            hidden_states = causal_conv1d_update(
                hidden_states.squeeze(-1),
                cache_params.conv_states[self.layer_idx],
                conv_weights,
                self.conv1d.bias,
                self.activation,
            )
            hidden_states = hidden_states.unsqueeze(-1)
        else:
            if attention_mask is not None and not torch.all(attention_mask == 1):
                hidden_states = hidden_states * attention_mask.unsqueeze(1)
            if cache_params is not None:
                conv_states = nn.functional.pad(hidden_states, (self.conv_kernel_size - hidden_states.shape[-1], 0))
                cache_params.conv_states[self.layer_idx].copy_(conv_states)
            hidden_states = causal_conv1d_fn(hidden_states, conv_weights, self.conv1d.bias, activation=self.activation)
            if attention_mask is not None and not torch.all(attention_mask == 1):
                hidden_states = hidden_states * attention_mask.unsqueeze(1)

        # 3. State Space Model sequence transformation
        # 3.a. input varying initialization of time_step, B and C, one set per mamba head
        hidden_states = hidden_states.reshape(-1, self.n_mamba_heads, self.mamba_head_dim, seq_len).transpose(0, 1)
        ssm_parameters = (self.x_proj_weight[:, None, :, :, None] * hidden_states[:, :, None]).sum(dim=3)

        time_step, B, C = torch.split(
            ssm_parameters, [self.time_step_rank, self.ssm_state_size, self.ssm_state_size], dim=2
        )
        discrete_time_step = self.dt_proj_weight[:, None] @ time_step

        A = -torch.exp(self.A_log.float())
        # 3.b. perform the recurrence y <- SSM(A, B, C)(x) with the fused kernels, one mamba head at a time
        time_proj_bias = self.dt_proj_bias.float() if self.dt_proj_bias is not None else None
        if use_precomputed_states:
            scan_outputs = torch.empty((batch_size, 0, 1), device=hidden_states.device, dtype=hidden_states.dtype)
            for n in range(self.n_mamba_heads):
                scan_outputs_ = selective_state_update(
                    cache_params.ssm_states[self.layer_idx][:, n],
                    hidden_states[n, ..., 0],
                    discrete_time_step[n, ..., 0],
                    A[n],
                    B[n, ..., 0],
                    C[n, ..., 0],
                    self.D[n],
                    gate[n, ..., 0],
                    time_proj_bias[n],
                    dt_softplus=True,
                ).unsqueeze(-1)
                scan_outputs = torch.cat((scan_outputs, scan_outputs_), dim=1)
        else:
            scan_outputs = torch.empty(
                (batch_size, 0, seq_len), device=hidden_states.device, dtype=hidden_states.dtype
            )
            ssm_states = torch.empty(
                (batch_size, 0, self.mamba_head_dim, self.ssm_state_size),
                device=hidden_states.device,
                dtype=hidden_states.dtype,
            )
            for n in range(self.n_mamba_heads):
                scan_outputs_, ssm_state_ = selective_scan_fn(
                    hidden_states[n],
                    discrete_time_step[n],
                    A[n],
                    B[n],
                    C[n],
                    self.D[n].float(),
                    gate[n],
                    time_proj_bias[n],
                    delta_softplus=True,
                    return_last_state=True,
                )
                scan_outputs = torch.cat((scan_outputs, scan_outputs_), dim=1).contiguous()
                ssm_states = torch.cat((ssm_states, ssm_state_.unsqueeze(1)), dim=1)
            if cache_params is not None:
                cache_params.ssm_states[self.layer_idx].copy_(ssm_states)

        # 4. Final linear projection
        contextualized_states = self.out_proj(scan_outputs.transpose(1, 2))
        return contextualized_states

    def slow_forward(self, input_states, cache_params: Optional[ZambaHybridDynamicCache] = None, attention_mask=None):
        batch_size, seq_len, _ = input_states.shape
        dtype = input_states.dtype
        # 1. Gated MLP's linear projection
        projected_states = self.in_proj(input_states).transpose(1, 2)
        hidden_states, gate = projected_states.view(batch_size, -1, 2, seq_len).chunk(2, dim=2)
        hidden_states = hidden_states.squeeze(2)
        gate = gate.squeeze(2)
        gate = gate.reshape(batch_size, self.n_mamba_heads, -1, seq_len).transpose(0, 1)
        use_cache = isinstance(cache_params, ZambaHybridDynamicCache)

        # 2. Convolution sequence transformation
        if use_cache and cache_params.ssm_states[self.layer_idx].shape[0] == batch_size:
            if self.training:
                # In training mode, we don't want to perform in-place operations on ssm_state so we can compute the backwards pass
                ssm_state = cache_params.ssm_states[self.layer_idx].clone()
            else:
                ssm_state = cache_params.ssm_states[self.layer_idx]
            ssm_state = ssm_state.to(hidden_states.device)

            if (
                cache_params.has_previous_state
                and seq_len == 1
                and cache_params.conv_states[self.layer_idx].shape[0] == batch_size
            ):
                conv_state = cache_params.conv_states[self.layer_idx]
                conv_state = torch.roll(conv_state, shifts=-1, dims=-1)
                conv_state[:, :, -1] = hidden_states[:, :, 0]
                cache_params.conv_states[self.layer_idx] = conv_state
                hidden_states = torch.sum(conv_state * self.conv1d.weight[:, 0, :], dim=-1)
                if self.use_conv_bias:
                    hidden_states += self.conv1d.bias
                # [batch, intermediate_size, 1] : decoding a single token
                hidden_states = self.act(hidden_states).to(dtype).unsqueeze(-1)
            else:
                if attention_mask is not None and not torch.all(attention_mask == 1):
                    hidden_states = hidden_states * attention_mask[:, -hidden_states.shape[-1] :].unsqueeze(1)
                conv_state = nn.functional.pad(hidden_states, (self.conv_kernel_size - hidden_states.shape[-1], 0))
                cache_params.conv_states[self.layer_idx] = conv_state
                # [batch, intermediate_size, seq_len]
                hidden_states = self.act(self.conv1d(hidden_states)[..., :seq_len])
                if attention_mask is not None and not torch.all(attention_mask == 1):
                    hidden_states = hidden_states * attention_mask[:, -hidden_states.shape[-1] :].unsqueeze(1)
        else:
            ssm_state = torch.zeros(
                (batch_size, self.n_mamba_heads, self.mamba_head_dim, self.ssm_state_size),
                device=hidden_states.device,
                dtype=dtype,
            )
            if attention_mask is not None and not torch.all(attention_mask == 1):
                hidden_states = hidden_states * attention_mask.unsqueeze(1)
            hidden_states = self.act(self.conv1d(hidden_states)[..., :seq_len])
            if attention_mask is not None and not torch.all(attention_mask == 1):
                hidden_states = hidden_states * attention_mask.unsqueeze(1)

        # 3. State Space Model sequence transformation
        # 3.a. input varying initialization of time_step, B and C, one set per mamba head
        hidden_states = hidden_states.reshape(-1, self.n_mamba_heads, self.mamba_head_dim, seq_len).transpose(0, 1)
        ssm_parameters = (self.x_proj_weight[:, None, :, :, None] * hidden_states[:, :, None]).sum(dim=3)

        time_step, B, C = torch.split(
            ssm_parameters, [self.time_step_rank, self.ssm_state_size, self.ssm_state_size], dim=2
        )
        discrete_time_step = self.dt_proj_weight[:, None] @ time_step + self.dt_proj_bias[:, None, :, None]
        discrete_time_step = nn.functional.softplus(discrete_time_step)

        # 3.b. discretize time_step, B and C: zero-order hold from continuous time to discrete time
        A = -torch.exp(self.A_log.float())
        discrete_A = torch.exp(A[:, None, :, None, :] * discrete_time_step[:, :, :, :, None])
        discrete_B = discrete_time_step[:, :, :, :, None] * B[:, :, None, :, :].transpose(-1, -2).float()
        deltaB_u = discrete_B * hidden_states[:, :, :, :, None].float()

        # 3.c. perform the recurrence y <- SSM(A, B, C)(x), one token at a time
        scan_outputs = []
        for i in range(seq_len):
            ssm_state = (
                discrete_A[:, :, :, i, :].transpose(0, 1) * ssm_state + deltaB_u[:, :, :, i, :].transpose(0, 1)
            )
            scan_output = torch.matmul(ssm_state.transpose(0, 1).to(dtype), C[:, :, :, i].unsqueeze(-1))
            scan_outputs.append(scan_output[..., 0])
        scan_output = torch.stack(scan_outputs, dim=-1)
        scan_output = scan_output + hidden_states * self.D[:, None, :, None]
        scan_output = scan_output * self.act(gate)

        if use_cache:
            cache_params.ssm_states[self.layer_idx] = ssm_state

        # 4. Final linear projection
        contextualized_states = self.out_proj(
            scan_output.transpose(0, 1).reshape(batch_size, -1, seq_len).transpose(1, 2)
        )
        return contextualized_states

    def forward(self, hidden_states, cache_params: Optional[ZambaHybridDynamicCache] = None, attention_mask=None):
        if self.use_fast_kernels:
            if not is_fast_path_available or "cuda" not in self.x_proj_weight.device.type:
                raise ValueError(
                    "Fast Mamba kernels are not available. Make sure they are installed and that the mamba module is"
                    " on a CUDA device. Please run 'pip install causal-conv1d>=1.2.0' and 'pip install mamba-ssm', or"
                    " set `use_mamba_kernels=False` in the model's config."
                )
            return self.cuda_kernels_forward(hidden_states, cache_params, attention_mask=attention_mask)
        return self.slow_forward(hidden_states, cache_params, attention_mask=attention_mask)


class ZambaMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))


class ZambaAttentionDecoderLayer(nn.Module):
    def __init__(self, config: ZambaConfig, layer_idx: Optional[int] = None):
        super().__init__()
        self.self_attn = ZambaAttention(config, layer_idx)
        self.feed_forward = ZambaMLP(config)
        self.input_layernorm = ZambaRMSNorm(config.attention_hidden_size, eps=config.rms_norm_eps)
        self.pre_ff_layernorm = ZambaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        original_hidden_states: torch.Tensor,
        layer_idx: int,
        attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[ZambaHybridDynamicCache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): output of previous Mamba layer of shape `(batch, seq_len, embed_dim)`
            original_hidden_states (`torch.FloatTensor`): word embedding output of shape `(batch, seq_len, embed_dim)`.
                This is concatenated with `hidden_states` (which is the output of the previous (mamba) layer). The
                concatenated tensor is then used as input of the pre-attention RMSNorm
                (see fig. 2 in https://huggingface.co/papers/2405.16712).
            layer_idx (`int`): layer_idx in the forward pass. Used to distinguish Zamba's tied transformer layers.
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_values (`ZambaHybridDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
        """
        hidden_states = torch.concatenate([hidden_states, original_hidden_states], dim=-1)
        hidden_states = self.input_layernorm(hidden_states)
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            layer_idx=layer_idx,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            output_attentions=output_attentions,
            use_cache=use_cache,
            **kwargs,
        )
        hidden_states = self.pre_ff_layernorm(hidden_states)
        hidden_states = self.feed_forward(hidden_states)

        outputs = (hidden_states,)
        if output_attentions:
            outputs += (self_attn_weights,)
        return outputs


class ZambaMambaDecoderLayer(nn.Module):
    def __init__(self, config: ZambaConfig, layer_idx: int):
        super().__init__()
        self.mamba = ZambaMambaMixer(config=config, layer_idx=layer_idx)
        self.input_layernorm = ZambaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.layer_idx = layer_idx

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        original_hidden_states: Optional[torch.Tensor] = None,
        layer_idx: Optional[int] = None,
        attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[ZambaHybridDynamicCache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        transformer_hidden_states: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_values (`ZambaHybridDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
        N)r9   r   r   )r   r  )r+   r9   r   rt   r   r   r   r   r   r  r  r   Zresidualr  r  r0   r0   r1   r:     s"    


zZambaMambaDecoderLayer.forward)	NNNNNFFNN)r?   r@   rA   r   r   r%   r   r'   r   r   rL   r  r   r;   r  r:   rB   r0   r0   r.   r1   r    s0            r  c                       s   e Zd Zeejed fddZeddddde	j
ee	j
 ee ee	j
 ee	j
 ee ee ee ee	j ee	jeee	je	jf  f d

ddZ  ZS )ZambaHybridLayer)shared_transflinearr  c                    s    t    || _|| _|| _d S rq   )r$   r%   r
  r  mamba_decoder)r+   r
  r  r  r.   r0   r1   r%     s    
zZambaHybridLayer.__init__r   r   r   r   NF)
r9   r   rt   r   r   r   r   r   r  rD   c
              
   C   sp   | j ||||||||	d}
|
d }|r.|
d }| |}| j|||||||	d}
|rl|
d |f|
dd  }
|
S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            original_hidden_states (`torch.FloatTensor`): word embedding output that will be concatenated with
            hidden activations to form the input of the shared transformer layer.
            layer_idx (`int`): layer number.
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_values (`ZambaHybridDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
        """
        layer_outputs = self.shared_transf(
            hidden_states,
            original_hidden_states=original_hidden_states,
            layer_idx=layer_idx,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
        )

        transformer_hidden_states = layer_outputs[0]
        if output_attentions:
            self_attn_weights = layer_outputs[1]

        transformer_hidden_states = self.linear(transformer_hidden_states)

        layer_outputs = self.mamba_decoder(
            hidden_states,
            transformer_hidden_states=transformer_hidden_states,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
        )

        if output_attentions:
            layer_outputs = (layer_outputs[0], self_attn_weights) + layer_outputs[2:]

        return layer_outputs


@auto_docstring
class ZambaPreTrainedModel(PreTrainedModel):
    config: ZambaConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["ZambaAttentionDecoderLayer", "ZambaMambaDecoderLayer"]
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn = True
    _supports_sdpa = False
    _is_stateful = True

    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, (nn.Linear, nn.Conv1d)):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, ZambaRMSNorm):
            module.weight.data.fill_(1.0)
        elif isinstance(module, ZambaMambaMixer):
            module.x_proj_weight.data.normal_(mean=0.0, std=std)
            dt_init_std = self.config.mamba_dt_rank**-0.5
            nn.init.uniform_(module.dt_proj_weight, -dt_init_std, dt_init_std)

            mamba_head_dim = self.config.mamba_expand * self.config.hidden_size // self.config.n_mamba_heads
            dt = torch.exp(
                torch.rand(self.config.n_mamba_heads, mamba_head_dim)
                * (math.log(self.config.time_step_max) - math.log(self.config.time_step_min))
                + math.log(self.config.time_step_min)
            ).clamp(min=self.config.time_step_floor)
            # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
            inv_dt = dt + torch.log(-torch.expm1(-dt))
            module.dt_proj_bias.data.copy_(inv_dt)

            A = torch.arange(1, self.config.mamba_d_state + 1, dtype=torch.float32)[None, :]
            A = A.expand(self.config.mamba_expand * self.config.hidden_size, -1).contiguous()
            module.A_log.data.copy_(torch.log(A).reshape(self.config.n_mamba_heads, mamba_head_dim, -1))
            module.D.data.fill_(1.0)


@auto_docstring
class ZambaModel(ZambaPreTrainedModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`ZambaDecoderLayer`]

    Args:
        config: ZambaConfig
    """

    def __init__(self, config: ZambaConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        block = ZambaAttentionDecoderLayer(config)
        mamba_layers = []
        linear_layers = []
        self.layers_block_type = config.layers_block_type
        for i in range(config.num_hidden_layers):
            if config.layers_block_type[i] == "mamba":
                mamba_layers.append(ZambaMambaDecoderLayer(config, layer_idx=i))
            elif config.layers_block_type[i] == "hybrid":
                linear_layers.append(nn.Linear(self.config.hidden_size, self.config.hidden_size, bias=False))
                mamba_layers.append(ZambaMambaDecoderLayer(config, layer_idx=i))
        mamba_layers = iter(mamba_layers)
        linear_layers = iter(linear_layers)
        layers = []
        self._tied_weights_keys = []
        for layer_id, layer_type in enumerate(self.layers_block_type):
            if layer_type == "hybrid":
                prefix_name = f"layers.{layer_id}."
                tied_keys = [
                    "shared_transf.self_attn.q_proj.weight",
                    "shared_transf.self_attn.k_proj.weight",
                    "shared_transf.self_attn.v_proj.weight",
                    "shared_transf.self_attn.o_proj.weight",
                    "shared_transf.feed_forward.gate_proj.weight",
                    "shared_transf.feed_forward.up_proj.weight",
                    "shared_transf.feed_forward.down_proj.weight",
                    "shared_transf.input_layernorm.weight",
                    "shared_transf.pre_ff_layernorm.weight",
                ]
                self._tied_weights_keys = [*self._tied_weights_keys, *[prefix_name + key for key in tied_keys]]
                layers.append(ZambaHybridLayer(block, next(linear_layers), next(mamba_layers)))
            else:
                layers.append(next(mamba_layers))
        self.layers = nn.ModuleList(layers)

        self._attn_implementation = config._attn_implementation
        self.final_layernorm = ZambaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[ZambaHybridDynamicCache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[tuple, BaseModelOutputWithPast]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError(
                "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
            )

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            use_cache = False

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)
        hidden_states = inputs_embeds

        # `original_hidden_states` (the word embeddings) is concatenated with the hidden activations
        # to form the input of the shared transformer block
        original_hidden_states = torch.clone(inputs_embeds)

        if use_cache and past_key_values is None:
            logger.warning_once(
                "Zamba requires an initialized `ZambaHybridDynamicCache` to return a cache. None was provided, so no"
                " cache will be returned."
            )

        if cache_position is None:
            cache_position = torch.arange(hidden_states.shape[1], device=hidden_states.device)
        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = self._update_causal_mask(attention_mask, inputs_embeds, cache_position)

        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None

        for layer_idx, layer in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer.__call__,
                    hidden_states,
                    original_hidden_states,
                    layer_idx,
                    causal_mask,
                    past_key_values,
                    output_attentions,
                    use_cache,
                    cache_position,
                )
            else:
                layer_outputs = layer(
                    hidden_states,
                    original_hidden_states=original_hidden_states,
                    layer_idx=layer_idx,
                    attention_mask=causal_mask,
                    past_key_values=past_key_values,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                    cache_position=cache_position,
                )
            hidden_states = layer_outputs[0]

            if output_attentions and layer_outputs[1] is not None:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.final_layernorm(hidden_states)

        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        if past_key_values and not past_key_values.has_previous_state:
            past_key_values.has_previous_state = True

        output = BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values if use_cache else None,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )
        return output if return_dict else output.to_tuple()

    def _update_causal_mask(self, attention_mask, input_tensor, cache_position):
        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and 0.0 in attention_mask:
                return attention_mask
            return None

        dtype, device = input_tensor.dtype, input_tensor.device
        min_dtype = torch.finfo(dtype).min
        sequence_length = input_tensor.shape[1]
        target_length = cache_position[-1] + 1

        causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
        if sequence_length != 1:
            causal_mask = torch.triu(causal_mask, diagonal=1)
        causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
        causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1)
        if attention_mask is not None:
            causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
            if attention_mask.dim() == 2:
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[..., :mask_length].eq(0.0) * attention_mask[:, None, None, :].eq(0.0)
                causal_mask[..., :mask_length] = causal_mask[..., :mask_length].masked_fill(padding_mask, min_dtype)

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type in ["cuda", "xpu", "npu"]
        ):
            # Attend to all tokens in fully masked rows, as required by the memory-efficient SDPA path.
            # Details: https://github.com/pytorch/pytorch/issues/110213
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask


class ZambaForCausalLM(ZambaPreTrainedModel, GenerationMixin):
    def __init__(self, config: ZambaConfig):
        super().__init__(config)
        self.model = ZambaModel(config)
        self._tied_weights_keys = ["lm_head.weight", *self.model._tied_weights_keys]
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[ZambaHybridDynamicCache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs,
    ) -> Union[tuple, CausalLMOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, ZambaForCausalLM

        >>> model = ZambaForCausalLM.from_pretrained("Zyphra/Zamba-7B-v1")
        >>> tokenizer = AutoTokenizer.from_pretrained("Zyphra/Zamba-7B-v1")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            cache_position=cache_position,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(logits, labels, self.vocab_size, **kwargs)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        cache_position=None,
        position_ids=None,
        use_cache=True,
        **kwargs,
    ):
        # Overwritten -- uses a unique cache type, `ZambaHybridDynamicCache`

        empty_past_kv = past_key_values is None

        # Omit tokens covered by past_key_values
        if not empty_past_kv:
            # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
            # Exception 1: when passing input_embeds, input_ids may be missing entries
            # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
            if inputs_embeds is not None or cache_position[-1] >= input_ids.shape[1]:
                input_ids = input_ids[:, -cache_position.shape[0] :]
            elif input_ids.shape[1] != cache_position.shape[0]:  # Default case (the "else", a no op, is Exception 2)
                input_ids = input_ids[:, cache_position]
        else:
            past_key_values = ZambaHybridDynamicCache(
                self.config, input_ids.shape[0], dtype=self.dtype, device=self.device
            )

        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if not empty_past_kv:
                position_ids = position_ids[:, -input_ids.shape[1] :]

        if inputs_embeds is not None and empty_past_kv:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids.contiguous()}  # `contiguous()` needed for compilation use cases

        model_inputs.update(
            {
                "position_ids": position_ids,
                "past_key_values": past_key_values,
                "use_cache": use_cache,
                "attention_mask": attention_mask,
                "logits_to_keep": self.config.num_logits_to_keep,
                "cache_position": cache_position,
            }
        )
        return model_inputs


@auto_docstring(
    custom_intro="""
    The Zamba Model with a sequence classification head on top (linear layer).

    [`ZambaForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    """
)
class ZambaForSequenceClassification(ZambaPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = ZambaModel(config)
        self._tied_weights_keys = self.model._tied_weights_keys
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, SequenceClassifierOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size = input_ids.shape[0]
        else:
            batch_size = inputs_embeds.shape[0]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
        if self.config.pad_token_id is None:
            last_non_pad_token = -1
        elif input_ids is not None:
            # To handle both left- and right-padding, we take the rightmost token that is not equal to pad_token_id
            non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
            token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
            last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
        else:
            last_non_pad_token = -1
            logger.warning_once(
                f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
            )

        pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(pooled_logits, labels)
        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


__all__ = ["ZambaForCausalLM", "ZambaForSequenceClassification", "ZambaModel", "ZambaPreTrainedModel"]