   @   s  d dl mZmZmZmZmZ d dlZd dlmZ d dlm	Z	 ddl
mZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZmZ ddlmZmZ ddlmZ ddlm Z m!Z!m"Z"m#Z# ddl$m%Z% ddl&m'Z'm(Z( ddl)m*Z* e( rd dl+m,Z, d dl-m.Z.m/Z/ ndZ,e' r8d dl0m1Z1m2Z2 nd\Z2Z1e#3e4Z5G dd deddZ6G dd dZ7G dd dej8Z9d d! Z:ej;e<ej;d"d#d$Z=dHej8ej;ej;ej;eej; e>e>ee  d&d'd(Z?dId)d*Z@G d+d, d,ej8ZAG d-d. d.ejj8ZBej;e<d/d0d1ZCd2d3 ZDd4d5 ZEeFe,e1e2fZGd6d7 ZHG d8d9 d9ej8ZIG d:d; d;ej8ZJed<G d=d> d>ej8ZKG d?d@ d@eZLe!G dAdB dBeZMe!G dCdD dDeMZNe!G dEdF dFeMeZOg dGZPdS )J    )AnyCallableOptional	TypedDictUnionN)nn)ACT2FN   )Cache)GenerationMixin)use_kernel_forward_from_hub)AttentionMaskConverter)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)deprecate_kwarg)is_causal_conv1d_availableis_mamba_2_ssm_available   )BambaConfig)selective_state_update)mamba_chunk_scan_combined mamba_split_conv1d_scan_combined)causal_conv1d_fncausal_conv1d_update)NNc                   @   s@   e Zd ZU dZejed< ejed< eed< eed< ejed< dS )BambaFlashAttentionKwargsa  
    Keyword arguments for advanced Flash Attention, causal-conv1d, and mamba_ssm kernel usage.
    Use cases include padding-free training and fewer `torch.compile` graph breaks.

    Attributes:
        cu_seq_lens_q (`torch.LongTensor`):
            Cumulative sequence lengths for the query state.
        cu_seq_lens_k (`torch.LongTensor`):
            Cumulative sequence lengths for the key state.
        max_length_q (`int`):
            Maximum sequence length for query state.
        max_length_k (`int`):
            Maximum sequence length for key state.
        seq_idx (`torch.IntTensor`):
            Index of each packed sequence.
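
    Example (an illustrative sketch; the tensor values below are made up and not part of the API):

    ```python
    >>> import torch

    >>> # Two packed sequences of lengths 3 and 2 flattened into a single row (padding-free layout).
    >>> seq_idx = torch.tensor([[0, 0, 0, 1, 1]], dtype=torch.int32)
    >>> cu_seq_lens = torch.tensor([0, 3, 5], dtype=torch.long)  # cumulative sequence boundaries
    >>> kwargs = BambaFlashAttentionKwargs(
    ...     cu_seq_lens_q=cu_seq_lens,
    ...     cu_seq_lens_k=cu_seq_lens,
    ...     max_length_q=3,
    ...     max_length_k=3,
    ...     seq_idx=seq_idx,
    ... )
    ```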
    Zcu_seq_lens_qZcu_seq_lens_kZmax_length_qZmax_length_kseq_idxN)	__name__
__module____qualname____doc__torch
LongTensor__annotations__int	IntTensor r/   r/   d/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/bamba/modeling_bamba.pyr$   @   s   



class HybridMambaAttentionDynamicCache:
    """
    A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
    (which has a constant shape regardless of seq_len).

    This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
    and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors, and the expected shape of each
    tensor depends on the layer type:
    For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
    while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors).
    For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors),
    while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`,
    and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`.
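
    A minimal usage sketch (the checkpoint name and sizes below are assumptions for illustration, and
    `generate()` normally builds this cache itself):

    ```python
    >>> import torch
    >>> from transformers import AutoTokenizer, BambaForCausalLM

    >>> model = BambaForCausalLM.from_pretrained("ibm-ai-platform/Bamba-9B")  # assumed checkpoint name
    >>> tokenizer = AutoTokenizer.from_pretrained("ibm-ai-platform/Bamba-9B")
    >>> inputs = tokenizer("Hello", return_tensors="pt")

    >>> cache = HybridMambaAttentionDynamicCache(
    ...     model.config, batch_size=inputs.input_ids.shape[0], dtype=model.dtype, device=model.device
    ... )
    >>> outputs = model(**inputs, past_key_values=cache, use_cache=True)
    ```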
    FNconfigc                    s0  |j | _ d| _|j}|j}g | _g | _g | _t|jD ]}| j | dkr|  jt	j
 |j|j d|j |  ||dg7  _|  jt	j
 |j|j||dg7  _q6|  jt	jg g  dg7  _|  jt	jg g  dg7  _| j| q6 fddt|jD | _ fddt|jD | _d S )	NFmamba   devicedtyper8   c                    s    g | ]}t jg g  d qS r:   r*   tensor.0_
batch_sizer8   r/   r0   
<listcomp>       z=HybridMambaAttentionDynamicCache.__init__.<locals>.<listcomp>c                    s    g | ]}t jg g  d qS r;   r<   r>   rA   r/   r0   rC      rD   )layers_block_typehas_previous_statemamba_d_convmamba_d_stateconv_states
ssm_statestransformer_layersrangenum_hidden_layersr*   Zzerosmamba_expandhidden_sizemamba_n_groupsmamba_n_headsmamba_d_headr=   append	key_cachevalue_cache)selfr4   rB   r9   r8   conv_kernel_sizessm_state_sizeir/   rA   r0   __init__i   sB    	
  z)HybridMambaAttentionDynamicCache.__init__)
key_statesvalue_states	layer_idxcache_kwargsreturnc                 C   sz   | j | jd dkr*|| j |< || j|< n<tj| j | |gdd| j |< tj| j| |gdd| j|< | j | | j| fS )Nr   r6   dim)rT   shaperU   r*   cat)rV   r[   r\   r]   r^   r/   r/   r0   update   s    
z'HybridMambaAttentionDynamicCache.update)beam_idxc                 C   s   t t| jD ]}| j| j}| j| d||| j|< | j| j}| j| d||| j|< | j| j}| j| d||| j|< | j| j}| j| d||| j|< qdS )zDReorders the cache for beam search, given the selected beam indices.r   N)	rL   lenrT   r8   Zindex_selecttorU   rI   rJ   )rV   rf   r]   r8   r/   r/   r0   reorder_cache   s    z.HybridMambaAttentionDynamicCache.reorder_cacher   )r]   r_   c                 C   s:   || j vr| j d n|}t| j|kr*dS | j| jd S )zYReturns the sequence length of the cached states. A layer index can be optionally passed.r   )rK   rg   rT   rc   )rV   r]   r/   r/   r0   get_seq_length   s    z/HybridMambaAttentionDynamicCache.get_seq_length)N)r   )r&   r'   r(   r)   Zis_compileabler*   Zfloat16r   rZ   Tensorr-   r   dictstrr   tuplere   r+   ri   rk   r/   r/   r/   r0   r2   Y   s   + r2   c                       sD   e Zd ZU ejed< ded fddZe e	dd Z
  ZS )	BambaRotaryEmbeddinginv_freqNr3   c                    s   t    t|dr:t|jtr:|jd|jd| _nd| _|j| _	|j| _
|| _t| j | _| | j|\}| _| jd|dd | j| _d S )Nrope_scaling	rope_typetypedefaultrq   F)
persistent)superrZ   hasattr
isinstancerr   rm   getrs   Zmax_position_embeddingsZmax_seq_len_cachedZoriginal_max_seq_lenr4   r   Zrope_init_fnattention_scalingZregister_bufferrq   Zoriginal_inv_freq)rV   r4   r8   rq   	__class__r/   r0   rZ      s    
zBambaRotaryEmbedding.__init__c           
      C   s   | j d d d d f  |jd dd|j}|d d d d d f  }t|jjtrl|jjdkrl|jjnd}t	j
|ddV | |  dd}t	j||fdd	}| | j }| | j }	W d    n1 s0    Y  |j|jd
|	j|jd
fS )Nr   r`   r   ZmpscpuF)device_typeZenabledr6   ra   r9   )rq   floatexpandrc   rh   r8   ry   rt   rn   r*   Zautocast	transposerd   cosr{   sinr9   )
rV   xposition_idsZinv_freq_expandedZposition_ids_expandedr   ZfreqsZembr   r   r/   r/   r0   forward   s    0&,zBambaRotaryEmbedding.forward)N)r&   r'   r(   r*   rl   r,   r   rZ   Zno_gradr   r   __classcell__r/   r/   r|   r0   rp      s
   

rp   c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..Nr`   r6   ra   )rc   r*   rd   )r   x1Zx2r/   r/   r0   rotate_half   s    r   )hidden_statesn_repr_   c                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
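
    A quick equivalence check (shapes are illustrative):

    ```python
    >>> import torch

    >>> x = torch.randn(2, 4, 6, 8)  # (batch, num_key_value_heads, seqlen, head_dim)
    >>> repeat_kv(x, 3).shape
    torch.Size([2, 12, 6, 8])
    >>> torch.equal(repeat_kv(x, 3), torch.repeat_interleave(x, repeats=3, dim=1))
    True
    ```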
    r   N)rc   r   reshape)r   r   batchnum_key_value_headsslenhead_dimr/   r/   r0   	repeat_kv   s
    0r           )modulequerykeyvalueattention_maskscalingdropoutkwargsc                 K   s   t || j}t || j}	t||dd| }
|d urf|d d d d d d d |jd f }|
| }
tjj|
dtj	d
|j}
tjj|
|| jd}
t|
|	}|dd }||
fS )Nr6   r	   rj   r`   )rb   r9   )ptrainingr   )r   num_key_value_groupsr*   matmulr   rc   r   
functionalZsoftmaxfloat32rh   r9   r   r   
contiguous)r   r   r   r   r   r   r   r   r[   r\   attn_weightscausal_maskattn_outputr/   r/   r0   eager_attention_forward   s    
&r   c                 C   s   | |}| |}|jd }| dd|f | d|df  }}|dd|f |d|df  }	}
|| t||  }|	| t|	|  }tj||gdd}tj||
gdd}||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Removes the interleaving of cos and sin from GLM

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising the query and key tensors rotated using the Rotary Position Embedding.
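
    Example (a shape-only sketch with illustrative sizes):

    ```python
    >>> import torch

    >>> batch, heads, seq_len, head_dim = 1, 4, 6, 8
    >>> q = torch.randn(batch, heads, seq_len, head_dim)
    >>> k = torch.randn(batch, heads, seq_len, head_dim)
    >>> cos = torch.randn(batch, seq_len, head_dim)
    >>> sin = torch.randn(batch, seq_len, head_dim)
    >>> q_embed, k_embed = apply_rotary_pos_emb(q, k, cos, sin)
    >>> q_embed.shape, k_embed.shape
    (torch.Size([1, 4, 6, 8]), torch.Size([1, 4, 6, 8]))
    ```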
    r`   .Nra   )	unsqueezerc   r   r*   rd   )qkr   r   r   Zunsqueeze_dimZ
rotary_dimZq_rotZq_passZk_rotZk_passZq_embedZk_embedr/   r/   r0   apply_rotary_pos_emb  s    


""r   c                       s   e Zd ZdZeed fddZedddddej	e
ej	ej	f eej	 ee eej ee e
ej	ej	f d
ddZ  ZS )BambaAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr4   r]   c                    s   t    || _|| _t|d|j|j | _|j|j | _	| jd | _
|j| _d| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j|j| j |jd| _tj|j| j |j|jd| _d S )Nr   g      Tbias)rw   rZ   r4   r]   getattrrO   Znum_attention_headsr   r   r   r   attention_dropoutZ	is_causalr   LinearZattention_biasq_projk_projv_projo_proj)rV   r4   r]   r|   r/   r0   rZ   3  s(    
zBambaAttention.__init__past_key_valuepast_key_values4.58new_nameversionN)r   position_embeddingsr   r   cache_positionr   r_   c                 K   s$  |j d d }g |d| jR }| ||dd}	| ||dd}
| ||dd}|\}}t|	|
||\}	}
|d ur|||d}||
|| j	|\}
}t
}| jjdkrt| jj }|| |	|
||f| jsdn| j| jd|\}}|jg |dR   }| |}||fS )Nr`   r   r6   )r   r   r   eagerr   )r   r   )rc   r   r   viewr   r   r   r   re   r]   r   r4   _attn_implementationr   r   r   r   r   r   r   )rV   r   r   r   r   r   r   Zinput_shapeZhidden_shapeZquery_statesr[   r\   r   r   r^   Zattention_interfacer   r   r/   r/   r0   r   J  s8    


zBambaAttention.forward)NN)r&   r'   r(   r)   r   r-   rZ   r   r*   rl   ro   r   r
   r+   r   r   r   r   r/   r/   r|   r0   r   0  s     r   c                       s(   e Zd Zd fdd	ZdddZ  ZS )	BambaRMSNormGatedư>c                    s&   t    tt|| _|| _d S Nrw   rZ   r   	Parameterr*   onesweightvariance_epsilonrV   rO   epsr|   r/   r0   rZ   x  s    
zBambaRMSNormGated.__init__Nc                 C   sj   |j }|tj}|d ur2|tj|tj }|djddd}|t	|| j
  }| j|| S Nr6   r`   T)Zkeepdim)r9   rh   r*   r   r   r   silupowmeanrsqrtr   r   )rV   r   gateinput_dtypevariancer/   r/   r0   r   }  s    zBambaRMSNormGated.forward)r   )Nr&   r'   r(   rZ   r   r   r/   r/   r|   r0   r   w  s   r   )input_tensorpad_sizec                 C   sH   t | jdkr"ddddd|ddfnddd|ddf}tjjj| |dddS )z
    Pads `input_tensor` with `pad_size` zeros along the seq_len dimension (dim=1).

    Assumes the input tensor has either 3 or 4 dimensions.
       r   Zconstant)moder   )rg   rc   r*   r   r   pad)r   r   Z	pad_shaper/   r/   r0   pad_tensor_by_size  s    2r   c                 C   s\   t | |} t| jdkr4| | jd d|| jd S | | jd d|| jd | jd S dS )z
    Pads `input_tensor` with `pad_size` zeros along the seq_len dimension (dim=1) and
    simultaneously splits it into chunk sequences.

    Assumes the input tensor has either 3 or 4 dimensions.
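
    A shape-only sketch (sizes are illustrative):

    ```python
    >>> import torch

    >>> x = torch.randn(2, 10, 8)           # (batch, seq_len, ...)
    >>> pad_tensor_by_size(x, 2).shape      # pad seq_len from 10 up to 12
    torch.Size([2, 12, 8])
    >>> reshape_into_chunks(x, 2, 4).shape  # then split into chunks of length 4
    torch.Size([2, 3, 4, 8])
    ```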
    r	   r   r`   r6   N)r   rg   rc   r   )r   r   
chunk_sizer/   r/   r0   reshape_into_chunks  s    
r   c                 C   s   |  d}| d jg |   |R  } tjtj||| jtjddd}| | d} tj| dd}tjtj||| jtjddd}|| tj	 }|S )zo
    More stable segment sum calculation. Uses cumulative sums and masking instead of direct subtractions.
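
    For a 1-D input `A` of length `T`, the result `S` is a `(T, T)` matrix with
    `S[i, j] = A[j + 1] + ... + A[i]` for `i > j`, zeros on the diagonal, and `-inf` above it
    (a descriptive sketch, not an excerpt of the implementation):

    ```python
    >>> import torch

    >>> A = torch.tensor([1.0, 2.0, 3.0])
    >>> S = segment_sum(A)
    >>> S[2, 0], S[1, 1]
    (tensor(5.), tensor(0.))
    ```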
    r`   .Nr7   Zdiagonalr   rj   ra   )
sizer   r*   Ztrilr   r8   boolmasked_fillcumsuminf)r   r   maskZtensor_segsumr/   r/   r0   segment_sum  s    
  r   c                 C   sN   |durJ|j d dkrJ|j d dkrJ| j}| |dddddf  |} | S )zm
    Tunes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66
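
    A quick numerical sketch (values are illustrative; note that the masking is skipped for batch size 1
    or single-token inputs):

    ```python
    >>> import torch

    >>> hidden_states = torch.ones(2, 3, 4)
    >>> attention_mask = torch.tensor([[1, 1, 1], [1, 1, 0]])  # second sequence has one padding token
    >>> apply_mask_to_padding_states(hidden_states, attention_mask)[1, :, 0]
    tensor([1., 1., 0.])
    ```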
    Nr   r   )rc   r9   rh   )r   r   r9   r/   r/   r0   apply_mask_to_padding_states  s    $ r   c                       s   e Zd ZdZeed fddZdeje	e
 e	ej e	ej e	ej dddZde	e
 e	ej e	ej d	d
dZde	e
 e	ej e	ej e	ej dddZ  ZS )
BambaMixeruP  
    Compute ∆, A, B, C, and D, the state space parameters, and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)

    There are a few differences between this and Mamba2Mixer:
    - The variable use_precomputed_states is slightly different due to the hybrid cache structure
    - A few non-obvious batching bugs in the slow path that exist in main have been fixed here
    - Some extra variables that our layer doesn't need have been removed
    - We ported most of the refactors in https://github.com/huggingface/transformers/pull/35154, which is (as of Dec 18, 2024) unmerged
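
    A tiny shape-only sketch (the config values below are made up, deliberately small, and not a
    recommended configuration):

    ```python
    >>> import torch
    >>> from transformers import BambaConfig

    >>> config = BambaConfig(hidden_size=64, mamba_n_heads=8)
    >>> mixer = BambaMixer(config, layer_idx=0)
    >>> hidden_states = torch.randn(1, 10, config.hidden_size)
    >>> mixer(hidden_states).shape
    torch.Size([1, 10, 64])
    ```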
    r   c                    s  t    |j| _|j| _|j| _|j| _t	|j
| j | _|| _|j| _|j| _t|j | _|j| _|j| _|j| _|j| _|j| _dtdf| _d| _d| _ | jd| j | j  | _!t"j#| j!| j!|j| j| j!| jd d| _$| j| j! | j }t"j%| j|| jd| _&t"'t()| j| _*t(+d| jd }t"'t(,|| _-d	| j-_.t/| j| jd
| _0t"'t()| j| _1d	| j1_.t"j%| j| j| jd| _2t3st45d n
t45d d S )Nr   r   gMbP?g?r6   r   )Zin_channelsZout_channelsr   Zkernel_sizegroupspaddingr   Tr   a  The fast path is not available because on of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1dzDThe fast path for Bamba will be used when running the model on a GPU)6rw   rZ   rQ   	num_headsrO   rH   rX   rG   rW   r-   rN   intermediate_sizer]   Zmamba_conv_biasuse_conv_bias
hidden_act
activationr   actZmamba_proj_biasZuse_biasrms_norm_epsZlayer_norm_epsilonrP   n_groupsrR   r   Zmamba_chunk_sizer   r   time_step_limitZtime_step_minZtime_step_maxconv_dimr   ZConv1dconv1dr   in_projr   r*   r   dt_biasarangelogA_logZ_no_weight_decayr   normDout_projis_fast_path_availableloggerwarning_once)rV   r4   r]   Zprojection_sizeAr|   r/   r0   rZ     s\    

	zBambaMixer.__init__N)r   cache_paramsr   r   r%   c                 C   s  t ||}| |}|j\}}}	| j| j }
|d uo|jo|dko|j| j jd |j| j jd   kop|kn  o|d uo|d dk}|rR|	dj
| j| j| jgdd\}}}t||j| j | jj	d| jj| j}tj
|| j|
|
gdd\}}}t| j  }|d d d df d d d d d f d| j| jjtjd}|d d d d d f dd| j}| jd d d df d| j}| jd d d df d| j}||| j|jd | j }||| j|jd | j }||| j| j}t|j| j ||||||d |dd
}||| j| j }| ||}|  |d d d df }nFt| j  }| j!d	td
fkr|i nd| j!i}| j"r|d u rt#|| jj	d| jj| j|f| j| j$|| j| jj| jj%| j j| j j| j| jddd|}n|j
| j| j| jgdd\}}}|d ur^|&dd}t'j()|| j*|jd  df}|j| j +| | jdvr| ,| |&dddd |f &dd}n2t-|&dd| jj	d| jj| j|d&dd}t ||}tj
|| j|
|
gdd\}}}t.|||d| j|||||| jd|||| jdf| j$| jd |d| jdd|\}}|d urt|d urt|j| j +| |||d}| ||}|  |}|S )Nr   r   r`   ra   .r   T)zr   dt_softplusr   r   Zdt_limitF)r   r   r%   r   Zrmsnorm_weightZrmsnorm_epsZoutproj_weightZoutproj_biasZheaddimZngroupsZnorm_before_gatereturn_final_statesr6   )r   Zswish)r   r   r   r   r%   )r   r   r  r%   r  r   r  )/r   r   rc   r   rX   rF   rI   r]   rJ   squeezesplitr   r   r   r#   r   r   r   r   r*   expr   r   r   r   rh   r   r   r   r   r   r   r   r   r   r!   r   r   r   r   r   r   rW   copy_r   r"   r    )rV   r   r  r   r   r%   projected_statesrB   seq_lenr@   Zgroups_time_state_sizeuse_precomputed_statesr   hidden_states_B_CdtBCr  r   r   Zhidden_states_reshapedoutZdt_limit_kwargshidden_states_B_C_transposedrI   scan_output	ssm_stater/   r/   r0   cuda_kernels_forward  s   	





<"
"

$




zBambaMixer.cuda_kernels_forward)r  r   r   c           3   
      s  |j \}}}|j}t||}|}	|	jjjjgdd\}
}}|d uo|jo|dko|j	j
 j d |jj
 j d   ko|kn  o|d uo|d dk}|r^|j	j
 jddd|j	j
< |d d dd d f |j	j
 j|j	j
 d d d d df< |j	j
 jjjjd}tj|jjd dd}jrR|jj }|}nr|d ur|dd}tj|j|j d  df}|j	j
 | |dddd |f dd}t||}tj|jjj jj gdd\}}}tj !  }|r|jj
 j}|d d dd d f d d d df }|dd"||j d j#}j$d	 "j$j d j#}tjj%|||j }t&|j'd j'd }|d
 "jj#jjtj(d}t|d	 | j|d}|)|jddd d d f }|"|jjj |j d * }|)|d|j d }|d	 |dd d d f  }|)|dj#}||d	  j|d}|jj
 |jj
 | |  |)|jddd d d f }|"|jjj |j d * }|)|d|j d }|jj
 j|j|jd}|+|j j#j}|+|j jd}t,||}|+|jj#}j-d	 "j-j d j#}|||  |j}|)|dd d d df }ntj%|j$ }t&|j'd j'd }|)||dj#! }|)||dj! }|)||dj! }|j.jj djd}|j.jj djd}j/|j/  j/  j-d	 t0|  }||d	  }||j| } fdd||||fD \}}}}|1dddd}tj2|dd}tt3|} |d d d d d d d d d d d f |d d d d d d d d d d d f  }!|!jdd}"|"d	 | 1dddddd	  }#|#jdd}$|$d	 |d d d d d f  jdd}%t|d d d d d d dd f | }&||&1ddddd	  }'|'dd d d f |d	  jdd}(|r0|jj
 d d d df j|(jd})nt4|(d d d df })tj5|)|(gdd}(tt3tj|d d d d d d df d}*|*dd}*|*d
 |(d d d d d df  jdd}+|+d d d df |+d d df  }(},t|}-|dd d d f |(d d d d d df  }.|-1dddd}/|.d|/d	  }0|%|0 }|)|djj#}|| } dkr|d d d |d d d d f }|)||d}|,d ur|d ur|jj
 |, 6||
}17|1|}2|2S )Nr`   ra   r   r   )Zshiftsdimsr:   r6   .r   ).NNr   r7   )rb   Zoutput_sizec                    s   g | ]}t | jqS r/   )r   r   )r?   tr   rV   r/   r0   rC   R  rD   z,BambaMixer.torch_forward.<locals>.<listcomp>r	   r   rj   )r   r   )8rc   r9   r   r   r  r   r   r   rF   rI   r]   rJ   Zrollrh   r8   r   r   r*   sumr  r   r   r   r   r   r   r   rW   r	  r   rX   r  r   r   r   r   r   Zsoftplusclampr   r   r   r   r   Zbmmr   Zrepeat_interleaver   r   Zpermuter   r   Z
zeros_likerd   r   r   )3rV   Zinput_statesr  r   r   rB   r  r@   r9   r
  r   r  r  r  rI   r  r   r  r  r  Zcache_devicer   ZdAZdBZdBxrJ   Zssm_states_reshapedZ
C_reshapedyr   Z
D_residualZA_cumsumLZG_intermediateGZM_intermediateMZY_diagZdecay_statesZB_decayZstatesZprevious_statesZdecay_chunkZ
new_statesr  Zstate_decay_outZC_times_statesZstate_decay_out_permutedZY_offr  Zcontextualized_statesr/   r  r0   torch_forward  s    


@
,
$"$$$P&*"&0(&
*
 zBambaMixer.torch_forward)r  r   r   r%   c                 K   s   t r&d| jjjjv r&| |||||S |d ur6td|j}|d ur|jd dkr|jd dkr||d d d d d f  	|}| 
||||S )Ncudaz\`seq_idx` support requires fast path support. Please install `mamba_ssm` and `causal_conv1d`r   r   )r   r   r   r8   rt   r  NotImplementedErrorr9   rc   rh   r  )rV   r   r  r   r   r%   r   r9   r/   r/   r0   r     s    	$ zBambaMixer.forward)NNNN)NNN)NNNN)r&   r'   r(   r)   r   r-   rZ   r*   rl   r   r2   r+   r.   r  r  r   r   r/   r/   r|   r0   r     sB   F     .    S    r   c                       s$   e Zd Z fddZdd Z  ZS )BambaMLPc                    sx   t    || _|j| _|j| _tj| j| j|jd| _tj| j| j|jd| _	tj| j| j|jd| _
t|j | _d S )Nr   )rw   rZ   r4   rO   r   r   r   Zmlp_bias	gate_projup_proj	down_projr   r   act_fnrV   r4   r|   r/   r0   rZ     s    
zBambaMLP.__init__c                 C   s$   |  | | || | }|S r   )r%  r&  r#  r$  )rV   r   r%  r/   r/   r0   r     s     zBambaMLP.forwardr   r/   r/   r|   r0   r"    s   
r"  ZRMSNormc                       s.   e Zd Zd fdd	Zdd Zdd Z  ZS )	BambaRMSNormr   c                    s&   t    tt|| _|| _dS )z;
        BambaRMSNorm is equivalent to T5LayerNorm
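
        It normalizes by the root-mean-square over the last dimension and applies a learned scale,
        with no mean subtraction and no bias. A quick numerical sketch (illustrative sizes):

        ```python
        >>> import torch

        >>> norm = BambaRMSNorm(8, eps=1e-6)
        >>> x = torch.randn(2, 5, 8)
        >>> y = norm(x)
        >>> torch.allclose(y, x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + 1e-6))
        True
        ```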
        Nr   r   r|   r/   r0   rZ     s    
zBambaRMSNorm.__init__c                 C   sJ   |j }|tj}|djddd}|t|| j  }| j|| S r   )	r9   rh   r*   r   r   r   r   r   r   )rV   r   r   r   r/   r/   r0   r     s
    zBambaRMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)ro   r   rc   r   )rV   r/   r/   r0   
extra_repr  s    zBambaRMSNorm.extra_repr)r   )r&   r'   r(   rZ   r   r)  r   r/   r/   r|   r0   r(    s   r(  c                       s   e Zd Zdeeed fddZedddddej	e
ej	 e
ej e
e e
e e
e e
ej e
eej	ej	f  ee eeje
eejejf  f d
ddZ  ZS )BambaDecoderLayerr5   )r4   r]   
layer_typec                    s   t    d}|dkrtnd }||| _t|j|jd| _t|j|jd| _|| _	|dkrjt
||d| _n|dkrt||| _ntdd S )Nr   r   r5   r   	attentionzInvalid layer_type)rw   rZ   r"  feed_forwardr(  rO   r   input_layernormpre_ff_layernormr+  r   r5   r   	self_attn
ValueError)rV   r4   r]   r+  Znum_expertsZffn_layer_classr|   r/   r0   rZ     s    

zBambaDecoderLayer.__init__r   r   r   r   NF)
r   r   r   r   output_attentions	use_cacher   r   r   r_   c	                 K   s   |}
|  |}| jdkr8| jf ||||d|	}d}n0| jdkrh| jf ||||||||d|	\}}|
| }|}
| |}| |}|
| }|f}|r||f7 }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_values (`HybridMambaAttentionDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs. Can be used to provide `BambaFlashAttentionKwargs` for
                padding-free training and/or improve torch.compile performance.
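
        Example (a shape-only sketch with tiny, made-up config values):

        ```python
        >>> import torch
        >>> from transformers import BambaConfig

        >>> config = BambaConfig(hidden_size=64, intermediate_size=128, mamba_n_heads=8)
        >>> layer = BambaDecoderLayer(config, layer_idx=0, layer_type="mamba")
        >>> hidden_states = torch.randn(1, 6, config.hidden_size)
        >>> layer(hidden_states)[0].shape
        torch.Size([1, 6, 64])
        ```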
        r5   )r   r  r   r   Nr,  )r   r   r   r   r2  r3  r   r   )r.  r+  r5   r0  r/  r-  )rV   r   r   r   r   r2  r3  r   r   r   ZresidualZself_attn_weightsoutputsr/   r/   r0   r     sD    #


	



zBambaDecoderLayer.forward)r5   )NNNFFNN)r&   r'   r(   r   r-   rn   rZ   r   r*   rl   r   r+   r2   r   ro   r   r$   FloatTensorr   r   r/   r/   r|   r0   r*    s*          r*  c                       sD   e Zd ZU eed< dZdZdgZdZdZ	dZ
dZ fddZ  ZS )BambaPreTrainedModelr4   modelTr*  r   c                    sR   t  | t|trN|jjd tt	d|j
d |j_|jjd d S )Ng      ?r   )rw   _init_weightsry   r   r   dataZfill_r*   r   r   r   r   r   )rV   r   r|   r/   r0   r8  A  s
    
z"BambaPreTrainedModel._init_weights)r&   r'   r(   r   r,   Zbase_model_prefixZsupports_gradient_checkpointingZ_no_split_modulesZ_skip_keys_device_placementZ_supports_flash_attnZ_supports_sdpaZ_is_statefulr8  r   r/   r/   r|   r0   r6  5  s   
r6  c                       s   e Zd Zed fddZeedeej	 eej
 eej	 ee eej ee ee ee eej	 ee edddZej
ej
ej
eedd	d
Zeej
eeejej
edddZdd Z  ZS )
BambaModelr3   c                    s   t  | |j| _|j| _t|j|j| j| _g }t	|j
D ]}|t|||j| d q@t|| _|j| _t|j|jd| _t|d| _d| _|   d S )N)r]   r+  r   r3   F)rw   rZ   Zpad_token_idZpadding_idx
vocab_sizer   Z	EmbeddingrO   embed_tokensrL   rM   rS   r*  rE   Z
ModuleListlayersr   r(  r   final_layernormrp   
rotary_embgradient_checkpointing	post_init)rV   r4   Zdecoder_layersrY   r|   r/   r0   rZ   K  s    zBambaModel.__init__N)	input_idsr   r   r   inputs_embedsr3  r2  output_hidden_statesr   r   r_   c
                 K   s  |d ur|n| j j}|d ur |n| j j}|d ur4|n| j j}|d u |d uA rTtd| jrr| jrr|rrtd d}|d u r| 	|}|}|r|d u rtd |	d u rt
j|jd |jd}	|d u r|	d}| |||	||}| ||	}| ||}|rdnd }|rdnd }| jD ]t}|jd	kr.|n|}|rB||f7 }||f||||||	|d
|
}|d }|r|d d ur||d f7 }q| |}|r||f7 }|r|jsd|_|sd n|}t||||dS )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.FzBamba requires an initialized `HybridMambaAttentionDynamicCache` to return a cache. None was provided, so no cache will be returned.r   r:   r   r/   r5   )r   r   r   r2  r3  r   r   T)last_hidden_stater   r   
attentions)r4   r2  rD  r3  r1  r@  r   r   r   r<  r*   r   rc   r8   r   _update_causal_mask_update_mamba_maskr?  r=  r+  r>  rF   r   )rV   rB  r   r   r   rC  r3  r2  rD  r   r   r   r   
mamba_maskr   Zall_hidden_statesZall_self_attnsZdecoder_layerZ
layer_maskZlayer_outputsZ
next_cacher/   r/   r0   r   ^  s|    




	

zBambaModel.forward)r   r   r   r   r2  c                 C   s   | j jdkr$|d ur d|v r |S d S |d ur4| nd}| j jdkr`|s`tj|||| jdr`d S |j}|jd }t|t	j
r|jd n
|| d }	| j|||	|||jd d}
| j jdkr|d ur|jjd	v r|st	|j}t|
|}
|
S )
NZflash_attention_2r   r   Zsdpa)rC  Zpast_key_values_lengthZis_trainingr   r`   )sequence_lengthtarget_lengthr9   r   rB   )r   ZxpuZnpu)r4   r   rk   r   Z_ignore_causal_mask_sdpar   r9   rc   ry   r*   rl   5_prepare_4d_causal_attention_mask_with_cache_positionr8   rt   finfominZ_unmask_unattended)rV   r   r   r   r   r2  Zpast_seen_tokensr9   rJ  rK  r   	min_dtyper/   r/   r0   rG    sL    





	zBambaModel._update_causal_mask)r   rJ  rK  r9   r   rB   c                 K   s~  | dur|   dkr| }n^t|j}tj||f|||jd}|dkrVtj|dd}|tj||jd|ddk9 }|ddddddf 	|ddd}| durz|
 }| jd }	| ddddddf | ddddddf kdddd| dddf |}
|ddddddd|	f |
 }|dk}|ddddddd|	f |||ddddddd|	f< |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`int`):
                Batch size.
        Nr   )Z
fill_valuer9   r8   r   r   r:   r`   r   )rb   r*   rM  rN  fullr8   Ztriur   r   r   clonerc   rh   r   )r   rJ  rK  r9   r   rB   r   r   rO  Zmask_lengthZpadding_attention_maskZpadding_maskr/   r/   r0   rL    s0     $

.$  z@BambaModel._prepare_4d_causal_attention_mask_with_cache_positionc                 C   s.   |}|d dks&|dur*t |dkr*d}|S )zv
        No need for zeroing states when
            1. Cached forward
            2. Attending to all inputs
        r   Nr   )r*   all)rV   r   r   rI  r/   r/   r0   rH  6  s    "zBambaModel._update_mamba_mask)	NNNNNNNNN)r&   r'   r(   r   rZ   r   r   r   r*   r+   rl   r2   r5  r   r   r$   r   r   rG  staticmethodr-   r9   rL  rH  r   r/   r/   r|   r0   r:  I  sP            d<7r:  c                       s   e Zd ZdgZddiZddgdgfiZ fddZeede	e
j e	e
j e	e
j e	e e	e
j e	e
j e	e e	e e	e e	e
j eee
jf ed
ddZdddZ  ZS )BambaForCausalLMzlm_head.weightlm_headZcolwise_repr   logitsc                    sH   t  | t|| _|j| _tj|j|jdd| _|j	| _	| 
  d S )NFr   )rw   rZ   r:  r7  r;  r   r   rO   rU  z_loss_coefficientrA  r'  r|   r/   r0   rZ   H  s    
zBambaForCausalLM.__init__Nr   )rB  r   r   r   rC  labelsr3  r2  rD  r   logits_to_keepr_   c                 K   s   |dur|n| j j}|	dur |	n| j j}	| jf ||||||||	|
d	|}|j}t|trht| dn|}| |dd|ddf }d}|dur| j	f ||| j j
d|}| jdkr|jddj|jdd }|| j|  }t|||j|j|jd	S )
aJ  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, BambaForCausalLM

        >>> model = BambaForCausalLM.from_pretrained("...")
        >>> tokenizer = AutoTokenizer.from_pretrained("...")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)	rB  r   r   r   rC  r3  r2  rD  r   )rV  rX  r;  r   r`   ra   r   r6   )lossrV  r   r   rF  )r4   r2  rD  r7  rE  ry   r-   slicerU  Zloss_functionr;  rW  Z	logsumexprh   r9   r   r   r   r   r   rF  )rV   rB  r   r   r   rC  rX  r3  r2  rD  r   rY  r   r4  r   Zslice_indicesrV  rZ  Zz_lossr/   r/   r0   r   R  s@    '

 zBambaForCausalLM.forwardTc              	   K   s  |d u }	|	sj|d us&|d |j d krD|d d |j d  d f }q|j d |j d kr|d d |f }nt| j|j d | j| jd}|d ur|d u r| dd }||dkd |	s|d d |j d  d f }|d ur|	rd|i}
nd| i}
|
	||||| jj
|d |
S )Nr`   r   r   r:   rC  rB  )r   r   r3  r   rY  r   )rc   r2   r4   r9   r8   longr   Zmasked_fill_r   re   Znum_logits_to_keep)rV   rB  r   r   rC  r   r   r3  r   Zempty_past_kvZmodel_inputsr/   r/   r0   prepare_inputs_for_generation  s<    

z.BambaForCausalLM.prepare_inputs_for_generation)NNNNNNNNNNr   )NNNNNT)r&   r'   r(   Z_tied_weights_keysZ_tp_planZ_pp_planrZ   r   r   r   r*   r+   rl   r2   r5  r   r   r-   r   r   r]  r   r/   r/   r|   r0   rT  B  sL   


__all__ = ["BambaModel", "BambaForCausalLM", "BambaPreTrainedModel"]