a
    h[I                 	   @   sV  d dl mZmZmZmZmZ d dlZd dlm  m	Z
 d dlmZ d dlmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZmZ ddlmZmZ ddlmZmZ ddlm Z  ddl!m"Z"m#Z#m$Z$m%Z% ddl&m'Z' ddl(m)Z)m*Z* ddl+m,Z, e* r$d dl-m.Z. d dl/m0Z0m1Z1 ndZ.e) rBd dl2m3Z3m4Z4 nd\Z4Z3e$ rjd dl5m6Z6 ddl7m8Z8 e%9e:Z;dd Z<dRddZ=ej>e?ej>dddZ@dSejAej>ej>ej>eej> eBeBd!d"d#ZCG d$d% d%ejAZDG d&d' d'ZEej>e?d(d)d*ZFd+d, ZGd-d. ZHeIe.e3e4fZJd/d0 ZKG d1d2 d2ejAZLG d3d4 d4ejjAZMG d5d6 d6ejAZNG d7d8 d8ed9d:ZOG d;d< d<ejAZPG d=d> d>ejAZQG d?d@ d@ejAZRG dAdB dBejAZSG dCdD dDeZTe"G dEdF dFeZUG dGdH dHejAZVe"G dIdJ dJeUZWdTeej>eXej> df ee? eej> eej>e?f dLdMdNZYG dOdP dPeUeZZg dQZ[dS )U    )AnyCallableOptional	TypedDictUnionN)nn)ACT2FN   )Cache)GenerationMixin)AttentionMaskConverter)GradientCheckpointingLayer)BaseModelOutputWithPastMoeCausalLMOutputWithPastMoeModelOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringcan_return_tupleis_torch_flex_attn_availablelogging)deprecate_kwarg)is_causal_conv1d_availableis_mamba_2_ssm_available   )GraniteMoeHybridConfig)selective_state_update)mamba_chunk_scan_combined mamba_split_conv1d_scan_combined)causal_conv1d_fncausal_conv1d_updateNN)	BlockMask)make_flex_block_causal_maskc                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..N   dim)shapetorchcat)xx1Zx2 r0   z/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/granitemoehybrid/modeling_granitemoehybrid.pyrotate_halfA   s    r2   c                 C   sD   | |}| |}| | t| |  }|| t||  }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer2   )qkcossinposition_idsZunsqueeze_dimZq_embedZk_embedr0   r0   r1   apply_rotary_pos_embH   s
    

r9   )hidden_statesn_repreturnc                 C   s^   | j \}}}}|dkr| S | dddddddddf |||||} | ||| ||S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)r+   expandreshape)r:   r;   batchnum_key_value_headsslenhead_dimr0   r0   r1   	repeat_kvc   s
    0rC           )modulequerykeyvalueattention_maskscalingdropoutc                 K   s   t || j}t || j}	t||dd| }
|d urf|d d d d d d d |jd f }|
| }
tjj|
dtj	d
|j}
tjj|
|| jd}
t|
|	}|dd }||
fS )Nr(   r	   r'   )r*   dtype)ptrainingr   )rC   num_key_value_groupsr,   matmul	transposer+   r   
functionalsoftmaxfloat32torM   rK   rO   
contiguous)rE   rF   rG   rH   rI   rJ   rK   kwargs
key_statesvalue_statesattn_weightscausal_maskattn_outputr0   r0   r1   eager_attention_forwardo   s    
&r^   c                       s   e Zd ZdZeed fddZedddddej	e
ej	 e
ej e
e ee
ej e
eej	ej	f  eej	e
ej	 e
eej	  f dddZ  ZS )GraniteMoeHybridAttentionz=Multi-headed attention from 'Attention Is All You Need' paperconfig	layer_idxc                    s   t    || _|| _|d u r4td| jj d |j| _|j	| _	|j
| _| j	| j | _|j| _| j| j | _d| _|j| _| j| j | j	krtd| j	 d| j dtj| j	| j| j |jd| _tj| j	| j| j |jd| _tj| j	| j| j |jd| _tj| j	| j	|jd| _d S )NzInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.Tz?hidden_size must be divisible by num_heads (got `hidden_size`: z and `num_heads`: z).bias)super__init__ra   rb   loggerwarning_once	__class____name__attention_dropouthidden_sizenum_attention_heads	num_headsrB   r@   rP   Z	is_causalZattention_multiplierrJ   
ValueErrorr   LinearZattention_biasq_projk_projv_projo_projselfra   rb   ri   r0   r1   rf      s2    

z"GraniteMoeHybridAttention.__init__past_key_valuepast_key_values4.58new_nameversionNF)r:   rI   r8   ry   	use_cachecache_positionposition_embeddingsr<   c                 K   sF  |  \}	}
}| |}| |}| |}||	|
| j| jdd}||	|
| j| jdd}||	|
| j| jdd}|d ur|nd\}}|d urt	||||\}}|d ur|||d}|
||| j|\}}t}| jjdkrt| jj }|| ||||f| jsdn| j| jd|\}}||	|
d}| |}||fS )	Nr   r(   r$   )r7   r6   r   eagerrD   )rK   rJ   r'   )sizerq   rr   rs   viewrn   rB   rR   r@   r9   updaterb   r^   ra   _attn_implementationr   rO   rk   rJ   rt   )rv   r:   rI   r8   ry   r~   r   r   rX   bszZq_len_Zquery_statesrY   rZ   r6   r7   cache_kwargsZattention_interfacer]   r[   r0   r0   r1   forward   s>    




z!GraniteMoeHybridAttention.forward)NNNFNN)rj   
__module____qualname____doc__r   intrf   r   r,   Tensorr   
LongTensorr
   booltupler   __classcell__r0   r0   rw   r1   r_      s&          r_   c                   @   s   e Zd ZdZdZejdfedddZdej	ej	e
eeeef  eej	ej	f ddd	Zejd
ddZdee
 e
dddZdS ) HybridMambaAttentionDynamicCachea  
    A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
    (which has a constant shape regardless of seq_len).

    This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
    and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor
    For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
    while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors).
    For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors),
    while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`,
    and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`.
    FNra   c                    s0  |j | _ d| _|j}|j}g | _g | _g | _t|jD ]}| j | dkr|  jt	j
 |j|j d|j |  ||dg7  _|  jt	j
 |j|j||dg7  _q6|  jt	jg g  dg7  _|  jt	jg g  dg7  _| j| q6 fddt|jD | _ fddt|jD | _d S )	NFmambar(   devicerM   r   c                    s    g | ]}t jg g  d qS r   r,   tensor.0r   
batch_sizer   r0   r1   
<listcomp>      z=HybridMambaAttentionDynamicCache.__init__.<locals>.<listcomp>c                    s    g | ]}t jg g  d qS r   r   r   r   r0   r1   r     r   )layers_block_typehas_previous_statemamba_d_convmamba_d_stateconv_states
ssm_statestransformer_layersrangenum_hidden_layersr,   zerosmamba_expandrl   mamba_n_groupsmamba_n_headsmamba_d_headr   append	key_cachevalue_cache)rv   ra   r   rM   r   conv_kernel_sizessm_state_sizeir0   r   r1   rf      sB    	
  z)HybridMambaAttentionDynamicCache.__init__)rY   rZ   rb   r   r<   c                 C   sz   | j | jd dkr*|| j |< || j|< n<tj| j | |gdd| j |< tj| j| |gdd| j|< | j | | j| fS )Nr'   r   r(   r)   )r   r+   r   r,   r-   )rv   rY   rZ   rb   r   r0   r0   r1   r     s    
z'HybridMambaAttentionDynamicCache.update)beam_idxc                 C   s   t t| jD ]}| j| j}| j| d||| j|< | j| j}| j| d||| j|< | j| j}| j| d||| j|< | j| j}| j| d||| j|< qdS )zDReorders the cache for beam search, given the selected beam indices.r   N)	r   lenr   r   Zindex_selectrV   r   r   r   )rv   r   rb   r   r0   r0   r1   reorder_cache*  s    z.HybridMambaAttentionDynamicCache.reorder_cacher   )rb   r<   c                 C   s:   || j vr| j d n|}t| j|kr*dS | j| jd S )zYReturns the sequence length of the cached states. A layer index can be optionally passed.r   rL   )r   r   r   r+   )rv   rb   r0   r0   r1   get_seq_length7  s    z/HybridMambaAttentionDynamicCache.get_seq_length)N)r   )rj   r   r   r   is_compileabler,   Zfloat16r   rf   r   r   r   dictstrr   r   r   r   r   r   r0   r0   r0   r1   r      s   + r   )input_tensorpad_sizec                 C   sH   t | jdkr"ddddd|ddfnddd|ddf}tjjj| |dddS )z
    Padding x tensor with `pad_size` on the seq_len dim (dim=1)

    Assumes that we only have tensors of either size 4 or 3
       r   Zconstant)moderH   )r   r+   r,   r   rS   pad)r   r   Z	pad_shaper0   r0   r1   pad_tensor_by_sizeC  s    2r   c                 C   s\   t | |} t| jdkr4| | jd d|| jd S | | jd d|| jd | jd S dS )z
    Padding input_tensor with `pad_size` on the seq_len dim (dim=1) and
    simultaneously splitting it into chunk sequences.

    Assumes that we only have tensors of either size 4 or 3
    r	   r   r'   r(   N)r   r   r+   r>   )r   r   
chunk_sizer0   r0   r1   reshape_into_chunksN  s    
r   c                 C   s   |  d}| d jg |   |R  } tjtj||| jtjddd}| | d} tj| dd}tjtj||| jtjddd}|| tj	 }|S )zo
    More stable segment sum calculation. Uses cumulative sums and masking instead of direct subtractions.
    r'   .Nr   Zdiagonalr   rL   r)   )
r   r=   r,   Ztrilonesr   r   masked_fillcumsuminf)r   r   maskZtensor_segsumr0   r0   r1   segment_sumb  s    
  r   c                 C   sN   |durJ|j d dkrJ|j d dkrJ| j}| |dddddf  |} | S )zm
    Tunes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66
    Nr   r   )r+   rM   rV   )r:   rI   rM   r0   r0   r1   apply_mask_to_padding_statesy  s    $ r   c                       s   e Zd ZdZeed fddZdeje	e
 e	ej e	ej e	ej dddZde	e
 e	ej e	ej d	d
dZde	e
 e	ej e	ej e	ej dddZ  ZS )GraniteMoeHybridMambaLayeruP  
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)

    The are a few differences between this and Mamba2Mixer:
    - The variable use_precomputed_states is slightly different due to the hybrid cache structure
    - There's a few non-obvious bugs fixed with batching in the slow path that exist in main
    - Some extra variables that our layer doesn't need have been removed
    - We ported most of the refactors in https://github.com/huggingface/transformers/pull/35154, which is (as of Dec 18, 2024) unmerged
    r`   c                    s  t    |j| _|j| _|j| _|j| _t	|j
| j | _|| _|j| _|j| _t|j | _|j| _|j| _|j| _|j| _|j| _dtdf| _d| _d| _ | jd| j | j  | _!t"j#| j!| j!|j| j| j!| jd d| _$| j| j! | j }t"j%| j|| jd| _&t"'t()| j| _*t(+d| jd }t"'t(,|| _-d	| j-_.t/| j| jd
| _0t"'t()| j| _1d	| j1_.t"j%| j| j| jd| _2t3st45d n
t45d d S )NrD   r   gMbP?g?r(   r   )Zin_channelsZout_channelsrd   Zkernel_sizegroupspaddingrc   Tepsa  The fast path is not available because on of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1dzOThe fast path for GraniteMoeHybrid will be used when running the model on a GPU)6re   rf   r   rn   rl   r   r   r   r   r   r   intermediate_sizerb   Zmamba_conv_biasuse_conv_bias
hidden_act
activationr   actZmamba_proj_biasZuse_biasrms_norm_epsZlayer_norm_epsilonr   n_groupsr   rB   Zmamba_chunk_sizer   floattime_step_limitZtime_step_minZtime_step_maxconv_dimr   ZConv1dconv1drp   in_proj	Parameterr,   r   dt_biasarangelogA_logZ_no_weight_decayGraniteMoeHybridRMSNormGatednormDout_projis_fast_path_availablerg   rh   )rv   ra   rb   Zprojection_sizeArw   r0   r1   rf     s\    

	z#GraniteMoeHybridMambaLayer.__init__N)r:   cache_paramsr   rI   seq_idxc                 C   s  t ||}| |}|j\}}}	| j| j }
|d uo|jo|dko|j| j jd |j| j jd   kop|kn  o|d uo|d dk}|rR|	dj
| j| j| jgdd\}}}t||j| j | jj	d| jj| j}tj
|| j|
|
gdd\}}}t| j  }|d d d df d d d d d f d| j| jjtjd}|d d d d d f dd| j}| jd d d df d| j}| jd d d df d| j}||| j|jd | j }||| j|jd | j }||| j| j}t|j| j ||||||d |dd
}||| j| j }| ||}|  |d d d df }nFt| j  }| j!d	td
fkr|i nd| j!i}| j"r|d u rt#|| jj	d| jj| j|f| j| j$|| j| jj| jj%| j j| j j| j| jddd|}n|j
| j| j| jgdd\}}}|d ur^|&dd}t'j()|| j*|jd  df}|j| j +| | jdvr| ,| |&dddd |f &dd}n2t-|&dd| jj	d| jj| j|d&dd}t ||}tj
|| j|
|
gdd\}}}t.|||d| j|||||| jd|||| jdf| j$| jd |d| jdd|\}}|d urt|d urt|j| j +| |||d}| ||}|  |}|S )Nr   r   r'   r)   .rM   T)zr   dt_softplusrD   r   Zdt_limitF)r   r   r   r   Zrmsnorm_weightZrmsnorm_epsZoutproj_weightZoutproj_biasZheaddimZngroupsZnorm_before_gatereturn_final_statesr(   )siluZswish)r.   weightrd   r   r   )r   r   r   r   r   r   r   )/r   r   r+   r   r   r   r   rb   r   squeezesplitr   r   rn   r#   r   r   rd   r   r,   expr   r   r=   rB   rV   rU   r   r   r   r   r   r   r   rO   r!   r   variance_epsilonrR   r   rS   r   r   copy_r   r"   r    )rv   r:   r   r   rI   r   projected_statesr   seq_lenr   Zgroups_time_state_sizeuse_precomputed_statesgatehidden_states_B_CdtBCr   r   r   Zhidden_states_reshapedoutZdt_limit_kwargshidden_states_B_C_transposedr   scan_output	ssm_stater0   r0   r1   cuda_kernels_forward  s   	





<"
"

$




z/GraniteMoeHybridMambaLayer.cuda_kernels_forward)r   r   rI   c           3   
      s  |j \}}}|j}t||}|}	|	jjjjgdd\}
}}|d uo|jo|dko|j	j
 j d |jj
 j d   ko|kn  o|d uo|d dk}|r^|j	j
 jddd|j	j
< |d d dd d f |j	j
 j|j	j
 d d d d df< |j	j
 jjjjd}tj|jjd dd}jrR|jj }|}nr|d ur|dd}tj|j|j d  df}|j	j
 | |dddd |f dd}t||}tj|jjj jj gdd\}}}tj !  }|r|jj
 j}|d d dd d f d d d df }|dd"||j d j#}j$d	 "j$j d j#}tjj%|||j }t&|j'd j'd }|d
 "jj#jjtj(d}t|d	 | j|d}|)|jddd d d f }|"|jjj |j d * }|)|d|j d }|d	 |dd d d f  }|)|dj#}||d	  j|d}|jj
 |jj
 | |  |)|jddd d d f }|"|jjj |j d * }|)|d|j d }|jj
 j|j|jd}|+|j j#j}|+|j jd}t,||}|+|jj#}j-d	 "j-j d j#}|||  |j}|)|dd d d df }ntj%|j$ }t&|j'd j'd }|)||dj#! }|)||dj! }|)||dj! }|j.jj djd}|j.jj djd}j/|j/  j/  j-d	 t0|  }||d	  }||j| } fdd||||fD \}}}}|1dddd}tj2|dd}tt3|} |d d d d d d d d d d d f |d d d d d d d d d d d f  }!|!jdd}"|"d	 | 1dddddd	  }#|#jdd}$|$d	 |d d d d d f  jdd}%t|d d d d d d dd f | }&||&1ddddd	  }'|'dd d d f |d	  jdd}(|r0|jj
 d d d df j|(jd})nt4|(d d d df })tj5|)|(gdd}(tt3tj|d d d d d d df d}*|*dd}*|*d
 |(d d d d d df  jdd}+|+d d d df |+d d df  }(},t|}-|dd d d f |(d d d d d df  }.|-1dddd}/|.d|/d	  }0|%|0 }|)|djj#}|| } dkr|d d d |d d d d f }|)||d}|,d ur|d ur|jj
 |, 6||
}17|1|}2|2S )Nr'   r)   r   r   )Zshiftsdimsr   r(   .r   ).NNr   r   )r*   output_sizec                    s   g | ]}t | jqS r0   )r   r   )r   tr   rv   r0   r1   r   	  r   z<GraniteMoeHybridMambaLayer.torch_forward.<locals>.<listcomp>r	   r   rL   )r   r   )8r+   rM   r   r   r   r   r   rn   r   r   rb   r   ZrollrV   r   r   r   r,   sumr   r   rd   r   rR   r   rS   r   r   r   r   r   r   r   r   r=   rB   r   Zsoftplusclampr   rU   r>   rW   r   Zbmmr   Zrepeat_interleaver   r   Zpermuter   r   Z
zeros_liker-   r   r   )3rv   Zinput_statesr   r   rI   r   r   r   rM   r   r   r   r   r   r   r   r:   r   r   r   Zcache_devicer   ZdAZdBZdBxr   Zssm_states_reshapedZ
C_reshapedyr   Z
D_residualZA_cumsumLZG_intermediateGZM_intermediateMZY_diagZdecay_statesZB_decayZstatesZprevious_statesZdecay_chunkZ
new_statesr  Zstate_decay_outZC_times_statesZstate_decay_out_permutedZY_offr  Zcontextualized_statesr0   r  r1   torch_forward  s    


@
,
$"$$$P&*"&0(&
*
 z(GraniteMoeHybridMambaLayer.torch_forward)r   r   rI   r   c                 K   s   t r&d| jjjjv r&| |||||S |d ur6td|j}|d ur|jd dkr|jd dkr||d d d d d f  	|}| 
||||S )Ncudaz\`seq_idx` support requires fast path support. Please install `mamba_ssm` and `causal_conv1d`r   r   )r   r   r   r   typer  NotImplementedErrorrM   r+   rV   r  )rv   r:   r   r   rI   r   rX   rM   r0   r0   r1   r   O  s    	$ z"GraniteMoeHybridMambaLayer.forward)NNNN)NNN)NNNN)rj   r   r   r   r   r   rf   r,   r   r   r   r   	IntTensorr  r  r   r   r0   r0   rw   r1   r     sB   F     .    S    r   c                       s(   e Zd Zd fdd	ZdddZ  ZS )	r   ư>c                    s&   t    tt|| _|| _d S )Nre   rf   r   r   r,   r   r   r   rv   rl   r   rw   r0   r1   rf   g  s    
z%GraniteMoeHybridRMSNormGated.__init__Nc                 C   sj   |j }|tj}|d ur2|tj|tj }|djddd}|t	|| j
  }| j|| S Nr(   r'   T)Zkeepdim)rM   rV   r,   rU   r   rS   r   powmeanrsqrtr   r   )rv   r:   r   input_dtypevariancer0   r0   r1   r   l  s    z$GraniteMoeHybridRMSNormGated.forward)r  )N)rj   r   r   rf   r   r   r0   r0   rw   r1   r   f  s   r   c                       s:   e Zd ZdZed fddZejejdddZ  Z	S )GraniteMoeHybridMLPz~
    MLP layer for shared experts

    Args:
        config:
            Configuration object with model hyperparameters.
    r   c                    sZ   t    |j| _|j| _t|j | _tj	| j| jd dd| _
tj	| j| jdd| _d S )Nr(   Frc   )re   rf   rl   
input_sizeZshared_intermediate_sizer   r   r   r   rp   input_linearoutput_linearrv   ra   rw   r0   r1   rf     s    
zGraniteMoeHybridMLP.__init__)r:   r<   c                 C   s<   |  |}|jddd}| |d |d  }| |}|S )Nr(   r'   r)   r   r   )r  chunkr   r  )rv   r:   chunked_hidden_statesr0   r0   r1   r     s
    

zGraniteMoeHybridMLP.forward)
rj   r   r   r   r   rf   r,   r   r   r   r0   r0   rw   r1   r  x  s   	r  c                   @   s@   e Zd ZU dZejed< ejed< eed< eed< ejed< dS )GraniteFlashAttentionKwargsa  
    Keyword arguments for advanced Flash Attention, causal-conv1d, and mamba_ssm kernel usage.
    Use cases include padding-free training and fewer `torch.compile` graph breaks.

    Attributes:
        cu_seq_lens_q (`torch.LongTensor`)
            Gets cumulative sequence length for query state.
        cu_seq_lens_k (`torch.LongTensor`)
            Gets cumulative sequence length for key state.
        max_length_q (`int`):
            Maximum sequence length for query state.
        max_length_k (`int`):
            Maximum sequence length for key state.
        seq_idx (`torch.IntTensor):
            Index of each packed sequence.
    Zcu_seq_lens_qZcu_seq_lens_kZmax_length_qZmax_length_kr   N)	rj   r   r   r   r,   r   __annotations__r   r  r0   r0   r0   r1   r#    s   


r#  F)totalc                       s.   e Zd Zd fdd	Zdd Zdd Z  ZS )	GraniteMoeHybridRMSNormr  c                    s&   t    tt|| _|| _dS )zF
        GraniteMoeHybridRMSNorm is equivalent to T5LayerNorm
        Nr  r  rw   r0   r1   rf     s    
z GraniteMoeHybridRMSNorm.__init__c                 C   sJ   |j }|tj}|djddd}|t|| j  }| j|| S r  )	rM   rV   r,   rU   r  r  r  r   r   )rv   r:   r  r  r0   r0   r1   r     s
    zGraniteMoeHybridRMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)r   r   r+   r   )rv   r0   r0   r1   
extra_repr  s    z"GraniteMoeHybridRMSNorm.extra_repr)r  )rj   r   r   rf   r   r'  r   r0   r0   rw   r1   r&    s   r&  c                       s0   e Zd Zeeedd fddZdd Z  ZS )GraniteMoeHybridParallelExpertsN)num_expertsr  r  r<   c                    s6   t    tt|||| _|| _|| _|| _	dS )a  
        Initialize the GraniteMoeHybridParallelExperts module.
        The experts weights are stored in [num_experts, output_size, input_size] format. Such that it's compatible with
        many MoE libraries, such as [Megablock](https://github.com/databricks/megablocks) and
        [ScatterMoE](https://github.com/shawntan/scattermoe), as well as the
        [MoE kernel](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/fused_moe/fused_moe.py)
        used in vllm.

        Args:
            num_experts (int):
                Number of experts.
            input_size (int):
                Size of the input.
            output_size (int):
                Size of the output.
        N)
re   rf   r   r   r,   emptyr   r)  r  r  )rv   r)  r  r  rw   r0   r1   rf     s
    
z(GraniteMoeHybridParallelExperts.__init__c                 C   sP   |j |dd}g }t| jD ] }|t|| | j|  qtj|dd}|S )a  
        Forward pass of the GraniteMoeHybridParallelExperts module.

        Args:
            inputs (Tensor):
                Input tensor.
            expert_size:
                Expert size information.

        Returns:
            Tensor: Output tensor.
        r   r)   )	r   r   r)  r   FZlinearr   r,   r-   )rv   inputsexpert_sizeZ
input_listZoutput_listr   resultsr0   r0   r1   r     s    z'GraniteMoeHybridParallelExperts.forwardrj   r   r   r   rf   r   r   r0   r0   rw   r1   r(    s   r(  c                       s.   e Zd Zeeed fddZdd Z  ZS )GraniteMoeHybridTopKGatingr  r)  top_kc                    s2   t    || _|| _|| _tj||dd| _dS )a  
        Initialize the top-k gating mechanism.
        Args:
            input_size (`int`):
                Size of the input.
            num_experts (`int`):
                Number of experts.
            top_k (`int`):
                Number of top experts to select.
        Frc   N)re   rf   r)  r  r2  r   rp   layer)rv   r  r)  r2  rw   r0   r1   rf     s
    
z#GraniteMoeHybridTopKGating.__init__c                 C   s   |  | }|j| jdd\}}tj|dd|}tj|d| j	g|j
|jd}|d|d}| d}| }| }	|	d\}
}|j| jdd}| }|| }|||||fS )Nr   r)   r   rM   r   trunc)Zrounding_mode)r3  r   topkr2  r,   rT   Ztype_asr   r   r)  rM   r   Zscatterlongr  tolistflattensortdiv)rv   r:   logitsZtop_k_logitsZtop_k_indicesZtop_k_gatesr   Zgatesr-  Ztop_k_expertsr   Zindex_sorted_expertsbatch_indexbatch_gatesr0   r0   r1   r      s    z"GraniteMoeHybridTopKGating.forwardr/  r0   r0   rw   r1   r0    s   r0  c                       s.   e Zd ZdZed fddZdd Z  ZS )GraniteMoeHybridMoEz
    A Sparsely gated mixture of experts layer with 1-layer Feed-Forward networks as experts.

    Args:
        config:
            Configuration object with model hyperparameters.
    r   c                    sl   t    |j| _|j| _t|j | _t|j	| j| jd | _
t|j	| j| j| _t| j|j	|jd| _d S )Nr(   r1  )re   rf   rl   r  r   r   r   r   r(  num_local_expertsr  r  r0  num_experts_per_tokrouterr   rw   r0   r1   rf   %  s    
zGraniteMoeHybridMoE.__init__c                 C   s   |  \}}}|d|}| |\}}}}}	|| }
| |
|}|jddd}| |d |d  }| ||}||dddf  }tj|| | j	f|j
|jd}|d||}|||| j	}||	fS )a  
        Forward pass of the mixture of experts layer.

        Args:
            layer_input (Tensor):
                Input tensor.

        Returns:
            Tensor:
                Output tensor.
            Tensor:
                Router logits.
        r'   r(   r)   r   r   Nr4  )r   r>   rB  r  r!  r   r  r,   r   r  rM   r   Z	index_addr   )rv   Zlayer_inputr   lengthZemb_sizer   r=  r>  r-  router_logitsZexpert_inputsr:   r"  Zexpert_outputsr   Zlayer_outputr0   r0   r1   r   8  s    zGraniteMoeHybridMoE.forward)rj   r   r   r   r   rf   r   r   r0   r0   rw   r1   r?    s   r?  c                       s   e Zd Zeed fddZedddddeje	ej e	e
 e	e e	e e	ej e	e e	eejejf  ee eeje	eejejf  f d

ddZ  ZS )GraniteMoeHybridDecoderLayerr`   c                    s   t    |j| _d | _|jdkr,t|| _t|j|jd| _	t|j|jd| _
|j| _t|| _d | _|j| dkrt||| _nt||| _|j| | _t|dddk| _d S )Nr   r   r   r@  )re   rf   rl   	self_attnr@  r?  block_sparse_moer&  r   input_layernormpost_attention_layernormresidual_multiplierr  
shared_mlpr   r   r   r_   
layer_typegetattrhas_expertsru   rw   r0   r1   rf   Y  s    



z%GraniteMoeHybridDecoderLayer.__init__rx   ry   rz   r{   NF)
r:   rI   ry   output_attentionsr~   r   output_router_logitsr   rX   r<   c	              
   K   s   |}
|  |}| jdur8| jf ||||d|	}d}n$| jf |||||||d|	\}}|
|| j  }|}
| |}| jr| |\}}|| | }n| |}d}|
|| j  }|f}|r||f7 }|r||f7 }|S )aC  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            past_key_values (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            output_router_logits (`bool`, *optional*):
                Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
                should not be returned during inference.
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs.Can be used to provide `GraniteFlashAttentionKwargs` for
                padding-free training and/or improve torch.compile performance.
        N)r:   r   r   rI   )r:   rI   ry   rO  r~   r   r   )rH  r   rF  rJ  rI  rN  rG  rK  )rv   r:   rI   ry   rO  r~   r   rP  r   rX   ZresidualZself_attn_weightsZmoe_hidden_statesrD  outputsr0   r0   r1   r   p  sL    &






z$GraniteMoeHybridDecoderLayer.forward)NNFFNFN)rj   r   r   r   r   rf   r   r,   r   r   r
   r   r   r   r   r#  FloatTensorr   r   r0   r0   rw   r1   rE  X  s*          rE  c                       sJ   e Zd ZU eed< dZdZdgZdgZdZ	dZ
dZdZ fddZ  ZS )	GraniteMoeHybridPreTrainedModelra   modelTrE  ry   Fc                    s   t  | t|tr,|jjjd| jjd t|t	rp|j
jd ttd|jd |j_|jjd nt|tr|jjd d S )NrD   )r  Zstdg      ?r   )re   _init_weights
isinstancer(  r   dataZnormal_ra   Zinitializer_ranger   r   Zfill_r,   r   r   rn   r   r   r   )rv   rE   rw   r0   r1   rU    s    


z-GraniteMoeHybridPreTrainedModel._init_weights)rj   r   r   r   r$  Zbase_model_prefixZsupports_gradient_checkpointingZ_no_split_modulesZ_skip_keys_device_placementZ_supports_flash_attnZ_supports_sdpaZ_can_compile_fullgraphZ_is_statefulrU  r   r0   r0   rw   r1   rS    s   
rS  c                       sD   e Zd ZU ejed< ded fddZe e	dd Z
  ZS )	GraniteMoeHybridRotaryEmbeddinginv_freqNr   c                    s   t    t|dr:t|jtr:|jd|jd| _nd| _|j| _	|j| _
|| _t| j | _| | j|\}| _| jd|dd | j| _d S )Nrope_scaling	rope_typer  defaultrY  F)
persistent)re   rf   hasattrrV  rZ  r   getr[  max_position_embeddingsZmax_seq_len_cachedZoriginal_max_seq_lenra   r   Zrope_init_fnattention_scalingZregister_bufferrY  Zoriginal_inv_freq)rv   ra   r   rY  rw   r0   r1   rf     s    
z(GraniteMoeHybridRotaryEmbedding.__init__c           
      C   s   | j d d d d f  |jd dd|j}|d d d d d f  }t|jjtrl|jjdkrl|jjnd}t	j
|ddV | |  dd}t	j||fdd	}| | j }| | j }	W d    n1 s0    Y  |j|jd
|	j|jd
fS )Nr   r'   r   ZmpscpuF)device_typeZenabledr(   r)   r   )rY  r   r=   r+   rV   r   rV  r  r   r,   ZautocastrR   r-   r6   ra  r7   rM   )
rv   r.   r8   Zinv_freq_expandedZposition_ids_expandedrc  ZfreqsZembr6   r7   r0   r0   r1   r     s    0&,z'GraniteMoeHybridRotaryEmbedding.forward)N)rj   r   r   r,   r   r$  r   rf   Zno_gradr   r   r   r0   r0   rw   r1   rX    s
   

rX  c                       s   e Zd Zed fddZeedeje	ej
 e	ej e	eeeej f  e	ej e	e e	e e	e e	e e	e e	ej ee eeef dddZdeej
d	f ej
ej
eed
ddZeej
eeejej
edddZdd Z  ZS )GraniteMoeHybridModelr   c                    s   t     j| _ j| _t j j| j| _t	 fddt
 jD | _t j jd| _d| _ j| _ j| _ j| _| j| j | _ j| _ j| _ j| _| jdkrt nd | _|   d S )Nc                    s   g | ]}t  |qS r0   )rE  )r   rb   r   r0   r1   r     r   z2GraniteMoeHybridModel.__init__.<locals>.<listcomp>r   FZrope)re   rf   Zpad_token_idZpadding_idx
vocab_sizer   Z	Embeddingrl   embed_tokensZ
ModuleListr   r   layersr&  r   r   gradient_checkpointingembedding_multiplierrm   rn   rB   r`  Z
rope_thetaZposition_embedding_typerX  
rotary_emb	post_initr   rw   r   r1   rf     s$    zGraniteMoeHybridModel.__init__N)	input_idsrI   r8   ry   inputs_embedsr~   rO  output_hidden_statesrP  return_dictr   rX   r<   c                 K   sJ  |d ur|n| j j}|d ur |n| j j}|d ur4|n| j j}|
d urH|
n| j j}
|d u |d uA rhtd| jr| jr|rt	d d}|d u r| 
|}|| j }|r|d u rt	d |d u r|d ur| nd}tj|||jd  |jd}|d u r|d}| |||||}| ||}|}d }| jd urB| ||}|rLdnd }|rZdnd }|	rhdnd }| jD ]}|jd	kr|n|}|r||f7 }||f||||||	|d
|}|d }|r|d d ur||d f7 }|	rr|d d urr||d f7 }qr| |}|r$||f7 }|r8|js8d|_t|||||dS )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.FzGraniteMoeHybrid requires an initialized `HybridMambaAttentionDynamicCache` to return a cache. Because one was not provided, no cache will be returned.r   r   r   r0   r   )rI   ry   rO  r~   r   rP  r   r'   T)Zlast_hidden_statery   r:   
attentionsrD  )ra   rO  rn  r~   use_return_dictro   rh  rO   rg   rh   rf  ri  r   r,   r   r+   r   r3   _update_causal_mask_update_mamba_maskrj  rg  rL  r   r   r   )rv   rl  rI   r8   ry   rm  r~   rO  rn  rP  ro  r   rX   past_seen_tokensr\   
mamba_maskr:   r   Zall_hidden_statesZall_self_attnsZall_router_logitsZdecoder_layerZ
layer_maskZlayer_outputsr0   r0   r1   r   !  s    






	

zGraniteMoeHybridModel.forwardFr%   )rI   r   r   ry   rO  c                 C   sB  | j jdkr(|d ur$|dk r$|S d S | j jdkrLt|tjrHt|}|S |d ur\| nd}|d urn|jnd}| j jdkr|s|st	j
|||| jdrd S |j}|jd }	|r| }
n"t|tjr|jd	 n
||	 d }
| j||	|
|||jd d
}| j jdkr>|d ur>|jjdv r>|s>t|j}t	||}|S )NZflash_attention_2rD   Zflex_attentionr   FZsdpa)rm  Zpast_key_values_lengthZis_trainingr   r'   )sequence_lengthtarget_lengthrM   r   r   )r  ZxpuZnpu)ra   r   anyrV  r,   r   r&   r   r   r   Z_ignore_causal_mask_sdparO   rM   r+   Zget_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positionr   r  finfominZ_unmask_unattended)rv   rI   r   r   ry   rO  rt  Zusing_compilable_cacherM   rv  rw  r\   	min_dtyper0   r0   r1   rr    sZ    






	z)GraniteMoeHybridModel._update_causal_mask)rI   rv  rw  rM   r   r   c                 K   sF  | dur|   dkr| }n&t|j}tj||f|||jd}|dkrVtj|dd}|tj||jd|ddk9 }|ddddddf 	|ddd}| durB|
 }| jd }	|ddddddd|	f | ddddddf |j }
|
dk}
|ddddddd|	f |
||ddddddd|	f< |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        Nr   )Z
fill_valuerM   r   r   r   r   r'   r   )r*   r,   rz  r{  fullr   Ztriur   r>   r=   cloner+   rV   r   )rI   rv  rw  rM   r   r   rX   r\   r|  Zmask_lengthZpadding_maskr0   r0   r1   ry    s*     $

6  zKGraniteMoeHybridModel._prepare_4d_causal_attention_mask_with_cache_positionc                 C   s.   |}|d dks&|dur*t |dkr*d}|S )zv
        No need for zeroing states when
            1. Cached forward
            2. Attending to all inputs
        r   Nr   )r,   all)rv   rI   r   ru  r0   r0   r1   rs    s    "z(GraniteMoeHybridModel._update_mamba_mask)NNNNNNNNNNN)F)rj   r   r   r   rf   r   r   r,   r   r   r   r   r
   listrR  r   r   r#  r   r   r   rr  staticmethodr   rM   ry  rs  r   r0   r0   rw   r1   rd    s\              
{ D6rd  r(   )gate_logitsr)  rI   r<   c                    s  | du st | tsdS t | trF| d j tj fdd| D dd}tjjj|dd}tj||dd\}}tjj	||}|du rtj
| dd}	tj
|dd}
n|j\}}|jd ||  }|dddddddf |||||fd|| }tj| | ddtj|dd }	|ddddddf ||||jd fd|jd  }tj|| ddtj|dd }
|jjdur|jjnd}|jd t| }t|	dd|||jd  f |
d }|| S )a  
    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.

    See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
    experts is too unbalanced.

    Args:
        gate_logits:
            Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
            shape [batch_size X sequence_length, num_experts].
        num_experts:
            Number of experts
        top_k:
            The number of experts to route per-token, can be also interpreted as the `top-k` routing
            parameter.
        attention_mask (`torch.Tensor`, *optional*):
            The attention_mask used in forward function
            shape [batch_size X sequence_length] if not None.

    Returns:
        The auxiliary loss.
    Nr   c                    s   g | ]}|  qS r0   )rV   )r   Z
layer_gateZcompute_devicer0   r1   r   A  r   z,load_balancing_loss_func.<locals>.<listcomp>r)   r'   r   )rV  r   r   r,   r-   r   rS   rT   r6  Zone_hotr  r   r+   r=   r>   rV   r  indexr   r3   )r  r)  r2  rI   Zconcatenated_gate_logitsZrouting_weightsr   Zselected_expertsZexpert_maskZtokens_per_expertZrouter_prob_per_expertr   rv  r   Zexpert_attention_maskZ router_per_expert_attention_maskZdevice_indexZrankZoverall_lossr0   r  r1   load_balancing_loss_func  sR    



&r  c                       s   e Zd ZdgZed fddZedeej	 eej
 eej	 eeeeej f  eej eej	 ee ee ee ee ee eej	 eeej
f eeef ddd	ZdddZ  ZS )GraniteMoeHybridForCausalLMzlm_head.weightr   c                    sX   t  | t|| _|j| _tj|j|jdd| _|j	| _	|j
| _|j| _|   d S )NFrc   )re   rf   rd  rT  re  r   rp   rl   lm_headrouter_aux_loss_coefr@  r)  rA  rk  r   rw   r0   r1   rf   x  s    
z$GraniteMoeHybridForCausalLM.__init__Nr   )rl  rI   r8   ry   rm  labelsr~   rO  rn  rP  ro  r   logits_to_keepr<   c                 K   s  |dur|n| j j}|
dur |
n| j j}
|	dur4|	n| j j}	|durH|n| j j}| jf ||||||||	|
||d|}|d }t|trt| dn|}| 	|dd|ddf }|| j j
 }d}|dur| }| j||fd| j ji|}d}|
r>t|r
|jn|d | j| j|}|dur>|| j||j 7 }|s~|f|dd  }|
rf|f| }|durz|f| S |S t||||j|j|j|jdS )ax  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, GraniteMoeHybridForCausalLM

        >>> model = GraniteMoeHybridForCausalLM.from_pretrained("ibm/PowerMoE-3b")
        >>> tokenizer = AutoTokenizer.from_pretrained("ibm/PowerMoE-3b")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)rl  rI   r8   ry   rm  r~   rO  rn  rP  ro  r   r   re  r'   r   )lossaux_lossr<  ry   r:   rp  rD  )ra   rO  rP  rn  rq  rT  rV  r   slicer  Zlogits_scalingr   Zloss_functionre  r  rD  r)  rA  r  rV   r   r   ry   r:   rp  )rv   rl  rI   r8   ry   rm  r  r~   rO  rn  rP  ro  r   r  rX   rQ  r:   Zslice_indicesr<  r  r  outputr0   r0   r1   r     sx    (

z#GraniteMoeHybridForCausalLM.forwardTc                 K   s  |d u }	|	sj|d us&|d |j d krD|d d |j d  d f }q|j d |j d kr|d d |f }n |rt| j|j d | j| jd}|d ur|d u r| dd }||dkd |	s|d d |j d  d f }|d ur|	rd|i}
nd| i}
|
	|||||d |
S )Nr'   r   r   r   rm  rl  )r8   ry   r~   rI   r   )
r+   r   ra   rM   r   r7  r   Zmasked_fill_rW   r   )rv   rl  ry   rI   rm  r   r8   r~   rX   Zempty_past_kvZmodel_inputsr0   r0   r1   prepare_inputs_for_generation  s<    
	z9GraniteMoeHybridForCausalLM.prepare_inputs_for_generation)NNNNNNNNNNNNr   )NNNNNT)rj   r   r   Z_tied_weights_keysr   rf   r   r   r,   r   r   r   r
   r  rR  r   r   r   r   r   r  r   r0   r0   rw   r1   r  u  sN                
p      r  )r  rd  rS  )Nr   )rD   )Nr(   N)\typingr   r   r   r   r   r,   Ztorch.nn.functionalr   rS   r+  Ztransformers.activationsr   Zcache_utilsr
   Z
generationr   Zmodeling_attn_mask_utilsr   Zmodeling_layersr   Zmodeling_outputsr   r   r   Zmodeling_rope_utilsr   r   Zmodeling_utilsr   r   Zprocessing_utilsr   utilsr   r   r   r   Zutils.deprecationr   Zutils.import_utilsr   r   Zconfiguration_granitemoehybridr   Z+mamba_ssm.ops.triton.selective_state_updater   Z!mamba_ssm.ops.triton.ssd_combinedr    r!   Zcausal_conv1dr"   r#   Z!torch.nn.attention.flex_attentionr%   Zintegrations.flex_attentionr&   Z
get_loggerrj   rg   r2   r9   r   r   rC   Moduler   r^   r_   r   r   r   r   r  r   r   r   r   r  r#  r&  r(  r0  r?  rE  rS  rX  rd  r   r  r  __all__r0   r0   r0   r1   <module>   s   

 W`   d-0<q$     V 9