a
    hE                     @   s   d dl mZmZmZ e r.ddlZddlmZ e r@ddlmZ ddlZe	e
Zg dZdd Zd	d
 ZejddejeejdddZG dd dejZdd Zdd Zdd Zdd Zdd Zd!ddZd"dd ZdS )#   )is_accelerate_availableis_torch_availablelogging    N)nn)init_empty_weights)g        g      ?g      ?g      ?g       @g      @g      @g      @g       g      g      g      g       g      g      g      c                 C   s.   |j jj}|| tjtjdd\} }| |fS )N   )Zaxis)Znumerics_detailsZmxfpdowncast_to_mxfp_torchtotorchbfloat16uint8)wtriton_kernels_hubr	   w_scale r   [/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/integrations/mxfp4.pyquantize_to_mxfp43   s    
r   c           
      C   sn   |j j|j j|j j  }}}|jj}|jjj}|jdd\}}	||| |d|fi |	} ||||}| |fS )zE
    Changes the layout of the tensors depending on the hardware
    r   )Zmx_axisdtype)tensorFP4convert_layoutwrap_torch_tensorZtensor_detailslayoutStridedLayoutZ"make_default_matmul_mxfp4_w_layout)
r   r   r   r   r   r   r   r   Zvalue_layoutZvalue_layout_optsr   r   r   swizzle_mxfp49   s    

r   i   )r   rows_per_chunk)r   r   returnc                C   s  ddl }| js(tj r(|  } | }|tjd }| jdd |jkslJ d| jdd d|jtjt	|| j
d}| j^ }}}||| }	| |	|} ||	d}tj|	|d	 || j
d}
td|	|D ]}t|| |	}| || }||| }|d
@ tj}|d? tj}|
|| }|| |ddddd	f< || |ddddd	f< tj|||d ~~~~~q|
jg |||d	 R  jg ||| d	 R  }
~ ~~|
dd	 S )zw
    Convert the mxfp4 weights again, dequantizing and makes them compatible with the forward
    pass of GPT_OSS.
    r   N   zblocks.shape[:-1]=z does not match scales.shape=)r   devicer   r         )out)mathZis_cudar   cudais_availabler
   int32shaper   
FP4_VALUESr!   prodreshapeemptyrangeminlongldexpview	transpose
contiguous)blocksscalesr   r   r%   ZlutZprefix_shapeGBZ
rows_totalr$   Zr0r1ZblkexpZidx_loZidx_hisubr   r   r   convert_moe_packed_tensorsM   s4    44r<   c                       s0   e Zd Z fddZejejdddZ  ZS )Mxfp4GptOssExpertsc                    sR  t    |j| _|j| _|j| _tjtj	| jd| j | jd dtj
ddd| _tjtj	| jd| j | jd tj
ddd| _tjtj	| jd| j tjddd| _tjtj	| j| j| jd dftj
ddd| _tjtj	| j| j| jd tj
ddd| _tjtj	| j| jtjddd| _d| _t|dd	| _d | _d | _t|dd	| _d S )
Nr          r   FZrequires_gradgZd;?Zswiglu_limitg      @)super__init__Znum_local_expertsZnum_expertsintermediate_sizehidden_sizer   	Parameterr   Zzerosr   Zgate_up_proj_blocksZgate_up_proj_scalesfloat32gate_up_proj_biasZdown_proj_blocksZdown_proj_scalesdown_proj_biasalphagetattrlimitgate_up_proj_precision_configdown_proj_precision_config)selfconfig	__class__r   r   rB      s>    
"  zMxfp4GptOssExperts.__init__)hidden_statesr   c                 C   s   t jjt jjt jj  }}}t jj}tj|jx ||d|d| j	| j
fd}	||| j| jtj||| jd |	d}
||
| j| jtj||| j|jd}W d    n1 s0    Y  |S )Nswiglu)rI   rK   r   )gather_indxprecision_configgammasZfused_activation)scatter_indxrU   rV   )r   
matmul_ogsFnSpecsFusedActivationrS   	swiglu_fnr   r&   r!   rI   rK   gate_up_projrG   r
   rF   rL   	down_projrH   rM   	gate_scal)rN   rR   routing_data
gather_idxscatter_idxrY   rZ   rX   r[   ZactZintermediate_cache1Zintermediate_cache3r   r   r   forward   s6    
$	zMxfp4GptOssExperts.forward)__name__
__module____qualname__rB   r   Tensorrb   __classcell__r   r   rP   r   r=      s   $r=   c                 C   s  dd l }tjjtjjtjjtjjf\}}}}tj	| j	 tj
 }t|jdd}d}	| jd }
| jd }|| }|| }|d | }|
| }dd }|| |\}}tj|dd}tj|dd\}}t|d|}|d}tj|||d d	|| }|dtj}d
}t||k ||}tj|ddtj}t|tj}t||k ||	}t||k||	}t||	k|	|}|| }t|| |	k|	|}|| | d}|| | d}||||}|}W d    n1 s0    Y  ||||||||fS )Nr   Z
LOCAL_RANK0r    r   c                 S   sF   t j|  dddd d d |f }| }t j| |dd}|| fS )Nr   T)dimstableri   )r   argsortr0   Ztake_along_dimint)valskZtk_indxZtk_valr   r   r   topk   s    "z routing_torch_dist.<locals>.topkrk   )Zbinsmaxi  T)rj   )Zsrc_indxZdst_indx)osr   routing
GatherIndxRoutingDataScatterIndxcompute_expt_data_torchr   r&   r!   distributedZget_world_sizerm   environgetr)   ZsoftmaxsortZgatherr,   Zhistcr2   r
   r(   whererl   )ZlogitsZn_expts_actrr   rt   ru   rv   rw   Z
world_sizerankZreplace_valueZn_tokensZn_expts_totZn_local_expertsZlocal_expert_startZlocal_expert_endZn_gates_padrp   Z	expt_scalZ	expt_indxZsort_indiceshistvarZ	topk_indxZ	gate_indxr^   rT   rW   Z	expt_dataZhit_expertsr   r   r   routing_torch_dist   sL    



$r   c           
      C   s   dd l m} | r,| r,t| dr,t}ntjj}|jd }|	d| j
j}tj|| j
j| j
j}tj|j$ ||| j
j\}}}W d    n1 s0    Y  | ||||}	|		|d| j
j}	|	|fS )Nr   Z
_is_hookedr    )Ztorch.distributedrx   r'   Zis_initializedhasattrr   r   rs   r)   r,   ZrouterZ
hidden_dimr   Z
functionalZlinearweightZbiasr   r&   r!   Ztop_kZexperts)
rN   rR   distrs   Z
batch_sizeZrouter_logitsr_   r`   ra   Z
routed_outr   r   r   mlp_forward  s    
2r   c                    s(   d |  t fdd|D s$dS dS )N.c                 3   s.   | ]&}t | d  p$t |  V  qdS )z\.N)rematch).0keyZcurrent_key_name_strr   r   	<genexpr>&  s   z(should_convert_module.<locals>.<genexpr>TF)joinany)current_key_namepatternsr   r   r   should_convert_module$  s    
r   c                 K   s   ddl m} |d}|d}|d}	|d}
|d}|d}d	D ]}||v rL|d urz||||||	|
||d
d	}| d}| d}t| |ddd | t| |rLt| |rLtt| |t| |}|dkrtj	
 rtj	  t| |tj|| t| | t| | qLd S )Nr   shard_and_distribute_modulemodelempty_paramcasting_dtypeto_contiguousr}   device_mesh)r\   r]   F)	set_param_blocks_scalesr   r   cpu)integrations.tensor_parallelr   rz   setattrrsplitr   r<   rJ   r   r&   r'   Zempty_cacher   rE   r
   delattr)module
param_nameparam_valuetarget_deviceZdq_param_namekwargsr   r   r   r   r   r}   r   projblocks_attrscales_attrZdequantizedr   r   r   
dequantize-  s>    









r   c              	   K   sn  |j j|j j|j j  }}}ddlm}	 |d}
|d}|d}|d}|d}|d}d	|v r|d
d dd }d|v r|d
d dd }|dur|	|
||||||| n$t| |	d
dd t
jj|dd | d}| d}t| |}t| |}|jjdkrj|jjdkrj|d}|dkrV||| jd d}n||d| jd }t|d|dkrd}|| }|| }t
j|0 t|dd|dd|\}}W d   n1 s0    Y  |dkrt
|| j| jd g|_nt
|| j| jg|_t| || t| | d|||| dd t| | t| | ~dS )zq
    This transforms the weights obtained using `convert_gpt_oss.py` to load them into `Mxfp4GptOssExperts`.
    r   r   r   r   r   r   r}   r   r5   r   r    r   r   r6   r   Nr   Fr@   metar\   typer   r&   Z_precision_config)Zrhs_data)weight_scaleZflex_ctx)rX   PrecisionConfigFlexCtx
InFlexDatar   r   rz   splitr   r   r   r   rE   rJ   r!   r   sizer,   rC   r
   r4   r&   r   r3   SizerD   r)   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r}   r   r   r   r   r5   r6   Zlocal_expertsZtriton_weight_tensorr   r   r   r   load_and_swizzle_mxfp4Q  sb    






$





(


r   Fc           
   	   C   s   |d u rg }|   D ]\}}|| t||s<|d q|jjdkr|jst " t|| j	|< d}W d    n1 s|0    Y  |jjdkr|jsddl
m} |t||_tt| dkrt||||||d\}	}|d q| |fS )Nr    ZGptOssExpertsTZ	GptOssMLPr   )
MethodType)has_been_replacedrO   )Znamed_childrenappendr   poprQ   rc   r   r   r=   Z_modulestypesr   r   rb   lenlistchildren_replace_with_mxfp4_linear)
r   modules_to_not_convertr   quantization_configr   rO   namer   r   _r   r   r   r     s2    


"
r   c                 C   sz   |j r
| S ddlm} |da|d u r,dgn|}|jd urF||j tt|}t| ||||d\} }|svt	
d | S )Nr   )
get_kernelz kernels-community/triton_kernelsZlm_head)rO   zYou are loading your model using mixed-precision FP4 quantization but no linear modules were found in your model. Please double check your model architecture, or submit an issue on github if you think this is a bug.)r   Zkernelsr   r   r   extendr   setr   loggerwarning)r   r   r   r   rO   r   r   r   r   r   replace_with_mxfp4_linear  s(    

r   )NNNFN)NNNN)utilsr   r   r   r   r   Z
accelerater   r   Z
get_loggerrc   r   r*   r   r   r   r   rm   rf   r<   Moduler=   r   r   r   r   r   r   r   r   r   r   r   <module>   sD   
6ID	$E     
'    