a
    h
K                     @   sl  d dl mZmZmZ d dlmZmZmZmZ d dl	m
Z
 d dlmZ d dlmZ d dlZd dlmZmZmZmZ d dlmZ d dlZd dlZd dlZd dlZd dlZd dlZd d	lmZ ed
ddZe
j dddZ!e" dd Z#e" e$dddZ%e$dddZ&e" e$dddZ'e"ddd Z(e$dddZ)eddG d d! d!Z*G d"d# d#eZ+dS )$    )BaseBackend	GPUTargetLanguage)irpassesllvmnvidia)knobs)
PTXASError)	dataclassN)AnyDictTupleOptional)
ModuleType)Pathtargetc                 C   s   t tttf ddd}|S )Nreturnc                 S   s4   | j j}|j j}||ks J d|dkr,dS dS d S )Nz%lhs and rhs bitwidth must be the same   )   r       )r   r   r   )ZscalarZprimitive_bitwidth)Zlhs_typeZrhs_typeZlhs_bitwidthZrhs_bitwidth r   [/var/www/html/assistant/venv/lib/python3.9/site-packages/triton/backends/nvidia/compiler.pycheck_dot_compatibility   s    z-min_dot_size.<locals>.check_dot_compatibility)r   int)r   r   r   r   r   min_dot_size   s    	r   r   c                   C   s   t jjS N)r	   r   ptxasr   r   r   r   	get_ptxas!   s    r    c                  C   s0   t jj} | d ur| S tt jdgd}|S )Nz	--versionutf-8)r	   r   Zmock_ptx_version
subprocesscheck_outputr    pathdecode)Zmock_verversionr   r   r   get_ptxas_version%   s
    r'   c                 C   sv   t | tsJ tt| d\}}|dkrF|dk r:d| S d| d S |dkrVd| S |dkrfd	| S td
|  dS )zK
    Get the highest PTX version supported by the current CUDA driver.
    .      P         F   
   ?   z?Triton only support CUDA 10.0 or higher, but got CUDA version: N)
isinstancestrmapr   splitRuntimeError)cuda_versionmajorminorr   r   r   ptx_get_version.   s    r9   )archc                 C   s"   | j }|d u rt j}t|}|S r   )ptx_versionr    r&   r9   )optionsr:   r;   r6   r   r   r   get_ptx_version_from_optionsA   s
    r=   c                 C   s"   t | |}td|}d| }|S )NV   z+ptx)r=   min)r<   r:   r;   Zllvm_ptx_versionfeaturesr   r   r   get_featuresI   s    


rA   c                 C   s@   t | d"}t|  W  d    S 1 s20    Y  d S )Nrb)openhashlibsha256read	hexdigest)r$   fr   r   r   	file_hashW   s    rI   )
capabilityc                 C   s   | dkrdnd}d|  | S )NZ   a Zsm_r   )rJ   suffixr   r   r   sm_arch_from_capability]   s    rO   T)frozenc                   @   s.  e Zd ZU dZeed< dZeed< dZeed< dZe	e ed< d	Z
eed
< dZeed< dZeed< dZe	e ed< dZeed< dZeed< dZeed< dZee ed< dZee ed< dZeed< dZee ed< dZeed< dZeed< dZeed< dZeed< dZeed < dZeed!< d"d# Z d$d% Z!dS )&CUDAOptions   	num_warpsr,   num_ctas   
num_stagesNmaxnreg)r,   r,   r,   cluster_dimsr;   ptx_optionsir_overrideTenable_fp_fusionFlaunch_cooperative_grid
launch_pdl)Zfp8e5fp8e4b15supported_fp8_dtypesr   !deprecated_fp8_dot_operand_dtypestf32default_dot_input_precision)ra   Ztf32x3Zieeeallowed_dot_input_precisionsmax_num_imprecise_acc_defaultextern_libsdebugcudabackend_namesanitize_overflowr:   c                 C   s   t tjd }| jd u ri nt| j}|dd sJtjjpDt	|d |d< t
| dt|  | jdkr~| j| jd @ dksJ dd S )Nlib	libdevicezlibdevice.10.bcre   r   r,   znum_warps must be a power of 2)r   __file__parentre   dictgetr	   r   Zlibdevice_pathr2   object__setattr__tupleitemsrS   )selfZdefault_libdirre   r   r   r   __post_init__}   s     zCUDAOptions.__post_init__c                 C   sX   t | j}tdd t|d D |d< ddd t| D }t|d	 S )Nc                 s   s   | ]\}}|t |fV  qd S r   )rI   ).0kvr   r   r   	<genexpr>       z#CUDAOptions.hash.<locals>.<genexpr>re   _c                 S   s   g | ]\}}| d | qS )-r   )rv   namevalr   r   r   
<listcomp>   rz   z$CUDAOptions.hash.<locals>.<listcomp>r!   )
rn   __dict__rr   sortedjoinrs   rD   rE   encoderG   )rt   Z	hash_dictkeyr   r   r   hash   s    
zCUDAOptions.hash)"__name__
__module____qualname__rS   r   __annotations__rT   rV   rW   r   rX   rr   r;   rY   r2   rZ   r[   boolr\   r]   r_   r   r`   rb   rc   rd   re   rn   rf   rh   ri   r:   ru   r   r   r   r   r   rQ   c   s.   

rQ   c                       s   e Zd ZeedddZdd ZedddZed	d
 fddZ	e
dddZdd Zdd Zeeef dddZdd Zedd Zedd Zdd Zdd Zdd  Zd!d" Zd#d$ Ze d%d& Z  ZS )'CUDABackendr   c                 C   s
   | j dkS )Nrg   )backendr   r   r   r   supports_target   s    zCUDABackend.supports_targetc                 C   s0   d}t ||}|s"td| t|dS )Nz	^sm(\d+)$z(TRITON_OVERRIDE_ARCH must have the form r,   )re	fullmatch
ValueErrorr   group)rt   r:   patternmatchr   r   r   _parse_arch   s
    zCUDABackend._parse_archr   c                 C   s   |  |j}d| S )Ncuda:)r   r:   )rt   r<   rJ   r   r   r   get_target_name   s    zCUDABackend.get_target_nameN)r   r   c                    s   t  | d| _d S )Ncubin)super__init__Z
binary_ext)rt   r   	__class__r   r   r      s    zCUDABackend.__init__c                    s   dt jjpd| jj i}| fddtj D  t	| 
|d }d|vr~ttj}|dkrn|d tt||d< d|vr|d	krd
|d< d|vrt jj|d< |d	krdnd|d< tf i |S )Nr:   smc                    s*   i | ]"}| v r | d ur| | qS r   r   )rv   rw   optsr   r   
<dictcomp>   rz   z-CUDABackend.parse_options.<locals>.<dictcomp>r_   Y   Zfp8e4nvr`   rK   )r^   r[   i   @r   rd   )r	   ZruntimeZoverride_archr   r:   updaterQ   __dataclass_fields__keysr   r   setr_   addrr   r   languageZdefault_fp_fusion)rt   r   argsrJ   r_   r   r   r   parse_options   s    

zCUDABackend.parse_optionsc                 C   s(   |j |j|j|jd |jd |jd fS )Nr   r,      )rS   rT   sharedrX   )rt   metadatar   r   r   pack_metadata   s    zCUDABackend.pack_metadatac                 C   sL   dd l m  m  m} t| |j}|dkr6|jn|jt	| j
d}|S )Nr   r+   )Zconvert_custom_typesr   )triton.language.extra.cudar   extrarg   r   r   r:   Zconvert_custom_float8_sm80Zconvert_custom_float8_sm70r   r   )rt   r<   rg   rJ   Zcodegen_fnsr   r   r   get_codegen_implementation   s    z&CUDABackend.get_codegen_implementationc                 C   s   ddl m} d|iS )Nr   )rk   ztriton.language.extra.libdevice)r   rk   )rt   rk   r   r   r   get_module_map   s    zCUDABackend.get_module_mapc                 C   s   t | d S r   )r   load_dialects)rt   ctxr   r   r   r      s    zCUDABackend.load_dialectsc                 C   s   t | j}|  tj| tj| |d dk rDtj	| tj
| tj| tj| tj| tj| tj| ||  | S )Nr/   	   )r   pass_managercontextenable_debugr   commonadd_inlinerttirZadd_rewrite_tensor_pointerZ(add_rewrite_tensor_descriptor_to_pointeradd_canonicalizerZadd_combineZadd_reorder_broadcastadd_cseadd_symbol_dceZadd_loop_unrollrun)modr   optrJ   pmr   r   r   	make_ttir   s    
zCUDABackend.make_ttirc                 C   sx  |j d ur&| dt| j|j  t }|jd ur\|jd |_	|jd |_
|jd |_t| j}| }tj|d| |jd|j tj| |d dkrtj| tjj|| tj| tj| tj| tj| tj||d	k tjj| tj| |d d
v rtj| tj | tj!| tj | tj"| tjj#$||j%| tj&||j% tj'| tj(||j%| n|d dkr^tj| tj | tj!| tj)| tj*| tjj+| tj&||j% tj'| tj,||j% tj(||j%| tj"| tjj-| ntj!| tj | tj| tj.| tj||d	k tj/| tjj0| tj| tjj1| tj2| tj3| tj| tj4| |d dkr.tjj5| tjj6| tj7| tj | |8|  |j	|j
|jf|d< | 9 }||d< | S )Nzttg.maxnregr   r,   r   r   r   r/   r   r+   )r   r   r   rX   tensordesc_meta):rW   Zset_attrr   builderr   Zget_int32_attrr   ZClusterInforX   ZclusterDimXZclusterDimYZclusterDimZr   r   r   r   Zadd_convert_to_ttgpuirrS   rT   ttgpuirZadd_coalesceZadd_f32_dot_tc	ttnvgpuirZadd_plan_ctaZadd_remove_layout_conversionsZadd_optimize_thread_localityZadd_accelerate_matmulZadd_optimize_dot_operandsZ add_optimize_descriptor_encodingadd_loop_aware_cseZadd_fuse_nested_loopsr   r   Zadd_triton_licm add_combine_tensor_select_and_ifZhopperZadd_hopper_warpspecrV   Zadd_assign_latenciesZadd_schedule_loopsZadd_pipelineZadd_optimize_accumulator_initZadd_hoist_tmem_allocZadd_promote_lhs_to_tmemZadd_warp_specializeZadd_remove_tmem_tokensZadd_prefetchZadd_coalesce_async_copyZadd_optimize_tmem_layoutsZadd_interleave_tmemZadd_reduce_data_duplicationZadd_reorder_instructionsr   Zadd_tma_loweringZadd_fence_insertionadd_sccpr   get_tensordesc_metadata)r   r   r   rJ   Zcluster_infor   Zdump_enabledr   r   r   r   
make_ttgir   s    


zCUDABackend.make_ttgirc                 C   sn   |}t |j}|  tj| tj| tj	
| tj| tj| || | |d< |S )Nr   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )rt   srcr   r<   rJ   r   r   r   r   r   	ttgir_opt0  s    
zCUDABackend.ttgir_optc                 C   s  t || jj}|}t|j}|  tjj	
| tj| tj| tj| tj| tjj	| tj| tjj||| tj| tj| tjj	| tjj	| tj| tj| tj| tjjstj| || t !  t  }tjj"r(t#dt $||}	t%|}
t&|| jj}d}t'  t (|	||
| t)|	 |j*rdd |j*D }t +|	| t ,|	t j- |.d}|d ur||d< |.d|d< |.d	|d
< |.d|d< |.d|d< t/|	}~	~|S )NzYAddress Sanitizer Error: Address sanitizer is currently only supported on the AMD backendnvptx64-nvidia-cudac                 S   s   g | ]\}}|qS r   r   )rv   r}   r$   r   r   r   r   h  rz   z)CUDABackend.make_llir.<locals>.<listcomp>zttg.total-num-warpsrS   z
ttg.sharedr   zttg.tensor_memory_sizeZ	tmem_sizezttg.global_scratch_memory_sizeZglobal_scratch_sizez#ttg.global_scratch_memory_alignmentZglobal_scratch_align)0r=   r   r:   r   r   r   r   r   r   r   Zadd_lower_mmar   r   Zadd_allocate_warp_groupsconvertZadd_scf_to_cfZadd_allocate_shared_memoryZadd_allocate_tensor_memoryZ"add_allocate_global_scratch_memoryZadd_to_llvmirr   r   r   Zadd_nvgpu_to_llvmZadd_warp_specialize_to_llvmr   r	   compilationdisable_line_infoZllvmirZadd_di_scoper   r   Zinit_targetsZenable_asanr5   Z	to_modulerO   rA   Zset_short_ptrZattach_datalayoutZset_nvvm_reflect_ftzre   Zlink_extern_libsZoptimize_moduleZOPTIMIZE_O3Zget_int_attrr2   )rt   r   r   r<   rJ   r;   r   r   r   Zllvm_modprocr@   triplepathsZtotal_num_warpsretr   r   r   	make_llir?  sd    




zCUDABackend.make_llirc              	   C   s   t || jj}d}t|}t|| jj}t||||g |jd}	t	d|	}
t
|
dks\J |
d |d< |d  d|d  }tjd	d
| |	tjd}	tjdd| |	tjd}	tdd|	}	tjjrtd t|	 |	S )Nr   Fz(.visible .entry ([a-zA-Z_][a-zA-Z0-9_]*)r,   r   r}   r/   r(   z\.version \d+\.\d+z	.version )flagsz\.target sm_\d+z.target sm_z,\s*debug|debug,\s*rM   z // -----// NVPTX Dump //----- //)r=   r   r:   rO   rA   r   Ztranslate_to_asmr[   r   findalllensub	MULTILINEr	   r   Z
dump_nvptxprint)rt   r   r   r   rJ   r;   r   r   r@   r   namesr   r   r   make_ptx{  s     zCUDABackend.make_ptxc                 C   s  t  j}tjddddT}tjdddd }|| |  |jd }tjj	r\dd	gndg}	|j
rlg nd
g}
t|}tjjrddgng }|jr|jdng }|g|	|
d||d| |jd|}zNtj|dd|d tj|jrt|j tj|jr"t|j W n tjy } zt|j}| }W d    n1 sb0    Y  tj|jrt|j |jdkrd}n$|jdtj krd}nd|j }t| d| dd| dW Y d }~n
d }~0 0 t|d}| }W d    n1 s0    Y  tj|r@t| W d    n1 sV0    Y  W d    n1 sv0    Y  |S )NFwz.ptx)deletemoderN   rz.logz.oz	-lineinfoz-suppress-debug-infoz--fmad=falsez--opt-level0 z-vz--gpu-name=z-oT)check	close_fdsstderr   z!Internal Triton PTX codegen error   z`ptxas` raised SIGSEGVz`ptxas` failed with error code z
`ptxas` stderr:
z
Repro command: 
rB   )r    r$   tempfileNamedTemporaryFilewriteflushr}   r	   r   r   r[   rO   r   Zdisable_ptxas_optrY   r4   r"   r   osexistsremoveCalledProcessErrorrC   rF   
returncodesignalSIGSEGVr
   r   )rt   r   r   r   rJ   r   fsrcZflogZfbinZ	line_infoZfmadr:   Zdisable_optZptx_extra_optionsZ	ptxas_cmdeZlog_filelogerrorrH   r   r   r   r   
make_cubin  sv    

( (JzCUDABackend.make_cubinc                    s    j |tjkr@ fdd|d<  fdd|d< n|tjkr^ fdd|d<  fdd|d< fd	d|d
< fdd|d< d S )Nc                    s    | | S r   )r   r   r   rJ   r<   rt   r   r   <lambda>  rz   z(CUDABackend.add_stages.<locals>.<lambda>r   c                    s    | | S r   )r   r   r   r   r   r     rz   Zttgirc                    s    | | S r   )r   r   r   r   r   r     rz   c                    s    | | S r   )r   r   r   r   r   r     rz   Zllirc                    s    | | jjS r   )r   r   r:   r   r<   rt   r   r   r     rz   ptxc                    s    | | jjS r   )r   r   r:   r   r   r   r   r     rz   r   )r   r:   r   ZTRITONZGLUON)rt   Zstagesr<   r   r   r   r   
add_stages  s    

zCUDABackend.add_stagesc                 C   s   t  }| d| jj S )Nr|   )r'   r   r:   )rt   r&   r   r   r   r     s    zCUDABackend.hash)r   r   r   staticmethodr   r   r   r2   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   	functools	lru_cacher   __classcell__r   r   r   r   r      s*   



J<3r   ),Ztriton.backends.compilerr   r   r   Ztriton._C.libtritonr   r   r   r   Ztritonr	   Ztriton.runtime.errorsr
   dataclassesr   r  typingr   r   r   r   typesr   rD   r   r   r   r   r"   pathlibr   r   Z
NvidiaToolr    r  r'   r   r9   r=   rA   rI   rO   rQ   r   r   r   r   r   <module>   s:   

*