"""
Partially inspired by torchtune's flex attention implementation

Citation:
@software{torchtune,
  title = {torchtune: PyTorch's finetuning library},
  author = {torchtune maintainers and contributors},
  url = {https://github.com/pytorch/torchtune},
  license = {BSD-3-Clause},
  month = apr,
  year = {2024}
}
    )OptionalUnionN)version   )is_torch_flex_attn_availablelogging)_torch_versionis_torch_less_or_equalis_torchdynamo_compiling)_DEFAULT_SPARSE_BLOCK_SIZE)	BlockMaskcreate_block_maskflex_attentionc                       sJ   e Zd ZdZdZdZdZ fddZej	j
dddd Zd	d
 Z  ZS )WrappedFlexAttentionzh
    We are doing a singleton class so that flex attention is compiled once when it's first called.
    NFc                    s   | j d u rt | | _ | j S N)	_instancesuper__new__)clsargskwargs	__class__ d/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/integrations/flex_attention.pyr   7   s    
zWrappedFlexAttention.__new__)	recursivec                 C   sn   | j r|| jkrj|| _tdr0tjtdd| _n4tt	j
dkrX|rXtjtddd| _ntt| _d| _ dS )	z>
        Initialize or update the singleton instance.
        2.5.1F)dynamicz2.6.0zmax-autotune-no-cudagraphs)r   modeTN)_is_flex_compiledtrainingr	   torchcompiler   _compiled_flex_attentionr   parser   base_version)selfr    r   r   r   __init__=   s    
zWrappedFlexAttention.__init__c                 C   s   | j S r   )r#   )r&   r   r   r   __call__S   s    zWrappedFlexAttention.__call__)__name__
__module____qualname____doc__r   r   r#   r   r!   compilerdisabler'   r(   __classcell__r   r   r   r   r   .   s   
r   F)querykeyvaluereturnc                 K   s(   t  st| nt}|| ||fi |S r   )r
   r   r   )r0   r1   r2   r    r   Zflex_attention_compiledr   r   r   compile_friendly_flex_attentionW   s    	r4   Tr   )attention_mask_2dattention_chunk_sizeoffsets	is_causalr3   c              	      s   j \}}|s|}|s|}|t d t }tjjj dd|| fd  j}	  |durx d	dd |  fddfdd	}
 fd
d}|s|n|du rn|
|dur|d 
|	|d 
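
# Illustrative sketch of the helpers above (hypothetical tensors, not exercised by the module),
# assuming a torch build where flex attention is available and a CUDA device. Because
# `WrappedFlexAttention` is a singleton, repeated calls reuse one compiled kernel:
#
#     q = k = v = torch.randn(1, 8, 128, 64, device="cuda", dtype=torch.bfloat16)
#     out = compile_friendly_flex_attention(q, k, v, training=False)
#     assert WrappedFlexAttention(training=False) is WrappedFlexAttention(training=False)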


Offset = Union[torch.Tensor, int]


def make_flex_block_causal_mask(
    attention_mask_2d: torch.Tensor,
    attention_chunk_size: Optional[int] = None,
    query_length=None,
    key_length=None,
    offsets: Optional[tuple[Offset, Offset]] = None,
    is_causal: Optional[bool] = True,
) -> "BlockMask":
    """
    IMPORTANT NOTICE: This function is deprecated in favor of using the mask primitives in `masking_utils.py`,
    and will be removed in a future version without warnings. New code should not use it. It is only kept here
    for BC for now, while models using it are being patched accordingly.

    Create a block (causal) document mask for a batch of sequences, both packed and unpacked.
    Creates the block (causal) mask logic and passes it into :func:`torch.nn.attention.flex_attention.create_block_mask`.
    The resultant BlockMask is a compressed representation of the full (causal) block
    mask. BlockMask is essential for performant computation of flex attention.
    See: https://pytorch.org/blog/flexattention/

    Args:
        attention_mask_2d (torch.Tensor): Attention mask for packed and padded sequences
        of shape (batch_size, total_seq_len). e.g.

        For unpacked sequence:
        [[1, 1, 1, 1, 0, 0, 0],
         [1, 1, 1, 1, 1, 0, 0]]

        For packed sequence:
        [[1, 1, 1, 2, 2, 2, 0],
         [1, 1, 2, 2, 2, 3, 3]]

    Returns:
        BlockMask
    """
    batch_size, total_seq_len = attention_mask_2d.shape
    if not key_length:
        key_length = total_seq_len
    if not query_length:
        query_length = total_seq_len
    pad_len = ((key_length // flex_default_block_size) + 1) * flex_default_block_size
    attention_mask_2d = torch.nn.functional.pad(attention_mask_2d, value=0, pad=(0, pad_len - key_length))
    device = attention_mask_2d.device
    document_ids = attention_mask_2d.clone()

    if attention_chunk_size is not None:
        # Turn the positions into chunk indices, e.g. with a chunk size of 3:
        # [1, 1, 1, 1, 1, 1] -> [0, 0, 0, 1, 1, 1]
        chunk_idxs = (document_ids.clone().fill_(1).cumsum(-1) - 1) // attention_chunk_size

    def causal_mask_mod(batch_idx, head_idx, q_idx, kv_idx):
        """
        Defines the logic of a block causal mask by combining both a standard causal mask
        and a block diagonal document mask.

        See :func:`~torchtune.modules.attention_utils.create_block_causal_mask`
        for an illustration.
        """
        causal_mask = q_idx >= kv_idx
        document_mask = document_ids[batch_idx, q_idx] == document_ids[batch_idx, kv_idx]
        padding_mask = attention_mask_2d[batch_idx, q_idx] > 0
        final_mask = causal_mask & padding_mask & document_mask
        return final_mask

    def chunk_causal_mask_mod(batch_idx, head_idx, q_idx, kv_idx):
        """
        Combines the chunk mask with the causal mask for chunked attention.
        """
        chunk_mask = chunk_idxs[batch_idx, q_idx] == chunk_idxs[batch_idx, kv_idx]
        causal_doc_mask = causal_mask_mod(batch_idx, head_idx, q_idx, kv_idx)
        return chunk_mask & causal_doc_mask

    def default_mask_mod(batch_idx, head_idx, q_idx, kv_idx):
        """
        Utilizes default attention mask to enable encoder and encoder-decoder
        attention masks.
        """
        document_mask = document_ids[batch_idx, q_idx] == document_ids[batch_idx, kv_idx]
        padding_mask = attention_mask_2d[batch_idx, kv_idx] > 0
        final_mask = padding_mask & document_mask
        return final_mask

    if not is_causal:
        mask_mod_maybe_combined = default_mask_mod
    else:
        mask_mod_maybe_combined = causal_mask_mod if attention_chunk_size is None else chunk_causal_mask_mod

    if offsets is not None:
        q_offset = offsets[0].to(device)
        kv_offset = offsets[1].to(device)

        def mask_mod(batch_idx, head_idx, q_idx, kv_idx):
            offset_q = q_idx + q_offset
            offset_kv = kv_idx + kv_offset
            return mask_mod_maybe_combined(batch_idx, head_idx, offset_q, offset_kv)
    else:
        mask_mod = mask_mod_maybe_combined

    return create_block_mask(
        mask_mod=mask_mod,
        B=batch_size,
        H=None,  # broadcast over attention heads
        Q_LEN=query_length,
        KV_LEN=key_length,
        device=device,
        _compile=is_torch_less_or_equal("2.5.1"),
    )
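
# Illustrative sketch of building a mask with the helper above, reusing the packed example from
# its docstring (hypothetical values, not exercised by the module):
#
#     packed_mask = torch.tensor([[1, 1, 1, 2, 2, 2, 0],
#                                 [1, 1, 2, 2, 2, 3, 3]])
#     block_mask = make_flex_block_causal_mask(packed_mask)
#     # tokens attend causally and only within their own document; padding positions are masked out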


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


def flex_attention_forward(
    module: torch.nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Union[torch.Tensor, "BlockMask"],
    scaling: Optional[float] = None,
    softcap: Optional[float] = None,
    head_mask: Optional[torch.Tensor] = None,
    s_aux: Optional[torch.Tensor] = None,
    **kwargs,
) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
    if head_mask is not None:
        logger.warning_once(
            "`flex_attention` does not support `head_mask`. Please set your attention to `eager` if "
            "you want this feature."
        )

    if kwargs.get("dropout", 0.0) > 0:
        raise ValueError(
            "`flex_attention` does not support `dropout`. Please use it with inference only (`model.eval()`) or"
            " turn off the attention dropout in the respective config."
        )

    block_mask = None
    score_mask = None
    if isinstance(attention_mask, BlockMask):
        block_mask = attention_mask
    else:
        score_mask = attention_mask

    if score_mask is not None:
        score_mask = score_mask[:, :, :, : key.shape[-2]]

    def score_mod(score, batch_idx, head_idx, q_idx, kv_idx):
        if softcap is not None:
            score = softcap * torch.tanh(score / softcap)
        if score_mask is not None:
            score = score + score_mask[batch_idx][0][q_idx][kv_idx]
        if head_mask is not None:
            score = score + head_mask[batch_idx][head_idx][0][0]
        if s_aux is not None:
            # Attention sink handling: renormalize the scores so that the auxiliary sink
            # logits absorb part of the probability mass.
            logits_max = torch.max(score, dim=-1, keepdim=True).values
            sinks = torch.exp(s_aux - logits_max)
            unnormalized_scores = torch.exp(score - logits_max)
            normalizer = unnormalized_scores.sum(dim=-1, keepdim=True) + sinks
            score = unnormalized_scores / normalizer
        return score

    enable_gqa = True
    num_local_query_heads = query.shape[1]

    # If the number of local query heads is not a power of two, repeat k/v explicitly instead of
    # relying on the kernel's grouped-query support.
    if not ((num_local_query_heads & (num_local_query_heads - 1)) == 0):
        key = repeat_kv(key, query.shape[1] // key.shape[1])
        value = repeat_kv(value, query.shape[1] // value.shape[1])
        enable_gqa = False

    kernel_options = kwargs.get("kernel_options", None)
    # Only request the log-sumexp when not running on CPU.
    return_lse = query.device.type != "cpu"
    flex_attention_output = compile_friendly_flex_attention(
        query,
        key,
        value,
        score_mod=score_mod,
        block_mask=block_mask,
        enable_gqa=enable_gqa,
        scale=scaling,
        kernel_options=kernel_options,
        return_lse=return_lse,
        training=module.training,
    )
    if return_lse:
        attention_output, lse = flex_attention_output
        lse = lse.to(value.dtype)
    else:
        attention_output = flex_attention_output
        lse = None

    attention_output = attention_output.transpose(1, 2).contiguous()
    return attention_output, lse