import torch

from ..generation.continuous_batching import PagedAttentionCache
from ..utils import is_flash_attn_2_available


try:
    if is_flash_attn_2_available():
        from flash_attn import flash_attn_varlen_func
except Exception:
    pass


def paged_attention_forward(
    module: torch.nn.Module,
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    attention_mask: torch.Tensor = None,
    cache: PagedAttentionCache = None,
    cu_seq_lens_q=None,
    cu_seq_lens_k=None,
    max_seqlen_q=None,
    max_seqlen_k=None,
    block_tables=None,
    implementation=None,
    **kwargs,
) -> torch.Tensor:
    r"""Perform the forward pass of attention with paged key-value cache.

    This function handles the cache updates and performs the attention computation
    using the flash_attn_varlen_func for efficient processing.

    Args:
        q: (total_q, nheads, headdim), where total_q = total number of query tokens in the batch.
        k: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch. If a block table is provided, this can be the full k.
        v: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch. If a block table is provided, this can be the full v.
        cu_seq_lens_q: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
           of the sequences in the batch, used to index into q.
        cu_seq_lens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
           of the sequences in the batch, used to index into kv.
        max_seqlen_q: int. Maximum query sequence length in the batch.
        max_seqlen_k: int. Maximum key sequence length in the batch.
        dropout_p: float. Dropout probability.
        softmax_scale: float. The scaling of QK^T before applying softmax.
            Default to 1 / sqrt(headdim).
        causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
        window_size: (left, right). If not (-1, -1), implements sliding window local attention.
        softcap: float. Anything > 0 activates softcapping attention.
    Úsliding_windowF)éÿÿÿÿr   r   NZs_auxé   r   T)Zsoftmax_scaleZcausalZwindow_size)ÚupdateZ	layer_idxÚgetattrr   r   ÚgetZ	transposeZsqueezeÚ
contiguousÚtoÚtorchZint32ÚcloneZscalingÚ
isinstanceÚtuple)r   r   r   r	   r
   r   Zcu_seq_lens_qZcu_seq_lens_kZmax_seqlen_qZmax_seqlen_kZblock_tablesÚimplementationÚkwargsr   r   Zcustom_kwargsZattn_output© r   úa/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/integrations/flash_paged.pyÚpaged_attention_forward   s.    %
ùöô
r   )NNNNNNNN)r   Zgeneration.continuous_batchingr   Úutilsr   Z
flash_attnr   Ú	ExceptionÚnnÚModuleZTensorr   r   r   r   r   Ú<module>   s0           ôò
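

# ---------------------------------------------------------------------------
# Illustrative usage sketch (an assumption added for clarity, not part of the
# original module): one way `paged_attention_forward` could be invoked, assuming
# flash-attn and a CUDA device are available, `module` exposes `layer_idx` and
# `scaling`, and `cache` implements a compatible `update()`. All names, shapes,
# and values below are hypothetical.
#
#   from types import SimpleNamespace
#   from flash_attn import flash_attn_varlen_func
#
#   nheads, headdim, total_tokens = 8, 64, 16
#   q = torch.randn(1, nheads, total_tokens, headdim, device="cuda", dtype=torch.float16)
#   k, v = torch.randn_like(q), torch.randn_like(q)
#   cu_seq_lens = torch.tensor([0, total_tokens], device="cuda", dtype=torch.int32)
#
#   attn_output, _ = paged_attention_forward(
#       module,
#       q,
#       k,
#       v,
#       cache=cache,
#       cu_seq_lens_q=cu_seq_lens,
#       cu_seq_lens_k=cu_seq_lens,
#       max_seqlen_q=total_tokens,
#       max_seqlen_k=total_tokens,
#       # Wrapper object exposing the varlen kernel, since the function rebinds
#       # `flash_attn_varlen_func` from `implementation` when one is given.
#       implementation=SimpleNamespace(flash_attn_varlen_func=flash_attn_varlen_func),
#   )
# ---------------------------------------------------------------------------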