from collections import deque
from math import floor, sqrt
from typing import Optional, Union

import torch

from ...configuration_utils import PretrainedConfig
from ...generation.configuration_utils import GenerationConfig
from ...utils.metrics import attach_tracer, traced
from .classes import RequestState, get_device_and_memory_breakdown, logger


@attach_tracer()
class PagedAttentionCache:
    def __init__(
        self,
        config: PretrainedConfig,
        generation_config: GenerationConfig,
        device: torch.device,
        dtype: torch.dtype = torch.float16,
        num_requests: int = 100,
        layer_device_map: Optional[dict[int, Union[str, torch.device, int]]] = None,
        tp_size: Optional[int] = None,
    ) -> None:
        """Initialize a paged attention cache for efficient memory usage.

        Args:
            config: Model configuration
            generation_config: Generation configuration containing cache parameters
            device: Device for the cache tensors
            dtype: Data type for the cache tensors
            layer_device_map: Optional mapping of layer indices to devices
            num_requests: Expected number of concurrent requests the cache should serve
            tp_size: Optional tensor parallel size; the number of key/value heads must be divisible by it
        """
        self.config = config
        self.dtype = dtype

        # Model geometry: fall back to the attention-head values when KV-specific ones are absent
        kv_heads = getattr(config, "num_key_value_heads", None)
        self.num_key_value_heads = kv_heads if kv_heads is not None else config.num_attention_heads
        head_dim = getattr(config, "head_dim", None)
        self.head_dim = head_dim if head_dim is not None else config.hidden_size // config.num_attention_heads
        self.num_hidden_layers = config.num_hidden_layers
        self.block_size = getattr(generation_config, "block_size", 32)

        if tp_size is not None and tp_size > 1:
            if self.num_key_value_heads % tp_size != 0:
                raise ValueError(
                    f"Number of key value heads {self.num_key_value_heads} must be divisible by tensor "
                    f"parallel size {tp_size}."
                )

        # Infer the cache geometry (number of blocks, max batch tokens) from the available memory
        memory_handler = PagedAttentionMemoryHandler(
            block_size=self.block_size,
            head_dim=self.head_dim,
            num_heads=self.num_key_value_heads,
            num_layers=self.num_hidden_layers,
            hidden_size=config.hidden_size,
            vocab_size=config.vocab_size,
        )
        num_blocks, max_batch_tokens = memory_handler.infer_num_blocks_and_max_batch_tokens(
            num_blocks=getattr(generation_config, "num_blocks", None),
            max_batch_tokens=getattr(generation_config, "max_batch_tokens", None),
            max_memory_percent=getattr(generation_config, "max_memory", 0.9),
            cache_dtype=self.dtype,
        )
        self.num_blocks = num_blocks
        self.max_batch_tokens = max_batch_tokens
        logger.warning(f"PagedAttentionCache initialized with {self.num_blocks = } and {self.max_batch_tokens = }")

        # One key and one value tensor per layer, shaped (kv_heads, num_blocks, block_size, head_dim)
        self.cache_shape = (self.num_key_value_heads, num_blocks, self.block_size, self.head_dim)
        self.key_cache: list[torch.Tensor] = []
        self.value_cache: list[torch.Tensor] = []
        for idx in range(config.num_hidden_layers):
            layer_device = layer_device_map[idx] if layer_device_map is not None else device
            new_layer_key_cache = torch.zeros(self.cache_shape, dtype=self.dtype, device=layer_device)
            new_layer_value_cache = torch.zeros(self.cache_shape, dtype=self.dtype, device=layer_device)
            # Static addresses let torch.compile / CUDA graphs safely reuse the cache buffers
            torch._dynamo.mark_static_address(new_layer_key_cache)
            torch._dynamo.mark_static_address(new_layer_value_cache)
            self.key_cache.append(new_layer_key_cache)
            self.value_cache.append(new_layer_value_cache)

        # Block bookkeeping: a FIFO of free physical blocks plus a per-request block table
        self._free_blocks = deque(range(num_blocks))
        self._block_tables: dict[str, list[int]] = {}
    @traced
    def allocate_blocks(self, n_blocks: int, request_id: str) -> list[int]:
        """Allocates n_blocks for a given request_id."""
        if len(self._free_blocks) < n_blocks:
            # Not enough free blocks: signal failure to the caller
            return False
        allocated = []
        for _ in range(n_blocks):
            allocated.append(self._free_blocks.popleft())
        if request_id not in self._block_tables:
            self._block_tables[request_id] = []
        self._block_tables[request_id].extend(allocated)
        return allocated

    def free_blocks(self, request_id: str) -> None:
        """Frees all blocks associated with a request_id."""
        if request_id in self._block_tables:
            blocks_to_free = self._block_tables.pop(request_id)
            self._free_blocks.extend(blocks_to_free)
        else:
            logger.info(f"Attempted to free blocks for non-existent request_id: {request_id}")

    def get_num_free_blocks(self) -> int:
        """Returns the number of free blocks available."""
        return len(self._free_blocks)

    def get_block_table(self, request_id: str) -> list[int]:
        """Returns the block table for a request."""
        return self._block_tables.get(request_id, [])

    @traced
    def _get_physical_indices(self, state: RequestState, logical_indices: list[int]) -> list[int]:
        """
        Maps logical sequence indices to physical cache indices using the request's block table.

        Args:
            state: The request state whose request ID selects the block table.
            logical_indices: A list of logical indices.

        Returns:
            A list of physical indices.

        Raises:
            ValueError: If no block table is found for the request ID.
            IndexError: If a logical index maps to a block index that is out of bounds.
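
        Example (illustrative values, not from the original file): with block_size = 32 and
        block table [5, 2], logical index 40 lies in logical block 40 // 32 = 1 at offset
        40 % 32 = 8, so it maps to physical index 2 * 32 + 8 = 72.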
        """
        request_id = state.request_id
        block_table = self._block_tables.get(request_id)
        if not block_table:
            raise ValueError(f"No block table found for request {request_id}")

        block_size = self.block_size
        physical_indices = []
        for idx in logical_indices:
            block_idx = idx // block_size
            block_offset = idx % block_size
            if block_idx >= len(block_table):
                raise IndexError(
                    f"Logical index {idx} maps to block index {block_idx} which is out of bounds "
                    f"for request {request_id}"
                )
            physical_block_num = block_table[block_idx]
            physical_index = physical_block_num * block_size + block_offset
            physical_indices.append(physical_index)
        return physical_indices
    def update(
        self,
        key_states: torch.Tensor,
        value_states: torch.Tensor,
        layer_idx: int,
        read_index,
        write_index,
        **kwargs,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        # View the per-layer caches as flat (kv_heads, num_blocks * block_size, head_dim) slot tensors
        total_slots = self.num_blocks * self.block_size
        k_cache_flat = self.key_cache[layer_idx].view(self.num_key_value_heads, total_slots, self.head_dim)
        v_cache_flat = self.value_cache[layer_idx].view(self.num_key_value_heads, total_slots, self.head_dim)
        # Scatter the incoming key/value states into their physical slots
        k_cache_flat[:, write_index, :] = key_states[0]
        v_cache_flat[:, write_index, :] = value_states[0]
        # Gather the slots attention should read, re-adding the batch dimension
        return k_cache_flat[None, :, read_index, :], v_cache_flat[None, :, read_index, :]


class PagedAttentionMemoryHandler:
    _activation_dtype: torch.dtype = torch.bfloat16
    _activation_safety_factor: int = 2
    _input_dtype: torch.dtype = torch.int32
    # Safety caps applied when inferring the cache geometry
    _upper_bound_num_blocks: int = 8192
    _upper_bound_max_batch_tokens: int = 2048

    def __init__(
        self,
        block_size: int,
        head_dim: int,
        num_heads: int,
        num_layers: int,
        hidden_size: int,
        vocab_size: int,
    ) -> None:
        self.block_size = block_size
        self.head_dim = head_dim
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size

    @staticmethod
    def get_available_memory(max_memory_percent: float = 0.9) -> int:
        _, total, reserved, allocated = get_device_and_memory_breakdown()
        available_memory = total - max(reserved, allocated)
        available_memory = int(available_memory * max_memory_percent)
        return available_memory

    def infer_num_blocks_and_max_batch_tokens(
        self,
        num_blocks: Optional[int] = None,
        max_batch_tokens: Optional[int] = None,
        max_memory_percent: float = 0.9,
        cache_dtype: torch.dtype = torch.float16,
    ) -> tuple[int, int]:
        """
        The memory footprint depends on the cache size C and the max batch tokens M in the following way:
            Mem = Mem(cache) + Mem(activation) + Mem(static_tensors)
        where:
            Mem(cache) = 2 * num_heads * head_dim * num_layers * cache_dtype.itemsize * C
            Mem(activation) = M * (hidden_size + vocab_size) * activation_dtype.itemsize
            Mem(static_tensors) ~= 8M * input_dtype.itemsize + M * C * activation_dtype.itemsize

        Depending on whether C or M is given, we use different methods to infer the values
        (C = num_blocks * block_size); since block_size is fixed, num_blocks is the true variable to find.
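
        For instance (illustrative numbers only): with C = 2017 blocks * 32 tokens = 64544,
        M = 645, 131072 cache bytes per token and 2-byte activations, Mem(cache) ~= 8.46e9,
        Mem(activation) = 645 * (hidden_size + vocab_size) * 2 ~= 4.7e7 and
        Mem(static_tensors) ~= 8 * 645 * 4 + 645 * 64544 * 2 ~= 8.3e7 bytes.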
        """
        # Depending on which of C and M is fixed, pick the matching inference method
        if num_blocks is None and max_batch_tokens is None:
            num_blocks, max_batch_tokens = self.compute_num_blocks_and_max_batch_tokens(
                max_memory_percent, cache_dtype
            )
        elif num_blocks is not None and max_batch_tokens is None:
            max_batch_tokens = self.compute_max_batch_tokens(num_blocks, max_memory_percent, cache_dtype)
        elif max_batch_tokens is not None and num_blocks is None:
            num_blocks = self.compute_num_blocks(max_batch_tokens, max_memory_percent, cache_dtype)

        # Sanity-check the resulting footprint against the memory actually available
        available_memory = self.get_available_memory(max_memory_percent)
        memory_footprint = self.compute_memory_footprint(
            num_blocks=num_blocks, max_batch_tokens=max_batch_tokens, cache_dtype=cache_dtype
        )
        if sum(memory_footprint) > available_memory:
            raise MemoryError(f"Memory footprint {memory_footprint} is more than available memory {available_memory}")
        return num_blocks, max_batch_tokens

    def compute_num_blocks_and_max_batch_tokens(
        self,
        max_memory_percent: float,
        m: float = 0.01,
        cache_dtype: torch.dtype = torch.float16,
    ) -> tuple[int, int]:
        """
        If neither M nor C is given, we assume M = m*C so we have to solve a second-order polynomial in C:
            Mem = C * 2 * self.num_heads * self.head_dim * self.num_layers * cache_dtype.itemsize
                + C * m * (hidden_size + vocab_size) * activation_dtype.itemsize
                + C * m * 8 * input_dtype.itemsize + C^2 * m * activation_dtype.itemsize

        We solve for C and then M = m*C.
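
        Worked instance (hypothetical model dims, not from the original file): with 8 KV heads,
        head_dim 128, 32 layers and a 2-byte cache dtype, a cached token costs
        2 * 8 * 128 * 32 * 2 = 131072 bytes; with m = 0.01, hidden_size 4096, vocab_size 32000
        and 2-byte activations, a = 0.02 and b ~= 131072 + 0.01 * (72192 + 32) ~= 131794.
        For Mem = 8 GiB the positive root is C ~= 64550 tokens, i.e. about 2017 blocks of
        32 tokens, and M = m * C ~= 645.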
        zCache memory: r\      zMemory per activation token: zMemory per cache token: zMemory per input token:    r   z)Discriminant is negative: discriminant = z3Greatest solution is negative: greatest_solution = znum_blocks = z9 is too large, setting to self._upper_bound_num_blocks = zmax_batch_tokens = z? is too large, setting to self._upper_bound_max_batch_tokens = )rd   r   rD   _activation_dtypeitemsizer"   r#   r    r   r!   _input_dtyper+   r   rW   r   _upper_bound_num_blocksr.   _upper_bound_max_batch_tokens)r6   r'   r(   rk   cache_memoryZmem_per_activation_tokenZmem_per_cache_tokenZmem_per_input_tokenabcZdiscriminantZgreatest_solutionr$   r%   r8   r8   r9   re     s4    

zCPagedAttentionMemoryHandler.compute_num_blocks_and_max_batch_tokens)r$   r'   r(   r   c                 C   sx   |  |}|| j }|}||d | j | j | j |j 8 }d| jj || jj  }|| j| j	 | jj 7 }t
|| S )a4  
        If C is given, we have a formula for M:
            num = (Mem - C * 2 * num_heads * head_dim * num_layers * cache_dtype.itemsize)
            denum = (8 * input_dtype.itemsize + C * activation_dtype.itemsize + (hidden_size + vocab_size) * activation_dtype.itemsize)
        M = num / denum
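
        With the same illustrative model as above (131072 cache bytes per token, 2-byte
        activations, 4-byte inputs) and C = 1024 blocks * 32 = 32768 tokens against an
        8 GiB budget: num = 8589934592 - 32768 * 131072 ~= 4.29e9 and
        denum = 8 * 4 + 32768 * 2 + 36096 * 2 = 137760, giving M ~= 31177 tokens.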
        """
        cache_memory = self.get_available_memory(max_memory_percent)
        cache_size = num_blocks * self.block_size
        num = cache_memory
        num -= cache_size * 2 * self.num_heads * self.head_dim * self.num_layers * cache_dtype.itemsize
        denum = 8 * self._input_dtype.itemsize + cache_size * self._activation_dtype.itemsize
        denum += (self.hidden_size + self.vocab_size) * self._activation_dtype.itemsize
        return floor(num / denum)

    def compute_num_blocks(
        self,
        max_batch_tokens: int,
        max_memory_percent: float,
        cache_dtype: torch.dtype = torch.float16,
    ) -> int:
        """
        If M is given, we have a formula for C:
            num = Mem - M * (hidden_size + vocab_size) * activation_dtype.itemsize - 8 * M * input_dtype.itemsize
            denum = 2 * num_heads * head_dim * num_layers * cache_dtype.itemsize + M * activation_dtype.itemsize
        C = num / denum
        rl   r\   )rd   rn   ro   r"   r#   rp   r    r   r!   rW   r   r   )r6   r%   r'   r(   rs   rx   ry   rw   r8   r8   r9   rg   X  s    
z.PagedAttentionMemoryHandler.compute_num_blocks)r$   r%   r(   r   c           	      C   s   | j j| j| j  }||9 }|d urT|| j }d| j | j | j |j }|| }nd}|d ur|d urtd| | j	j || | j j d| | j	j d| | j	j g}nd}|||fS )Nr\   r   )
rn   ro   r"   r#   r   r    r   r!   ri   rp   )	r6   r$   r%   r(   Zactivation_memory_footprintrw   Zbytes_per_tokenZcache_memory_footprintZstatic_memory_footprintr8   r8   r9   rh   p  s"    

	z4PagedAttentionMemoryHandler.compute_memory_footprint)r_   )rS   rT   rU   r2   Zbfloat16rn   Z_activation_safety_factorZint32rp   rr   rq   rW   r:   staticmethodfloatrd   rV   r   r   r[   r-   re   rf   rg   rh   r8   r8   r8   r9   r,      sx   
+
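

# Usage sketch (hypothetical checkpoint name and values, for illustration only):
#
#     from transformers import AutoConfig, GenerationConfig
#
#     config = AutoConfig.from_pretrained("meta-llama/Llama-3.2-1B")
#     generation_config = GenerationConfig(block_size=32, max_memory=0.9)
#     cache = PagedAttentionCache(config, generation_config, device=torch.device("cuda"))
#     blocks = cache.allocate_blocks(4, request_id="req-0")  # reserve 4 blocks for a prompt
#     cache.free_blocks("req-0")  # hand the blocks back to the free pool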