from typing import Optional, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...cache_utils import Cache, DynamicCache
from ...masking_utils import create_causal_mask
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, logging
from ...utils.deprecation import deprecate_kwarg
from ..llama.modeling_llama import (
    LlamaAttention,
    LlamaDecoderLayer,
    LlamaForCausalLM,
    LlamaModel,
    LlamaPreTrainedModel,
)
from .configuration_granite import GraniteConfig


logger = logging.get_logger(__name__)


class GraniteAttention(LlamaAttention):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: GraniteConfig, layer_idx: Optional[int] = None):
        super().__init__(config, layer_idx)
        # Granite scales attention scores with a config-defined multiplier instead of 1/sqrt(head_dim).
        self.scaling = config.attention_multiplier


class GraniteDecoderLayer(LlamaDecoderLayer):
    def __init__(self, config: GraniteConfig, layer_idx: int):
        super().__init__(config, layer_idx)
        # The residual branches are scaled before being added back to the hidden states.
        self.residual_multiplier = config.residual_multiplier
        self.self_attn = GraniteAttention(config=config, layer_idx=layer_idx)

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_values (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that inject code
                into the model
        """
        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

        # Self attention
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
            **kwargs,
        )
        hidden_states = residual + hidden_states * self.residual_multiplier

        # Fully connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states * self.residual_multiplier

        outputs = (hidden_states,)
        if output_attentions:
            outputs += (self_attn_weights,)

        return outputs


class GranitePreTrainedModel(LlamaPreTrainedModel):
    pass


class GraniteModel(LlamaModel):
    def __init__(self, config: GraniteConfig):
        super().__init__(config)
        # Input embeddings are scaled by a config-defined multiplier before entering the decoder stack.
        self.embedding_multiplier = config.embedding_multiplier
        self.layers = nn.ModuleList(
            [GraniteDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPast:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            use_cache = False

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        inputs_embeds = inputs_embeds * self.embedding_multiplier

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache(config=self.config)

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = create_causal_mask(
            config=self.config,
            input_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            past_key_values=past_key_values,
            position_ids=position_ids,
        )

        hidden_states = inputs_embeds

        # Rotary position embeddings are computed once and shared across all decoder layers.
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None

        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=causal_mask,
                position_ids=position_ids,
                past_key_values=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
                **kwargs,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        # Add hidden states from the last decoder layer.
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values if use_cache else None,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )


class GraniteForCausalLM(LlamaForCausalLM):
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> CausalLMOutputWithPast:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            cache_position=cache_position,
            **kwargs,
        )

        hidden_states = outputs.last_hidden_state
        # Only compute the logits that are actually needed (e.g. the last position during generation).
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])
        # Granite divides the logits by a config-defined scaling factor.
        logits = logits / self.config.logits_scaling

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = ["GraniteForCausalLM", "GraniteModel", "GranitePreTrainedModel"]