from typing import Callable, Optional, Union

import torch
import torch.nn as nn
import torch.nn.functional as F

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache
from ...generation import GenerationMixin
from ...masking_utils import create_causal_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
from ...utils.deprecation import deprecate_kwarg
from ...utils.generic import check_model_inputs
from .configuration_olmo import OlmoConfig

class OlmoLayerNorm(nn.Module):
    """LayerNorm but with no learnable weight or bias."""

    def __init__(self, hidden_size: int) -> None:
        super().__init__()
        self.normalized_shape = (hidden_size,)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # Normalize in float32 for numerical stability, then cast back to the input dtype.
        orig_dtype = hidden_states.dtype
        return F.layer_norm(
            hidden_states.to(dtype=torch.float32), self.normalized_shape, None, None, eps=1e-5
        ).to(orig_dtype)
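# Note on the layer norm above: passing `None` for both weight and bias to F.layer_norm makes this
# equivalent to nn.LayerNorm(hidden_size, eps=1e-5, elementwise_affine=False), i.e. pure normalization
# with no learned scale or shift, which is the non-parametric layer norm used by OLMo.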

class OlmoMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        # Gated MLP: activation(gate_proj(x)) * up_proj(x), projected back to the hidden size.
        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
        return down_proj

def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
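# Illustrative shape example for repeat_kv: with 32 attention heads and 8 key/value heads
# (n_rep = 4), a key/value tensor of shape (batch, 8, seq_len, head_dim) is expanded to
# (batch, 32, seq_len, head_dim) so that every query head has a matching KV head during attention.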
|d urf|d d d d d d d |jd f }|
| }
tjj|
dtj	d
|j}
tjj|
|| jd}
t|
|	}|dd }||
fS )NrB   r   rA   )rD   r'   )ptrainingr   )rP   num_key_value_groupsr+   matmul	transposerE   r9   
functionalZsoftmaxr,   r*   r'   rX   r\   
contiguous)rR   rS   rT   rU   rV   rW   rX   rY   
key_statesvalue_statesattn_weightscausal_maskattn_outputr$   r$   r%   eager_attention_forwardM   s    
&rg   c           
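# eager_attention_forward is the plain-PyTorch attention path: it computes
# softmax(Q @ K^T * scaling + causal_mask) @ V, with scaling supplied by the caller
# (head_dim ** -0.5 in OlmoAttention). Other backends (e.g. sdpa, flash attention) are
# dispatched through ALL_ATTENTION_FUNCTIONS instead of this function.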

def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    q_type, k_type = q.dtype, k.dtype
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed.to(q_type), k_embed.to(k_type)
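# Shape example for the default unsqueeze_dim=1: cos/sin of shape (batch, seq_len, head_dim)
# become (batch, 1, seq_len, head_dim) and broadcast against q/k of shape
# (batch, num_heads, seq_len, head_dim). Because OlmoRotaryEmbedding returns float32 cos/sin,
# the rotation is computed in float32 and the result is cast back to the original q/k dtypes.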

class OlmoAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: OlmoConfig, layer_idx: int):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
        self.scaling = self.head_dim**-0.5
        self.attention_dropout = config.attention_dropout
        self.is_causal = True

        self.q_proj = nn.Linear(
            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
        )
        self.k_proj = nn.Linear(
            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
        )
        self.v_proj = nn.Linear(
            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
        )
        self.o_proj = nn.Linear(
            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
        )

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_values: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        # OLMo optionally clips the QKV activations to the range [-clip_qkv, clip_qkv].
        if self.config.clip_qkv is not None:
            query_states.clamp_(min=-self.config.clip_qkv, max=self.config.clip_qkv)
            key_states.clamp_(min=-self.config.clip_qkv, max=self.config.clip_qkv)
            value_states.clamp_(min=-self.config.clip_qkv, max=self.config.clip_qkv)

        query_states = query_states.view(hidden_shape).transpose(1, 2)
        key_states = key_states.view(hidden_shape).transpose(1, 2)
        value_states = value_states.view(hidden_shape).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_values is not None:
            # sin and cos are specific to RoPE models; cache_position is needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights

class OlmoDecoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: OlmoConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.self_attn = OlmoAttention(config=config, layer_idx=layer_idx)
        self.mlp = OlmoMLP(config)
        self.input_layernorm = OlmoLayerNorm(config.hidden_size)
        self.post_attention_layernorm = OlmoLayerNorm(config.hidden_size)

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.Tensor:
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states, _ = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
            **kwargs,
        )
        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states
        return hidden_states
  ZS )	OlmoRotaryEmbeddinginv_freqNr8   c                    s   t    t|dr:t|jtr:|jd|jd| _nd| _|j| _	|j| _
|| _t| j | _| | j|\}| _| jd|dd | j| _d S )Nrope_scaling	rope_typetypedefaultr   F)
persistent)r   r   hasattr
isinstancer   dictgetr   Zmax_position_embeddingsZmax_seq_len_cachedZoriginal_max_seq_lenr8   r   Zrope_init_fnattention_scalingZregister_bufferr   Zoriginal_inv_freq)r!   r8   devicer   r"   r$   r%   r     s    
zOlmoRotaryEmbedding.__init__c           
      C   s   | j d d d d f  |jd dd|j}|d d d d d f  }t|jjtrl|jjdkrl|jjnd}t	j
|dd^ | |  dd}t	j||fdd	}| | j }| | j }	||	fW  d    S 1 s0    Y  d S )
Nr   rA   r   ZmpscpuF)device_typeZenabledrB   rC   )r   floatrJ   rE   r*   r   r   r   strr+   Zautocastr_   rF   rk   r   rl   )
r!   r@   rm   Zinv_freq_expandedZposition_ids_expandedr   ZfreqsZembrk   rl   r$   r$   r%   r-     s    0&zOlmoRotaryEmbedding.forward)N)r.   r/   r0   r+   r3   __annotations__r   r   Zno_gradr   r-   r4   r$   r$   r"   r%   r      s
   


@auto_docstring
class OlmoPreTrainedModel(PreTrainedModel):
    config: OlmoConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["OlmoDecoderLayer"]
    _skip_keys_device_placement = ["past_key_values"]
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _can_compile_fullgraph = True
    _supports_attention_backend = True
    _can_record_outputs = {
        "hidden_states": OlmoDecoderLayer,
        "attentions": OlmoAttention,
    }

@auto_docstring
class OlmoModel(OlmoPreTrainedModel):
    def __init__(self, config: OlmoConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList(
            [OlmoDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.norm = OlmoLayerNorm(config.hidden_size)
        self.rotary_emb = OlmoRotaryEmbedding(config=config)
        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    @check_model_inputs
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPast:
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache(config=self.config)

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = create_causal_mask(
            config=self.config,
            input_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            past_key_values=past_key_values,
            position_ids=position_ids,
        )

        hidden_states = inputs_embeds
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
            hidden_states = decoder_layer(
                hidden_states,
                attention_mask=causal_mask,
                position_ids=position_ids,
                past_key_values=past_key_values,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
                **kwargs,
            )

        hidden_states = self.norm(hidden_states)
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
        )
zOlmoModel.forward)NNNNNNN)r.   r/   r0   r   r   r   r   r   r+   r   r3   r   FloatTensorr   r   r   r   r-   r4   r$   r$   r"   r%   r   6  s*          r   c                       s   e Zd ZdgZddiZddgdgfiZ fddZeede	e
j e	e
j e	e
j e	e e	e
j e	e
j e	e e	e
j eee
jf ee ed
ddZ  ZS )OlmoForCausalLMzlm_head.weightlm_headZcolwise_repr&   logitsc                    s@   t  | t|| _|j| _tj|j|jdd| _| 	  d S r6   )
r   r   r   r   r   r9   r:   r   r   r   r?   r"   r$   r%   r     s
    
zOlmoForCausalLM.__init__Nr   )r   rV   rm   rz   r   labelsr   r   logits_to_keeprY   r   c
              
   K   s   | j f |||||||d|
}|j}t|	tr<t|	 dn|	}| |dd|ddf }d}|dur| jf ||| jjd|
}t	|||j
|j|jdS )a  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, OlmoForCausalLM

        >>> model = OlmoForCausalLM.from_pretrained("meta-olmo/Olmo-2-7b-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("meta-olmo/Olmo-2-7b-hf")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```)r   rV   rm   rz   r   r   r   N)r   r   r   )lossr   rz   r&   r   )r   r   r   r2   slicer   Zloss_functionr8   r   r   rz   r&   r   )r!   r   rV   rm   rz   r   r   r   r   r   rY   outputsr&   Zslice_indicesr   r   r$   r$   r%   r-     s0     zOlmoForCausalLM.forward)	NNNNNNNNr   )r.   r/   r0   Z_tied_weights_keysZ_tp_planZ_pp_planr   r   r   r   r+   r   r3   r   r   r   r   r2   r   r   r   r-   r4   r$   r$   r"   r%   r     s8   	         r   )r   r   r   )rQ   )Nr   );typingr   r   r   r+   Ztorch.nnr9   Ztorch.nn.functionalr`   r)   Zactivationsr   Zcache_utilsr   r   Z
generationr	   Zmasking_utilsr
   Zmodeling_layersr   Zmodeling_outputsr   r   Zmodeling_rope_utilsr   r   Zmodeling_utilsr   r   Zprocessing_utilsr   utilsr   r   r   Zutils.deprecationr   Zutils.genericr   Zconfiguration_olmor   Moduler   r5   rH   r3   r2   rP   r   rg   rn   ro   r   r   r   r   r   __all__r$   r$   r$   r%   <module>   sT    
P-#NK