import math
from dataclasses import dataclass
from typing import Callable, Optional, Union

import torch
import torch.nn as nn
import torch.nn.functional as F

from transformers.models.llama4.configuration_llama4 import Llama4VisionConfig

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache
from ...generation import GenerationMixin
from ...integrations import use_kernel_forward_from_hub
from ...masking_utils import create_causal_mask, create_chunked_causal_mask
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPast,
    CausalLMOutputWithPast,
    ModelOutput,
)
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
from ...utils.deprecation import deprecate_kwarg
from ...utils.generic import check_model_inputs
from .configuration_llama4 import Llama4Config, Llama4TextConfig


logger = logging.get_logger(__name__)


class Llama4TextExperts(nn.Module):
    def __init__(self, config: Llama4TextConfig):
        super().__init__()
        self.num_experts = config.num_local_experts
        self.intermediate_size = config.intermediate_size
        self.hidden_size = config.hidden_size
        self.expert_dim = self.intermediate_size
        self.gate_up_proj = nn.Parameter(torch.empty(self.num_experts, self.hidden_size, 2 * self.expert_dim))
        self.down_proj = nn.Parameter(torch.empty((self.num_experts, self.expert_dim, self.hidden_size)))
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """
        This should really not be run on a single machine, as we are reaching compute bound:
        - the inputs are expected to be "sorted" per expert already.
        - the weights are viewed with another dim, to match num_expert, 1, shape * num_tokens, shape

        Args:
            hidden_states (torch.Tensor): (batch_size * token_num, hidden_size)
            selected_experts (torch.Tensor): (batch_size * token_num, top_k)
            routing_weights (torch.Tensor): (batch_size * token_num, top_k)
        Returns:
            torch.Tensor
        """
        hidden_states = hidden_states.view(self.gate_up_proj.shape[0], -1, self.hidden_size)
        gate_up = torch.bmm(hidden_states, self.gate_up_proj)
        gate, up = gate_up.chunk(2, dim=-1)
        next_states = torch.bmm((up * self.act_fn(gate)), self.down_proj)
        next_states = next_states.view(-1, self.hidden_size)
        return next_states


class Llama4TextMLP(nn.Module):
    def __init__(self, config, intermediate_size=None):
        super().__init__()
        if intermediate_size is None:
            intermediate_size = config.intermediate_size
        self.config = config
        self.gate_proj = nn.Linear(config.hidden_size, intermediate_size, bias=False)
        self.up_proj = nn.Linear(config.hidden_size, intermediate_size, bias=False)
        self.down_proj = nn.Linear(intermediate_size, config.hidden_size, bias=False)
        self.activation_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        down_proj = self.activation_fn(self.gate_proj(x)) * self.up_proj(x)
        return self.down_proj(down_proj)


class Llama4TextL2Norm(torch.nn.Module):
    def __init__(self, eps: float = 1e-6):
        super().__init__()
        self.eps = eps

    def _norm(self, x):
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

    def forward(self, x):
        return self._norm(x.float()).type_as(x)

    def extra_repr(self):
        return f"eps={self.eps}"


class Llama4TextRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-5):
        """
        Llama4RMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(hidden_size))

    def _norm(self, x):
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

    def forward(self, x):
        output = self._norm(x.float()).type_as(x)
        return output * self.weight

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.eps}"


class Llama4Router(nn.Linear):
    def __init__(self, config):
        super().__init__(config.hidden_size, config.num_local_experts, bias=False)
        self.num_experts = config.num_local_experts
        self.top_k = config.num_experts_per_tok

    def forward(self, hidden_states):
        router_logits = super().forward(hidden_states)
        router_top_value, router_indices = torch.topk(router_logits, self.top_k, dim=1)
        router_scores = torch.full_like(router_logits, float("-inf")).scatter_(1, router_indices, router_top_value)
        router_scores = torch.nn.functional.sigmoid(router_scores.float()).to(hidden_states.dtype)
        return router_scores, router_logits


@use_kernel_forward_from_hub("Llama4TextMoe")
class Llama4TextMoe(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.top_k = config.num_experts_per_tok
        self.hidden_dim = config.hidden_size
        self.num_experts = config.num_local_experts
        self.experts = Llama4TextExperts(config)
        self.router = Llama4Router(config)
        self.shared_expert = Llama4TextMLP(config)

    def forward(self, hidden_states):
        hidden_states = hidden_states.reshape(-1, self.hidden_dim)
        router_scores, router_logits = self.router(hidden_states)

        # Each token is replicated once per expert and scaled by its routing score
        # (the score is 0 for experts that were not selected by the router).
        routed_in = hidden_states.repeat(router_scores.shape[1], 1)
        routed_in = routed_in * router_scores.transpose(0, 1).reshape(-1, 1)
        routed_out = self.experts(routed_in)

        out = self.shared_expert(hidden_states)
        out.add_(routed_out.reshape(router_scores.shape[1], -1, routed_out.shape[-1]).sum(dim=0))
        return out, router_scores


class Llama4TextRotaryEmbedding(nn.Module):
    inv_freq: torch.Tensor

    def __init__(self, config: Llama4TextConfig, device=None):
        super().__init__()
        self.rope_type = "llama3" if config.rope_scaling is not None else "default"
        self.max_seq_len_cached = config.max_position_embeddings
        self.original_max_seq_len = config.max_position_embeddings
        self.config = config
        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.original_inv_freq = self.inv_freq

    @torch.no_grad()
    @dynamic_rope_update
    def forward(self, x, position_ids):
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
        position_ids_expanded = position_ids[:, None, :].float()

        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):
            freqs = (inv_freq_expanded.to(x.device) @ position_ids_expanded).transpose(1, 2)
            freqs_cis = torch.polar(torch.ones_like(freqs), freqs)  # complex rotation factors
            freqs_cis = freqs_cis * self.attention_scaling
        return freqs_cis


def apply_rotary_emb(
    xq: torch.Tensor,
    xk: torch.Tensor,
    freqs_cis: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor]:
    xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
    xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
    xq_out = torch.view_as_real(xq_ * freqs_cis[:, :, None, :]).flatten(3)
    xk_out = torch.view_as_real(xk_ * freqs_cis[:, :, None, :]).flatten(3)
    return xq_out.type_as(xq), xk_out.type_as(xk)


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    key_states = repeat_kv(key, module.num_key_value_groups)
    value_states = repeat_kv(value, module.num_key_value_groups)

    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
    if attention_mask is not None:
        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
        attn_weights = attn_weights + causal_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value_states)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


def vision_eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    # Same as `eager_attention_forward`, except the scale is always derived from the head dimension.
    key_states = repeat_kv(key, module.num_key_value_groups)
    value_states = repeat_kv(value, module.num_key_value_groups)

    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * module.head_dim**-0.5
    if attention_mask is not None:
        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
        attn_weights = attn_weights + causal_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value_states)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class Llama4TextAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: Llama4TextConfig, layer_idx):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
        self.num_attention_heads = config.num_attention_heads
        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
        self.num_key_value_heads = config.num_key_value_heads
        self.scaling = self.head_dim**-0.5
        self.attn_scale = config.attn_scale
        self.floor_scale = config.floor_scale
        self.attn_temperature_tuning = config.attn_temperature_tuning
        self.attention_dropout = config.attention_dropout
        self.is_causal = True
        self.use_rope = config.no_rope_layers[layer_idx]
        self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias)
        self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias)
        self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias)
        self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias)
        if self.config.use_qk_norm and self.use_rope:
            self.qk_norm = Llama4TextL2Norm(config.rms_norm_eps)

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_values: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_proj(hidden_states).view(hidden_shape)
        key_states = self.k_proj(hidden_states).view(*input_shape, -1, self.head_dim)
        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)

        if self.use_rope:  # some layers skip rope entirely
            query_states, key_states = apply_rotary_emb(
                query_states, key_states, position_embeddings.to(query_states.device)
            )

        if hasattr(self, "qk_norm"):  # only present when `use_qk_norm` is set
            query_states = self.qk_norm(query_states)
            key_states = self.qk_norm(key_states)

        # Attention temperature tuning: scale the queries on layers that do not use rope.
        if self.attn_temperature_tuning and not self.use_rope:
            attn_scales = (
                torch.log(torch.floor((cache_position.float() + 1.0) / self.floor_scale) + 1.0) * self.attn_scale + 1.0
            )
            attn_scales = attn_scales.view((1, input_shape[-1], 1, 1)).expand((*input_shape, 1, 1))
            query_states = (query_states * attn_scales).to(query_states.dtype)

        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)

        if past_key_values is not None:
            # `cache_position` is needed to write into the right slots of a static cache
            cache_kwargs = {"cache_position": cache_position}
            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights


class Llama4TextDecoderLayer(GradientCheckpointingLayer):
    def __init__(self, config, layer_idx):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.layer_idx = layer_idx
        self.attention_type = config.layer_types[layer_idx]
        self.self_attn = Llama4TextAttention(config, layer_idx)
        self.is_moe_layer = layer_idx in config.moe_layers
        if self.is_moe_layer:
            self.feed_forward = Llama4TextMoe(config)
        else:
            self.feed_forward = Llama4TextMLP(config, intermediate_size=config.intermediate_size_mlp)

        self.input_layernorm = Llama4TextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = Llama4TextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
        **kwargs,
    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)

        # Self attention
        attention_states, _ = self.self_attn(
            hidden_states=hidden_states,
            position_embeddings=position_embeddings,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )
        hidden_states = residual + attention_states

        # Fully connected (dense MLP or MoE depending on the layer)
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.feed_forward(hidden_states)
        if self.is_moe_layer:
            hidden_states, _ = hidden_states  # the MoE block also returns the router scores
        hidden_states = residual + hidden_states.view(residual.shape)
        return hidden_states


@auto_docstring
class Llama4PreTrainedModel(PreTrainedModel):
    config: Llama4Config
    supports_gradient_checkpointing = True
    _skip_keys_device_placement = ["past_key_values"]
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _can_compile_fullgraph = False
    _supports_attention_backend = True

    def _init_weights(self, module):
        std = (
            self.config.initializer_range
            if hasattr(self.config, "initializer_range")
            else self.config.text_config.initializer_range
        )
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.weight.data.fill_(1.0)
            module.bias.data.zero_()
        elif isinstance(module, Llama4TextRMSNorm):
            module.weight.data.fill_(1.0)
        elif isinstance(module, Llama4TextExperts):
            module.gate_up_proj.data.normal_(mean=0.0, std=std)
            module.down_proj.data.normal_(mean=0.0, std=std)
        elif isinstance(module, Llama4VisionModel):
            module.class_embedding.data.normal_(std=module.scale)
            module.positional_embedding_vlm.data.normal_(std=module.scale)


@auto_docstring
class Llama4TextModel(Llama4PreTrainedModel):
    _no_split_modules = ["Llama4TextDecoderLayer"]
    base_model_prefix = "model"
    config: Llama4TextConfig
    _can_record_outputs = {
        "attentions": Llama4TextAttention,
        "hidden_states": Llama4TextDecoderLayer,
        "router_logits": Llama4TextMoe,
    }

    def __init__(self, config: Llama4TextConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size
        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList(
            [Llama4TextDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.norm = Llama4TextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.rotary_emb = Llama4TextRotaryEmbedding(config=config)
        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    @check_model_inputs
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, BaseModelOutputWithPast]:
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids.to(self.embed_tokens.weight.device))

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache(config=self.config)

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )
        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        # Each layer uses either the full causal mask or the chunked (local) causal mask.
        if not isinstance(causal_mask_mapping := attention_mask, dict):
            mask_kwargs = {
                "config": self.config,
                "input_embeds": inputs_embeds,
                "attention_mask": attention_mask,
                "cache_position": cache_position,
                "past_key_values": past_key_values,
            }
            causal_mask_mapping = {
                "full_attention": create_causal_mask(**mask_kwargs),
                "chunked_attention": create_chunked_causal_mask(**mask_kwargs),
            }

        hidden_states = inputs_embeds
        freq_cis = self.rotary_emb(hidden_states, position_ids)

        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
            hidden_states = decoder_layer(
                hidden_states,
                attention_mask=causal_mask_mapping[decoder_layer.attention_type],
                position_ids=position_ids,
                past_key_values=past_key_values,
                use_cache=use_cache,
                cache_position=cache_position,
                position_embeddings=freq_cis,
                **kwargs,
            )

        hidden_states = self.norm(hidden_states)
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values if use_cache else None,
        )


@auto_docstring
class Llama4ForCausalLM(Llama4PreTrainedModel, GenerationMixin):
    _no_split_modules = ["Llama4TextDecoderLayer"]
    base_model_prefix = "language_model"
    _tied_weights_keys = ["lm_head.weight"]
    _tp_plan = {"lm_head": "colwise_rep"}
    config: Llama4TextConfig

    def __init__(self, config):
        super().__init__(config)
        self.model = Llama4TextModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, CausalLMOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Llama4ForCausalLM

        >>> model = Llama4ForCausalLM.from_pretrained("meta-llama4/Llama4-2-7b-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("meta-llama4/Llama4-2-7b-hf")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )

        hidden_states = outputs[0]
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@dataclass
@auto_docstring(
    custom_intro="""
    Base class for Llava causal language model (or autoregressive) outputs.
    """
)
class Llama4CausalLMOutputWithPast(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    past_key_values: Optional[list[torch.FloatTensor]] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None
    image_hidden_states: Optional[torch.FloatTensor] = None


class Llama4VisionMLP2(torch.nn.Module):
    def __init__(self, config):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.fc1 = nn.Linear(self.hidden_size, config.projector_input_dim, bias=False)
        self.fc2 = nn.Linear(config.projector_input_dim, config.projector_output_dim, bias=False)
        self.activation_fn = nn.GELU()
        self.dropout = config.projector_dropout

    def forward(self, hidden_states):
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
        return self.activation_fn(self.fc2(hidden_states))


class Llama4MultiModalProjector(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.linear_1 = nn.Linear(
            config.vision_config.vision_output_dim,
            config.text_config.hidden_size,
            bias=False,
        )

    def forward(self, image_features):
        hidden_states = self.linear_1(image_features)
        return hidden_states


def pixel_shuffle(input_tensor, shuffle_ratio):
    # input_tensor: [batch_size, num_patches, channels]
    batch_size, num_patches, channels = input_tensor.shape
    patch_size = int(math.sqrt(num_patches))

    input_tensor = input_tensor.view(batch_size, patch_size, patch_size, -1)
    batch_size, height, width, channels = input_tensor.size()

    reshaped_tensor = input_tensor.view(batch_size, height, int(width * shuffle_ratio), int(channels / shuffle_ratio))
    reshaped_tensor = reshaped_tensor.permute(0, 2, 1, 3).contiguous()

    reshaped_tensor = reshaped_tensor.view(
        batch_size, int(height * shuffle_ratio), int(width * shuffle_ratio), int(channels / (shuffle_ratio**2))
    )
    reshaped_tensor = reshaped_tensor.permute(0, 2, 1, 3).contiguous()

    output_tensor = reshaped_tensor.view(batch_size, -1, reshaped_tensor.shape[-1])
    return output_tensor


class Llama4VisionPixelShuffleMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.pixel_shuffle_ratio = config.pixel_shuffle_ratio
        self.inner_dim = int(config.projector_input_dim // (self.pixel_shuffle_ratio**2))
        self.output_dim = config.projector_output_dim
        self.mlp = Llama4VisionMLP2(config)

    def forward(self, encoded_patches: torch.Tensor) -> torch.Tensor:
        encoded_patches = pixel_shuffle(encoded_patches, self.pixel_shuffle_ratio)
        return self.mlp(encoded_patches)


def reshape_for_broadcast(freqs_ci: torch.Tensor, query: torch.Tensor):
    ndim = query.ndim
    shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(query.shape)]
    return freqs_ci.view(*shape)


def vision_apply_rotary_emb(
    query: torch.Tensor,
    key: torch.Tensor,
    freqs_ci: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor]:
    query_ = torch.view_as_complex(query.float().reshape(*query.shape[:-1], -1, 2))
    key_ = torch.view_as_complex(key.float().reshape(*key.shape[:-1], -1, 2))
    freqs_ci = reshape_for_broadcast(freqs_ci=freqs_ci, query=query_)  # freqs_ci[:, :, None, :]
    freqs_ci = freqs_ci.to(query_.device)
    query_out = torch.view_as_real(query_ * freqs_ci).flatten(3)
    key_out = torch.view_as_real(key_ * freqs_ci).flatten(3)
    return query_out.type_as(query), key_out.type_as(key)


class Llama4VisionAttention(nn.Module):
    def __init__(self, config: Llama4VisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = config.hidden_size // config.num_attention_heads
        self.num_key_value_groups = 1
        self.attention_dropout = config.attention_dropout
        self.scaling = self.head_dim**-0.5

        self.q_proj = nn.Linear(self.embed_dim, self.num_heads * self.head_dim, bias=True)
        self.k_proj = nn.Linear(self.embed_dim, self.num_heads * self.head_dim, bias=True)
        self.v_proj = nn.Linear(self.embed_dim, self.num_heads * self.head_dim, bias=True)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.embed_dim, bias=True)

    def forward(
        self,
        hidden_states: torch.Tensor,
        freqs_ci: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[Cache] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_proj(hidden_states).view(hidden_shape)
        key_states = self.k_proj(hidden_states).view(hidden_shape)
        value_states = self.v_proj(hidden_states).view(hidden_shape)

        query_states, key_states = vision_apply_rotary_emb(query_states, key_states, freqs_ci=freqs_ci)

        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        attention_interface: Callable = vision_eager_attention_forward
        if self.config._attn_implementation not in ("eager", "flex_attention"):
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            None,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=None,
            is_causal=False,  # vision attention is always bidirectional
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights


class Llama4VisionMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = nn.GELU()
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size, bias=True)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size, bias=True)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class Llama4VisionEncoderLayer(nn.Module):
    def __init__(self, config: Llama4VisionConfig):
        super().__init__()
        self.hidden_size = config.hidden_size

        self.self_attn = Llama4VisionAttention(config)
        self.mlp = Llama4VisionMLP(config)

        self.input_layernorm = nn.LayerNorm(config.hidden_size)
        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size)

    def forward(
        self,
        hidden_state: torch.Tensor,
        freqs_ci: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
    ):
        # Self attention
        residual = hidden_state
        hidden_state = self.input_layernorm(hidden_state)
        hidden_state, attn_weights = self.self_attn(hidden_state, freqs_ci=freqs_ci, attention_mask=attention_mask)
        hidden_state = residual + hidden_state

        # Feed forward
        residual = hidden_state
        hidden_state = self.post_attention_layernorm(hidden_state)
        hidden_state = self.mlp(hidden_state)
        hidden_state = residual + hidden_state

        outputs = (hidden_state,)
        if output_attentions:
            outputs += (attn_weights,)
        return outputs


class Llama4VisionEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`Llama4VisionEncoderLayer`].

    Args:
        config: Llama4VisionConfig
    """

    def __init__(self, config: Llama4VisionConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([Llama4VisionEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        freqs_ci: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutput]:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        for encoder_layer in self.layers:
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            layer_outputs = encoder_layer(
                hidden_state=hidden_states,
                attention_mask=attention_mask,
                output_attentions=output_attentions,
                freqs_ci=freqs_ci,
            )

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

            hidden_states = layer_outputs[0]

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )


class Llama4UnfoldConvolution(nn.Module):
    def __init__(self, config):
        super().__init__()
        kernel_size = config.patch_size
        if isinstance(kernel_size, int):
            kernel_size = (kernel_size, kernel_size)
        self.unfold = torch.nn.Unfold(kernel_size=kernel_size, stride=config.patch_size)
        self.linear = nn.Linear(
            config.num_channels * kernel_size[0] * kernel_size[1],
            config.hidden_size,
            bias=False,
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.unfold(hidden_states)
        hidden_states = hidden_states.permute(0, 2, 1)
        hidden_states = self.linear(hidden_states)
        return hidden_states


class Llama4VisionRotaryEmbedding(nn.Module):
    def __init__(self, config):
        super().__init__()
        idx = config.image_size // config.patch_size
        img_idx = torch.arange(idx**2, dtype=torch.int32).reshape(idx**2, 1)
        img_idx = torch.cat([img_idx, img_idx[:1]], dim=0)
        img_idx[-1, -1] = -2  # cls token marker
        frequencies_x = img_idx % idx  # x coordinates of the 2d patch grid
        frequencies_y = img_idx // idx  # y coordinates of the 2d patch grid
        freq_dim = config.hidden_size // config.num_attention_heads // 2
        rope_freq = 1.0 / (config.rope_theta ** (torch.arange(0, freq_dim, 2)[: (freq_dim // 2)].float() / freq_dim))
        freqs_x = ((frequencies_x + 1)[..., None] * rope_freq[None, None, :]).repeat_interleave(2, dim=-1)
        freqs_y = ((frequencies_y + 1)[..., None] * rope_freq[None, None, :]).repeat_interleave(2, dim=-1)
        freqs = torch.cat([freqs_x, freqs_y], dim=-1).float().contiguous()[..., ::2]
        freqs = freqs.masked_fill(img_idx.reshape(-1, 1, 1) < 0, 0)
        freq_cis = torch.view_as_complex(torch.stack([torch.cos(freqs), torch.sin(freqs)], dim=-1))
        self.freqs_ci = freq_cis

    def forward(self, hidden_states):
        return self.freqs_ci.to(hidden_states.device)


@auto_docstring
class Llama4VisionModel(Llama4PreTrainedModel):
    base_model_prefix = "vision_model"
    _no_split_modules = ["Llama4VisionEncoderLayer"]
    config: Llama4VisionConfig

    def __init__(self, config: Llama4VisionConfig):
        super().__init__(config)
        self.image_size = config.image_size
        self.patch_size = config.patch_size
        self.hidden_size = config.hidden_size
        self.num_channels = config.num_channels

        self.num_patches = (self.image_size // self.patch_size) ** 2 + 1
        self.scale = config.hidden_size**-0.5

        self.patch_embedding = Llama4UnfoldConvolution(config)
        self.class_embedding = nn.Parameter(self.scale * torch.randn(self.hidden_size))
        self.positional_embedding_vlm = nn.Parameter(self.scale * torch.randn(self.num_patches, self.hidden_size))

        self.rotary_embedding = Llama4VisionRotaryEmbedding(config)

        # layer norms
        self.layernorm_pre = nn.LayerNorm(self.hidden_size)
        self.layernorm_post = nn.LayerNorm(self.hidden_size)

        # encoders
        self.model = Llama4VisionEncoder(config)
        self.vision_adapter = Llama4VisionPixelShuffleMLP(config)

        self.post_init()

    def get_input_embeddings(self):
        """
        This function is used to fetch the first embedding layer to activate grads on inputs.
        """
        return self.patch_embedding

    def forward(
        self,
        pixel_values: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[BaseModelOutput, tuple[torch.Tensor, ...]]:
        r"""

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, MllamaVisionModel

        >>> checkpoint = "meta-llama/Llama-3.2-11B-Vision"
        >>> model = MllamaVisionModel.from_pretrained(checkpoint)
        >>> processor = AutoProcessor.from_pretrained(checkpoint)

        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> inputs = processor(images=image, return_tensors="pt")

        >>> output = model(**inputs)

        >>> print(output.last_hidden_state.shape)
        torch.Size([1, 1, 4, 1025, 7680])
        ```
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        batch_size_times_num_tiles, num_channels, height, width = pixel_values.shape
        num_concurrent_media = 1
        num_chunks = 1

        hidden_state = self.patch_embedding(pixel_values)
        _, num_patches, hidden_dim = hidden_state.shape

        # Add cls token
        hidden_state = hidden_state.reshape(
            batch_size_times_num_tiles * num_concurrent_media * num_chunks, num_patches, hidden_dim
        )
        class_embedding = self.class_embedding.expand(hidden_state.shape[0], 1, hidden_state.shape[-1])
        hidden_state = torch.cat([hidden_state, class_embedding], dim=1)
        num_patches += 1

        # Add positional embedding
        hidden_state = hidden_state.reshape(
            batch_size_times_num_tiles * num_concurrent_media, num_chunks, num_patches, hidden_dim
        )
        positional_embedding = self.positional_embedding_vlm.to(dtype=hidden_state.dtype, device=hidden_state.device)
        hidden_state = hidden_state + positional_embedding

        hidden_state = self.layernorm_pre(hidden_state)

        hidden_state = hidden_state.view(batch_size_times_num_tiles, -1, hidden_dim)
        freqs_ci = self.rotary_embedding(pixel_values)

        output = self.model(
            hidden_state,
            attention_mask=None,
            output_hidden_states=output_hidden_states,
            output_attentions=output_attentions,
            freqs_ci=freqs_ci,
        )

        hidden_state = output.last_hidden_state
        hidden_state = self.layernorm_post(hidden_state)

        # Drop the cls token and project the patch embeddings through the pixel-shuffle adapter.
        hidden_state = hidden_state[:, :-1, :]
        hidden_state = self.vision_adapter(hidden_state)

        hidden_states = output.hidden_states if output_hidden_states else None
        attentions = output[2] if output_attentions else None

        if not return_dict:
            return tuple(v for v in [hidden_state, hidden_states, attentions] if v is not None)

        return BaseModelOutput(
            last_hidden_state=hidden_state,
            hidden_states=hidden_states,
            attentions=attentions,
        )


@auto_docstring
class Llama4ForConditionalGeneration(Llama4PreTrainedModel, GenerationMixin):
    _no_split_modules = ["Llama4TextDecoderLayer", "Llama4VisionEncoderLayer"]
    _tp_plan = {}
    base_model_prefix = ""
    config: Llama4Config

    def __init__(self, config: Llama4Config):
        super().__init__(config)
        self.vision_model = Llama4VisionModel(config.vision_config)
        self.multi_modal_projector = Llama4MultiModalProjector(config)
        self.language_model = Llama4ForCausalLM(config.text_config)
        self.vocab_size = config.text_config.vocab_size
        self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
        self.post_init()

    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    def get_output_embeddings(self):
        return self.language_model.get_output_embeddings()

    def set_output_embeddings(self, new_embeddings):
        self.language_model.set_output_embeddings(new_embeddings)

    def set_decoder(self, decoder):
        self.language_model.set_decoder(decoder)

    def get_decoder(self):
        return self.language_model.get_decoder()

    def get_image_features(
        self,
        pixel_values: torch.FloatTensor,
        vision_feature_layer: Union[int, list[int]],
        vision_feature_select_strategy: str,
        **kwargs,
    ):
        """
        Obtains image last hidden states from the vision tower and apply multimodal projection.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`):
               The tensors corresponding to the input images.
            vision_feature_layer (`Union[int, list[int]]`):
                The index of the layer to select the vision feature. If multiple indices are provided,
                the vision feature of the corresponding indices will be concatenated to form the
                vision features.
            vision_feature_select_strategy (`str`):
                The feature selection strategy used to select the vision feature from the vision backbone.
                Can be one of `"default"` or `"full"`
        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`.
        """
        if vision_feature_select_strategy not in ["default", "full"]:
            raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}")
        kwargs = {k: v for k, v in kwargs.items() if v is not None}
        image_outputs = self.vision_model(pixel_values, output_hidden_states=False, **kwargs)
        hidden_state = image_outputs.last_hidden_state
        return hidden_state

    def get_placeholder_mask(
        self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor
    ):
        """
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        """
        if input_ids is None:
            special_image_mask = inputs_embeds == self.get_input_embeddings()(
                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
            )
            special_image_mask = special_image_mask.all(-1)
        else:
            special_image_mask = input_ids == self.config.image_token_id

        n_image_tokens = special_image_mask.sum()
        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
        if inputs_embeds[special_image_mask].numel() != image_features.numel():
            raise ValueError(
                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {image_features.shape[0]}"
            )
        return special_image_mask

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        vision_feature_layer: Optional[Union[int, list[int]]] = None,
        vision_feature_select_strategy: Optional[str] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        image_sizes: Optional[torch.Tensor] = None,
        **lm_kwargs,
    ) -> Union[tuple, Llama4CausalLMOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, LlavaForConditionalGeneration

        >>> model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf")
        >>> processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")

        >>> prompt = "USER: <image>\nWhat's the content of the image? ASSISTANT:"
        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=15)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "USER:  \nWhat's the content of the image? ASSISTANT: The image features a busy city street with a stop sign prominently displayed"
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        vision_feature_layer = (
            vision_feature_layer
            if vision_feature_layer is not None
            else self.config.vision_config.vision_feature_layer
        )
        vision_feature_select_strategy = (
            vision_feature_select_strategy
            if vision_feature_select_strategy is not None
            else self.config.vision_config.vision_feature_select_strategy
        )

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if pixel_values is not None and inputs_embeds is not None:
            raise ValueError(
                "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one"
            )

        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        if pixel_values is not None:
            image_features = self.get_image_features(
                pixel_values=pixel_values,
                vision_feature_layer=vision_feature_layer,
                vision_feature_select_strategy=vision_feature_select_strategy,
                image_sizes=image_sizes,
            )
            vision_flat = image_features.view(-1, image_features.size(-1))
            projected_vision_flat = self.multi_modal_projector(vision_flat).to(
                inputs_embeds.device, inputs_embeds.dtype
            )
            special_image_mask = self.get_placeholder_mask(
                input_ids, inputs_embeds=inputs_embeds, image_features=projected_vision_flat
            )
            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, projected_vision_flat)

        outputs = self.language_model(
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
            logits_to_keep=logits_to_keep,
            **lm_kwargs,
        )

        logits = outputs[0]

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            if attention_mask is not None:
                # Use the 2D attention mask to keep only non-padded positions when shifting;
                # crop the mask in case it is longer than the logits (e.g. PrefixTuning with peft).
                shift_attention_mask = attention_mask[:, -(logits.shape[1] - 1) :].to(logits.device)
                shift_logits = logits[..., :-1, :][shift_attention_mask.to(logits.device) != 0].contiguous()
                shift_labels = labels[..., 1:][shift_attention_mask.to(labels.device) != 0].contiguous()
            else:
                shift_logits = logits[..., :-1, :].contiguous()
                shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(
                shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1).to(shift_logits.device)
            )

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return Llama4CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=image_features if pixel_values is not None else None,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        inputs_embeds=None,
        pixel_values=None,
        attention_mask=None,
        cache_position=None,
        logits_to_keep=None,
        **kwargs,
    ):
        # Overwritten -- image inputs are only forwarded on the first (prefill) step
        model_inputs = self.language_model.prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            logits_to_keep=logits_to_keep,
            **kwargs,
        )

        if cache_position[0] == 0:
            # In the cached decoding stage pixel values should be None because the input ids no longer
            # contain special image tokens; otherwise pixel values need to be passed to the model.
            model_inputs["pixel_values"] = pixel_values

        return model_inputs


__all__ = [
    "Llama4PreTrainedModel",
    "Llama4TextModel",
    "Llama4ForCausalLM",
    "Llama4VisionModel",
    "Llama4ForConditionalGeneration",
]