"""PyTorch Idefics model."""

from dataclasses import dataclass
from typing import Any, Callable, Optional, Union

import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import ModelOutput
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PretrainedConfig, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import (
    TransformersKwargs,
    auto_docstring,
    can_return_tuple,
    is_torch_flex_attn_available,
    logging,
)
from ...utils.deprecation import deprecate_kwarg
from .configuration_idefics import IdeficsConfig
from .perceiver import IdeficsPerceiverResampler
from .vision import IdeficsVisionEmbeddings, IdeficsVisionTransformer


if is_torch_flex_attn_available():
    from torch.nn.attention.flex_attention import BlockMask

    from ...integrations.flex_attention import make_flex_block_causal_mask


logger = logging.get_logger(__name__)
@dataclass
@auto_docstring(
    custom_intro="""
    Base class for Idefics model's outputs that may also contain a past key/values (to speed up sequential decoding).
    """
)
class IdeficsBaseModelOutputWithPast(ModelOutput):
    r"""
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.

        If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
        hidden_size)` is output.
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`, and optionally, if
        `config.is_encoder_decoder=True`, 2 additional tensors of shape `(batch_size, num_heads,
        encoder_sequence_length, embed_size_per_head)`.

        Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
        `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
        input) to speed up sequential decoding.
    image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for the output of the image embeddings) of shape `(batch_size, num_images,
        sequence_length, hidden_size)`.

        image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
    """

    last_hidden_state: Optional[torch.FloatTensor] = None
    past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None
    image_hidden_states: Optional[tuple[torch.FloatTensor]] = None
@dataclass
@auto_docstring(
    custom_intro="""
    Base class for Idefics causal language model (or autoregressive) outputs.
    """
)
class IdeficsCausalLMOutputWithPast(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`.

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for the output of the image embeddings) of shape `(batch_size, num_images,
        sequence_length, hidden_size)`.

        image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    past_key_values: Optional[list[torch.FloatTensor]] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None
    image_hidden_states: Optional[tuple[torch.FloatTensor]] = None


def expand_inputs_for_generation(
    input_ids,
    expand_size=1,
    is_encoder_decoder=False,
    attention_mask=None,
    encoder_outputs=None,
    **model_kwargs,
):
    expanded_return_idx = (
        torch.arange(input_ids.shape[0]).view(-1, 1).repeat(1, expand_size).view(-1).to(input_ids.device)
    )
    input_ids = input_ids.index_select(0, expanded_return_idx)
    model_kwargs["pixel_values"] = model_kwargs.get("pixel_values")
    model_kwargs["image_encoder_embeddings"] = model_kwargs.get("image_encoder_embeddings")
    model_kwargs["perceiver_embeddings"] = model_kwargs.get("perceiver_embeddings")
    model_kwargs["image_attention_mask"] = model_kwargs.get("image_attention_mask")

    if "token_type_ids" in model_kwargs:
        token_type_ids = model_kwargs["token_type_ids"]
        model_kwargs["token_type_ids"] = token_type_ids.index_select(0, expanded_return_idx)

    if attention_mask is not None:
        model_kwargs["attention_mask"] = attention_mask.index_select(0, expanded_return_idx)

    if model_kwargs["image_attention_mask"] is not None:
        model_kwargs["image_attention_mask"] = model_kwargs["image_attention_mask"].index_select(
            0, expanded_return_idx
        )

    if model_kwargs["pixel_values"] is not None:
        model_kwargs["pixel_values"] = model_kwargs["pixel_values"].index_select(0, expanded_return_idx)
    elif model_kwargs["image_encoder_embeddings"] is not None:
        model_kwargs["image_encoder_embeddings"] = model_kwargs["image_encoder_embeddings"].index_select(
            0, expanded_return_idx
        )
    elif model_kwargs["perceiver_embeddings"] is not None:
        model_kwargs["perceiver_embeddings"] = model_kwargs["perceiver_embeddings"].index_select(
            0, expanded_return_idx
        )

    return input_ids, model_kwargs


def freeze_model(model, module_exceptions=[]):
    mapping = {
        "LayerNorm": nn.LayerNorm,
        "Linear": nn.Linear,
        "Embedding": nn.Embedding,
    }
    module_exceptions_mapped = [mapping[m] for m in module_exceptions]
    for module in model.modules():
        if module_exceptions and any(isinstance(module, t) for t in module_exceptions_mapped):
            module.requires_grad_(True)  # Explicitly setting it to true to avoid any mistakes
        else:
            module.requires_grad_(False)
    return model


class IdeficsDecoupledEmbedding(nn.Embedding):
    """
    Implements a decoupling of parameters to allow freezing (or not) a subset of the embeddings. In practise, the
    regular `weight` can be trained or frozen (i.e. `partially_freeze=True`), and if `num_additional_embeddings` > 0,
    then it will create `num_additional_embeddings` additional parameters that are always trained. If
    `num_additional_embeddings=0`, then the module defaults back to the regular behavior of `nn.Embedding`.
    """

    def __init__(
        self,
        num_embeddings,
        num_additional_embeddings,
        embedding_dim,
        partially_freeze: Optional[bool] = False,
        device=None,
        dtype=None,
        padding_idx=None,
        **kwargs,
    ) -> None:
        """
        Args:
            num_embeddings (`int`):
                Size of the dictionary of embeddings
            num_additional_embeddings (`int`):
                Number of additional embeddings. Only useful when you `partially_freeze=True`.
            embedding_dim (`int`):
                The size of each embedding vector
            partially_freeze: (`bool`, *optional*, defaults to `False`):
                If `True`, the regular `weight` will be frozen. `additional_weight` is never frozen.
            padding_idx (`int`, *optional*):
                The padding index (needs to be less than num_embeddings)

        Note: there are a lot of other parameters to initialize a standard `nn.Embedding` such as `padding_idx`,
        `max_norm` or `norm_type`. We are not supporting these.
        """
        if padding_idx is not None and padding_idx > num_embeddings:
            raise ValueError(f"padding_idx must be within num_embeddings. Got {padding_idx} and {num_embeddings}")
        super().__init__(
            num_embeddings=num_embeddings,
            embedding_dim=embedding_dim,
            device=device,
            dtype=dtype,
            padding_idx=padding_idx,
            **kwargs,
        )
        self.num_embeddings = num_embeddings
        self.padding_idx = padding_idx
        self.num_additional_embeddings = num_additional_embeddings
        self.partially_freeze = partially_freeze

        if partially_freeze:
            self.weight.requires_grad_(False)

        if self.num_additional_embeddings > 0:
            self.additional_embedding = nn.Embedding(
                num_embeddings=self.num_additional_embeddings,
                embedding_dim=embedding_dim,
                device=device,
                dtype=dtype,
            )

    def forward(self, input_ids):
        """
        we have 2 embeddings, with different indices - one pretrained self.weight and another
        self.additional_embedding.weight that is being trained.

        in order to make a lookup of the input ids, we:
        1. find out the indices of the entries belonging to the 2nd embedding
        2. extract those values while subtracting the size of the first embedding (num_embeddings), since the 2nd
           embedding starts from 0 and not num_embeddings
        3. perform the 2nd embedding lookup
        4. now we handle the 1st embedding, we overwrite indices belonging to the 2nd embedding with a padding index
        5. perform the 1st embedding lookup
        6. now we overwrite the values in the 1st embedding lookup with the values of the 2nd embedding lookup

        note: for the 1st embedding lookup we could have looked up only the low indices and not do the padding, but
        then we have to create a new tensor and populate it with 2 tensors that are spread out across various indices -
        i.e. not a simple concat - I haven't benchmarked the complex case if it's any faster, given that seqlens are
        usually relatively short it's probably not faster or if faster not by much - but might be a good idea to
        measure.
        """
        if self.num_additional_embeddings == 0:
            return F.embedding(input_ids, self.weight)

        # Clone so that we don't modify the original input_ids later on
        input_ids = input_ids.clone()
        additional_vocab_indices = torch.where(input_ids >= self.num_embeddings)
        input_ids_additional_vocab = input_ids[additional_vocab_indices]
        additional_embeddings = self.additional_embedding(input_ids_additional_vocab - self.num_embeddings)

        # for successful lookup replace input_ids with 0, the results of these will be discarded anyway
        input_ids[additional_vocab_indices] = 0
        full_vector = F.embedding(input_ids, self.weight)

        # overwrite the records with high indices with the additional-embedding results
        full_vector[additional_vocab_indices] = additional_embeddings

        return full_vector

    def extra_repr(self) -> str:
        return (
            f"num_embeddings={self.num_embeddings}, num_additional_embeddings={self.num_additional_embeddings}, "
            f"embedding_dim={self.embedding_dim}, partially_freeze={self.partially_freeze}"
        )
class IdeficsDecoupledLinear(nn.Linear):
    """
    Implements a decoupling of parameters to allow freezing (or not) a subset of the parameters. In practise, the
    regular `weight` can be trained or frozen (i.e. `partially_freeze=True`), and if `out_additional_features` > 0,
    then it will create `out_additional_features * in_features` additional parameters that are always trained. If
    `out_additional_features=0`, then the module defaults back to the regular behavior of `nn.Linear`.
    """

    def __init__(
        self,
        in_features: int,
        out_features: int,
        out_additional_features: int = 0,
        bias: bool = True,
        partially_freeze: bool = True,
        device=None,
        dtype=None,
    ) -> None:
        """
        out_additional_features: int. Number of additional trainable dimensions. Only makes sense when
        `partially_freeze=True`. partially_freeze: bool. If True, the regular `weight` will be frozen and extra
        parameters (if any) will be trainable. If False, default to the regular behavior of nn.Linear.
        """
        super().__init__(in_features, out_features, bias, device, dtype)
        self.out_additional_features = out_additional_features
        self.partially_freeze = partially_freeze

        self.in_features = in_features
        self.out_features = out_features

        if partially_freeze:
            self.weight.requires_grad_(False)
            if bias:
                self.bias.requires_grad_(False)

        if out_additional_features > 0:
            self.additional_fc = nn.Linear(
                in_features=in_features,
                out_features=out_additional_features,
                bias=bias,
                device=device,
                dtype=dtype,
            )

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        output = F.linear(input, self.weight, self.bias)

        if self.out_additional_features > 0:
            additional_features = self.additional_fc(input)
            output = torch.cat((output, additional_features), -1)

        return output

    def extra_repr(self) -> str:
        """Overwriting `nn.Linear.extra_repr` to include new parameters."""
        return (
            f"in_features={self.in_features}, out_features={self.out_features}, "
            f"out_additional_features={self.out_additional_features}, bias={self.bias is not None}, "
            f"partially_freeze={self.partially_freeze}"
        )
class IdeficsRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        IdeficsRMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)

        # convert into half-precision if necessary
        if self.weight.dtype in [torch.float16, torch.bfloat16]:
            hidden_states = hidden_states.to(self.weight.dtype)

        return self.weight * hidden_states

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"


class IdeficsEmbedding(nn.Module):
    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
        super().__init__()

        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base
        inv_freq = 1.0 / (
            self.base
            ** (torch.arange(0, self.dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / self.dim)
        )
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        # Build the cos/sin cache here to make `torch.jit.trace` work.
        self._set_cos_sin_cache(
            seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
        )

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        self.max_seq_len_cached = seq_len
        t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)

        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
        # Different from the paper, but it uses a different permutation in order to obtain the same calculation
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)

    def forward(self, x, seq_len=None):
        # x: [bs, num_attention_heads, seq_len, head_size]
        if seq_len > self.max_seq_len_cached:
            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)

        return (
            self.cos_cached[:seq_len].to(dtype=x.dtype),
            self.sin_cached[:seq_len].to(dtype=x.dtype),
        )


def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`):
            The position indices of the tokens corresponding to the query and key tensors. For example, this can be
            used to pass offsetted position ids when working with a KV-cache.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos = cos[position_ids].unsqueeze(unsqueeze_dim)
    sin = sin[position_ids].unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed
IdeficsMLPr   intermediate_size
hidden_actc                    sN   t    tj||dd| _tj||dd| _tj||dd| _t| | _d S )NFr{   )	rc   rd   r   rI   	gate_proj	down_projup_projr	   act_fn)rh   r   r   r   rj   r0   r1   rd     s
    
zIdeficsMLP.__init__c                 C   s    |  | | || | S rP   )r   r   r   r   )rh   r   r0   r0   r1   ro     s    zIdeficsMLP.forward)r(   r)   r*   r   ru   rd   ro   rv   r0   r0   rj   r1   r     s
   r           )rS   querykeyvaluer<   scalingdropoutc           
      K   s|   t ||dd| }|d ur(|| }tjj|dt jd|j}tjj	||| j
d}t ||}	|	dd }	|	|fS )Nr6   )r   r`   ptrainingr   r   )r,   matmul	transposer   
functionalZsoftmaxr   rA   r`   r   r   
contiguous)
rS   r   r   r   r<   r   r   ri   attn_weightsattn_outputr0   r0   r1   eager_attention_forward  s    
r   c                       s   e Zd ZdZdeeeeeeee d fddZ	e
jeedd	d
Zeddddde
jee
j ee
j ee
j eee
j  eeee
j ee
jee
j eee
j  f d	ddZ  ZS )IdeficsAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr   FN)r   	num_headsr   is_cross_attentionconfigqk_layer_norms	layer_idxc           	         s  t    || _|| _|| _|| | _|| _d| _| jd | _|| _	|d u rbt
d| jj d | j| | jkrtd| j d| d|| _ttjdstd	| jrt|jd
s| jn|jj}tj| j|| j dd| _tj||| j dd| _tj||| j dd| _nNtj| j|| j dd| _tj| j|| j dd| _tj| j|| j dd| _tj|| j |dd| _t| j| _|| _| jrt| j|jd| _t| j|jd| _ d S )NTg      zInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.z?hidden_size must be divisible by num_heads (got `hidden_size`: z and `num_heads`: z).Zscaled_dot_product_attentionz)this model requires pytorch 2.0 or higher	embed_dimFr   r   )!rc   rd   r   r   r   head_dimr   Z	is_causalr   r   loggerwarning_oncerk   r(   rb   r   hasattrr   r   vision_configr   rI   q_projk_projv_projo_projr   
rotary_embr   r   rms_norm_epsq_layer_normk_layer_norm)	rh   r   r   r   r   r   r   r   Zkv_input_dimrj   r0   r1   rd     sz    




zIdeficsAttention.__init__)tensorr   bszc                 C   s    | ||| j| jdd S )Nr   r   )r?   r   r   r   r   )rh   r   r   r   r0   r0   r1   _shape>  s    zIdeficsAttention._shapepast_key_valuer$   4.58new_nameversion)	r%   key_value_statesr<   r   r$   output_attentions	use_cachecache_positionr]   c	                 K   s  | j p|d u}
| \}}}| |||| j| jdd}|
s| |||| j| jdd}| |||| j| jdd}nR| \}}}| |||| j| jdd}| |||| j| jdd}|j	d }|d ur||d 7 }|
s*| j
|t||d\}}t|||||\}}|d urRd|i}|||| j|\}}| jrn| |}| |}t}| jjdkr| jjdkr|rtd	 nt| jj }|| ||||f| jsd
n| j| jd|	\}}|||d }| |}|rd }||fS )Nr   r   r   r   )r   r   eagersdpaz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.r   )r   r   r6   )r   sizer   r?   r   r   r   r   r   r>   r   maxr   updater   r   r   r   r   r   _attn_implementationr   r   r   r   r   r   reshaper   r   )rh   r%   r   r<   r   r$   r   r   r   ri   r   r   Zq_len_Zquery_statesZ
key_statesZvalue_statesZkv_lenZ
kv_seq_lenr   r   Zcache_kwargsZattention_interfacer   r   r0   r0   r1   ro   A  s\    ""$" 





zIdeficsAttention.forward)r   FNFN)NNNNFFN)r(   r)   r*   r+   r   r   rt   r   r   rd   r,   r   r   r   
LongTensorr/   ro   rv   r0   r0   rj   r1   r     sF        Q       r   c                       s   e Zd Zdeee d fddZedddddej	eej	 eej
 eeej	  ee ee eej
 eejeeejejf  f d
ddZ  ZS )IdeficsDecoderLayerNr   r   c                    sr   t    |j| _t| j|j|j||d| _t| j|j|j	d| _
t|j|jd| _t|j|jd| _|j| _d S )N)r   r   r   r   r   r   r   )rc   rd   r   r   num_attention_headsr   	self_attnr   r   r   mlpr   r   input_layernormpost_attention_layernormrh   r   r   rj   r0   r1   rd     s"    
zIdeficsDecoderLayer.__init__r   r$   r   r   F)r%   r<   r   r$   r   r   r   r]   c              
   K   s   |}	|  |}| jf |||||||d|\}}
tjj|| j| jd}|	| }|}	| |}| |}tjj|| j| jd}|	| }|f}|r||
f7 }|S )a^  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_values (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
        """
        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        return outputs


class IdeficsGatedCrossAttentionLayer(GradientCheckpointingLayer):
    def __init__(self, config: IdeficsConfig, layer_idx: Optional[int] = None):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.cross_attn = IdeficsAttention(
            hidden_size=self.hidden_size,
            num_heads=config.num_attention_heads,
            is_cross_attention=True,
            dropout=config.dropout,
            config=config,
            qk_layer_norms=config.qk_layer_norms,
            layer_idx=layer_idx,
        )
        self.mlp = IdeficsMLP(
            hidden_size=self.hidden_size,
            intermediate_size=config.intermediate_size,
            hidden_act=config.hidden_act,
        )
        self.input_layernorm = IdeficsRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = IdeficsRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.dropout = config.dropout

        self.act_cross_attn = nn.Tanh()
        self.act_dense = nn.Tanh()

        if config.alpha_initializer == "zeros":
            if config.alpha_type == "vector":
                self.alpha_cross_attn = nn.Parameter(torch.zeros(1, 1, self.hidden_size))
                self.alpha_dense = nn.Parameter(torch.zeros(1, 1, self.hidden_size))
            elif config.alpha_type == "float":
                self.alpha_cross_attn = nn.Parameter(torch.zeros(1))
                self.alpha_dense = nn.Parameter(torch.zeros(1))
            else:
                raise ValueError(f"Unknown value for `alpha_type` ({config.alpha_type})")
        elif config.alpha_initializer == "ones":
            if config.alpha_type == "vector":
                self.alpha_cross_attn = nn.Parameter(torch.ones(1, 1, self.hidden_size))
                self.alpha_dense = nn.Parameter(torch.ones(1, 1, self.hidden_size))
            elif config.alpha_type == "float":
                self.alpha_cross_attn = nn.Parameter(torch.ones(1))
                self.alpha_dense = nn.Parameter(torch.ones(1))
            else:
                raise ValueError(f"Unknown value for `alpha_type` ({config.alpha_type})")
        elif config.alpha_initializer in {"normal", "gaussian", "random"}:
            if config.alpha_type == "vector":
                self.alpha_cross_attn = nn.Parameter(
                    torch.normal(mean=0.0, std=config.alphas_initializer_range, size=(1, 1, self.hidden_size))
                )
                self.alpha_dense = nn.Parameter(
                    torch.normal(mean=0.0, std=config.alphas_initializer_range, size=(1, 1, self.hidden_size))
                )
            elif config.alpha_type == "float":
                self.alpha_cross_attn = nn.Parameter(
                    torch.normal(mean=0.0, std=config.alphas_initializer_range, size=(1,))
                )
                self.alpha_dense = nn.Parameter(
                    torch.normal(mean=0.0, std=config.alphas_initializer_range, size=(1,))
                )
            else:
                raise ValueError(f"Unknown value for `alpha_type` ({config.alpha_type})")
        else:
            raise NotImplementedError(f"Alpha initialization scheme {config.alpha_initializer} not yet implemented!")

        if not (hasattr(self, "alpha_cross_attn") and hasattr(self, "alpha_dense")):
            raise ValueError("Alpha parameters not initialized correctly!")

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        image_hidden_states: Optional[torch.Tensor] = None,
        image_attention_mask: Optional[torch.Tensor] = None,
        cross_attention_gate: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        past_key_values: Optional[tuple[torch.Tensor]] = None,
        **kwargs,
    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            image_attention_mask (`torch.FloatTensor`, *optional*): image attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            cross_attention_gate (`torch.FloatTensor`, *optional*):
                gate of size `(batch, seq_len)` used to zero-out cross-attention output for tokens attending no images.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_values (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
        """
        if image_hidden_states is None:
            raise ValueError(
                "`image_hidden_states` is required for Idefics cross attention module which are visual features to be"
                " conditioned on."
            )

        if cross_attention_gate is None:
            raise ValueError(
                "`cross_attention_gate` is required for Idefics cross attention module to zero-out the cross-attention"
                " hidden_states attending to no images."
            )

        if past_key_values is not None:
            raise NotImplementedError(
                "Past key value states are not implemented for Idefics cross attention module."
            )

        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)
        hidden_states, self_attn_weights = self.cross_attn(
            hidden_states=hidden_states,
            key_value_states=image_hidden_states,
            attention_mask=image_attention_mask,
            **kwargs,
        )
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        # Fill in zeros for cross_attention hidden_states of tokens attending to no images
        hidden_states = hidden_states.masked_fill((cross_attention_gate == 0)[:, :, None], 0.0)
        hidden_states = residual + self.act_cross_attn(self.alpha_cross_attn) * hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + self.act_dense(self.alpha_dense) * hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        return outputs


@auto_docstring
class IdeficsPreTrainedModel(PreTrainedModel):
    config: IdeficsConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["IdeficsDecoderLayer", "IdeficsGatedCrossAttentionLayer"]
    _supports_sdpa = True
    _supports_flash_attn = True
    _can_compile_fullgraph = False
    _supports_attention_backend = True

    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.weight.data.fill_(1.0)
            module.bias.data.zero_()
        elif isinstance(module, IdeficsRMSNorm):
            module.weight.data.fill_(1.0)
        elif isinstance(module, IdeficsVisionEmbeddings):
            module.class_embedding.data.zero_()
        elif isinstance(module, IdeficsGatedCrossAttentionLayer):
            if self.config.alpha_initializer == "zeros":
                module.alpha_cross_attn.data.zero_()
                module.alpha_dense.data.zero_()
            elif self.config.alpha_initializer == "ones":
                module.alpha_cross_attn.data.fill_(1.0)
                module.alpha_dense.data.fill_(1.0)
            elif self.config.alpha_initializer in {"normal", "gaussian", "random"}:
                module.alpha_cross_attn.data.normal_(mean=0.0, std=self.config.alphas_initializer_range)
                module.alpha_dense.data.normal_(mean=0.0, std=self.config.alphas_initializer_range)
        elif isinstance(module, IdeficsPerceiverResampler):
            module.latents.data.zero_()
@auto_docstring
class IdeficsModel(IdeficsPreTrainedModel):
    """
    Transformer decoder consisting of `config.num_hidden_layers` layers. Each layer is a [`IdeficsDecoderLayer`]

    Args:
        config: IdeficsConfig
    """

    def __init__(self, config: IdeficsConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = IdeficsDecoupledEmbedding(
            num_embeddings=config.vocab_size,
            num_additional_embeddings=config.additional_vocab_size,
            embedding_dim=config.hidden_size,
            partially_freeze=config.freeze_text_layers,
            padding_idx=self.padding_idx,
        )

        self.image_size = config.vision_config.image_size
        self.vision_config = config.vision_config
        self.vision_model = IdeficsVisionTransformer(config.vision_config)

        if config.use_resampler:
            perceiver_config = config.perceiver_config
            self.perceiver_resampler = IdeficsPerceiverResampler(
                config,
                config.vision_config.embed_dim,
                perceiver_config.resampler_depth,
                perceiver_config.resampler_n_heads,
                perceiver_config.resampler_head_dim,
                perceiver_config.resampler_n_latents,
            )

        self.layers = nn.ModuleList(
            [IdeficsDecoderLayer(config, layer_idx=i) for i in range(config.num_hidden_layers)]
        )

        self.cross_layer_interval = config.cross_layer_interval
        num_cross_layers = config.num_hidden_layers // self.cross_layer_interval
        self.gated_cross_attn_layers = nn.ModuleList(
            [IdeficsGatedCrossAttentionLayer(config, layer_idx=i) for i in range(num_cross_layers)]
        )
        self.gradient_checkpointing = False

        self.norm = IdeficsRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        # Initialize weights and apply final processing
        self.post_init()

        self.freeze_relevant_params()

    def freeze_relevant_params(self, config=None):
        if config is None:
            config = self.config

        if config.freeze_text_layers:
            self.freeze_text_layers(config.freeze_text_module_exceptions)

        if config.freeze_vision_layers:
            freeze_model(self.vision_model, module_exceptions=config.freeze_vision_module_exceptions)

    def freeze_text_layers(self, module_exceptions=[]):
        for module in [self.layers, self.norm]:
            freeze_model(module, module_exceptions=module_exceptions)

    def freeze_vision_layers(self, module_exceptions=[]):
        freeze_model(self.vision_model, module_exceptions=module_exceptions)

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        image_encoder_embeddings: Optional[torch.FloatTensor] = None,
        perceiver_embeddings: Optional[torch.FloatTensor] = None,
        image_attention_mask: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: Optional[bool] = False,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Union[tuple, IdeficsBaseModelOutputWithPast]:
        r"""
        image_encoder_embeddings (`torch.FloatTensor`, *optional*):
            The output of the image encoder.
        perceiver_embeddings (`torch.FloatTensor`, *optional*):
            The output of the perceiver resampler.
        image_attention_mask (`torch.LongTensor`, *optional*):
            The attention mask for the image encoder.
        """
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            use_cache = False

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if not isinstance(past_key_values, (type(None), Cache)):
            raise ValueError("The `past_key_values` should be either a `Cache` object or `None`.")

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache(config=self.config)

        batch_size, seq_length, _ = inputs_embeds.shape
        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
        seq_length_with_past = seq_length + past_key_values_length

        if cache_position is None:
            cache_position = torch.arange(
                past_key_values_length, past_key_values_length + seq_length, device=inputs_embeds.device
            )

        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            position_ids = position_ids[:, -seq_length:]
        elif position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        if sum(x is None for x in [pixel_values, image_encoder_embeddings, perceiver_embeddings]) != 2:
            raise ValueError(
                "Exactly 1 of pixel_values, image_encoder_embeddings or perceiver_embeddings has to be not-None."
            )
        elif pixel_values is not None:
            pixel_values = pixel_values.to(dtype=self.dtype, device=device)  # fp16 compatibility
            batch_size, num_images = pixel_values.shape[:2]
            pixel_values = pixel_values.contiguous().view(batch_size * num_images, *pixel_values.shape[2:])

            # Get sequence from the vision encoder
            image_hidden_states = self.vision_model(
                pixel_values=pixel_values, interpolate_pos_encoding=interpolate_pos_encoding
            ).last_hidden_state
        elif image_encoder_embeddings is not None:
            batch_size, num_images, image_seq_len, image_hidden_size = image_encoder_embeddings.size()
            image_hidden_states = image_encoder_embeddings.to(dtype=self.dtype, device=device)
            image_hidden_states = image_hidden_states.view(batch_size * num_images, image_seq_len, image_hidden_size)

        if self.config.use_resampler:
            if perceiver_embeddings is None:
                perceiver_embeddings = self.perceiver_resampler(image_hidden_states)
                image_seq_len, image_hidden_size = perceiver_embeddings.size(1), perceiver_embeddings.size(2)
            else:
                batch_size, num_images, image_seq_len, image_hidden_size = perceiver_embeddings.size()
            image_hidden_states = perceiver_embeddings
        elif perceiver_embeddings is None:
            image_seq_len, image_hidden_size = image_hidden_states.size(1), image_hidden_states.size(2)
        else:
            raise ValueError("If `perceiver_embeddings` are passed, use_resampler should be True")

        image_hidden_states = image_hidden_states.view(batch_size, num_images * image_seq_len, image_hidden_size)

        # expand the text-to-image attention mask to cover every position of every image
        text_seq_len = image_attention_mask.size(1)
        image_attention_mask = image_attention_mask.unsqueeze(-1)
        image_attention_mask = image_attention_mask.repeat(1, 1, 1, image_seq_len)
        image_attention_mask = image_attention_mask.view(batch_size, text_seq_len, num_images * image_seq_len)

        if image_hidden_states is not None:
            image_batch_size, image_sequence_length, _ = image_hidden_states.size()
            image_hidden_shape = (image_batch_size, image_sequence_length)
            if image_attention_mask is None:
                image_attention_mask = torch.ones(image_hidden_shape, device=device)
            image_attention_mask = self.invert_attention_mask(image_attention_mask)
        else:
            image_attention_mask = None

        # For any tokens attending to no images, the hidden_states coming out of the
        # cross-attention should be zeroed-out through the cross_attention_gate.
        cross_attention_gate = (
            ((image_attention_mask == 0.0).any(dim=-1)).to(dtype=self.dtype).squeeze(dim=1)
        ).to(device)

        if attention_mask is None:
            attention_mask = torch.ones(
                (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
            )
        attention_mask = self._update_causal_mask(
            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
        )

        hidden_states = inputs_embeds

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None

        for idx, decoder_layer in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if idx % self.cross_layer_interval == 0:
                cross_attn_block = self.gated_cross_attn_layers[idx // self.cross_layer_interval]
                outputs = cross_attn_block(
                    hidden_states,
                    attention_mask,
                    image_hidden_states,
                    image_attention_mask=image_attention_mask,
                    cross_attention_gate=cross_attention_gate,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                    past_key_values=None,  # not implemented for the cross-attention module
                    **kwargs,
                )
                hidden_states = outputs[0]

            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_values=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
                **kwargs,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        image_hidden_states = image_hidden_states.view(batch_size, num_images, image_seq_len, image_hidden_size)
        return IdeficsBaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            image_hidden_states=image_hidden_states,
        )

    def _update_causal_mask(
        self,
        attention_mask: Union[torch.Tensor, "BlockMask"],
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool = False,
    ):
        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and (attention_mask == 0.0).any():
                return attention_mask
            return None
        if self.config._attn_implementation == "flex_attention":
            if isinstance(attention_mask, torch.Tensor):
                attention_mask = make_flex_block_causal_mask(attention_mask)
            return attention_mask

        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False

        # When output_attentions=True, an eager fallback is used and the mask cannot be skipped.
        if self.config._attn_implementation == "sdpa" and not using_compilable_cache and not output_attentions:
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                is_training=self.training,
            ):
                return None

        dtype = input_tensor.dtype
        sequence_length = input_tensor.shape[1]
        if using_compilable_cache:
            target_length = past_key_values.get_max_cache_shape()
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
        )

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type in ["cuda", "xpu", "npu"]
            and not output_attentions
        ):
            # Unmask fully-masked rows to avoid NaNs from scaled_dot_product_attention with memory-efficient backends.
            min_dtype = torch.finfo(dtype).min
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        cache_position: torch.Tensor,
        batch_size: int,
        **kwargs,
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`int`):
                Batch size.
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
                    causal_mask.device
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )

        return causal_mask


@auto_docstring
class IdeficsForVisionText2Text(IdeficsPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["model.embed_tokens.weight", "lm_head.weight"]

    def __init__(self, config, vision_model=None):
        super().__init__(config)
        self.model = IdeficsModel(config)

        self.lm_head = IdeficsDecoupledLinear(
            in_features=config.hidden_size,
            out_features=config.vocab_size,
            out_additional_features=config.additional_vocab_size,
            bias=False,
            partially_freeze=config.freeze_lm_head,
        )

        # Initialize weights and apply final processing
        self.post_init()

    def tie_weights(self):
        """
        Overwrite `transformers.modeling_utils.PreTrainedModel.tie_weights` to handle the case of
        IdeficsDecoupledLinear and IdeficsDecoupledEmbedding.
        """
        output_embeddings = self.get_output_embeddings()
        input_embeddings = self.get_input_embeddings()

        if getattr(self.config, "tie_word_embeddings", True):
            output_embeddings.weight = input_embeddings.weight
            if input_embeddings.num_additional_embeddings > 0:
                assert output_embeddings.out_additional_features == input_embeddings.num_additional_embeddings
                output_embeddings.additional_fc.weight = input_embeddings.additional_embedding.weight

        if hasattr(output_embeddings, "out_features") and hasattr(input_embeddings, "num_embeddings"):
            output_embeddings.out_features = input_embeddings.num_embeddings
            if hasattr(output_embeddings, "out_additional_features") and hasattr(
                input_embeddings, "num_additional_embeddings"
            ):
                output_embeddings.out_additional_features = input_embeddings.num_additional_embeddings

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        image_encoder_embeddings: Optional[torch.FloatTensor] = None,
        perceiver_embeddings: Optional[torch.FloatTensor] = None,
        image_attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: Optional[bool] = False,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, IdeficsCausalLMOutputWithPast]:
        r"""
        image_encoder_embeddings (`torch.FloatTensor`, *optional*):
            The output of the image encoder.
        perceiver_embeddings (`torch.FloatTensor`, *optional*):
            The output of the perceiver resampler.
        image_attention_mask (`torch.LongTensor`, *optional*):
            The attention mask for the image encoder.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoProcessor, IdeficsForVisionText2Text

        >>> model = IdeficsForVisionText2Text.from_pretrained("HuggingFaceM4/idefics-9b")
        >>> processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics-9b")

        >>> dogs_image_url_1 = "https://huggingface.co/datasets/hf-internal-testing/fixtures_nlvr2/raw/main/image1.jpeg"
        >>> dogs_image_url_2 = "https://huggingface.co/datasets/hf-internal-testing/fixtures_nlvr2/raw/main/image2.jpeg"

        >>> prompts = [
        ...     [
        ...         "User:",
        ...         dogs_image_url_1,
        ...         "Describe this image.\nAssistant: An image of two dogs.\n",
        ...         "User:",
        ...         dogs_image_url_2,
        ...         "Describe this image.\nAssistant:",
        ...     ]
        ... ]
        >>> inputs = processor(prompts, return_tensors="pt")
        >>> generate_ids = model.generate(**inputs, max_new_tokens=6)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True)
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            pixel_values=pixel_values,
            image_encoder_embeddings=image_encoder_embeddings,
            perceiver_embeddings=perceiver_embeddings,
            image_attention_mask=image_attention_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=True,
            cache_position=cache_position,
            **kwargs,
        )

        hidden_states = outputs[0]
        logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)

        return IdeficsCausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=outputs.image_hidden_states,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        cache_position=None,
        pixel_values=None,
        image_hidden_states=None,
        image_attention_mask=None,
        use_cache=None,
        **kwargs,
    ):
        images_kwargs = {}
        if image_hidden_states is not None:
            if self.config.use_resampler:
                images_kwargs["perceiver_embeddings"] = image_hidden_states
            else:
                images_kwargs["image_encoder_embeddings"] = image_hidden_states
        else:
            images_kwargs["pixel_values"] = pixel_values
        images_kwargs["interpolate_pos_encoding"] = kwargs.pop("interpolate_pos_encoding", False)

        model_inputs = super().prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            cache_position=cache_position,
            use_cache=use_cache,
            image_attention_mask=image_attention_mask,
            **images_kwargs,
            **kwargs,
        )

        if image_attention_mask is not None and inputs_embeds is None:
            seq_length = model_inputs["input_ids"].shape[1]
            model_inputs["image_attention_mask"] = image_attention_mask[:, -seq_length:]

        return model_inputs

    def _update_model_kwargs_for_generation(
        self,
        outputs: ModelOutput,
        model_kwargs: dict[str, Any],
        is_encoder_decoder: bool = False,
        **kwargs,
    ) -> dict[str, Any]:
        model_kwargs = super()._update_model_kwargs_for_generation(
            outputs,
            model_kwargs,
            is_encoder_decoder,
            **kwargs,
        )

        if "image_attention_mask" in model_kwargs:
            image_attention_mask = model_kwargs["image_attention_mask"]
            last_mask = image_attention_mask[:, -1, :].unsqueeze(1)
            if model_kwargs.get("use_cache", True):
                model_kwargs["image_attention_mask"] = last_mask
            else:
                model_kwargs["image_attention_mask"] = torch.cat([image_attention_mask, last_mask], dim=1)

        model_kwargs["image_hidden_states"] = outputs.image_hidden_states
        return model_kwargs


__all__ = ["IdeficsForVisionText2Text", "IdeficsModel", "IdeficsPreTrainedModel"]
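

# Illustrative sketches (not part of the original module): two minimal, hypothetical
# helpers showing how internals defined above behave. They are not used by the model.
def _demo_causal_mask():
    # The 4D mask built by `_prepare_4d_causal_attention_mask_with_cache_position` holds
    # 0 where attention is allowed and the dtype minimum where it is masked, which
    # softmax turns into ~0 attention probability.
    mask = IdeficsModel._prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask=torch.ones(1, 3),
        sequence_length=3,
        target_length=3,
        dtype=torch.float32,
        cache_position=torch.arange(3),
        batch_size=1,
    )
    assert mask.shape == (1, 1, 3, 3)
    assert mask[0, 0, 0, 1] < 0 and mask[0, 0, 0, 2] < 0  # strictly-future positions are masked
    assert (mask[0, 0].tril() == 0).all()  # present and past positions are unmasked


def _demo_tanh_gate():
    # The gated cross-attention layers add `tanh(alpha) * cross_attn_output` to the
    # residual stream. With `alpha_initializer="zeros"` the gate starts closed, so a
    # freshly inserted cross-attention block is initially the identity and the
    # pretrained language model is undisturbed; alpha is then learned during training.
    alpha = torch.zeros(1)
    residual = torch.randn(2, 3, 4)
    cross_attn_output = torch.randn(2, 3, 4)
    gated = residual + torch.tanh(alpha) * cross_attn_output
    assert torch.equal(gated, residual)  # gate is closed at initialization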