"""PyTorch Pixtral model."""

from collections.abc import Callable
from typing import Optional, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutput
from ...modeling_rope_utils import dynamic_rope_update
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import auto_docstring, can_return_tuple, logging
from .configuration_pixtral import PixtralVisionConfig


logger = logging.get_logger(__name__)


def position_ids_in_meshgrid(patch_embeds_list, max_width):
    # Flatten the (height, width) grid of every image into absolute position ids on a
    # `max_width`-wide virtual grid, then concatenate the ids of all images.
    positions = []
    for patch in patch_embeds_list:
        height, width = patch.shape[-2:]
        mesh = torch.meshgrid(torch.arange(height), torch.arange(width), indexing="ij")
        h_grid, v_grid = torch.stack(mesh, dim=-1).reshape(-1, 2).chunk(2, -1)
        ids = h_grid * max_width + v_grid
        positions.append(ids[:, 0])
    return torch.cat(positions)


class PixtralRotaryEmbedding(nn.Module):
    """
    The key idea of the Pixtral embedding is simply that there is a frequency for each pixel position.
    If you have height x width pixels (or embedding pixels), then the frequency used for RoPE
    is given by indexing the precomputed frequencies with the width and height indices.

    What you output is of dimension (batch, height * width, dim), with dim the embed dim.

    This simply means that for each image hidden state, you are going to add
    a corresponding positional embedding, based on its index in the grid.
    """

    inv_freq: torch.Tensor

    def __init__(self, config, device=None):
        super().__init__()
        self.rope_type = "default"
        self.dim = config.head_dim
        self.base = config.rope_theta
        max_patches_per_side = config.image_size // config.patch_size
        freqs = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float() / self.dim))

        h = torch.arange(max_patches_per_side, device=freqs.device)
        w = torch.arange(max_patches_per_side, device=freqs.device)

        # Interleave the height and width frequencies and flatten the 2D grid so that
        # `inv_freq` can be indexed directly with the flattened position ids.
        freqs_h = torch.outer(h, freqs[::2]).float()
        freqs_w = torch.outer(w, freqs[1::2]).float()
        inv_freq = torch.cat(
            [
                freqs_h[:, None, :].repeat(1, max_patches_per_side, 1),
                freqs_w[None, :, :].repeat(max_patches_per_side, 1, 1),
            ],
            dim=-1,
        ).reshape(-1, self.dim // 2)

        self.register_buffer("inv_freq", torch.cat((inv_freq, inv_freq), dim=-1), persistent=False)

    @torch.no_grad()
    @dynamic_rope_update
    def forward(self, x, position_ids):
        freqs = self.inv_freq[position_ids]
        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):  # compute cos/sin in full precision
            emb = freqs
            cos = emb.cos()
            sin = emb.sin()
        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)


def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


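# Shape walk-through for the helpers above (illustrative; with B = batch, H = heads,
# S = patches and D = head dim):
#   - `eager_attention_forward` takes query/key/value of shape [B, H, S, D], builds
#     [B, H, S, S] attention weights (scaled QK^T plus the additive mask), and returns
#     `attn_output` transposed back to [B, S, H, D].
#   - `apply_rotary_pos_emb` receives cos/sin of shape [S, D] from `PixtralRotaryEmbedding`;
#     `PixtralAttention` below passes `unsqueeze_dim=0` so they broadcast against the
#     [B, H, S, D] query/key states.

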
class PixtralAttention(nn.Module):
    """
    Multi-headed attention compatible with ALL_ATTENTION_FUNCTIONS.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads

        self.scaling = self.head_dim**-0.5
        self.is_causal = False
        self.dropout = config.attention_dropout

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
        self.o_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Input shape: Batch x Time x Channel"""
        batch_size, patches, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(batch_size, patches, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(batch_size, patches, self.num_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(batch_size, patches, self.num_heads, self.head_dim).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, unsqueeze_dim=0)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and output_attentions:
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
                    'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        # With packed image patches, the flash-attention path relies on `position_ids` instead of a mask.
        if self.config._attn_implementation == "flash_attention_2":
            kwargs["position_ids"] = kwargs["position_ids"].to(hidden_states.device, non_blocking=True)

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.dropout,
            scaling=self.scaling,
            **kwargs,
        )

        attn_output = attn_output.reshape(batch_size, patches, -1).contiguous()
        attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights


class PixtralMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
        return down_proj


class PixtralRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        PixtralRMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"


class PixtralAttentionLayer(GradientCheckpointingLayer):
    def __init__(self, config):
        super().__init__()
        self.attention_norm = PixtralRMSNorm(config.hidden_size, eps=1e-5)
        self.feed_forward = PixtralMLP(config)
        self.attention = PixtralAttention(config)
        self.ffn_norm = PixtralRMSNorm(config.hidden_size, eps=1e-5)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
        output_attentions: Optional[bool] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`):
                Input to the layer of shape `(batch, seq_len, embed_dim)`.
            attention_mask (`torch.FloatTensor`):
                Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        hidden_states = self.attention_norm(hidden_states)
        hidden_states, attn_weights = self.attention(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_embeddings=position_embeddings,
            output_attentions=output_attentions,
            **kwargs,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.ffn_norm(hidden_states)
        hidden_states = self.feed_forward(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)
        return outputs


class PixtralTransformer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layers = torch.nn.ModuleList()
        for _ in range(config.num_hidden_layers):
            self.layers.append(PixtralAttentionLayer(config))
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Union[tuple, BaseModelOutput]:
        """
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Embeddings which serve as input to the Transformer.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None
        hidden_states = inputs_embeds
        for encoder_layer in self.layers:
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            layer_outputs = encoder_layer(
                hidden_states,
                attention_mask,
                position_embeddings=position_embeddings,
                output_attentions=output_attentions,
                **kwargs,
            )
            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )


@auto_docstring
class PixtralPreTrainedModel(PreTrainedModel):
    config: PixtralVisionConfig
    base_model_prefix = "model"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True
    _no_split_modules = ["PixtralAttentionLayer"]

    _supports_attention_backend = True
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True

    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, PixtralRMSNorm):
            module.weight.data.fill_(1.0)


def generate_block_attention_mask(patch_embeds_list, tensor):
    # Build a block-diagonal additive mask so that patches of different images packed into
    # the same sequence cannot attend to each other.
    dtype = tensor.dtype
    device = tensor.device
    seq_len = tensor.shape[1]
    d_min = torch.finfo(dtype).min
    causal_mask = torch.full((seq_len, seq_len), fill_value=d_min, dtype=dtype, device=device)

    block_end_idx = torch.tensor(patch_embeds_list).cumsum(-1)
    block_start_idx = torch.tensor([0] + patch_embeds_list[:-1]).cumsum(-1)
    for start, end in zip(block_start_idx, block_end_idx):
        causal_mask[start:end, start:end] = 0

    causal_mask = causal_mask[None, None, :, :].expand(tensor.shape[0], 1, -1, -1)
    return causal_mask


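# Worked example for `generate_block_attention_mask` (illustrative): for two images
# contributing 2 and 3 patches, `generate_block_attention_mask([2, 3], tensor)` with a
# (1, 5, hidden) `tensor` returns a (1, 1, 5, 5) mask whose within-image blocks are 0 and
# whose cross-image entries are the dtype minimum `m = torch.finfo(dtype).min`:
#   [[0, 0, m, m, m],
#    [0, 0, m, m, m],
#    [m, m, 0, 0, 0],
#    [m, m, 0, 0, 0],
#    [m, m, 0, 0, 0]]

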
*r   c                       sl   e Zd ZdZ fddZdd Zeed
ej	e
ej	 e
e e
e e
e ee eeef ddd	Z  ZS )PixtralVisionModelZvision_encoderc                    sh   t  | || _tj|j|j|j|jdd| _|j| _t	|jdd| _
t|| _t|| _|   d S )NF)Zin_channelsZout_channelsZkernel_sizeZstriderm   r   r   )r/   r0   r9   r   r   Znum_channelsrn   r4   
patch_convr   ln_prer   transformerr)   patch_positional_embeddingZ	post_initru   r=   r&   r'   r0     s    

zPixtralVisionModel.__init__c                 C   s   | j S r   )r   r   r&   r&   r'   get_input_embeddings  s    z'PixtralVisionModel.get_input_embeddingsN)r   image_sizesr   ry   r   rg   rz   c                    s   |d u r$|j \}}	}
}|
|fg| } |} fddt||D }tjdd |D ddd} |}t| jj	 jj
 d}||d<  ||} jjdkrd }ntd	d |D |} j|f||||d
d|S )Nc                    s:   g | ]2\}}|d d|d  j  d|d  j  f qS ).Nr   r   )r4   )r   Zembedr}   r   r&   r'   
<listcomp>  s   z.PixtralVisionModel.forward.<locals>.<listcomp>c                 S   s   g | ]}| d jqS )r   )flattenTr   ra   r&   r&   r'   r     r   r   r   )r"   rI   r|   c                 S   s    g | ]}|j d  |j d  qS )r   r   )r   r   r&   r&   r'   r     r   T)r^   rx   r   ry   r   )r   r   r   r   r    rT   r   r(   r9   r3   r4   r   r   r   r   )r8   r   r   r   ry   r   argsrg   r   r   r$   r%   Zpatch_embedsr!   rI   rx   r^   r&   r   r'   rJ     s<    


zPixtralVisionModel.forward)NNNN)rK   rL   rM   r   r0   r   r   r   r   rO   r   r   r   r   r   r   r
   rJ   rQ   r&   r&   r=   r'   r     s$       
r   )Nr   )rY   )2rN   collections.abcr   typingr   r   r   Ztorch.utils.checkpointr   Zactivationsr   Zmodeling_flash_attention_utilsr   Zmodeling_layersr	   Zmodeling_outputsr
   Zmodeling_rope_utilsr   Zmodeling_utilsr   r   Zprocessing_utilsr   utilsr   r   r   Zconfiguration_pixtralr   Z
get_loggerrK   r   r(   Moduler)   rS   rX   rO   r5   rj   rk   r   r   r   r   r   r   r   __all__r&   r&   r&   r'   <module>   sP   
6
# P2KM
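
# Minimal usage sketch (illustrative; assumes the standard `PixtralVisionConfig`
# constructor arguments, and the config values below are arbitrary):
#
#     import torch
#     from transformers import PixtralVisionConfig, PixtralVisionModel
#
#     config = PixtralVisionConfig(
#         hidden_size=64, intermediate_size=128, num_hidden_layers=2,
#         num_attention_heads=4, image_size=64, patch_size=16,
#     )
#     model = PixtralVisionModel(config).eval()          # randomly initialized weights
#     pixel_values = torch.randn(1, 3, 64, 48)           # one 64x48 RGB image
#     with torch.no_grad():
#         out = model(pixel_values, image_sizes=[(64, 48)])
#     out.last_hidden_state.shape                        # (1, num_patches, hidden_size)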