a
    haC                     @   s  d dl Z d dlmZmZ d dlZd dlmZ ddlmZ ddlm	Z	 ddl
mZ ddlmZmZ d	d
lmZmZ d	dlmZ d	dlmZmZ d	dlmZmZ d	dlmZmZ d	dlmZmZ ddl m!Z!m"Z" ej#e$dddZ%G dd deZ&G dd deZ'G dd deZ(G dd deZ)G dd deZ*G dd  d eZ+G d!d" d"eZ,G d#d$ d$eZ-G d%d& d&ej.Z/G d'd( d(ej0Z1G d)d* d*eZ2G d+d, d,e2Z3G d-d. d.eZ4eG d/d0 d0eeZ5g d1Z6dS )2    N)OptionalUnion)nn   )GenerationMixin)BaseModelOutput)PreTrainedModel)auto_docstringcan_return_tuple   )Aimv2AttentionAimv2EncoderLayer)	AutoModel)LlamaMLPLlamaRMSNorm)LlavaForConditionalGeneration
LlavaModel)LlavaNextCausalLMOutputWithPastLlavaNextModelOutputWithPast)SiglipEncoderSiglipVisionEmbeddings   )Ovis2ConfigOvis2VisionConfig)logitsdimc                 C   sJ   |  |}|j|ddd }tj| tjd||d}||  | }|S )NT)Zkeepdimr   )Zmemory_formatg      ?)softmaxmaxtorchZ
zeros_likeZlegacy_contiguous_formatZscatter_detach)r   r   Zy_softindexZy_hardret r"   c/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/ovis2/modular_ovis2.pyhard_softmax#   s
    
r$   c                   @   s   e Zd ZdS )Ovis2ModelOutputWithPastN__name__
__module____qualname__r"   r"   r"   r#   r%   -   s   r%   c                   @   s   e Zd ZdS )Ovis2CausalLMOutputWithPastNr&   r"   r"   r"   r#   r*   1   s   r*   c                   @   s   e Zd ZdS )Ovis2RMSNormNr&   r"   r"   r"   r#   r+   5   s   r+   c                   @   s   e Zd ZdS )Ovis2VisionMLPNr&   r"   r"   r"   r#   r,   9   s   r,   c                       s>   e Zd Zed fddZdd ZejejdddZ	  Z
S )	Ovis2VisionEmbeddingsconfigc                    s    t  | t|j|j| _d S N)super__init__r+   hidden_sizerms_norm_epsrms_normselfr/   	__class__r"   r#   r2   >   s    zOvis2VisionEmbeddings.__init__c                 C   s   t dd S NzNot needed for Ovis2)NotImplementedErrorr7   r"   r"   r#   interpolate_pos_encodingB   s    z.Ovis2VisionEmbeddings.interpolate_pos_encodingpixel_valuesreturnc                 C   sL   | j jj}|  |j|d}|ddd}| |}|| | j }|S )Ndtyper   r   )	Zpatch_embeddingweightrB   toflattenZ	transposer5   Zposition_embeddingposition_ids)r7   r?   Ztarget_dtypeZpatch_embeds
embeddingsr"   r"   r#   forwardE   s    

zOvis2VisionEmbeddings.forward)r'   r(   r)   r   r2   r=   r   FloatTensorTensorrH   __classcell__r"   r"   r8   r#   r-   =   s   r-   c                   @   s   e Zd ZdS )Ovis2VisionAttentionNr&   r"   r"   r"   r#   rL   P   s   rL   c                   @   s   e Zd ZdS )Ovis2VisionEncoderLayerNr&   r"   r"   r"   r#   rM   T   s   rM   c                       s"   e Zd Zed fddZ  ZS )Ovis2VisionEncoderr.   c                    s0   t    t fddt jD | _d S )Nc                    s   g | ]}t  qS r"   )rM   ).0_r.   r"   r#   
<listcomp>[       z/Ovis2VisionEncoder.__init__.<locals>.<listcomp>)r1   r2   r   Z
ModuleListrangeZnum_hidden_layersZlayersr6   r8   r.   r#   r2   Y   s    zOvis2VisionEncoder.__init__)r'   r(   r)   r   r2   rK   r"   r"   r8   r#   rN   X   s   rN   c                       sH   e Zd Zed fddZedeej ee	 ee	 dddZ
  ZS )	Ovis2VisionTransformerr.   c                    s>   t    || _t|| _t|| _t|j|j	| _
d| _d S )NF)r1   r2   r/   r-   rG   rN   encoderr+   r3   r4   r5   Zgradient_checkpointingr6   r8   r"   r#   r2   _   s    


zOvis2VisionTransformer.__init__N)attention_maskoutput_attentionsoutput_hidden_statesc                 C   sj   |d ur|n| j j}|d ur |n| j j}| |}| j||||dd}|d }| |}t||j|jdS )NT)inputs_embedsrV   rW   rX   return_dictr   )last_hidden_statehidden_states
attentions)	r/   rW   rX   rG   rU   r5   r   r\   r]   )r7   r?   rV   rW   rX   r\   Zencoder_outputsr[   r"   r"   r#   rH   g   s$    

zOvis2VisionTransformer.forward)NNN)r'   r(   r)   r   r2   r
   r   r   rJ   boolrH   rK   r"   r"   r8   r#   rT   ^   s      rT   c                       s(   e Zd Zejejd fddZ  ZS )Ovis2VisualEmbeddingTable)visual_tokensr@   c                    s8   |j tjtjtjtjtjfv r*t |S t	|| j
S r0   )rB   r   Zint8Zint16Zint32Zint64longr1   rH   matmulrC   )r7   r`   r8   r"   r#   rH      s    z!Ovis2VisualEmbeddingTable.forward)r'   r(   r)   r   rJ   rH   rK   r"   r"   r8   r#   r_      s   r_   c                   @   s@   e Zd ZU eed< dZdZdgZdZdZ	dZ
dZdZdZdZdS )Ovis2PreTrainedModelr/   modelTrL   past_key_valuesN)r'   r(   r)   r   __annotations__Zbase_model_prefixZsupports_gradient_checkpointingZ_no_split_modulesZ_skip_keys_device_placementZ_supports_cache_classZ_supports_flash_attnZ_supports_flex_attnZ_supports_sdpaZ_can_compile_fullgraphZ_supports_attention_backendr"   r"   r"   r#   rc      s   
rc   c                       sJ   e Zd ZU eed< ed fddZejeej	ej	f dddZ
  ZS )Ovis2VisionModelr/   r.   c                    sl   t  | || _t|| _|j| _|j| _tj|j	|j
 |j
 | j| j dd| _t| j| j | _d S NF)Zbias)r1   r2   r/   rT   transformernum_visual_indicator_tokens
vocab_sizer   Linearr3   hidden_stridehead_linearZ	LayerNorm	head_normr6   r8   r"   r#   r2      s    

zOvis2VisionModel.__init__r>   c              	   C   sB  |  |}|j}| jjdkr|j\}}}| jj}tt|}|| |krRtd|||  | }	t	j
|ddd|	d|	fdd}||	7 }|||| ||| ||}|dddddd}||d	|| | }| |}
| |
}
| jjd
krt	j
j|
d	dd}n:| jjdkr t|
d	d}n| jjdkr>t	j
j|
d	d}|S )Nr   z.Token sequence length must be a perfect squarer   Zconstantr   r         Zgumbel_argmaxT)r   hardZ	st_argmaxr   r   )ri   r[   r/   rm   shapeintmathsqrt
ValueErrorr   Z
functionalpadZreshapeZpermutern   ro   Ztokenize_functionZgumbel_softmaxr$   r   )r7   r?   outputsr[   Z
num_imagesZseq_lenZ
hidden_dimrm   Zsqrt_lZpad_sizer   Z
prob_tokenr"   r"   r#   rH      s6    


zOvis2VisionModel.forward)r'   r(   r)   r   rf   r2   r   rI   tuplerJ   rH   rK   r"   r"   r8   r#   rg      s   
rg   c                       s   e Zd Zi Zed fddZejejdddZe	e
dejejeej eej eeej  eej eej ee ee ee ee eej eeejf eeef d	d
dZ  ZS )
Ovis2Modelr.   c                    sZ   t  | t|j| _t|jj|j| _|jj| _	|j| _|j
| _
t|j| _| `d S r0   )r1   r2   rg   Zvision_configvision_towerr_   rk   r3   visual_embeddings_tablevisual_vocab_sizevisual_indicator_token_idsr   from_configtext_configlanguage_modelmulti_modal_projectorr6   r8   r"   r#   r2      s    
zOvis2Model.__init__r>   c           	      C   s   |  |}|j\}}}tj||| j jf|j|jd|jd}tj||gdd}| 	|}tj
| j| j j | jtjd|j}| 	|}||fS )NF)rB   deviceZrequires_gradlayoutr   rt   rA   )r~   ru   r   Zzerosrj   rB   r   r   catr   Zaranger   ra   rD   )	r7   r?   image_featuresZ
batch_sizeZimg_seq_lenrP   Zpadding_tensorZvisual_indicatorvisual_indicator_featuresr"   r"   r#   get_image_features   s(    


zOvis2Model.get_image_featuresNr   	input_idsr?   rV   rF   re   rY   labels	use_cacherW   rX   rZ   cache_positionlogits_to_keepr@   c                 K   sZ  |	d ur|	n| j j}	|
d ur |
n| j j}
|d u |d uA r@td|d u rT|  |}|d ur| j|d\}}| j|||d}|||}t| j	D ]v\}}|d u r||  t
j|t
j|jdk}|d}n||k|j}| r|| || |j|j||< q| jf ||||||	|
d||d
|}t|j|j|j|j|d urR|nd dS )	Nz:You must specify exactly one of input_ids or inputs_embedsr?   )rY   r   )rB   r   rr   T)
rV   rF   re   rY   r   rW   rX   rZ   r   r   )r[   re   r\   r]   image_hidden_states)r/   rW   rX   ry   Zget_input_embeddingsr   Zget_placeholder_maskZmasked_scatter	enumerater   r   Ztensorra   r   allrD   anyZ	expand_asrB   r   r%   r[   re   r\   r]   )r7   r   r?   rV   rF   re   rY   r   r   rW   rX   rZ   r   r   kwargsr   r   Zspecial_image_maskiZvisual_indicator_idmaskr{   r"   r"   r#   rH      sd    
zOvis2Model.forward)NNNNNNNNNNNNr   )r'   r(   r)   _checkpoint_conversion_mappingr   r2   r   rI   r   r
   r	   
LongTensorr   rJ   listr^   r   rv   r|   r%   rH   rK   r"   r"   r8   r#   r}      sH                
r}   c                       s   e Zd Zi Zed fddZedd Zej	dddZ
eedejej	eej eej eeej	  eej	 eej ee ee ee ee eej eeejf eeef dddZ  ZS )Ovis2ForConditionalGenerationr.   c                    s&   t  | tj|j|jdd| _d S rh   )r1   r2   r   rl   r3   rk   lm_headr6   r8   r"   r#   r2   M  s    z&Ovis2ForConditionalGeneration.__init__c                 C   s   t dd S r:   )AttributeErrorr<   r"   r"   r#   r   Q  s    z3Ovis2ForConditionalGeneration.multi_modal_projectorr   c                 C   s   | j j|dS )Nr   )rd   r   )r7   r?   r"   r"   r#   r   U  s    z0Ovis2ForConditionalGeneration.get_image_featuresNr   r   c                 K   s   |	dur|	n| j j}	|
dur |
n| j j}
| jf ||||||||	|
d|d|}|d }t|trnt| dn|}| |dd|ddf }d}|dur| jf ||| j j	j
d|}t|||j|j|j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Ovis2ForConditionalGeneration

        >>> model = Ovis2ForConditionalGeneration.from_pretrained("thisisiron/Ovis2-2B-hf")
        >>> processor = AutoProcessor.from_pretrained("thisisiron/Ovis2-2B-hf")

        >>> prompt = "<|im_start|>user\n<image>\nDescribe the image.<|im_end|>\n<|im_start|>assistant\n"
        >>> url = "http://images.cocodataset.org/val2014/COCO_val2014_000000537955.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=15)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True)[0]
        "user\n\nDescribe the image.\nassistant\nThe image features a brown dog standing on a wooden floor, looking up with"
        ```NT)r   r?   rV   rF   re   rY   r   rW   rX   rZ   r   r   )r   r   rk   )lossr   re   r\   r]   r   )r/   rW   rX   rd   
isinstancerv   slicer   Zloss_functionr   rk   r*   re   r\   r]   r   )r7   r   r?   rV   rF   re   rY   r   r   rW   rX   rZ   r   r   r   r{   r\   Zslice_indicesr   r   r"   r"   r#   rH   X  sH    .z%Ovis2ForConditionalGeneration.forward)NNNNNNNNNNNNr   )r'   r(   r)   r   r   r2   propertyr   r   rI   r   r
   r	   r   r   rJ   r   r^   r   rv   r|   r*   rH   rK   r"   r"   r8   r#   r   I  sH   
             
r   )rc   r}   r   )7rw   typingr   r   r   r   Z
generationr   Zmodeling_outputsr   Zmodeling_utilsr   utilsr	   r
   Zaimv2.modeling_aimv2r   r   autor   Zllama.modeling_llamar   r   Zllava.modeling_llavar   r   Zllava_next.modeling_llava_nextr   r   Zsiglip.modeling_siglipr   r   Zconfiguration_ovis2r   r   rJ   rv   r$   r%   r*   r+   r,   r-   rL   rM   rN   ModulerT   Z	Embeddingr_   rc   rg   r}   r   __all__r"   r"   r"   r#   <module>   s>   
*5ve