a
    hx                  
   @   s~  d Z ddlZddlZddlZddlmZmZ ddlmZ ddl	m
Z
mZ ddlmZmZmZmZmZ ddlmZ dd	lmZmZ eeeeef  eeeeeef  eeeeef   eeeeeef   f ZG d
d deddZG dd deddZG dd deddZG dd deZeeeeef eeeef dddZ eeedddZ!dd Z"dd Z#dd Z$d#d!d"Z%dgZ&dS )$zProcessor class for KOSMOS-2.    N)OptionalUnion   )BatchFeature)
ImageInput
is_batched)ImagesKwargsProcessingKwargsProcessorMixin
TextKwargsUnpack)
AddedToken)BatchEncoding	TextInputc                   @   s6   e Zd ZU eee  ed< ee ed< ee ed< dS )Kosmos2ImagesKwargsbboxesnum_image_tokensfirst_image_token_idN)__name__
__module____qualname__r   listfloat__annotations__int r   r   j/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/kosmos2/processing_kosmos2.pyr   %   s   
r   F)totalc                   @   s   e Zd ZU ee ed< dS )Kosmos2TextKwargsadd_eos_tokenN)r   r   r   r   boolr   r   r   r   r   r   +   s   
r   c                
   @   s@   e Zd ZU eed< eed< dddddddddd	ddid	Zd
S )Kosmos2ProcessorKwargstext_kwargsimages_kwargsTFr   )	add_special_tokenspaddingZstrideZreturn_overflowing_tokensZreturn_special_tokens_maskZreturn_offsets_mappingreturn_token_type_idsverboser   r   @   )r"   r#   N)r   r   r   r   r   r   	_defaultsr   r   r   r   r!   /   s   
r!   c                	       s  e Zd ZdZddgZdZdZd" fdd	Zd#ee	e
ee
 f ee ed
ddZdd Zdd Zd$e	e
ee
 f eeee e	eee f dddZd%ddZd&ddZedd Zee	eee  eee  f edddZe	eeef eeeeef f eeef dd d!Z  ZS )'Kosmos2Processora,  
    Constructs an KOSMOS-2 processor which wraps a KOSMOS-2 image processor and a KOSMOS-2 tokenizer into a single
    processor.

    [`Kosmos2Processor`] offers all the functionalities of [`CLIPImageProcessor`] and some functionalities of
    [`XLMRobertaTokenizerFast`]. See the docstring of [`~Kosmos2Processor.__call__`] and [`~Kosmos2Processor.decode`]
    for more information.

    Args:
        image_processor (`CLIPImageProcessor`):
            An instance of [`CLIPImageProcessor`]. The image processor is a required input.
        tokenizer (`XLMRobertaTokenizerFast`):
            An instance of ['XLMRobertaTokenizerFast`]. The tokenizer is a required input.
        num_patch_index_tokens (`int`, *optional*, defaults to 1024):
            The number of tokens that represent patch indices.
    image_processor	tokenizer)ZCLIPImageProcessorZCLIPImageProcessorFastZAutoTokenizer   c                    s   d|_ d| _d| _d| _d| _d| _d| _d| _d	| _d
| _	d| _
d| _| j| j| j| j| j| j| j| j| j	| j
| jg| _|| _dd t| jD }g }| j| D ]}|t|dddd q|| t || d S )NFz</doc>z<image>z</image>z</chunk>z</line>z<phrase>z	</phrase>z<object>z	</object></delimiter_of_multi_objects/>z<grounding>c                 S   s"   g | ]}d t |d dqS )<patch_index_   >)strzfill.0xr   r   r   
<listcomp>~       z-Kosmos2Processor.__init__.<locals>.<listcomp>T)lstriprstrip
normalized)r&   Z	eod_token	boi_token	eoi_tokenZ	eoc_tokenZ	eol_tokenZ	bop_tokenZ	eop_tokenZ	boo_tokenZ	eoo_tokenZ	dom_tokenZ	grd_tokenZ
tag_tokensnum_patch_index_tokensrangeappendr   Z
add_tokenssuper__init__)selfr+   r,   r>   kwargsZpatch_index_tokensZtokens_to_addtoken	__class__r   r   rB   Z   s>    
zKosmos2Processor.__init__N)imagestextrD   returnc                    s\  |du r|du rt djtfdjji|}|d dd}|d dd}|d dd}	|d	 d
d}
|d	 d }|d	 d }|d	 dd}t }|durȈj|fi |d }|	| |durj
||||d}|r,|
s,t|trjj | }nt|tr,fdd|D }|d	 d o<|
|d	 d< |du rT|nd|d	 d< |du rn|nd|d	 d< jf d|i|d	 }|	| ||d	 d< ||d	 d< ||d	 d< |durX|durX|	du rjjd }	|}t|d }tt|	|	| }dgdg|  dg }g }g }|d }t|trR|g}|d g|d< |D ]n}|d| | ||| d  }|| t|}|rdg| }|dgt|t|  7 }|| qVt|trtdd t|jD dd d}|d \}}|d \}}|d	 d o|
|d	 d< d|d	 d< jf d|| gi|d	 }t|jd  | krjjdkr fdd|D } fdd|D } fdd|d D |d< nNjjd kr fd!d|D } fd"d|D } fd#d|d D |d< t|tr:|du r:|d }|d d |d< |d }|	t||d |d$|d% |S )&a	  
        This method uses [`CLIPImageProcessor.__call__`] method to prepare image(s) for the model, and
        [`XLMRobertaTokenizerFast.__call__`] to prepare text for the model.

        Please refer to the docstring of the above two methods for more information.

        The rest of this documentation shows the arguments specific to `Kosmos2Processor`.

        Args:
            bboxes (`Union[list[tuple[int]], list[tuple[float]], list[list[tuple[int]]], list[list[tuple[float]]]]`, *optional*):
                The bounding bboxes associated to `texts`.
            num_image_tokens (`int`, *optional* defaults to 64):
                The number of (consecutive) places that are used to mark the placeholders to store image information.
                This should be the same as `latent_query_num` in the instance of `Kosmos2Config` you are using.
            first_image_token_id (`int`, *optional*):
                The token id that will be used for the first place of the subsequence that is reserved to store image
                information. If unset, will default to `self.tokenizer.unk_token_id + 1`.
            add_eos_token (`bool`, defaults to `False`):
                Whether or not to include `EOS` token id in the encoding when `add_special_tokens=True`.
        Nz*You have to specify either images or text.Ztokenizer_init_kwargsr#   r   r   r(   r   r"   r   Fr$   r%   return_tensors)r   c                    s   g | ]} j j | qS r   )r,   	bos_token)r5   srC   r   r   r7      r8   z-Kosmos2Processor.__call__.<locals>.<listcomp>rI      r   	input_idsattention_maskc                 S   s   g | ]\}}|t |fqS r   len)r5   idxr6   r   r   r   r7      r8   c                 S   s   | d S Nr   )r6   r   r   r   <lambda>   r8   z+Kosmos2Processor.__call__.<locals>.<lambda>)keyrV   rightc                    s&   g | ]}|j jg t|   qS r   r,   Zpad_token_idrS   r4   max_len_paddedrC   r   r   r7     r8   c                    s"   g | ]}|d g t |   qS r   rR   r4   r\   r   r   r7     s   c                    s"   g | ]}|d g t |   qS r]   rR   r4   r^   r   r   r7   	  s   leftc                    s&   g | ]}j jg t|  | qS r   rZ   r4   r[   r   r   r7     r8   c                    s"   g | ]}d g t |  | qS r]   rR   r4   r^   r   r   r7     s   c                    s"   g | ]}d g t |  | qS r]   rR   r4   r^   r   r   r7     s   )rP   rQ   image_embeds_position_mask)dataZtensor_type)
ValueErrorZ_merge_kwargsr!   r,   Zinit_kwargspop
setdefaultr   r+   updatepreprocess_examples
isinstancer2   rL   r   Zunk_token_idr   r?   r@   copyrS   sorted	enumeraterP   Zpadding_sider   )rC   rH   rI   ZaudioZvideosrD   Zoutput_kwargsr   r   r   r   r$   r%   rK   encodingZimage_encodingZtext_encodingZwith_bosstart_indexZimage_token_idsZbase_image_embeds_position_maskrP   r`   Zall_input_idsZtext_idsmaskZsorted_length_Zmin_len_not_paddedrT   r   r[   r   __call__   s    




 









zKosmos2Processor.__call__c                 C   s   |du rdS t |tstd|D ]x}|du r2q"nt |tsB|g}|D ]R}t |trt|dkrrtdd |D sFt|dkrtdd |D sFtdqFq"dS )	a  
        Check `bboxes` for a single text example. It could be
            - `None`: no bounding box associated to a text.
            - A list with each element being the bounding boxes associated to one `<phrase> ... </phrase>` pair found
              in a text. This could be:
                  - `None`: no bounding box associated to a `<phrase> ... </phrase>` pair.
                  - A tuple of 2 integers: A single bounding box specified by patch indices.
                  - A tuple of 4 float point number: A single bounding box specified by (normalized) coordinates.
                  - A list containing the above 2 tuple types: Multiple bounding boxes for a
                   `<phrase> ... </phrase>` pair.
        Nz@`bboxes` (for a single text example) should be `None` or a list.   c                 s   s   | ]}t |tV  qd S N)rg   r   r4   r   r   r   	<genexpr>B  r8   zAKosmos2Processor._check_bboxes_for_single_text.<locals>.<genexpr>r0   c                 s   s   | ]}t |tV  qd S rq   )rg   r   r4   r   r   r   rr   C  r8   a'  Each element in `bboxes` (for a single text example) should be either `None`, a tuple containing 2 integers or 4 float point numbers, or a list containing such tuples. Also make sure the arguments `texts` and `bboxes` passed to `preprocess_text` are both in batches or both for a single example.)rg   r   rb   tuplerS   all)rC   r   bboxelementr   r   r   _check_bboxes_for_single_text)  s,    




z.Kosmos2Processor._check_bboxes_for_single_textc                 C   s.   |  }|d ur| d| }| ||}|S )N )strip_insert_patch_index_tokens)rC   rI   imager   img_info_tokensr   r   r   _preprocess_single_exampleL  s
    z+Kosmos2Processor._preprocess_single_exampler(   )textsrH   r   r   rJ   c           	         sD  j g| }dj g| jg  d}t|tr>d}|g}|du rVdgt| }nt|sd|g}t|t|krtdt| dt| d|s| |g}n>|durt|t	std|D ]}| qndgt| }t|t|krtd	t| dt| d fd
dt
|||D }|s@|d }|S )a-  Add image and bounding box information to `texts` as image and patch index tokens.

        Args:
            texts (`Union[TextInput, list[TextInput]]`): The texts to be processed.
            images (`ImageInput`, *optional*): The images associated to `texts`.
            bboxes (`Union[list[tuple[int]], list[tuple[float]], list[list[tuple[int]]], list[list[tuple[float]]]]`, *optional*):
                The bounding bboxes associated to `texts`.
            num_image_tokens (`int`, *optional*, defaults to 64):
                The number of image tokens (used as latent queries). This should corresponds to the `latent_query_num`
                attribute in `Kosmos2Config`.

        Returns:
            `Union[TextInput, list[TextInput]]`: The processed texts with image and patch index tokens.
        rx   TFNzGThe number of examples in `texts` and `images` should be the same. Got  v.s. 	 instead.zS`bboxes` should be `None` or a list (as a batch) when `texts` is passed as a batch.zGThe number of examples in `texts` and `bboxes` should be the same. Got c                    s"   g | ]\}}} ||| qS r   )r}   )r5   rI   r{   ru   r|   rC   r   r   r7     s   z8Kosmos2Processor.preprocess_examples.<locals>.<listcomp>r   )r<   joinr=   rg   r2   rS   r   rb   rw   r   zip)	rC   r~   rH   r   r   Z
img_tokensZbatchedr6   resultr   r   r   rf   V  sB    



z$Kosmos2Processor.preprocess_examplesTc                 C   s    | | jd }|rt|S |S rU   )splitr=   +clean_text_and_extract_entities_with_bboxes)rC   rI   cleanup_and_extractcaptionr   r   r   post_process_generation  s    z(Kosmos2Processor.post_process_generationc                    s(    j |fd|i|} fdd|D S )a  
        Post-process the output of the model to decode the text.

        Args:
            generated_outputs (`torch.Tensor` or `np.ndarray`):
                The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
                or `(sequence_length,)`.
            skip_special_tokens (`bool`, *optional*, defaults to `True`):
                Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
            **kwargs:
                Additional arguments to be passed to the tokenizer's `batch_decode method`.

        Returns:
            `list[str]`: The decoded text.
        skip_special_tokensc                    s   g | ]} j |d dqS )F)r   )r   )r5   rI   rN   r   r   r7     r8   zDKosmos2Processor.post_process_image_text_to_text.<locals>.<listcomp>)Zbatch_decode)rC   Zgenerated_outputsr   rD   Zgenerated_textsr   rN   r   post_process_image_text_to_text  s    z0Kosmos2Processor.post_process_image_text_to_textc                 C   s   | j j}| jj}|| dg S )Nr`   )r,   model_input_namesr+   )rC   Ztokenizer_input_namesZimage_processor_input_namesr   r   r   r     s    z"Kosmos2Processor.model_input_names)rI   r   rJ   c                 C   sT  |d u st |dkr|S ttjd|d}t |t |krXtdt | dt | dd}g }t||D ]\}}| \}}	||||	  |	}|d u rqjt|t	r|g}g }
t
dd |D std	|D ]&}| |\}}|
| d
|  qt |
dkrqjd|
}|d| d qj|t |k rF|||d   d|}|S )Nr   z<phrase>.+?</phrase>)stringzuThe number of elements in `bboxes` should be the same as the number of `<phrase> ... </phrase>` pairs in `text`. Got r   r   c                 s   s   | ]}|d uV  qd S rq   r   )r5   boxr   r   r   rr     r8   z>Kosmos2Processor._insert_patch_index_tokens.<locals>.<genexpr>zTThe multiple bounding boxes for a single phrase should not contain any `None` value.rx   z  </delimiter_of_multi_objects/> z	<object> z
 </object> )rS   r   refinditerrb   r   spanr@   rg   rs   rt   #_convert_bbox_to_patch_index_tokensr   )rC   rI   r   Zmatched_phrasescurr_posbufferZmatchedru   rn   endZpatch_index_stringsr   Zpatch_index_1Zpatch_index_2Zposition_strr   r   r   rz     sB    


z+Kosmos2Processor._insert_patch_index_tokens)ru   rJ   c                 C   sh   t |dkr|\}}ntt| j}t||\}}dt|d d}dt|d d}||fS )Nrp   r/   r0   r1   )rS   r   mathsqrtr>   coordinate_to_patch_indexr2   r3   )rC   ru   Zidx_1Zidx_2num_patches_per_sideZtoken_1Ztoken_2r   r   r   r     s    
z4Kosmos2Processor._convert_bbox_to_patch_index_tokens)r-   )NNNN)NNr(   )T)T) r   r   r   __doc__
attributesZimage_processor_classZtokenizer_classrB   r   r   r   r   r   r!   r   ro   rw   r}   	BboxInputr   r   r2   rf   r   r   propertyr   rs   r   rz   r   __classcell__r   r   rF   r   r*   D   sH   /     ##   B


*.
r*   )ru   r   rJ   c                 C   s   | \}}}}||kr||ks$t dt|| }t|| }t|| d }t|| d }	|| | }
|	| | }|
|fS )a  Convert a bounding box to a pair of patch indices.

    Args:
        bbox (`tuple[float, float, float, float]`):
            The 4 coordinates of the bounding box, with the format being (x1, y1, x2, y2) specifying the upper-left and
            lower-right corners of the box. It should have x2 > x1 and y2 > y1.
        num_patches_per_side (`int`): the number of patches along each side.

    Returns:
        `tuple[int, int]`: A pair of patch indices representing the upper-left patch and lower-right patch.
    zTThe coordinates in `bbox` should be `(x1, y1, x2, y2)` with `x2 > x1` and `y2 > y1`.rO   )rb   r   floorceil)ru   r   x1y1x2y2ul_xul_ylr_xlr_yul_idxlr_idxr   r   r   r     s    r   )r   r   r   c                 C   s   d| }| | }| | }|| }|| }| |krZ|| }|| }	|| | }
|| | }nz||ksj||kr|| }|| }	|| | }
|| | }n@|| |d  }|| |d  }	|| |d  }
|| |d  }||	|
|fS )a  
    Given a grid of length `num_patches_per_side` and the indices of the upper-left and lower-right corners of a
    bounding box, returns the normalized coordinates of the bounding box, in the form (x1, y1, x2, y2).

    Args:
        ul_idx (`int`): the index of the grid cell that corresponds to the upper-left corner of the bounding box.
        lr_idx (`int`): the index of the grid cell that corresponds to the lower-right corner of the bounding box.
        num_patches_per_side (`int`): the number of patches along each side.

    Returns:
        `tuple[float]`: the normalized coordinates of the bounding box, in the form (x1, y1, x2, y2).
    g      ?rp   r   )r   r   r   	cell_sizer   r   r   r   r   r   r   r   r   r   r   patch_index_to_coordinate  s(    r   c              	   C   s6  d}t || }g }|D ]}|d}| \}}}|sZd}|dd |dd f}|d}	g }
|	D ]v}t d|}t d|dd }|rl|rl|r|
t|dt|df ql|
t|dt|df ql|r||||
f q|
D ]0}d|d  d	|d  d
}||||gf qq|S )a  Extract entities contained in `text`. The bounding bboxes is given in the form of patch indices.

    This functioin is only intended to be used within `clean_text_and_extract_entities_with_bboxes` where further
    processing happens, including converting to normalized coordinates and whitespace character cleaning up.

    Examples:

    ```python
    >>> text = "<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>."
    >>> entities = extract_entities_with_patch_indices(text)
    >>> entities
    [(' a snowman', (31, 41), [(44, 863)]), (' a fire', (130, 137), [(5, 911)])]
    ```z(?:(<phrase>([^<]+)</phrase>))?<object>((?:<patch_index_\d+><patch_index_\d+></delimiter_of_multi_objects/>)*<patch_index_\d+><patch_index_\d+>)</object>rp   Nr   r.   z<patch_index_(\d+)>rO   r/   z><patch_index_r1   )	r   r   r   groupsr   searchr@   r   group)rI   patternmatchesentities_with_patch_indicesmatchr   Z
phrase_tagphraseZmatch_contentZpatch_index_pairsZentity_bboxespairr6   yru   entityr   r   r   #extract_entities_with_patch_indicesB  s0    


$$r   c                 C   sP   | \}\}}t tdd|d| }t tdd|d| }|||ff}|S )zfAdjust the positions of the entities in `text` to be relative to the text with special fields removed.<.*?>r   N)rS   r   sub)r   rI   entity_namestartr   Zadjusted_startZadjusted_endadjusted_entityr   r   r   adjust_entity_positions|  s
    r   c                 C   s   |   }t| t|   }g }|D ]j\}\}}}t|t|  }	t|t|  }
|| |	 }|| |
 }|  }||||f|f q$||fS )z9Remove the spaces around the text and the entities in it.)ry   rS   r9   r:   r@   )rI   entitiesnew_textZleading_spacesZnew_entitiesr   r   r   r   Zentity_name_leading_spacesZentity_name_trailing_spacesr   r   r   _cleanup_spaces  s    r       c           
         sp   t dd| }t| }g }|D ]F}|dd |d  }}t|| } fdd|D }	|||	f  qt||S )a  Remove the tag tokens from `text`, extract entities in it with some cleaning up of white characters.

    Examples:

    ```python
    >>> text = "<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>."
    >>> clean_text, entities = clean_text_and_extract_entities_with_bboxes(text)
    >>> clean_text
    'An image of a snowman warming himself by a fire.'

    >>> entities
    [('a snowman', (12, 21), [(0.390625, 0.046875, 0.984375, 0.828125)]), ('a fire', (41, 47), [(0.171875, 0.015625, 0.484375, 0.890625)])]
    ```r   r   r   rp   c                    s    g | ]}t |d  |d  qS )r   rO   )r   )r5   ru   r   r   r   r7     r8   z?clean_text_and_extract_entities_with_bboxes.<locals>.<listcomp>)r   r   r   r   r@   r   )
rI   r   Zprocessed_textr   r   itemr   r   r   Zbboxes_in_coordsr   r   r   r     s    
r   )r   )'r   rh   r   r   typingr   r   Zimage_processing_utilsr   Zimage_utilsr   r   Zprocessing_utilsr   r	   r
   r   r   Ztokenization_utilsr   Ztokenization_utils_baser   r   r   rs   r   r   r   r   r   r!   r*   r   r   r   r   r   r   __all__r   r   r   r   <module>   s<      5&-:

