a
    h,                     @   sb   d Z ddlZddlmZmZ ddlmZ ddlmZ ddl	m
Z
 g d	ZG d
d deZdgZdS )z
Processor class for EVOLLA.
    N)OptionalUnion   )BatchFeature)ProcessorMixin   )AutoTokenizer)aa_seqfoldseekZmsac                       s   e Zd ZdZddgZdgZdZdZdZd fd	d
	Z	d ddZ
d!edddZd"eeee ef  eeeee  ee f  ee ee dddZdd Zdd Zdd Zdd Z fddZe fddZ  ZS )#EvollaProcessoran  
    Constructs a EVOLLA processor which wraps a LLama tokenizer and SaProt tokenizer (EsmTokenizer) into a single processor.

    [`EvollaProcessor`] offers all the functionalities of [`EsmTokenizer`] and [`LlamaTokenizerFast`]. See the
    docstring of [`~EvollaProcessor.__call__`] and [`~EvollaProcessor.decode`] for more information.

    Args:
        protein_tokenizer (`EsmTokenizer`):
            An instance of [`EsmTokenizer`]. The protein tokenizer is a required input.
        tokenizer (`LlamaTokenizerFast`, *optional*):
            An instance of [`LlamaTokenizerFast`]. The tokenizer is a required input.
        protein_max_length (`int`, *optional*, defaults to 1024):
            The maximum length of the sequence to be generated.
        text_max_length (`int`, *optional*, defaults to 512):
            The maximum length of the text to be generated.
    protein_tokenizer	tokenizerZsequence_max_lengthr   N      c                    sF   |d u rt d|d u r t dt || d| j_|| _|| _d S )Nz+You need to specify an `protein_tokenizer`.z"You need to specify a `tokenizer`.z<|reserved_special_token_0|>)
ValueErrorsuper__init__r   Z	pad_tokenprotein_max_lengthtext_max_length)selfr   r   r   r   kwargs	__class__ h/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/evolla/processing_evolla.pyr   ;   s    zEvollaProcessor.__init__c           	      C   s`   g }|D ]<}| d}| d}ddd t||D }|| q| jj|dd|dd}|S )	Nr	   r
    c                 S   s    g | ]\}}|  |  qS r   )upperlower).0sfr   r   r   
<listcomp>L       z4EvollaProcessor.process_proteins.<locals>.<listcomp>ptT)return_tensors
truncation
max_lengthpadding)getjoinzipappendr   Zbatch_encode_plus)	r   proteinsr   Zsa_sequencesZproteinr	   r
   Zsa_sequence	sa_tokensr   r   r   process_proteinsG   s    


z EvollaProcessor.process_proteins)r   c                 C   sD   g }|D ] }| j j|ddd}|| q| j |dddd|d}|S )NFT)tokenizeZadd_generation_promptr#   Zlongest)Zadd_special_tokensr$   r'   r%   r&   )r   Zapply_chat_templater+   )r   Ztextsr   ZpromptsmessagespromptZprompt_inputsr   r   r   process_textT   s"    zEvollaProcessor.process_text)r,   messages_listr   r   c           	      K   s  |du s|du rt d|dur$|n| j}|dur6|n| j}t|trL|g}t|ttfrrt|d ttfsr|g}t|ttfrtdd |D st dt|ttfrtdd |D st dd	t	 d
| t|ttfrd|D ]x}t|ttfst dt
| dtdd |D s,t dtdd |D sRtdd |D rt d| qnt dt
| d| ||}| ||}t|d |d |d |d ddS )av  This method takes batched or non-batched proteins and messages_list and converts them into format that can be used by
        the model.

        Args:
            proteins (`Union[List[dict], dict]`):
                A list of dictionaries or a single dictionary containing the following keys:
                    - `"aa_seq"` (`str`) -- The amino acid sequence of the protein.
                    - `"foldseek"` (`str`) -- The foldseek string of the protein.
            messages_list (`Union[List[List[dict]], List[dict]]`):
                A list of lists of dictionaries or a list of dictionaries containing the following keys:
                    - `"role"` (`str`) -- The role of the message.
                    - `"content"` (`str`) -- The content of the message.
            protein_max_length (`int`, *optional*, defaults to 1024):
                The maximum length of the sequence to be generated.
            text_max_length (`int`, *optional*, defaults to 512):
                The maximum length of the text.

        Return:
            a dict with following keys:
                - `protein_input_ids` (`torch.Tensor` of shape `(batch_size, sequence_length)`) -- The input IDs for the protein sequence.
                - `protein_attention_mask` (`torch.Tensor` of shape `(batch_size, sequence_length)`) -- The attention mask for the protein sequence.
                - `text_input_ids` (`torch.Tensor` of shape `(batch_size, sequence_length)`) -- The input IDs for the text sequence.
                - `text_attention_mask` (`torch.Tensor` of shape `(batch_size, sequence_length)`) -- The attention mask for the text sequence.
        Nz3You need to specify `messages_list` and `proteins`.r   c                 s   s   | ]}t |tV  qd S N
isinstancedictr   pr   r   r   	<genexpr>   r"   z+EvollaProcessor.__call__.<locals>.<genexpr>zUThe proteins should be a list of dictionaries, but not all elements are dictionaries.c                 s   s$   | ]}t d d | D V  qdS )c                 s   s   | ]}|t v V  qd S r4   )PROTEIN_VALID_KEYS)r   kr   r   r   r:      r"   z5EvollaProcessor.__call__.<locals>.<genexpr>.<genexpr>N)allkeysr8   r   r   r   r:      s   z2There should be a list of dictionaries with keys: z, z for each protein.But got: z;Each messages in messages_list should be a list instead of .c                 s   s   | ]}t |tV  qd S r4   r5   r   mr   r   r   r:      r"   zfEach message in messages_list should be a list of dictionaries, but not all elements are dictionaries.c                 s   s   | ]}t | d kV  qdS )r   N)lenr>   r@   r   r   r   r:      r"   c                 s   s"   | ]}t | d dhkV  qdS )ZrolecontentN)setr>   r@   r   r   r   r:      s   zlEach message in messages_list should be a list of dictionaries with two keys: 'role' and 'content'.But got: zFThe messages_list should be a list of lists of dictionaries, but it's 	input_idsattention_mask)Zprotein_input_idsZprotein_attention_maskrE   rF   )data)r   r   r   r6   r7   listtupler=   r)   r;   typeanyr.   r2   r   )	r   r,   r3   r   r   r   r0   r-   Ztext_tokensr   r   r   __call__l   s`    !
  zEvollaProcessor.__call__c                 O   s   | j j|i |S r4   )r   batch_decoder   argsr   r   r   r   rM      s    zEvollaProcessor.batch_decodec                 O   s   | j j|i |S r4   )r   decoderN   r   r   r   rP      s    zEvollaProcessor.decodec                 O   s   | j j|i |S r4   )r   rM   rN   r   r   r   protein_batch_decode   s    z$EvollaProcessor.protein_batch_decodec                 O   s   | j j|i |S r4   )r   rP   rN   r   r   r   protein_decode   s    zEvollaProcessor.protein_decodec                    s   | j tj|| j d| jv }|r2| jdnd }|rN|d urN| jd t	 j|fi |}|r||d ur|| j
|d |S )Nr   )r   save_pretrainedospathr)   protein_tokenizer_dir_name
attributesindexremover   insert)r   Zsave_directoryr   Zprotein_tokenizer_presentZprotein_tokenizer_indexoutputsr   r   r   rS      s    
zEvollaProcessor.save_pretrainedc                    s@   t  j|fi |}t|tr&|d }tj|| jd}||_|S )Nr   )Z	subfolder)r   from_pretrainedr6   rI   r   rV   r   )clsZpretrained_model_name_or_pathr   	processorr   r   r   r   r\      s    
zEvollaProcessor.from_pretrained)Nr   r   )r   )r   )NNNN)__name__
__module____qualname____doc__rW   Zvalid_kwargsZprotein_tokenizer_classZtokenizer_classrV   r   r.   intr2   r   r   rH   r7   rL   rM   rP   rQ   rR   rS   classmethodr\   __classcell__r   r   r   r   r       s:   
     Yr   )rb   rT   typingr   r   Zfeature_extraction_utilsr   Zprocessing_utilsr   autor   r;   r   __all__r   r   r   r   <module>   s    X