a
    hK                     @   sj   d Z ddlmZ ddlmZ ddlmZ ddlmZ e	e
ZG dd deZG d	d
 d
eZdd
gZdS )zmT5 model configuration    )Mapping   )PretrainedConfig)OnnxSeq2SeqConfigWithPast)loggingc                       s:   e Zd ZdZdZdgZdddddZd fdd	Z  ZS )	MT5Configa7  
    This is the configuration class to store the configuration of a [`MT5Model`] or a [`TFMT5Model`]. It is used to
    instantiate a mT5 model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the mT5
    [google/mt5-small](https://huggingface.co/google/mt5-small) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Arguments:
        vocab_size (`int`, *optional*, defaults to 250112):
            Vocabulary size of the T5 model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`T5Model`] or [`TFT5Model`].
        d_model (`int`, *optional*, defaults to 512):
            Size of the encoder layers and the pooler layer.
        d_kv (`int`, *optional*, defaults to 64):
            Size of the key, query, value projections per attention head. In the conventional context, it is typically expected that `d_kv` has to be equal to `d_model // num_heads`.
            But in the architecture of mt5-small, `d_kv` is not equal to `d_model //num_heads`. The `inner_dim` of the projection layer will be defined as `num_heads * d_kv`.
        d_ff (`int`, *optional*, defaults to 1024):
            Size of the intermediate feed forward layer in each `T5Block`.
        num_layers (`int`, *optional*, defaults to 8):
            Number of hidden layers in the Transformer encoder.
        num_decoder_layers (`int`, *optional*):
            Number of hidden layers in the Transformer decoder. Will use the same value as `num_layers` if not set.
        num_heads (`int`, *optional*, defaults to 6):
            Number of attention heads for each attention layer in the Transformer encoder.
        relative_attention_num_buckets (`int`, *optional*, defaults to 32):
            The number of buckets to use for each attention layer.
        relative_attention_max_distance (`int`, *optional*, defaults to 128):
            The maximum distance of the longer sequences for the bucket separation.
        dropout_rate (`float`, *optional*, defaults to 0.1):
            The ratio for all dropout layers.
        classifier_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for classifier.
        layer_norm_eps (`float`, *optional*, defaults to 1e-6):
            The epsilon used by the layer normalization layers.
        initializer_factor (`float`, *optional*, defaults to 1):
            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
            testing).
        feed_forward_proj (`string`, *optional*, defaults to `"gated-gelu"`):
            Type of feed forward layer to be used. Should be one of `"relu"` or `"gated-gelu"`.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
    Zmt5Zpast_key_valuesd_model	num_heads
num_layersd_kv)Zhidden_sizeZnum_attention_headsZnum_hidden_layersZhead_dim     @         N          皙?ư>      ?
gated-geluTT5TokenizerFr              c              	      s   || _ || _|| _|| _|| _|d ur*|n| j| _|| _|| _|	| _|
| _	|| _
|| _|| _|| _|| _| jd}|d | _|d dk| _t|dkr|d dkst|dkrtd| d|d	krd
| _t jf ||||||d| d S )N-r   Zgatedr      z`feed_forward_proj`: z is not a valid activation function of the dense layer. Please make sure `feed_forward_proj` is of the format `gated-{ACT_FN}` or `{ACT_FN}`, e.g. 'gated-gelu' or 'relu'r   Zgelu_new)is_encoder_decodertokenizer_classtie_word_embeddingspad_token_ideos_token_iddecoder_start_token_id)
vocab_sizer   r   d_ffr
   num_decoder_layersr	   relative_attention_num_bucketsrelative_attention_max_distancedropout_rateclassifier_dropoutlayer_norm_epsiloninitializer_factorfeed_forward_proj	use_cachesplitZdense_act_fnZis_gated_actlen
ValueErrorsuper__init__)selfr$   r   r   r%   r
   r&   r	   r'   r(   r)   r+   r,   r-   r   r.   r   r    r!   r"   r#   r*   kwargsZact_info	__class__ e/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/mt5/configuration_mt5.pyr3   R   sF    
$
zMT5Config.__init__)r   r   r   r   r   Nr   r   r   r   r   r   r   TTr   Fr   r   r   r   )	__name__
__module____qualname____doc__Z
model_typeZkeys_to_ignore_at_inferenceZattribute_mapr3   __classcell__r8   r8   r6   r9   r      s<   -	                     r   c                   @   sR   e Zd Zeeeeeef f dddZeedddZee	dddZ
dS )	MT5OnnxConfig)returnc                 C   sx   ddddddd}| j rDd|d d< ddi|d	< dd
d|d< nddd|d	< ddd|d< | j rt| j|dd |S )NbatchZencoder_sequence)r   r   )Z	input_idsattention_maskz past_encoder_sequence + sequencerB   r   r   Zdecoder_input_idsz past_decoder_sequence + sequenceZdecoder_attention_maskZdecoder_sequenceinputs)	direction)Zuse_pastZfill_with_past_key_values_)r4   Zcommon_inputsr8   r8   r9   rC      s    zMT5OnnxConfig.inputsc                 C   s   dS )N   r8   r4   r8   r8   r9   default_onnx_opset   s    z MT5OnnxConfig.default_onnx_opsetc                 C   s   dS )NgMb@?r8   rF   r8   r8   r9   atol_for_validation   s    z!MT5OnnxConfig.atol_for_validationN)r:   r;   r<   propertyr   strintrC   rG   floatrH   r8   r8   r8   r9   r?      s    r?   N)r=   collections.abcr   Zconfiguration_utilsr   Zonnxr   utilsr   Z
get_loggerr:   loggerr   r?   __all__r8   r8   r8   r9   <module>   s   
|