"""PyTorch CLAP model."""

import collections
import math
from dataclasses import dataclass
from typing import Any, Callable, Optional, Union

import torch
import torch.nn.functional as F
from torch import nn

from ...activations import ACT2FN
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPooling,
    BaseModelOutputWithPoolingAndCrossAttentions,
)
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...pytorch_utils import (
    apply_chunking_to_forward,
    find_pruneable_heads_and_indices,
    meshgrid,
    prune_linear_layer,
)
from ...utils import ModelOutput, auto_docstring, can_return_tuple, logging, torch_int
from .configuration_clap import ClapAudioConfig, ClapConfig, ClapTextConfig


logger = logging.get_logger(__name__)


def interpolate(hidden_states, ratio):
    """
    Interpolate data in time domain. This is used to compensate the resolution reduction in downsampling of a CNN.

    Args:
        hidden_states (`torch.FloatTensor` of shape (batch_size, time_length, classes_num)):
            Input hidden states
        ratio (`int`):
            The ratio of the length of the output to the length of the input.
    """
    (batch_size, time_length, classes_num) = hidden_states.shape
    upsampled = hidden_states[:, :, None, :].repeat(1, 1, ratio, 1)
    upsampled = upsampled.reshape(batch_size, time_length * ratio, classes_num)
    return upsampled


def window_partition(hidden_states, window_size):
    """
    Returns the resized hidden states. The output shape should be `(batch_size * num_windows, window_size, window_size,
    num_channels)`

    Args:
        hidden_states (`torch.FloatTensor` of shape `(batch_size, height, width, num_channels)`):
            Input hidden states
        window_size (`int`):
            Window size
    """
    batch_size, height, width, num_channels = hidden_states.shape

    hidden_states = hidden_states.view(
        batch_size, height // window_size, window_size, width // window_size, window_size, num_channels
    )
    windows = hidden_states.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, num_channels)
    return windows


def window_reverse(windows, window_size, height, width):
    """
    Merges windows to produce higher resolution features.
    Args:
        windows (`torch.FloatTensor` of shape `(num_windows * batch_size, window_size, window_size, num_channels)`):
            Input windows
        window_size (`int`):
            Window size
        height (`int`):
            Height of the resized audio
        width (`int`):
            Width of the resized audio
    """
    num_channels = windows.shape[-1]
    windows = windows.view(-1, height // window_size, width // window_size, window_size, window_size, num_channels)
    windows = windows.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, height, width, num_channels)
    return windows


def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    """
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        x: torch.Tensor x:

    Returns: torch.Tensor
    """
    mask = input_ids.ne(padding_idx).int()
    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
    return incremental_indices.long() + padding_idx


def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
    labels = torch.arange(len(logits), device=logits.device)
    return nn.functional.cross_entropy(logits, labels)
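

# A minimal usage sketch of `contrastive_loss` (illustrative only; the tensors below are
# random placeholders, not real model outputs). `ClapModel.forward` applies it to both
# orientations of the text/audio similarity matrix and averages the two losses:
#
# >>> logits_per_text = torch.randn(4, 4)  # (text_batch_size, audio_batch_size)
# >>> caption_loss = contrastive_loss(logits_per_text)
# >>> audio_loss = contrastive_loss(logits_per_text.t())
# >>> loss = (caption_loss + audio_loss) / 2.0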


@dataclass
@auto_docstring(
    custom_intro="""
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    """
)
class ClapTextModelOutput(ModelOutput):
    r"""
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The text embeddings obtained by applying the projection layer to the pooler_output.
    """

    text_embeds: Optional[torch.FloatTensor] = None
    last_hidden_state: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[tuple[torch.FloatTensor, ...]] = None


@dataclass
@auto_docstring(
    custom_intro="""
    ClapAudio model output to mimic the output of the original implementation.
    """
)
class ClapAudioModelOutput(ModelOutput):
    r"""
    audio_embeds (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
        The Audio embeddings obtained by applying the projection layer to the pooler_output.
    """

    audio_embeds: Optional[torch.FloatTensor] = None
    last_hidden_state: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[tuple[torch.FloatTensor, ...]] = None


@dataclass
@auto_docstring
class ClapOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for audio-text similarity.
    logits_per_audio (`torch.FloatTensor` of shape `(audio_batch_size, text_batch_size)`):
        The scaled dot product scores between `audio_embeds` and `text_embeds`. This represents the audio-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, audio_batch_size)`):
        The scaled dot product scores between `text_embeds` and `audio_embeds`. This represents the text-audio
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`ClapTextModel`].
    audio_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The audio embeddings obtained by applying the projection layer to the pooled output of [`ClapAudioModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`ClapTextModel`].
    audio_model_output (`BaseModelOutputWithPooling`):
        The output of the [`ClapAudioModel`].
    """

    loss: Optional[torch.FloatTensor] = None
    logits_per_audio: Optional[torch.FloatTensor] = None
    logits_per_text: Optional[torch.FloatTensor] = None
    text_embeds: Optional[torch.FloatTensor] = None
    audio_embeds: Optional[torch.FloatTensor] = None
    text_model_output: BaseModelOutputWithPooling = None
    audio_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "audio_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )


class ClapDropPath(nn.Module):
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). This is a slightly
    refactored version of the `SwinDropPath` implementation.
    """

    def __init__(self, drop_prob=None):
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states):
        if self.drop_prob == 0.0 or not self.training:
            return hidden_states

        keep_prob = 1 - self.drop_prob
        # one mask value per sample, broadcast over all remaining dimensions
        shape = (hidden_states.shape[0],) + (1,) * (hidden_states.ndim - 1)

        random_tensor = keep_prob + torch.rand(shape, dtype=hidden_states.dtype, device=hidden_states.device)
        random_tensor.floor_()  # binarize
        output = hidden_states.div(keep_prob) * random_tensor
        return output


class ClapAudioAFFBlock(nn.Module):
    r"""
    ATTENTIONAL FEATURE FUSION Block from CLAP, since in CLAP we are always in 2D mode, it is not needed to implement
    the 1D version.
    """

    def __init__(self, config: ClapAudioConfig):
        super().__init__()
        channels = config.patch_embeds_hidden_size
        downsize_ratio = config.aff_block_r
        inter_channels = int(channels // downsize_ratio)

        self.local_att = nn.Sequential(
            nn.Conv2d(channels, inter_channels, kernel_size=1, stride=1, padding=0),
            nn.BatchNorm2d(inter_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(inter_channels, channels, kernel_size=1, stride=1, padding=0),
            nn.BatchNorm2d(channels),
        )
        self.global_att = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(channels, inter_channels, kernel_size=1, stride=1, padding=0),
            nn.BatchNorm2d(inter_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(inter_channels, channels, kernel_size=1, stride=1, padding=0),
            nn.BatchNorm2d(channels),
        )

        self.sigmoid = nn.Sigmoid()

    def forward(self, hidden_states, residual):
        attention_input = hidden_states + residual

        fused_layer_output = self.local_att(attention_input) + self.global_att(attention_input)
        fused_layer_output = self.sigmoid(fused_layer_output)

        output = 2 * hidden_states * fused_layer_output + 2 * residual * (1 - fused_layer_output)
        return output


class ClapAudioPatchEmbed(nn.Module):
    r"""
    This module converts the hidden states reshaped as an image to patch embeddings ready to be passed to the
    Transformer block.
    """

    def __init__(self, config: ClapAudioConfig):
        super().__init__()
        img_size = (config.spec_size, config.spec_size) if isinstance(config.spec_size, int) else config.spec_size
        patch_size = (
            (config.patch_size, config.patch_size) if isinstance(config.patch_size, int) else config.patch_size
        )
        patch_stride = (
            (config.patch_stride, config.patch_stride) if isinstance(config.patch_stride, int) else config.patch_stride
        )

        self.img_size = img_size
        self.patch_stride = patch_stride

        self.grid_size = (img_size[0] // patch_stride[0], img_size[1] // patch_stride[1])
        self.num_patches = self.grid_size[0] * self.grid_size[1]

        self.flatten = config.flatten_patch_embeds
        self.enable_fusion = config.enable_fusion

        padding = ((patch_size[0] - patch_stride[0]) // 2, (patch_size[1] - patch_stride[1]) // 2)

        scale_factor = 4 if (self.enable_fusion) and (config.fusion_type == "channel_map") else 1

        self.proj = nn.Conv2d(
            config.patch_embed_input_channels * scale_factor,
            config.patch_embeds_hidden_size,
            kernel_size=patch_size,
            stride=patch_stride,
            padding=padding,
        )

        self.norm = nn.LayerNorm(config.patch_embeds_hidden_size) if config.enable_patch_layer_norm else nn.Identity()
        if self.enable_fusion:
            self.fusion_model = ClapAudioAFFBlock(config)
            self.mel_conv2d = nn.Conv2d(
                config.patch_embed_input_channels,
                config.patch_embeds_hidden_size,
                kernel_size=(patch_size[0], patch_size[1] * 3),
                stride=(patch_stride[0], patch_stride[1] * 3),
                padding=padding,
            )

    def forward(self, hidden_states, is_longer_idx=None):
        if self.enable_fusion:
            # the first channel holds the global view of the spectrogram
            global_hidden_states = hidden_states[:, 0:1, :, :]

            batch_size, num_channels, height, width = global_hidden_states.shape

            if height != self.img_size[0] or width != self.img_size[1]:
                raise ValueError(
                    f"Input audio size ({height}*{width}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
                )

            global_hidden_states = self.proj(global_hidden_states)
            output_width = global_hidden_states.size(-1)
            if len(is_longer_idx) > 0:
                # local processing of the remaining crops for the longer clips
                local_hidden_states = hidden_states[is_longer_idx, 1:, :, :].contiguous()
                batch_size, num_channels, height, width = local_hidden_states.shape
                local_hidden_states = local_hidden_states.view(batch_size * num_channels, 1, height, width)

                local_hidden_states = self.mel_conv2d(local_hidden_states)

                _, features, height, width = local_hidden_states.shape
                local_hidden_states = local_hidden_states.view(batch_size, num_channels, features, height, width)
                local_hidden_states = local_hidden_states.permute((0, 2, 3, 1, 4)).contiguous().flatten(3)

                local_width = local_hidden_states.size(-1)
                local_hidden_states = torch.nn.functional.pad(
                    local_hidden_states, (0, output_width - local_width), "constant", 0
                )

                global_hidden_states[is_longer_idx] = self.fusion_model(
                    global_hidden_states[is_longer_idx], local_hidden_states
                )
            hidden_states = global_hidden_states
        else:
            _, _, height, width = hidden_states.shape
            if height != self.img_size[0] or width != self.img_size[1]:
                raise ValueError(
                    f"Input audio size ({height}*{width}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
                )
            hidden_states = self.proj(hidden_states)

        if self.flatten:
            hidden_states = hidden_states.flatten(2).transpose(1, 2)
        hidden_states = self.norm(hidden_states)
        return hidden_states


class ClapAudioSelfAttention(nn.Module):
    def __init__(self, config, dim, num_heads, window_size):
        super().__init__()
        if dim % num_heads != 0:
            raise ValueError(
                f"The hidden size ({dim}) is not a multiple of the number of attention heads ({num_heads})"
            )

        self.num_attention_heads = num_heads
        self.attention_head_size = int(dim / num_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        self.window_size = (
            window_size if isinstance(window_size, collections.abc.Iterable) else (window_size, window_size)
        )

        self.relative_position_bias_table = nn.Parameter(
            torch.zeros((2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1), num_heads)
        )

        # pair-wise relative position index for each token inside the window
        coords_h = torch.arange(self.window_size[0])
        coords_w = torch.arange(self.window_size[1])
        coords = torch.stack(meshgrid([coords_h, coords_w], indexing="ij"))
        coords_flatten = torch.flatten(coords, 1)
        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
        relative_coords = relative_coords.permute(1, 2, 0).contiguous()
        relative_coords[:, :, 0] += self.window_size[0] - 1
        relative_coords[:, :, 1] += self.window_size[1] - 1
        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
        relative_position_index = relative_coords.sum(-1)
        self.register_buffer("relative_position_index", relative_position_index)

        self.query = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
        self.key = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
        self.value = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.Tensor]:
        batch_size, dim, num_channels = hidden_states.shape
        hidden_shape = (batch_size, dim, -1, self.attention_head_size)

        query_layer = self.query(hidden_states).view(hidden_shape).transpose(1, 2)
        key_layer = self.key(hidden_states).view(hidden_shape).transpose(1, 2)
        value_layer = self.value(hidden_states).view(hidden_shape).transpose(1, 2)

        # dot product between "query" and "key" to get the raw attention scores
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

        relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)]
        relative_position_bias = relative_position_bias.view(
            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1
        )
        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()
        attention_scores = attention_scores + relative_position_bias.unsqueeze(0)

        if attention_mask is not None:
            # apply the shifted-window attention mask (shared across the windows of one image)
            mask_shape = attention_mask.shape[0]
            attention_scores = attention_scores.view(
                batch_size // mask_shape, mask_shape, self.num_attention_heads, dim, dim
            )
            attention_scores = attention_scores + attention_mask.unsqueeze(1).unsqueeze(0)
            attention_scores = attention_scores.view(-1, self.num_attention_heads, dim, dim)

        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
        attention_probs = self.dropout(attention_probs)

        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
        return outputs


class ClapAudioSelfOutput(nn.Module):
    def __init__(self, config, dim):
        super().__init__()
        self.dense = nn.Linear(dim, dim)
        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states


class ClapAudioAttention(nn.Module):
    def __init__(self, config, dim, num_heads, window_size):
        super().__init__()
        self.self = ClapAudioSelfAttention(config, dim, num_heads, window_size)
        self.output = ClapAudioSelfOutput(config, dim)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.Tensor]:
        self_outputs = self.self(hidden_states, attention_mask, head_mask, output_attentions)
        attention_output = self.output(self_outputs[0], hidden_states)
        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


class ClapAudioIntermediate(nn.Module):
    def __init__(self, config, dim):
        super().__init__()
        self.dense = nn.Linear(dim, int(config.mlp_ratio * dim))
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


class ClapAudioOutput(nn.Module):
    def __init__(self, config, dim):
        super().__init__()
        self.dense = nn.Linear(int(config.mlp_ratio * dim), dim)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states


class ClapAudioLayer(nn.Module):
    def __init__(self, config, dim, input_resolution, num_heads, drop_path_rate=0.0, shift_size=0):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.shift_size = shift_size
        self.window_size = config.window_size
        self.input_resolution = input_resolution
        self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps)
        self.attention = ClapAudioAttention(config, dim, num_heads, window_size=self.window_size)
        self.drop_path = ClapDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
        self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps)
        self.intermediate = ClapAudioIntermediate(config, dim)
        self.output = ClapAudioOutput(config, dim)

    def set_shift_and_window_size(self, input_resolution):
        if min(input_resolution) <= self.window_size:
            # if the window is larger than the input resolution, don't partition into windows
            self.shift_size = int(0)
            self.window_size = (
                torch.min(torch.tensor(input_resolution)) if torch.jit.is_tracing() else min(input_resolution)
            )

    def get_attn_mask(self, height, width, dtype, device):
        if self.shift_size > 0:
            # attention mask for shifted-window multi-head self-attention
            img_mask = torch.zeros((1, height, width, 1), dtype=dtype, device=device)
            height_slices = (
                slice(0, -self.window_size),
                slice(-self.window_size, -self.shift_size),
                slice(-self.shift_size, None),
            )
            width_slices = (
                slice(0, -self.window_size),
                slice(-self.window_size, -self.shift_size),
                slice(-self.shift_size, None),
            )
            count = 0
            for height_slice in height_slices:
                for width_slice in width_slices:
                    img_mask[:, height_slice, width_slice, :] = count
                    count += 1

            mask_windows = window_partition(img_mask, self.window_size)
            mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
            attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
            attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
        else:
            attn_mask = None
        return attn_mask

    def maybe_pad(self, hidden_states, height, width):
        pad_right = (self.window_size - width % self.window_size) % self.window_size
        pad_bottom = (self.window_size - height % self.window_size) % self.window_size
        pad_values = (0, 0, 0, pad_right, 0, pad_bottom)
        hidden_states = nn.functional.pad(hidden_states, pad_values)
        return hidden_states, pad_values

    def forward(
        self,
        hidden_states: torch.Tensor,
        input_dimensions: tuple[int, int],
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
        always_partition: Optional[bool] = False,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        if not always_partition:
            self.set_shift_and_window_size(input_dimensions)

        height, width = input_dimensions
        batch_size, _, channels = hidden_states.size()
        shortcut = hidden_states

        hidden_states = self.layernorm_before(hidden_states)
        hidden_states = hidden_states.view(batch_size, height, width, channels)

        # pad hidden_states to multiples of the window size
        hidden_states, pad_values = self.maybe_pad(hidden_states, height, width)
        _, height_pad, width_pad, _ = hidden_states.shape

        # cyclic shift
        if self.shift_size > 0:
            shifted_hidden_states = torch.roll(hidden_states, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
        else:
            shifted_hidden_states = hidden_states

        # partition windows
        hidden_states_windows = window_partition(shifted_hidden_states, self.window_size)
        hidden_states_windows = hidden_states_windows.view(-1, self.window_size * self.window_size, channels)
        attn_mask = self.get_attn_mask(height_pad, width_pad, dtype=hidden_states.dtype, device=hidden_states.device)

        attention_outputs = self.attention(
            hidden_states_windows, attn_mask, head_mask, output_attentions=output_attentions
        )
        attention_output = attention_outputs[0]

        attention_windows = attention_output.view(-1, self.window_size, self.window_size, channels)
        shifted_windows = window_reverse(attention_windows, self.window_size, height_pad, width_pad)

        # reverse cyclic shift
        if self.shift_size > 0:
            attention_windows = torch.roll(shifted_windows, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
        else:
            attention_windows = shifted_windows

        was_padded = pad_values[3] > 0 or pad_values[5] > 0
        if was_padded:
            attention_windows = attention_windows[:, :height, :width, :].contiguous()

        attention_windows = attention_windows.view(batch_size, height * width, channels)

        hidden_states = shortcut + self.drop_path(attention_windows)

        layer_output = self.layernorm_after(hidden_states)
        layer_output = self.intermediate(layer_output)
        layer_output = hidden_states + self.output(layer_output)

        layer_outputs = (layer_output, attention_outputs[1]) if output_attentions else (layer_output,)
        return layer_outputs


class ClapAudioStage(GradientCheckpointingLayer):
    def __init__(self, config, dim, input_resolution, depth, num_heads, drop_path, downsample):
        super().__init__()
        self.config = config
        self.dim = dim
        self.blocks = nn.ModuleList(
            [
                ClapAudioLayer(
                    config=config,
                    dim=dim,
                    input_resolution=input_resolution,
                    num_heads=num_heads,
                    shift_size=0 if (i % 2 == 0) else config.window_size // 2,
                )
                for i in range(depth)
            ]
        )

        # patch merging layer
        if downsample is not None:
            self.downsample = downsample(input_resolution, dim=dim, norm_layer=nn.LayerNorm)
        else:
            self.downsample = None

        self.pointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        input_dimensions: tuple[int, int],
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
        always_partition: Optional[bool] = False,
    ) -> tuple[torch.Tensor]:
        height, width = input_dimensions
        for i, layer_module in enumerate(self.blocks):
            layer_head_mask = head_mask[i] if head_mask is not None else None

            layer_outputs = layer_module(
                hidden_states, input_dimensions, layer_head_mask, output_attentions, always_partition
            )
            hidden_states = layer_outputs[0]

        hidden_states_before_downsampling = hidden_states
        if self.downsample is not None:
            height_downsampled, width_downsampled = (height + 1) // 2, (width + 1) // 2
            output_dimensions = (height, width, height_downsampled, width_downsampled)
            hidden_states = self.downsample(hidden_states_before_downsampling, input_dimensions)
        else:
            output_dimensions = (height, width, height, width)

        stage_outputs = (hidden_states, hidden_states_before_downsampling, output_dimensions)

        if output_attentions:
            stage_outputs += layer_outputs[1:]
        return stage_outputs


class ClapAudioPatchMerging(nn.Module):
    r"""
    Patch Merging Layer.

    Args:
        input_resolution (`tuple[int]`):
            Resolution of input feature.
        dim (`int`):
            Number of input channels.
        norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
            Normalization layer class.
    """

    def __init__(self, input_resolution: tuple[int], dim: int, norm_layer: nn.Module = nn.LayerNorm) -> None:
        super().__init__()
        self.input_resolution = input_resolution
        self.dim = dim
        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
        self.norm = norm_layer(4 * dim)

    def maybe_pad(self, input_feature, height, width):
        should_pad = (height % 2 == 1) or (width % 2 == 1)
        if should_pad:
            pad_values = (0, 0, 0, width % 2, 0, height % 2)
            input_feature = nn.functional.pad(input_feature, pad_values)

        return input_feature

    def forward(self, input_feature: torch.Tensor, input_dimensions: tuple[int, int]) -> torch.Tensor:
        height, width = input_dimensions
        # `dim` is height * width
        batch_size, dim, num_channels = input_feature.shape

        input_feature = input_feature.view(batch_size, height, width, num_channels)
        # pad so that height and width are divisible by 2, if needed
        input_feature = self.maybe_pad(input_feature, height, width)
        # each slice is [batch_size, height/2, width/2, num_channels]
        input_feature_0 = input_feature[:, 0::2, 0::2, :]
        input_feature_1 = input_feature[:, 1::2, 0::2, :]
        input_feature_2 = input_feature[:, 0::2, 1::2, :]
        input_feature_3 = input_feature[:, 1::2, 1::2, :]
        # [batch_size, height/2 * width/2, 4 * num_channels]
        input_feature = torch.cat([input_feature_0, input_feature_1, input_feature_2, input_feature_3], -1)
        input_feature = input_feature.view(batch_size, -1, 4 * num_channels)

        input_feature = self.norm(input_feature)
        input_feature = self.reduction(input_feature)

        return input_feature


class ClapAudioEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.num_layers = len(config.depths)

        self.config = config
        self.patch_embed = ClapAudioPatchEmbed(config)
        self.enable_fusion = config.enable_fusion
        self.patch_stride = self.patch_embed.patch_stride
        self.spec_size = config.spec_size
        self.freq_ratio = config.spec_size // config.num_mel_bins

        self.num_features = int(config.patch_embeds_hidden_size * 2 ** (self.num_layers - 1))

        drop_path_rate = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths), device="cpu")]

        grid_size = self.patch_embed.grid_size
        self.input_resolutions = [(grid_size[0] // (2**i), grid_size[1] // (2**i)) for i in range(self.num_layers)]

        self.layers = nn.ModuleList(
            [
                ClapAudioStage(
                    config=config,
                    dim=int(config.patch_embeds_hidden_size * 2**i_layer),
                    input_resolution=self.input_resolutions[i_layer],
                    depth=config.depths[i_layer],
                    num_heads=config.num_attention_heads[i_layer],
                    drop_path=drop_path_rate[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])],
                    downsample=ClapAudioPatchMerging if (i_layer < self.num_layers - 1) else None,
                )
                for i_layer in range(self.num_layers)
            ]
        )

        self.gradient_checkpointing = False

        self.batch_norm = nn.BatchNorm2d(config.num_mel_bins)
        self.norm = nn.LayerNorm(self.num_features)
        self.depths = config.depths
        self.avgpool = nn.AdaptiveAvgPool1d(1)

    def reshape_mel2img(self, normalized_input_features):
        """
        The input is 4 normalized log mel spectrograms. It is reshape to the common shape of images. Each channel
        should represent 1 of the 4 crops of the spectrogram. For more details, refer to the [`ClapFeatureExtractor`].
        """
        _, _, time_length, freq_length = normalized_input_features.shape

        spec_width = int(self.spec_size * self.freq_ratio)
        spec_height = self.spec_size // self.freq_ratio

        if time_length > spec_width or freq_length > spec_height:
            raise ValueError("the wav size should be less than or equal to the swin input size")

        # to avoid bicubic zero error
        if time_length < spec_width:
            normalized_input_features = nn.functional.interpolate(
                normalized_input_features, (spec_width, freq_length), mode="bicubic", align_corners=True
            )
        if freq_length < spec_height:
            normalized_input_features = nn.functional.interpolate(
                normalized_input_features, (time_length, spec_height), mode="bicubic", align_corners=True
            )

        batch, channels, time, freq = normalized_input_features.shape

        # fold the time axis by `freq_ratio` and stack it along the frequency axis
        normalized_input_features = normalized_input_features.reshape(
            batch, channels * self.freq_ratio, time // self.freq_ratio, freq
        )
        normalized_input_features = normalized_input_features.permute(0, 1, 3, 2).contiguous()
        normalized_input_features = normalized_input_features.reshape(
            batch, channels, freq * self.freq_ratio, time // self.freq_ratio
        )

        return normalized_input_features

    def forward(
        self,
        input_features,
        is_longer: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        output_hidden_states_before_downsampling: Optional[bool] = False,
        always_partition: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[tuple, ClapAudioModelOutput]:
        input_features = input_features.transpose(1, 3)
        normalized_input_features = self.batch_norm(input_features)
        normalized_input_features = normalized_input_features.transpose(1, 3)

        is_longer_list_idx = None
        if self.enable_fusion:
            is_longer_list = is_longer.to(input_features.device)
            is_longer_list_idx = torch.where(is_longer_list == 1)[0]

        hidden_states = self.reshape_mel2img(normalized_input_features)

        frames_num = hidden_states.shape[2]

        hidden_states = self.patch_embed(hidden_states, is_longer_list_idx)

        all_hidden_states = () if output_hidden_states else None
        all_reshaped_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        input_dimensions = self.input_resolutions[0]

        if output_hidden_states:
            batch_size, _, hidden_size = hidden_states.shape
            # rearrange batch_size (height width) channels -> batch_size channel height width
            reshaped_hidden_state = hidden_states.view(batch_size, *input_dimensions, hidden_size)
            reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2)
            all_hidden_states += (hidden_states,)
            all_reshaped_hidden_states += (reshaped_hidden_state,)

        for i, layer_module in enumerate(self.layers):
            layer_head_mask = head_mask[i] if head_mask is not None else None

            input_dimensions = self.input_resolutions[i]

            layer_outputs = layer_module(
                hidden_states, input_dimensions, layer_head_mask, output_attentions, always_partition
            )

            hidden_states = layer_outputs[0]
            hidden_states_before_downsampling = layer_outputs[1]
            output_dimensions = layer_outputs[2]

            input_dimensions = (output_dimensions[-2], output_dimensions[-1])

            if output_hidden_states and output_hidden_states_before_downsampling:
                batch_size, _, hidden_size = hidden_states_before_downsampling.shape
                # use the original (not downsampled) height and width
                reshaped_hidden_state = hidden_states_before_downsampling.view(
                    batch_size, *(output_dimensions[0], output_dimensions[1]), hidden_size
                )
                reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2)
                all_hidden_states += (hidden_states_before_downsampling,)
                all_reshaped_hidden_states += (reshaped_hidden_state,)
            elif output_hidden_states and not output_hidden_states_before_downsampling:
                batch_size, _, hidden_size = hidden_states.shape
                reshaped_hidden_state = hidden_states.view(batch_size, *input_dimensions, hidden_size)
                reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2)
                all_hidden_states += (hidden_states,)
                all_reshaped_hidden_states += (reshaped_hidden_state,)

            if output_attentions:
                all_self_attentions += layer_outputs[3:]

        last_hidden_state = self.norm(hidden_states)

        batch_size, _, n_channels = last_hidden_state.shape

        freq_shape = frames_num // (2 ** (len(self.depths) - 1)) // self.patch_stride[0]
        temporal_shape = frames_num // (2 ** (len(self.depths) - 1)) // self.patch_stride[1]

        last_hidden_state = (
            last_hidden_state.permute(0, 2, 1).contiguous().reshape(batch_size, n_channels, freq_shape, temporal_shape)
        )

        batch_size, n_channels, n_frequencies, n_temp = last_hidden_state.shape
        # group 2D CNN
        c_freq_bin = n_frequencies // self.freq_ratio
        last_hidden_state = last_hidden_state.reshape(
            batch_size, n_channels, n_frequencies // c_freq_bin, c_freq_bin, n_temp
        )
        last_hidden_state = (
            last_hidden_state.permute(0, 1, 3, 2, 4).contiguous().reshape(batch_size, n_channels, c_freq_bin, -1)
        )
        latent_output = self.avgpool(torch.flatten(last_hidden_state, 2))
        latent_output = torch.flatten(latent_output, 1)

        if not return_dict:
            return tuple(
                v
                for v in [last_hidden_state, latent_output, all_reshaped_hidden_states, all_self_attentions]
                if v is not None
            )

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=latent_output,
            hidden_states=all_reshaped_hidden_states,
            attentions=all_self_attentions,
        )


class ClapProjectionLayer(nn.Module):
    def __init__(self, config: Union[ClapAudioConfig, ClapTextConfig]):
        super().__init__()
        self.config = config
        hidden_size = config.hidden_size
        projection_dim = config.projection_dim

        self.linear1 = nn.Linear(hidden_size, projection_dim)
        self.activation = ACT2FN[config.projection_hidden_act]
        self.linear2 = nn.Linear(projection_dim, projection_dim)

    def forward(self, hidden_states):
        hidden_states = self.linear1(hidden_states)
        hidden_states = self.activation(hidden_states)
        hidden_states = self.linear2(hidden_states)
        return hidden_states


class ClapTextEmbeddings(nn.Module):
    """
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    """

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        self.register_buffer(
            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
        )

        self.padding_idx = config.pad_token_id
        self.position_embeddings = nn.Embedding(
            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
        )

    def forward(
        self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
    ):
        if position_ids is None:
            if input_ids is not None:
                # Create the position ids from the input token ids. Any padded tokens remain padded.
                position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length)
            else:
                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)

        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        if token_type_ids is None:
            if hasattr(self, "token_type_ids"):
                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + token_type_embeddings
        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            embeddings += position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings

    def create_position_ids_from_inputs_embeds(self, inputs_embeds):
        """
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        """
        input_shape = inputs_embeds.size()[:-1]
        sequence_length = input_shape[1]

        position_ids = torch.arange(
            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
        )
        return position_ids.unsqueeze(0).expand(input_shape)


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    head_mask: Optional[torch.Tensor] = None,
    **kwargs,
):
    attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling
    if attention_mask is not None:
        causal_mask = attention_mask[:, :, :, : key.shape[-2]]
        attn_weights = attn_weights + causal_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    if head_mask is not None:
        attn_weights = attn_weights * head_mask.view(1, -1, 1, 1)

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class ClapTextSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention heads "
                f"({config.num_attention_heads})"
            )

        self.config = config
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.attention_dropout = config.attention_probs_dropout_prob
        self.scaling = self.attention_head_size**-0.5

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
        **kwargs,
    ) -> tuple[torch.Tensor]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.attention_head_size)

        query_states = self.query(hidden_states).view(hidden_shape).transpose(1, 2)
        key_states = self.key(hidden_states).view(hidden_shape).transpose(1, 2)
        value_states = self.value(hidden_states).view(hidden_shape).transpose(1, 2)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            head_mask=head_mask,
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        outputs = (attn_output, attn_weights) if output_attentions else (attn_output,)
        return outputs


class ClapTextSelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class ClapTextAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.self = ClapTextSelfAttention(config)
        self.output = ClapTextSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
        **kwargs,
    ) -> tuple[torch.Tensor]:
        self_outputs = self.self(
            hidden_states,
            attention_mask=attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            **kwargs,
        )
        attention_output = self.output(self_outputs[0], hidden_states)
        outputs = (attention_output,) + self_outputs[1:]
        return outputs


class ClapTextIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


class ClapTextOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class ClapTextLayer(GradientCheckpointingLayer):
    def __init__(self, config):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = ClapTextAttention(config)
        self.intermediate = ClapTextIntermediate(config)
        self.output = ClapTextOutput(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
        **kwargs,
    ) -> tuple[torch.Tensor]:
        self_attention_outputs = self.attention(
            hidden_states,
            attention_mask=attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            **kwargs,
        )
        attention_output = self_attention_outputs[0]
        outputs = self_attention_outputs[1:]

        layer_output = apply_chunking_to_forward(
            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
        )
        outputs = (layer_output,) + outputs
        return outputs

    def feed_forward_chunk(self, attention_output):
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(attention_output, intermediate_output)
        return layer_output


class ClapTextEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([ClapTextLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    @can_return_tuple
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
        **kwargs,
    ) -> Union[tuple[torch.Tensor], BaseModelOutput]:
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None
            layer_outputs = layer_module(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                head_mask=layer_head_mask,
                output_attentions=output_attentions,
                **kwargs,
            )
            hidden_states = layer_outputs[0]
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


class ClapTextPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # "pool" the model by simply taking the hidden state corresponding to the first token
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


@auto_docstring
class ClapPreTrainedModel(PreTrainedModel):
    config: ClapConfig
    base_model_prefix = "clap"
    supports_gradient_checkpointing = False

    def _init_weights(self, module: nn.Module):
        """Initialize the weights"""
        factor = self.config.initializer_factor

        if isinstance(module, ClapTextEmbeddings):
            module.position_embeddings.weight.data.normal_(mean=0.0, std=factor * 0.02)
            module.token_type_embeddings.weight.data.normal_(mean=0.0, std=factor * 0.02)
        elif isinstance(module, ClapModel):
            module.logit_scale_a.data.fill_(math.log(self.config.logit_scale_init_value))
            module.logit_scale_t.data.fill_(math.log(self.config.logit_scale_init_value))
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=factor * 0.02)
        elif isinstance(module, (nn.LayerNorm, nn.BatchNorm2d)):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, (nn.Conv2d, nn.Linear)):
            in_proj_std = (self.config.hidden_size**-0.5) * ((2 * self.config.num_hidden_layers) ** -0.5) * factor
            nn.init.normal_(module.weight, std=in_proj_std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, ClapAudioSelfAttention):
            module.relative_position_bias_table.data.zero_()


class ClapAudioModel(ClapPreTrainedModel):
    config: ClapAudioConfig
    main_input_name = "input_features"

    def __init__(self, config: ClapAudioConfig):
        super().__init__(config)
        self.audio_encoder = ClapAudioEncoder(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.audio_encoder.patch_embed.proj

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_features: Optional[torch.FloatTensor] = None,
        is_longer: Optional[torch.BoolTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutputWithPooling]:
        r"""
        is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
            Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
            the features.

        Examples:

        ```python
        >>> from datasets import load_dataset
        >>> from transformers import AutoProcessor, ClapAudioModel

        >>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
        >>> audio_sample = dataset["train"]["audio"][0]["array"]

        >>> model = ClapAudioModel.from_pretrained("laion/clap-htsat-fused")
        >>> processor = AutoProcessor.from_pretrained("laion/clap-htsat-fused")

        >>> inputs = processor(audios=audio_sample, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        return self.audio_encoder(
            input_features=input_features,
            is_longer=is_longer,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )


@auto_docstring(
    custom_intro="""
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin.

    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.

    .. _*Attention is all you need*: https://huggingface.co/papers/1706.03762
    """
)
class ClapTextModel(ClapPreTrainedModel):
    config: ClapTextConfig

    def __init__(self, config, add_pooling_layer=True):
        r"""
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        """
        super().__init__(config)
        self.config = config

        self.embeddings = ClapTextEmbeddings(config)
        self.encoder = ClapTextEncoder(config)

        self.pooler = ClapTextPooler(config) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        batch_size, seq_length = input_shape
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_length), device=device)

        if token_type_ids is None:
            if hasattr(self.embeddings, "token_type_ids"):
                buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        # make the self-attention mask broadcastable to all heads
        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)

        # prepare the head mask if needed
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
        )
        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )
        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


@auto_docstring
class ClapModel(ClapPreTrainedModel):
    config: ClapConfig

    def __init__(self, config: ClapConfig):
        super().__init__(config)

        if not isinstance(config.text_config, ClapTextConfig):
            raise TypeError(
                "config.text_config is expected to be of type ClapTextConfig but is of type"
                f" {type(config.text_config)}."
            )

        if not isinstance(config.audio_config, ClapAudioConfig):
            raise TypeError(
                "config.audio_config is expected to be of type ClapAudioConfig but is of type"
                f" {type(config.audio_config)}."
            )

        text_config = config.text_config
        audio_config = config.audio_config

        self.logit_scale_a = nn.Parameter(torch.tensor(math.log(config.logit_scale_init_value)))
        self.logit_scale_t = nn.Parameter(torch.tensor(math.log(config.logit_scale_init_value)))

        self.projection_dim = config.projection_dim

        self.text_model = ClapTextModel(text_config)
        self.text_projection = ClapProjectionLayer(text_config)

        self.audio_model = ClapAudioModel(audio_config)
        self.audio_projection = ClapProjectionLayer(audio_config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_text_features(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`ClapTextModel`].

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, ClapModel

        >>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
        >>> tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")

        >>> inputs = tokenizer(["the sound of a cat", "the sound of a dog"], padding=True, return_tensors="pt")
        >>> text_features = model.get_text_features(**inputs)
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = text_outputs[1] if not return_dict else text_outputs.pooler_output

        text_features = self.text_projection(pooled_output)
        text_features = F.normalize(text_features, dim=-1)

        return text_features

    def get_audio_features(
        self,
        input_features: Optional[torch.Tensor] = None,
        is_longer: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
            Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
            the features.

        Returns:
            audio_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The audio embeddings obtained by
            applying the projection layer to the pooled output of [`ClapAudioModel`].

        Examples:

        ```python
        >>> from transformers import AutoFeatureExtractor, ClapModel
        >>> import torch

        >>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("laion/clap-htsat-unfused")
        >>> random_audio = torch.rand((16_000))
        >>> inputs = feature_extractor(random_audio, return_tensors="pt")
        >>> audio_features = model.get_audio_features(**inputs)
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        audio_outputs = self.audio_model(
            input_features=input_features,
            is_longer=is_longer,
            return_dict=return_dict,
        )

        pooled_output = audio_outputs[1] if not return_dict else audio_outputs.pooler_output

        audio_features = self.audio_projection(pooled_output)
        audio_features = F.normalize(audio_features, dim=-1)

        return audio_features

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        input_features: Optional[torch.FloatTensor] = None,
        is_longer: Optional[torch.BoolTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        return_loss: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, ClapOutput]:
        r"""
        is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
            Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
            the features.
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> from datasets import load_dataset
        >>> from transformers import AutoProcessor, ClapModel

        >>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
        >>> audio_sample = dataset["train"]["audio"][0]["array"]

        >>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
        >>> processor = AutoProcessor.from_pretrained("laion/clap-htsat-unfused")

        >>> input_text = ["Sound of a dog", "Sound of vaccum cleaner"]

        >>> inputs = processor(text=input_text, audios=audio_sample, return_tensors="pt", padding=True)

        >>> outputs = model(**inputs)
        >>> logits_per_audio = outputs.logits_per_audio  # this is the audio-text similarity score
        >>> probs = logits_per_audio.softmax(dim=-1)  # we can take the softmax to get the label probabilities
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        audio_outputs = self.audio_model(
            input_features=input_features,
            is_longer=is_longer,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )

        audio_embeds = audio_outputs[1] if not return_dict else audio_outputs.pooler_output
        audio_embeds = self.audio_projection(audio_embeds)

        text_embeds = text_outputs[1] if not return_dict else text_outputs.pooler_output
        text_embeds = self.text_projection(text_embeds)

        # normalized features
        audio_embeds = audio_embeds / audio_embeds.norm(p=2, dim=-1, keepdim=True)
        text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)

        # cosine similarity as logits
        logit_scale_text = self.logit_scale_t.exp()
        logit_scale_audio = self.logit_scale_a.exp()
        logits_per_text = torch.matmul(text_embeds, audio_embeds.t()) * logit_scale_text
        logits_per_audio = torch.matmul(audio_embeds, text_embeds.t()) * logit_scale_audio

        loss = None
        if return_loss:
            caption_loss = contrastive_loss(logits_per_text)
            audio_loss = contrastive_loss(logits_per_audio.t())
            loss = (caption_loss + audio_loss) / 2.0

        return ClapOutput(
            loss=loss,
            logits_per_audio=logits_per_audio,
            logits_per_text=logits_per_text,
            text_embeds=text_embeds,
            audio_embeds=audio_embeds,
            text_model_output=text_outputs,
            audio_model_output=audio_outputs,
        )


@auto_docstring
class ClapTextModelWithProjection(ClapPreTrainedModel):
    config: ClapTextConfig

    def __init__(self, config: ClapTextConfig):
        super().__init__(config)
        self.text_model = ClapTextModel(config)
        self.text_projection = ClapProjectionLayer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.text_model.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.text_model.embeddings.word_embeddings = value

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, ClapTextModelOutput]:
        r"""
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, ClapTextModelWithProjection

        >>> model = ClapTextModelWithProjection.from_pretrained("laion/clap-htsat-unfused")
        >>> tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")

        >>> inputs = tokenizer(["a sound of a cat", "a sound of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> text_embeds = outputs.text_embeds
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )

        pooled_output = text_outputs[1] if not return_dict else text_outputs.pooler_output

        text_embeds = self.text_projection(pooled_output)

        return ClapTextModelOutput(
            text_embeds=text_embeds,
            last_hidden_state=text_outputs.last_hidden_state,
            hidden_states=text_outputs.hidden_states,
            attentions=text_outputs.attentions,
        )


@auto_docstring
class ClapAudioModelWithProjection(ClapPreTrainedModel):
    config: ClapAudioConfig
    main_input_name = "input_features"

    def __init__(self, config: ClapAudioConfig):
        super().__init__(config)
        self.audio_model = ClapAudioModel(config)
        self.audio_projection = ClapProjectionLayer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.audio_model.audio_encoder.patch_embed.proj

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_features: Optional[torch.FloatTensor] = None,
        is_longer: Optional[torch.BoolTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, ClapAudioModelOutput]:
        r"""
        is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
            Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
            the features.

        Examples:

        ```python
        >>> from datasets import load_dataset
        >>> from transformers import ClapAudioModelWithProjection, ClapProcessor

        >>> model = ClapAudioModelWithProjection.from_pretrained("laion/clap-htsat-fused")
        >>> processor = ClapProcessor.from_pretrained("laion/clap-htsat-fused")

        >>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
        >>> audio_sample = dataset["train"]["audio"][0]["array"]

        >>> inputs = processor(audios=audio_sample, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> audio_embeds = outputs.audio_embeds
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        audio_outputs = self.audio_model(
            input_features=input_features,
            is_longer=is_longer,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )

        pooled_output = audio_outputs[1] if not return_dict else audio_outputs.pooler_output

        audio_embeds = self.audio_projection(pooled_output)

        return ClapAudioModelOutput(
            audio_embeds=audio_embeds,
            last_hidden_state=audio_outputs.last_hidden_state,
            hidden_states=audio_outputs.hidden_states,
            attentions=audio_outputs.attentions,
        )


__all__ = [
    "ClapModel",
    "ClapPreTrainedModel",
    "ClapTextModel",
    "ClapTextModelWithProjection",
    "ClapAudioModel",
    "ClapAudioModelWithProjection",
]