a
    h5U                     @   s   d dl mZmZmZmZ d dlZd dlmZ ddl	m
Z
mZmZmZmZ ddlmZ ddlmZ ddlmZmZmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ e e!Z"G dd deZ#dS )    )AnyCallableOptionalUnionN   )(DiaClassifierFreeGuidanceLogitsProcessor"DiaEOSChannelFilterLogitsProcessor!DiaEOSDelayPatternLogitsProcessorLogitsProcessorListTemperatureLogitsWarper)StoppingCriteriaList)BaseStreamer)GenerateOutputGenerationConfigGenerationMixinGenerationMode)is_deepspeed_zero3_enabled)is_fsdp_managed_module)PreTrainedModel)loggingc                       sN  e Zd ZdZdeee ejee	eej
gee f  ee ee eeeef  eej
 eej
 ed
 fddZdee ee eeeef d fddZdeej
 eej
 eeeej
f  eej
ee eeej
f f d fd	d
Zdeeeeej
f ej
eej eejeeej
f f dddZd fdd	Zeej
eeej
 ej
dddZd eej
 ee ee ee ee	eej
gee f  ee ed ed eej
 eej
 ee ee dddZe d!eej
 ee ee ee ee	eej
gee f  ee ed ed eej
 eej
 ee ee eeejf dddZ  Z S )"DiaGenerationMixinN)
generation_configinput_ids_seq_lengthencoder_input_idsprefix_allowed_tokens_fnlogits_processordevicemodel_kwargsnegative_prompt_idsnegative_prompt_attention_maskreturnc
                    s   |j }
|j}d |_ d |_t }|d ur<|dkr<|t| |tt| jj| jj	d t
 j|||d |||||	d	}|
d ur|
dkrt|
|jd}|d| |t| jj| jj	|j|d |
|_ ||_|S )Ng      ?)num_channelseos_token_id	r   r   r   r   r   r   r   r   r      )guidance_scaleZguidance_top_kr   )delay_patternr"   Zmax_generation_lenr   )r%   Ztemperaturer
   appendr   r   lenconfigr&   r"   super_get_logits_processorr   Ztop_kinsertr	   
max_length)selfr   r   r   r   r   r   r   r   r   Zoriginal_guidance_scaleZoriginal_temperatureZcustom_processorsZmerged_processorsZcfg_processor	__class__ b/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/models/dia/generation_dia.pyr+   ,   sR    

z(DiaGenerationMixin._get_logits_processor)r   use_model_defaultskwargsr    c                    sN   t  j||fi |\}}| jt| jj7  _|jd uoB|jdk| _||fS Nr$   )r*   _prepare_generation_configr-   maxr)   r&   r%   	_uses_cfg)r.   r   r3   r4   r   r/   r1   r2   r6   o   s    
z-DiaGenerationMixin._prepare_generation_config)inputsbos_token_idr   r    c                    sh   t  j|||d\}}}| jr^t|}tj||gdd}|dd d ur^|d dd|d< |||fS )N)r9   r:   r   r   dimattention_mask   r$   )r*   _prepare_model_inputsr8   torchZ
zeros_likecatgetrepeat)r.   r9   r:   r   Z
input_nameZunconditioned_inputsr/   r1   r2   r?      s    
z(DiaGenerationMixin._prepare_model_inputs)
batch_sizemodel_input_namer   decoder_start_token_idr   r    c                 C   sR  d }}|dur"d|v r"| d}|dur<d|v r<| d}|du sL|du rtd|du d|du d | jjj}| jr|d n|}	|du rtj|	d|f|tj	|d	}tj
|	|jd ftj	|d
}|	 }
|jd |dddddf | jjkjdd  }|
ddd|f dd	 }|ddd|f 	 }||d< |
|d< ||fS )zGPrepares `decoder_input_ids` for generation with encoder-decoder modelsNdecoder_input_idsdecoder_attention_maskz[In order to generate with Dia, we need the processed audio input: Got `decoder_input_ids`: z" and got `decoder_attention_mask`=z]. This can be achieved via the [`DiaProcessor`] but now defaulting to non-delayed generation.r>   r$   )dtyper   )sizerI   r   r   r;   decoder_delay_mask)poploggerZwarning_oncer)   decoder_configr!   r8   r@   fulllongZonesshapepad_token_idsumr7   	transpose)r.   rD   rE   r   rF   r   rG   rH   r!   Zreal_batch_size
delay_maskZvalid_input_sizer1   r1   r2   )_prepare_decoder_input_ids_for_generation   s<    



2 z<DiaGenerationMixin._prepare_decoder_input_ids_for_generationc           	         s"  | j r|d jd d n|d jd }||| jjjddd}t j|fd|i|}| 	|| jj
||d< |ddr|d	 d dkr|d d d dd d f d d d d d f |d< |d  |d< | j rd
D ]D}||d d urtdgdg|| jd   }|| j| ||< q|S )Nr   r>   rK   r$   encoder_outputsrG   	use_cacheFZcache_position)rG   rH   Zdecoder_position_ids)r8   rR   reshaper)   rO   r!   rU   r*   prepare_inputs_for_generationapply_delay_maskrS   rB   
contiguoustuplendimrC   )	r.   	input_idsrX   rL   r4   rD   Zmodel_inputskeyZrepeat_patternr/   r1   r2   r[      s    &
0z0DiaGenerationMixin.prepare_inputs_for_generation)r`   pad_idrV   r    c                 C   s   |d u r| S t | jd |jd }|d d d |d d f }| d d d |d d f }t||k||| d d d |d d f< | S r5   )minrR   r@   where)r`   rb   rV   Zmask_lenZ
valid_maskZvalid_inputr1   r1   r2   r\      s    (z#DiaGenerationMixin.apply_delay_maskr   r   r9   r   r   stopping_criteriar   synced_gpusassistant_modelstreamerr   r   r3   custom_generatec                 K   s  | dd }| dd }| j||fi |\}}| |  | ||| |d u rnt sbt| olt dk}|d urz|nt	 }|d ur|nt
 }|dd d u}| ||j|\}}}|jd }|j}| j|||d d|vr| ||||}| j||||j|jd\}}|jr | ||}|d ur8||  |jd	 }|d
d u oZ|jd u}|dd u ot|jd u}| j||||||d}|  rd|vrd|d< | ||| |jd }|jd |kr|dkr| jjs||jd 7 }| ||||| | |}|d ur,|j!dkr,t"d| j#||||||j||	|
d	}| j$f |||d|}|j%|d< |&d	|jd	 }|t'j(t'j)fv r|j*dkrt"d| j+|f|||||d|S t"dd S )N	tokenizerassistant_tokenizerr$   r=   r   )r   rX   )rD   rE   r   rF   r   rK   r-   
min_length)r   has_default_max_lengthhas_default_min_lengthrE   inputs_tensorinput_ids_lengthZlogits_to_keepZinputs_embedszZ`streamer` cannot be used with beam search (yet!). Make sure that `num_beams` is set to 1.r#   )r   rf   rk   rY   z2`num_return_sequences>1` is incompatible with Dia.)r   rf   r   rg   ri   zGot incompatible mode for generation, should be one of greedy or sampling. Ensure that beam search is de-activated by setting `num_beams=1` and `num_beam_groups=1`.),rM   r6   Z_validate_model_kwargscopyZ_validate_assistantr   r   distZget_world_sizer
   r   rB   r?   r:   rR   r   Z_prepare_special_tokensZ._prepare_encoder_decoder_kwargs_for_generationrW   Z_decoder_start_token_tensorZtoken_healingZheal_tokensputcpur-   rm   Z_prepare_generated_lengthZ_supports_logits_to_keepZ_validate_generated_lengthr)   Zis_encoder_decoderZ_prepare_cache_for_generationZget_generation_modeZ	num_beams
ValueErrorr+   Z_get_stopping_criteriarY   rZ   r   ZSAMPLEZGREEDY_SEARCHZnum_return_sequencesZ_sample)r.   r9   r   r   rf   r   rg   rh   ri   r   r   r3   rj   r4   rk   rl   r   Zkwargs_has_attention_maskrp   rE   rD   r   r`   rq   rn   ro   Zmax_cache_lengthZgeneration_modeZprepared_logits_processorZprepared_stopping_criteriar1   r1   r2   _main_generate_loop   s    










z&DiaGenerationMixin._main_generate_loop)r9   r   r   rf   r   rg   rh   ri   r   r   r3   rj   r    c                 K   s   | d}|d ur| }| jf |||||||||	|
||d|}t|tj }|r^|j}n|}| jjj	}|j
d | }|||ddd}| || jj|}|r||_n|}|S )NrG   re   r   rK   r$   r>   )rB   clonerw   
isinstancer@   Tensor	sequencesr)   rO   r!   rR   rZ   rU   r\   rS   )r.   r9   r   r   rf   r   rg   rh   ri   r   r   r3   rj   r4   rV   outputZreturn_dict_in_generateZoutput_sequencesr!   Zbszr1   r1   r2   generate  s>    

zDiaGenerationMixin.generate)NNNNNNNN)N)NNN)N)NN)NNNNNNNNNNNN)NNNNNNNNNNNN)!__name__
__module____qualname__r8   r   r   intr@   Z
LongTensorr   rz   listr
   strdictr   r+   boolr^   r6   r?   r   rW   r[   staticmethodr\   r   rw   Zno_gradr   r   r}   __classcell__r1   r1   r/   r2   r   (   s           D 
    5  %                          r   )$typingr   r   r   r   r@   Ztorch.distributeddistributedrs   Zgeneration.logits_processr   r   r	   r
   r   Zgeneration.stopping_criteriar   Zgeneration.streamersr   Zgeneration.utilsr   r   r   r   Zintegrations.deepspeedr   Zintegrations.fsdpr   Zmodeling_utilsr   utilsr   Z
get_loggerr~   rN   r   r1   r1   r1   r2   <module>   s   
