a
    ho                    @   s  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlmZmZ d dlmZmZ d dlmZmZ d dlmZ d dl
mZ d dlmZmZ d dlmZ d d	lmZ d dlZd d
lm Z m!Z! d dl"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z( ddl)m*Z*m+Z+m,Z,m-Z-m.Z. ddl/m0Z0m1Z1 ddl)m2Z2 e0 rxd dl3Z3d dlm4Z4m5Z5m6Z6m7Z7 ddl8m9Z9m:Z: e$ rd dl;Z;e( rd dl<m=Z= e& oe# oe' oe% Z>e>rd dl?Z?d dl@mAZAmBZB d dlCmDZD d dlEmFZFmGZG d dlHmIZI d dlJmKZK d dlLmMZM d dlNmOZOmPZPmQZQmRZRmSZS d dlTmUZU d dlVmWZWmXZXmYZYmZZZm[Z[m\Z\m]Z]m^Z^m_Z_m`Z`maZambZbmcZcmdZdmeZe d dlfmgZg d dlhmiZimjZjmkZk G dd  d egd!d"ZlG d#d$ d$eUd!d"ZmG d%d& d&eKd!d"ZnejelZoejemZpejenZqh d'Zrh d(Zsh d)Zte1uevZwd*d+d,d-iZxeyexz Z{G d.d/ d/ej|Z}ed0d1d2Z~ed3d3d4d5d6ZG d7d8 d8ZG d9d: d:ZeG d;d< d<ZG d=d> d>e2Zevd?kre Ze  dS )@    N)ArgumentParser	Namespace)	GeneratorIterable)	dataclassfield)BytesIO)Thread)OptionalUnion
model_infoHF_HUB_OFFLINE)!MODEL_FOR_CAUSAL_LM_MAPPING_NAMES*MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES)is_fastapi_availableis_librosa_availableis_openai_availableis_pydantic_availableis_uvicorn_availableis_vision_available   )
AutoConfigLogitsProcessorListPreTrainedTokenizerFastProcessorMixinTextIteratorStreamer)is_torch_availablelogging   )BaseTransformersCLICommand)AutoProcessorBitsAndBytesConfigGenerationConfigPreTrainedModel)ContinuousBatchingManagerRequestStatus)Image)FastAPIHTTPException)CORSMiddleware)JSONResponseStreamingResponse)Transcription)TranscriptionCreateParamsBase)ChatCompletionMessageParam)ChatCompletionChunkChoiceChoiceDeltaChoiceDeltaToolCallChoiceDeltaToolCallFunction)CompletionCreateParamsStreaming)ResponseResponseCompletedEventResponseContentPartAddedEventResponseContentPartDoneEventResponseCreatedEventResponseErrorResponseErrorEventResponseFailedEventResponseInProgressEventResponseOutputItemAddedEventResponseOutputItemDoneEventResponseOutputMessageResponseOutputTextResponseTextDeltaEventResponseTextDoneEvent)ResponseCreateParamsStreaming)	BaseModelTypeAdapterValidationErrorc                   @   s   e Zd ZU dZeed< dS ))TransformersResponseCreateParamsStreamingz
        OpenAI's ResponseCreateParamsStreaming with an additional field for the generation config (as a json string).
        generation_configN__name__
__module____qualname____doc__str__annotations__ rS   rS   Y/var/www/html/assistant/venv/lib/python3.9/site-packages/transformers/commands/serving.pyrJ   w   s   
rJ   F)totalc                   @   s   e Zd ZU dZeed< dS )+TransformersCompletionCreateParamsStreamingz
        OpenAI's CompletionCreateParamsStreaming with an additional field for the generation config (as a json string).
        rK   NrL   rS   rS   rS   rT   rV   ~   s   
rV   c                   @   s2   e Zd ZU dZeed< eed< dZee	 ed< dS )%TransformersTranscriptionCreateParamsz
        OpenAI's TranscriptionCreateParamsBase with an additional field for the generation config (as a json string).
        filerK   FstreamN)
rM   rN   rO   rP   bytesrR   rQ   rY   r
   boolrS   rS   rS   rT   rW      s   
rW   >   previous_response_idservice_tiertop_logprobsuserZ	reasoning
backgroundtool_choiceprompttextZmax_tool_callsincludeZ
truncationstore>   Zpresence_penaltyZaudiora   stopZ	functionsr^   Zweb_search_optionsZstream_optionsZ
predictionZmax_completion_tokensmetadatanr]   logprobsr_   Zfunction_callparallel_tool_callsresponse_formatZ
modalitiesZreasoning_effortre   >   Ztimestamp_granularitiesrk   Zchunking_strategyrb   rd   languageZqwenz<tool_call>z</tool_call>)startendc                   @   s   e Zd ZdZdZdZdZdS )ModalityLLMVLMSTTTTSN)rM   rN   rO   rp   rq   rr   rs   rS   rS   rS   rT   ro      s   ro   argsc                 C   s   t | S )z~
    Factory function used to instantiate serving server from provided command line arguments.

    Returns: ServeCommand
    )ServeCommandrt   rS   rS   rT   serve_command_factory   s    rw   r$   )reqmodel_generation_configreturnc                 K   sb  |  ddur(tf i t| d }n
t|}|jf i |}| D ]\}}|durJt||| qJ|  ddurt	| d |_
|  ddurt	| d |_
|  ddurt| d |_|  ddur| d |_|  ddur| d |_|  ddur"t| d |_t| d d	kr"d
|_|  ddur@t| d |_|  ddur^t| d  |S )a  
    Creates a generation config from the parameters of the request. If a generation config is passed in the request,
    it will be used as a baseline for parameterization. Otherwise, we will use the model's default generation config.
    Other parameters in the request will be applied on top of the baseline.

    Args:
        req (`dict`):
            The request which may optionally contain generation parameters.
        model_generation_config (`GenerationConfig`):
            The model's default generation config.
        kwargs (`dict`):
            Additional parameters to set in the generation config.

    Returns:
        The prepared `GenerationConfig` object.
    rK   NZmax_output_tokensZ
max_tokensZfrequency_penaltyZ
logit_biasrf   temperatureg        Ftop_pseed)getr$   jsonloadscopydeepcopyupdateitemssetattrintmax_new_tokensfloatZrepetition_penaltyZsequence_biasZstop_stringsr{   	do_sampler|   torchmanual_seed)rx   ry   kwargsrK   Znon_standard_kwargskvrS   rS   rT   !create_generation_config_from_req   s4    


r   c                   @   s    e Zd ZdZdd Zdd ZdS )	ToolStatez7Lightweight class to keep track of the tool call state.c                 C   s   |    d S N)resetselfrS   rS   rT   __init__!  s    zToolState.__init__c                 C   s   d| _ d| _d| _d| _dS )z>Reset the tool call state (assumes we're outside a tool call).Fr    N)inside_tool_callhas_tool_name_definedarg_nesting_levelbufferr   rS   rS   rT   r   $  s    zToolState.resetN)rM   rN   rO   rP   r   r   rS   rS   rS   rT   r     s   r   c                   @   sD   e Zd ZdZddeeed  dddZdd	 Zd
d Z	dd Z
dS )
TimedModelz
    A class that holds a PreTrainedModel instance and its associated processor.
    Automatically deletes the instances after a specified timeout.
    Nr%   )r   r   )modeltimeout_seconds	processorc                 C   s>   || _ t|j| _|| _|| _t| j| j| _	| j	
  d S r   )r   rQ   Zname_or_path_name_or_pathr   r   	threadingTimer_delete_model_timerrm   )r   r   r   r   rS   rS   rT   r   2  s    zTimedModel.__init__c                 C   s*   | j   t| j| j| _ | j   dS )z2Reset the timer for the deletion of the instances.N)r   cancelr   r   r   r   rm   r   rS   rS   rT   reset_timer?  s    
zTimedModel.reset_timerc                 C   sb   t | dr^| jdur^| `| `d| _d| _t  tj rDtj  t	
| j d| j d dS )z>Delete the wrapped model and processor and clean up resources.r   Nz was removed from memory after z seconds of inactivity)hasattrr   r   gcZcollectr   cudaZis_availableZempty_cacheloggerinfor   r   r   rS   rS   rT   r   E  s    

zTimedModel._delete_modelc                 C   s   t | d p| jdu S )z)Check if the instances have been deleted.r   N)r   r   r   rS   rS   rT   
is_deletedV  s    zTimedModel.is_deleted)N)rM   rN   rO   rP   r   r
   r   r   r   r   r   rS   rS   rS   rT   r   ,  s   	 
r   c                   @   s  e Zd ZU dZedddidZeed< eddg d	d
dZe	e ed< eddg d	d
dZ
e	e ed< edddidZeed< edddidZe	e ed< edddidZeed< edddidZeed< eddddgd
dZeed< edddidZeed< edddidZeed< ed dd!idZeed"< ed#dd$idZeed%< ed&dd'idZeed(< eddd)idZe	e ed*< eddd+idZeed,< eddd-idZeed.< eddd/idZe	e ed0< d1d2 ZdS )3ServeArgumentsz
    Arguments for the serve CLI.

    See the metadata arg for each argument's description -- the metadata will be printed with
    `transformers serve --help`
    autohelpzfDevice to use for inference; will default to `auto` andplace the model on an accelerator if available.)defaultrg   deviceNzA`torch_dtype` is deprecated! Please use `dtype` argument instead.)r   Zbfloat16Zfloat16Zfloat32)r   choicestorch_dtypezOverride the default `torch.dtype` and load the model under this dtype. If `'auto'` is passed, the dtype will be automatically derived from the model's weights.dtypeFz2Whether to trust remote code when loading a model.trust_remote_codezWhich attention implementation to use; you can run --attn_implementation=flash_attention_2, in which case you must install this manually by running `pip install flash-attn --no-build-isolation`.attn_implementationzIWhether to use 8 bit precision for the base model - works only with LoRA.load_in_8bitzIWhether to use 4 bit precision for the base model - works only with LoRA.load_in_4bitZnf4zQuantization type.Zfp4bnb_4bit_quant_typez#Whether to use nested quantization.use_bnb_nested_quant	localhostz$Interface the server will listen to.hosti@  zPort the server will listen to.porti,  z@Time in seconds after which a model will be removed from memory.model_timeoutr   z8Logging level as a string. Example: 'info' or 'warning'.	log_levelz1The default seed for torch, should be an integer.default_seedztWhether to enable CORS. Some apps that make requests from external domains (e.g. Cursor) require CORS to be enabled.enable_corsz+Whether to turn on strict input validation.input_validationzName of the model to be forced on all requests. This is useful for testing Apps that don't allow changing models in the request.force_modelc                 C   s    | j dur| jdkr| j | _dS )z(Only used for BC `torch_dtype` argument.Nr   )r   r   r   rS   rS   rT   __post_init__  s    zServeArguments.__post_init__)rM   rN   rO   rP   r   r   rQ   rR   r   r
   r   r   r[   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rS   rS   rS   rT   r   [  s   

r   c                	   @   s  e Zd ZeedddZedddZedde	d	d
dZ
edddZedddZedddZdDee ee ee ee ee eed  edddZdedddZdd Zejddeeeef  d d!d"Zeeeddf d#d$d%Zed&ed'd(d)Zeed*d+d,Zeeeddf d#d-d.Zeeeddf d#d/d0Zeeeddf d#d1d2Zee d#d3d4Z!eeed5 d6d7d8Z"eed9d:d;Z#ed<d=d>Z$ee%d&e&f d?d@dAZ'ee%d&e(f d?dBdCZ)dS )Erv   )parserc                 C   s$   t f}| jd|d}|jtd dS )z
        Register this command to argparse so it's available for the transformer-cli

        Args:
            parser: Root parser to register command-specific arguments
        serve)dataclass_types)funcN)r   
add_parserset_defaultsrw   )r   r   Zserve_parserrS   rS   rT   register_subcommand  s    z ServeCommand.register_subcommandrt   c                 C   s   t std|| _| jjdk| _| jj| _| jjd urDt| jj t	
d}|t	j| jj   t	
d}|t	j| jj   i | _d | _d | _d | _d | _d S )NzaMissing dependencies for the serving CLI. Please install with `pip install transformers[serving]`Z
sdpa_pagedtransformersz+transformers.generation.continuous_batching)serve_dependencies_availableImportErrorru   r   use_continuous_batchingr   r   r   r   r   
get_loggersetLevelZ
log_levelsr   lowerloaded_models#running_continuous_batching_managerlast_messageslast_kv_cache
last_model)r   ru   Ztransformers_loggerZ	cb_loggerrS   rS   rT   r     s$    


zServeCommand.__init___TypedDictMetarH   requestschema	validatorunused_fieldsc           
   
   C   s   t d|  t| }|j}|| }|rPt d|  tdd| d| jjrz|	| W nH t
y } z0t d|   td| dW Y d}~n
d}~0 0 ||@ }	|	rt d|	  tdd|	 ddS )a  
        Validates the request against the schema, and checks for unexpected keys.

        Args:
            request (`dict`):
                The request to validate.
            schema (`_TypedDictMeta`):
                The schema of the request to validate. It is a `TypedDict` definition.
            validator (`TypeAdapter`):
                The validator to use to validate the request. Built from `schema`.
            unused_fields (`set`):
                Fields accepted by `schema`, but not used in `transformers serve`.

        Raises:
            HTTPException: If the request is invalid or contains unexpected or unused fields.
        zValidating request: z Unexpected keys in the request: i  )status_codedetailzValidation error: NzUnused fields in the request: )r   debugsetkeysZ__mutable_keys__errorr*   ru   r   Zvalidate_pythonrI   errors)
r   r   r   r   r   Z
input_keysZpossible_keysZunexpected_keyseZunused_fields_in_requestrS   rS   rT   _validate_request  s&    &
zServeCommand._validate_requestr   c                 C   s   | j |tttd d S Nr   )r   rJ   response_validatorUNUSED_RESPONSE_FIELDSr   r   rS   rS   rT   validate_response_request!  s    z&ServeCommand.validate_response_requestc                 C   s   | j |tttd d S r   )r   rV   completion_validatorUNUSED_CHAT_COMPLETION_FIELDSr   rS   rS   rT    validate_chat_completion_request)  s    z-ServeCommand.validate_chat_completion_requestc                 C   s   | j |tttd d S r   )r   rW   transcription_validatorUNUSED_TRANSCRIPTION_FIELDSr   rS   rS   rT   validate_transcription_request1  s    z+ServeCommand.validate_transcription_requestr   Nr4   )
request_idcontentr   rolefinish_reason
tool_callsrz   c              
   C   sF   t |tt |tt|||dd|dgddd}d|jdd	 d
S )a  
        Builds a chunk of a streaming OpenAI Chat Completion response.

        IMPORTANT: The serialized chunk won't contain empty fields (fields with `None`). Some downstream apps,
        like Cursor, assume that when the field exists, it has data.

        Args:
            request_id (`str`):
                The request ID.
            content (`str`, *optional*):
                Content of the response from the model.
            model (`str`, *optional*):
                The model that generated the content.
            role (`str`, *optional*):
                The role of the next content, until a new role is defined.
            finish_reason (`str`, *optional*):
                The reason the generation by the model has finished.
            tool_calls (`list[ChoiceDeltaToolCall]`, *optional*):
                Data about the tool calls, when they are triggered.

        Returns:
            `str`: The built chunk, a string containing a JSON string with the payload.
        )r   r   r   r   )deltaindexr   r   zchat.completion.chunk)idcreatedr   r   Zsystem_fingerprintobjectdata: TZexclude_none

)r1   r   timer2   r3   model_dump_json)r   r   r   r   r   r   r   chunkrS   rS   rT   build_chat_completion_chunk9  s$     
z(ServeCommand.build_chat_completion_chunkrG   )responserz   c                 C   s   d|j dd dS )a  
        Builds a event of a streaming OpenAI Response response.

        IMPORTANT: The serialized chunk won't contain empty fields (fields with `None`). Some downstream apps,
        like Cursor, assume that when the field exists, it has data.

        Args:
            response (`BaseModel`):
                The response to build an event from. One of the multiple OpenAI Response output types

        Returns:
            `str`: The built chunk, a string containing a JSON string with the payload.
        r   Tr   r   )r   )r   r  rS   rS   rT   build_response_eventm  s    z!ServeCommand.build_response_eventc                    s   t  } jr2|jtdgddgdgd td n
td |dtd fdd	}|d
td fdd}ddlm	} |d|d fdd}|
d|d fdd}tj| jj jj jjd d S )N*T)Zallow_originsZallow_credentialsZallow_methodsZallow_headerszUCORS allow origin is set to `*`. This is not recommended for production environments.zaSome apps may require CORS. Consider launching the server with `--enable-cors` if you see errors.z/v1/chat/completionsr   c                    s4    j | d  jr | }n
 | }t|ddS Nr   text/event-stream
media_type)r   r   #continuous_batching_chat_completiongenerate_chat_completionr-   r   outputr   rS   rT   chat_completion  s
    
z)ServeCommand.run.<locals>.chat_completionz/v1/responsesc                    s"    j | d  | }t|ddS r  )r   generate_responser-   r  r   rS   rT   	responses  s    
z#ServeCommand.run.<locals>.responsesr   )Requestz/v1/audio/transcriptionsc              
      s   |   4 I d H j}t|d  I d H |d d}td|d j d|d j d|d jd dd	 W d   I d H  q1 I d H s0    Y   j|d
  	|}t
|ddS )NrX   r   )rX   r   zReceived file: z; MIME type: z; size:    z.2fz KiBr   r  r  )formrW   readr   r   filenamecontent_typesizer   generate_transcriptionr-   )r   r  Zparsed_requestr  r   rS   rT   audio_transcriptions  s    .
z.ServeCommand.run.<locals>.audio_transcriptionsz
/v1/modelsc                      s   t d  dS )Nlist)r   data)r,   get_gen_modelsrS   r   rS   rT   get_all_models  s    z(ServeCommand.run.<locals>.get_all_models)r   r   r   )r)   r   Zadd_middlewarer+   r   Zwarning_oncepostdictfastapir  optionsr~   uvicornrunru   r   r   r   )r   Zappr  r  r  r  r  rS   r   rT   r#  }  s4    	zServeCommand.run)maxsize)rz   c                 C   s:   g d}t rdd |D S dd |D }dd |D S dS )a.  
        This is by no means a limit to which models may be instantiated with `transformers serve`: any chat-based
        model working with generate can work.

        This is a limited list of models to ensure we have a discoverable /v1/models endpoint for third-party
        integrations.
        )zMenlo/Jan-nanozMenlo/Jan-nano-128kzQwen/Qwen2.5-0.5B-InstructzQwen/Qwen2.5-3B-InstructzQwen/Qwen2.5-7B-InstructzQwen/Qwen2.5-14B-Instructz meta-llama/Llama-3.1-8B-Instructz meta-llama/Llama-3.2-1B-Instructz!meta-llama/Llama-3.3-70B-InstructzHuggingFaceTB/SmolVLM-Instructz!ibm-granite/granite-vision-3.2-2bzQwen/Qwen2.5-VL-7B-Instructc                 S   s.   g | ]&}|d t j   |dd dqS )r   /r   r   r   r   Zowned_by)datetimenow	timestampsplit.0r   rS   rS   rT   
<listcomp>  s   z/ServeCommand.get_gen_models.<locals>.<listcomp>c                 S   s   g | ]}t |qS rS   r   r+  rS   rS   rT   r-        c                 S   s$   g | ]}|j d |j |jdqS )r   r&  )r   
created_atr)  authorr+  rS   rS   rT   r-    s   Nr   )r   modelsZmodel_infosrS   rS   rT   r    s    	
zServeCommand.get_gen_models)rx   rz   c                    s    d jk}_|rBjdurBjjddd d_\}}t|dr`|jn|}t|j|j	|j
ddd	dd
dd
 jdu r|j dd_t j_j  |jd ddd|j} fdd}||d S )a'  
        Generates an OpenAI Chat Completion using continuous batching.

        Args:
            req (`dict`): The request to generate an OpenAI Chat Completion for.

        Returns:
            `Generator[str, None, None]`: A generator that yields the OpenAI Chat Completion chunks.
        r   NTr   )blocktimeout	tokenizerFr    r  
   Zfifo)	ry   eos_token_idpad_token_idZ	use_cacheZ
num_blocks
block_sizer   Zmax_batch_tokensZ	scheduler)rK   Z	streamingmessagespt)return_tensorsadd_generation_promptc              
   3   s
  zj j| d jd}d}j|ddV  j D ]}|j|krHq8dd urn|sn|jtjkrjq8nd}|jtjkr~dnd }|jtjkrj||dV   qq8j||j	d	V  q8W nF t
y } z,tt| d
t| dV  W Y d }~n
d }~0 0 d S )Nr   )r   r   F	assistantr   r   Trf   r   r   )r   r   r   data: {"error": ""})r   Zadd_requestr~   r   r  r   statusr'   FINISHED
next_token	Exceptionr   r   rQ   )Z_inputsr   Zqueue_is_flushedresultr   r   rK   model_id_and_revisionrx   r   rS   rT   stream_chat_completion  s2    

zPServeCommand.continuous_batching_chat_completion.<locals>.stream_chat_completionr   )process_model_namer   r   rf   load_model_and_processorr   r4  r   rK   r6  r7  Zinit_continuous_batchingr   Zlogit_processorrm   apply_chat_templatetor   )r   rx   must_discard_cacher   r   r4  inputsrI  rS   rG  rT   r
    s@    




$z0ServeCommand.continuous_batching_chat_completionr%   )r   rz   c                 C   sB   | j j}|t v rtj}n"|t v r0tj}ntd| |S )NzUnknown modality: )		__class__rM   r   valuesro   rq   r   rp   
ValueError)r   Zmodel_classnamemodalityrS   rS   rT   get_model_modalityE  s    zServeCommand.get_model_modality)rS  c                 C   s  g }| D ]t}|d g d}|t jkrt|d tr>|d }nDt|d trg }|d D ]}|d dkrX||d  qXd|}||d< n|t jkrtt|d tr|d d|d d n|d D ]}|d dkr|d | q|d dkrd	|d d
 v rRt	dd|d d
 }t
tt|}tjddd}	|	j}
||	j n|d d
 }
|d d|
d q|| q|S )Nr   r   r   r   typerc    )rV  rc   Z	image_urlbase64urlz^data:image/.+;base64,r   z.pngF)suffixdeleteimage)rV  rY  )ro   rp   
isinstancerQ   r  appendjoinrq   resubr(   openr   rX  	b64decodetempfileNamedTemporaryFilenamesave)r9  rS  processor_inputsmessageZparsed_messageZparsed_contentr   Z
image_datar\  rX   rY  rS   rS   rT   *get_processor_inputs_from_inbound_messagesQ  s<    




z7ServeCommand.get_processor_inputs_from_inbound_messagesc                    s^  j jdurj j|d< |d }|d d dkr4dS |d jk}_\}}||}dtD ] }|jj	d 
 v r~| qq~|j|d|d	d
ddd}|j}|ddd}	djj	d 
 v rd}	t||	dd}
t|jd}d}|r(|s(j}i ||
|d|d  fdd}||
S )a  
        Generates an OpenAI Chat Completion using `generate`.

        Args:
            req (`dict`): The request to generate an OpenAI Chat Completion for.

        Returns:
            `Generator[str, None, None]`: A generator that yields the OpenAI Chat Completion chunks.
        Nr   r9  r   r=  r   Ttoolsr:  )r<  rl  r;  Zreturn_dicttokenizer   req_0gptossFskip_special_tokensZskip_promptry   )streamerrK   return_dict_in_generatepast_key_valuesc              
   3   s  d}d }dj jd  v r$d}d}fdd}t| d}d	}z^z|  t }jd
dV  | D ]}dj jd  v r|dr|d td  }||7 }|r||v rnd}qnnqnd ur|	 t
 d krd|_qn|	 t
 d kr|  j|d ddV  qn|jr| j|7  _|jstd|j}	|	d u rVqnn
|	d}	d|_tt|	ddd|d d}
n~|d	krqnd|jvrqn| j|d7  _| j|d8  _|jdk rd	|dd d d }tt|dddd}
j|d |
gdV  qn|d	krnj||dV  qnj|dd V  |  W nF ty } z,tt| d!t| d"V  W Y d }~n
d }~0 0 W |  n
|  0 d S )#NFro  r   T<|channel|>final<|message|>c                     s    j f i | }|j_d S r   generateru  r   r   Zgenerate_outputr   r   rS   rT   generate_with_cache  s    zbServeCommand.generate_chat_completion.<locals>.stream_chat_completion.<locals>.generate_with_cachetargetr   r   r=  r>  
<|return|>rm   rn   r   )r   r   r   r   z\"name\": \"(.*?)\"r    )rf  functionZ
_tool_call)r  r   rV  r   z"arguments": {{})	arguments)r  r   rV  )r   r   r   r   )r   r   rf   r?  r@  rA  )configarchitecturesr   r	   rm   r   r  endswithlenstrip_TOOL_CALL_TOKENSr   r   r   r   r`  searchgroupr4   r5   r   countr_  r*  rE  r   r   rQ   )rs  _request_id
filter_cotcot_trace_endr{  threadresultsZ
tool_staterF  Z	tool_nameZtoolr   generation_kwargsr   rH  r   r   Ztool_model_familyrS   rT   rI    s    







*zEServeCommand.generate_chat_completion.<locals>.stream_chat_completion)ru   r   rJ  r   rK  rT  rj  _MODELS_WITH_TOOL_SUPPORTr  r  r   rL  r~   rM  r   r   r   rK   is_continuationr   )r   rx   r9  rN  r   rS  rh  Zsupported_model_familiesrO  rq  generation_streamerrK   r   rI  rS   r  rT   r    s`    


{z%ServeCommand.generate_chat_completionc           
         s   d jk}_\}td trldv rRdd dgng }|dd d ntd trdv rʈd d d dkrdd dgd }q҈d }d |d d	< nd }nDtd trdv rdd dgng }|d  ntd
|j	|ddd}|
j}ddd}djjd  v r^d}t||dd}tjd}d}r|sj}|t|||d|d  fdd}	|	|S )a	  
        Generates an OpenAI Response using `generate`.

        Args:
            req (`dict`): The request to generate an OpenAI Response for.

        Returns:
            `Generator[str, None, None]`: A generator that yields the OpenAI Response events.
        r   inputinstructionssystemrU  r_   r   r   r   z%inputs should be a list, dict, or strTr:  )r<  r;  r\   rn  ro  Frp  rr  N)rO  Zattention_maskrs  rK   rt  ru  c                 3   sH  d}d }dj jd  v r$d}d}fdd}t| d}d}d}d}zz|  t }	td	|td
 |	dddddiidg g dddddd}
|d7 }	|
V  t
d|td
 |	dddddiidg g dddddd}|d7 }	|V  td||td dddg dd}|d7 }	|V  tdd |||td d!g d"d#}|d7 }	|V  d!}| D ]}dj jd  v r|d$r|d td$  }||7 }|r||v rd}d!}qnqtd%d ||||d!d&d'gd(}|d7 }	|V  qtd)d ||d|d!d&d'gd*}|d7 }	|V  td+d |||td |jg d"d#}|d7 }|d7 }	|V  td,||td dd-d|jgg d.d}|d7 }|d7 }	|V  td/|td
 |	d-ddddii|jgdg ddddd0d}|d7 }	|V  |  W n ty, } ztd1t|  td2|t|d3}|d7 }	|V  td4|td
 |	d5ddddiig dg dddt d6t|d7d8d}|d7 }	|V  W Y d }~n
d }~0 0 W |  n
|  0 d S )9NFro  r   Trv  c                     s    j f i | }|j_d S r   rw  ry  rz  rS   rT   r{    s    zTServeCommand.generate_response.<locals>.stream_response.<locals>.generate_with_cacher|  zresponse.createdZresp_Zqueuedr  formatrV  rc   r  rj   r   rg   )r   r/  rB  r   r  rc   r   rl  r  rj   ra   rg   )rV  sequence_numberr  r    zresponse.in_progressZin_progresszresponse.output_item.addedZmsg_ri  r=  )r   rV  rB  r   r   )rV  r  output_indexitemzresponse.content_part.addedZoutput_textr   )rV  rc   annotations)rV  item_idr  r  content_indexpartr~  zresponse.output_text.deltagX@)tokenZlogprob)rV  r  r  r  r  r   ri   zresponse.output_text.done)rV  r  r  r  r  rc   ri   zresponse.content_part.donezresponse.output_item.done	completed)r   rV  rB  r   r   r  zresponse.completed)r   r/  rB  r   r  rc   r  r   rl  rj   ra   rg   z"Exception in response generation: r   )rV  r  ri  zresponse.failedfailedserver_error)coderi  )r   r/  rB  r   r  rc   r  r   rl  rj   ra   rg   r   )!r  r  r   r	   rm   r   r;   r7   r~   r  r?   r@   rB   r9   rC   r  r  rD   rE   r:   rc   rA   r  r8   r  r_  rE  r   r   rQ   r=   r>   r<   )rs  r  r  r  r{  r  r  r  r  r/  Zresponse_createdZresponse_in_progressZresponse_output_item_addedZresponse_content_part_addedr  rF  Zresponse_output_text_deltaZresponse_output_text_doneZresponse_content_part_doneZresponse_output_item_doneZresponse_completedr   Zerror_eventZresponse_failedr  r   rH  rx   r   r   rS   rT   stream_response  sz   





	
	


$z7ServeCommand.generate_response.<locals>.stream_response)rJ  r   rK  r]  rQ   r^  r  r  rR  rL  rM  r   r~   r  r  r   r   r   rK   r  r   r   Z	ones_like)
r   rx   rN  r   rO  rq  r  rK   r   r  rS   r  rT   r  G  sV    

	 dzServeCommand.generate_responsec           
         s   t  std| |d }| |\tjddd}t|jd}jj	}t
|d }tj||dd\}}||dd	j  d
 j d
< ||dd fdd}	|	 S )a  
        Generates an OpenAI Transcription using the audio file.

        Args:
            req (`dict`): The request containing the audio file and model information.

        Returns:
            `Generator[str, None, None]`: A generator that yields the transcription result.
        z]Missing librosa dependency for audio transcription. Please install with `pip install librosa`r   Trp  rr  rX   )srmonor:  )sampling_rater;  Zinput_features)rs  rK   rt  c                  3   sF   j f i  } j| jddd }t|d}|jdd V  d S )NT)rq  r   )rc   r   )rx  Zbatch_decode	sequencesr.   r   )Zgenerated_idsZtranscription_textZtranscriptionZaudio_inputsaudio_modelaudio_processorr  rS   rT   _generate_transcription  s    
zDServeCommand.generate_transcription.<locals>._generate_transcription)r   r   rJ  load_audio_model_and_processorr   r4  r   rK   Zfeature_extractorr  ior   librosaloadrM  r   r   )
r   rx   rH  r  rK   Zmodel_sampling_rateZaudio_bytesZaudio_array_r  rS   r  rT   r  k  s2    z#ServeCommand.generate_transcriptionc                 C   sx   | dp| d}d}| jdu r(d}nFt| jt|kr@d}n.tt| jD ]}| j| || krNd} qnqN|| _|S )aD  
        Determines whether the current request is a continuation of the last request. In other words, if it is the
        same chat session.

        Args:
            req (`dict`): The request to check.

        Returns:
            `True` if the request is a continuation of the last request, `False` otherwise.
        r9  r  TNF)r~   r   r  range)r   rx   r9  Zreq_continues_last_messagesirS   rS   rT   r    s    
zServeCommand.is_continuationr#   )ru   rz   c                 C   s<   | j r"td| j| j| j| jd}n| jr4tdd}nd}|S )a  
        Returns the quantization config for the given CLI arguments.

        Args:
            args (`ServeArguments`): The serve arguments. May contain quantization settings, device, etc.

        Returns:
            `Optional[BitsAndBytesConfig]`: The quantization config.
        T)r   Zbnb_4bit_compute_dtyper   Zbnb_4bit_use_double_quantZbnb_4bit_quant_storage)r   N)r   r#   r   r   r   r   )ru   quantization_configrS   rS   rT   get_quantization_config  s    z$ServeCommand.get_quantization_config)model_idrz   c                 C   s*   | j jdur| j j}d|v r |S | dS )aR  
        Applies the `force_model` CLI argument and canonicalizes the model name to the format "model_id@revision".
        If the model_id DOESN'T contain an @, it defaults to "model_id@main".

        Args:
            model_id (`str`): The model ID.

        Returns:
            `str`: The canonicalized model name to be used
        N@z@main)ru   r   )r   r  rS   rS   rT   rJ    s
    zServeCommand.process_model_name)rH  c                 C   sD  | j }td|  d|v r0|dd\}}n
|d }}tj|||jd}|jdv r\|jn
tt	|j}| 
|}||j|d|jd}|d	ur||d
< tj|fi |}	tt|	jd }
|
j|fi |}t|dd	d	u r||j}|jjd	u o|jjdk}|jjd	uo|jjdk }|s$|r,d|j_td|  ||fS )a  
        Generic method to load a model and a data processor from a model ID and revision, making use of the serve CLI
        arguments.

        Args:
            model_id_and_revision (`str`):
                The model ID and revision to load.
            model_cls (`type[PreTrainedModel]`):
                The model class to load.

        Returns:
            `tuple[PreTrainedModel, Union[ProcessorMixin, PreTrainedTokenizerFast]]`: The loaded model and
            data processor (tokenizer, audio processor, etc.).
        zLoading r  r    main)revisionr   )r   Nr   )r  r   r   Z
device_mapr   Nr  r   Zhf_device_map   r  zLoaded model )ru   r   r   r*  r"   Zfrom_pretrainedr   r   getattrr   r  r   r   r   r  rM  r   rK   r   
max_length)r   rH  ru   r  r  Zdata_processorr   r  Zmodel_kwargsr  architecturer   Zhas_default_max_lengthZhas_short_max_new_tokensrS   rS   rT   _load_model_and_data_processor  sB    

z+ServeCommand._load_model_and_data_processor)rH  rz   c                 C   sn   || j vs| j |  r@| |\}}t|| jj|d| j |< n&| j |   | j | j}| j | j}||fS )a\  
        Loads the text model and processor from the given model ID and revision into the ServeCommand instance.

        Args:
            model_id_and_revision (`str`):
                The model ID and revision to load.

        Returns:
            `tuple[PreTrainedModel, PreTrainedTokenizerFast]`: The loaded text model and processor.
        r   r   	r   r   r  r   ru   r   r   r   r   )r   rH  r   r   rS   rS   rT   rK  $  s    z%ServeCommand.load_model_and_processorc                 C   sn   || j vs| j |  r@| |\}}t|| jj|d| j |< n&| j |   | j | j}| j | j}||fS )aU  
        Loads the audio model and processor from the given model ID and revision into the ServeCommand instance.

        Args:
            model_id_and_revision (`str`):
                The model ID and revision to load.

        Returns:
            `tuple[PreTrainedModel, ProcessorMixin]`: The loaded audio model and processor.
        r  r  )r   rH  r  r  rS   rS   rT   r  ?  s    z+ServeCommand.load_audio_model_and_processor)r   NNNNN)*rM   rN   rO   staticmethodr   r   r   r   r  r   r   r   r   r   r
   rQ   r  r  r  r#  	functools	lru_cacheanyr  r   r
  ro   rT  rj  r  r  r  r[   r  r  rJ  r  tupler   rK  r   r  rS   rS   rS   rT   rv     sd   "1
      
4A
-Y- I  &0>
rv   __main__)rX  r   r'  enumr  r   r  r   r`  rd  r   r   argparser   r   collections.abcr   r   dataclassesr   r   r   r	   typingr
   r   Zhuggingface_hubr   Zhuggingface_hub.constantsr   r   Z&transformers.models.auto.modeling_autor   r   Ztransformers.utils.import_utilsr   r   r   r   r   r   r   r   r   r   r   r   utilsr   r   r!   r   r"   r#   r$   r%   Zgeneration.continuous_batchingr&   r'   r  ZPILr(   r   r"  r   r)   r*   Zfastapi.middleware.corsr+   Zfastapi.responsesr,   r-   Z openai.types.audio.transcriptionr.   Z.openai.types.audio.transcription_create_paramsr/   Zopenai.types.chatr0   Z'openai.types.chat.chat_completion_chunkr1   r2   r3   r4   r5   Z*openai.types.chat.completion_create_paramsr6   Zopenai.types.responsesr7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   Z-openai.types.responses.response_create_paramsrF   ZpydanticrG   rH   rI   rJ   rV   rW   r   r   r   r   r   r   r   rM   r   r  r  r   r  Enumro   rw   r  r   r   r   r   rv   r   r#  rS   rS   rS   rT   <module>   s    	D



;/g         
