a
    hG                     @   s>   d Z ddlZddlmZ ddlmZ dd ZG dd	 d	ZdS )
zA
Loss functions for linear models with raw_prediction = X @ coef
    N)sparse   )squared_normc                 C   sZ   | j d }t| r8| jtj|df||fd |   S |dddf |  }| j| S dS )z/Compute the sandwich product X.T @ diag(W) @ X.r   shapeN)r   r   issparseT
dia_matrixZtoarray)XW	n_samplesZWX r   ]/var/www/html/assistant/venv/lib/python3.9/site-packages/sklearn/linear_model/_linear_loss.pysandwich_dot   s    

r   c                   @   sl   e Zd ZdZdd ZdddZdd Zd	d
 Zdd ZdddZ	dddZ
dddZdddZdddZdS )LinearModelLossa
	  General class for loss functions with raw_prediction = X @ coef + intercept.

    Note that raw_prediction is also known as linear predictor.

    The loss is the average of per sample losses and includes a term for L2
    regularization::

        loss = 1 / s_sum * sum_i s_i loss(y_i, X_i @ coef + intercept)
               + 1/2 * l2_reg_strength * ||coef||_2^2

    with sample weights s_i=1 if sample_weight=None and s_sum=sum_i s_i.

    Gradient and hessian, for simplicity without intercept, are::

        gradient = 1 / s_sum * X.T @ loss.gradient + l2_reg_strength * coef
        hessian = 1 / s_sum * X.T @ diag(loss.hessian) @ X
                  + l2_reg_strength * identity

    Conventions:
        if fit_intercept:
            n_dof =  n_features + 1
        else:
            n_dof = n_features

        if base_loss.is_multiclass:
            coef.shape = (n_classes, n_dof) or ravelled (n_classes * n_dof,)
        else:
            coef.shape = (n_dof,)

        The intercept term is at the end of the coef array:
        if base_loss.is_multiclass:
            if coef.shape (n_classes, n_dof):
                intercept = coef[:, -1]
            if coef.shape (n_classes * n_dof,)
                intercept = coef[n_features::n_dof] = coef[(n_dof-1)::n_dof]
            intercept.shape = (n_classes,)
        else:
            intercept = coef[-1]

        Shape of gradient follows shape of coef.
        gradient.shape = coef.shape

        But hessian (to make our lives simpler) are always 2-d:
        if base_loss.is_multiclass:
            hessian.shape = (n_classes * n_dof, n_classes * n_dof)
        else:
            hessian.shape = (n_dof, n_dof)

    Note: If coef has shape (n_classes * n_dof,), the 2d-array can be reconstructed as

        coef.reshape((n_classes, -1), order="F")

    The option order="F" makes coef[:, i] contiguous. This, in turn, makes the
    coefficients without intercept, coef[:, :-1], contiguous and speeds up
    matrix-vector computations.

    Note: If the average loss per sample is wanted instead of the sum of the loss per
    sample, one can simply use a rescaled sample_weight such that
    sum(sample_weight) = 1.

    Parameters
    ----------
    base_loss : instance of class BaseLoss from sklearn._loss.
    fit_intercept : bool
    c                 C   s   || _ || _d S )N)	base_lossfit_intercept)selfr   r   r   r   r   __init__h   s    zLinearModelLoss.__init__Nc                 C   sZ   |j d }| jj}| jr"|d }n|}| jjrFtj|||f|dd}ntj|||d}|S )a  Allocate coef of correct shape with zeros.

        Parameters:
        -----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.
        dtype : data-type, default=None
            Overrides the data type of coef. With dtype=None, coef will have the same
            dtype as X.

        Returns
        -------
        coef : ndarray of shape (n_dof,) or (n_classes, n_dof)
            Coefficients of a linear model.
           F)r   dtypeorder)r   r   )r   r   	n_classesr   is_multiclassnpZ
zeros_like)r   r
   r   
n_featuresr   n_dofcoefr   r   r   init_zero_coefl   s    

zLinearModelLoss.init_zero_coefc                 C   s   | j js.| jr$|d }|dd }qd}|}nV|jdkrP|j| j jdfdd}n|}| jr|dddf }|ddddf }nd}||fS )a  Helper function to get coefficients and intercept.

        Parameters
        ----------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Coefficients of a linear model.
            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
            i.e. one reconstructs the 2d-array via
            coef.reshape((n_classes, -1), order="F").

        Returns
        -------
        weights : ndarray of shape (n_features,) or (n_classes, n_features)
            Coefficients without intercept term.
        intercept : float or ndarray of shape (n_classes,)
            Intercept terms.
        N        r   r   r   )r   r   r   ndimreshaper   )r   r   	interceptweightsr   r   r   weight_intercept   s    
z LinearModelLoss.weight_interceptc                 C   s<   |  |\}}| jjs$|| | }n||j | }|||fS )ai  Helper function to get coefficients, intercept and raw_prediction.

        Parameters
        ----------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Coefficients of a linear model.
            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
            i.e. one reconstructs the 2d-array via
            coef.reshape((n_classes, -1), order="F").
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.

        Returns
        -------
        weights : ndarray of shape (n_features,) or (n_classes, n_features)
            Coefficients without intercept term.
        intercept : float or ndarray of shape (n_classes,)
            Intercept terms.
        raw_prediction : ndarray of shape (n_samples,) or             (n_samples, n_classes)
        )r'   r   r   r   )r   r   r
   r&   r%   raw_predictionr   r   r   weight_intercept_raw   s
    z$LinearModelLoss.weight_intercept_rawc                 C   s&   |j dkr|| nt|}d| | S )z5Compute L2 penalty term l2_reg_strength/2 *||w||_2^2.r   g      ?)r#   r   )r   r&   l2_reg_strengthZnorm2_wr   r   r   
l2_penalty   s    zLinearModelLoss.l2_penaltyr!   r   c                 C   s\   |du r|  ||\}}	}n| |\}}	| jj||d|d}
tj|
|d}
|
| || S )a  Compute the loss as weighted average over point-wise losses.

        Parameters
        ----------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Coefficients of a linear model.
            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
            i.e. one reconstructs the 2d-array via
            coef.reshape((n_classes, -1), order="F").
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.
        y : contiguous array of shape (n_samples,)
            Observed, true target values.
        sample_weight : None or contiguous array of shape (n_samples,), default=None
            Sample weights.
        l2_reg_strength : float, default=0.0
            L2 regularization strength
        n_threads : int, default=1
            Number of OpenMP threads to use.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of             shape (n_samples, n_classes)
            Raw prediction values (in link space). If provided, these are used. If
            None, then raw_prediction = X @ coef + intercept is calculated.

        Returns
        -------
        loss : float
            Weighted average of losses per sample, plus penalty.
        NZy_truer(   sample_weight	n_threadsr&   )r)   r'   r   lossr   averager+   )r   r   r
   yr-   r*   r.   r(   r&   r%   r0   r   r   r   r0      s    'zLinearModelLoss.lossc                 C   s\  |j | jj \}}	}
|	t| j }|du r>| ||\}}}n| |\}}| jj||||d\}}|du rp|nt	|}|	 | }|| 
||7 }|| }| jjstj||jd}|j| ||  |d|	< | jr|	 |d< nptj|
|f|jdd}|j| ||  |ddd|	f< | jr<|j	dd|dddf< |jd	krT|jdd
}||fS )a\  Computes the sum of loss and gradient w.r.t. coef.

        Parameters
        ----------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Coefficients of a linear model.
            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
            i.e. one reconstructs the 2d-array via
            coef.reshape((n_classes, -1), order="F").
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.
        y : contiguous array of shape (n_samples,)
            Observed, true target values.
        sample_weight : None or contiguous array of shape (n_samples,), default=None
            Sample weights.
        l2_reg_strength : float, default=0.0
            L2 regularization strength
        n_threads : int, default=1
            Number of OpenMP threads to use.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of             shape (n_samples, n_classes)
            Raw prediction values (in link space). If provided, these are used. If
            None, then raw_prediction = X @ coef + intercept is calculated.

        Returns
        -------
        loss : float
            Weighted average of losses per sample, plus penalty.

        gradient : ndarray of shape coef.shape
             The gradient of the loss.
        Nr,   r   r    r   r   r   r   Zaxisr   r"   )r   r   r   intr   r)   r'   loss_gradientr   sumr+   r   
empty_liker   r   emptyr#   ravel)r   r   r
   r2   r-   r*   r.   r(   r   r   r   r   r&   r%   r0   grad_pointwisesw_sumgradr   r   r   r7   
  s6    *
"zLinearModelLoss.loss_gradientc                 C   s>  |j | jj \}}	}
|	t| j }|du r>| ||\}}}n| |\}}| jj||||d}|du rl|nt	|}|| }| jj
stj||jd}|j| ||  |d|	< | jr|	 |d< |S tj|
|f|jdd}|j| ||  |ddd|	f< | jr|j	dd|dddf< |jd	kr6|jdd
S |S dS )a  Computes the gradient w.r.t. coef.

        Parameters
        ----------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Coefficients of a linear model.
            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
            i.e. one reconstructs the 2d-array via
            coef.reshape((n_classes, -1), order="F").
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.
        y : contiguous array of shape (n_samples,)
            Observed, true target values.
        sample_weight : None or contiguous array of shape (n_samples,), default=None
            Sample weights.
        l2_reg_strength : float, default=0.0
            L2 regularization strength
        n_threads : int, default=1
            Number of OpenMP threads to use.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of             shape (n_samples, n_classes)
            Raw prediction values (in link space). If provided, these are used. If
            None, then raw_prediction = X @ coef + intercept is calculated.

        Returns
        -------
        gradient : ndarray of shape coef.shape
             The gradient of the loss.
        Nr,   r3   r    r   r4   r   r5   r   r"   )r   r   r   r6   r   r)   r'   gradientr   r8   r   r9   r   r   r:   r#   r;   )r   r   r
   r2   r-   r*   r.   r(   r   r   r   r   r&   r%   r<   r=   r>   r   r   r   r?   X  s4    '"zLinearModelLoss.gradientc
                 C   sB  |j | jj \}
}}|t| j }|	du r>| ||\}}}	n| |\}}|du rX|
nt|}|du r~tj	||j
dd}nF|j |j krtd|j  d|j  dn| jjr|jjstdn|}|j}|du rtj||f|j
d}nZ|j ||fkrtd	||f d
|j dn,| jjr>|jjs>|jjs>tdn|}| jjsr| jj||	||d\}}|| }|| }tj|dk|ddk}t|}|j| ||  |d|< | jr| |d< |r|||fS t|||d|d|f< |dkr0|jjrdnd}|jd|dd|| |d   |7  < | jr8|j| }||dddf< ||dddf< | |d< n| jj||	||d\}}|| }|j||fdd}|j| ||  |ddd|f< | jr|jdd|dddf< |jdkr|jdd}|dur|| }nd| }t|D ]}|dd|f d|dd|f   | }t|||||| |||| |f< | jr|j| }||||| ||| | f< |||| | ||| |f< | ||| | || | f< t|d |D ]}|dd|f  |dd|f  | }t|||||| |||| |f< | jr|j| }||||| ||| | f< |||| | ||| |f< | ||| | || | f< ||d||d|f ||d||d|f< qq"|dkr4|jjrdnd}|jd|dd|d | | || d   |7  < d}|||fS )a~  Computes gradient and hessian w.r.t. coef.

        Parameters
        ----------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Coefficients of a linear model.
            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
            i.e. one reconstructs the 2d-array via
            coef.reshape((n_classes, -1), order="F").
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.
        y : contiguous array of shape (n_samples,)
            Observed, true target values.
        sample_weight : None or contiguous array of shape (n_samples,), default=None
            Sample weights.
        l2_reg_strength : float, default=0.0
            L2 regularization strength
        n_threads : int, default=1
            Number of OpenMP threads to use.
        gradient_out : None or ndarray of shape coef.shape
            A location into which the gradient is stored. If None, a new array
            might be created.
        hessian_out : None or ndarray of shape (n_dof, n_dof) or             (n_classes * n_dof, n_classes * n_dof)
            A location into which the hessian is stored. If None, a new array
            might be created.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of             shape (n_samples, n_classes)
            Raw prediction values (in link space). If provided, these are used. If
            None, then raw_prediction = X @ coef + intercept is calculated.

        Returns
        -------
        gradient : ndarray of shape coef.shape
             The gradient of the loss.

        hessian : ndarray of shape (n_dof, n_dof) or             (n_classes, n_dof, n_dof, n_classes)
            Hessian matrix.

        hessian_warning : bool
            True if pointwise hessian has more than 25% of its elements non-positive.
        Nr   r4   z4gradient_out is required to have shape coef.shape = z; got .z"gradient_out must be F-contiguous.r3   z'hessian_out is required to have shape (z); got hessian_out.shape=zhessian_out must be contiguous.r,   r   r/   g      ?r    Cr"   r   )r    r    r5   g      ?r   F)r   r   r   r6   r   r)   r'   r   r8   r9   r   
ValueErrorr   flagsf_contiguoussizer:   c_contiguousgradient_hessianr1   absr   r   r$   gradient_probar#   r;   range)r   r   r
   r2   r-   r*   r.   Zgradient_outZhessian_outr(   r   r   r   r   r&   r%   r=   r>   nZhessr<   hess_pointwiseZhessian_warningr   ZXhprobaswkhlr   r   r   rG     s   7










"&

(


&


4
z LinearModelLoss.gradient_hessianc                    s   j jj \}tj  \}}	
du r@|nt
jjs@jj	||	
|d\}
}|
 }
| }tj
jd} j|
   |d< jr|
 |d< | t rtj|df||fd  n|ddtjf   jr&ttjddt fdd	}njj||	
|d\}
	|
 }
tjfjd
d}|
j    |dddf< jr|
jdd|dddf<  	
fdd	}jdkr|jd
d|fS ||fS )a  Computes gradient and hessp (hessian product function) w.r.t. coef.

        Parameters
        ----------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Coefficients of a linear model.
            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
            i.e. one reconstructs the 2d-array via
            coef.reshape((n_classes, -1), order="F").
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.
        y : contiguous array of shape (n_samples,)
            Observed, true target values.
        sample_weight : None or contiguous array of shape (n_samples,), default=None
            Sample weights.
        l2_reg_strength : float, default=0.0
            L2 regularization strength
        n_threads : int, default=1
            Number of OpenMP threads to use.

        Returns
        -------
        gradient : ndarray of shape coef.shape
             The gradient of the loss.

        hessp : callable
            Function that takes in a vector input of shape of gradient and
            and returns matrix-vector product with hessian.
        Nr,   r3   r    r   r   r5   c                    s   t | }t r4 j| d    |d < n$t j j| d  g|d < |d   | d   7  < jr|d   | d  7  < | d   | d   |d< |S )Nr    )r   r9   r   r   r   ZlinalgZ	multi_dotr   )sret)r
   hXhX_sumhessian_sumr*   r   r   r   r   hessp  s    

 $  z7LinearModelLoss.gradient_hessian_product.<locals>.hesspr   r4   c                    s  | j dfdd} jr>| d d df }| d d d df } nd} | j | }| | jddd d tjf 7 }|9 }d ur|d d tjf 9 }tjf
jdd}|j  	 |   |d d d f< jr|jdd	 |d d df< jdkr|j	ddS |S d S )Nr    r   r"   r   r   r5   r4   )
r$   r   r   r8   r   newaxisr:   r   r#   r;   )rR   Zs_intercepttmpZ	hess_prod)r
   r   r*   r   r   r   rM   r-   r   r=   r&   r   r   rW     s"    $&r   r"   )r   r   r   r6   r   r)   r   r8   r   rG   r9   r   r   r   r   r	   rX   ZsqueezeZasarrayZ
atleast_1drI   r:   r#   r;   )r   r   r
   r2   r-   r*   r.   r   r%   r(   r<   rL   r>   rW   r   )r
   r   rT   rU   rV   r*   r   r   r   rM   r-   r   r=   r&   r   gradient_hessian_product  sV     




" z(LinearModelLoss.gradient_hessian_product)N)Nr!   r   N)Nr!   r   N)Nr!   r   N)Nr!   r   NNN)Nr!   r   )__name__
__module____qualname____doc__r   r   r'   r)   r+   r0   r7   r?   rG   rZ   r   r   r   r   r   %   s@   B
' 
    
;    
S    
N      
   r   )	r^   numpyr   Zscipyr   Zutils.extmathr   r   r   r   r   r   r   <module>   s
   