a
    hql                  	   @   s  d dl Z d dlZd dlZd dlmZmZ d dlmZ d dl	m
Z
 d dlmZmZmZmZmZ d dlmZ d dlmZmZmZmZ dd	 Zejd
ejg dejdgdfejddejgejdgdfejg dedgdfdgejdddgejdddgdd Z ejdejg dejdgdfejg dedgddgfgejdeg deg d gejdddgd!d" Z!ejd#ejd gd$ d%gd$  d&g gejdj"g dgfejd'gd$ d(gd$  d)g gedj"g d*gfgejdd+dgd,d- Z#ejd.g d/d0feg d1g dgj"d2fgd3d4 Z$d5d6 Z%ejd7d%d8gd$ d9d:gfg dd; d%d8g g d<fg d=d; d>d?g g d@fgdAdB Z&ejdCdDdEgejdddgejdg dFdGdH Z'ejjdIedJgdK dJfed gdK d fejdLgdK edd fgg dMdNejdg dOdPdQ Z(dRdS Z)dTdU Z*ejdg dVdWdX Z+ejddYdgdZd[ Z,d\d] Z-dS )^    N)assert_allcloseassert_array_equal)RandomForestRegressor)Ridge)KFoldShuffleSplitStratifiedKFoldcross_val_scoretrain_test_split)make_pipeline)KBinsDiscretizerLabelBinarizerLabelEncoderTargetEncoderc                 C   s   t j|t jd}t |}|dkrt |}t|D ]h}|| |k }|jd }	|	dkr`|||< q4t |}
|
| }|	|	|  }|t | d| |  ||< q4|S t|D ]<}|| |k }t |||  }|jd | }|| ||< q|S dS )z0Simple Python implementation of target encoding.Zdtypeautor      N)npZzerosfloat64meanvarrangeshapesum)	X_ordinal	y_numericn_categoriessmoothcur_encodingsy_meanZ
y_variancecZy_subsetZn_iZy_subset_variancemZlambda_Zcurrent_sumZcurrent_cnt r"   k/var/www/html/assistant/venv/lib/python3.9/site-packages/sklearn/preprocessing/tests/test_target_encoder.py_encode_target   s*    



 r$   zcategories, unknown_valuer   r      r            ?      @      @)catdogsnakebear)r      r         @r   target_typebinary
continuousc                 C   s  d}t jdgd dgd  dgd  gt jdj}t jg d	gt jdj}|jd }| d
krf|}	|}
n| d | }	| d | }
t |
|ggf}
t j|}d}|dkr|jdd|d}t jddgt	d}|| }n |dksJ |j
dd|d}|}||}|| }|	| }	|| }|| }|dkr8t||dd}nt||dd}t j|t jd}|||D ]F\}}||df ||  }}t||||}|||df  ||df< qbt|| ||d}||	|}|j|ksJ t|| t|jdksJ |dkrt|j| n|jdu sJ t |}t|dddf |||}t|jd | |jt|ksfJ t |t |gfdd}||
}t|| dS )zCheck encoding for binary and continuous targets.

    Compare the values returned by `TargetEncoder.fit_transform` against the
    expected encodings for cv splits from a naive reference Python
    implementation in _encode_target.
    r/   r      r      r&   (   r   r%   r   r2   lowhighsizer+   r,   r3   Tn_splitsrandom_stateshuffle)r   
categoriescvr>   N) r   arrayint64Tr   concatenaterandomRandomStaterandintobjectuniformpermutationr   r   
empty_liker   splitr$   r   fit_transformtarget_type_r   len
encodings_r   classes_r   target_mean_pytestapproxreshape	transform)r@   Zunknown_valueglobal_random_seedr   r1   r   ZX_train_int_arrayZX_test_int_array	n_samplesX_trainX_testZdata_rngr=   r   Ztarget_namesy_trainZshuffled_idxrA   expected_X_fit_transform	train_idxtest_idxX_y_r   target_encoderX_fit_transformr   expected_encodingsexpected_X_test_transformX_test_transformr"   r"   r#   test_encoding7   s|    .








rh   zcategories, unknown_valuesrabbittarget_labels)r   r&   r/   )abr    c           .      C   s  t j| }d}d}t |jdd|d}t |jdd|d}	|d | }
|d |	 }t |
|f}t ||	f}ddgg dg}d}t |jd||d}|| }t |}d}t|| dd	}t j	|j
d |j
d | ft jd
}t|D ]\}}t|D ]z}|||D ]f\}}|dd|f }|||f ||  }}t||t||}|||  }||||f  |||f< qq qt||| d} | ||}!| jdksJ t|!| g }"t|D ]R\}}t|D ]>}|dd|f }t|dd|f |t||}|"| qΐqt| j|| ks*J t|| D ]}#t| j|# |"|#  q6t| j| t ddgddgddgg}$|dkr|$}%nft j|$ddddf td
}%t|$j
d D ]*}&|d |$dd|&f  |%dd|&f< qt |%|f}%t j|dd}'t j	|$j
d |$j
d | ft jd
}(|$j
d })g d}t|)d D ]6}*t|"D ]&\}#}+|+|$|*||# f  |(|*|#f< qLq@g d},t|| D ]}#|'|,|#  |(|)d |#f< q| |%}-t|-|( dS )z&Check encoding for multiclass targets.P   r&   r   r7   r/   r   r%   Tr<   r   Nr   rA   r>   
multiclassr'      r   rB   Zaxis)r   r   r   r   r   r   )r   r   r&   r   r   r&   )r   rG   rH   rC   rI   Zcolumn_stackr   rO   r   emptyr   r   	enumerater   rN   r$   rQ   r   rP   r   appendrR   r   rS   rM   rJ   Zvstackr   rX   ).rY   r@   Zunknown_valuesrj   r   rngrZ   Z
n_featuresZ
feat_1_intZ
feat_2_intZfeat_1Zfeat_2r[   ZX_train_intZcategories_Z	n_classesZy_train_intr]   Zy_train_encr=   rA   r^   f_idxcatsZc_idxr_   r`   Zy_classra   rb   current_encodingZexp_idxrc   rd   re   iZ
X_test_intr\   Z
column_idxr   rf   Zn_rowsZrow_idxencZmean_idxrg   r"   r"   r#   test_encoding_multiclass   s    


(
$
r{   zX, categories
   r   r/   r+   r,   r-   )r,   r+   cow      @c                 C   s   t jd}|jdd| jd d}t||dd| |}| }|| dd }|d t	
|kshJ t|jd	kszJ |jd d t	
|ksJ dS )
zHCustom categories with unknown categories that are not in training data.r   r;   r4   r7   )r@   r   r>   rB   N)r   r   r   )r   rG   rH   rK   r   r   fitr   rX   rU   rV   rQ   rR   )Xr@   r   ru   yrz   r   X_transr"   r"   r#   test_custom_categories  s    r   zy, msg)r   r&   r   r   z'Found input variables with inconsistent)r   r&   r   z7Target type was inferred to be 'multiclass-multioutput'c                 C   sV   t g dgj}t }tjt|d |||  W d   n1 sH0    Y  dS )zCheck invalidate input.)r   r   r   matchN)r   rC   rE   r   rU   Zraises
ValueErrorrO   )r   msgr   rz   r"   r"   r#   test_errors5  s    r   c                  C   s   t g dgj} t g d}tdd}tjttdd |	| | W d   n1 s`0    Y  |j
dksxJ tdd	d
}|	| | |j
d	ksJ dS )z@Check inferred and specified `target_type` on regression target.)r   r   r   r   r   r   )r(          @r)   r   r)   r~   r&   rA   zQThe least populated class in y has only 1 members, which is less than n_splits=2.r   Nro   r3   )rA   r1   )r   rC   rE   r   rU   ZwarnsUserWarningreescaperO   rP   )r   r   rz   r"   r"   r#   test_use_regression_targetH  s    
*r   zy, feature_namesr&   AB   )ZA_1ZA_2ZA_3ZB_1ZB_2ZB_3)y1y2Zy3r   r   )ZA_y1ZA_y2ZA_y3ZB_y1ZB_y2ZB_y3c                 C   s   t d}|ddgd ddgd d}tddd	d
}|jdd tddd	d
}|jdd ||| }||| }t| | t|	 | t|	 |j
 dS )z*Check TargetEncoder works with set_output.pandasrk   rl   r|   r   r&   )r   r   r)   r   rA   r   r>   default)rX   N)rU   importorskip	DataFramer   Z
set_outputrO   r   Zto_numpyr   Zget_feature_names_outcolumns)r   Zfeature_namespdZX_dfZenc_defaultZ
enc_pandasZ	X_defaultZX_pandasr"   r"   r#   !test_feature_names_out_set_output]  s    
 r   	to_pandasTF)binary-ints
binary-strr3   c              
   C   s  t jddgddgddgddgddgddgddgddggt jd}|dkrrt g d}t |}tdddd}nX|d	krt g d
}t |}tdddd}n&t jg dt jd}|}tdddd}t |}g dddgg}t jddgddgddggt jd}	| rrt	
d}
|
|dddf t jddgtd|dddf  d}|
|	dddf g dd}	n|}t j|t jd}t|D ]b\}}|||D ]J\}}|||f ||  }}t||t||}||||f  |||f< qqg }t|D ]2\}}t|dd|f |t||}|| qt j|d d |d d g||d d g|d d |ggt jd}t|ddd}|||}t|| t|jdksJ tdD ]}t|j| ||  q||	}t|| dS )z,Check target encoder with multiple features.r   r   r&   r   r   )rk   rl   rk   rk   rl   rl   rk   rl   T)r>   r?   r   )r/   r'   r/   r/   r/   r'   r'   r'   )r)   gffffff@g333333@g      @gffffff@g      @皙$@g333333@r%   r/   r|   r   Nr+   r,   )Zfeat0Zfeat1)r,   r+   r-   rn   )r   rC   rD   r   rO   r   Zfloat32r   r   rU   r   r   rJ   rM   r   rs   rN   r$   rQ   rt   r   r   rR   r   rX   )r   r   r1   r   r]   Z	y_integerrA   r   r@   r\   r   r[   r^   rv   rw   r_   r`   ra   rb   rx   re   rf   rz   rd   ry   rg   r"   r"   r#   test_multiple_features_quick{  sz    6
	
 "
	

r   z	y, y_meang333333@r4   rk   )r3   r2   zbinary-string)Zids)r   r~           c           	      C   s   t dgd gj}|jd }td|dd}||| }t|t j|gg|dd |jd d t	
|ksnJ |jt	
|ksJ t dgdgg}||}t|t j|ggddd dS )z5Check edge case where feature and target is constant.r   r4   r   r&   r   rq   N)r   rC   rE   r   r   rO   r   repeatrR   rU   rV   rT   rX   )	r   r   r   r   rZ   rz   r   r\   ZX_test_transr"   r"   r#    test_constant_target_and_feature  s    

r   c                 C   s   d}d}t j| }|j|d}|jd||ddd}| }|| }|| }td| d}|||}td	d
}|||}	t	dd| d}
t
d| d}t|
|||d dk sJ t|
|||d dk sJ t|
|	||d dksJ d S )Nr5   i  r:   r   rB   r   T)r?   r>   F)r?   r|   r4   )Zn_estimatorsZmin_samples_leafr>   2   )r=   r>   r   皙?      ?)r   rG   rH   normalrI   rW   Zargsortr   rO   r   r   r	   r   )rY   ZcardinalityrZ   ru   r]   r[   Zy_sorted_indicesrc   ZX_encoded_train_shuffledZX_encoded_train_no_shuffledZ	regressorrA   r"   r"   r#   Ftest_fit_transform_not_associated_with_y_if_ordinal_categorical_is_not  s2    
		r   c                  C   sv   t g dgj} t g d}tdddd}|| |}t|d t |dd	  t|d
 t |d	d  d	S )zECheck edge case with zero smoothing and cv does not contain category.)
r   r   r   r   r   r   r   r   r   r   )
g @g333333@g333333?g@r(   g      "@r   gffffff,@g*@g      .@r   Fr&   )r   r?   rA   r   rp   NrB   )r   rC   rE   r   rO   r   r   )r   r   rz   r   r"   r"   r#   test_smooth_zero  s    r   )r   g     @@r   c                 C   s   t j|}|jdd}d}t|dd|dd}t|||d\}}}}	||}
|
|	t j
 }|
|	t j
 }t| |d	}|||}||}|||}||}t|| t|| d S )
Ni  r   r5   ordinal)n_binsencoderB   r   r>   r   r>   )r   rG   rH   r   r   rO   rW   r
   rL   astypeint32r   rX   r   )r   rY   ru   r   r   r   r[   r\   r]   y_testpermutated_labelsZX_train_permutedZX_test_permutedrc   ZX_train_encodedZX_test_encodedZX_train_permuted_encodedZX_test_permuted_encodedr"   r"   r#   3test_invariance_of_encoding_under_label_permutation)  s&    




r   r   c                 C   s"  t dddd}d}tj|}||}d|| }d}t|dd	|d
|| dd}||}	|	|	tj
 }||}
|jtd| |dddd}tj||
|gdd}t||dd\}}}}|||}|||dk sJ |||dk sJ tt| |d|||}|d j}|||dks<J ||||dksVJ ||d tjdddksrJ t|dd  dk  sJ t| |d||}||}||}|||}|j}|||dksJ ||||dk s J |t|d t|d k sJ d S )Ngư>ZlsqrF)alphaZsolverZfit_interceptiP  g?d   r   rK   )r   r   Zstrategyr>   rB   r   g?T)r:   replacerq   r   r   r   r   r   g{Gz?)absg?gffffff?r&   )r   r   rG   rH   Zrandnr   rO   rW   rL   r   r   choiceintrF   r
   r   Zscorer   r   Zcoef_rU   rV   r   allrX   )r   rY   Zlinear_regressionrZ   ru   r   noiser   ZX_informativer   Z
X_shuffledZX_near_unique_categoriesr   r[   r\   r]   r   Z	raw_modelZmodel_with_cvZcoefrc   ZX_enc_no_cv_trainZX_enc_no_cv_testZmodel_no_cvr"   r"   r#   *test_target_encoding_for_linear_regressionM  sd    




 

r   c                  C   sr   t jddd} | ddD | g dg dd}td	d
|dg |d  W d   n1 sd0    Y  dS )z
    Test target-encoder cython code when y is read-only.

    The numpy array underlying df["y"] is read-only when copy-on-write is enabled.
    Non-regression test for gh-27879.
    r   z2.0)Z
minversionzmode.copy_on_writeT)rk   rl   rl   )r~   r0   r*   )xr   r3   )r1   r   r   N)rU   r   Zoption_contextr   r   r   )r   Zdfr"   r"   r#   test_pandas_copy_on_write  s    r   ).r   numpyr   rU   Znumpy.testingr   r   Zsklearn.ensembler   Zsklearn.linear_modelr   Zsklearn.model_selectionr   r   r   r	   r
   Zsklearn.pipeliner   Zsklearn.preprocessingr   r   r   r   r$   markZparametrizerC   rD   nanr   rJ   rh   r{   rE   r   r   r   r   r   r   r   r   r   r   r   r"   r"   r"   r#   <module>   s   	`h(


	
R	.
#
r