a
    h6                    @   s  d dl Z d dlZd dlZd dlZd dlmZ d dlmZ d dl	m
Z
mZ d dlmZ d dlmZmZmZ d dlmZ dd	 Zejd
g ddd Zejd
g ddd Zejdejejejgejdejejejgdd Zejdejejejgdd Zdd Zdd Zdd Z dd Z!dd Z"ejjd g d!g d"ge#g d#g d$gej#g d%g d&ge$d'ej#g d(d)ej%d*gge$d'ej#g d(d)e&d+d*gge$d'ej#g d,g d-ge$d'ej#g d.d)ej%dgge$d'ej#g d.d)e&d+dgge$d'gg d/d0d1d2 Z'ejd
g dejd3d4d5gejd6dd7gd8d9 Z(ejd3d4d5gejd:d;d<gd=d<gd;d<ggg d>g d?g d>gfd@d)gdAd)gdBdCgdAd)ggg dDg dEg dFgfgdGdH Z)dIdJ Z*ejd6g dKejdLg dKdMdN Z+ejdOdPdQgejd d=d;ge#dRdSggdTdU Z,ejdOdPdQgdVdW Z-ejjdXdYd<gdZd<ggdYdZgd<ggej.fe#d=d;gd[d;ggd=d[gd;ggej/fej#d\d*gd]d*gge$d'd\d]gd*ggej.fe#d\d*gd]d*ggd\d]gd*ggej0fe#d=d;gej%d;ggd=ej%gd;ggejfej#d\ej%gdej%gge$d'd\dgej%ggej.fej#d\e&d+gde&d+gge$d'd\dge&d+ggej.fgg d^d0d_d` Z1ejd
g dejjdaej#d)dCgge$d'j2ej#d)dbgge$d'j2g dcgej.fej#d=d;ggddd'j2ej#d=deggddd'j2g dfgej3fej#d)dCgge$d'j2ej#d)dbgge$d'j2e#g dcgej.fej#dd)gge$d'j2ej#ddCgge$d'j2g dgge$fej#d)dCgge$d'j2ej#d)ej%gge$d'j2g dhge$fej#d)dgge$d'j2ej#d)ej%gge$d'j2g dige$fgg djd0dkdl Z4dmdn Z5ejdoe
egdpdq Z6drds Z7dtdu Z8ejjdvd7dwdxgfdyg dzfg d{d|d}gfgg d~d0dd Z9dd Z:ejjd g d"g d!ge#g dg dgej#g d&g d%ge$d'gg dd0dd Z;ejjdaej#d)dCgge$d'j2ej#d)dbgge$d'j2g dcgej.fej#d=d;ggddd'j2ej#d=deggddd'j2g dfgej3fej#d)dCgge$d'j2ej#d)dbgge$d'j2e#g dcgej.fgg dd0dd Z<dd Z=dd Z>ejde&e?gdd Z@dd ZAdd ZBdd ZCdd ZDdd ZEdd ZFejd6dyd7gdd ZGejdej%de&d+gdd ZHejd6dYd[gg dgdd ZIejjdd5d4gddgd0ejjd6d7g dgd7dgd0dd ZJejdoe
egdd ZKejddd;iddiddid;dddeddgejddg dggdd ZLejd6dyd7dCggdd ZMejd6d)gdbggdd ZNejddd[iddiddiddiddid[dddeddgddń ZOejd6d7dCggddǄ ZPejd6d)gdbggddɄ ZQdd˄ ZRejdd[d=dddeigdd̈́ ZSddτ ZTddф ZUddӄ ZVddՄ ZWddׄ ZXejddd=dٜgddۄ ZYejdd;d[dٜgdd݄ ZZejdg dߢejdg ddd Z[dd Z\ejdej%dgdd Z]dd Z^ejd
g dejdddgdd Z_ejd
g ddd Z`ejd
g ddd Zaejd
g ddd Zbdd Zcdd Zdejdej%dgdd Zeejdddgejdej%dgdd Zfejjdaej#d)ej%gge$d'j2ej#d)dCgge$d'j2ej#d)dbej%ge$d'gej.fej#d)ej%gge$d'j2ej#d)dCgge$d'j2ej#d)dbej%ge$d'gej.fej#dej%ggejd'j2ej#dRggejd'j2e#ddSej%ggejfgg d d0dd Zgejdoe
egdd Zhejde#dej%dRggj2e#dej%dggj2e#dSggfe#g dgj2e#g d	gj2e#ej%ggfej#d
ej%dCgge$d'j2e#dej%dggj2ej#dbgge$d'fej#g dge$d'j2e#g dgj2ej#ej%gge$d'fgdd Ziejdedd Zjdd Zkejddd]ggej#dd]ggdd'ej#dd]ggdd'gejdd\d]ggej#d\d]ggdd'ej#d\d]ggdd'gdd Zldd Zmdd Zndd  Zoejd!d5d4gd"d# Zpejd$ej#d)gd%gge$d'd gej%gej%ggejqd%gdgdgge$d'fej#ej%gd%gd)gge$d'd gej%gej%ggejqd%gej%gej%gge$d'fgd&d' Zrd(d) Zsd*d+ Ztd,d- Zud.d/ Zvd0d1 Zwejddd[iddiddiddiddid[dddeddgd2d3 Zxd4d5 Zyd6d7 Zzd8d9 Z{d:d; Z|ejdddidd;igd<d= Z}ejddd=idd>igd?d@ Z~dAdB ZdCdD Zejdoe
egdEdF ZdS (G      N)sparse)NotFittedError)OneHotEncoderOrdinalEncoder)is_scalar_nan)_convert_containerassert_allcloseassert_array_equal)CSR_CONTAINERSc                  C   s   t g dg dg} t }tdd}|| }|| }|jdksHJ |jdksVJ t|sdJ t|rrJ t| g dg dg t| | d S )N         r   r   r   Fsparse_outputr      )              ?r   r   r   )r   r   r   r   r   )	nparrayr   fit_transformshaper   issparser	   toarray)XZ
enc_sparseZ	enc_denseX_trans_sparseZX_trans_dense r   e/var/www/html/assistant/venv/lib/python3.9/site-packages/sklearn/preprocessing/tests/test_encoders.py!test_one_hot_encoder_sparse_dense   s    


r    handle_unknown)ignoreinfrequent_if_existwarnc                 C   s   t g dg dg dg}t g dg}tdd}|| tjtdd || W d    n1 sn0    Y  t| d}|| | }t	||
 t g d	g t|| d S )
N)r   r   r   )r   r   r   )r   r   r   )   r   r   errorr!   Found unknown categoriesmatch)r   r   r   r   r   r   r   )r   r   r   fitpytestraises
ValueError	transformcopyr	   r   r   r!   r   X2ohZ	X2_passedr   r   r   #test_one_hot_encoder_handle_unknown*   s    

(

r4   c                 C   sx   t g dd}t ddgd}t| d}|| | }t|| t g dg dg t|| d S )N)Z1111111122Z333Z4444)r   Z55555r5   r'   )r   r   r   r   r   r   r   r   )	r   r   reshaper   r+   r0   r	   r/   r   r1   r   r   r   +test_one_hot_encoder_handle_unknown_stringsB   s    

r9   output_dtypeinput_dtypec                 C   s   t jddgg| dj}t jddgddgg|d}td|d}t|| | t||| | td|dd}t||| t|||| d S )Nr   r   dtypeauto)
categoriesr=   F)r?   r=   r   )	r   asarrayTr   r	   r   r   r+   r/   )r;   r:   r   
X_expectedr3   r   r   r   test_one_hot_encoder_dtypeU   s    rC   c                 C   s   t d}|ddgddgd}tjg dg dg| d	}t| d	}t|| | t|	|
| | t| d
d}t||| t|	|
|| d S )Npandasabr   r   ABr   r   r   r   r   r   r   r   r<   F)r=   r   )r,   importorskip	DataFramer   r   r   r	   r   r   r+   r/   )r:   pdX_dfrB   r3   r   r   r   !test_one_hot_encoder_dtype_pandasd   s    

rP   c                  C   s   t  } g dg dg dg dg}| | |  }tg d| | g d}tg d| tjtdd	 | d
dg W d    n1 s0    Y  d S )N)Maler   girlr   r   )Female)   rR   r   
   )rQ   3   boy   r   )rQ   [   rR         )Z	x0_FemaleZx0_MaleZx1_1Zx1_41Zx1_51Zx1_91Zx2_boyZx2_girlZx3_1Zx3_2Zx3_12Zx3_21Zx4_3Zx4_10Zx4_30)onetwothreefourfive)Z
one_FemaleZone_MaleZtwo_1Ztwo_41Ztwo_51Ztwo_91Z	three_boyZ
three_girlZfour_1Zfour_2Zfour_12Zfour_21Zfive_3Zfive_10Zfive_30z!input_features should have lengthr)   r\   r]   )r   r+   get_feature_names_outr	   r,   r-   r.   )encr   feature_namesZfeature_names2r   r   r   "test_one_hot_encoder_feature_namest   s&    
rd   c                  C   s\   t  } tjddggtdj}| | |  }tddg| | jdgd}tdd	g| d S )
Nu   c❤t1Zdat2r<   u	   x0_c❤t1Zx0_dat2u   n👍meZinput_featuresu   n👍me_c❤t1u   n👍me_dat2)r   r   r   objectrA   r+   ra   r	   )rb   r   rc   r   r   r   *test_one_hot_encoder_feature_names_unicode   s    
rg   c                  C   s   dd } t | d}tjddggtdj}|| | }tddg| |jd	gd
}tddg| dd }t |d|}d}tj	t
|d |  W d   n1 s0    Y  dS )z=Check the behaviour of `feature_name_combiner` as a callable.c                 S   s   | d t | S )N_)reprfeaturecategoryr   r   r   name_combiner   s    zHtest_one_hot_encoder_custom_feature_name_combiner.<locals>.name_combiner)Zfeature_name_combinerNoneNr<   z	x0_'None'Zx0_NonerE   re   za_'None'Za_Nonec                 S   s   dS )Nr   r   rj   r   r   r   wrong_combiner   s    zItest_one_hot_encoder_custom_feature_name_combiner.<locals>.wrong_combinerzMWhen `feature_name_combiner` is a callable, it should return a Python string.r)   )r   r   r   rf   rA   r+   ra   r	   r,   r-   	TypeError)rm   rb   r   rc   ro   err_msgr   r   r   1test_one_hot_encoder_custom_feature_name_combiner   s    

rr   c                  C   s   t ddggj} t }|jg dgd | d g dgksDJ ||  jdks\J |jg dgd ||  jdksJ d S )	Nr   r   )r   r   r   r   r?   r?   )r   r%   )r   r   r   r   r%   r   )	r   r   rA   r   
set_params
get_paramsr   r   r   )r   r3   r   r   r   test_one_hot_encoder_set_params   s    rv   c                 C   sX   t dd}|| }t ddd}|| }t| | t|rL|jdksPJ | S )Nr>   rs   Fr?   r   Zcsr)r   r   r   r   r   r   format)r   rb   ZXtr1ZXtr2r   r   r   check_categorical_onehot   s    


ry   r   defr   7   abcr   r|   )rU   r   r|   )r   r   r|   )rF   rH   cat)rE   rI   r   r<   )rF   r   r   rE   r   nan)Nr   r   )rE   r   r   )Nr   N)mixednumericrf   z	mixed-nanzmixed-float-nanz
mixed-Nonezmixed-None-nanzmixed-None-float-nan)Zidsc                 C   s   t t| d d dgf }t|ddgddgg t t| d d ddgf }t|g dg dg tdd| }t| g dg dg d S )	Nr   r   )r   r   r   r   r   r   r   r   r>   rs   )r   r   r   r   r   )r   r   r   r   r   )ry   r   r   r   r   r   r   )r   Xtrr   r   r   test_one_hot_encoder   s    r   sparse_FTdropfirstc                 C   s  g dg dg dg}t ||d}||}tj|td}t||| ddgddgd	dgg}t |d
|d}||}t|}t||| |d u rrg dg dg dg}t || ddgddgg dgd}||}tj|td}d |d< t||| ddgddgd	dgg}t |ddgddgg| d}||}tj|td}d |d< d |d d df< t||| tg dg dg}td}t	j
t|d || W d    n1 s0    Y  d S )Nr}   rz   )r~   r   r|   r   r   r<   r   r|   r   r   r>   )r   r?   r   r~   r{   )6   r|   8   )r   r!   r?   )r   r   r   r   )r   r?   r!   r   r   r   r   r   r   )Shape of the passed X data is not correctr)   )r   r   r   r   rf   r	   inverse_transformreescaper,   r-   r.   )r!   r   r   r   rb   X_trexpmsgr   r   r   test_one_hot_encoder_inverse  sH    






r   z
X, X_transr   r|   r   r   r   r   r   r\   r]   r^   rF   r   r   r   r   r   )r   r   r   r   r   )r   r   r   r   r   c                 C   s^   t |d| }d}|r"t|d}tjt|d || W d   n1 sP0    Y  dS )zCheck that `inverse_transform` raise an error with unknown samples, no
    dropped feature, and `handle_unknow="error`.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/14934
    r   zqSamples \[(\d )*\d\] can not be inverted when drop=None and handle_unknown='error' because they contain all zerosr   r)   N)r   r+   r   r,   r-   r.   r   )r   X_transr   rb   r   r   r   r   ?test_one_hot_encoder_inverse_transform_raise_error_with_unknownA  s    
r   c                  C   sJ   t jddgddgddggtd} tddd	}|| }t|||  d S )
NrQ   r   rS   r   r   r<   	if_binaryFr   r   )r   r   rf   r   r   r	   r   )r   oher   r   r   r   &test_one_hot_encoder_inverse_if_binarya  s     
r   )r   r   N
reset_dropc                 C   s   t jddgddgddggtd}t| dd}|| ||}| }|j|d	 t|	|| t
||| t| | d S )
NrQ   r   rS   r   r   r<   Fr   r   )r   r   rf   r   r+   r/   ra   rt   r	   r   r   )r   r   r   r   r   rc   r   r   r   test_one_hot_encoder_drop_reseth  s     

r   methodr+   r         @      @c                 C   sJ   t  }d}tjt|d t|||  W d    n1 s<0    Y  d S )Nz'Expected 2D array, got 1D array insteadr)   )r   r,   r-   r.   getattr)r   r   r3   r   r   r   r   test_X_is_not_1Dw  s    r   c                 C   sn   t d}|g d}t }dt| d}t jt|d t|| | W d    n1 s`0    Y  d S )NrD   )   r   r%   r   z+Expected a 2-dimensional container but got z	 instead.r)   )r,   rL   Seriesr   typer-   r.   r   )r   rN   r   r3   r   r   r   r   test_X_is_not_1D_pandas  s    
r   zX, cat_exp, cat_dtyper~   r{   r   rH   rI   )r   r   rf   stringzmissing-floatzmissing-np.nan-objectzmissing-float-nan-objectc                 C   s   | | d d d fD ]}t dd}|| t|jts:J t|j|D ]l\}}| }t|d rt|d srJ |d d |d d ksJ n| |ksJ t	|j
|sFJ qFqd S )Nr6   r>   rs   )r   r+   
isinstancecategories_listziptolistr   r   
issubdtyper=   )r   Zcat_exp	cat_dtypeXirb   resr   Zres_listr   r   r   test_one_hot_encoder_categories  s    #

r   zX, X2, cats, cat_dtypedrE   rF   cint64r%   r   r   r   )NrE   z)rE   rF   r   )rE   Nr   )rf   r   zobject-stringzobject-string-nonezobject-string-nanzobject-None-and-nanc                 C   s  t |d}tg dg dg}t||  | t|jd t|d ksRJ |jd 	 t|d kspJ |jd j
|ksJ t |d}tjtdd || W d    n1 s0    Y  t ||d}tg dg dg}t||| | d S )	Nrs   r   r   r   r   r   r   r   r(   r)   r?   r!   )r   r   r   )r   r   r   r	   r   r   r   r?   r   r   r=   r,   r-   r.   r+   r/   )r   r2   catsr   r!   rb   r   r   r   r   )test_one_hot_encoder_specified_categories  s    3

(r   c                  C   s   t jddggtdj} tg dgd}t g dg dg}t|| |  | t|	|  | |j
d  g dksJ t |j
d jt jsJ t d	d
ggj} tg dgd}d}tjt|d |	|  W d    n1 s0    Y  d S )NrE   rF   r<   )rF   rE   r   rs   r   r   r   r   r   )r   r   r   z%Unsorted categories are not supportedr)   )r   r   rf   rA   r   r	   r+   r/   r   r   r   r   r   r=   object_r,   r-   r.   )r   rb   r   r   r   r   r   (test_one_hot_encoder_unsorted_categories  s    r   Encoderc                 C   sp   t dt jdgg}| |d}t jddggtdj}tjtdd || W d   n1 sb0    Y  dS )zTest encoder for specified categories that nan is at the end.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/27088
    r   r   rs   r<   zNan should be the last elementr)   N)	r   r   r   rf   rA   r,   r-   r.   r+   r   r   rb   r   r   r   r   ,test_encoder_nan_ending_specified_categories  s
    
r   c                  C   s   t jddgddggtdj} tg dg dgd}t g d	g d
g}t||  | |jd 	 g dksvJ t 
|jd jt jsJ |jd 	 g dksJ t 
|jd jt jsJ d S )NrE   rF   r   r   r<   r   )r   r   r   rs   )r   r   r   r   r   r   )r   r   r   r   r   r   r   )r   r   rf   rA   r   r	   r   r   r   r   r   r=   r   r   rb   r   r   r   r   7test_one_hot_encoder_specified_categories_mixed_columns$  s    r   c                  C   sD   t d} | ddgddgd}t|}t|g dg dg d S )	NrD   rE   rF   r   r   rG   rJ   rK   )r,   rL   rM   ry   r   )rN   rO   r   r   r   r   test_one_hot_encoder_pandas1  s    
r   zdrop, expected_namesx0_cx2_br   )r   Zx1_2r   )r   r   rF   x0_bZx2_a)r   binarymanualc                 C   s:   g dg dg}t | d}|| | }t|| d S )N)r   r   rE   )rF   r   rF   r   )r   r+   ra   r	   )r   Zexpected_namesr   r   rc   r   r   r   'test_one_hot_encoder_feature_names_drop:  s
    


r   c                  C   s   ddgddgddgg} t g dg dg dg}t d d	g}td
dd}|| }t|j| t|| ddgddgddgg} t ddgddgddgg}t d	d g}td
dd}|| }t|j| t|| d S )NrU   yes   nor[   )r   r   r   r   r7   )r   r   r   r   r   r   Fr   truerE   falser   r   )r   r   r   r   r	   	drop_idx_r   )r   expectedZexpected_drop_idxr   resultr   r   r   *test_one_hot_encoder_drop_equals_if_binaryL  s     


r   )rU   r   r|   )r   r   r|   )r   r   rf   c                 C   sT   t  }tjg dg dgdd}t|| |d t dd}t|| | d S )Nr   r   r   r   r   r   r   r<   float64)r   r   r   r	   r   Zastyper   r   r   r   test_ordinal_encoderd  s
    

r   )rf   r   zobject-string-catc                 C   s   t |d}tdgdgg}t|| | t|jd t|d ksJJ |jd  t|d kshJ |jd j	|ks|J t |d}t
jtdd || W d    n1 s0    Y  d S )Nrs   r   r   r   r(   r)   )r   r   r   r	   r   r   r?   r   r   r=   r,   r-   r.   r+   )r   r2   r   r   rb   r   r   r   r   )test_ordinal_encoder_specified_categoriesu  s    

r   c                  C   s   g dg dg} t  }|| }tj| td}t||| tg dg dg}td}t	j
t|d || W d    n1 s0    Y  d S )Nr}   rz   r<   )r   r   r   r   rJ   r   r)   )r   r   r   r   rf   r	   r   r   r   r,   r-   r.   )r   rb   r   r   r   r   r   r   test_ordinal_encoder_inverse  s    

r   c                  C   s   t ddd} tjddgddgdd	ggtd
}tjddgddgddggtd
}| | | |}tjddgddgddggdd
}t|| | |}tjdd gd dgddggtd
}t|| d S )Nuse_encoded_valuer!   unknown_valuerE   xrF   yr   r   r<   ZxyZblar   r   r   r   )r   r   r   rf   r+   r/   r	   r   )rb   X_fitr   X_trans_encr   X_trans_invinv_expr   r   r   +test_ordinal_encoder_handle_unknowns_string  s      

 

 r   r=   c                 C   s   t ddd}tjddgddgdd	gg| d
}tjddgddgddgg| d
}|| ||}tjddgddgddggdd
}t|| ||}tjdd gd dgddggtd
}t|| d S )Nr   r   r      r      r   	   r<   rX      r   r   )r   r   r   r+   r/   r	   r   rf   )r=   rb   r   r   r   r   r   r   r   r   r   ,test_ordinal_encoder_handle_unknowns_numeric  s      

 

 r   c                  C   s`   t dtjd} tdgdgdgg}| | | dgdgdgg}t|dgdgtjgg d S )Nr   r   r   r   r   r%   r   )r   r   r   r   r+   r/   r	   )rb   r   r   r   r   r   (test_ordinal_encoder_handle_unknowns_nan  s
    
r   c                  C   sb   t dtjtd} tdgdgdgg}tjtdd | | W d    n1 sT0    Y  d S )Nr   )r!   r   r=   r   r   r   z'dtype parameter should be a float dtyper)   )	r   r   r   intr   r,   r-   r.   r+   )rb   r   r   r   r   8test_ordinal_encoder_handle_unknowns_nan_non_float_dtype  s    r   c                  C   sh   t jg dgtdj} g d}t|d}d}tjt|d ||  W d    n1 sZ0    Y  d S )N)LowMediumHighr   r   r<   )r   r   r   rs   z*Shape mismatch: if categories is an array,r)   )	r   r   rf   rA   r   r,   r-   r.   r+   )r   r   rb   r   r   r   r   +test_ordinal_encoder_raise_categories_shape  s    
r   c                     s|  t ddtjg dg dgdd} tjddgd	d
ggddtjddgd	d
ggddtddgddggtddgddggtjddgd	dggddfD ]B   t fddtdD sJ t  |  qddgd	d
gg   tfddtdD sJ t  |  ddgd	dgg   tfddtdD sdJ t  |  d S )Nr>   rs   )r   r   r   r   )r   r   r   r   r   r<   r   r   r   r%   r   rE   rF   r   r      a   b   c   drf   c                    s   g | ]}j | j jkqS r   r   r=   .0ir   rb   r   r   
<listcomp>      z'test_encoder_dtypes.<locals>.<listcomp>c                    s"   g | ]}t  j| jt jqS r   )r   r   r   r=   integerr   rb   r   r   r     r   c                    s   g | ]} j | jd kqS )rf   r   r   r  r   r   r     r   )	r   r   r   r+   allranger	   r/   r   )r   r   r   r   test_encoder_dtypes  s&    

 
 
 r  c                     s  t d} tddtjg dg dgdd}| jdd	gd
dgddgddd}| tfddtd	D sxJ t	
| | | dd	gddgddgd}|d j|d j|d jg | t fddtd
D sJ t	
| | d S )NrD   r>   rs   )r   r   r   r   r   r   )r   r   r   r   r   r   r   r<   r   r   r   r%   r   r   rH   rI   Cr   c                    s   g | ]} j | jd kqS )r   r   r   r  r   r   r     r   z.test_encoder_dtypes_pandas.<locals>.<listcomp>rE   rF   r   r   rH   rI   r  c                    s    g | ]}j | j | kqS r   r   r   ZX_typerb   r   r   r     r   )r,   rL   r   r   r   rM   r+   r  r  r	   r/   r   r=   )rN   r   r   r   r  r   test_encoder_dtypes_pandas  s    

"

 r  c                  C   sV   t  } ddgddgg}t $ td | | W d    n1 sH0    Y  d S )NrQ   r   rS   r   r&   )r   warningscatch_warningssimplefilterr   )rb   r   r   r   r   test_one_hot_encoder_warning  s
    

r  c                 C   s   ddgddgddgg}t | ddddgddggd}|| d	dgg}tddgg}d
}tjt|d ||}W d   n1 s0    Y  t|| dS )z,Check handle_unknown='warn' works correctly.rE   r   rF   r   r   Fr$   r   r   r!   r?   r   qFound unknown categories in columns \[0\] during transform. These unknown categories will be encoded as all zerosr)   N	r   r+   r   r   r,   warnsUserWarningr/   r   )r   r   r   X_testrB   warn_msgr   r   r   r   test_ohe_handle_unknown_warn%  s    

(r  missing_valuec           	      C   sr  dddd| g}t |d}g dg ddddd| gg}|| }g dg d	g d
g}t|| |j|u spJ dd t|j|jD }||}t	j
|td}t|d rZt|d d |d d  t|d sJ t|d sJ t|d d d df |d d d df  t|dd df |dd df  t|d sFJ t|d snJ nt|| t|| d S )Nr{   rX   r   r   r   )r~   rX   r   r|   rE   )r{   rX   r   r|   rE   )r   r   r   r   r   )r   r   r   r   r   r   c                 S   s   g | ]\}}|| qS r   r   )r   r   rk   r   r   r   r   M  s   z4test_one_hot_encoder_drop_manual.<locals>.<listcomp>r<   r6   )r6   r6   )r   r   r   r	   r   r   r   r   r   r   r   rf   r   )	r  Zcats_to_droprb   r   Ztransr   Zdropped_catsZX_inv_transZX_arrayr   r   r    test_one_hot_encoder_drop_manual?  s2    


*"
r  )r~   r   rT   rE   c                 C   s\   t | d}d}tjt|d, |g dg dg dg W d    n1 sN0    Y  d S )Nr   z-`drop` should have length equal to the numberr)   r}   rz   )r{   r   ;   )r   r,   r-   r.   r+   )r   rb   rq   r   r   r   test_invalid_drop_lengthd  s    
r  densityr   ZdenserE   r   rF   r   c                 C   s   t | d}t | |d}g dg dg}|| || t|j|j |dkr^t|jd n0t||j|jD ]\}}}|t| |ksnJ qnt|jtj	sJ |jj
tksJ d S )Nr   r   )r   r   rE   r  r   r   )r   r+   r	   r   r   r   r   r   r   Zndarrayr=   rf   )r  r   Zohe_baseZohe_testr   Zdrop_catZdrop_idxZcat_listr   r   r   test_categoriesl  s    



r  c                 C   s   |    jjsJ d S )N)Z__sklearn_tags__Z
input_tagscategorical)r   r   r   r   "test_encoders_has_categorical_tags  s    r  kwargsmax_categoriesmin_frequency   g(\?r   )r  r   rX   r?   r>   rE   rF   r   r   c           
      C   s   t dgd dgd  dgd  dgd  gj}tf |d	d
d| |}t|jg dg dgdgdgdgdgg}t ddgddgddgddgddgg}||}t|| dd dgdgd  D }|	|}t|| |
 }	tddg|	 dS )zpTest that different parameters for combine 'a', 'c', and 'd' into
    the infrequent category works as expected.rE   r   rF   r   r   rU   r   r   r#   F)r?   r!   r   rE   r   r   er   r   c                 S   s   g | ]
}|gqS r   r   r   colr   r   r   r     r   z2test_ohe_infrequent_two_levels.<locals>.<listcomp>infrequent_sklearnr%   r   x0_infrequent_sklearnNr   r   rA   r   r+   r	   infrequent_categories_r/   r   r   ra   )
r  r?   X_trainr   r  r   r   expected_invX_invrc   r   r   r   test_ohe_infrequent_two_levels  s(    2(



r.  c                 C   s   t dgd dgd  dgd  dgd  gj}td	d
d| d|}|jd |jd  dksdJ t dgdgg}||}tdgdgg| |	 }t
dg| ||}t
dgdgg| dS )z3Test two levels and dropping the frequent category.rE   r   rF   r   r   rU   r   r   r#   Fr   r!   r   r  r   r   r   r(  r'  N)r   r   rA   r   r+   r   r   r/   r   ra   r	   r   )r   r+  r   r  r   rc   	X_inverser   r   r   ,test_ohe_infrequent_two_levels_drop_frequent  s"    2

r1  c                 C   s   t dgd dgd  dgd  dgd  gj}td	d
d| d}d| d d}tjt|d || W d   n1 s0    Y  dS )z_Test two levels and dropping any infrequent category removes the
    whole infrequent category.rE   r   rF   r   r   rU   r   r   r#   Fr   r/  Unable to drop category r   ( from feature 0 because it is infrequentr)   Nr   r   rA   r   r,   r-   r.   r+   r   r+  r   r   r   r   r   5test_ohe_infrequent_two_levels_drop_infrequent_errors  s    2r6  r   gQ?g{Gz?r   c           	      C   s   t dgd dgd  dgd  dgd  gj}tf d	d
d| |}t|jddgg dgdgdgdgdgg}t g dg dg dg dg dg}||}t|| dgdgdgdgdgg}|	|}t|| |
 }tg d| dS )zkTest that different parameters for combing 'a', and 'd' into
    the infrequent category works as expected.rE   r   rF   r   r   rU   r   r   r#   Fr!   r   r$  r   r   r   r   r   r'  )r   r   r(  Nr)  )	r  r+  r   r  r   r   r,  r-  rc   r   r   r    test_ohe_infrequent_three_levels  s.    2(



r9  c                 C   s   t dgd dgd  dgd  dgd  gj}td	d
d| d|}t dgdgdgg}tddgddgddgg|| |jdd| d}tj	t
|d" |dgdgg}W d   n1 s0    Y  tddgddgg| dS )z5Test three levels and dropping the frequent category.rE   r   rF   r   r   rU   r   r   r#   Fr/  r   r   r"   r'   r(   r)   r$  N)r   r   rA   r   r+   r   r/   rt   r,   r  r  )r   r+  r   r  r   r   r   r   r   .test_ohe_infrequent_three_levels_drop_frequent  s     2"0r:  c                 C   s   t dgd dgd  dgd  dgd  gj}td	d
d| d}d| d d}tjt|d || W d   n1 s0    Y  dS )z7Test three levels and dropping the infrequent category.rE   r   rF   r   r   rU   r   r   r#   Fr/  r2  r   r3  r)   Nr4  r5  r   r   r   7test_ohe_infrequent_three_levels_drop_infrequent_errors  s    2r;  c                  C   s   t dgd dgd  dgd  dgd  gj} td	d
dd| }t|jddgg dgdgdgdgg}t g dg dg dg dg}||}t|| dgg}d}t	j
t|d || W d   n1 s0    Y  dS )zmTest that different parameters for combining 'a', and 'd' into
    the infrequent category works as expected.rE   r   rF   r   r   rU   r   r   r&   F)r!   r   r  r   r8  r   badz.Found unknown categories \['bad'\] in column 0r)   N)r   r   rA   r   r+   r	   r*  r/   r   r,   r-   r.   )r+  r   r  r   r   r   r   r   r   (test_ohe_infrequent_handle_unknown_error'  s    2"

r=  c                 C   s   t jdgd dgd  gtdj}tf g dgddd	| |}dgd
gdgdgdgg}t ddgddgddgddgddgg}||}t|| dddgg}dgdgg}|D ].}|j|d| tdgdgg|| qdS )zG'a' is the only frequent category, all other categories are infrequent.rE   r   r$  r[   r<   r   r   rE   rF   Fr#   r?   r   r!   rF   r   r   r   r   r   r   r   N)	r   r   rf   rA   r   r+   r/   r   rt   )r  r+  r   r  r   r   Zdropsr   r   r   r   5test_ohe_infrequent_two_levels_user_cats_one_frequent?  s&    "(

r@  c                  C   s   t jdgd dgd  dgd  dgd  gtd	j} tg d
gdddd| }t|jg dg dgdgdgdgdgg}t ddgddgddgddgddgg}||}t	|| dd dgdgd  D }|
|}t|| dS )zFTest that the order of the categories provided by a user is respected.rE   r   rF   r   r   rU   r   r   r<   r>  Fr#   r   r?   r   r!   r  )r   r   rE   r$  r   r   c                 S   s   g | ]
}|gqS r   r   r%  r   r   r   r   q  r   z<test_ohe_infrequent_two_levels_user_cats.<locals>.<listcomp>r'  r%   Nr   r   rf   rA   r   r+   r	   r*  r/   r   r   r+  r   r  r   r   r,  r-  r   r   r   (test_ohe_infrequent_two_levels_user_cats[  s&    *(


rD  c                  C   s   t jdgd dgd  dgd  dgd  gtd	j} tg d
gdddd| }t|jddgg dgdgdgdgdgg}t g dg dg dg dg dg}||}t	|| dgdgdgdgdgg}|
|}t|| dS )zTest that the order of the categories provided by a user is respected.
    In this case 'c' is encoded as the first category and 'b' is encoded
    as the second one.rE   r   rF   r   r   rU   r   r   r<   r   r   rF   rE   Fr#   rA  r$  r   r8  r   r'  NrB  rC  r   r   r   *test_ohe_infrequent_three_levels_user_catsv  s0    *(


rF  c                  C   sb   t jg dg df } tdddd}||  ddgddgg}||}t|g d	g d
g dS )zaTest infrequent categories where feature 0 has infrequent categories,
    and feature 1 does not.	r   r   r   r   r   r   r   r   r   	r   r   r   r   r   r   r   r   r   r   r   F)r  r   r   r   r   r   r   r   r   )r   r   r   r   N)r   c_r   r+   r/   r   )r   r   r  r   r   r   r   test_ohe_infrequent_mixed  s    

rK  c            	   
   C   s  t jg dg dg df } tdddd}||  }t|jd d	d
g t|jd	 d	dg t|jd
 d | }tg d| g dg dg dg dg dg dg dg dg dg	}t|| g dg dg}|	|}g dg dg}t||  |
|}t jg dg dgtd}t|| tdddd| }tjtdd |	| W d   n1 st0    Y  g d g d!g}|	|}g d"g dg}t||  |
|}t jg d#g d$gtd}t|| dS )%z?Test infrequent categories with feature matrix with 3 features.rG  )	r   r   r   r   r   rU   r   r   r   )	r   r   r   r   r   r   r   r   r   r>   r   r#   r?   r  r!   r   r   r   rU   N)Zx0_0Zx0_3r(  Zx1_0Zx1_5Zx1_infrequent_sklearnZx2_0Zx2_1)r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   )r%   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r'  N)r'  r   Nr<   r&   r(   r)   )r   r   r   )r   rU   r   )r   r   r   r   r   r   r   r   )r'  r'  r   )r   r'  r   )r   rJ  r   r   r   r	   r*  ra   r   r/   r   r   rf   r+   r,   r-   r.   )	r   r   r   rc   r   r  X_test_transr-  r,  r   r   r   'test_ohe_infrequent_multiple_categories  sn    




*

rN  c            	   
   C   s  t d} | jg dg ddddgd}tdd	d
d}|| }t|jd ddg t|jd g d g dg dg dg dg dg dg dg dg dg	}t|| | jddgddgdddgd}g dg dg}|	|}t||  |
|}tjddgddggtd}t|| | jddgddgdddgd}|	| }g dg dg}t|| |
|}tjddgddggtd}t|| dS )zHTest infrequent categories with a pandas dataframe with multiple dtypes.rD   	rE   fr   rP  rP  rE   r   rF   rF   	r   r   r   rU   rU   rX   r   r   r   )strr   rR  r   columnsr>   r   r#   rL  r   rE   rF   r   r   r   rX   )r   r   r   r   r   r   )r   r   r   r   r   r   )r   r   r   r   r   r   )r   r   r   r   r   r   )r   r   r   r   r   r   rP     rX   r'  r<   r   r   N)r,   rL   rM   r   r   r   r	   r*  r   r/   r   r   r   rf   )	rN   r   r   r   r   r  rM  r-  r,  r   r   r   .test_ohe_infrequent_multiple_categories_dtypes  sV    
	
 


 

rW  rZ   )r   r  c                 C   sp   t dgd dgd  dgd  dgd  gj}tf d	d
d| }|| |dgg}t|dgg dS ),All user provided categories are infrequent.rE   r   rF   r   r   rU   r   r   r#   Fr7  r   N)r   r   rA   r   r+   r/   r   r  r+  r   r   r   r   r   $test_ohe_infrequent_one_level_errorsH  s    2
rZ  c                 C   sb   t jdgd gtdj}tf g dgddd| |}|dgdgg}t|d	gd	gg d
S )rX  r$  r   r<   r>  Fr#   r?  rE   r   N)r   r   rf   rA   r   r+   r/   r   rY  r   r   r   5test_ohe_infrequent_user_cats_unknown_training_errorsV  s    r[  zinput_dtype, category_dtype)ZOOZOUZUOZUUSOZSUZSS
array_type)r   r   Z	dataframec           
      C   s   t jdgdgg| d}t jddg|dg}t|dd|}tdgdgdgdgg|| d}||}t ddgddgddgddgg}t|| t|d|}	|	|}t dgdgdgdgg}t|| d	S )
a"  Check that encoding work with object, unicode, and byte string dtypes.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/15616
    https://github.com/scikit-learn/scikit-learn/issues/15726
    https://github.com/scikit-learn/scikit-learn/issues/19677
    rF   rE   r<   Frw   r   r   rs   N)	r   r   r   r+   r   r/   r   r   r	   )
r;   Zcategory_dtyper]  r   r?   r   r  r   r   oer   r   r   test_encoders_string_categoriesg  s    
"

r_  c                  C   s|   t jdgdggdd} t jddgddg}t|dd}td}tjt|d	 ||  W d
   n1 sn0    Y  d
S )zCheck that this mixture of predefined categories and X raises an error.

    Categories defined as bytes can not easily be compared to data that is
    a string.
    rF   rE   Ur<   SFrw   zjIn column 0, the predefined categories have type 'bytes' which is incompatible with values of type 'str_'.r)   N)	r   r   r   r   r   r,   r-   r.   r+   )r   r?   r   r   r   r   r   $test_mixed_string_bytes_categoricals  s    rb  c                 C   sP   t jdd| d| ggtdj}tddd|}| }t|ddd	|  g d S )
NrE   rF   r<   Fr"   r   r!   Zx0_ar   Zx0_)r   r   rf   rA   r   r+   ra   r	   )r  r   r   namesr   r   r   )test_ohe_missing_values_get_feature_names  s    re  c                  C   sr   t d} | jg dtjdddtjgtdddd	gd
}tg dg dg dg dg}t|}t|| d S )NrD   )dogr   Nr   r   r   r%   r<   )col1col2rg  rh  rS  )r   r   r   r   r   r   r   )r   r   r   r   r   r   r   )r   r   r   r   r   r   r   )r   r   r   r   r   r   r   )	r,   rL   rM   r   r   r   floatry   r   )rN   dfexpected_df_transr   r   r   r   %test_ohe_missing_value_support_pandas  s     
	rl  pd_nan_typepd.NAznp.nanc              	   C   s   t d}| dkr|jntj}|d|jdd|ddgddi}tg d	g d
g dg dg d
g}td|d}|	|}t
|| t|jdksJ t|jd d d g d t|jd d sJ d S )NrD   rn  rg  r   rE   rF   rl   r<   )r   r   r   r   )r   r   r   r   )r   r   r   r   rI  Frc  r   r   r6   r   )r,   rL   NAr   r   rM   r   r   r   r   r   lenr   r	   isnan)rm  r!   rN   pd_missing_valuerj  rk  r   df_transr   r   r   1test_ohe_missing_value_support_pandas_categorical  s(    



rt  c                 C   s   ddgddgddgg}t dd| d}||}tg d	g d
g dg}t|| ddgg}tg d	g}d}tjt|d ||}W d   n1 s0    Y  t|| |	|}t
|tjddggtd dS )zZCheck drop='first' and handle_unknown='ignore'/'infrequent_if_exist'
    during transform.rE   r   rF   r   r   r   Fr   r   r!   r   r   )r   r   r   r   r   tFound unknown categories in columns \[0, 1\] during transform. These unknown categories will be encoded as all zerosr)   Nr<   r   r   r   r   r   r,   r  r  r/   r   r	   rf   r!   r   r   r   rB   r  r  r-  r   r   r   /test_ohe_drop_first_handle_unknown_ignore_warns  s*    


(

ry  c                 C   s   ddgddgddgg}t dd| d}||}tg d	g d
g dg}t|| ddgg}tg dg}d}tjt|d ||}W d   n1 s0    Y  t|| |	|}t
|tjddggtd dS )zDCheck drop='if_binary' and handle_unknown='ignore' during transform.rE   r   rF   r   r   r   Fru  rI  r   rJ   r   r   )r   r   r   r   rv  r)   Nr<   rw  rx  r   r   r   3test_ohe_drop_if_binary_handle_unknown_ignore_warns  s*    


(

rz  c                 C   s   ddgddgddgg}t dd| ddgddggd}|| d	dgg}tddgg}d
}tjt|d ||}W d   n1 s0    Y  t|| dS )znCheck drop='first' and handle_unknown='ignore'/'infrequent_if_exist'
    during fit with categories passed in.rE   r   rF   r   r   r   Fr  r   r  r)   Nr  )r!   r   r   r  rB   r  r   r   r   r   'test_ohe_drop_first_explicit_categories&  s    

(r{  c                  C   s   t d} | jg dg ddddgd}tdd	}|jdd
 d}t jt|d || W d   n1 sr0    Y  || t jt|d |	| W d   n1 s0    Y  dS )zJRaise informative error message when pandas output and sparse_output=True.rD   r   )r   rF   rF   )rE   rF   rE   rF   rS  Tr   r/   zxPandas output does not support sparse data. Set sparse_output=False to output pandas dataframes or disable Pandas outputr)   N)
r,   rL   rM   r   
set_outputr-   r.   r   r+   r/   )rN   rj  r   r   r   r   r   'test_ohe_more_informative_error_messageA  s    
 
(
r~  c                  C   sl   t t jdddggj} tt jd}dt j }tjt|d |	|  W d   n1 s^0    Y  dS )zDTest ordinal encoder with nan passthrough fails when dtype=np.int32.r   r   r<   zdThere are missing values in features \[0\]. For OrdinalEncoder to encode missing values with dtype: r)   N)
r   r   r   rA   r   int32r,   r-   r.   r+   )r   r^  r   r   r   r   Btest_ordinal_encoder_passthrough_missing_values_float_errors_dtypeU  s    r  encoded_missing_valuer   c                 C   s   t jt jdddggt jdj}t| d|}t|jdks@J t	|jd ddt jg |
|}t	|| gdgdgdgg ||}t	|| dS )	z.Test ordinal encoder with nan on float dtypes.r   r   r<   r  r   r   r   N)r   r   r   r   rA   r   r+   rp  r   r   r/   r   )r  r   r^  r   r0  r   r   r   5test_ordinal_encoder_passthrough_missing_values_floatc  s    

r  c              	   C   s"  t d}| dkr|jntj}|d|jdd|ddgddi}t|d	|}t	|j
d
ksbJ t|j
d dd g d t|j
d d sJ ||}t|dgdg|gdgdgg ||}|jdksJ t|dddf ddg t|dddf ddg t|d sJ dS )z0Check ordinal encoder is compatible with pandas.rD   rn  rg  r   rE   rF   rl   r<   r  r   r   Nr   r   r6          @r   r   )r   r   r   r   )r,   rL   ro  r   r   rM   r   r   r+   rp  r   r	   rq  r/   r   r   r   )rm  r  rN   rr  rj  r^  rs  r0  r   r   r   =test_ordinal_encoder_missing_value_support_pandas_categoricalu  s"    


r  r  )zobject-None-missing-valuezobject-nan-missing_valueznumeric-missing-valuec                 C   s   t |d}tdgtjgg}t|| | |jd j|ksBJ t |d}tj	t
dd || W d   n1 sz0    Y  dS )z.Test ordinal encoder for specified categories.rs   r   r   r(   r)   N)r   r   r   r   r	   r   r   r=   r,   r-   r.   r+   )r   r2   r   r   r^  r   r   r   r   =test_ordinal_encoder_specified_categories_missing_passthrough  s    &

r  c                 C   sp   t jg dtdg}| |d}t jddggtdj}tjtdd || W d   n1 sb0    Y  dS )	zTest encoder for specified categories have duplicate values.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/27088
    )rE   rF   rE   r<   rs   rE   rF   z5the predefined categories contain duplicate elements.r)   N)r   r   rf   rA   r,   r-   r.   r+   r   r   r   r   +test_encoder_duplicate_specified_categories  s    
r  zX, expected_X_trans, X_testr   r   )r   r   r   )r   r  r   r   )r   rE   rF   )r  r   r   c                 C   s8   t ddd}|| }t|| t||dgg dS )z>Test the interaction between missing values and handle_unknownr   r6   r   g      N)r   r   r   r/   )r   Zexpected_X_transr  r^  r   r   r   r   /test_ordinal_encoder_handle_missing_and_unknown  s    

r  csr_containerc                 C   s   t g dg dg}| |}t }d}tjt|d || W d   n1 sV0    Y  tjt|d || W d   n1 s0    Y  ||}| |}tjt|d || W d   n1 s0    Y  dS )zCheck that we raise proper error with sparse input in OrdinalEncoder.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/19878
    r   r   z2Sparse data was passed, but dense data is requiredr)   N)	r   r   r   r,   r-   rp   r+   r   r   )r  r   ZX_sparseencoderrq   r   r   r   r   r   test_ordinal_encoder_sparse  s    ((
r  c                  C   s   t g dddt jf } tg dgddd}||  tg dgdd}tjtd	d
 ||  W d   n1 sz0    Y  dS )zCheck OrdinalEncoder.fit works with unseen category when
    `handle_unknown="use_encoded_value"`.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/19872
    )r   r   r   r   r   r   N)r6   r   r   r   r   )r?   r!   r   r&   r   r(   r)   )r   r   Znewaxisr   r+   r,   r-   r.   )r   r^  r   r   r   -test_ordinal_encoder_fit_with_unseen_category  s    
r  r+  ZAAOr`  r  c                 C   s4   t ddd}||  ||}t|ddgg dS )zChecks that `OrdinalEncoder` transforms string dtypes.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/19872
    r   ir   r   N)r   r+   r/   r   )r+  r  rb   r   r   r   r   1test_ordinal_encoder_handle_unknown_string_dtypes&  s    

r  c                  C   sb   t g ddd} t | }t|jt j| ddj |	| }t|dgdgdgdgg dS )	zCheck that `OrdinalEncoder` accepts Python integers that are potentially
    larger than 64 bits.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/20721
    )l   	HP
1& l   	H]viel   	 :?i}Ga l   IRK2e6kr6   r   r   )Zaxisr   r   N)
r   r   r8   r   r+   r	   r   sortrA   r/   )r   r  r   r   r   r   #test_ordinal_encoder_python_integerB  s    
r  c                  C   sH   t d} g d}| jg dg|d}t |}| }t|| dS )z-Check feature names out is same as the input.rD   )rF   r   rE   r   rS  N)r,   rL   rM   r   r+   ra   r	   )rN   rd  r   rb   Zfeature_names_outr   r   r   .test_ordinal_encoder_features_names_out_pandasV  s    
r  c                  C   s   t jdgdgt jggtd} tdt jdd| }|| }t|dgdgdgg t jd	gt jggtd}||}t|t jgdgg ||}|d d d
u sJ t 	|d d sJ d
S )zECheck interactions between encode_unknown and missing value encoding.rE   rF   r<   r   r!   r   r  r   r   r   N)
r   r   r   rf   r   r+   r/   r   r   rq  )r   r^  r   r  rM  X_roundtripr   r   r   0test_ordinal_encoder_unknown_missing_interactionb  s     


r  with_pandasc                 C   s   t jddgddgdt jggtd}d}| rPtd}|j|d	d
gd}|d }n|d }tdd}tjt	|d |
| W d   n1 s0    Y  dS )zXCheck OrdinalEncoder errors when encoded_missing_value is used by
    an known category.rE   rf  rF   r   r   r<   zTencoded_missing_value \(1\) is already used to encode a known category in features: rD   letterZpetrS  z	\['pet'\]z\[1\]r   r  r)   N)r   r   r   rf   r,   rL   rM   r   r-   r.   r+   )r  r   	error_msgrN   r^  r   r   r   0test_ordinal_encoder_encoded_missing_value_error  s    "


r  z4X_train, X_test_trans_expected, X_roundtrip_expected1c                 C   s   t dtjtjd| }tdgtjgdgg}||}t|| ||}|jd }t	|D ]V}||df }	||df }
|	du r|
du sJ q`t
|	rt|
sJ q`|
|	ks`J q`dS )znCheck transform when unknown_value and encoded_missing_value is nan.

    Non-regression test for #24082.
    r   r  r  rF   r   N)r   r   r   r+   r   r/   r   r   r   r  r   rq  )r+  ZX_test_trans_expectedZX_roundtrip_expectedr^  r  rM  r  Z	n_samplesr   Zexpected_valvalr   r   r   9test_ordinal_encoder_unknown_missing_interaction_both_nan  s(    



r  c                  C   s   t d} | ddgddgd}t }|jdd d}t jt|d	 || W d
   n1 sf0    Y  tddjdd}tddjdd}||}||}t|	 | t
| |j d
S )z*Check OneHotEncoder works with set_output.rD   rE   rF   r   r   rG   r|  zCPandas output does not support sparse data. Set sparse_output=Falser)   NFr   default)r,   rL   rM   r   r}  r-   r.   r   r   to_numpyr	   ra   rT  )rN   rO   r   r*   Zohe_defaultZ
ohe_pandas	X_defaultX_pandasr   r   r   test_one_hot_encoder_set_output  s    
(

r  c                  C   st   t d} | ddgddgd}t jdd}t jdd}||}||}t| | t|	 |j
 d	S )
z+Check OrdinalEncoder works with set_output.rD   rE   rF   r   r   rG   r  r|  N)r,   rL   rM   r   r}  r   r   r  r	   ra   rT  )rN   rO   Zord_defaultZ
ord_pandasr  r  r   r   r   test_ordinal_set_output  s    


r  c                  C   st   g dddgg} t | d}|ddgg t| t|jks@J t|jD ]$\}}|jtks`J t| | | qJdS )zjCheck that the categories_ dtype is `object` for string categories

    Regression test for gh-25171.
    )asZmmasZeasZrasZacsr  2rs   r  N)r   r+   rp  r   	enumerater=   rf   r	   )r?   rb   nr   r   r   r    test_predefined_categories_dtype  s    
r  c                  C   s~   t jdgdgt jggtd} tdd| }t|dgdgdgg tddd	| }t d
gg}||}t|dgg dS )zBCheck missing value or unknown encoding can equal the cardinality.rf  r   r<   r   r  r   r   r   r   snakeN)	r   r   r   rf   r   r   r   r+   r/   )r   r   rb   r  r   r   r   1test_ordinal_encoder_missing_unknown_encoding_max  s    
r  c                  C   s  t jdgd dgd  dgd  dgd  dgd  gtdj} tdd	d
d| }t| g d |jd |j	d  dksJ t jdgd dgd  dgd  gtdj} tdd	dd| }t| dg |jd |j	d  dksJ t jdgd dgd  dgd  dgd  dgd  gtdj} tdd	dgd| }t| g d |jd |j	d  dkstJ tdd	dd| }t| g d |j	du sJ dS )zkCheck drop_idx is defined correctly with infrequent categories.

    Non-regression test for gh-25550.
    rE   r   rF   r%   r   r   r$  r<   Fr   )r   r   r   )r   x0_dx0_er(  r   rU   r   r(  )r   r   r  r(  N)r   r   r  r  r(  )
r   r   rf   rA   r   r+   r	   ra   r   r   )r   r   r   r   r   #test_drop_idx_infrequent_categories  s4    4,4r  c                 C   s   t dgd dgd  dgd  dgd  gj}tf d	d
d| |}t|jg dg t|jddgg dgdgdgdgdgg}dgdgdgdgd
gg}||}t	|| |
|}dgdgdgdgdgg}t|| dS )zGTest parameters for grouping 'a', and 'd' into the infrequent category.rE   r   rF   r   r   rU   r   r   r   r6   r   r"  r   r   r   r   r'  N)r   r   rA   r   r+   r	   r   r*  r/   r   r   )r  r+  ordinalr  expected_transr   r0  expected_inverser   r   r   ,test_ordinal_encoder_infrequent_three_levels6  s,    2


r  c                  C   s   t jdgd dgd  dgd  dgd  gtd	j} tg d
gdddd| }t|jg d
g t|jddgg dgdgdgdgdgg}dgdgdgdgdgg}|	|}t
|| ||}dgdgdgdgdgg}t|| dS )zTest that the order of the categories provided by a user is respected.

    In this case 'c' is encoded as the first category and 'b' is encoded
    as the second one.
    rE   r   rF   r   r   rU   r   r   r<   rE  r   r6   )r?   r  r!   r   r   r   r   r   r'  N)r   r   rf   rA   r   r+   r	   r   r*  r/   r   r   )r+  r  r  r  r   r0  r  r   r   r   6test_ordinal_encoder_infrequent_three_levels_user_cats]  s2    *


r  c                  C   s   t g dg df} tdd| }t|jd ddg |jd du sLJ ddgddgg}ddgddgg}||}t|| ||}t j	ddgd	dggt
d
}t|| dS )zETest when feature 0 has infrequent categories and feature 1 does not.rG  rH  r   r  r   r   r   Nr'  r<   )r   Zcolumn_stackr   r+   r	   r*  r/   r   r   r   rf   )r   r  r  r  r   r0  r  r   r   r   %test_ordinal_encoder_infrequent_mixed  s    


r  c                  C   s   t d} | g d}| jg dg d| jdgd dgd  d	g d
g |ddg dd}tdd|}t|jd ddg t|jd g d t|jd d
d	g | jg dg d| jdgd	g d
g dg |ddg dd}g dg dg dg dg}|	|}t
|| dS )zHTest infrequent categories with a pandas DataFrame with multiple dtypes.rD   )birdr   rf  r  rO  rQ  rf  r%   r   r   r  r  r<   )rR  r   r  rS  r  r   rE   rF   r   rU  r   )rE   rF   rP  r   )rX   r   rU   r   )r   r   r   )r   r   r   )r   r   r   r8  N)r,   rL   ZCategoricalDtyperM   r   r   r+   r	   r*  r/   r   )rN   Zcategorical_dtyper   r  r  r  r   r   r   r   :test_ordinal_encoder_infrequent_multiple_categories_dtypes  s:    


r  c                  C   s   t jdgd dgd  dgd  dgd  t jg gtd	j} td
dddd| }t|jg dg t jdgdgdgdgdgt jggtd	}dgdgdgdgdgdgg}|	|}t
|| dS )zJCheck behavior of unknown_value and encoded_missing_value with infrequent.rE   r   rF   r   r   rU   r   r   r<   r   r   )r!   r   r  r  r#  r$  r   r   N)r   r   r   rf   rA   r   r+   r	   r*  r/   r   )r+  r  r  r  r   r   r   r   .test_ordinal_encoder_infrequent_custom_mapping  s     2(
r  c                 C   s   t jdgd dgd  dgd  dgd  gtd	j}tf i | d
dd|}td
dd|}dgdgdgdgdgg}t|||| dS )zMAll categories are considered frequent have same encoding as default encoder.rE   r   rF   r   r   rU   r   r   r<   r   r6   r   r$  Nr   r   rf   rA   r   r+   r   r/   )r  r+  Zadjusted_encoderZdefault_encoderr  r   r   r   !test_ordinal_encoder_all_frequent  s&    	*
r  d   c                 C   s   t jdgd dgd  dgd  dgd  gtd	j}tf i | d
dd|}dgdgdgdgdgg}t||dgdgdgdgdgg dS )zAWhen all categories are infrequent, they are all encoded as zero.rE   r   rF   r   r   rU   r   r   r<   r   r6   r   r$  r   Nr  )r  r+  r  r  r   r   r   #test_ordinal_encoder_all_infrequent  s    	*
r  c                  C   s   t jt jgd dgd  dgd  dg dg gtdj} td	d
| }t jdddt jggtdj}||}t|dgdgdgt jgg dS )z5Check behavior when missing value appears frequently.r   rf  rU   r   r   r  deerr<   r   r  r   r   r   N	r   r   r   rf   rA   r   r+   r/   r   r   r  r  r   r   r   r   -test_ordinal_encoder_missing_appears_frequent
	  s    ,
r  c                  C   s   t jt jgdgd  dgd  dg dg dgd d	gd  gtd
j} tdd| }t jddgdd	gt jd	gdd	gddggtd
}||}t|ddgddgt jdgddgddgg dS )z7Check behavior when missing value appears infrequently.rf  rU   r   r   r  r  redr   greenr<   r%   )r   r   r   r   Nr  r  r   r   r   /test_ordinal_encoder_missing_appears_infrequent	  s$    &

r  c                 C   sb   t jdgdgdggtd}| g dgd}tt || W d   n1 sT0    Y  dS )a!  Check that we raise a `NotFittedError` by calling transform before fit with
    the encoders.

    One could expect that the passing the `categories` argument to the encoder
    would make it stateless. However, `fit` is making a couple of check, such as the
    position of `np.nan`.
    rH   rI   r  r<   r  rs   N)r   r   rf   r,   r-   r   r/   )r   r   r  r   r   r   test_encoder_not_fitted3	  s    	r  )r   r	  numpyr   r,   Zscipyr   Zsklearn.exceptionsr   Zsklearn.preprocessingr   r   Zsklearn.utils._missingr   Zsklearn.utils._testingr   r   r	   Zsklearn.utils.fixesr
   r    markZparametrizer4   r9   r  Zfloat32r   rC   rP   rd   rg   rr   rv   ry   r   rf   r   ri  r   r   r   r   r   r   r   r   r   Zstr_r   rA   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r  r  r  r  r.  r1  r6  r9  r:  r;  r=  r@  rD  rF  rK  rN  rW  rZ  r[  r_  rb  re  rl  rt  ry  rz  r{  r~  r  r  r  r  r  r  r  r  r  r  r  r  r  r@   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r   r   r   <module>   s  


<


/*


 &&* !
&1
	

	
		





$








$[A



%
$

		"

!$$0