a
    h                     @   s,  d dl Z d dlZd dlZd dlZd dlmZ d dlmZ d dlm	Z	 d dl
mZ d dlmZ d dlZd dlZd dlmZmZ d dlmZ d d	lmZ d d
lmZmZmZmZmZmZmZm Z  d dl!m"Z"m#Z#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z*m+Z+m,Z, d dl-m.Z.m/Z/m0Z0 dZ1dZ2e1e2 Z3dd Z4dd Z5dd Z6dd Z7dd Z8dd Z9ej:;deefdd  Z<d!d" Z=d#d$ Z>d%d& Z?d'd( Z@d)d* ZAd+d, ZBd-d. ZCd/d0 ZDd1d2 ZEd3d4 ZFd5d6 ZGd7d8 ZHd9d: ZId;d< ZJd=d> ZKd?d@ ZLdAdB ZMej:jNe.dCdDdEdF ZOdGdH ZPdIdJ ZQdKdL ZRdMdN ZSdOdP ZTej:;deefdQdR ZUdSdT ZVdUdV ZWdWdX ZXdYdZ ZYd[d\ ZZej:;deefd]d^ Z[d_d` Z\dadb Z]dcdd Z^dedf Z_dgdh Z`didj Zaej:;dkejbejcejdgdldm Zedndo Zfdpdq Zgdrds Zhdtdu Zidvdw Zjdxdy Zkdzd{ Zld|d} Zmd~d Zndd Zodd Zpej:;deeefdd Zqej:;dejrejsgdd Ztej:;dee/e0dd Zuej:;dejvejsdfejwejsdfejrejrdfejsejsdfgdd Zxej:;deddeddeddgdd Zydd Zzdd Z{e,ej:;de0dd Z|ej:;deeegdd Z}ej:;deeegej:;dde~dfdedfgdd Zej:;deeeegej:;ddd dd gej:;dddgdd Zej:;deeegdd Zej:;deeegej:;dddgddddddddf	ddd dddddddf	ddd dddddddf	dddd dddd dddf	ddddddd dddf	dgddȄ Zej:;deddd̜dddΜgfee1ffddЄ Zdd҄ ZddԄ Zej:;deeeegddք Zej:;de0dd؄ Zej:;dejrejsgddۄ ZdS )    N)defaultdict)Mapping)partial)StringIO)product)assert_array_almost_equalassert_array_equal)sparse)clone)ENGLISH_STOP_WORDSCountVectorizerHashingVectorizerTfidfTransformerTfidfVectorizerstrip_accents_asciistrip_accents_unicode
strip_tags)GridSearchCVcross_val_scoretrain_test_split)Pipeline)	LinearSVC)assert_allclose_dense_sparseassert_almost_equalskip_if_32bit)_IS_WASMCSC_CONTAINERSCSR_CONTAINERS)zthe pizza pizza beer copyrightzthe pizza burger beer copyrightz!the the pizza beer beer copyrightzthe burger beer beer copyrightzthe coke burger coke copyrightzthe coke burger burger)zthe salad celeri copyrightz)the salad salad sparkling water copyrightzthe the celeri celeri copyrightzthe tomato tomato salad waterz the tomato salad water copyrightc                 C   s   t |  S N)r   uppers r"   f/var/www/html/assistant/venv/lib/python3.9/site-packages/sklearn/feature_extraction/tests/test_text.py	uppercase9   s    r$   c                 C   s   |  ddS )N   ée)replacer    r"   r"   r#   strip_eacute=   s    r(   c                 C   s   |   S r   splitr    r"   r"   r#   split_tokenizeA   s    r+   c                 C   s   dgS )NZthe_ultimate_featurer"   r    r"   r"   r#   lazy_analyzeE   s    r,   c                  C   s   d} d}t | |ksJ d} d}t | |ks0J d} d}t | |ksHJ d} d}t | |ks`J d	} d
}t | |ksxJ d} d}t | |ksJ d} d
}t | |ksJ d S )N   àáâãäåçèéêëaaaaaaceeee   ìíîïñòóôõöùúûüýiiiinooooouuuuy   إu   ا   this is à testthis is a testu   öou   ̀́̂̃ u   ȫ)r   aexpectedr"   r"   r#   test_strip_accentsI   s*    r9   c                  C   sd   d} d}t | |ksJ d} d}t | |ks0J d} d}t | |ksHJ d} d}t | |ks`J d S )	Nr-   r.   r/   r0   r1   r5   r2   r3   )r   r6   r"   r"   r#   test_to_asciim   s    r:   
Vectorizerc                 C   s   | dd  }d}g d}|||ks*J d}g d}|||ksFJ | dd  }td	}g d
}|||kstJ | td  }d}g d}|||ksJ | tdd  }d}g d}|||ksJ d S )Nasciistrip_accents:   J'ai mangé du kangourou  ce midi, c'était pas très bon.)
aimangedu	kangouroucemidietaitpastresbonz0This is a test, really.

 I met Harry yesterday.)thisistestZreallyZmetZharryZ	yesterdayfile)input'This is a test with a file-like object!)rJ   rK   rL   withrM   likeobjectpreprocessoru;   J'ai mangé du kangourou  ce midi,  c'était pas très bon.)
ZAIZMANGEZDUZ	KANGOUROUZCEZMIDIZETAITZPASZTRESZBON)	tokenizerr>   )
zj'airA   rB   rC   rD   zmidi,zc'etaitrG   rH   zbon.)build_analyzerr   r$   r+   )r;   watextr8   r"   r"   r#   test_word_analyzer_unigrams   s&    rY   c                  C   s2   t dddd } d}g d}| ||ks.J d S )Nwordunicode      analyzerr>   ngram_ranger?   )r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   zai mangezmange duzdu kangourouzkangourou cezce midiz
midi etaitz	etait paszpas tresztres bon)r   rV   )rW   rX   r8   r"   r"   r#   'test_word_analyzer_unigrams_and_bigrams   s    
rb   c                  C   s   d} |  d}tddd }tt || W d    n1 sF0    Y  tdddd }tt || W d    n1 s0    Y  d S )	Nr?   zutf-8r\   r<   )ra   encodingchar      )r`   ra   rc   )encoder   rV   pytestraisesUnicodeDecodeError)rX   Z
text_bytesrW   car"   r"   r#   test_unicode_decode_error   s    
&
rm   c                  C   s   t dddd } d}g d}| |d d |ks6J g d}| |d	d  |ksVJ d
}g d}| |d d |kszJ g d}| |d	d  |ksJ t dddd } td}g d}| |d d |ksJ d S )Nrd   r[   re   r_   u9   J'ai mangé du kangourou  ce midi, c'était pas très bon)zj'az'aizai zi mz ma   )zs tresz tres ztres bzres bozes bon1This 
	is a test, really.

 I met Harry yesterday)thihisis zs iz is)z yesteyesteresterdsterdaterdayrM   rN   r`   ra   rO   r   rV   r   cngarX   r8   r"   r"   r#   test_char_ngram_analyzer   s&    

r|   c                  C   s   t dddd } d}g d}| |d d |ks6J g d}| |d	d  |ksVJ t d
ddd } td}g d}| |d d |ksJ d S )NZchar_wbr[   re   r_   rp   )z thrq   rr   rs   z thirn   )rt   ru   rv   rw   zerday ro   rM   rx   zA test with a file-like object!)z a z teZtesestzst z tesrg   ry   rz   r"   r"   r#   test_char_wb_ngram_analyzer  s    

r~   c                  C   s   t dddd } d}g d}| |d d |ks6J g d}| |d	d  |ksVJ t d
ddd }t|}||| |ksJ d S )NrZ   r[   re   r_   rp   )zthis is testzis test reallyztest really metrf   )ztest really met harry yesterdayzthis is test really met harryz"is test really met harry yesterdayrM   rx   ry   )r{   rX   r8   Z	cnga_filerM   r"   r"   r#   test_word_ngram_analyzer  s    

r   c                  C   s   ddd} t |  }ttttttfD ]}|| }t|d}|	t
 t|trb|j| kstJ nt |j|kstJ |t
}|jd t|ksJ || }t|d}||}t||jd ks(J q(d S )Nr   r]   pizzabeer
vocabulary)setkeysdictlistiterr   r   intr   fitJUNK_FOOD_DOCS
isinstancer   vocabulary_	transformshapeleninverse_transform)vocabtermstypvvectXinvr"   r"   r#   &test_countvectorizer_custom_vocabulary6  s    






r   c                  C   sd   ddg} t dt| dfdt fg}|t}t|jd jt| ksJJ |jd t	| ks`J d S )Nr   r   countr   tfidfr]   )
r   r   r   fit_transformALL_FOOD_DOCSr   named_stepsr   r   r   )Zwhat_we_likepiper   r"   r"   r#   /test_countvectorizer_custom_vocabulary_pipelineK  s    
r   c                  C   sV   ddd} d}t jt|d& t| d}|dg W d    n1 sH0    Y  d S )Nr   r   z$Vocabulary contains repeated indicesmatchr   Zpasta_sizilianari   rj   
ValueErrorr   r   )r   msgr   r"   r"   r#   7test_countvectorizer_custom_vocabulary_repeated_indicesX  s
    

r   c                  C   sR   ddd} t jtdd& t| d}|dg W d    n1 sD0    Y  d S )Nr]   r^   r   zdoesn't contain indexr   r   Zpasta_verdurar   r   r   r"   r"   r#   0test_countvectorizer_custom_vocabulary_gap_index`  s    

r   c                  C   s   t  } | jdd |  tks"J | jdd tt |   W d    n1 sV0    Y  | jdd tt |   W d    n1 s0    Y  g d}| j|d |  t|ksJ d S )Nenglish
stop_wordsZ_bad_str_stop_Z_bad_unicode_stop_)Zsomeotherwords)r   
set_paramsget_stop_wordsr   ri   rj   r   r   )cvZstoplistr"   r"   r#   test_countvectorizer_stop_wordsg  s    &&r   c                  C   s   t jtdd& tg d} | dg W d    n1 s:0    Y  t jtdd* tddd}|g d W d    n1 s0    Y  d S )	Nzempty vocabularyr   r   foo      ?r   )max_dfr   )zto be or not to bez
and me toozand so do your   )r   r   r"   r"   r#   %test_countvectorizer_empty_vocabularyv  s    
*r   c                  C   sF   t  } | td d }| tdd  }|jd |jd ksBJ d S )Nrn   r]   )r   r   r   r   )r   ZX1X2r"   r"   r#   test_fit_countvectorizer_twice  s    r   c                  C   s>   g d} d}t |d}||  g d}| }t|| dS )zCheck `get_feature_names_out()` when a custom token pattern is passed.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/12971
    z&This is the 1st document in my corpus.z This document is the 2nd sample.zAnd this is the 3rd one.zIs this the 4th document?z'[0-9]{1,3}(?:st|nd|rd|th)\s\b(\w{2,})\btoken_pattern)documentonesampleN)r   r   get_feature_names_outr   )corpusr   
vectorizerr8   feature_names_outr"   r"   r#   )test_countvectorizer_custom_token_pattern  s    

r   c                  C   sV   g d} d}d}t |d}tjt|d ||  W d   n1 sH0    Y  dS )zCheck that we raise an error if token pattern capture several groups.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/12971
    r   z)([0-9]{1,3}(?:st|nd|rd|th))\s\b(\w{2,})\bz,More than 1 capturing group in token patternr   r   Nr   ri   rj   r   r   )r   r   err_msgr   r"   r"   r#   <test_countvectorizer_custom_token_pattern_with_several_group  s    
r   c                  C   s   g d} d}t d| d}tjt|d ||  W d    n1 sF0    Y  t & tdt ||  W d    n1 s0    Y  d S )N)ZSampleZUpperZCaseZ
VocabularyzyUpper case characters found in vocabulary while 'lowercase' is True. These entries will not be matched with any documentsT)	lowercaser   r   error)	r   ri   warnsUserWarningr   warningscatch_warningssimplefilterr   )r   messager   r"   r"   r#   'test_countvectorizer_uppercase_in_vocab  s    (
r   c                  C   sH   g dg dg dg} t ddd| }g d}||}t|| dS )	z0Check get_feature_names_out for TfidfTransformerr]   r]   r]   r]   r]   r   r]   r   r   Tl2
smooth_idfnorm)r7   cbN)r   r   r   r   )r   trZfeature_names_inr   r"   r"   r#   %test_tf_transformer_feature_names_out  s
    
r   c                  C   s   g dg dg dg} t ddd}||  }|dk s@J t|d jd	d
g d g dg dg dg} t ddd}||  }|dk sJ d S )Nr   r   r   Tr   r   r   r^   r]   Zaxisr   r   r   )r   r   toarrayallr   sumr   r   r   r"   r"   r#   test_tf_idf_smoothing  s    r   zcno floating point exceptions, see https://github.com/numpy/numpy/pull/21895#issuecomment-1311525881)reasonc                  C   s   g dg dg dg} t ddd}||  }|dk s@J t|d jd	d
g d g dg dg dg} t ddd}d}tjt|d ||   W d    n1 s0    Y  d S )Nr   r   r   Fr   r   r   r^   r]   r   r   zdivide by zeror   )	r   r   r   r   r   r   ri   r   RuntimeWarning)r   r   r   Zin_warning_messager"   r"   r#   test_tfidf_no_smoothing  s    r   c                  C   s   dgdgdgg} t ddd d}||  }|d dks<J |d |d ksPJ |d |d ksdJ |d dk stJ |d dk sJ d S )Nr]   r^   rf   TF)sublinear_tfuse_idfr   r   )r   r   r   r   r"   r"   r#   test_sublinear_tf  s    r   c                  C   s  t td d } td g}ttd }tdd}|| }t|drL| }|d|jd f dksfJ t|jd	}||fD ]}||}t|dr| }|j}|d|d
 f dksJ |d|d f dksJ |d|d f dksJ d|vsJ d|vsJ |d|d f dksJ |d|d f dks6J |d|d f dksPJ |d|d f dkszJ qzt	dd}	|	
|| }
t|	jt|jksJ |
j|t|jfksJ |	| }|jt|t|jfksJ t	ddd}|
|| }t|drJ t	dd}tt || W d    n1 sN0    Y  ttj|dddg|  t td d } tdd}|j|_||  }|jrJ t|
| || }t|| td d	}tt ||  W d    n1 s0    Y  |jddd | }d}t|}||}||ksHJ |jdd d tt |  W d    n1 s0    Y  d |_tt |  W d    n1 s0    Y  d S )!Nr]         ?r   tocsrr   r   r^   r   saladtomatowaterthe	copyrightcokeburgerr   l1r   F)r   r   idf_Tr   r   r   r<   )r>   r   r?   Z_gabbledegook_)r>   rT   Z_invalid_analyzer_type_)r   r   r   r   r   hasattrr   r   r   r   r   r   r   r   ri   rj   r   r   npr   r   r   fixed_vocabulary_r   build_preprocessorr   rV   )
train_data	test_dataZn_trainZv1Zcounts_trainZv2r   Zcounts_testr   t1r   Z
tfidf_testt2tfZt3tvZtfidf2Ztfidf_test2Zv3	processorrX   r8   resultr"   r"   r#   test_vectorizer  sv    







*



*(r  c                  C   s  d\} }}}t | |||d}|t |jj| ks6J |jj|ksFJ |jj|ksVJ |jj|ksfJ d|_d|_d|_d|_|jj| ksJ |jj|ksJ |jj|ksJ |jj|ksJ |t |jj|jksJ |jj|jksJ |jj|jksJ |jj|jksJ d S )N)r   FFF)r   r   r   r   r   T)r   r   r   _tfidfr   r   r   r   )r   r   r   r   r  r"   r"   r#   test_tfidf_vectorizer_settersi  s,    

r  c                  C   s  t  } | t}|j}|jtt| jfks.J |j| jks>J t	|j
dksRJ t	|j
dk sfJ t|j
dkszJ t|j
dk sJ t|jd D ]}ttj|d j
dd qt ddd} | t}|jtt| jfksJ |j| jksJ |j}||ksJ |d| k s J t	|j
dks6J t|j
dk sLJ t|jd D ] }ttj|d j
dd qZd S )	Nr   r   r]   r^   r   r\   r   )ra   r   )r   r   r   nnzr   r   
n_featuresdtyper   mindatamaxranger   Zlinalgr   )r   r   Z	token_nnziZ
ngrams_nnzr"   r"   r#   test_hashing_vectorizer  s,    

r  c                  C   s8  t dd} tt |   W d    n1 s20    Y  | jrFJ | t}|j\}}t	| j
|kslJ |  }t|tjsJ |jtksJ t	||ksJ tg d| t|D ]\}}|| j
|ksJ qg d}t |d} |  }tg d| | js
J t|D ] \}}|| j
|ksJ qd S )Nr   r   	r   r   celerir   r   r   Z	sparklingr   r   r   )r   ri   rj   r   r   r   r   r   r   r   r   r   r   Zndarrayr	  rR   r   	enumerateget)r   r   Z	n_samplesr  Zfeature_namesidxnamer   r"   r"   r#   test_feature_names  s6    
&



r  c                 C   s4   h d}| ddd}| t t|j|ks0J d S )N>   r   r   r   r   g333333?   )r   max_features)r   r   r   r   )r;   Zexpected_vocabularyr   r"   r"   r#   test_vectorizer_max_features  s    
r  c            	      C   s   t dd} t dd}t d d}| tjdd}|tjdd}|tjdd}|  }| }| }d| ks|J d| ksJ d| ksJ d|t| ksJ d|t| ksJ d|t| ksJ d S )Nr]   r  rf   r   r      r   )r   r   r   r   r   r  r   Zargmax)	Zcv_1Zcv_3Zcv_NoneZcounts_1Zcounts_3Zcounts_NoneZ
features_1Z
features_3Zfeatures_Noner"   r"   r#   "test_count_vectorizer_max_features  s    


r  c                  C   s   g d} t ddd}||  d|j v s0J t|j dksFJ d|_||  d|j vshJ t|j dks~J d	|_||  d|j vsJ t|j dksJ d S )
NabcZdeaZeatrd   r   r`   r   r7   rg   r   r  r]   )r   r   r   r   r   r   r   r   r"   r"   r#   test_vectorizer_max_df  s    


r!  c                  C   s   g d} t ddd}||  d|j v s0J t|j dksFJ d|_||  d|j vshJ t|j dks~J d	|_||  d|j vsJ t|j dksJ d S )
Nr  rd   r]   )r`   min_dfr7   rg   r^   r   g?)r   r   r   r   r   r"  r   r"   r"   r#   test_vectorizer_min_df)  s    


r#  c                  C   s   ddg} t ddd}||  }tg d|  tg dg dg| t ddd	d
}||  }tg dg dg| t ddd	tjd}|| }|jtjksJ d S )Naaabcabbderd   r   r  )r7   r   r   dr&   )rf   r]   r]   r   r   )r]   r^   r   r]   r]   T)r`   r   binary)r]   r]   r]   r   r   )r]   r]   r   r]   r]   )r`   r   r'  r	  )r   r   r   r   r   r   float32r	  )r   r   r   ZX_sparser"   r"   r#   test_count_binary_occurrences;  s    
r)  c                  C   s   ddg} t ddd d}|| }t|dd jdks<J t|dd	 jd	ksXJ |jtjkshJ t ddd
d d}|| }t|jdksJ |jtjksJ t ddd
d tjd}|| }|jtjksJ d S )Nr$  r%  Frd   )alternate_signr`   r   r   r]   rf   r^   T)r`   r*  r'  r   )r`   r*  r'  r   r	  )r   r   r   r  r  r	  float64)r   r   r   r"   r"   r#   test_hashed_binary_occurrencesO  s"    


r,  c                 C   s  t }|  }||}||}t|ts,J | }t||D ]6\}}tt	||}tt	|}t
|| q>t|sJ |jdksJ | }	||	}
t||
D ]\}}t
t|t| q| }||}t||D ]\}}t
t|t| qd S )NZcsr)r   r   r   r   r   rV   zipr   sortuniquer   r	   issparseformatr   Ztocsc)r;   r  r   Ztransformed_dataZinversed_dataZanalyzedocZinversed_termsr   Ztransformed_data2Zinversed_data2Zterms2Ztransformed_data3Zinversed_data3Zterms3r"   r"   r#   !test_vectorizer_inverse_transformi  s(    



r3  c                  C   s   t t } dgtt  dgtt  }t| |ddd\}}}}tdt fdt fg}dd	gd
d}t||ddd}|||	|}	t
|	| |jdksJ |jjd }
|
jdksJ d S )Nr   r]   g?r   Z	test_sizerandom_stater   svcr]   r]   r\   ZhingeZsquared_hinge)vect__ngram_range	svc__lossrf   )n_jobsr   r   )r   NOTJUNK_FOOD_DOCSr   r   r   r   r   r   r   predictr   best_score_best_estimator_r   ra   r  targetr   r   Ztarget_trainZtarget_testpipeline
parametersZgrid_searchpredZbest_vectorizerr"   r"   r#   -test_count_vectorizer_pipeline_grid_selection  s    
rE  c                  C   s   t t } dgtt  dgtt  }t| |ddd\}}}}tdt fdt fg}dd	gd
dd}t||dd}|||	|}	t
|	| |jdksJ |jjd }
|
jdksJ |
jdksJ |
jrJ d S )Nr   r]   g?r   r4  r   r6  r7  r\   )r   r   r8  )r9  Z
vect__normr:  )r;  r   r   )r   r<  r   r   r   r   r   r   r   r=  r   r>  r?  r   ra   r   r   r@  r"   r"   r#   'test_vectorizer_pipeline_grid_selection  s$    
rF  c                  C   s^   t t } dgtt  dgtt  }tdt fdt fg}t|| |dd}t|g d d S )Nr   r]   r   r6  rf   )r   r   )r   r<  r   r   r   r   r   r   )r  rA  rB  Z	cv_scoresr"   r"   r#   )test_vectorizer_pipeline_cross_validation  s
    rG  c                  C   sx   d} t  }|| g}|jdks$J td dd}|| g}|jdksJJ |j|jksZJ tt|j	t|j	 d S )Nu   Машинное обучение — обширный подраздел искусственного интеллекта, изучающий методы построения алгоритмов, способных обучаться.)r]      F)r   r*  )r]   i   )
r   r   r   r   r   r  r   r   r.  r  )r   r   Z	X_countedZX_hashedr"   r"   r#   test_vectorizer_unicode  s    rI  c                  C   sF   ddg} t | d}|t}|t}t| |  |jsBJ d S )Nr   r  r   )r   r   r   r   r   r   r   )r   r   ZX_1ZX_2r"   r"   r#   +test_tfidf_vectorizer_with_fixed_vocabulary  s    


rJ  c                  C   s   t  t ddt ddt ddt ttdttdttdtttd	tt ttdt tg} | D ]T}t	|}t
|}t||jksJ | | ksJ t|t|t qjd S )
Nr   r   T)r'  r\   ra   rS   )r`   r=   )r   r   r   r,   r   r   r(   r   pickledumpsloadstype	__class__
get_paramsr   r   )Z	instancesorigr!   copyr"   r"   r#   test_pickling_vectorizer  s,    


rT  factoryc                 C   sB   t  }| |}d}tt|}||}||}||ks>J dS )z_Tokenizers cannot be pickled
    https://github.com/scikit-learn/scikit-learn/issues/12833
    r?   N)r   rL  rN  rM  )rU  vecfunctionrX   Zroundtripped_functionr8   r  r"   r"   r#   test_pickling_built_processors  s    rX  c                  C   s   t jd} t g d}tddD ]X}t| j|ddd}t|d}t	t
|}|t |t t| |  q$d S Nr   r  d   rn   F)sizer'   r   )r   randomRandomStatearrayr  r   choicer   rL  rN  rM  r   r   r   r   )rngvocab_wordsxZ	vocab_setr   unpickled_cvr"   r"   r#   -test_countvectorizer_vocab_sets_when_pickling3  s    


rd  c                  C   s   t jd} t g d}tddD ]v}t }| j|ddd}tddD ]}|||| < qHt|d}t	t
|}|t |t t| |  q$d S rY  )r   r\  r]  r^  r  r   r_  r   rL  rN  rM  r   r   r   r   )r`  ra  rb  Z
vocab_dictr   yr   rc  r"   r"   r#   .test_countvectorizer_vocab_dicts_when_picklingO  s     


rf  c                  C   s`   t  t} t | }t|}t|}t||j	ks>J t
||  ||   d S r   )r   r   r   r   r   rL  rM  rN  rO  rP  r   r   )r   rR  r!   rS  r"   r"   r#   test_pickling_transformerl  s    

rg  c                  C   sH   t  t} t | }t }|j|_t||  ||   d S r   )	r   r   r   r   r   r   r   r   r   )r   rR  rS  r"   r"   r#   test_transformer_idf_setteru  s
    rh  c                  C   s   t dd} | t t | jdd}| j|_t|t | t  t | jdd}d}tj	t
|d | j|_W d    n1 s0    Y  d S )NTr   r   r   Fz+`idf_` cannot be set when `user_idf=False`.r   )r   r   r   r   r   r   r   r   ri   rj   r   )rR  rS  r   r"   r"   r#   test_tfidf_vectorizer_setter}  s    

rj  c                  C   st   t dd} | t t | jdd}t| j}dg|d  }tt t	|d| W d    n1 sf0    Y  d S )NTr   ri  r   r]   r   )
r   r   r   r   r   r   ri   rj   r   setattr)r   rS  Zexpected_idf_lenZinvalid_idfr"   r"   r#   %test_tfidfvectorizer_invalid_idf_attr  s    


rl  c                  C   sJ   g d} t | d}tt |g  W d    n1 s<0    Y  d S )N)r7   r   r   r7   r7   r   r   r   r"   r"   r#   test_non_unique_vocab  s    
rm  c                  C   sH   d} t }dd }tj|| d |  W d    n1 s:0    Y  d S )Nz?np.nan is an invalid document, expected byte or unicode string.c                  S   s   t  } | dtjdg d S )Nhello worldhello hello)r   r   r   nan)Zhvr"   r"   r#   func  s    z0test_hashingvectorizer_nan_in_docs.<locals>.funcr   )r   ri   rj   )r   	exceptionrq  r"   r"   r#   "test_hashingvectorizer_nan_in_docs  s
    rs  c                  C   sd   t ddd d} | jsJ | ddg }t| g d | ddg }t| g d d S )NTF)r'  r   r   rn  ro  )r]   r]   r]   r   )r   r'  r   r   r   Zravelr   )r   r   r   r"   r"   r#   test_tfidfvectorizer_binary  s    
rt  c                  C   s(   t dd} | t t| j| jj d S )NTr   )r   r   r   r   r   r  )r   r"   r"   r#   test_tfidfvectorizer_export_idf  s    

ru  c                  C   s<   t dgd} t| }| t |t |j| jks8J d S )Nr   r   )r   r
   r   r   r   )Z
vect_vocabZvect_vocab_cloner"   r"   r#   test_vectorizer_vocab_clone  s
    

rv  c                 C   s   d}|  }t jt|d |d W d    n1 s80    Y  t jt|d |d W d    n1 sp0    Y  |ddg t jt|d |d W d    n1 s0    Y  d S )NzBIterable over raw text documents expected, string object received.r   zhello world!	some textzsome other text)ri   rj   r   r   r   r   )r;   r   rV  r"   r"   r#   &test_vectorizer_string_object_as_input  s    ((rx  X_dtypec                 C   s2   t jdd| dd}t |}|j|jks.J d S N
    N  *   r	  r5  )r	   randr   r   r	  )ry  r   ZX_transr"   r"   r#   test_tfidf_transformer_type  s    r  zcsc_container, csr_containerc                 C   sZ   t jddtjdd}| |}||}t |}t |}t|| |j|jksVJ d S rz  )r	   r  r   r+  r   r   r   r1  )Zcsc_containercsr_containerr   ZX_cscX_csrZX_trans_cscZX_trans_csrr"   r"   r#   test_tfidf_transformer_sparse  s    
r  z0vectorizer_dtype, output_dtype, warning_expectedTFc                 C   s   t g d}t| d}d}|rZtjt|d ||}W d    q1 sN0    Y  n>t & t	dt ||}W d    n1 s0    Y  |j
|ksJ d S )N)numpyscipyZsklearnr	  z'dtype' should be used.r   r   )r   r^  r   ri   r   r   r   r   r   r   r	  )Zvectorizer_dtypeZoutput_dtypeZwarning_expectedr   r   Zwarning_msg_matchZX_idfr"   r"   r#   test_tfidf_vectorizer_type  s    

*
(r  rV  )r^   r]   rK  c                 C   s   | j }td| d}tjt|d | dg W d    n1 sH0    Y  tjt|d | dg W d    n1 s0    Y  t| t	rtjt|d | 
dg W d    n1 s0    Y  d S )NzInvalid value for ngram_range=z/ lower boundary larger than the upper boundary.r   zgood news everyone)ra   reescaperi   rj   r   r   r   r   r   r   )rV  Zinvalid_ranger   r"   r"   r#   $test_vectorizers_invalid_ngram_range  s    
**
r  c                 C   s&   |   }|  }|  }| |||S r   )r   build_tokenizerr   _check_stop_words_consistency)Z	estimatorr   tokenize
preprocessr"   r"   r#   r     s    r  c               	   C   s   d} d|  }t  t t fD ]b}|jg dd tjt|d |dg W d    n1 s`0    Y  |`t	|du sJ qt
 ( t
dt |dg W d    n1 s0    Y  t	|d u sJ |jg d	d tjt|d |dg W d    n1 s0    Y  d S )
Nz\['and', 'll', 've'\]z}Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens %s not in stop_words.)you'veyouyou'llANDr   r   rn  Fr   )r  r  r  Zblahr  )r   r   r   r   ri   r   r   r   Z_stop_words_idr  r   r   r   )Zlstrr   rV  r"   r"   r#   'test_vectorizer_stop_words_inconsistent'  s$    *
*r  r  c                 C   s^   | dt jd}t j}|j||_|j||_dddd}t ||}||jjksZJ dS )z
    Check that CountVectorizer._sort_features preserves the dtype of its sparse
    feature matrix.

    This test is skipped on 32bit platforms, see:
        https://github.com/scikit-learn/scikit-learn/pull/11295
    for more details.
    )rn   rn   r  r   r]   r^   )zscikit-learnrK   zgreat!N)r   int64indicesZastypeZindptrr   Z_sort_featuresr	  )r  r   ZINDICES_DTYPEr   ZXsr"   r"   r#   7test_countvectorizer_sort_features_64bit_sparse_indicesB  s    r  	Estimatorc                 C   s   ddig}|  }t |du s J | dd dgd}t |dksBJ t |d u sRJ || G d	d
 d
| }|dgd}t |dksJ | dd dgd}t |du sJ d S )NrX   rw  Tc                 S   s   | d S NrX   r"   rb  r"   r"   r#   <lambda>e      z?test_stop_word_validation_custom_preprocessor.<locals>.<lambda>and)rT   r   r   c                   @   s   e Zd Zdd ZdS )zFtest_stop_word_validation_custom_preprocessor.<locals>.CustomEstimatorc                 S   s   dd S )Nc                 S   s   | d S r  r"   r  r"   r"   r#   r  m  r  zktest_stop_word_validation_custom_preprocessor.<locals>.CustomEstimator.build_preprocessor.<locals>.<lambda>r"   )selfr"   r"   r#   r   l  s    zYtest_stop_word_validation_custom_preprocessor.<locals>.CustomEstimator.build_preprocessorN)__name__
__module____qualname__r   r"   r"   r"   r#   CustomEstimatork  s   r  r   c                 S   s   t d| S )Nz\w{1,})r  compilefindallr2  r"   r"   r#   r  s  r  )rU   r   )r  r   )r  r  rV  r  r"   r"   r#   -test_stop_word_validation_custom_preprocessor\  s    


r  zinput_type, err_type, err_msgfilenamer5   rM   z$'str' object has no attribute 'read'c                 C   sN   dg}t j||d& | dd |d| W d    n1 s@0    Y  d S )N"this is text, not file or filenamer   c                 S   s   |   S r   r)   r  r"   r"   r#   r    r  z.test_callable_analyzer_error.<locals>.<lambda>r`   rN   )ri   rj   r   )r  
input_typeZerr_typer   r  r"   r"   r#   test_callable_analyzer_errorx  s    r  r`   c                 C   s
   t | dS )Nr)openr  r"   r"   r#   r    r  r  c                 C   s   |   S r   )readr  r"   r"   r#   r    r  r  c                 C   sJ   dg}t ttf" | ||d| W d    n1 s<0    Y  d S )Nr  r  )ri   rj   FileNotFoundErrorAttributeErrorr   )r  r`   r  r  r"   r"   r#   &test_callable_analyzer_change_behavior  s    r  c                 C   sb   dd }|  d}|d tjtdd$ ||dd|g W d    n1 sT0    Y  d S )	Nc                 S   s   t dd S )Ntesting)	Exceptionr  r"   r"   r#   r`     s    z6test_callable_analyzer_reraise_error.<locals>.analyzerzfile.txtzsample content
r  r   rM   r  )joinwriteri   rj   r  r   )Ztmpdirr  r`   fr"   r"   r#   $test_callable_analyzer_reraise_error  s
    

r  zjstop_words, tokenizer, preprocessor, ngram_range, token_pattern,analyzer, unused_name, ovrd_name, ovrd_msgr  r  r7  rd   z'stop_words'
'analyzer'	!= 'word'c                 C   s   |   S r   r)   r    r"   r"   r#   r    r  z'tokenizer'c                 C   s   |   S r   r)   r    r"   r"   r#   r    r  \w+rZ   'token_pattern'zis not Nonec                 C   s   |   S r   r   r    r"   r"   r#   r    r  c                 C   s   |   S r   r  r    r"   r"   r#   r    r  z'preprocessor'zis callabler\   c                 C   s   |   S r   r  r    r"   r"   r#   r    r  z'ngram_range')	NNNr7  r  rd   r  r  r  c
                 C   sj   t }
|  }|j||||||d d|||	f }tjt|d ||
 W d    n1 s\0    Y  d S )N)r   rU   rT   ra   r   r`   z-The parameter %s will not be used since %s %sr   )r   r   ri   r   r   r   )r;   r   rU   rT   ra   r   r`   Zunused_nameZ	ovrd_nameZovrd_msgr   r   r   r"   r"   r#   test_unused_parameters_warn  s"    Yr  zVectorizer, Xr]   r^   )r   barrf   )r   Zbazc                 C   s0   |  }t |drJ || t |dr,J d S )NZn_features_in_)r   r   )r;   r   r   r"   r"   r#   test_n_features_in  s    	
r  c                  C   s:   t dd} | ddgj}| ddgj}||ks6J d S )Nr]   r  ZhelloZworld)r   r   r   )rV  Zvocab1Zvocab2r"   r"   r#   )test_tie_breaking_sample_order_invariance%  s    
r  c                  C   s.   t ddd} | dgj}|d dks*J d S )Ni@B )r^   rf   )r  ra   z22pcs efuturer   )r   r   r  )Zhashingr  r"   r"   r#   2test_nonnegative_hashing_vectorizer_result_indices.  s    r  c                 C   s   |  }t |drJ dS )z0Check that vectorizers do not define set_output.Z
set_outputN)r   )r  r}   r"   r"   r#   'test_vectorizers_do_not_have_set_output5  s    r  c                 C   s   t jddtjdd}| |}| }t |}|j|dd}t|| ||usTJ |j|dd}||u snJ t	
t t|| W d   n1 s0    Y  dS )	zJCheck the behaviour of TfidfTransformer.transform with the copy parameter.r{  r|  r}  r~  T)rS  FN)r	   r  r   r+  rS  r   r   r   r   ri   rj   AssertionError)r  r   r  ZX_csr_originalZtransformerZX_transformr"   r"   r#   test_tfidf_transformer_copy>  s    
r  r	  c                 C   s6   dd t dD }t| d|}|jj| ks2J dS )zCheck that `idf_` has the same dtype as the input data.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/30016
    c                 S   s   g | ]}t t qS r"   )struuiduuid4).0r  r"   r"   r#   
<listcomp>Z  r  z<test_tfidf_vectorizer_perserve_dtype_idf.<locals>.<listcomp>i r  N)r  r   r   r   r	  )r	  r   r   r"   r"   r#   (test_tfidf_vectorizer_perserve_dtype_idfS  s    r  )rL  r  r  r   collectionsr   collections.abcr   	functoolsr   ior   	itertoolsr   r  r   ri   Znumpy.testingr   r   r  r	   Zsklearn.baser
   Zsklearn.feature_extraction.textr   r   r   r   r   r   r   r   Zsklearn.model_selectionr   r   r   Zsklearn.pipeliner   Zsklearn.svmr   Zsklearn.utils._testingr   r   r   Zsklearn.utils.fixesr   r   r   r   r<  r   r$   r(   r+   r,   r9   r:   markZparametrizerY   rb   rm   r|   r~   r   r   r   r   r   r   r   r   r   r   r   r   r   Zxfailr   r   r  r  r  r  r  r  r!  r#  r)  r,  r3  rE  rF  rG  rI  rJ  rT  rV   r   r  rX  rd  rf  rg  rh  rj  rl  rm  rs  rt  ru  rv  rx  r(  r+  r  r  Zint32r  r  r  r  r  r  r  r  r  r  paramr  r  r  r  r  r  r  r  r  r"   r"   r"   r#   <module>   s  (
	$
=

g&G
	
$'

	





	







J 
	

