a
    ­Àh›1  ã                   @   sÔ   d Z ddlZddlZddlZddlmZ zddlZW n e	yN   e
dƒ Y n0 dd„ Zd"d	d
„Zdd„ ZG dd„ dƒZG dd„ deƒZd#dd„Zd$dd„ZG dd„ deƒZdd„ Zdd„ Zd%dd„Zd&d d!„ZdS )'zO
This contrib module contains a few routines useful to do clustering variants.
é    N)Ú
ThreadPoolz2scipy not accessible, Python k-means will not workc                  O   s   d S ©N© )ÚargÚkwargsr   r   úT/var/www/html/assistant/venv/lib/python3.9/site-packages/faiss/contrib/clustering.pyÚ	print_nop   s    r   Té   c                 K   st  | j d }| dd¡}|rtnt}|d| j › d|› d|› ƒ |dƒ tj||f|dd	œ|¤Ž}	|	 | ¡ |	jg}
|ƒ  |	j}|d
ƒ t	 	¡ }|	 
| ¡\}}tj||d}|dt	 	¡ | d›dt|ƒ› dt|ƒ› ƒ | ¡ }~	|st |d ¡| | }|dd… |dd…  }ndt |¡}|| |d  }|dd…  |dd… 8  < t|ƒ|ks^J ‚|dt|ƒ› dt|ƒ› ƒ d}g }t	 	¡ }t|ƒD ]¸}t|| ƒ}|dt	 	¡ | d›d|› d|› d|› d	ddd |||  }|||… }t || |k¡sJ ‚tj||fi |¤Ž}	| | }|	 |¡ |
 |	j¡ | |	j¡ ~	|}q’|dt	 	¡ | d›dƒ t |¡|
fS )a=  
    perform 2-level clustering on a training set xt
    nc1 and nc2 are the number of clusters at each level, the final number of
    clusters is nc2. Additional arguments are passed to the Kmeans object.

    Rebalance allocates the number of sub-clusters depending on the number of
    first-level assignment.
    é   ÚverboseFz2-level clustering of z nb 1st level clusters = z total zperform coarse trainingiÐ  )ÚniterZmax_points_per_centroidzassigning the training set©Z	minlengthzdone in z.2fz s. Sizes of clusters ú-Néÿÿÿÿznb 2nd-level centroids r   ú[z s] training sub-cluster ú/z nc2=úÚ T©ÚendÚflushz s)ÚshapeÚgetÚprintr   ÚfaissZKmeansÚtrainÚiteration_statsÚ	centroidsÚtimeÚassignÚnpÚbincountÚminÚmaxZargsortÚarangeZcumsumÚsumÚrangeÚintÚallÚappendZvstack)ÚxtÚnc1Znc2Z	rebalanceZclustering_niterÚargsÚdr   ÚlogÚkmr   Z
centroids1Út0Ú_Zassign1ÚbcÚoÚccZall_nc2Zbc_sumZi0Úc2Úc1Úi1ZsubsetZxtsubr   r   r   Útwo_level_clustering   sd    	
ÿþý
,
2
r8   c                 K   sâ   t  | ¡} t| t jƒrht| j ¡ ƒD ]$}| j |¡}| |¡ | 	|¡}q$t
| j|fi |¤Ž d| _dS t| t jƒsxJ ‚| jt jksˆJ ‚tt | j¡ƒ}td|ƒ t||| jfi |¤Ž\}}| j |¡ | j |¡ |  |¡ dS )zJ
    Applies 2-level clustering to an index_ivf embedded in an index.
    TNz
REBALANCE=)r   Zdowncast_indexÚ
isinstanceZIndexPreTransformr&   ÚchainÚsizeÚatr   ÚapplyÚtrain_ivf_index_with_2levelÚindexZ
is_trainedZIndexIVFZmetric_typeZ	METRIC_L2r'   r    ÚsqrtZnlistr   r8   Z	quantizerÚadd)r?   r*   r,   ÚiÚvtr+   r   r1   r   r   r   r>   _   s"    


r>   c                   @   sB   e Zd ZdZdd„ Zdd„ Zdd„ Zdd	„ Zd
d„ Zddd„Z	dS )ÚDatasetAssignú†Wrapper for a matrix that offers a function to assign the vectors
    to centroids. All other implementations offer the same interfacec                 C   s   t j|dd| _d S ©NÚfloat32©Zdtype)r    ÚascontiguousarrayÚx©ÚselfrJ   r   r   r   Ú__init__†   s    zDatasetAssign.__init__c                 C   s   | j jd S )Nr   ©rJ   r   ©rL   r   r   r   Úcount‰   s    zDatasetAssign.countc                 C   s   | j jd S ©Nr
   rN   rO   r   r   r   ÚdimŒ   s    zDatasetAssign.dimc                 C   s
   | j | S r   )rJ   ©rL   Úindicesr   r   r   Ú
get_subset   s    zDatasetAssign.get_subsetc                 C   s   t  | j|d¡S rQ   )r   ZknnrJ   ©rL   r   r   r   r   Úperform_search’   s    zDatasetAssign.perform_searchNc                 C   s„   |   |¡\}}| ¡ }| ¡ }|j\}}tj||fdd}|d u rVtj ||| j¡ n$tj |||d d …tjf | j ¡ |||fS rF   )	rW   Úravelr   r    ZzerosrA   r<   rJ   Znewaxis)rL   r   ÚweightsÚDÚIÚncr-   Úsum_per_centroidr   r   r   Ú	assign_to•   s    
$zDatasetAssign.assign_to)N)
Ú__name__Ú
__module__Ú__qualname__Ú__doc__rM   rP   rR   rU   rW   r^   r   r   r   r   rD   ‚   s   rD   c                   @   s"   e Zd ZdZddd„Zdd„ ZdS )	ÚDatasetAssignGPUz GPU version of the previous Fc                 C   sJ   t  | |¡ t |jd ¡}|dkr:t t ¡ ||¡| _nt |¡| _d S )Nr
   r   )	rD   rM   r   ZIndexFlatL2r   Zindex_cpu_to_gpuZStandardGpuResourcesr?   Zindex_cpu_to_all_gpus)rL   rJ   Zgpu_idr   r?   r   r   r   rM   §   s    þzDatasetAssignGPU.__init__c                 C   s&   | j  ¡  | j  |¡ | j  | jd¡S rQ   )r?   ÚresetrA   ÚsearchrJ   rV   r   r   r   rW   ²   s    
zDatasetAssignGPU.perform_searchN)F)r_   r`   ra   rb   rM   rW   r   r   r   r   rc   ¤   s   
rc   c           	      C   s   | j d }|j d }|du r*|d  d¡}|du rHt |  d¡ d¡¡}|d|  |j  }|jdd}| ¡ |t |¡|   | ¡  }||fS )z assignment function for xq is sparse, xb is dense
    uses a matrix multiplication. The squared norms can be provided if
    available.
    r   Né   r
   )Zaxis)	r   r%   r    ÚarrayÚpowerÚTZargminrX   r$   )	ÚxqÚxbÚxq_normsÚxb_normsÚnqÚnbZd2r[   rZ   r   r   r   Úsparse_assign_to_dense¸   s    

"rp   é @  c           
   	      sÄ   ˆj d }ˆj d ‰tj|dd‰ ˆ  tj¡ tj|td ‰ˆdu rTˆd  d¡‰‡ ‡‡‡‡‡‡‡‡f	dd„}|dksˆ|dksˆ|ˆkr tt	|t
d|ˆƒƒƒ nt|ƒ}	|	 	|t
d|ˆƒ¡ ˆ ˆfS )	zÙ
    decomposes the sparse_assign_to_dense function into blocks to avoid a
    possible memory blow up. Can be run in multithreaded mode, because scipy's
    sparse-dense matrix multiplication is single-threaded.
    r   rG   rH   Nrf   r
   c           	   	      sè   ˆ| | ˆ … }ˆ| | ˆ … }ˆ | | ˆ … }ˆd u rPt  | d¡ d¡¡}nˆ| | ˆ … }tdˆˆƒD ]v}t|ˆ||ˆ … |ˆ||ˆ … d\}}|dkr¾||d d …< ||d d …< ql||k }|| | ||< || ||< qld S )Nrf   r
   r   )rl   rm   )r    rg   rh   r%   r&   rp   )	rB   Zxq_blockZIblockZDblockZxq_norms_blockÚjZDiZIiÚmask©	rZ   r[   Úbbsro   Úqbsrk   rm   rj   rl   r   r   Úhandle_query_blockÙ   s&    ü
z9sparse_assign_to_dense_blocks.<locals>.handle_query_block)r   r    ÚemptyÚfillÚinfÚonesr'   r%   ÚlistÚmapr&   r   )
rj   rk   rl   rm   rv   ru   Úntrn   rw   Úpoolr   rt   r   Úsparse_assign_to_dense_blocksÉ   s    

r€   c                   @   s2   e Zd ZdZdd„ Zdd„ Zdd„ Zdd	d
„ZdS )ÚDatasetAssignSparserE   c                 C   s4   |j tjjksJ ‚|| _t | d¡ d¡¡| _	d S )Nrf   r
   )
Ú	__class__ÚscipyÚsparseZ
csr_matrixrJ   r    rg   rh   r%   Úsquared_normsrK   r   r   r   rM   ý   s    zDatasetAssignSparse.__init__c                 C   s   t  | j|  ¡ ¡S r   )r    rg   rJ   ÚtodenserS   r   r   r   rU     s    zDatasetAssignSparse.get_subsetc                 C   s   t | j|| jdS )N)rl   )r€   rJ   r…   rV   r   r   r   rW     s    
ÿz"DatasetAssignSparse.perform_searchNc           	      C   sŠ   |   |¡\}}| ¡ }| ¡ }| jjd }|d u r@tj|dd}t|ƒ}tjj	||t 
|d ¡f||fd}t || j  ¡ ¡}|||fS )Nr   rG   rH   r
   )r   )rW   rX   rJ   r   r    r{   Úlenrƒ   r„   Z
csc_matrixr$   rg   r†   )	rL   r   rY   rZ   r[   Únr\   Úmr]   r   r   r   r^   	  s    þzDatasetAssignSparse.assign_to)N)r_   r`   ra   rb   rM   rU   rW   r^   r   r   r   r   r   ù   s
   r   c                 C   s&   t j|dd}t t|ƒ| t |¡¡S )NZint64rH   )r    rI   r   Úimbalance_factorr‡   Zswig_ptr)Úkr   r   r   r   rŠ     s    rŠ   c                 C   s>   | j tjkrdS dd l}t| |jƒr(dS tdt| ƒ› ƒ‚d S )NFr   TzUnknown tensor type )r‚   r    ZndarrayÚtorchr9   ZTensorÚNotImplementedErrorÚtype)rJ   rŒ   r   r   r   Úcheck_if_torch   s    r   c                 C   st  |du rt j}|j\}}d}t|ƒ}t  | dk¡d }t|ƒdkrFdS |rbddl}| |d ¡}	nt  |d ¡}	|	ddd…  d7  < |	ddd…  d8  < t|ƒdkrp|  d¡d }
d|
|
dk < |
|
 	¡  }
|
dk 	¡ }t
||jƒ}|j|||
d}t|d|… |ƒD ]V\}}|| }||	 ||< ||	 ||< | | d | |< | |  | | 8  < |d7 }q
||d… }qœ|S )z/ reassign centroids when some of them collapse Nr   rf   g      P?r
   Úfloat)r;   Úp)r    Úrandomr   r   Úwherer‡   rŒ   Z	ones_likeÚastyper%   r"   r;   ÚchoiceÚzip)Úhassignr   Úrsr‹   r-   ÚnsplitÚis_torchZempty_centsrŒ   ÚfacZprobasZnnzZnreplaceZcjsÚciÚcjÚcr   r   r   Úreassign_centroids)  s<    
rŸ   éÒ  Fc              	   C   sô  |  ¡ | ¡  }}|rtnt}	|	d||| ||f ƒ tj |¡}
tdƒ t ¡ }|
j|| dd}| 	|¡}t
|ƒ}g }|	dƒ d}g }t|ƒD ]L}t ¡ }|	ddd	d
 | |¡\}}}|	ddd	d
 |t ¡ | 7 }| ¡ }|rì| ¡ }| |¡ tj|| d}| dd¡ d¡}d||dk< |rBddl}| |¡ |j¡}|| }t|||
ƒ}|t ¡ | |t| |ƒ|dœ}|	d||d |d ||d |f ƒ | |¡ |durŽ|	d|ƒ |rÐddl}| ||¡ qŽt ||¡ qŽ|rì||fS |S dS )a0  Pure python kmeans implementation. Follows the Faiss C++ version
    quite closely, but takes a DatasetAssign instead of a training data
    matrix. Also redo is not implemented.

    For the torch implementation, the centroids are tensors (possibly on GPU),
    but the indices remain numpy on CPU.
    zAClustering %d points in %dD to %d clusters, %d iterations seed %dz
preproc...F)r;   Úreplacez  doner   Z	assigningr   Tr   zcompute centroidsr   r   r
   rG   N)Úobjr   Útime_searchrŠ   r™   zM  Iteration %d (%.2f s, search %.2f s): objective=%g imbalance=%.3f nsplit=%dr   r£   rŠ   zstoring centroids in)rP   rR   r   r   r    r’   ZRandomStater   r•   rU   r   r&   r^   r%   Úitemr)   r!   Zreshaper”   rŒ   Z
from_numpyÚtoZdevicerŸ   rŠ   Úsave)r‹   Údatar   ÚseedÚ
checkpointr   Zreturn_statsrˆ   r-   r.   r˜   r0   Úpermr   rš   r   Zt_search_totr¢   rB   Zt0sr   rZ   ZsumsÚerrr—   r›   rŒ   r™   Úsr   r   r   ÚkmeansZ  sn    	ÿ


ûýÿ

r­   )Tr	   )NN)NNrq   rq   N)N)r	   r    NTF)rb   Únumpyr    r   r   Zmultiprocessing.poolr   Zscipy.sparserƒ   ÚImportErrorr   r   r8   r>   rD   rc   rp   r€   r   rŠ   r   rŸ   r­   r   r   r   r   Ú<module>   s.   
G#"
 ÿ
0"	
1  ÿ