a
    hJ                     @   s   d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	Z
ddlZddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ eeedddZeee
jdddZdd ZdddZdddZdddZdS )z9Implementation of ARFF parsers: via LIAC-ARFF and pandas.    N)OrderedDict)	Generator)List   )_arff)ArffSparseDataType)chunk_generatorget_chunk_n_rows)check_pandas_support)	pd_fillna)	arff_datainclude_columnsreturnc                 C   s   t  t  t  f}dd t|D }t| d | d | d D ]@\}}}||v r:|d | |d | |d ||  q:|S )a  Obtains several columns from sparse ARFF representation. Additionally,
    the column indices are re-labelled, given the columns that are not
    included. (e.g., when including [1, 2, 3], the columns will be relabelled
    to [0, 1, 2]).

    Parameters
    ----------
    arff_data : tuple
        A tuple of three lists of equal size; first list indicating the value,
        second the x coordinate and the third the y coordinate.

    include_columns : list
        A list of columns to include.

    Returns
    -------
    arff_data_new : tuple
        Subset of arff data with only the include columns indicated by the
        include_columns argument.
    c                 S   s   i | ]\}}||qS  r   .0Z	array_idxZ
column_idxr   r   Y/var/www/html/assistant/venv/lib/python3.9/site-packages/sklearn/datasets/_arff_parser.py
<dictcomp>.   s   z)_split_sparse_columns.<locals>.<dictcomp>r      r   )list	enumeratezipappend)r   r   Zarff_data_newreindexed_columnsvalrow_idxcol_idxr   r   r   _split_sparse_columns   s    "r   c           	      C   s~   t | d d }|t|f}dd t|D }tj|tjd}t| d | d | d D ]"\}}}||v rV||||| f< qV|S )Nr   c                 S   s   i | ]\}}||qS r   r   r   r   r   r   r   @   s   z)_sparse_data_to_array.<locals>.<dictcomp>dtyper   r   )maxlenr   npemptyfloat64r   )	r   r   num_obsZy_shaper   yr   r   r   r   r   r   _sparse_data_to_array9   s    "r'   c                 C   sD   | | }t |dkr| | }nt |dkr8| |d  }nd}||fS )a  Post process a dataframe to select the desired columns in `X` and `y`.

    Parameters
    ----------
    frame : dataframe
        The dataframe to split into `X` and `y`.

    feature_names : list of str
        The list of feature names to populate `X`.

    target_names : list of str
        The list of target names to populate `y`.

    Returns
    -------
    X : dataframe
        The dataframe containing the features.

    y : {series, dataframe} or None
        The series or dataframe containing the target.
    r   r   r   N)r!   )frameZfeature_namesZtarget_namesXr&   r   r   r   _post_process_frameK   s    
r*   c           "         s  dd }|| }|dkrt jnt j}|dk }	t j|||	d}
|| fdd|
d D  |dkrtd	}t|
d }t| }t|
d
 }|j	|g|dd}|j
dd }t|}fdd|D }|| g}t|
d
 |D ]}||j	||dd|  qt|dkr,|d |d j|d< |j|dd}t||}~~i }|jD ]P}| d }| dkrzd||< n&| dkrd||< n|j| ||< qR||}t|||\}n|
d
 }fdd|D }fdd|D }t|trt|du rtd|d dkr d}n|d |d  }tjtj|d|d }|j| }|dd|f }|dd|f nt|t rt!||}t"|d d }|t|f} t#j$j%|d |d |d ff| tj&d!}|' }t(||ntd"t)|  fd#d$|D }!|!sn<t*|!r8t+ fd%dt,|D nt-|!rJtd&j.d dkrfd'nj.d dkrzd|dkr||dfS |d fS )(a  ARFF parser using the LIAC-ARFF library coded purely in Python.

    This parser is quite slow but consumes a generator. Currently it is needed
    to parse sparse datasets. For dense datasets, it is recommended to instead
    use the pandas-based parser, although it does not always handles the
    dtypes exactly the same.

    Parameters
    ----------
    gzip_file : GzipFile instance
        The file compressed to be read.

    output_arrays_type : {"numpy", "sparse", "pandas"}
        The type of the arrays that will be returned. The possibilities ara:

        - `"numpy"`: both `X` and `y` will be NumPy arrays;
        - `"sparse"`: `X` will be sparse matrix and `y` will be a NumPy array;
        - `"pandas"`: `X` will be a pandas DataFrame and `y` will be either a
          pandas Series or DataFrame.

    columns_info : dict
        The information provided by OpenML regarding the columns of the ARFF
        file.

    feature_names_to_select : list of str
        A list of the feature names to be selected.

    target_names_to_select : list of str
        A list of the target names to be selected.

    Returns
    -------
    X : {ndarray, sparse matrix, dataframe}
        The data matrix.

    y : {ndarray, dataframe, series}
        The target.

    frame : dataframe or None
        A dataframe containing both `X` and `y`. `None` if
        `output_array_type != "pandas"`.

    categories : list of str or None
        The names of the features that are categorical. `None` if
        `output_array_type == "pandas"`.
    c                 s   s   | D ]}| dV  qd S )Nutf-8)decode)	gzip_fileliner   r   r   _io_to_generator   s    z+_liac_arff_parser.<locals>._io_to_generatorsparsepandas)return_typeencode_nominalc                    s(   i | ] \}}t |tr| v r||qS r   )
isinstancer   )r   namecatcolumns_to_selectr   r   r      s   z%_liac_arff_parser.<locals>.<dictcomp>
attributeszfetch_openml with as_frame=TruedataF)columnscopyT)deepc                    s   g | ]}| v r|qS r   r   r   colr7   r   r   
<listcomp>       z%_liac_arff_parser.<locals>.<listcomp>r   r   r   )Zignore_index	data_typeintegerInt64nominalcategoryc                    s   g | ]}t  | d  qS indexintr   col_nameopenml_columns_infor   r   r@      s   c                    s   g | ]}t  | d  qS rG   rI   rK   rM   r   r   r@      s   Nz6shape must be provided when arr['data'] is a Generatorr$   )r   count)shaper   z-Unexpected type for data obtained from arff: c                    s   h | ]}| v qS r   r   rK   )
categoriesr   r   	<setcomp>  s   z$_liac_arff_parser.<locals>.<setcomp>c              
      sJ   g | ]B\}}t t j |d ddd||d f jtddqS )Or   Nr   F)r<   )r"   ZtakeZasarraypopastyperJ   )r   irL   )rR   r&   r   r   r@     s
    zAMix of nominal and non-nominal targets is not currently supported)rO   )/r   ZCOOZ	DENSE_GENloadr
   r   r   keysnextZ	DataFrameZmemory_usagesumr	   r   r   r!   rV   dtypesconcatr   r;   lowerr*   r4   r   
ValueErrorr"   Zfromiter	itertoolschainfrom_iterableZreshapetupler   r    spr0   Z
coo_matrixr$   Ztocsrr'   typeallZhstackr   anyrQ   )"r-   output_arrays_typerN   feature_names_to_selecttarget_names_to_selectrQ   r/   streamr2   r3   Zarff_containerpdZcolumns_infoZcolumns_names	first_rowZfirst_dfZ	row_bytes	chunksizecolumns_to_keepdfsr:   r(   r\   r5   column_dtyper)   r   Zfeature_indices_to_selectZtarget_indices_to_selectrP   Zarff_data_Xr%   ZX_shapeZis_classificationr   )rR   r8   rN   r&   r   _liac_arff_parserk   s    7
















	

rr   c              
      s  ddl | D ]}|d dr q*qi |D ]:}|| d }| dkrXd|< q2| dkr2d	|< q2fd
dt|D }	dddgddddd|	d	}
i |
|pi }j| fi |}zdd |D |_W n4 ty } zj	d|W Y d}~n
d}~0 0 ||   fdd|jD }|| }t
dfdd}fdd|j D }|D ]}|| j|||< q`t|||\}}|dkr|||dfS | |  }}fdd|j D }||d|fS )a^  ARFF parser using `pandas.read_csv`.

    This parser uses the metadata fetched directly from OpenML and skips the metadata
    headers of ARFF file itself. The data is loaded as a CSV file.

    Parameters
    ----------
    gzip_file : GzipFile instance
        The GZip compressed file with the ARFF formatted payload.

    output_arrays_type : {"numpy", "sparse", "pandas"}
        The type of the arrays that will be returned. The possibilities are:

        - `"numpy"`: both `X` and `y` will be NumPy arrays;
        - `"sparse"`: `X` will be sparse matrix and `y` will be a NumPy array;
        - `"pandas"`: `X` will be a pandas DataFrame and `y` will be either a
          pandas Series or DataFrame.

    openml_columns_info : dict
        The information provided by OpenML regarding the columns of the ARFF
        file.

    feature_names_to_select : list of str
        A list of the feature names to be selected to build `X`.

    target_names_to_select : list of str
        A list of the target names to be selected to build `y`.

    read_csv_kwargs : dict, default=None
        Keyword arguments to pass to `pandas.read_csv`. It allows to overwrite
        the default options.

    Returns
    -------
    X : {ndarray, sparse matrix, dataframe}
        The data matrix.

    y : {ndarray, dataframe, series}
        The target.

    frame : dataframe or None
        A dataframe containing both `X` and `y`. `None` if
        `output_array_type != "pandas"`.

    categories : list of str or None
        The names of the features that are categorical. `None` if
        `output_array_type == "pandas"`.
    r   Nr+   z@datarB   rC   rD   rE   rF   c                    s"   i | ]\}}| v r| | qS r   r   )r   r   r5   )r\   r   r   r     s   z'_pandas_arff_parser.<locals>.<dictcomp>F?%"T\)	headerZ	index_colZ	na_valuesZkeep_default_nacomment	quotecharskipinitialspace
escapecharr   c                 S   s   g | ]}|qS r   r   )r   r5   r   r   r   r@     rA   z'_pandas_arff_parser.<locals>.<listcomp>zwThe number of columns provided by OpenML does not match the number of columns inferred by pandas when reading the file.c                    s   g | ]}| v r|qS r   r   r>   r7   r   r   r@     rA   z^'(?P<contents>.*)'$c                    s"   t  | }|d u r| S |dS )Ncontents)researchgroup)Zinput_stringmatch)single_quote_patternr   r   strip_single_quotes  s    z0_pandas_arff_parser.<locals>.strip_single_quotesc                    s    g | ]\}}t | jr|qS r   )r4   CategoricalDtyper   r5   r   rl   r   r   r@     s   r1   c                    s(   i | ] \}}t | jr||j qS r   )r4   r   rR   tolistr   r   r   r   r     s   )r1   r,   r^   
startswithr   Zread_csvr;   r_   errorsZParserErrorr}   compiler\   itemsr6   Zrename_categoriesr*   Zto_numpy)r-   rh   rN   ri   rj   read_csv_kwargsr.   r5   rq   Zdtypes_positionalZdefault_read_csv_kwargsr(   excro   r   Zcategorical_columnsr?   r)   r&   rR   r   )r8   r\   rl   r   r   _pandas_arff_parser7  sh    8






r   c                 C   sH   |dkrt | |||||S |dkr4t| |||||S td| ddS )a6  Load a compressed ARFF file using a given parser.

    Parameters
    ----------
    gzip_file : GzipFile instance
        The file compressed to be read.

    parser : {"pandas", "liac-arff"}
        The parser used to parse the ARFF file. "pandas" is recommended
        but only supports loading dense datasets.

    output_type : {"numpy", "sparse", "pandas"}
        The type of the arrays that will be returned. The possibilities ara:

        - `"numpy"`: both `X` and `y` will be NumPy arrays;
        - `"sparse"`: `X` will be sparse matrix and `y` will be a NumPy array;
        - `"pandas"`: `X` will be a pandas DataFrame and `y` will be either a
          pandas Series or DataFrame.

    openml_columns_info : dict
        The information provided by OpenML regarding the columns of the ARFF
        file.

    feature_names_to_select : list of str
        A list of the feature names to be selected.

    target_names_to_select : list of str
        A list of the target names to be selected.

    read_csv_kwargs : dict, default=None
        Keyword arguments to pass to `pandas.read_csv`. It allows to overwrite
        the default options.

    Returns
    -------
    X : {ndarray, sparse matrix, dataframe}
        The data matrix.

    y : {ndarray, dataframe, series}
        The target.

    frame : dataframe or None
        A dataframe containing both `X` and `y`. `None` if
        `output_array_type != "pandas"`.

    categories : list of str or None
        The names of the features that are categorical. `None` if
        `output_array_type == "pandas"`.
    z	liac-arffr1   zUnknown parser: 'z%'. Should be 'liac-arff' or 'pandas'.N)rr   r   r_   )r-   parseroutput_typerN   ri   rj   rQ   r   r   r   r   load_arff_from_gzip_file  s*    ;	
r   )N)N)NN)__doc__r`   r}   collectionsr   collections.abcr   typingr   numpyr"   Zscipyrd   Z	externalsr   Zexternals._arffr   Zutils._chunkingr   r	   Zutils._optional_dependenciesr
   Zutils.fixesr   r   Zndarrayr'   r*   rr   r   r   r   r   r   r   <module>   s8   $& 
 S 
    