Merge pull request #6 from LAMDA-NJU/dev

[MNT] Enhance the robustness of generating rkme spec
3 years ago · 1bb6410d33
--- a/learnware/init.py
+++ b/learnware/init.py
@@ -1,4 +1,4 @@
 __version__ = "0.1.0.99"
 __version__ = "0.1.0.dev"

 import os
 from .logger import get_module_logger
--- a/learnware/specification/rkme.py
+++ b/learnware/specification/rkme.py
@@ -104,12 +104,17 @@ class RKMEStatSpecification(BaseStatSpecification):
        Z_shape = tuple([K] + list(X_shape)[1:])
        X = X.reshape(self.num_points, -1)

        # fill np.nan
        X_nan = np.isnan(X)
        if X_nan.max() == 1:
        # Check data values
        X[np.isinf(X) | np.isneginf(X) | np.isposinf(X) | np.isneginf(X)] = np.nan
        if np.any(np.isnan(X)):
            for col in range(X.shape[1]):
                col_mean = np.nanmean(X[:, col])
                X[:, col] = np.where(X_nan[:, col], col_mean, X[:, col])
                is_nan = np.isnan(X[:, col])
                if np.any(is_nan):
                    if np.all(is_nan):
                        raise ValueError(f"All values in column {col} are exceptional, e.g., NaN and Inf.")
                    # Fill np.nan with np.nanmean
                    col_mean = np.nanmean(X[:, col])
                    X[:, col] = np.where(is_nan, col_mean, X[:, col])

        if not reduce:
            self.z = X.reshape(X_shape)
@@ -433,7 +438,6 @@ def choose_device(cuda_idx=-1):
    """
    if cuda_idx != -1:
        device = torch.device(f"cuda:{cuda_idx}" if torch.cuda.is_available() else "cpu")
        # device = torch.device(f"cuda:{cuda_idx}")
    else:
        device = torch.device("cpu")
    return device
--- a/learnware/specification/utils.py
+++ b/learnware/specification/utils.py
@@ -1,74 +1,116 @@
 import torch
 import numpy as np
 import pandas as pd
 from typing import Union

 from .base import BaseStatSpecification
 from .rkme import RKMEStatSpecification
 from ..config import C


 def convert_to_numpy(data: Union[np.ndarray, pd.DataFrame, torch.Tensor]):
    """Return ``data`` as a NumPy array.

    Parameters
    ----------
    data : np.ndarray, pd.DataFrame, or torch.Tensor
        The container to convert.

    Returns
    -------
    np.ndarray
        ``data`` itself when it is already an ndarray, ``data.to_numpy()``
        for a DataFrame, and ``data.detach().cpu().numpy()`` for a Tensor.

    Raises
    ------
    TypeError
        If ``data`` is none of the three supported container types.
    """
    # Already an ndarray: hand the same object back untouched.
    if isinstance(data, np.ndarray):
        return data

    if isinstance(data, pd.DataFrame):
        return data.to_numpy()

    if not isinstance(data, torch.Tensor):
        raise TypeError("Unsupported data format. Please provide a NumPy array, a Pandas DataFrame, or a PyTorch Tensor.")

    # Drop the autograd graph and move to host memory before handing the
    # tensor to NumPy.
    host_tensor = data.detach().cpu()
    return host_tensor.numpy()


 def generate_rkme_spec(
    X: Union[np.ndarray, pd.DataFrame, torch.Tensor],
    gamma: float = 0.1,
    K: int = 100,
    reduced_set_size: int = 100,
    step_size: float = 0.1,
    steps: int = 3,
    nonnegative_beta: bool = True,
    reduce: bool = True,
    cuda_idx: Union[int, None] = None,
 ) -> RKMEStatSpecification:
    """
    Interface for users to generate Reduced Kernel Mean Embedding (RKME) specification.
    Return a RKMEStatSpecification object, use .save() method to save as json file.

    Parameters
    ----------
    X : np.ndarray, pd.DataFrame, or torch.Tensor
        Raw data in np.ndarray, pd.DataFrame, or torch.Tensor format.
        The shape of X:
            First dimension represents the number of samples (data points).
            The remaining dimensions represent the dimensions (features) of each sample.
            For example, if X has shape (100, 3), it means there are 100 samples, and each sample has 3 features.
    gamma : float
        Bandwidth in gaussian kernel, by default 0.1.
    K : int
        Legacy name for the reduced-set size. It is kept in the signature so
        existing call sites keep working, but it is not used: the
        specification is built with ``reduced_set_size``.
    reduced_set_size : int
        Size of the constructed reduced set, by default 100. It is lowered
        (never below 1) so that ``reduced_set_size * X[0].size`` does not
        exceed ``C.max_reduced_set_size``.
    step_size : float
        Step size for gradient descent in the iterative optimization.
    steps : int
        Total rounds in the iterative optimization.
    nonnegative_beta : bool, optional
        True if weights for the reduced set are intended to be kept non-negative, by default True.
    reduce : bool, optional
        Whether shrink original data to a smaller set, by default True
    cuda_idx : int or None
        A flag indicating whether use CUDA during RKME computation. -1 indicates CUDA not used.
        None indicates that CUDA is automatically selected. An index outside
        the range of visible devices falls back to device 0.

    Returns
    -------
    RKMEStatSpecification
        A RKMEStatSpecification object
    """
    # Convert data type
    X = convert_to_numpy(X)
    X = np.ascontiguousarray(X).astype(np.float32)

    # Check reduced_set_size: cap the total number of values in the reduced set.
    sample_size = X[0].size
    max_reduced_set_size = C.max_reduced_set_size
    if reduced_set_size * sample_size > max_reduced_set_size:
        reduced_set_size = max(1, max_reduced_set_size // sample_size)

    # Check cuda_idx
    if not torch.cuda.is_available() or cuda_idx == -1:
        cuda_idx = -1
    else:
        num_cuda_devices = torch.cuda.device_count()
        if cuda_idx is None or not (0 <= cuda_idx < num_cuda_devices):
            cuda_idx = 0

    # Generate rkme spec
    # NOTE(review): the previous text generated the spec once with K and then
    # again with reduced_set_size on the same object; presumably the second
    # call overwrote the first, so only the reduced_set_size call is kept.
    rkme_spec = RKMEStatSpecification(gamma=gamma, cuda_idx=cuda_idx)
    rkme_spec.generate_stat_spec_from_data(
        X, reduced_set_size, step_size, steps, nonnegative_beta, reduce
    )
    return rkme_spec


 def generate_stat_spec(X: np.ndarray) -> BaseStatSpecification:
    """
    Interface for users to generate statistical specification.

    The intended contract is to return a StatSpecification object whose
    .save() method saves it as an npy file. This function is currently a
    placeholder: it ignores ``X`` and always returns None.

    Parameters
    ----------
    X : np.ndarray
        Raw data in np.ndarray format.
        Size of array: (n*d)

    Returns
    -------
    None
        No specification is generated yet.
    """
    return None