diff --git a/learnware/__init__.py b/learnware/__init__.py index 234e6d2..67880e8 100644 --- a/learnware/__init__.py +++ b/learnware/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.1.0.99" +__version__ = "0.1.0.dev" import os from .logger import get_module_logger diff --git a/learnware/specification/rkme.py b/learnware/specification/rkme.py index f3bddc5..2271b03 100644 --- a/learnware/specification/rkme.py +++ b/learnware/specification/rkme.py @@ -104,12 +104,17 @@ class RKMEStatSpecification(BaseStatSpecification): Z_shape = tuple([K] + list(X_shape)[1:]) X = X.reshape(self.num_points, -1) - # fill np.nan - X_nan = np.isnan(X) - if X_nan.max() == 1: + # Check data values + X[np.isinf(X) | np.isneginf(X) | np.isposinf(X) | np.isneginf(X)] = np.nan + if np.any(np.isnan(X)): for col in range(X.shape[1]): - col_mean = np.nanmean(X[:, col]) - X[:, col] = np.where(X_nan[:, col], col_mean, X[:, col]) + is_nan = np.isnan(X[:, col]) + if np.any(is_nan): + if np.all(is_nan): + raise ValueError(f"All values in column {col} are exceptional, e.g., NaN and Inf.") + # Fill np.nan with np.nanmean + col_mean = np.nanmean(X[:, col]) + X[:, col] = np.where(is_nan, col_mean, X[:, col]) if not reduce: self.z = X.reshape(X_shape) @@ -433,7 +438,6 @@ def choose_device(cuda_idx=-1): """ if cuda_idx != -1: device = torch.device(f"cuda:{cuda_idx}" if torch.cuda.is_available() else "cpu") - # device = torch.device(f"cuda:{cuda_idx}") else: device = torch.device("cpu") return device diff --git a/learnware/specification/utils.py b/learnware/specification/utils.py index 4614557..1fbd1fd 100644 --- a/learnware/specification/utils.py +++ b/learnware/specification/utils.py @@ -1,74 +1,116 @@ +import torch import numpy as np +import pandas as pd +from typing import Union from .base import BaseStatSpecification from .rkme import RKMEStatSpecification from ..config import C +def convert_to_numpy(data: Union[np.ndarray, pd.DataFrame, torch.Tensor]): + """Convert data to np.ndarray + + Parameters + 
---------- + data : np.ndarray, pd.DataFrame, or torch.Tensor + The input data that needs to be converted to a NumPy array. + + Returns + ------- + np.ndarray + The data converted to a NumPy array. + """ + if isinstance(data, np.ndarray): + return data + elif isinstance(data, pd.DataFrame): + return data.to_numpy() + elif isinstance(data, torch.Tensor): + return data.detach().cpu().numpy() + else: + raise TypeError("Unsupported data format. Please provide a NumPy array, a Pandas DataFrame, or a PyTorch Tensor.") + + def generate_rkme_spec( - X: np.ndarray, + X: Union[np.ndarray, pd.DataFrame, torch.Tensor], gamma: float = 0.1, - K: int = 100, + reduced_set_size: int = 100, step_size: float = 0.1, steps: int = 3, nonnegative_beta: bool = True, reduce: bool = True, - cuda_idx: int = -1, + cuda_idx: int = None, ) -> RKMEStatSpecification: """ - Interface for users to generate Reduced Kernel Mean Embedding (RKME) specification. - Return a RKMEStatSpecification object, use .save() method to save as json file. - + Interface for users to generate Reduced Kernel Mean Embedding (RKME) specification. + Return a RKMEStatSpecification object, use .save() method to save as json file. Parameters ---------- - X : np.ndarray - Raw data in np.ndarray format. - Size of array: (n*d) + X : np.ndarray, pd.DataFrame, or torch.Tensor + Raw data in np.ndarray, pd.DataFrame, or torch.Tensor format. + The shape of X: + First dimension represents the number of samples (data points). + The remaining dimensions represent the dimensions (features) of each sample. + For example, if X has shape (100, 3), it means there are 100 samples, and each sample has 3 features. gamma : float - Bandwidth in gaussian kernel, by default 0.1. - K : int - Size of the construced reduced set. + Bandwidth in gaussian kernel, by default 0.1. + reduced_set_size : int + Size of the constructed reduced set. step_size : float - Step size for gradient descent in the iterative optimization. 
+ Step size for gradient descent in the iterative optimization. steps : int - Total rounds in the iterative optimization. + Total rounds in the iterative optimization. nonnegative_beta : bool, optional - True if weights for the reduced set are intended to be kept non-negative, by default False. + True if weights for the reduced set are intended to be kept non-negative, by default True. reduce : bool, optional - Whether shrink original data to a smaller set, by default True + Whether to shrink the original data to a smaller set, by default True cuda_idx : int - A flag indicating whether use CUDA during RKME computation. -1 indicates CUDA not used. + A flag indicating whether to use CUDA during RKME computation. -1 indicates that CUDA is not used. + None indicates that CUDA is automatically selected. Returns ------- RKMEStatSpecification - A RKMEStatSpecification object + A RKMEStatSpecification object """ + # Convert data type + X = convert_to_numpy(X) X = np.ascontiguousarray(X).astype(np.float32) + + # Check reduced_set_size max_reduced_set_size = C.max_reduced_set_size - if K * X[0].size > max_reduced_set_size: - K = max(1, max_reduced_set_size // X[0].size) + if reduced_set_size * X[0].size > max_reduced_set_size: + reduced_set_size = max(1, max_reduced_set_size // X[0].size) + + # Check cuda_idx + if not torch.cuda.is_available() or cuda_idx == -1: + cuda_idx = -1 + else: + num_cuda_devices = torch.cuda.device_count() + if cuda_idx is None or not (cuda_idx >= 0 and cuda_idx < num_cuda_devices): + cuda_idx = 0 + + # Generate rkme spec rkme_spec = RKMEStatSpecification(gamma=gamma, cuda_idx=cuda_idx) - rkme_spec.generate_stat_spec_from_data(X, K, step_size, steps, nonnegative_beta, reduce) + rkme_spec.generate_stat_spec_from_data(X, reduced_set_size, step_size, steps, nonnegative_beta, reduce) return rkme_spec def generate_stat_spec(X: np.ndarray) -> BaseStatSpecification: """ - Interface for users to generate statistical specification. 
- Return a StatSpecification object, use .save() method to save as npy file. - + Interface for users to generate statistical specification. + Return a StatSpecification object, use .save() method to save as npy file. Parameters ---------- X : np.ndarray - Raw data in np.ndarray format. - Size of array: (n*d) + Raw data in np.ndarray format. + Size of array: (n*d) Returns ------- StatSpecification - A StatSpecification object + A StatSpecification object """ - return None + return None \ No newline at end of file