Browse Source

Merge pull request #6 from LAMDA-NJU/dev

[MNT] Enhance the robustness of generating rkme spec
tags/v0.3.2
Gene GitHub 2 years ago
parent
commit
1bb6410d33
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 81 additions and 35 deletions
  1. +1
    -1
      learnware/__init__.py
  2. +10
    -6
      learnware/specification/rkme.py
  3. +70
    -28
      learnware/specification/utils.py

+ 1
- 1
learnware/__init__.py View File

@@ -1,4 +1,4 @@
__version__ = "0.1.0.99"
__version__ = "0.1.0.dev"

import os
from .logger import get_module_logger


+ 10
- 6
learnware/specification/rkme.py View File

@@ -104,12 +104,17 @@ class RKMEStatSpecification(BaseStatSpecification):
Z_shape = tuple([K] + list(X_shape)[1:])
X = X.reshape(self.num_points, -1)

# fill np.nan
X_nan = np.isnan(X)
if X_nan.max() == 1:
# Check data values
X[np.isinf(X) | np.isneginf(X) | np.isposinf(X) | np.isneginf(X)] = np.nan
if np.any(np.isnan(X)):
for col in range(X.shape[1]):
col_mean = np.nanmean(X[:, col])
X[:, col] = np.where(X_nan[:, col], col_mean, X[:, col])
is_nan = np.isnan(X[:, col])
if np.any(is_nan):
if np.all(is_nan):
raise ValueError(f"All values in column {col} are exceptional, e.g., NaN and Inf.")
# Fill np.nan with np.nanmean
col_mean = np.nanmean(X[:, col])
X[:, col] = np.where(is_nan, col_mean, X[:, col])

if not reduce:
self.z = X.reshape(X_shape)
@@ -433,7 +438,6 @@ def choose_device(cuda_idx=-1):
"""
if cuda_idx != -1:
device = torch.device(f"cuda:{cuda_idx}" if torch.cuda.is_available() else "cpu")
# device = torch.device(f"cuda:{cuda_idx}")
else:
device = torch.device("cpu")
return device


+ 70
- 28
learnware/specification/utils.py View File

@@ -1,74 +1,116 @@
import torch
import numpy as np
import pandas as pd
from typing import Union

from .base import BaseStatSpecification
from .rkme import RKMEStatSpecification
from ..config import C


def convert_to_numpy(data: Union[np.ndarray, pd.DataFrame, torch.Tensor]):
"""Convert data to np.ndarray

Parameters
----------
data : np.ndarray, pd.DataFrame, or torch.Tensor
The input data that needs to be converted to a NumPy array.

Returns
-------
np.ndarray
The data converted to a NumPy array.
"""
if isinstance(data, np.ndarray):
return data
elif isinstance(data, pd.DataFrame):
return data.to_numpy()
elif isinstance(data, torch.Tensor):
return data.detach().cpu().numpy()
else:
raise TypeError("Unsupported data format. Please provide a NumPy array, a Pandas DataFrame, or a PyTorch Tensor.")


def generate_rkme_spec(
X: np.ndarray,
X: Union[np.ndarray, pd.DataFrame, torch.Tensor],
gamma: float = 0.1,
K: int = 100,
reduced_set_size: int = 100,
step_size: float = 0.1,
steps: int = 3,
nonnegative_beta: bool = True,
reduce: bool = True,
cuda_idx: int = -1,
cuda_idx: int = None,
) -> RKMEStatSpecification:
"""
Interface for users to generate Reduced Kernel Mean Embedding (RKME) specification.
Return a RKMEStatSpecification object, use .save() method to save as json file.

Interface for users to generate Reduced Kernel Mean Embedding (RKME) specification.
Return a RKMEStatSpecification object, use .save() method to save as json file.

Parameters
----------
X : np.ndarray
Raw data in np.ndarray format.
Size of array: (n*d)
X : np.ndarray, pd.DataFrame, or torch.Tensor
Raw data in np.ndarray, pd.DataFrame, or torch.Tensor format.
The shape of X:
First dimension represents the number of samples (data points).
The remaining dimensions represent the dimensions (features) of each sample.
For example, if X has shape (100, 3), it means there are 100 samples, and each sample has 3 features.
gamma : float
Bandwidth in gaussian kernel, by default 0.1.
K : int
Size of the construced reduced set.
Bandwidth in gaussian kernel, by default 0.1.
reduced_set_size : int
Size of the construced reduced set.
step_size : float
Step size for gradient descent in the iterative optimization.
Step size for gradient descent in the iterative optimization.
steps : int
Total rounds in the iterative optimization.
Total rounds in the iterative optimization.
nonnegative_beta : bool, optional
True if weights for the reduced set are intended to be kept non-negative, by default False.
True if weights for the reduced set are intended to be kept non-negative, by default False.
reduce : bool, optional
Whether shrink original data to a smaller set, by default True
Whether shrink original data to a smaller set, by default True
cuda_idx : int
A flag indicating whether use CUDA during RKME computation. -1 indicates CUDA not used.
A flag indicating whether use CUDA during RKME computation. -1 indicates CUDA not used.
None indicates that CUDA is automatically selected.

Returns
-------
RKMEStatSpecification
A RKMEStatSpecification object
A RKMEStatSpecification object
"""
# Convert data type
X = convert_to_numpy(X)
X = np.ascontiguousarray(X).astype(np.float32)
# Check reduced_set_size
max_reduced_set_size = C.max_reduced_set_size
if K * X[0].size > max_reduced_set_size:
K = max(1, max_reduced_set_size // X[0].size)
if reduced_set_size * X[0].size > max_reduced_set_size:
reduced_set_size = max(1, max_reduced_set_size // X[0].size)
# Check cuda_idx
if not torch.cuda.is_available() or cuda_idx == -1:
cuda_idx = -1
else:
num_cuda_devices = torch.cuda.device_count()
if cuda_idx is None or not (cuda_idx >= 0 and cuda_idx < num_cuda_devices):
cuda_idx = 0
# Generate rkme spec
rkme_spec = RKMEStatSpecification(gamma=gamma, cuda_idx=cuda_idx)
rkme_spec.generate_stat_spec_from_data(X, K, step_size, steps, nonnegative_beta, reduce)
rkme_spec.generate_stat_spec_from_data(X, reduced_set_size, step_size, steps, nonnegative_beta, reduce)
return rkme_spec


def generate_stat_spec(X: np.ndarray) -> BaseStatSpecification:
"""
Interface for users to generate statistical specification.
Return a StatSpecification object, use .save() method to save as npy file.

Interface for users to generate statistical specification.
Return a StatSpecification object, use .save() method to save as npy file.

Parameters
----------
X : np.ndarray
Raw data in np.ndarray format.
Size of array: (n*d)
Raw data in np.ndarray format.
Size of array: (n*d)

Returns
-------
StatSpecification
A StatSpecification object
A StatSpecification object
"""
return None
return None

Loading…
Cancel
Save