# beimingwu / learnware

 
			
							import torch
import numpy as np
import pandas as pd
from typing import Union, List

from .utils import convert_to_numpy
from .base import BaseStatSpecification
from .regular import RKMETableSpecification, RKMEImageSpecification, RKMETextSpecification
from ..config import C


def generate_rkme_table_spec(
    X: Union[np.ndarray, pd.DataFrame, torch.Tensor],
    gamma: float = 0.1,
    reduced_set_size: int = 100,
    step_size: float = 0.1,
    steps: int = 3,
    nonnegative_beta: bool = True,
    reduce: bool = True,
    cuda_idx: Union[int, None] = None,
) -> RKMETableSpecification:
    """
    Interface for users to generate Reduced Kernel Mean Embedding (RKME) specification for tabular data.
    Return a RKMETableSpecification object, use .save() method to save as json file.

    Parameters
    ----------
    X : np.ndarray, pd.DataFrame, or torch.Tensor
        Raw data in np.ndarray, pd.DataFrame, or torch.Tensor format.
        It is passed through ``convert_to_numpy`` and then cast to a contiguous float32 array.
        The shape of X:
            First dimension represents the number of samples (data points).
            The remaining dimensions represent the dimensions (features) of each sample.
            For example, if X has shape (100, 3), it means there are 100 samples, and each sample has 3 features.
    gamma : float
        Bandwidth in gaussian kernel, by default 0.1.
    reduced_set_size : int
        Size of the constructed reduced set, by default 100.
        If ``reduced_set_size * (number of values per sample)`` exceeds ``C.max_reduced_set_size``,
        the size is lowered to ``max(20, C.max_reduced_set_size // values_per_sample)``; it is never
        raised above the requested value.
    step_size : float
        Step size for gradient descent in the iterative optimization, by default 0.1.
    steps : int
        Total rounds in the iterative optimization, by default 3.
    nonnegative_beta : bool, optional
        True if weights for the reduced set are intended to be kept non-negative, by default True.
    reduce : bool, optional
        Whether shrink original data to a smaller set, by default True
    cuda_idx : int or None
        A flag indicating whether use CUDA during RKME computation. -1 indicates CUDA not used.
        None indicates that CUDA is automatically selected: device 0 when CUDA is available, otherwise -1.
        An index outside ``[0, torch.cuda.device_count())`` also falls back to device 0.

    Returns
    -------
    RKMETableSpecification
        A RKMETableSpecification object
    """
    # Convert data type
    X = convert_to_numpy(X)
    X = np.ascontiguousarray(X).astype(np.float32)

    # Check reduced_set_size: bound the total number of values held by the reduced set.
    values_per_sample = X[0].size
    max_reduced_set_size = C.max_reduced_set_size
    if reduced_set_size * values_per_sample > max_reduced_set_size:
        capped_size = max(20, max_reduced_set_size // values_per_sample)
        # The floor of 20 must not enlarge a request that was already smaller than 20.
        reduced_set_size = min(reduced_set_size, capped_size)

    # Check cuda_idx
    if not torch.cuda.is_available() or cuda_idx == -1:
        cuda_idx = -1
    else:
        num_cuda_devices = torch.cuda.device_count()
        if cuda_idx is None or not (0 <= cuda_idx < num_cuda_devices):
            cuda_idx = 0

    # Generate rkme spec
    rkme_spec = RKMETableSpecification(gamma=gamma, cuda_idx=cuda_idx)
    rkme_spec.generate_stat_spec_from_data(X, reduced_set_size, step_size, steps, nonnegative_beta, reduce)
    return rkme_spec


def generate_rkme_image_spec(
    X: Union[np.ndarray, torch.Tensor],
    reduced_set_size: int = 50,
    step_size: float = 0.01,
    steps: int = 100,
    resize: bool = True,
    nonnegative_beta: bool = True,
    reduce: bool = True,
    verbose: bool = True,
    cuda_idx: int = None,
) -> RKMEImageSpecification:
    """
    Build a Reduced Kernel Mean Embedding (RKME) specification from image data.
    The returned RKMEImageSpecification can be written to a json file with its .save() method.

    Parameters
    ----------
    X : np.ndarray, or torch.Tensor
        Raw image data with shape [N, C, H, W]:
            N: Number of images.
            C: Number of channels.
            H: Height of images.
            W: Width of images.
        For example, shape (100, 3, 32, 32) means 100 samples, each a 3-channel (RGB) image of size 32x32.
    reduced_set_size : int
        Size of the constructed reduced set, by default 50.
    step_size : float
        Step size for gradient descent in the iterative optimization, by default 0.01.
    steps : int
        Total rounds in the iterative optimization, by default 100.
    resize : bool
        Whether to scale the image to the requested size, by default True.
    nonnegative_beta : bool, optional
        True if weights for the reduced set are intended to be kept non-negative, by default True.
    reduce : bool, optional
        Whether shrink original data to a smaller set, by default True
    verbose : bool, optional
        Whether to print training progress, by default True
    cuda_idx : int
        CUDA device selector. -1 means CUDA is not used.
        None means automatic selection: device 0 when CUDA is available, otherwise -1.
        An index outside the range of available devices also falls back to device 0.

    Returns
    -------
    RKMEImageSpecification
        A RKMEImageSpecification object
    """
    # Resolve the device: CPU when CUDA is unavailable or explicitly disabled.
    if torch.cuda.is_available() and cuda_idx != -1:
        device_total = torch.cuda.device_count()
        index_is_valid = cuda_idx is not None and 0 <= cuda_idx < device_total
        if not index_is_valid:
            # Unspecified or out-of-range index: use the first device.
            cuda_idx = 0
    else:
        cuda_idx = -1

    # Create the specification and fit it on the data.
    image_spec = RKMEImageSpecification(cuda_idx=cuda_idx)
    fit_args = (X, reduced_set_size, step_size, steps, resize, nonnegative_beta, reduce, verbose)
    image_spec.generate_stat_spec_from_data(*fit_args)
    return image_spec


def generate_rkme_text_spec(
    X: List[str],
    gamma: float = 0.1,
    reduced_set_size: int = 100,
    step_size: float = 0.1,
    steps: int = 3,
    nonnegative_beta: bool = True,
    reduce: bool = True,
    cuda_idx: int = None,
) -> RKMETextSpecification:
    """
    Build a Reduced Kernel Mean Embedding (RKME) specification from text data.
    The returned RKMETextSpecification can be written to a json file with its .save() method.

    Parameters
    ----------
    X : List[str]
        Raw data of text; must be a ``list`` whose items are all ``str``.
    gamma : float
        Bandwidth in gaussian kernel, by default 0.1.
    reduced_set_size : int
        Size of the constructed reduced set, by default 100.
    step_size : float
        Step size for gradient descent in the iterative optimization, by default 0.1.
    steps : int
        Total rounds in the iterative optimization, by default 3.
    nonnegative_beta : bool, optional
        True if weights for the reduced set are intended to be kept non-negative, by default True.
    reduce : bool, optional
        Whether shrink original data to a smaller set, by default True
    cuda_idx : int
        CUDA device selector. -1 means CUDA is not used.
        None means automatic selection: device 0 when CUDA is available, otherwise -1.
        An index outside the range of available devices also falls back to device 0.

    Returns
    -------
    RKMETextSpecification
        A RKMETextSpecification object

    Raises
    ------
    TypeError
        If X is not a list, or contains an item that is not a string.
    """
    # Validate the input before touching any device state.
    is_list_of_str = isinstance(X, list) and all(isinstance(text, str) for text in X)
    if not is_list_of_str:
        raise TypeError("Input data must be a list of strings.")

    # Resolve the device: CPU when CUDA is unavailable or explicitly disabled.
    if torch.cuda.is_available() and cuda_idx != -1:
        device_total = torch.cuda.device_count()
        index_is_valid = cuda_idx is not None and 0 <= cuda_idx < device_total
        if not index_is_valid:
            # Unspecified or out-of-range index: use the first device.
            cuda_idx = 0
    else:
        cuda_idx = -1

    # Create the specification and fit it on the texts.
    text_spec = RKMETextSpecification(gamma=gamma, cuda_idx=cuda_idx)
    fit_args = (X, reduced_set_size, step_size, steps, nonnegative_beta, reduce)
    text_spec.generate_stat_spec_from_data(*fit_args)
    return text_spec


def generate_stat_spec(
    type: str, X: Union[np.ndarray, pd.DataFrame, torch.Tensor, List[str]], *args, **kwargs
) -> BaseStatSpecification:
    """
    Interface for users to generate statistical specification.
    Dispatches to the generator matching ``type`` and returns the specification it builds;
    use the returned object's .save() method to save it to a file.

    Parameters
    ----------
    type : str
        Type of statistical specification.
        Supported types: "table", "text", "image"
    X : np.ndarray, pd.DataFrame, torch.Tensor, or List[str]
        Raw data, in the format expected by the selected generator:
        "table" -> generate_rkme_table_spec, "text" -> generate_rkme_text_spec,
        "image" -> generate_rkme_image_spec.
    *args, **kwargs
        Extra positional and keyword arguments forwarded unchanged to the selected generator,
        following ``X``.

    Returns
    -------
    BaseStatSpecification
        The specification object produced by the selected generator.

    Raises
    ------
    TypeError
        If ``type`` is not one of the supported types.
    """
    # X is forwarded positionally on purpose: ``f(X=X, *args)`` binds the first extra positional
    # argument to ``X`` as well and fails with "got multiple values for argument 'X'".
    if type == "table":
        return generate_rkme_table_spec(X, *args, **kwargs)
    if type == "text":
        return generate_rkme_text_spec(X, *args, **kwargs)
    if type == "image":
        return generate_rkme_image_spec(X, *args, **kwargs)
    raise TypeError(f"type {type} is not supported!")