# beimingwu / learnware

 
			
							import torch
import numpy as np
import pandas as pd
from typing import Union, List, Optional

from .utils import convert_to_numpy
from .base import BaseStatSpecification
from .regular import RKMETableSpecification, RKMEImageSpecification, RKMETextSpecification
from ..config import C


def generate_rkme_table_spec(
    X: Union[np.ndarray, pd.DataFrame, torch.Tensor],
    gamma: float = 0.1,
    reduced_set_size: int = 100,
    step_size: float = 0.1,
    steps: int = 3,
    nonnegative_beta: bool = True,
    reduce: bool = True,
    cuda_idx: Optional[int] = None,
) -> RKMETableSpecification:
    """
    Interface for users to generate Reduced Kernel Mean Embedding (RKME) specification.
    Return a RKMETableSpecification object, use .save() method to save as json file.

    Parameters
    ----------
    X : np.ndarray, pd.DataFrame, or torch.Tensor
        Raw data in np.ndarray, pd.DataFrame, or torch.Tensor format.
        The shape of X:
            First dimension represents the number of samples (data points).
            The remaining dimensions represent the dimensions (features) of each sample.
            For example, if X has shape (100, 3), it means there are 100 samples, and each sample has 3 features.
    gamma : float
        Bandwidth in gaussian kernel, by default 0.1.
    reduced_set_size : int
        Requested size of the constructed reduced set, by default 100.
        If ``reduced_set_size * (number of values per sample)`` exceeds
        ``C.max_reduced_set_size``, the size is lowered to
        ``max(20, C.max_reduced_set_size // values_per_sample)``; it is never
        raised above the requested value.
    step_size : float
        Step size for gradient descent in the iterative optimization, by default 0.1.
    steps : int
        Total rounds in the iterative optimization, by default 3.
    nonnegative_beta : bool, optional
        True if weights for the reduced set are intended to be kept non-negative, by default True.
    reduce : bool, optional
        Whether shrink original data to a smaller set, by default True
    cuda_idx : int, optional
        A flag indicating whether use CUDA during RKME computation. -1 indicates CUDA not used.
        None indicates that CUDA is automatically selected.

    Returns
    -------
    RKMETableSpecification
        A RKMETableSpecification object
    """
    # Normalise the input to a contiguous float32 numpy array.
    X = convert_to_numpy(X)
    X = np.ascontiguousarray(X).astype(np.float32)

    # Keep the reduced set within the configured budget.
    max_reduced_set_size = C.max_reduced_set_size
    values_per_sample = X[0].size
    if reduced_set_size * values_per_sample > max_reduced_set_size:
        # The floor of 20 must not push the size above what the caller asked
        # for: a cap may only shrink the request, never grow it.
        capped_size = max(20, max_reduced_set_size // values_per_sample)
        reduced_set_size = min(reduced_set_size, capped_size)

    # Generate rkme spec
    rkme_spec = RKMETableSpecification(gamma=gamma, cuda_idx=cuda_idx)
    rkme_spec.generate_stat_spec_from_data(X, reduced_set_size, step_size, steps, nonnegative_beta, reduce)
    return rkme_spec


def generate_rkme_image_spec(
    X: Union[np.ndarray, torch.Tensor],
    reduced_set_size: int = 50,
    step_size: float = 0.01,
    steps: int = 100,
    resize: bool = True,
    sample_size: int = 5000,
    nonnegative_beta: bool = True,
    reduce: bool = True,
    verbose: bool = True,
    cuda_idx: int = None,
    **kwargs
) -> RKMEImageSpecification:
    """
    Build a Reduced Kernel Mean Embedding (RKME) specification for image data.

    A fresh RKMEImageSpecification is created and its
    ``generate_stat_spec_from_data`` method is run on ``X``; the populated
    object is returned. Use its .save() method to save it as a json file.

    Parameters
    ----------
    X : np.ndarray, or torch.Tensor
        Raw image data with shape [N, C, H, W]:
            N: Number of images.
            C: Number of channels.
            H: Height of images.
            W: Width of images.
        For example, shape (100, 3, 32, 32) means 100 samples, each a
        3-channel (RGB) image of size 32x32.
    reduced_set_size : int
        Size of the constructed reduced set, by default 50.
    step_size : float
        Step size for gradient descent in the iterative optimization, by default 0.01.
    steps : int
        Total rounds in the iterative optimization, by default 100.
    resize : bool
        Whether to scale the image to the requested size, by default True.
    sample_size : int
        Forwarded unchanged to ``generate_stat_spec_from_data``, by default 5000.
        NOTE(review): presumably the number of images sampled from X - confirm
        against RKMEImageSpecification.
    nonnegative_beta : bool, optional
        True if weights for the reduced set are intended to be kept non-negative, by default True.
    reduce : bool, optional
        Whether shrink original data to a smaller set, by default True
    verbose : bool, optional
        Whether to print training progress, by default True
    cuda_idx : int or None
        A flag indicating whether use CUDA during RKME computation. -1 indicates CUDA not used.
        None indicates that CUDA is automatically selected.
    **kwargs
        Extra keyword arguments forwarded unchanged to
        ``RKMEImageSpecification.generate_stat_spec_from_data``.

    Returns
    -------
    RKMEImageSpecification
        A RKMEImageSpecification object
    """
    spec = RKMEImageSpecification(cuda_idx=cuda_idx)

    # The generator takes these settings positionally, in exactly this order.
    positional_settings = (
        X,
        reduced_set_size,
        step_size,
        steps,
        resize,
        sample_size,
        nonnegative_beta,
        reduce,
        verbose,
    )
    spec.generate_stat_spec_from_data(*positional_settings, **kwargs)
    return spec


def generate_rkme_text_spec(
    X: List[str],
    gamma: float = 0.1,
    reduced_set_size: int = 100,
    step_size: float = 0.1,
    steps: int = 3,
    nonnegative_beta: bool = True,
    reduce: bool = True,
    cuda_idx: int = None,
) -> RKMETextSpecification:
    """
    Build a Reduced Kernel Mean Embedding (RKME) specification for text data.

    The input is validated first, then a fresh RKMETextSpecification is
    created and its ``generate_stat_spec_from_data`` method is run on ``X``.
    Use the returned object's .save() method to save it as a json file.

    Parameters
    ----------
    X : List[str]
        Raw data of text. Must be a ``list`` whose items are all ``str``.
    gamma : float
        Bandwidth in gaussian kernel, by default 0.1.
    reduced_set_size : int
        Size of the constructed reduced set, by default 100.
    step_size : float
        Step size for gradient descent in the iterative optimization, by default 0.1.
    steps : int
        Total rounds in the iterative optimization, by default 3.
    nonnegative_beta : bool, optional
        True if weights for the reduced set are intended to be kept non-negative, by default True.
    reduce : bool, optional
        Whether shrink original data to a smaller set, by default True
    cuda_idx : int or None
        A flag indicating whether use CUDA during RKME computation. -1 indicates CUDA not used.
        None indicates that CUDA is automatically selected.

    Returns
    -------
    RKMETextSpecification
        A RKMETextSpecification object

    Raises
    ------
    TypeError
        If X is not a list, or any of its items is not a string.
    """
    # Reject anything that is not a plain list of strings before doing any work.
    is_list_of_str = isinstance(X, list) and all(isinstance(item, str) for item in X)
    if not is_list_of_str:
        raise TypeError("Input data must be a list of strings.")

    text_spec = RKMETextSpecification(gamma=gamma, cuda_idx=cuda_idx)
    text_spec.generate_stat_spec_from_data(
        X,
        reduced_set_size,
        step_size,
        steps,
        nonnegative_beta,
        reduce,
    )
    return text_spec


def generate_stat_spec(
    type: str, X: Union[np.ndarray, pd.DataFrame, torch.Tensor, List[str]], *args, **kwargs
) -> Union[RKMETableSpecification, RKMEImageSpecification, RKMETextSpecification]:
    """
    Interface for users to generate statistical specification.
    Dispatch to the RKME generator matching ``type`` and return its result.

    Parameters
    ----------
    type : str
        Type of statistical specification.
        Supported types: "table", "text", "image"
    X : np.ndarray, pd.DataFrame, torch.Tensor, or List[str]
        Raw data, in whatever format the selected generator accepts
        (see generate_rkme_table_spec, generate_rkme_text_spec and
        generate_rkme_image_spec).
    *args
        Extra positional arguments, forwarded after X to the selected generator.
    **kwargs
        Extra keyword arguments, forwarded to the selected generator.

    Returns
    -------
    RKMETableSpecification, RKMETextSpecification, or RKMEImageSpecification
        The specification object built by the selected generator.

    Raises
    ------
    TypeError
        If ``type`` is not one of the supported types.
    """
    # X is passed positionally on purpose: with ``X=X, *args`` the first extra
    # positional argument would also bind to X and the call would fail with
    # "got multiple values for argument 'X'".
    if type == "table":
        return generate_rkme_table_spec(X, *args, **kwargs)
    elif type == "text":
        return generate_rkme_text_spec(X, *args, **kwargs)
    elif type == "image":
        return generate_rkme_image_spec(X, *args, **kwargs)
    else:
        raise TypeError(f"type {type} is not supported!")


def generate_semantic_spec(
    name: Optional[str] = None,
    description: Optional[str] = None,
    data_type: Optional[str] = None,
    task_type: Optional[str] = None,
    library_type: Optional[str] = None,
    scenarios: Optional[Union[str, List[str]]] = None,
    license: Optional[Union[str, List[str]]] = None,
    input_description: Optional[dict] = None,
    output_description: Optional[dict] = None,
):
    """
    Assemble a semantic specification dictionary from the given fields.

    Parameters
    ----------
    name : str, optional
        Stored under "Name"; an empty string is used when None.
    description : str, optional
        Stored under "Description"; an empty string is used when None.
    data_type : str, optional
        Stored as a one-element list under "Data"; empty list when None.
    task_type : str, optional
        Stored as a one-element list under "Task"; empty list when None.
    library_type : str, optional
        Stored as a one-element list under "Library"; empty list when None.
    scenarios : str or List[str], optional
        Stored under "Scenario". A single string is wrapped in a list,
        None becomes an empty list, anything else is stored as given.
    license : str or List[str], optional
        Stored under "License", normalised the same way as ``scenarios``.
    input_description : dict, optional
        Stored as-is under "Input"; the key is omitted when None.
    output_description : dict, optional
        Stored as-is under "Output"; the key is omitted when None.

    Returns
    -------
    dict
        Entries "Data", "Task", "Library", "License", "Scenario", "Name" and
        "Description", each a dict with "Type" and "Values" keys, plus the
        optional "Input" and "Output" entries.
    """

    def _single(value):
        # A lone value becomes a one-element list; None becomes an empty list.
        return [] if value is None else [value]

    def _multi(value):
        # A bare string is wrapped; None becomes an empty list; any other
        # value is passed through untouched (the same object, not a copy).
        if value is None:
            return []
        if isinstance(value, str):
            return [value]
        return value

    spec = {
        "Data": {"Type": "Class", "Values": _single(data_type)},
        "Task": {"Type": "Class", "Values": _single(task_type)},
        "Library": {"Type": "Class", "Values": _single(library_type)},
        "License": {"Type": "Class", "Values": _multi(license)},
        "Scenario": {"Type": "Tag", "Values": _multi(scenarios)},
        "Name": {"Type": "String", "Values": "" if name is None else name},
        "Description": {"Type": "String", "Values": "" if description is None else description},
    }

    # Input/Output entries only exist when the caller supplied them.
    optional_entries = (("Input", input_description), ("Output", output_description))
    for key, entry in optional_entries:
        if entry is not None:
            spec[key] = entry

    return spec