Source code for reflame.utils.data_toolkit

#!/usr/bin/env python
# Created by "Thieu" at 18:17, 24/10/2023 ----------%                                                                               
#       Email: nguyenthieu2102@gmail.com            %                                                    
#       Github: https://github.com/thieu1995        %                         
# --------------------------------------------------%

import numpy as np
from sklearn.model_selection import train_test_split
from scipy.stats import boxcox, yeojohnson
from scipy.special import inv_boxcox
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler


class TimeSeriesDifferencer:
    """
    Lag-based differencing of a series, with an inverse that relies on the stored input.

    Parameters
    ----------
    interval : int, default=1
        Lag between the two observations that are subtracted. Must be at least 1.

    Raises
    ------
    ValueError
        If ``interval`` is smaller than 1.
    """

    def __init__(self, interval=1):
        if interval < 1:
            raise ValueError("Interval for differencing must be at least 1.")
        self.interval = interval
        # Defined up front so inverse_difference() can report a missing
        # difference() call through its ValueError rather than an AttributeError.
        self.original_data = None

    def difference(self, X):
        """
        Return ``X[i] - X[i - interval]`` for every ``i >= interval``.

        A copy of ``X`` (obtained through ``X.copy()``) is stored on the instance
        because ``inverse_difference`` needs it.

        Parameters
        ----------
        X : array-like providing ``copy()``
            The series to difference; the first axis is the time axis.

        Returns
        -------
        numpy.ndarray
            The differenced series, ``interval`` entries shorter than ``X``
            (empty when ``X`` has no more than ``interval`` entries).
        """
        self.original_data = X.copy()
        values = np.asarray(X)
        # Both slices have max(len(X) - interval, 0) entries, so the subtraction
        # reproduces the element-wise loop without iterating in Python.
        return values[self.interval:] - values[:-self.interval]

    def inverse_difference(self, diff_data):
        """
        Add the stored original observations back onto differenced values.

        Entry ``j`` of the result is ``diff_data[j] + original_data[j]``. When
        ``diff_data`` is the output of ``difference`` on the stored data, this
        yields the original observations from index ``interval`` onwards.

        Parameters
        ----------
        diff_data : array-like
            Differenced values. Entries beyond ``len(original_data) - interval``
            are ignored.

        Returns
        -------
        numpy.ndarray
            The restored values.

        Raises
        ------
        ValueError
            If ``difference`` has not been called yet.
        IndexError
            If ``diff_data`` has fewer entries than are needed.
        """
        if self.original_data is None:
            raise ValueError("Original data is required for inversion.")
        original = np.asarray(self.original_data)
        n_restored = max(len(original) - self.interval, 0)
        diffs = np.asarray(diff_data)
        if len(diffs) < n_restored:
            # IndexError is what element-wise indexing of a too-short input raises,
            # so callers that guard against it keep working.
            raise IndexError(
                f"diff_data has {len(diffs)} entries but {n_restored} are required for inversion."
            )
        return diffs[:n_restored] + original[:n_restored]
class FeatureEngineering:
    """
    Stateless collection of feature-engineering helpers for 2D NumPy matrices.
    """

    def __init__(self):
        """
        Initialize the FeatureEngineering class (no state is kept).
        """
        pass

    def create_threshold_binary_features(self, X, threshold):
        """
        Perform feature engineering to add binary indicator columns for values below the threshold.
        Add each new column right after the corresponding original column.

        Args:
            X (numpy.ndarray): The input 2D matrix of shape (n_samples, n_features).
            threshold (int, float or NumPy numeric scalar): The threshold value for identifying low values.

        Returns:
            numpy.ndarray: A float matrix of shape (n_samples, 2 * n_features). Column ``2 * j`` holds
            original column ``j`` and column ``2 * j + 1`` holds 1 where that value is strictly below
            the threshold and 0 otherwise.

        Raises:
            ValueError: If X is not a 2D NumPy array or the threshold is not numeric.
        """
        # Check if X is a NumPy array
        if not isinstance(X, np.ndarray):
            raise ValueError("Input X should be a NumPy array.")
        # A 1D array has no shape[1]; fail with a clear message instead of an IndexError
        if X.ndim != 2:
            raise ValueError("Input X should be a 2D matrix of shape (n_samples, n_features).")
        # np.number covers NumPy scalars such as np.int64 / np.float32, which are not
        # instances of the builtin int / float
        if not isinstance(threshold, (int, float, np.number)):
            raise ValueError("Threshold should be a numeric value.")

        # Even columns receive the original features, odd columns the indicators
        X_new = np.zeros((X.shape[0], X.shape[1] * 2))
        X_new[:, 0::2] = X
        X_new[:, 1::2] = X < threshold
        return X_new
class LabelEncoder:
    """
    Encode categorical features as integer labels.

    After fitting, ``unique_labels`` holds the sorted distinct labels (as returned by
    ``np.unique``) and ``label_to_index`` maps each of them to its position.
    """

    def __init__(self):
        self.unique_labels = None
        self.label_to_index = {}

    @staticmethod
    def check_y(y):
        """
        Convert ``y`` to a 1-D NumPy array.

        Singleton axes are squeezed away. A single label, which squeezes to a 0-d
        array, is kept as a one-element vector.

        Raises:
        -------
        ValueError
            If ``y`` still has more than one dimension after squeezing.
        """
        y = np.squeeze(np.asarray(y))
        if y.ndim == 0:
            # squeeze() collapses a one-element input to 0-d; restore the vector shape
            y = y.reshape(1)
        if y.ndim != 1:
            raise ValueError("y label should have shape like 1-D vector.")
        return y

    def fit(self, y):
        """
        Fit label encoder to a given set of labels.

        Parameters:
        -----------
        y : array-like
            Labels to encode.

        Returns:
        --------
        self : LabelEncoder
            The fitted encoder, so that calls can be chained.
        """
        y = self.check_y(y)
        self.unique_labels = np.unique(y)
        self.label_to_index = {label: i for i, label in enumerate(self.unique_labels)}
        return self

    def transform(self, y):
        """
        Transform labels to encoded integer labels.

        Parameters:
        -----------
        y : array-like (1-D vector)
            Labels to encode.

        Returns:
        --------
        encoded_labels : array-like
            Encoded integer labels.

        Raises:
        -------
        ValueError
            If the encoder has not been fit yet.
        KeyError
            If ``y`` contains a label that was not seen during ``fit``.
        """
        y = self.check_y(y)
        if self.unique_labels is None:
            raise ValueError("Label encoder has not been fit yet.")
        return np.array([self.label_to_index[label] for label in y])

    def fit_transform(self, y):
        """Fit label encoder and return encoded labels.

        Parameters
        ----------
        y : array-like of shape (n_samples,)
            Target values.

        Returns
        -------
        y : array-like of shape (n_samples,)
            Encoded labels.
        """
        self.fit(y)
        return self.transform(y)

    def inverse_transform(self, y):
        """
        Transform integer labels to original labels.

        Parameters:
        -----------
        y : array-like
            Encoded integer labels.

        Returns:
        --------
        original_labels : array-like
            Original labels; codes that do not correspond to a fitted label are
            replaced by the string "unknown".

        Raises:
        -------
        ValueError
            If the encoder has not been fit yet.
        """
        y = self.check_y(y)
        if self.unique_labels is None:
            raise ValueError("Label encoder has not been fit yet.")
        # Build the set of valid codes once: membership is then O(1) per element
        # instead of a scan over dict.values() for every entry of y.
        known_codes = set(self.label_to_index.values())
        return np.array([self.unique_labels[i] if i in known_codes else "unknown" for i in y])
class ObjectiveScaler:
    """
    For label scaler in classification (binary and multiple classification)

    The behaviour is selected by ``obj_name``:

    - "sigmoid" : labels pass through ``transform``; outputs are rounded to the nearest integer on the way back.
    - "hinge"   : label 0 is mapped to -1; outputs greater than 0 decode to 1, everything else to 0.
    - "softmax" : labels are encoded with ``ohe_scaler``; outputs decode to the index of the largest entry.
    - any other name (e.g. "self") : data passes through unchanged in both directions.
    """

    def __init__(self, obj_name="sigmoid", ohe_scaler=None):
        """
        ohe_scaler: Need to be an instance of One-Hot-Encoder for softmax scaler (multiple classification problem)
        """
        self.obj_name = obj_name
        self.ohe_scaler = ohe_scaler

    def transform(self, data):
        """
        Convert class labels into the target representation of the objective.

        Parameters
        ----------
        data : array-like
            Class labels.

        Returns
        -------
        The converted labels: a new squeezed array for "hinge", the result of
        ``ohe_scaler.transform`` on a column vector for "softmax", and ``data``
        itself otherwise.
        """
        if self.obj_name == "hinge":
            labels = np.squeeze(np.array(data))
            # Three-argument where() also works when a single sample squeezes to 0-d
            return np.where(labels == 0, -1, labels)
        if self.obj_name == "softmax":
            return self.ohe_scaler.transform(np.reshape(data, (-1, 1)))
        # "sigmoid", "self" and unrecognised objectives pass through unchanged,
        # mirroring what inverse_transform does for them.
        return data

    def inverse_transform(self, data):
        """
        Convert model outputs back into integer class labels.

        Parameters
        ----------
        data : array-like
            Model outputs; for "softmax" one row of class scores per sample.

        Returns
        -------
        Integer labels for "sigmoid", "hinge" and "softmax"; ``data`` unchanged otherwise.
        """
        if self.obj_name == "sigmoid":
            return np.rint(np.squeeze(np.array(data))).astype(int)
        if self.obj_name == "hinge":
            scores = np.squeeze(np.array(data))
            # Threshold at zero: identical to ceil() followed by -1 -> 0 for scores in
            # (-2, 1], and still yields a valid 0/1 label for scores outside that range.
            return (scores > 0).astype(int)
        if self.obj_name == "softmax":
            scores = np.asarray(data)
            if scores.ndim != 2:
                scores = np.squeeze(scores)
                if scores.ndim == 1:
                    # a lone score vector is one sample, not n one-class samples
                    scores = scores.reshape(1, -1)
            # A 2-D input is used as is, so a single-sample batch keeps its sample axis
            return np.argmax(scores, axis=1)
        return data
class Log1pScaler(BaseEstimator, TransformerMixin):
    """
    Element-wise ``log(1 + x)`` scaler; the inverse is ``exp(x) - 1``.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data, so the instance is returned as is."""
        return self

    def transform(self, X):
        """Return ``np.log1p`` applied to every element of ``X``."""
        scaled = np.log1p(X)
        return scaled

    def inverse_transform(self, X):
        """Return ``np.expm1`` applied to every element of ``X``, undoing ``transform``."""
        restored = np.expm1(X)
        return restored
class LogeScaler(BaseEstimator, TransformerMixin):
    """
    Element-wise natural-logarithm scaler; the inverse is the exponential function.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data, so the instance is returned as is."""
        return self

    def transform(self, X):
        """Return ``np.log`` (base e) applied to every element of ``X``."""
        scaled = np.log(X)
        return scaled

    def inverse_transform(self, X):
        """Return ``np.exp`` applied to every element of ``X``, undoing ``transform``."""
        restored = np.exp(X)
        return restored
class SqrtScaler(BaseEstimator, TransformerMixin):
    """
    Element-wise square-root scaler; the inverse squares every element.
    """

    def fit(self, X, y=None):
        """SqrtScaler doesn't require fitting, so we simply return self."""
        return self

    def transform(self, X):
        """Apply the square root transformation to each element of the input data."""
        return np.sqrt(X)

    def inverse_transform(self, X):
        """Square each element to reverse the square root transformation."""
        # np.square accepts any array-like (plain lists included), just like np.sqrt
        # in transform(); the ``**`` operator is not defined for lists.
        return np.square(X)
class BoxCoxScaler(BaseEstimator, TransformerMixin):
    """
    Box-Cox power transformation that uses one lambda for the whole data matrix.

    Parameters
    ----------
    lmbda : float, default=None
        The Box-Cox lambda. When None, ``fit`` estimates it from the flattened
        training data and stores the estimate in this same attribute.
    """

    def __init__(self, lmbda=None):
        self.lmbda = lmbda

    def _check_fitted(self):
        """Raise a clear error when no lambda is available yet."""
        if self.lmbda is None:
            # NotFittedError derives from both ValueError and AttributeError, so code
            # that guarded against the AttributeError previously raised by an unfitted
            # transform() continues to work.
            from sklearn.exceptions import NotFittedError
            raise NotFittedError("BoxCoxScaler has no lambda yet: call fit() or pass lmbda to the constructor.")

    def fit(self, X, y=None):
        """Estimate the lambda parameter from the data if it was not provided."""
        if self.lmbda is None:
            _, self.lmbda = boxcox(np.asarray(X).flatten())
        return self

    def transform(self, X):
        """
        Apply the Box-Cox transformation with the stored lambda.

        The data is flattened for ``scipy.stats.boxcox`` and the result is reshaped
        back to the shape of ``X``.
        """
        self._check_fitted()
        # asarray() lets array-likes without a flatten() method through as well
        X = np.asarray(X)
        X_new = boxcox(X.flatten(), lmbda=self.lmbda)
        return X_new.reshape(X.shape)

    def inverse_transform(self, X):
        """Invert the Box-Cox transformation using the stored lambda."""
        self._check_fitted()
        return inv_boxcox(X, self.lmbda)
class YeoJohnsonScaler(BaseEstimator, TransformerMixin):
    """
    Yeo-Johnson power transformation that uses one lambda for the whole data matrix.

    Parameters
    ----------
    lmbda : float, default=None
        The Yeo-Johnson lambda. When None, ``fit`` estimates it from the flattened
        training data and stores the estimate in this same attribute.
    """

    def __init__(self, lmbda=None):
        self.lmbda = lmbda

    def _check_fitted(self):
        """Raise a clear error when no lambda is available yet."""
        if self.lmbda is None:
            # NotFittedError derives from both ValueError and AttributeError
            from sklearn.exceptions import NotFittedError
            raise NotFittedError("YeoJohnsonScaler has no lambda yet: call fit() or pass lmbda to the constructor.")

    def fit(self, X, y=None):
        """Estimate the lambda parameter from the data if it was not provided."""
        if self.lmbda is None:
            _, self.lmbda = yeojohnson(np.asarray(X).flatten())
        return self

    def transform(self, X):
        """
        Apply the Yeo-Johnson transformation with the stored lambda.

        The data is flattened for ``scipy.stats.yeojohnson`` and the result is
        reshaped back to the shape of ``X``.
        """
        self._check_fitted()
        X = np.asarray(X)
        # Must be yeojohnson (not boxcox): it matches the lambda estimated in fit()
        # and is defined for zero and negative values.
        X_new = yeojohnson(X.flatten(), lmbda=self.lmbda)
        return X_new.reshape(X.shape)

    def inverse_transform(self, X):
        """
        Invert the Yeo-Johnson transformation using the stored lambda.

        SciPy ships no inverse, so the closed form is applied piecewise. The
        transform preserves the sign of its input, which selects the branch::

            y >= 0:  x = (lmbda * y + 1) ** (1 / lmbda) - 1            if lmbda != 0
                     x = exp(y) - 1                                    if lmbda == 0
            y <  0:  x = 1 - (1 - (2 - lmbda) * y) ** (1 / (2 - lmbda)) if lmbda != 2
                     x = 1 - exp(-y)                                   if lmbda == 2

        Returns
        -------
        numpy.ndarray
            Float array with the same shape as ``X``.
        """
        self._check_fitted()
        lmbda = self.lmbda
        Y = np.asarray(X, dtype=float)
        restored = np.empty_like(Y)
        non_negative = Y >= 0
        negative = ~non_negative
        tiny = np.spacing(1.0)

        if abs(lmbda) < tiny:
            restored[non_negative] = np.expm1(Y[non_negative])
        else:
            restored[non_negative] = np.power(Y[non_negative] * lmbda + 1, 1 / lmbda) - 1

        if abs(lmbda - 2) > tiny:
            restored[negative] = 1 - np.power(-(2 - lmbda) * Y[negative] + 1, 1 / (2 - lmbda))
        else:
            restored[negative] = -np.expm1(-Y[negative])
        return restored
class SinhArcSinhScaler(BaseEstimator, TransformerMixin):
    """
    Sinh-arcsinh scaler: ``sinh(delta * arcsinh(x) - epsilon)``.

    Parameters
    ----------
    epsilon : float, default=0.1
        Offset subtracted inside the hyperbolic sine.
    delta : float, default=1.0
        Factor multiplied with ``arcsinh(x)``; the inverse divides by it.
    """
    # https://stats.stackexchange.com/questions/43482/transformation-to-increase-kurtosis-and-skewness-of-normal-r-v

    def __init__(self, epsilon=0.1, delta=1.0):
        self.epsilon = epsilon
        self.delta = delta

    def fit(self, X, y=None):
        """Nothing is learned from the data, so the instance is returned as is."""
        return self

    def transform(self, X):
        """Return ``sinh(delta * arcsinh(X) - epsilon)`` element-wise."""
        inner = self.delta * np.arcsinh(X) - self.epsilon
        return np.sinh(inner)

    def inverse_transform(self, X):
        """Return ``sinh((arcsinh(X) + epsilon) / delta)`` element-wise, undoing ``transform``."""
        inner = (np.arcsinh(X) + self.epsilon) / self.delta
        return np.sinh(inner)
class DataTransformer(BaseEstimator, TransformerMixin):
    """
    Chain of scalers applied one after another.

    Parameters
    ----------
    scaling_methods : str, or tuple / list / np.ndarray of str
        Name(s) of the scalers to chain; each must be a key of ``SUPPORTED_SCALERS``.
    list_dict_paras : dict, or tuple / list / np.ndarray of dict, default=None
        Keyword arguments for the scaler constructors: a single dict when one
        method name is given as a string, otherwise one entry per method. An
        entry that is not a dict is treated as "no parameters". None means every
        scaler is built with its defaults.

    Raises
    ------
    TypeError
        If ``scaling_methods`` or ``list_dict_paras`` has an unsupported type.
    ValueError
        If a method name is not supported, or the number of parameter entries
        differs from the number of methods.
    """

    SUPPORTED_SCALERS = {"standard": StandardScaler, "minmax": MinMaxScaler, "max-abs": MaxAbsScaler,
                         "log1p": Log1pScaler, "loge": LogeScaler, "sqrt": SqrtScaler,
                         "sinh-arc-sinh": SinhArcSinhScaler, "robust": RobustScaler,
                         "box-cox": BoxCoxScaler, "yeo-johnson": YeoJohnsonScaler}

    def __init__(self, scaling_methods=('standard', ), list_dict_paras=None):
        if isinstance(scaling_methods, str):
            if list_dict_paras is None:
                self.list_dict_paras = [{}]
            elif isinstance(list_dict_paras, dict):
                self.list_dict_paras = [list_dict_paras]
            else:
                raise TypeError(f"You use only 1 scaling method, the list_dict_paras should be dict of parameter for that scaler.")
            self.scaling_methods = [scaling_methods]
        elif isinstance(scaling_methods, (tuple, list, np.ndarray)):
            if list_dict_paras is None:
                # one independent dict per scaler ([{}] * n would alias a single dict)
                self.list_dict_paras = [{} for _ in range(len(scaling_methods))]
            elif isinstance(list_dict_paras, (tuple, list, np.ndarray)):
                self.list_dict_paras = list(list_dict_paras)
            else:
                raise TypeError(f"Invalid type of list_dict_paras. Supported type are: tuple, list, or np.ndarray of parameter dict")
            self.scaling_methods = list(scaling_methods)
        else:
            raise TypeError(f"Invalid type of scaling_methods. Supported type are: str, tuple, list, or np.ndarray")
        # zip() stops at the shorter sequence, which would silently drop scaling
        # methods (or parameter dicts) on a length mismatch.
        if len(self.list_dict_paras) != len(self.scaling_methods):
            raise ValueError(f"list_dict_paras has {len(self.list_dict_paras)} entries "
                             f"but {len(self.scaling_methods)} scaling methods were given.")
        self.scalers = [self._get_scaler(technique, paras)
                        for (technique, paras) in zip(self.scaling_methods, self.list_dict_paras)]

    def _get_scaler(self, technique, paras):
        """Instantiate the scaler registered under ``technique`` with ``paras`` as keyword arguments."""
        if technique not in self.SUPPORTED_SCALERS:
            raise ValueError(f"Invalid scaling technique. Supported techniques are {self.SUPPORTED_SCALERS.keys()}")
        if not isinstance(paras, dict):
            paras = {}
        return self.SUPPORTED_SCALERS[technique](**paras)

    def fit(self, X, y=None):
        """Fit each scaler on the output of the previous one and return self."""
        for scaler in self.scalers:
            X = scaler.fit_transform(X)
        return self

    def transform(self, X):
        """Apply the fitted scalers in the order they were configured."""
        for scaler in self.scalers:
            X = scaler.transform(X)
        return X

    def inverse_transform(self, X):
        """Undo the scalers in reverse order."""
        for scaler in reversed(self.scalers):
            X = scaler.inverse_transform(X)
        return X
class Data:
    """
    The structure of our supported Data class

    Parameters
    ----------
    X : np.ndarray
        The features of your data

    y : np.ndarray
        The labels of your data

    name : str, default="Unknown"
        A name stored on the instance
    """

    SUPPORT = {
        "scaler": list(DataTransformer.SUPPORTED_SCALERS.keys())
    }

    def __init__(self, X=None, y=None, name="Unknown"):
        self.X = X
        self.y = self.check_y(y)
        self.name = name
        self.X_train, self.y_train, self.X_test, self.y_test = None, None, None, None

    @staticmethod
    def check_y(y):
        """
        Squeeze ``y`` and turn a vector of labels into a column of shape (n, 1).

        None is returned unchanged. Arrays that still have two or more dimensions
        after squeezing are returned as squeezed.
        """
        if y is None:
            return y
        y = np.squeeze(np.asarray(y))
        # ndim == 0 is a single label that squeeze() collapsed; it becomes a (1, 1) column
        if y.ndim <= 1:
            y = np.reshape(y, (-1, 1))
        return y

    @staticmethod
    def scale(X, scaling_methods=('standard', ), list_dict_paras=None):
        """
        Fit a DataTransformer on ``X`` and return the scaled data with the fitted transformer.

        Parameters
        ----------
        X : array-like
            Data of shape (n samples, m features); a 1-D input is treated as a single feature column.
        scaling_methods : str, tuple, list or np.ndarray
            Forwarded to DataTransformer.
        list_dict_paras : dict, tuple, list or np.ndarray, default=None
            Forwarded to DataTransformer.

        Returns
        -------
        data, scaler
            The transformed data and the fitted DataTransformer.

        Raises
        ------
        TypeError
            If ``X`` has three or more dimensions after squeezing.
        """
        X = np.asarray(X)
        # A 2-D matrix is used as is: squeezing a single-row (1, m) matrix would turn
        # its m features into m samples of one feature.
        if X.ndim != 2:
            X = np.squeeze(X)
        if X.ndim <= 1:
            X = np.reshape(X, (-1, 1))
        if X.ndim >= 3:
            raise TypeError(f"Invalid X data type. It should be array-like with shape (n samples, m features)")
        scaler = DataTransformer(scaling_methods=scaling_methods, list_dict_paras=list_dict_paras)
        data = scaler.fit_transform(X)
        return data, scaler

    @staticmethod
    def encode_label(y):
        """
        Fit a LabelEncoder on ``y`` and return the encoded labels with the fitted encoder.

        Raises
        ------
        TypeError
            If ``y`` is not one-dimensional after squeezing.
        """
        y = np.squeeze(np.asarray(y))
        if y.ndim != 1:
            raise TypeError(f"Invalid y data type. It should be a vector / array-like with shape (n samples,)")
        scaler = LabelEncoder()
        data = scaler.fit_transform(y)
        return data, scaler

    def split_train_test(self, test_size=0.2, train_size=None, random_state=41, shuffle=True, stratify=None, inplace=True):
        """
        The wrapper of the split_train_test function in scikit-learn library.

        The four splits are always stored on the instance (``X_train``, ``X_test``,
        ``y_train``, ``y_test``). ``inplace`` only decides whether they are returned
        as well: with ``inplace=False`` the tuple
        ``(X_train, X_test, y_train, y_test)`` is returned, otherwise None.
        """
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=test_size, train_size=train_size,
            random_state=random_state, shuffle=shuffle, stratify=stratify)
        if not inplace:
            return self.X_train, self.X_test, self.y_train, self.y_test

    def set_train_test(self, X_train=None, y_train=None, X_test=None, y_test=None):
        """
        Function use to set your own X_train, y_train, X_test, y_test in case you don't want to use our split function

        Parameters
        ----------
        X_train : np.ndarray
        y_train : np.ndarray
        X_test : np.ndarray
        y_test : np.ndarray

        Returns
        -------
        self : Data
            The instance itself, so that calls can be chained.
        """
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        return self