# Source code for beamds.beam.algorithm.config


from ..config import BeamParam, DeviceConfig, ExperimentConfig, BeamConfig
from ..similarity import SimilarityConfig, TFIDFConfig



# [docs]
class TextGroupExpansionConfig(SimilarityConfig, TFIDFConfig):
    """
    Configuration for text group expansion.

    Extends ``SimilarityConfig`` and ``TFIDFConfig`` with options for the
    tokenizer, the dense text-similarity model, the number of sparse/dense
    neighbours to keep, the compression of the tfidf and dense vectors, and
    the PU classifier.
    """

    # NOTE(review): leftover model name, presumably an alternative that was
    # once considered for tokenization — confirm or remove.
    # "en_core_web_trf"

    # Presumably these replace the default values of parameters declared by
    # the parent configs — confirm against the BeamConfig implementation.
    defaults = {
        'chunksize': 1000,
        'n_workers': 40,
        'mp_method': 'apply_async',
        'store_chunk': True,
        'store_path': None,
        'store_suffix': '.parquet',
        'override': False,
        'sparse_framework': 'scipy',
    }

    # All entries use the keyword form (type=, default=, help=) for consistency.
    parameters = [
        # Tokenizer and dense model
        BeamParam('tokenizer', type=str, default="BAAI/bge-base-en-v1.5", help='Tokenizer model'),
        BeamParam('dense-model', type=str, default="BAAI/bge-base-en-v1.5",
                  help='Dense model for text similarity'),
        BeamParam('dense_model_device', type=str, default='cuda', help='Device for dense model'),
        BeamParam('tokenizer-chunksize', type=int, default=10000, help='Chunksize for tokenizer'),
        BeamParam('batch_size', type=int, default=32, help='Batch size for dense model'),

        # Neighbour counts and prediction threshold
        BeamParam('k-sparse', type=int, default=50,
                  help='Number of sparse similarities to include in the dataset'),
        BeamParam('k-dense', type=int, default=50,
                  help='Number of dense similarities to include in the dataset'),
        BeamParam('threshold', type=float, default=0.5, help='Threshold for prediction model'),

        # Vector compression; the svd help text previously said "PCA" by mistake.
        BeamParam('svd-components', type=int, default=64,
                  help='Number of SVD components to use to compress the tfidf vectors'),
        BeamParam('pca-components', type=int, default=64,
                  help='Number of PCA components to use to compress the dense vectors'),

        # Classifier
        BeamParam('pu-n-estimators', type=int, default=20,
                  help='Number of estimators for the PU classifier'),
        BeamParam('pu-verbose', type=int, default=10, help='Verbosity level for the PU classifier'),
        BeamParam('classifier-type', type=str, default=None,
                  help='can be one of [None, catboost, rf]'),
        BeamParam('early_stopping_rounds', type=int, default=None,
                  help='Early stopping rounds for the classifier'),
    ]




# [docs]
class CatboostConfig(DeviceConfig):
    """
    CatBoost configuration with detailed parameter documentation.

    Each entry in ``parameters`` is declared as
    ``BeamParam(name, type, default, help)``. The help text of every entry
    spells out the documented default, the valid range or options, and the
    effect of the parameter.

    NOTE(review): most entries are declared with a default of ``None`` while
    their help text quotes a concrete default (e.g. ``iterations``: 1000);
    presumably ``None`` leaves the choice to CatBoost — confirm in the code
    that consumes this config.

    References:
    - https://catboost.ai/docs/references/training-parameters.html
    - https://docs.aws.amazon.com/sagemaker/latest/dg/catboost-hyperparameters.html
    """

    # Presumably these replace the default values of parameters declared by
    # parent configs — confirm against the BeamConfig implementation.
    defaults = {'objective': None, 'objective_to_report': 'best'}

    # CatBoost parameters.
    # The help texts rely on implicit string-literal concatenation, so every
    # fragment except the last must end with a trailing space.
    parameters = [
        BeamParam(
            'cb-task',
            str,
            'classification',
            'The task type for the CatBoost model. '
            'Default: classification. '
            'Options: [classification, regression, ranking]. '
            'Determines the type of task and influences the loss function and evaluation metrics.'
        ),
        BeamParam(
            'log-frequency',
            int,
            10,
            'The frequency (in iterations) of logging for the CatBoost model. '
            'Default: 10. '
            'Range: [1, ∞). '
            'Affects how often training progress is reported.'
        ),

        # Core parameters
        BeamParam(
            'loss_function',
            str,
            'Logloss',
            'The loss function for the CatBoost model. '
            'Default: Logloss for classification. '
            'Options: [Logloss, RMSE, MAE, Quantile, MAPE, Poisson, etc.]. '
            'Determines the optimization objective and affects model predictions.'
        ),
        BeamParam(
            'eval_metric',
            str,
            None,
            'The evaluation metric for the CatBoost model. '
            'Default: Auto-detected based on task type. '
            'Options: [Accuracy, AUC, RMSE, MAE, etc.]. '
            'Used for evaluating the performance of the model on validation data.'
        ),
        BeamParam(
            'custom_metric',
            list,
            None,
            'The custom metric for the CatBoost model. '
            'Default: None. '
            'Options: [Precision, Recall, F1, etc.]. '
            'Provides additional metrics for evaluation.'
        ),

        # Training parameters
        BeamParam(
            'iterations',
            int,
            None,
            'The number of trees (iterations) in the CatBoost model. '
            'Default: 1000. '
            'Range: [1, ∞). '
            'Higher values may improve performance but can increase training time and risk overfitting.'
        ),
        BeamParam(
            'learning_rate',
            float,
            None,
            'The learning rate for the CatBoost model. '
            'Default: 0.03. '
            'Range: (0.0, 1.0]. '
            'Controls the step size at each iteration while moving towards a minimum of the loss function.'
        ),
        BeamParam(
            'depth',
            int,
            None,
            'The depth of the trees in the CatBoost model. '
            'Default: 6. '
            'Range: [1, 16]. '
            'Deeper trees can capture more complex patterns but may lead to overfitting.'
        ),
        BeamParam(
            'l2_leaf_reg',
            float,
            None,
            'The L2 regularization term on the cost function. '
            'Default: 3.0. '
            'Range: (0, ∞). '
            'Helps prevent overfitting by penalizing large weights.'
        ),

        # Overfitting detection
        BeamParam(
            'od_pval',
            float,
            None,
            'The threshold for the overfitting detector. '
            'Default: None. '
            'Range: (0, 1). '
            # Trailing space added: the two sentences used to run together.
            'Stops training if the performance on the validation set does not improve by this value. '
            'For best results, it is recommended to set a value in the range [ 1e-10 ; 1e-2 ]'
        ),
        BeamParam(
            'od_wait',
            int,
            None,
            'Number of iterations to wait after the overfitting criterion is reached before stopping training. '
            'Default: 20. '
            'Range: [1, ∞). '
            'Prevents premature stopping by allowing continued training for a set number of iterations.'
        ),
        BeamParam(
            'od_type',
            str,
            None,
            'The overfitting detection type. '
            'Default: IncToDec. '
            'Options: [IncToDec, Iter]. '
            'Determines how overfitting is detected during training.'
        ),

        # Regularization parameters
        BeamParam(
            'bagging_temperature',
            float,
            None,
            'Controls the Bayesian bootstrap and helps in reducing overfitting by using random weights. '
            'Default: 1.0. '
            'Range: [0.0, ∞). '
            'Higher values increase randomness, helping to reduce overfitting.'
        ),
        BeamParam(
            'random_strength',
            float,
            None,
            'The amount of randomness to use for scoring splits when the tree structure is selected. '
            # Trailing space added: this sentence used to run into "Default:".
            'Use this parameter to avoid overfitting the model. '
            'Default: 1.0. '
            'Range: [0.0, ∞). '
            'Adds randomness to scoring splits, helping prevent overfitting.'
        ),

        # Feature processing
        BeamParam(
            'border_count',
            int,
            None,
            'The number of splits for numerical features (max_bin). '
            'Default: The default value depends on the processing unit type and other parameters: '
            'CPU: 254 '
            'GPU in PairLogitPairwise and YetiRankPairwise modes: 32 '
            'GPU in all other modes: 128 '
            'Range: [1, 65535]. '
            'Affects the granularity of feature discretization; higher values can improve accuracy but increase complexity.'
        ),
        BeamParam(
            'feature_border_type',
            str,
            None,
            'The feature border type. '
            'Default: GreedyLogSum. '
            'Options: [Median, Uniform, UniformAndQuantiles, MaxLogSum, GreedyLogSum, MinEntropy]. '
            'Determines how feature borders are selected, impacting split decisions.'
        ),
        BeamParam(
            'per_float_feature_quantization',
            str,
            None,
            'The per float feature quantization. '
            'Default: None. '
            'See: https://catboost.ai/en/docs/references/training-parameters/quantization. '
            'Allows custom quantization for specific features.'
        ),

        # Advanced tree options
        BeamParam(
            'grow_policy',
            str,
            None,
            'Defines how to perform greedy tree construction. '
            'Default: SymmetricTree. '
            'Options: [SymmetricTree, Depthwise, Lossguide]. '
            'Determines the strategy for tree growth, affecting complexity and interpretability.'
        ),
        BeamParam(
            'max_leaves',
            int,
            None,
            'The maximum number of leaves in the resulting tree. '
            'Default: None. '
            'Range: [2, 64]. '
            'Applicable for Lossguide grow policy; limits the complexity of the tree.'
        ),

        # Sampling and randomness
        BeamParam(
            'rsm',
            float,
            None,
            'Random subspace method for feature selection. '
            'Default: 1.0. '
            'Range: (0.0, 1.0]. '
            'Percentage of features used at each split selection, allowing randomness in feature selection.'
        ),
        BeamParam(
            'sampling_frequency',
            str,
            None,
            'Frequency to sample weights and objects during tree building. '
            'Default: PerTree. '
            'Options: [PerTree, PerTreeLevel]. '
            'Determines how often samples are drawn during training.'
        ),
        BeamParam(
            'bootstrap_type',
            str,
            None,
            'The bootstrap type. '
            'Default: Bayesian. '
            'Options: [Bayesian, Bernoulli, No, MVS]. '
            'Controls how samples are drawn for training, affecting robustness and variance.'
        ),

        # Leaf estimation
        BeamParam(
            'leaf_estimation_iterations',
            int,
            None,
            'The number of iterations to calculate values in leaves. '
            'Default: 1. '
            'Range: [1, ∞). '
            'Higher values improve accuracy at the cost of increased training time.'
        ),
        BeamParam(
            'leaf_estimation_method',
            str,
            None,
            'The method used to calculate values in leaves. '
            'Default: Newton. '
            'Options: [Newton, Gradient]. '
            'Determines the approach for estimating leaf values, affecting convergence speed and accuracy.'
        ),

        # Logging and output
        BeamParam(
            'snapshot_interval',
            int,
            600,
            'The snapshot interval for model saving [in seconds]. '
            'Default: 600. '
            'Range: [1, ∞). '
            'Controls how often model snapshots are saved, useful for resuming training.'
        ),
        BeamParam(
            'boosting_type',
            str,
            'Plain',
            'Controls the boosting scheme. '
            'Default: Plain. '
            'Options: [Ordered, Plain]. '
            'Ordered is used to eliminate the effect of a training set order.'
        ),
        BeamParam(
            'allow_const_label',
            bool,
            False,
            'Allows training on a dataset with constant labels. '
            'Default: False. '
            'Useful for experimentation or testing.'
        ),

        # Training parameters (additional)
        BeamParam(
            'auto_class_weights',
            str,
            None,
            'Automatically calculates class weights for imbalanced datasets. '
            'Default: None. '
            'Options: [Balanced, SqrtBalanced, None].'
        ),

        # Regularization parameters (additional)
        BeamParam(
            'l1_leaf_reg',
            float,
            None,
            'L1 regularization term on weights. '
            'Default: 0.0. '
            'Range: (0, ∞). '
            'Helps prevent overfitting by penalizing large weights.'
        ),

        # Feature processing (additional)
        BeamParam(
            'one_hot_max_size',
            int,
            None,
            'Maximum size of the categorical feature for one-hot encoding. '
            'Default: 2. '
            'Range: [1, ∞). '
            'Larger sizes use a more efficient embedding.'
        ),

        # Advanced tree options (additional)
        BeamParam(
            'min_data_in_leaf',
            int,
            None,
            'Minimum number of samples per leaf. '
            'Default: 1. '
            'Range: [1, ∞). '
            'Controls leaf size and can affect overfitting and generalization.'
        ),

        # Sampling and randomness (additional)
        BeamParam(
            'bagging_fraction',
            float,
            None,
            'Fraction of samples to use in each iteration. '
            'Default: 1.0. '
            'Range: (0.0, 1.0]. '
            'Controls randomness and variance.'
        ),

        # Leaf estimation (additional)
        BeamParam(
            'leaf_estimation_backtracking',
            str,
            None,
            'Backtracking type used for leaf estimation. '
            'Default: AnyImprovement. '
            'Options: [No, AnyImprovement, Armijo]. '
            'Affects convergence and accuracy.'
        ),

    ]




# [docs]
class CatboostExperimentConfig(CatboostConfig, ExperimentConfig):
    """Experiment-level CatBoost configuration: ``CatboostConfig`` combined with ``ExperimentConfig``."""

    # Default project name and algorithm name for this configuration.
    defaults = dict(
        project='cb_beam',
        algorithm='CBAlgorithm',
    )