beamds.beam.algorithm package#

Submodules#

beamds.beam.algorithm.catboost_algorithm module#

beamds.beam.algorithm.config module#

class beamds.beam.algorithm.config.CatboostConfig(*args, _store_init_path=None, _save_init_args=True, **kwargs)[source]#

Bases: DeviceConfig

CatBoost configuration with detailed parameter documentation. References: - https://catboost.ai/docs/references/training-parameters.html - https://docs.aws.amazon.com/sagemaker/latest/dg/catboost-hyperparameters.html

defaults = {'objective': None, 'objective_to_report': 'best'}#

parameters = [BeamParam(name='cb-task', type=<class 'str'>, default='classification', help='The task type for the CatBoost model. Default: classification. Options: [classification, regression, ranking]. Determines the type of task and influences the loss function and evaluation metrics.', tags=None), BeamParam(name='log-frequency', type=<class 'int'>, default=10, help='The frequency (in iterations) of logging for the CatBoost model. Default: 10. Range: [1, ∞). Affects how often training progress is reported.', tags=None), BeamParam(name='loss_function', type=<class 'str'>, default='Logloss', help='The loss function for the CatBoost model. Default: Logloss for classification. Options: [Logloss, RMSE, MAE, Quantile, MAPE, Poisson, etc.]. Determines the optimization objective and affects model predictions.', tags=None), BeamParam(name='eval_metric', type=<class 'str'>, default=None, help='The evaluation metric for the CatBoost model. Default: Auto-detected based on task type. Options: [Accuracy, AUC, RMSE, MAE, etc.]. Used for evaluating the performance of the model on validation data.', tags=None), BeamParam(name='custom_metric', type=<class 'list'>, default=None, help='The custom metric for the CatBoost model. Default: None. Options: [Precision, Recall, F1, etc.]. Provides additional metrics for evaluation.', tags=None), BeamParam(name='iterations', type=<class 'int'>, default=None, help='The number of trees (iterations) in the CatBoost model. Default: 1000. Range: [1, ∞). Higher values may improve performance but can increase training time and risk overfitting.', tags=None), BeamParam(name='learning_rate', type=<class 'float'>, default=None, help='The learning rate for the CatBoost model. Default: 0.03. Range: (0.0, 1.0]. Controls the step size at each iteration while moving towards a minimum of the loss function.', tags=None), BeamParam(name='depth', type=<class 'int'>, default=None, help='The depth of the trees in the CatBoost model. Default: 6. Range: [1, 16]. 
Deeper trees can capture more complex patterns but may lead to overfitting.', tags=None), BeamParam(name='l2_leaf_reg', type=<class 'float'>, default=None, help='The L2 regularization term on the cost function. Default: 3.0. Range: (0, ∞). Helps prevent overfitting by penalizing large weights.', tags=None), BeamParam(name='od_pval', type=<class 'float'>, default=None, help='The threshold for the overfitting detector. Default: None. Range: (0, 1). Stops training if the performance on the validation set does not improve by this value. For best results, it is recommended to set a value in the range [1e-10, 1e-2].', tags=None), BeamParam(name='od_wait', type=<class 'int'>, default=None, help='Number of iterations to wait after the overfitting criterion is reached before stopping training. Default: 20. Range: [1, ∞). Prevents premature stopping by allowing continued training for a set number of iterations.', tags=None), BeamParam(name='od_type', type=<class 'str'>, default=None, help='The overfitting detection type. Default: IncToDec. Options: [IncToDec, Iter]. Determines how overfitting is detected during training.', tags=None), BeamParam(name='bagging_temperature', type=<class 'float'>, default=None, help='Controls the Bayesian bootstrap and helps in reducing overfitting by using random weights. Default: 1.0. Range: [0.0, ∞). Higher values increase randomness, helping to reduce overfitting.', tags=None), BeamParam(name='random_strength', type=<class 'float'>, default=None, help='The amount of randomness to use for scoring splits when the tree structure is selected. Use this parameter to avoid overfitting the model. Default: 1.0. Range: [0.0, ∞). Adds randomness to scoring splits, helping prevent overfitting.', tags=None), BeamParam(name='border_count', type=<class 'int'>, default=None, help='The number of splits for numerical features (max_bin). 
Default: The default value depends on the processing unit type and other parameters: CPU: 254; GPU in PairLogitPairwise and YetiRankPairwise modes: 32; GPU in all other modes: 128. Range: [1, 65535]. Affects the granularity of feature discretization; higher values can improve accuracy but increase complexity.', tags=None), BeamParam(name='feature_border_type', type=<class 'str'>, default=None, help='The feature border type. Default: GreedyLogSum. Options: [Median, Uniform, UniformAndQuantiles, MaxLogSum, GreedyLogSum, MinEntropy]. Determines how feature borders are selected, impacting split decisions.', tags=None), BeamParam(name='per_float_feature_quantization', type=<class 'str'>, default=None, help='The per float feature quantization. Default: None. See: https://catboost.ai/en/docs/references/training-parameters/quantization. Allows custom quantization for specific features.', tags=None), BeamParam(name='grow_policy', type=<class 'str'>, default=None, help='Defines how to perform greedy tree construction. Default: SymmetricTree. Options: [SymmetricTree, Depthwise, Lossguide]. Determines the strategy for tree growth, affecting complexity and interpretability.', tags=None), BeamParam(name='max_leaves', type=<class 'int'>, default=None, help='The maximum number of leaves in the resulting tree. Default: None. Range: [2, 64]. Applicable for Lossguide grow policy; limits the complexity of the tree.', tags=None), BeamParam(name='rsm', type=<class 'float'>, default=None, help='Random subspace method for feature selection. Default: 1.0. Range: (0.0, 1.0]. Percentage of features used at each split selection, allowing randomness in feature selection.', tags=None), BeamParam(name='sampling_frequency', type=<class 'str'>, default=None, help='Frequency to sample weights and objects during tree building. Default: PerTree. Options: [PerTree, PerTreeLevel]. 
Determines how often samples are drawn during training.', tags=None), BeamParam(name='bootstrap_type', type=<class 'str'>, default=None, help='The bootstrap type. Default: Bayesian. Options: [Bayesian, Bernoulli, No, MVS]. Controls how samples are drawn for training, affecting robustness and variance.', tags=None), BeamParam(name='leaf_estimation_iterations', type=<class 'int'>, default=None, help='The number of iterations to calculate values in leaves. Default: 1. Range: [1, ∞). Higher values improve accuracy at the cost of increased training time.', tags=None), BeamParam(name='leaf_estimation_method', type=<class 'str'>, default=None, help='The method used to calculate values in leaves. Default: Newton. Options: [Newton, Gradient]. Determines the approach for estimating leaf values, affecting convergence speed and accuracy.', tags=None), BeamParam(name='snapshot_interval', type=<class 'int'>, default=600, help='The snapshot interval for model saving [in seconds]. Default: 600. Range: [1, ∞). Controls how often model snapshots are saved, useful for resuming training.', tags=None), BeamParam(name='boosting_type', type=<class 'str'>, default='Plain', help='Controls the boosting scheme. Default: Plain. Options: [Ordered, Plain]. Ordered is used to eliminate the effect of a training set order.', tags=None), BeamParam(name='allow_const_label', type=<class 'bool'>, default=False, help='Allows training on a dataset with constant labels. Default: False. Useful for experimentation or testing.', tags=None), BeamParam(name='auto_class_weights', type=<class 'str'>, default=None, help='Automatically calculates class weights for imbalanced datasets. Default: None. Options: [Balanced, SqrtBalanced, None].', tags=None), BeamParam(name='l1_leaf_reg', type=<class 'float'>, default=None, help='L1 regularization term on weights. Default: 0.0. Range: [0, ∞). 
Helps prevent overfitting by penalizing large weights.', tags=None), BeamParam(name='one_hot_max_size', type=<class 'int'>, default=None, help='Maximum size of the categorical feature for one-hot encoding. Default: 2. Range: [1, ∞). Larger sizes use a more efficient embedding.', tags=None), BeamParam(name='min_data_in_leaf', type=<class 'int'>, default=None, help='Minimum number of samples per leaf. Default: 1. Range: [1, ∞). Controls leaf size and can affect overfitting and generalization.', tags=None), BeamParam(name='bagging_fraction', type=<class 'float'>, default=None, help='Fraction of samples to use in each iteration. Default: 1.0. Range: (0.0, 1.0]. Controls randomness and variance.', tags=None), BeamParam(name='leaf_estimation_backtracking', type=<class 'str'>, default=None, help='Backtracking type used for leaf estimation. Default: AnyImprovement. Options: [No, AnyImprovement, Armijo]. Affects convergence and accuracy.', tags=None)]#

class beamds.beam.algorithm.config.CatboostExperimentConfig(*args, _store_init_path=None, _save_init_args=True, **kwargs)[source]#

Bases: CatboostConfig, ExperimentConfig

defaults = {'algorithm': 'CBAlgorithm', 'project': 'cb_beam'}#

class beamds.beam.algorithm.config.TextGroupExpansionConfig(*args, _store_init_path=None, _save_init_args=True, **kwargs)[source]#

defaults = {'chunksize': 1000, 'mp_method': 'apply_async', 'n_workers': 40, 'override': False, 'sparse_framework': 'scipy', 'store_chunk': True, 'store_path': None, 'store_suffix': '.parquet'}#

parameters = [BeamParam(name='tokenizer', type=<class 'str'>, default='BAAI/bge-base-en-v1.5', help='Tokenizer model', tags=None), BeamParam(name='dense-model', type=<class 'str'>, default='BAAI/bge-base-en-v1.5', help='Dense model for text similarity', tags=None), BeamParam(name='dense_model_device', type=<class 'str'>, default='cuda', help='Device for dense model', tags=None), BeamParam(name='tokenizer-chunksize', type=<class 'int'>, default=10000, help='Chunksize for tokenizer', tags=None), BeamParam(name='batch_size', type=<class 'int'>, default=32, help='Batch size for dense model', tags=None), BeamParam(name='k-sparse', type=<class 'int'>, default=50, help='Number of sparse similarities to include in the dataset', tags=None), BeamParam(name='k-dense', type=<class 'int'>, default=50, help='Number of dense similarities to include in the dataset', tags=None), BeamParam(name='threshold', type=<class 'float'>, default=0.5, help='Threshold for prediction model', tags=None), BeamParam(name='svd-components', type=<class 'int'>, default=64, help='Number of SVD components to use to compress the tfidf vectors', tags=None), BeamParam(name='pca-components', type=<class 'int'>, default=64, help='Number of PCA components to use to compress the dense vectors', tags=None), BeamParam(name='pu-n-estimators', type=<class 'int'>, default=20, help='Number of estimators for the PU classifier', tags=None), BeamParam(name='pu-verbose', type=<class 'int'>, default=10, help='Verbosity level for the PU classifier', tags=None), BeamParam(name='classifier-type', type=<class 'str'>, default=None, help='can be one of [None, catboost, rf]', tags=None), BeamParam(name='early_stopping_rounds', type=<class 'int'>, default=None, help='Early stopping rounds for the classifier', tags=None)]#

beamds.beam.algorithm package#

Submodules#

beamds.beam.algorithm.catboost_algorithm module#

beamds.beam.algorithm.config module#

beamds.beam.algorithm.core_algorithm module#

beamds.beam.algorithm.group_expansion module#

beamds.beam.algorithm.neural_algorithm module#

beamds.beam.algorithm.sklearn_algorithm module#

Module contents#

This Page