Skip to content

AdaBoost

toyml.ensemble.adaboost.AdaBoost dataclass

AdaBoost(weak_learner: Type[BaseWeakLeaner], n_weak_learner: int = 5, predict_labels_: Optional[list[int]] = None, training_error_rate_: Optional[float] = None, _n: int = -1, _labels: list[int] = list(), _weights: list[float] = list(), _base_clf_labels: list[list[int]] = list(), _weak_learner_predicts: List[Callable[..., Any]] = list(), _alphas: list[float] = list())

An implementation of the AdaBoost algorithm.

Examples:

>>> from toyml.ensemble.adaboost import AdaBoost, OneDimensionClassifier
>>> dataset = [[0], [1], [2], [3], [4], [5], [6], [7], [8], [9]]
>>> labels = [1, 1, 1, -1, -1, -1, 1, 1, 1, -1]
>>> ada = AdaBoost(weak_learner=OneDimensionClassifier, n_weak_learner=3).fit(dataset, labels)
>>> print(f"Training dataset error rate: {ada.training_error_rate_}")
Training dataset error rate: 0.0
>>> test_sample = [1.5]
>>> print(f"The label of {test_sample} is {ada.predict(test_sample)}")
The label of [1.5] is 1
References
  1. Li Hang
  2. Zhou Zhihua

weak_learner instance-attribute

weak_learner: Type[BaseWeakLeaner]

The weak learner to be used in the AdaBoost algorithm.

n_weak_learner class-attribute instance-attribute

n_weak_learner: int = 5

The number of weak learners to be used in the AdaBoost algorithm.

predict_labels_ class-attribute instance-attribute

predict_labels_: Optional[list[int]] = None

The labels predicted for the training dataset.

training_error_rate_ class-attribute instance-attribute

training_error_rate_: Optional[float] = None

The error rate of the training dataset.

fit

fit(dataset: list[list[float]], labels: list[int]) -> AdaBoost

Fit the AdaBoost model.

Source code in toyml/ensemble/adaboost.py
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
def fit(
    self,
    dataset: list[list[float]],
    labels: list[int],
) -> AdaBoost:
    """
    Fit the AdaBoost model.

    Trains ``n_weak_learner`` weak learners one after another. After each
    round every sample weight is multiplied by
    ``exp(-alpha * label * prediction)`` and the weights are re-normalised
    to sum to 1 before the next learner is trained.

    Args:
        dataset: Training samples, one feature list per sample.
        labels: Training labels, one per sample. The weight update and
            ``predict`` assume the binary encoding -1 / 1.

    Returns:
        The fitted model itself, so calls can be chained.

    Raises:
        ValueError: If ``labels`` is empty or ``dataset`` and ``labels``
            differ in length.
    """
    if not labels:
        raise ValueError("Cannot fit AdaBoost on an empty training set!")
    if len(dataset) != len(labels):
        raise ValueError(
            f"dataset and labels must have the same length, got {len(dataset)} and {len(labels)}"
        )

    self._labels = labels
    # for model training(Gm)
    self._n = len(labels)
    self._weights: Any = [1.0 / self._n] * self._n
    # -2 is a placeholder that collides with neither multi-class labels
    # (0, 1, 2, ...) nor binary labels (-1, 1)
    self._base_clf_labels = [[-2] * self._n for _ in range(self.n_weak_learner)]
    # predict callables of the fitted weak learners, in training order
    self._weak_learner_predicts: List[Callable[..., int]] = []
    self._alphas = [0.0] * self.n_weak_learner

    # Error rates are clipped into [epsilon, 1 - epsilon] so the log-odds stay
    # finite when a weak learner is perfect (error 0) or always wrong (error 1).
    epsilon = 1e-10

    for m in range(self.n_weak_learner):
        model = self.weak_learner().fit(dataset, self._weights, self._labels)
        self._base_clf_labels[m] = model.get_predict_labels()
        self._weak_learner_predicts.append(model.predict)
        error_rate = model.get_error_rate()
        # Warning when the error rate is too large
        if error_rate > 0.5:
            logger.warning(f"Weak learner error rate = {error_rate} > 0.5")
        clipped_error = min(max(error_rate, epsilon), 1.0 - epsilon)
        alpha = 0.5 * math.log((1 - clipped_error) / clipped_error)
        self._alphas[m] = alpha
        # update the weights
        weights = [
            self._weights[i] * math.exp(-alpha * self._labels[i] * self._base_clf_labels[m][i])
            for i in range(self._n)
        ]
        # compute the normaliser once instead of once per sample
        total_weight = sum(weights)
        self._weights = [weight / total_weight for weight in weights]
    # collect training dataset result
    self.predict_labels_ = [self.predict(x) for x in dataset]
    n_errors = sum(predicted != label for predicted, label in zip(self.predict_labels_, self._labels))
    self.training_error_rate_ = n_errors / self._n
    return self

predict

predict(x: list[float]) -> int

Predict the label of the input sample.

Source code in toyml/ensemble/adaboost.py
135
136
137
138
139
140
141
142
143
144
145
146
def predict(self, x: list[float]) -> int:
    """
    Predict the label of the input sample.

    Sums the predictions of the first ``n_weak_learner`` weak learners,
    each scaled by its alpha, and maps the result to a label.

    Args:
        x: The input sample.

    Returns:
        1 when the weighted sum is non-negative, otherwise -1.
    """
    weighted_vote = sum(
        self._alphas[idx] * self._weak_learner_predicts[idx](x)
        for idx in range(self.n_weak_learner)
    )
    # a sum of exactly zero counts as the positive class
    return 1 if weighted_vote >= 0 else -1

toyml.ensemble.adaboost.OneDimensionClassifier dataclass

OneDimensionClassifier(_sign_mode: SignMode = POS_NEG, _best_cut: float = inf, error_rate_: float = inf, predict_labels_: Optional[list[int]] = None)

Bases: BaseWeakLeaner

Binary classifier for a single one-dimensional feature.

Ref: Li Hang, 1 ed, E8.1.3

fit

fit(dataset: list[list[float]], weights: list[float], labels: list[int]) -> OneDimensionClassifier

Fit the one-dimension classifier.

Source code in toyml/ensemble/adaboost.py
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
def fit(
    self,
    dataset: list[list[float]],
    weights: list[float],
    labels: list[int],
) -> OneDimensionClassifier:
    """
    Fit the one-dimension classifier.

    Stores the sign mode, cut and error rate returned by ``get_best_cut``,
    then records this classifier's prediction for every training sample.

    Args:
        dataset: Training samples.
        weights: Sample weights, forwarded to ``get_best_cut``.
        labels: Training labels, forwarded to ``get_best_cut``.

    Returns:
        The fitted classifier itself.
    """
    # get_best_cut yields (sign_mode, cut, error_rate)
    self._sign_mode, self._best_cut, self.error_rate_ = self.get_best_cut(dataset, weights, labels)
    # one slot per label, filled in by sample position
    self.predict_labels_ = [0] * len(labels)
    for position, sample in enumerate(dataset):
        self.predict_labels_[position] = self.predict(sample)
    return self

predict

predict(x: list[float]) -> int

Predict the label of the input sample.

Source code in toyml/ensemble/adaboost.py
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
def predict(self, x: list[float]) -> int:
    """
    Predict the label of the input sample.

    Args:
        x: Input sample; only its first feature ``x[0]`` is used.

    Returns:
        1 or -1. In ``POS_NEG`` mode values at or below the cut map to 1
        and larger values to -1; in the other mode the mapping is reversed.

    Raises:
        ValueError: If the model has not been fitted yet.
    """
    # The cut defaults to infinity before fitting, so a None test alone
    # would never detect an unfitted model.
    if self._best_cut is None or self._best_cut == float("inf"):
        raise ValueError("The model is not fitted yet!")
    at_or_below_cut = x[0] <= self._best_cut
    # NOTE(review): the mode is compared against the string "POS_NEG"; this
    # presumably relies on SignMode members comparing equal to their names - confirm.
    if self._sign_mode == "POS_NEG":
        return 1 if at_or_below_cut else -1
    return -1 if at_or_below_cut else 1

get_error_rate

get_error_rate() -> float

Get the error rate of the training dataset.

Source code in toyml/ensemble/adaboost.py
202
203
204
205
206
207
208
def get_error_rate(self) -> float:
    """
    Get the error rate of the training dataset.

    Returns:
        The error rate stored by ``fit``.

    Raises:
        ValueError: If the model has not been fitted yet.
    """
    # The error rate defaults to infinity before fitting, so a None test
    # alone would never detect an unfitted model.
    if self.error_rate_ is None or self.error_rate_ == float("inf"):
        raise ValueError("The model is not fitted yet!")
    return self.error_rate_

get_predict_labels

get_predict_labels() -> list[int]

Get the prediction labels of the training dataset.

Source code in toyml/ensemble/adaboost.py
210
211
212
213
214
215
216
def get_predict_labels(self) -> list[int]:
    """
    Get the prediction labels of the training dataset.

    Returns:
        The labels stored in ``predict_labels_``.

    Raises:
        ValueError: If ``predict_labels_`` is still ``None``, i.e. the
            model has not been fitted yet.
    """
    stored_labels = self.predict_labels_
    if stored_labels is not None:
        return stored_labels
    raise ValueError("The model is not fitted yet!")

get_best_cut

get_best_cut(dataset: list[list[float]], weights: list[float], labels: list[int]) -> tuple[SignMode, float, float]

Get the best cut of the training dataset.

Source code in toyml/ensemble/adaboost.py
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
def get_best_cut(
    self, dataset: list[list[float]], weights: list[float], labels: list[int]
) -> tuple[SignMode, float, float]:
    """
    Get the best cut of the training dataset.

    Every candidate cut is scored under both sign modes and the
    (mode, cut) pair with the lowest error rate is returned. On ties the
    pair evaluated first is kept: earlier cuts first, and ``POS_NEG``
    before ``NEG_POS`` for the same cut.

    Args:
        dataset: Training samples; only the first feature of each sample is used.
        weights: Sample weights, forwarded to ``_get_cut_error_rate``.
        labels: Training labels, forwarded to ``_get_cut_error_rate``.

    Returns:
        A ``(sign_mode, best_cut, best_error_rate)`` tuple.

    Raises:
        ValueError: If no candidate cut is available for ``dataset``.
    """
    points = [x[0] for x in dataset]
    sign_modes = (self.SignMode.POS_NEG, self.SignMode.NEG_POS)
    # (sign_mode, cut, error_rate)
    candidate_cuts_result = [
        (sign_mode, cut, self._get_cut_error_rate(cut, points, weights, labels, sign_mode))
        for cut in self._get_candidate_cuts(points)
        for sign_mode in sign_modes
    ]
    if not candidate_cuts_result:
        raise ValueError("No candidate cut found for the given dataset!")
    # min() keeps the first minimal entry, the same one a stable sort would
    # place first, without sorting the whole list
    return min(candidate_cuts_result, key=lambda result: result[2])