Skip to content

Naive Bayes

toyml.classification.naive_bayes.BaseNaiveBayes dataclass

BaseNaiveBayes(class_prior_: dict[Class, float] = dict())

Bases: ABC

class_prior_ class-attribute instance-attribute

class_prior_: dict[Class, float] = field(default_factory=dict)

The prior probability of each class in training dataset

predict

predict(sample: list[FeatureValue]) -> int

Predict the class label for a given sample.

PARAMETER DESCRIPTION
sample

A single sample to predict, represented as a list of feature values.

TYPE: list[FeatureValue]

RETURNS DESCRIPTION
int

Predicted class label.

TYPE: int

Source code in toyml/classification/naive_bayes.py
24
25
26
27
28
29
30
31
32
33
34
35
def predict(self, sample: list[FeatureValue]) -> int:
    """Predict the class label for a given sample.

    Args:
        sample: A single sample to predict, represented as a list of feature values.

    Returns:
        int: Predicted class label.
    """
    posteriors = self.predict_proba(sample)
    # The prediction is the class with the highest posterior probability;
    # ties keep the first class in iteration order, as before.
    best_label, _ = max(posteriors.items(), key=lambda item: item[1])
    return best_label

predict_proba

predict_proba(sample: list[FeatureValue], normalization: bool = True) -> dict[Class, float]

Predict class probabilities for a given sample.

PARAMETER DESCRIPTION
sample

A single sample to predict, represented as a list of feature values.

TYPE: list[FeatureValue]

normalization

Whether to normalize the probabilities. Default is True.

TYPE: bool DEFAULT: True

RETURNS DESCRIPTION
dict[Class, float]

dict[int, float]: Dictionary mapping class labels to their predicted probabilities.

Source code in toyml/classification/naive_bayes.py
37
38
39
40
41
42
43
44
45
46
47
48
def predict_proba(self, sample: list[FeatureValue], normalization: bool = True) -> dict[Class, float]:
    """Predict class probabilities for a given sample.

    Args:
        sample: A single sample to predict, represented as a list of feature values.
        normalization: Whether to normalize the probabilities. Default is True.

    Returns:
        dict[int, float]: Dictionary mapping class labels to their predicted probabilities.
    """
    log_posteriors = self.predict_log_proba(sample, normalization)
    probabilities: dict[Class, float] = {}
    # Exponentiate the log-posteriors back into plain probabilities.
    for label, log_prob in log_posteriors.items():
        probabilities[label] = math.exp(log_prob)
    return probabilities

predict_log_proba

predict_log_proba(sample: list[FeatureValue], normalization: bool = True) -> dict[Class, float]

Predict log probabilities for a given sample.

PARAMETER DESCRIPTION
sample

A single sample to predict, represented as a list of feature values.

TYPE: list[FeatureValue]

normalization

Whether to normalize the log probabilities. Default is True.

TYPE: bool DEFAULT: True

RETURNS DESCRIPTION
dict[Class, float]

dict[int, float]: Dictionary mapping class labels to their predicted log probabilities.

Source code in toyml/classification/naive_bayes.py
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
def predict_log_proba(self, sample: list[FeatureValue], normalization: bool = True) -> dict[Class, float]:
    """Predict log probabilities for a given sample.

    Args:
        sample: A single sample to predict, represented as a list of feature values.
        normalization: Whether to normalize the log probabilities. Default is True.

    Returns:
        dict[int, float]: Dictionary mapping class labels to their predicted log probabilities.
    """
    log_likelihoods = self._log_likelihood(sample)
    # Unnormalized log-posterior per class: log P(x | c) + log P(c).
    raw_posteriors: dict[int, float] = {
        label: log_likelihood + math.log(self.class_prior_[label])
        for label, log_likelihood in log_likelihoods.items()
    }
    if normalization is False:
        return raw_posteriors
    # Normalize with the log-sum-exp trick for numerical stability.
    # ref: https://github.com/scikit-learn/scikit-learn/blob/2beed55847ee70d363bdbfe14ee4401438fba057/sklearn/naive_bayes.py#L97
    max_log_prob = max(raw_posteriors.values())
    log_evidence = max_log_prob + math.log(
        sum(math.exp(log_prob - max_log_prob) for log_prob in raw_posteriors.values()),
    )
    return {label: log_prob - log_evidence for label, log_prob in raw_posteriors.items()}

toyml.classification.naive_bayes.GaussianNaiveBayes dataclass

GaussianNaiveBayes(class_prior_: dict[Class, float] = dict(), unbiased_variance: bool = True, var_smoothing: float = 1e-09, labels_: list[Class] = list(), class_count_: int = 0, means_: dict[Class, list[float]] = dict(), variances_: dict[Class, list[float]] = dict(), epsilon_: float = 0)

Bases: BaseNaiveBayes

Gaussian naive bayes classification algorithm implementation.

Examples:

>>> label = [0, 0, 0, 0, 1, 1, 1, 1]
>>> dataset = [
...     [6.00, 180, 12],
...     [5.92, 190, 11],
...     [5.58, 170, 12],
...     [5.92, 165, 10],
...     [5.00, 100, 6],
...     [5.50, 150, 8],
...     [5.42, 130, 7],
...     [5.75, 150, 9],
... ]
>>> clf = GaussianNaiveBayes().fit(dataset, label)
>>> clf.predict([6.00, 130, 8])
1

unbiased_variance class-attribute instance-attribute

unbiased_variance: bool = True

Use the unbiased variance estimation or not. Default is True.

var_smoothing class-attribute instance-attribute

var_smoothing: float = 1e-09

Portion of the largest variance of all features that is added to variances for calculation stability.

labels_ class-attribute instance-attribute

labels_: list[Class] = field(default_factory=list)

The labels in training dataset

class_count_ class-attribute instance-attribute

class_count_: int = 0

The number of classes in training dataset

class_prior_ class-attribute instance-attribute

class_prior_: dict[Class, float] = field(default_factory=dict)

The prior probability of each class in training dataset

means_ class-attribute instance-attribute

means_: dict[Class, list[float]] = field(default_factory=dict)

The means of each class in training dataset

variances_ class-attribute instance-attribute

variances_: dict[Class, list[float]] = field(default_factory=dict)

The variance of each class in training dataset

epsilon_ class-attribute instance-attribute

epsilon_: float = 0

The absolute additive value to variances.

fit

fit(dataset: list[list[FeatureValue]], labels: list[Class]) -> GaussianNaiveBayes

Fit the Gaussian Naive Bayes classifier.

PARAMETER DESCRIPTION
dataset

Training data, where each row is a sample and each column is a feature.

TYPE: list[list[FeatureValue]]

labels

Target labels for training data.

TYPE: list[Class]

RETURNS DESCRIPTION
self

Returns the instance itself.

TYPE: GaussianNaiveBayes

Source code in toyml/classification/naive_bayes.py
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
def fit(self, dataset: list[list[FeatureValue]], labels: list[Class]) -> GaussianNaiveBayes:
    """Fit the Gaussian Naive Bayes classifier.

    Args:
        dataset: Training data, where each row is a sample and each column is a feature.
        labels: Target labels for training data.

    Returns:
        self: Returns the instance itself.
    """
    self.labels_ = sorted(set(labels))
    self.class_count_ = len(set(labels))
    self.class_prior_ = dict.fromkeys(self.labels_, 1 / self.class_count_)
    self.epsilon_ = self.var_smoothing * max(self._variance(col) for col in zip(*dataset, strict=False))
    self.means_, self.variances_ = self._get_classes_means_variances(dataset, labels)
    return self

_log_likelihood

_log_likelihood(sample: list[FeatureValue]) -> dict[Class, float]

Calculate the log-likelihood of the sample under each class.

Source code in toyml/classification/naive_bayes.py
137
138
139
140
141
142
143
144
145
146
147
148
149
150
def _log_likelihood(self, sample: list[FeatureValue]) -> dict[Class, float]:
    """Compute the Gaussian log-likelihood of the sample under each class."""
    log_likelihoods: dict[Class, float] = {}
    for label in self.labels_:
        means = self.means_[label]
        variances = self.variances_[label]
        # Sum the per-feature log-densities of N(mean_i, variance_i):
        # log N(x; m, v) = -0.5 * log(2*pi*v) - (x - m)^2 / (2*v)
        total = 0.0
        for i, x in enumerate(sample):
            total += -0.5 * math.log(2 * math.pi * variances[i]) - (
                (x - means[i]) ** 2 / (2 * variances[i])
            )
        log_likelihoods[label] = total
    return log_likelihoods

_dataset_column_means staticmethod

_dataset_column_means(dataset: list[list[FeatureValue]]) -> list[float]

Calculate the mean of every column (feature) of the dataset.

Source code in toyml/classification/naive_bayes.py
166
167
168
169
@staticmethod
def _dataset_column_means(dataset: list[list[FeatureValue]]) -> list[float]:
    """Compute the mean of every column (feature) of the dataset."""
    # zip(*dataset) transposes rows into columns; strict=True rejects ragged rows.
    return list(map(statistics.mean, zip(*dataset, strict=True)))

_dataset_column_variances

_dataset_column_variances(dataset: list[list[FeatureValue]]) -> list[float]

Calculate the variance of every column (feature) of the dataset.

Source code in toyml/classification/naive_bayes.py
171
172
173
def _dataset_column_variances(self, dataset: list[list[FeatureValue]]) -> list[float]:
    """Compute the smoothed variance of every column (feature) of the dataset."""
    smoothed_variances = []
    # epsilon_ is the absolute additive term that keeps variances away from zero.
    for column in zip(*dataset, strict=True):
        smoothed_variances.append(self._variance(column) + self.epsilon_)
    return smoothed_variances

toyml.classification.naive_bayes.MultinomialNaiveBayes dataclass

MultinomialNaiveBayes(class_prior_: dict[Class, float] = dict(), alpha: float = 1.0, labels_: list[Class] = list(), class_count_: int = 0, class_feature_count_: dict[Class, list[int]] = dict(), class_feature_log_prob_: dict[Class, list[float]] = dict())

Bases: BaseNaiveBayes

Multinomial Naive Bayes classifier.

Examples:

>>> import random
>>> rng = random.Random(0)
>>> dataset = [[rng.randint(0, 5) for _ in range(100)] for _ in range(6)]
>>> label = [1, 2, 3, 4, 5, 6]
>>> clf = MultinomialNaiveBayes().fit(dataset, label)
>>> clf.predict(dataset[2])
3

alpha class-attribute instance-attribute

alpha: float = 1.0

Additive (Laplace/Lidstone) smoothing parameter

labels_ class-attribute instance-attribute

labels_: list[Class] = field(default_factory=list)

The labels in training dataset

class_count_ class-attribute instance-attribute

class_count_: int = 0

The number of classes in training dataset

class_prior_ class-attribute instance-attribute

class_prior_: dict[Class, float] = field(default_factory=dict)

The prior probability of each class in training dataset

class_feature_count_ class-attribute instance-attribute

class_feature_count_: dict[Class, list[int]] = field(default_factory=dict)

The feature value counts of each class in training dataset

class_feature_log_prob_ class-attribute instance-attribute

class_feature_log_prob_: dict[Class, list[float]] = field(default_factory=dict)

The feature value probability of each class in training dataset

fit

fit(dataset: list[list[FeatureValue]], labels: list[Class]) -> MultinomialNaiveBayes

Fit the Multinomial Naive Bayes classifier.

PARAMETER DESCRIPTION
dataset

Training data, where each row is a sample and each column is a feature. Features should be represented as counts (non-negative integers).

TYPE: list[list[FeatureValue]]

labels

Target labels for training data.

TYPE: list[Class]

RETURNS DESCRIPTION
self

Returns the instance itself.

TYPE: MultinomialNaiveBayes

Source code in toyml/classification/naive_bayes.py
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
def fit(self, dataset: list[list[FeatureValue]], labels: list[Class]) -> MultinomialNaiveBayes:
    """Fit the Multinomial Naive Bayes classifier.

    Args:
        dataset: Training data, where each row is a sample and each column is a feature.
                 Features should be represented as counts (non-negative integers).
        labels: Target labels for training data.

    Returns:
        self: Returns the instance itself.
    """
    distinct_labels = set(labels)
    self.labels_ = sorted(distinct_labels)
    self.class_count_ = len(distinct_labels)
    # Empirical class priors: relative label frequencies in the training set.
    label_counts = Counter(labels)
    self.class_prior_ = {label: count / len(dataset) for label, count in label_counts.items()}
    self.class_feature_count_, self.class_feature_log_prob_ = self._get_classes_feature_count_prob(dataset, labels)
    return self

_log_likelihood

_log_likelihood(sample: list[FeatureValue]) -> dict[Class, float]

Calculate the log-likelihood of the sample under each class.

Source code in toyml/classification/naive_bayes.py
235
236
237
238
239
240
241
242
243
244
def _log_likelihood(self, sample: list[FeatureValue]) -> dict[Class, float]:
    """Compute the multinomial log-likelihood of the sample under each class."""
    log_likelihoods: dict[Class, float] = {}
    for label in self.labels_:
        feature_log_probs = self.class_feature_log_prob_[label]
        # Multinomial log-likelihood (up to a sample-dependent constant):
        # sum_i count_i * log p_i
        total = 0.0
        for i, count in enumerate(sample):
            total += count * feature_log_probs[i]
        log_likelihoods[label] = total
    return log_likelihoods

_dataset_feature_counts

_dataset_feature_counts(dataset: list[list[FeatureValue]]) -> list[int]

Calculate feature value counts.

Source code in toyml/classification/naive_bayes.py
262
263
264
def _dataset_feature_counts(self, dataset: list[list[FeatureValue]]) -> list[float]:
    """Calculate the smoothed total count of each feature over the dataset.

    Each entry is ``sum(column) + alpha``: the column-wise sum of the count
    features plus the additive (Laplace/Lidstone) smoothing term, so the
    result elements are floats whenever ``alpha`` is (its default is 1.0).
    """
    # zip(*dataset) transposes rows into columns; strict=True rejects ragged rows.
    return [sum(column) + self.alpha for column in zip(*dataset, strict=True)]

toyml.classification.naive_bayes.CategoricalNaiveBayes dataclass

CategoricalNaiveBayes(class_prior_: dict[Class, float] = dict(), alpha: float = 1.0, labels_: list[Class] = list(), class_count_: int = 0, class_feature_count_: dict[Class, dict[Dimension, dict[FeatureValue, float]]] = dict(), class_feature_log_prob_: dict[Class, dict[Dimension, dict[FeatureValue, float]]] = dict())

Bases: BaseNaiveBayes

Categorical Naive Bayes classifier.

Examples:

>>> import random
>>> rng = random.Random(0)
>>> dataset = [[rng.randint(0, 5) for _ in range(100)] for _ in range(6)]
>>> label = [1, 2, 3, 4, 5, 6]
>>> clf = CategoricalNaiveBayes().fit(dataset, label)
>>> clf.predict(dataset[2])
3

alpha class-attribute instance-attribute

alpha: float = 1.0

Additive (Laplace/Lidstone) smoothing parameter

labels_ class-attribute instance-attribute

labels_: list[Class] = field(default_factory=list)

The labels in training dataset

class_count_ class-attribute instance-attribute

class_count_: int = 0

The number of classes in training dataset

class_prior_ class-attribute instance-attribute

class_prior_: dict[Class, float] = field(default_factory=dict)

The prior probability of each class in training dataset

class_feature_count_ class-attribute instance-attribute

class_feature_count_: dict[Class, dict[Dimension, dict[FeatureValue, float]]] = field(default_factory=dict)

The feature value counts of each class in training dataset

class_feature_log_prob_ class-attribute instance-attribute

class_feature_log_prob_: dict[Class, dict[Dimension, dict[FeatureValue, float]]] = field(default_factory=dict)

The feature value probability of each class in training dataset

fit

fit(dataset: list[list[FeatureValue]], labels: list[Class]) -> CategoricalNaiveBayes

Fit the Categorical Naive Bayes classifier.

PARAMETER DESCRIPTION
dataset

Training data, where each row is a sample and each column is a feature.

TYPE: list[list[FeatureValue]]

labels

Target labels for training data.

TYPE: list[Class]

RETURNS DESCRIPTION
self

Returns the instance itself.

TYPE: CategoricalNaiveBayes

Source code in toyml/classification/naive_bayes.py
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
def fit(self, dataset: list[list[FeatureValue]], labels: list[Class]) -> CategoricalNaiveBayes:
    """Fit the Categorical Naive Bayes classifier.

    Args:
        dataset: Training data, where each row is a sample and each column is a feature.
        labels: Target labels for training data.

    Returns:
        self: Returns the instance itself.
    """
    distinct_labels = set(labels)
    self.labels_ = sorted(distinct_labels)
    self.class_count_ = len(distinct_labels)
    # Empirical class priors: relative label frequencies in the training set.
    self.class_prior_ = {label: count / len(dataset) for label, count in Counter(labels).items()}
    self.class_feature_count_, self.class_feature_log_prob_ = self._get_classes_feature_count_prob(dataset, labels)
    return self

_log_likelihood

_log_likelihood(sample: list[FeatureValue]) -> dict[Class, float]

Calculate the log-likelihood of the sample under each class.

Source code in toyml/classification/naive_bayes.py
311
312
313
314
315
316
317
318
319
320
def _log_likelihood(self, sample: list[FeatureValue]) -> dict[Class, float]:
    """Compute the categorical log-likelihood of the sample under each class."""
    log_likelihoods: dict[Class, float] = {}
    for label in self.labels_:
        dim_log_probs = self.class_feature_log_prob_[label]
        total = 0.0
        for dim, value in enumerate(sample):
            # NOTE(review): a feature value unseen at fit time contributes 0 to
            # the log-likelihood (i.e. probability 1), which overweights unseen
            # values relative to seen ones — confirm this fallback is intended.
            total += dim_log_probs[dim].get(value, 0)
        log_likelihoods[label] = total
    return log_likelihoods

_dataset_feature_counts staticmethod

_dataset_feature_counts(dataset: list[list[FeatureValue]], feature_smooth_count: dict[Dimension, dict[FeatureValue, float]]) -> dict[Dimension, dict[FeatureValue, float]]

Calculate feature value counts.

Source code in toyml/classification/naive_bayes.py
348
349
350
351
352
353
354
355
356
357
358
359
@staticmethod
def _dataset_feature_counts(
    dataset: list[list[FeatureValue]],
    feature_smooth_count: dict[Dimension, dict[FeatureValue, float]],
) -> dict[Dimension, dict[FeatureValue, float]]:
    """Add the observed feature-value counts on top of the smoothing counts."""
    # Deep-copy so the caller's nested smoothing table is never mutated in place.
    totals = copy.deepcopy(feature_smooth_count)
    # zip(*dataset) transposes rows into per-dimension columns.
    for dim, column in enumerate(zip(*dataset, strict=True)):
        for value in column:
            totals[dim][value] += 1
    return totals